diff --git a/projects/aloe/BUILD b/projects/aloe/BUILD
new file mode 100644
index 0000000..2e06bd6
--- /dev/null
+++ b/projects/aloe/BUILD
@@ -0,0 +1,11 @@
+py_project(
+    name = "lib"
+)
+
+zapp_binary(
+    name = "aloe",
+    main = "src/python/aloe/__main__.py",
+    deps = [
+        ":lib",
+    ],
+)
diff --git a/projects/aloe/NOTES.md b/projects/aloe/NOTES.md
new file mode 100644
index 0000000..d7051b7
--- /dev/null
+++ b/projects/aloe/NOTES.md
@@ -0,0 +1,14 @@
+
+traceroute to twitter.com (104.244.42.129), 30 hops max, 60 byte packets
+ 3 10.60.142.2 (10.60.142.2) 117.502 ms 10.60.142.3 (10.60.142.3) 75.624 ms 10.60.142.2 (10.60.142.2) 117.709 ms
+ 4 68.85.107.81 (68.85.107.81) 74.019 ms 75.439 ms 68.85.107.85 (68.85.107.85) 75.275 ms
+ 5 68.86.103.9 (68.86.103.9) 75.305 ms 75.333 ms 75.308 ms
+ 6 24.124.155.129 (24.124.155.129) 75.281 ms 32.947 ms 35.459 ms
+ 7 ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130) 41.920 ms 41.893 ms 74.385 ms
+ 8 ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130) 41.818 ms 41.792 ms be-36031-cs03.1601milehigh.co.ibone.comcast.net (96.110.43.249) 41.765 ms
+ 9 be-3202-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.118) 43.861 ms 45.557 ms be-36041-cs04.1601milehigh.co.ibone.comcast.net (96.110.43.253) 39.431 ms
+10 be-3102-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.114) 39.019 ms be-3402-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.126) 39.033 ms be-3302-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.122) 38.965 ms
+11 sjo-b23-link.ip.twelve99.net (213.155.133.171) 60.909 ms 60.630 ms 173.167.57.142 (173.167.57.142) 38.432 ms
+12 twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193) 60.311 ms sjo-b23-link.ip.twelve99.net (213.155.133.171) 59.480 ms twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193) 60.263 ms
+13 twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193) 65.467 ms 65.687 ms *
+14 * * 104.244.42.129 (104.244.42.129) 63.604 ms
diff --git a/projects/aloe/src/python/aloe/__main__.py b/projects/aloe/src/python/aloe/__main__.py
new file mode 100644
index 0000000..8425b9c
--- /dev/null
+++ b/projects/aloe/src/python/aloe/__main__.py
@@ -0,0 +1,160 @@
+"""Aloe - A shitty weathermapping tool.
+
+Periodically traceroutes the egress network, and then walks pings out the egress network recording times and hosts which
+failed to respond. Expects a network in excess of 90% packet delivery, but with variable timings. Intended to probe for
+when packet delivery latencies radically degrade and maintain a report file.
+
+"""
+
+import sys
+import argparse
+import logging
+from datetime import datetime, timedelta
+from ping import ping
+from traceroute import TraceElem, traceroute
+from subprocess import CalledProcessError
+from typing import NamedTuple
+from collections import defaultdict
+
+
+log = logging.getLogger(__name__)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("hosts", nargs="+")
+
+
+def distinct(iter):
+    """Return the items of `iter` as a list, keeping only the first occurrence of each."""
+    s = set()
+    l = []
+    for e in iter:
+        if e in s:
+            continue
+        else:
+            l.append(e)
+            s.add(e)
+    return l
+
+
+class Host(NamedTuple):
+    """A node in the topology; `latency` is the sum over `samples` observations."""
+
+    hostname: str
+    ip: str
+    rank: int
+    latency: timedelta
+    samples: int = 1
+
+    def mean_latency(self):
+        """Return the accumulated latency divided by the number of samples."""
+        return self.latency / self.samples
+
+
+class Topology(object):
+    """Hosts seen in traceroutes, plus edges from each rank's hosts to the next rank's hosts."""
+
+    LOCALHOST = Host("localhost", "127.0.0.1", 0, timedelta(seconds=0.1))
+
+    def __init__(self):
+        self._graph = defaultdict(set)  # Dict[ip, Set[ip]]
+        self._nodes = {self.LOCALHOST.ip: self.LOCALHOST}  # Dict[ip, Host]
+
+    def _link(self, sources, targets):
+        """Record an edge from every IP in `sources` to every IP in `targets`."""
+        for h2 in targets:
+            for h1 in sources:
+                self._graph[h1].add(h2)
+
+    def add_traceroute(self, trace):
+        """Fold one traceroute (records assumed to arrive in nondecreasing rank order) into the topology."""
+        trace = list(trace)
+        hosts = []  # IPs at the previous rank
+        newhosts = [self.LOCALHOST.ip]  # IPs at the rank currently being read
+        rank = 0
+        for e in trace:
+            known = self._nodes.get(e.ip)
+            if known is None:
+                self._nodes[e.ip] = Host(e.hostname, e.ip, e.rank, e.latency, 1)
+            else:
+                self._nodes[e.ip] = Host(e.hostname, e.ip, e.rank, e.latency + known.latency, known.samples + 1)
+
+            if e.rank > rank:
+                if newhosts:
+                    self._link(hosts, newhosts)
+                    hosts = newhosts
+                    newhosts = []
+                rank = e.rank
+
+            if e.rank == rank:
+                newhosts.append(e.ip)
+
+        # The loop only links a rank once a later rank begins, so link the final rank here.
+        self._link(hosts, newhosts)
+
+    def render(self):
+        """Print each host, ordered by rank, with the set of IPs it links to."""
+        for n in sorted(self._nodes.values(), key=lambda n: n.rank):
+            # .get() so that printing does not insert empty entries into the defaultdict.
+            print(f"{n.hostname} ({n.ip}) => {self._graph.get(n.ip, set())}")
+
+
+def compute_topology(hostlist):
+    """Walk a series of traceroute tuples, computing a 'worst expected latency' topology from them."""
+
+    topology = Topology()
+    for h in hostlist:
+        topology.add_traceroute(traceroute(h))
+
+    return sorted(topology._nodes.values(), key=lambda n: n.rank)
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    opts, args = parser.parse_known_args()
+
+    now = start = datetime.now()
+    reconfigure_delay = timedelta(minutes=5)
+    configure_at = now - reconfigure_delay
+
+    topology = []
+
+    with open("incidents.txt", "a") as fp:
+        while True:
+            now = datetime.now()
+
+            if configure_at <= now:
+                log.info("Attempting to reconfigure network topology...")
+                try:
+                    topology = compute_topology(opts.hosts)
+                    configure_at = now + reconfigure_delay
+                    for h in topology:
+                        log.info(f"in topology {h}")
+                except CalledProcessError as e:
+                    # Keep the previous topology; configure_at is unchanged, so the next pass retries.
+                    log.warning(f"traceroute failed, keeping previous topology: {e}")
+
+            for h in topology:
+                if h.rank == 0:
+                    continue
+
+                fail = False
+                try:
+                    if ping(h.ip, timeout=h.mean_latency() * 2) != 0:
+                        fail = True
+                except CalledProcessError:
+                    # ping() wraps check_call, so a failed ping raises instead of returning nonzero.
+                    fail = True
+                except Exception as e:
+                    fail = True
+                    log.exception(e)
+
+                if fail:
+                    msg = f"{datetime.now()} failed to reach {h.hostname} ({h.ip})"
+                    log.warning(msg)
+                    fp.write(msg + "\n")
+                    # Flush so the report is on disk even though this loop never exits normally.
+                    fp.flush()
+
+                else:
+                    sys.stderr.write('.')
+                    sys.stderr.flush()
diff --git a/projects/aloe/src/python/ping.py b/projects/aloe/src/python/ping.py
new file mode 100644
index 0000000..402f8aa
--- /dev/null
+++ b/projects/aloe/src/python/ping.py
@@ -0,0 +1,22 @@
+"""A shitty ping wrapper."""
+
+
+from datetime import timedelta
+from subprocess import check_call, DEVNULL
+
+
+def ping(host: str,
+         count: int = 3,
+         interval: float = 0.3,
+         timeout: timedelta = timedelta(seconds=3)):
+    """Run the system `ping` quietly against `host`.
+
+    Returns 0 on success; raises subprocess.CalledProcessError when ping exits nonzero.
+    """
+    return check_call(["ping", "-q",
+                       "-i", str(interval),
+                       "-c", str(count),
+                       "-W", str(timeout.total_seconds()),
+                       host],
+                      stdout=DEVNULL,
+                      stderr=DEVNULL,)
diff --git a/projects/aloe/src/python/traceroute.py b/projects/aloe/src/python/traceroute.py
new file mode 100644
index 0000000..d026f93
--- /dev/null
+++ b/projects/aloe/src/python/traceroute.py
@@ -0,0 +1,53 @@
+"""A shitty traceroute wrapper."""
+
+from datetime import timedelta
+import re
+from subprocess import (
+    CalledProcessError,
+    check_call,
+    check_output,
+    DEVNULL,
+)
+from typing import Iterator, List, NamedTuple
+
+
+class TraceElem(NamedTuple):
+    """One latency sample reported for a host on one line of traceroute output."""
+
+    hostname: str
+    ip: str
+    latency: timedelta
+    rank: int
+
+
+# Matches either a lost probe ("*") or "[hostname (ip)] <latency> ms".
+_LINE = re.compile(r"\*|(((?P<hostname>[-_\w\d\.]*)\s+\((?P<ip>[a-f\d\.:]*)\)\s+)?(?P<latency>[\d\.]*) ms)")
+
+
+def _parse_traceroute(lines: List[str]) -> Iterator[TraceElem]:
+    """Yield a TraceElem per latency sample; rank is the 1-based index of the line it came from."""
+    for rank, l in enumerate(lines, 1):
+        ip = None
+        hostname = None
+        for m in _LINE.finditer(l):
+            if m.group("latency"):
+                # traceroute doesn't always print the host & IP; reuse the last ones seen on this line.
+                ip = m.group("ip") or ip
+                hostname = m.group("hostname") or hostname
+                yield TraceElem(hostname, ip, timedelta(milliseconds=float(m.group("latency"))), rank)
+
+
+def traceroute(host: str, icmp=True, timeout=timedelta(seconds=5)) -> Iterator[TraceElem]:
+    """Run the system `traceroute` against `host` and yield the parsed samples.
+
+    Raises subprocess.CalledProcessError when traceroute exits nonzero.
+    """
+    # Set wait; note use of total_seconds which is "real" valued.
+    argv = ["traceroute", "-w", str(timeout.total_seconds())]
+    if icmp:
+        # Use ICMP probes same as PING.
+        # This means all probed hosts will be pingable/ping compliant.
+        # May miss parts of the topology as a result.
+        argv.append("-I")
+    argv.append(host)
+    yield from _parse_traceroute(check_output(argv, stderr=DEVNULL).decode("utf-8").splitlines())
diff --git a/projects/aloe/test/python/test_traceroute.py b/projects/aloe/test/python/test_traceroute.py
new file mode 100644
index 0000000..3a78199
--- /dev/null
+++ b/projects/aloe/test/python/test_traceroute.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+
+from typing import List
+from traceroute import _parse_traceroute, TraceElem
+from datetime import timedelta
+
+import pytest
+
+
+def parse_traceroute(lines):
+    """Testing helper."""
+    return list(_parse_traceroute(lines))
+
+
+# Each example is a single line, so the parser assigns every element rank 1.
+@pytest.mark.parametrize("example, expected", [
+    # Basic case, one match
+    ("3 10.60.142.2 (10.60.142.2) 117.502 ms",
+     [TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.502), 1)]),
+    # Multiple matches on one line
+    ("3 10.60.142.2 (10.60.142.2) 117.502 ms 10.60.142.3 (10.60.142.3) 75.624 ms 10.60.142.2 (10.60.142.2) 117.709 ms",
+     [TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.502), 1),
+      TraceElem("10.60.142.3", "10.60.142.3", timedelta(milliseconds=75.624), 1),
+      TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.709), 1)]),
+    # Context sensitive case - traceroute doesn't always print the host & IP.
+    ("7 ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130) 41.920 ms 41.893 ms 74.385 ms",
+     [TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=41.920), 1),
+      TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=41.893), 1),
+      TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=74.385), 1)]),
+])
+def test_examples(example: str, expected: List[TraceElem]):
+    assert parse_traceroute(example.splitlines()) == expected