From 8469ab77589d8a3df53a25f0cf3b52274de0a8bc Mon Sep 17 00:00:00 2001
From: Reid 'arrdem' McKenzie <me@arrdem.com>
Date: Sat, 20 Nov 2021 14:39:14 -0700
Subject: [PATCH] WIP on Aloe

---
 projects/aloe/BUILD                          |  11 ++
 projects/aloe/NOTES.md                       |  14 ++
 projects/aloe/src/python/aloe/__main__.py    | 137 +++++++++++++++++++
 projects/aloe/src/python/ping.py             |  18 +++
 projects/aloe/src/python/traceroute.py       |  48 +++++++
 projects/aloe/test/python/test_traceroute.py |  31 +++++
 6 files changed, 259 insertions(+)
 create mode 100644 projects/aloe/BUILD
 create mode 100644 projects/aloe/NOTES.md
 create mode 100644 projects/aloe/src/python/aloe/__main__.py
 create mode 100644 projects/aloe/src/python/ping.py
 create mode 100644 projects/aloe/src/python/traceroute.py
 create mode 100644 projects/aloe/test/python/test_traceroute.py

diff --git a/projects/aloe/BUILD b/projects/aloe/BUILD
new file mode 100644
index 0000000..2e06bd6
--- /dev/null
+++ b/projects/aloe/BUILD
@@ -0,0 +1,11 @@
+py_project(  # Declares ":lib", which the zapp_binary below lists in its deps.
+    name = "lib"
+)
+
+zapp_binary(  # Declares the "aloe" target from the main script plus ":lib".
+    name = "aloe",
+    main = "src/python/aloe/__main__.py",  # Presumably the entry point - confirm against the zapp_binary rule.
+    deps = [
+        ":lib",  # The py_project target declared above in this BUILD file.
+    ],
+)
diff --git a/projects/aloe/NOTES.md b/projects/aloe/NOTES.md
new file mode 100644
index 0000000..d7051b7
--- /dev/null
+++ b/projects/aloe/NOTES.md
@@ -0,0 +1,14 @@
+
+traceroute to twitter.com (104.244.42.129), 30 hops max, 60 byte packets
+ 3  10.60.142.2 (10.60.142.2)  117.502 ms 10.60.142.3 (10.60.142.3)  75.624 ms 10.60.142.2 (10.60.142.2)  117.709 ms
+ 4  68.85.107.81 (68.85.107.81)  74.019 ms  75.439 ms 68.85.107.85 (68.85.107.85)  75.275 ms
+ 5  68.86.103.9 (68.86.103.9)  75.305 ms  75.333 ms  75.308 ms
+ 6  24.124.155.129 (24.124.155.129)  75.281 ms  32.947 ms  35.459 ms
+ 7  ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130)  41.920 ms  41.893 ms  74.385 ms
+ 8  ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130)  41.818 ms  41.792 ms be-36031-cs03.1601milehigh.co.ibone.comcast.net (96.110.43.249)  41.765 ms
+ 9  be-3202-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.118)  43.861 ms  45.557 ms be-36041-cs04.1601milehigh.co.ibone.comcast.net (96.110.43.253)  39.431 ms
+10  be-3102-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.114)  39.019 ms be-3402-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.126)  39.033 ms be-3302-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.122)  38.965 ms
+11  sjo-b23-link.ip.twelve99.net (213.155.133.171)  60.909 ms  60.630 ms 173.167.57.142 (173.167.57.142)  38.432 ms
+12  twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193)  60.311 ms sjo-b23-link.ip.twelve99.net (213.155.133.171)  59.480 ms twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193)  60.263 ms
+13  twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193)  65.467 ms  65.687 ms *
+14  * * 104.244.42.129 (104.244.42.129)  63.604 ms
diff --git a/projects/aloe/src/python/aloe/__main__.py b/projects/aloe/src/python/aloe/__main__.py
new file mode 100644
index 0000000..8425b9c
--- /dev/null
+++ b/projects/aloe/src/python/aloe/__main__.py
@@ -0,0 +1,137 @@
+"""Aloe - A shitty weathermapping tool.
+
+Periodically traceroutes the egress network, and then walks pings out the egress network recording times and hosts which
+failed to respond. Expects a network in excess of 90% packet delivery, but with variable timings. Intended to probe for
+when packet delivery latencies radically degrade and maintain a report file.
+
+"""
+
+import sys
+import argparse
+import logging
+from datetime import datetime, timedelta
+from ping import ping
+from traceroute import TraceElem, traceroute
+from subprocess import CalledProcessError
+from typing import NamedTuple
+from collections import defaultdict
+
+
+log = logging.getLogger(__name__)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("hosts", nargs="+")
+
+
+def distinct(iter):
+    """Return the elements of `iter` as a list, first-seen order, repeats dropped."""
+    seen = set()
+    ordered = []
+    for item in iter:
+        # Set membership is O(1); the list is what preserves encounter order.
+        if item not in seen:
+            seen.add(item)
+            ordered.append(item)
+    return ordered
+
+
+class Host(NamedTuple):
+    hostname: str  # Name taken from the trace element that produced this record.
+    ip: str  # Address; Topology uses it as the node key.
+    rank: int  # Hop position from the trace; Topology.LOCALHOST uses 0.
+    latency: timedelta  # Running sum of every latency sample folded in so far.
+    samples: int = 1  # How many samples `latency` is the sum of.
+
+    def mean_latency(self):
+        return self.latency / self.samples  # Average latency per sample.
+
+
+class Topology(object):
+    """Directed hop graph, keyed by IP, accumulated from traceroute samples."""
+
+    LOCALHOST = Host("localhost", "127.0.0.1", 0, timedelta(seconds=0.1))
+
+    def __init__(self):
+        self._graph = defaultdict(set)  # Dict[ip, Set[ip]]: hop -> next hops
+        self._nodes = {self.LOCALHOST.ip: self.LOCALHOST}  # Dict[ip, Host]
+
+    def _link(self, sources, targets):
+        """Record an edge from every IP in `sources` to every IP in `targets`."""
+        for src in sources:
+            self._graph[src].update(targets)
+
+    def add_traceroute(self, trace):
+        """Fold TraceElems (in rank order) into the nodes and link adjacent ranks."""
+        previous, current, rank = [], [self.LOCALHOST.ip], 0
+        for e in list(trace):  # Materialise first: a failing trace changes nothing.
+            seen = self._nodes.get(e.ip, Host(e.hostname, e.ip, e.rank, timedelta(), 0))
+            self._nodes[e.ip] = Host(e.hostname, e.ip, e.rank, e.latency + seen.latency, seen.samples + 1)
+            if e.rank > rank:
+                self._link(previous, current)
+                previous, current, rank = current, [], e.rank
+            if e.rank == rank:
+                current.append(e.ip)
+        # Flush the last rank: the loop only links a rank once a later one shows up.
+        self._link(previous, current)
+
+    def render(self):
+        """Print every node in rank order together with its recorded next hops."""
+        for n in sorted(self._nodes.values(), key=lambda n: n.rank):
+            # .get() so that printing never inserts empty entries into the defaultdict.
+            print(f"{n.hostname} ({n.ip}) => {self._graph.get(n.ip, set())}")
+
+
+def compute_topology(hostlist):
+    """Traceroute each host in `hostlist`; return the merged Host records sorted by rank."""
+    graph = Topology()
+    for target in hostlist:
+        graph.add_traceroute(traceroute(target))
+    by_rank = list(graph._nodes.values())
+    by_rank.sort(key=lambda host: host.rank)
+    return by_rank
+
+
+if __name__ == "__main__":  # Poll forever: re-trace periodically, ping every hop, record failures.
+    import time  # Local import: only this polling loop needs it.
+
+    logging.basicConfig(level=logging.INFO)
+    opts, args = parser.parse_known_args()
+
+    reconfigure_delay = timedelta(minutes=5)  # Re-trace cadence after a success.
+    retry_delay = timedelta(seconds=10)  # Back-off after a failed traceroute.
+    configure_at = datetime.now()  # Already due, so the first pass traces.
+    topology = []
+
+    with open("incidents.txt", "a") as fp:
+        while True:
+            now = datetime.now()
+            if configure_at <= now:
+                log.info("Attempting to reconfigure network topology...")
+                try:
+                    topology = compute_topology(opts.hosts)
+                    configure_at = now + reconfigure_delay
+                    for h in topology:
+                        log.info(f"in topology {h}")
+                except CalledProcessError as e:
+                    # Keep the last good topology and retry later, not in a hot loop.
+                    log.warning(f"traceroute failed ({e}); retrying in {retry_delay}")
+                    configure_at = now + retry_delay
+
+            targets = [h for h in topology if h.rank != 0]  # Rank 0 is localhost.
+            if not targets:
+                time.sleep(1)  # Nothing to probe yet; don't spin the CPU.
+
+            for h in targets:
+                try:
+                    reached = ping(h.ip, timeout=h.mean_latency() * 2) == 0
+                except Exception as e:
+                    reached = False
+                    log.exception(e)
+                if reached:
+                    sys.stderr.write('.')
+                    sys.stderr.flush()
+                    continue
+                msg = f"{datetime.now()} failed to reach {h.hostname} ({h.ip})"
+                log.warning(msg)
+                fp.write(msg + "\n")
+                fp.flush()  # Incidents must survive the process being killed.
diff --git a/projects/aloe/src/python/ping.py b/projects/aloe/src/python/ping.py
new file mode 100644
index 0000000..402f8aa
--- /dev/null
+++ b/projects/aloe/src/python/ping.py
@@ -0,0 +1,18 @@
+"""A shitty ping wrapper."""
+
+
+from datetime import timedelta
+from subprocess import check_call, DEVNULL
+
+
+def ping(host: str,
+         count: int = 3,
+         interval: float = 0.3,
+         timeout: timedelta = timedelta(seconds=3)):
+    """Run the system `ping` silently; return 0, or raise CalledProcessError on a non-zero exit."""
+    # Flag meanings per iputils ping (assumed platform): -q quiet, -i interval s, -c count, -W reply wait s.
+    command = ["ping", "-q"]
+    command += ["-i", str(interval), "-c", str(count)]
+    command += ["-W", str(timeout.total_seconds())]
+    command.append(host)
+    return check_call(command, stdout=DEVNULL, stderr=DEVNULL)
diff --git a/projects/aloe/src/python/traceroute.py b/projects/aloe/src/python/traceroute.py
new file mode 100644
index 0000000..d026f93
--- /dev/null
+++ b/projects/aloe/src/python/traceroute.py
@@ -0,0 +1,48 @@
+"""A shitty traceroute wrapper."""
+
+from datetime import timedelta
+import re
+from subprocess import (
+    CalledProcessError,
+    check_call,
+    check_output,
+    DEVNULL,
+)
+from typing import Iterator, List, NamedTuple
+
+
+class TraceElem(NamedTuple):
+    hostname: str  # Name printed before the parenthesised address on the line.
+    ip: str  # Address captured from inside the parentheses.
+    latency: timedelta  # One probe's "N ms" reading, converted to a timedelta.
+    rank: int  # 1-based index of the output line the sample was parsed from.
+
+
+_LINE = re.compile(r"\*|(((?P<hostname>[-_\w\d\.]*)\s+\((?P<ip>[a-f\d\.:]*)\)\s+)?(?P<latency>[\d\.]*) ms)")  # One probe token: a bare "*", or "N ms" with an optional "host (ip)" prefix.
+
+
+def _parse_traceroute(lines: List[str]) -> Iterator[TraceElem]:
+    """Yield one TraceElem per latency sample; rank is the 1-based index of the line."""
+    for rank, line in enumerate(lines, start=1):
+        hostname = ip = None  # "host (ip)" is not repeated for every probe; carry it forward.
+        for match in _LINE.finditer(line):
+            host, addr, ms = match.group("hostname", "ip", "latency")
+            if ms:
+                hostname, ip = host or hostname, addr or ip
+                yield TraceElem(hostname, ip, timedelta(milliseconds=float(ms)), rank)
+
+
+def traceroute(host: str, icmp=True, timeout=timedelta(seconds=5)) -> Iterator[TraceElem]:
+    """Run the system `traceroute` against `host` and yield a TraceElem per latency sample.
+
+    `icmp` selects ICMP probes (-I), the same probe type ping uses, so every hop
+    found is one that answers ping; parts of the topology may be missed as a result.
+    `timeout` is the wait passed to -w; total_seconds() keeps fractional values.
+    Raises CalledProcessError when traceroute exits non-zero (on first iteration).
+    """
+    command = ["traceroute", "-w", str(timeout.total_seconds())]
+    if icmp:
+        command.append("-I")
+    command.append(host)
+    output = check_output(command, stderr=DEVNULL).decode("utf-8")
+    yield from _parse_traceroute(output.splitlines())
diff --git a/projects/aloe/test/python/test_traceroute.py b/projects/aloe/test/python/test_traceroute.py
new file mode 100644
index 0000000..3a78199
--- /dev/null
+++ b/projects/aloe/test/python/test_traceroute.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+
+from typing import List
+from traceroute import _parse_traceroute, TraceElem
+from datetime import timedelta
+
+import pytest
+
+
+def parse_traceroute(lines):
+    """Testing helper: drain the lazy parser into a list so results can be compared with ==."""
+    return list(_parse_traceroute(lines))
+
+
+@pytest.mark.parametrize("example, expected", [
+    # Basic case, one match. A lone input line always parses as rank 1.
+    ("3  10.60.142.2 (10.60.142.2)  117.502 ms",
+     [TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.502), 1)]),
+    # Multiple matches on one line
+    ("3  10.60.142.2 (10.60.142.2)  117.502 ms 10.60.142.3 (10.60.142.3)  75.624 ms 10.60.142.2 (10.60.142.2)  117.709 ms",
+     [TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.502), 1),
+      TraceElem("10.60.142.3", "10.60.142.3", timedelta(milliseconds=75.624), 1),
+      TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.709), 1)]),
+    # Context sensitive case - traceroute doesn't always print the host & IP.
+    ("7  ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130)  41.920 ms  41.893 ms  74.385 ms",
+     [TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=41.920), 1),
+      TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=41.893), 1),
+      TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=74.385), 1)]),
+])
+def test_examples(example: str, expected: List[TraceElem]):
+    assert parse_traceroute(example.splitlines()) == expected