WIP on Aloe

This commit is contained in:
Reid 'arrdem' McKenzie 2021-11-20 14:39:14 -07:00
parent b6b1f23188
commit a4cd4568cf
6 changed files with 259 additions and 0 deletions

11
projects/aloe/BUILD Normal file
View file

@ -0,0 +1,11 @@
# Library target for the Aloe project, declared through the repository's
# `py_project` macro.
# NOTE(review): presumably the macro picks up the sources under src/python by
# convention -- confirm against the macro definition.
py_project(
name = "lib"
)
# Executable target named "aloe". Its entry point is the package's __main__
# module and it depends on the ":lib" target.
zapp_binary(
name = "aloe",
main = "src/python/aloe/__main__.py",
deps = [
":lib",
],
)

14
projects/aloe/NOTES.md Normal file
View file

@ -0,0 +1,14 @@
traceroute to twitter.com (104.244.42.129), 30 hops max, 60 byte packets
3 10.60.142.2 (10.60.142.2) 117.502 ms 10.60.142.3 (10.60.142.3) 75.624 ms 10.60.142.2 (10.60.142.2) 117.709 ms
4 68.85.107.81 (68.85.107.81) 74.019 ms 75.439 ms 68.85.107.85 (68.85.107.85) 75.275 ms
5 68.86.103.9 (68.86.103.9) 75.305 ms 75.333 ms 75.308 ms
6 24.124.155.129 (24.124.155.129) 75.281 ms 32.947 ms 35.459 ms
7 ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130) 41.920 ms 41.893 ms 74.385 ms
8 ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130) 41.818 ms 41.792 ms be-36031-cs03.1601milehigh.co.ibone.comcast.net (96.110.43.249) 41.765 ms
9 be-3202-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.118) 43.861 ms 45.557 ms be-36041-cs04.1601milehigh.co.ibone.comcast.net (96.110.43.253) 39.431 ms
10 be-3102-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.114) 39.019 ms be-3402-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.126) 39.033 ms be-3302-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.122) 38.965 ms
11 sjo-b23-link.ip.twelve99.net (213.155.133.171) 60.909 ms 60.630 ms 173.167.57.142 (173.167.57.142) 38.432 ms
12 twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193) 60.311 ms sjo-b23-link.ip.twelve99.net (213.155.133.171) 59.480 ms twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193) 60.263 ms
13 twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193) 65.467 ms 65.687 ms *
14 * * 104.244.42.129 (104.244.42.129) 63.604 ms

View file

@ -0,0 +1,137 @@
"""Aloe - A shitty weathermapping tool.
Periodically traceroutes the egress network, then pings each host along the discovered routes, recording timings and
any hosts which failed to respond. Expects a network with better than 90% packet delivery, but with variable timings.
Intended to detect when packet delivery latencies radically degrade, and to maintain a report file.
"""
import sys
import argparse
import logging
from datetime import datetime, timedelta
from ping import ping
from traceroute import TraceElem, traceroute
from subprocess import CalledProcessError
from typing import NamedTuple
from collections import defaultdict
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Command line interface: one or more positional host arguments.
parser = argparse.ArgumentParser()
parser.add_argument("hosts", nargs="+")
def distinct(iter):
    """Return the elements of `iter` as a list with repeated elements dropped.

    The first occurrence of every element is kept, and the order in which the
    elements were first seen is preserved. Elements must be hashable.
    """
    seen = set()
    unique = []
    for item in iter:
        if item not in seen:
            unique.append(item)
            seen.add(item)
    return unique
class Host(NamedTuple):
    """A network node together with its accumulated latency observations."""

    hostname: str
    ip: str
    rank: int
    # Total of the latencies recorded so far; see `mean_latency` for the average.
    latency: timedelta
    # How many observations have been folded into `latency`.
    samples: int = 1

    def mean_latency(self):
        """Return the average latency per recorded sample as a timedelta."""
        total, count = self.latency, self.samples
        return total / count
class Topology(object):
    """A graph of the hosts observed across one or more traceroutes.

    Nodes are `Host` records keyed by IP address. Edges point from every host
    seen at one traceroute rank to every host seen at the next observed rank.
    """

    # Synthetic root node; every trace is treated as starting from it.
    LOCALHOST = Host("localhost", "127.0.0.1", 0, timedelta(seconds=0.1))

    def __init__(self):
        self._graph = defaultdict(set)  # Dict[ip, Set[ip]]
        self._nodes = {self.LOCALHOST.ip: self.LOCALHOST}  # Dict[ip, Host]

    def _link(self, sources, targets):
        """Add an edge from every IP in `sources` to every IP in `targets`."""
        if not targets:
            return
        for src in sources:
            self._graph[src].update(targets)

    def add_traceroute(self, trace):
        """Fold one traceroute into the topology.

        `trace` is an iterable of elements exposing `hostname`, `ip`, `rank`
        and `latency`. Each element updates (or creates) the node for its IP,
        accumulating latency and bumping the sample count. Elements are grouped
        by rank, and every group is linked to the group which preceded it.
        Elements whose rank is lower than the current rank still update their
        node but contribute no edges.
        """
        # Materialize first so a failing trace source raises before any state
        # is modified.
        trace = list(trace)

        hosts = []  # IPs of the previous (already complete) rank group
        newhosts = [self.LOCALHOST.ip]  # IPs of the rank group being collected
        rank = 0

        for e in trace:
            known = self._nodes.get(e.ip)
            if known is None:
                self._nodes[e.ip] = Host(e.hostname, e.ip, e.rank, e.latency, 1)
            else:
                self._nodes[e.ip] = Host(
                    e.hostname,
                    e.ip,
                    e.rank,
                    e.latency + known.latency,
                    known.samples + 1,
                )

            if e.rank > rank:
                # A new rank started: the group collected so far is complete,
                # so connect it to its predecessor and start a fresh group.
                self._link(hosts, newhosts)
                hosts = newhosts
                newhosts = []
                rank = e.rank

            if e.rank == rank:
                newhosts.append(e.ip)

        # The last group is never followed by a higher rank, so it must be
        # linked explicitly; otherwise the final hop(s) of every trace would
        # have no incoming edge.
        self._link(hosts, newhosts)

    def render(self):
        """Print every node, ordered by rank, with the set of IPs it links to."""
        for n in sorted(self._nodes.values(), key=lambda n: n.rank):
            print(f"{n.hostname} ({n.ip}) => {self._graph[n.ip]}")
def compute_topology(hostlist):
    """Traceroute every host in `hostlist` and merge the results.

    All traces are folded into a single `Topology`; the nodes it collected are
    returned as a list ordered by ascending rank.
    """
    graph = Topology()
    for target in hostlist:
        graph.add_traceroute(traceroute(target))

    nodes = list(graph._nodes.values())
    nodes.sort(key=lambda node: node.rank)
    return nodes
# Script entry point: keep a topology of the routes to the requested hosts
# fresh, ping every host in it in an endless loop, and append every host which
# could not be reached to incidents.txt.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    opts, args = parser.parse_known_args()

    now = start = datetime.now()
    reconfigure_delay = timedelta(minutes=5)
    # Back-date the deadline so the very first pass computes a topology.
    configure_at = now - reconfigure_delay

    topology = []

    with open("incidents.txt", "a") as fp:
        while True:
            now = datetime.now()

            if configure_at <= now:
                log.info("Attempting to reconfigure network topology...")
                try:
                    topology = compute_topology(opts.hosts)
                    configure_at = now + reconfigure_delay
                    for h in topology:
                        log.info(f"in topology {h}")
                except CalledProcessError as e:
                    # Best effort: keep probing the previous topology and retry
                    # on the next pass, but leave a trace of the failure rather
                    # than swallowing it silently.
                    log.warning("Unable to recompute topology (%s); keeping the previous one", e)

            for h in topology:
                # Rank 0 is the synthetic localhost node; nothing to probe.
                if h.rank == 0:
                    continue

                fail = False
                try:
                    if ping(h.ip, timeout=h.mean_latency() * 2) != 0:
                        fail = True
                except CalledProcessError:
                    # ping() is built on check_call, which raises when the ping
                    # command exits non-zero. That is the ordinary "probe
                    # failed" signal, so it is recorded without a traceback.
                    fail = True
                except Exception as e:
                    fail = True
                    log.exception(e)

                if fail:
                    msg = f"{datetime.now()} failed to reach {h.hostname} ({h.ip})"
                    log.warning(msg)
                    fp.write(msg + "\n")
                    # The loop never exits, so flush each incident right away;
                    # otherwise it sits in the write buffer indefinitely.
                    fp.flush()
                else:
                    sys.stderr.write('.')
                    sys.stderr.flush()

View file

@ -0,0 +1,18 @@
"""A shitty ping wrapper."""
from datetime import timedelta
from subprocess import check_call, DEVNULL
def ping(host: str,
         count: int = 3,
         interval: float = 0.3,
         timeout: timedelta = timedelta(seconds=3)):
    """Run the system `ping` command against `host`, discarding its output.

    `count` is passed as `-c`, `interval` as `-i` and `timeout` (converted to
    seconds) as `-W`. Returns the result of `check_call`, which is 0 on success
    and raises `CalledProcessError` when the command exits non-zero.
    """
    command = [
        "ping", "-q",
        "-i", str(interval),
        "-c", str(count),
        "-W", str(timeout.total_seconds()),
        host,
    ]
    return check_call(command, stdout=DEVNULL, stderr=DEVNULL)

View file

@ -0,0 +1,48 @@
"""A shitty traceroute wrapper."""
from datetime import timedelta
import re
from subprocess import (
CalledProcessError,
check_call,
check_output,
DEVNULL,
)
from typing import Iterator, List, NamedTuple
class TraceElem(NamedTuple):
    """A single timed response parsed out of traceroute output."""

    # Name and address of the responding host, as printed by traceroute.
    hostname: str
    ip: str
    # Round-trip time reported for this response.
    latency: timedelta
    # 1-based index of the output line the response was parsed from.
    rank: int
# Matches one probe result within a traceroute output line: either a literal
# "*", or a latency followed by " ms" which may be preceded by a
# "hostname (ip)" pair. The pair is optional because a result can appear as a
# bare latency with no host printed in front of it.
_LINE = re.compile(r"\*|(((?P<hostname>[-_\w\d\.]*)\s+\((?P<ip>[a-f\d\.:]*)\)\s+)?(?P<latency>[\d\.]*) ms)")
def _parse_traceroute(lines: List[str]) -> Iterator[TraceElem]:
    """Yield a `TraceElem` for every timed response found in `lines`.

    The rank of an element is the 1-based index of the line it was found on.
    Within a line, a response printed without a host/IP pair inherits the most
    recent pair seen earlier on the same line. Matches which carry no latency
    (such as "*") yield nothing.
    """
    for rank, line in enumerate(lines, start=1):
        current_ip = None
        current_hostname = None
        for match in _LINE.finditer(line):
            latency = match.group("latency")
            if not latency:
                continue
            current_ip = match.group("ip") or current_ip
            current_hostname = match.group("hostname") or current_hostname
            yield TraceElem(
                current_hostname,
                current_ip,
                timedelta(milliseconds=float(latency)),
                rank,
            )
def traceroute(host: str, icmp=True, timeout=timedelta(seconds=5)) -> Iterator[TraceElem]:
    """Run the system `traceroute` against `host` and yield the parsed responses.

    `timeout` is passed to traceroute as `-w`, in (fractional) seconds. When
    `icmp` is true (the default) `-I` is passed as well; otherwise traceroute
    is left on its default probe method.

    This is a generator: the command only runs once iteration starts. A
    non-zero exit raises `CalledProcessError` from `check_output`.
    """
    # Set wait; note use of total_seconds which is "real" valued.
    command = ["traceroute", "-w", str(timeout.total_seconds())]

    if icmp:
        # Use ICMP probes same as PING.
        # This means all probed hosts will be pingable/ping compliant.
        # May miss parts of the topology as a result.
        command.append("-I")

    command.append(host)

    output = check_output(command, stderr=DEVNULL).decode("utf-8")
    yield from _parse_traceroute(output.splitlines())

View file

@ -0,0 +1,31 @@
#!/usr/bin/env python3
from typing import List
from traceroute import _parse_traceroute, TraceElem
from datetime import timedelta
import pytest
def parse_traceroute(lines):
    """Testing helper: run the parser over `lines` and collect its output in a list."""
    parsed = _parse_traceroute(lines)
    return [*parsed]
# Every example is a single line of traceroute output. The parser ranks
# elements by 1-based line index, so each expected element carries rank=1
# regardless of the hop number printed at the start of the line.
@pytest.mark.parametrize("example, expected", [
    # Basic case, one match
    ("3 10.60.142.2 (10.60.142.2) 117.502 ms",
     [TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.502), rank=1)]),
    # Multiple matches on one line
    ("3 10.60.142.2 (10.60.142.2) 117.502 ms 10.60.142.3 (10.60.142.3) 75.624 ms 10.60.142.2 (10.60.142.2) 117.709 ms",
     [TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.502), rank=1),
      TraceElem("10.60.142.3", "10.60.142.3", timedelta(milliseconds=75.624), rank=1),
      TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.709), rank=1)]),
    # Context sensitive case - traceroute doesn't always print the host & IP.
    ("7 ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130) 41.920 ms 41.893 ms 74.385 ms",
     [TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=41.920), rank=1),
      TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=41.893), rank=1),
      TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=74.385), rank=1)]),
])
def test_examples(example: str, expected: List[TraceElem]):
    """Parsing `example` must produce exactly the `expected` elements, in order."""
    assert parse_traceroute(example.splitlines()) == expected