WIP on Aloe
This commit is contained in:
parent
b6b1f23188
commit
a4cd4568cf
6 changed files with 259 additions and 0 deletions
11
projects/aloe/BUILD
Normal file
11
projects/aloe/BUILD
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
# Library target for the Aloe project; only a name is supplied.
py_project(
    name = "lib",
)

# Binary target `aloe`: its entry point is the package's __main__ module and
# it depends on the :lib target above.
zapp_binary(
    name = "aloe",
    main = "src/python/aloe/__main__.py",
    deps = [":lib"],
)
|
14
projects/aloe/NOTES.md
Normal file
14
projects/aloe/NOTES.md
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
|
||||||
|
traceroute to twitter.com (104.244.42.129), 30 hops max, 60 byte packets
|
||||||
|
3 10.60.142.2 (10.60.142.2) 117.502 ms 10.60.142.3 (10.60.142.3) 75.624 ms 10.60.142.2 (10.60.142.2) 117.709 ms
|
||||||
|
4 68.85.107.81 (68.85.107.81) 74.019 ms 75.439 ms 68.85.107.85 (68.85.107.85) 75.275 ms
|
||||||
|
5 68.86.103.9 (68.86.103.9) 75.305 ms 75.333 ms 75.308 ms
|
||||||
|
6 24.124.155.129 (24.124.155.129) 75.281 ms 32.947 ms 35.459 ms
|
||||||
|
7 ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130) 41.920 ms 41.893 ms 74.385 ms
|
||||||
|
8 ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130) 41.818 ms 41.792 ms be-36031-cs03.1601milehigh.co.ibone.comcast.net (96.110.43.249) 41.765 ms
|
||||||
|
9 be-3202-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.118) 43.861 ms 45.557 ms be-36041-cs04.1601milehigh.co.ibone.comcast.net (96.110.43.253) 39.431 ms
|
||||||
|
10 be-3102-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.114) 39.019 ms be-3402-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.126) 39.033 ms be-3302-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.122) 38.965 ms
|
||||||
|
11 sjo-b23-link.ip.twelve99.net (213.155.133.171) 60.909 ms 60.630 ms 173.167.57.142 (173.167.57.142) 38.432 ms
|
||||||
|
12 twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193) 60.311 ms sjo-b23-link.ip.twelve99.net (213.155.133.171) 59.480 ms twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193) 60.263 ms
|
||||||
|
13 twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193) 65.467 ms 65.687 ms *
|
||||||
|
14 * * 104.244.42.129 (104.244.42.129) 63.604 ms
|
137
projects/aloe/src/python/aloe/__main__.py
Normal file
137
projects/aloe/src/python/aloe/__main__.py
Normal file
|
@ -0,0 +1,137 @@
|
||||||
|
"""Aloe - A shitty weathermapping tool.
|
||||||
|
|
||||||
|
Periodically traceroutes the egress network, then pings each hop outward along that route, recording times and hosts which
|
||||||
|
failed to respond. Expects a network in excess of 90% packet delivery, but with variable timings. Intended to probe for
|
||||||
|
when packet delivery latencies radically degrade and maintain a report file.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from ping import ping
|
||||||
|
from traceroute import TraceElem, traceroute
|
||||||
|
from subprocess import CalledProcessError
|
||||||
|
from typing import NamedTuple
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Command line interface: one or more positional host names.
parser = argparse.ArgumentParser()
parser.add_argument("hosts", nargs="+")
|
||||||
|
|
||||||
|
|
||||||
|
def distinct(iter):
    """Return the elements of ``iter`` as a list with repeats dropped.

    The first occurrence of each element is kept, in the order it was seen.
    Elements must be hashable because membership is tracked in a set.
    """
    seen = set()
    unique = []
    for item in iter:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
|
||||||
|
|
||||||
|
|
||||||
|
class Host(NamedTuple):
    """A host seen in one or more traceroutes, with its accumulated latency."""

    hostname: str
    ip: str
    # Rank at which this host was most recently recorded.
    rank: int
    # Sum of every latency sample folded into this record.
    latency: timedelta
    # Number of samples contributing to ``latency``.
    samples: int = 1

    def mean_latency(self):
        """Return the average latency per sample as a ``timedelta``."""
        return self.latency / self.samples


class Topology(object):
    """A directed graph of hosts assembled from traceroute results.

    Nodes are ``Host`` records keyed by IP. Edges run from every host seen at
    one rank of a trace to every host seen at the next rank of that trace,
    starting from ``LOCALHOST`` at rank 0.
    """

    LOCALHOST = Host("localhost", "127.0.0.1", 0, timedelta(seconds=0.1))

    def __init__(self):
        self._graph = defaultdict(set)  # Dict[ip, Set[ip]]
        self._nodes = {self.LOCALHOST.ip: self.LOCALHOST}  # Dict[ip, Host]

    def _link(self, sources, targets):
        """Add an edge from every IP in ``sources`` to every IP in ``targets``."""
        if not targets:
            # Avoid creating empty defaultdict entries for nothing.
            return
        for src in sources:
            self._graph[src].update(targets)

    def add_traceroute(self, trace):
        """Merge one traceroute into the topology.

        ``trace`` is an iterable of elements exposing ``hostname``, ``ip``,
        ``rank`` and ``latency``, expected in non-decreasing rank order.
        Every element updates (or creates) the ``Host`` record for its IP,
        summing latencies and counting samples. Elements whose rank is lower
        than one already seen still update the node table but contribute no
        edges.
        """
        # Consume the iterable up front so an error raised while producing the
        # trace happens before any node or edge is touched.
        trace = list(trace)

        previous = []  # IPs seen at the last completed rank
        current = [self.LOCALHOST.ip]  # IPs seen at the rank being collected
        rank = 0

        for e in trace:
            known = self._nodes.get(e.ip)
            if known is None:
                self._nodes[e.ip] = Host(e.hostname, e.ip, e.rank, e.latency, 1)
            else:
                self._nodes[e.ip] = Host(
                    e.hostname,
                    e.ip,
                    e.rank,
                    e.latency + known.latency,
                    known.samples + 1,
                )

            if e.rank > rank:
                # A new rank begins: the group just finished can now be
                # connected to the one before it.
                self._link(previous, current)
                previous, current = current, []
                rank = e.rank

            if e.rank == rank:
                current.append(e.ip)

        # The last group never sees a "next rank", so connect it here.
        # Without this the final hop of every trace has no inbound edge.
        self._link(previous, current)

    def render(self):
        """Print each node, ordered by rank, with the set of IPs it links to."""
        for n in sorted(self._nodes.values(), key=lambda n: n.rank):
            # .get() keeps printing from inserting empty sets into the graph.
            print(f"{n.hostname} ({n.ip}) => {self._graph.get(n.ip, set())}")
|
||||||
|
|
||||||
|
|
||||||
|
def compute_topology(hostlist):
    """Trace every host in ``hostlist`` and return the discovered hosts by rank.

    Each traceroute is merged into a single ``Topology``; the return value is
    the list of that topology's ``Host`` records sorted by ascending rank.
    """
    merged = Topology()
    for target in hostlist:
        merged.add_traceroute(traceroute(target))

    nodes = list(merged._nodes.values())
    nodes.sort(key=lambda node: node.rank)
    return nodes
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Monitoring loop: periodically recompute the topology, then ping every
    # known host and append unreachable ones to incidents.txt. Runs until the
    # process is interrupted.
    logging.basicConfig(level=logging.INFO)
    opts, args = parser.parse_known_args()

    now = start = datetime.now()
    reconfigure_delay = timedelta(minutes=5)
    # Backdate the first deadline so the topology is computed on the first pass.
    configure_at = now - reconfigure_delay

    topology = []

    with open("incidents.txt", "a") as fp:
        while True:
            now = datetime.now()

            if configure_at <= now:
                log.info("Attempting to reconfigure network topology...")
                try:
                    topology = compute_topology(opts.hosts)
                    configure_at = now + reconfigure_delay
                    for h in topology:
                        log.info("in topology %s", h)
                except CalledProcessError as e:
                    # Keep using the previous topology. configure_at is left
                    # untouched, so the next pass retries immediately.
                    # NOTE(review): with an empty topology this retries
                    # without any delay -- consider a back-off.
                    log.warning("Failed to recompute topology (%s); keeping the previous one", e)

            for h in topology:
                # NOTE(review): rank 0 appears to be the synthetic localhost
                # entry, which is not worth pinging -- confirm.
                if h.rank == 0:
                    continue

                fail = False
                try:
                    # Allow twice the host's observed mean latency.
                    if ping(h.ip, timeout=h.mean_latency() * 2) != 0:
                        fail = True
                except Exception as e:
                    fail = True
                    log.exception(e)

                if fail:
                    msg = f"{datetime.now()} failed to reach {h.hostname} ({h.ip})"
                    log.warning(msg)
                    fp.write(msg + "\n")
                    # The loop never exits normally, so push each incident to
                    # disk as soon as it is recorded.
                    fp.flush()
                else:
                    sys.stderr.write('.')
                    sys.stderr.flush()
|
18
projects/aloe/src/python/ping.py
Normal file
18
projects/aloe/src/python/ping.py
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
"""A shitty ping wrapper."""
|
||||||
|
|
||||||
|
|
||||||
|
from datetime import timedelta
|
||||||
|
from subprocess import check_call, DEVNULL
|
||||||
|
|
||||||
|
|
||||||
|
def ping(host: str,
         count: int = 3,
         interval: float = 0.3,
         timeout: timedelta = timedelta(seconds=3)):
    """Run the system ``ping`` against ``host`` and return its exit status.

    Args:
        host: Host name or address handed to ``ping``.
        count: Value for ``-c``.
        interval: Value for ``-i``.
        timeout: Value for ``-W``, passed as (possibly fractional) seconds.

    Returns:
        The exit status of the ``ping`` process. A non-zero status is returned
        to the caller rather than raised, so ``ping(...) != 0`` is a
        meaningful check.

    The command's stdout and stderr are discarded.
    """
    # Imported locally because the module-level import only brings in
    # check_call, which raises on a non-zero exit instead of returning it.
    from subprocess import call

    command = [
        "ping", "-q",
        "-i", str(interval),
        "-c", str(count),
        "-W", str(timeout.total_seconds()),
        host,
    ]
    return call(command, stdout=DEVNULL, stderr=DEVNULL)
|
48
projects/aloe/src/python/traceroute.py
Normal file
48
projects/aloe/src/python/traceroute.py
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
"""A shitty traceroute wrapper."""
|
||||||
|
|
||||||
|
from datetime import timedelta
|
||||||
|
import re
|
||||||
|
from subprocess import (
|
||||||
|
CalledProcessError,
|
||||||
|
check_call,
|
||||||
|
check_output,
|
||||||
|
DEVNULL,
|
||||||
|
)
|
||||||
|
from typing import Iterator, List, NamedTuple
|
||||||
|
|
||||||
|
|
||||||
|
class TraceElem(NamedTuple):
    """One probe reply parsed from traceroute output."""

    hostname: str
    ip: str
    latency: timedelta
    # 1-based position of the originating line in the parsed input. This is
    # the line's index, not the hop number printed at the start of the line.
    rank: int


# Matches either a lost probe ("*") or a reply: an optional "hostname (ip)"
# prefix followed by "<number> ms".
_LINE = re.compile(r"\*|(((?P<hostname>[-_\w\d\.]*)\s+\((?P<ip>[a-f\d\.:]*)\)\s+)?(?P<latency>[\d\.]*) ms)")


def _parse_traceroute(lines: List[str]) -> Iterator[TraceElem]:
    """Yield a ``TraceElem`` for every latency reading found in ``lines``.

    Each line is scanned for replies. traceroute prints the host and IP only
    when they differ from the previous reply on the same line, so the most
    recent host and IP on a line are carried forward to later readings on
    that line. Lost probes ("*") and lines with no readings yield nothing,
    but such lines still advance the rank.
    """
    for rank, line in enumerate(lines, start=1):
        ip = None
        hostname = None
        for m in _LINE.finditer(line):
            latency = m.group("latency")
            if not latency:
                # A "*" match (or an empty number): no reply to record.
                continue
            ip = m.group("ip") or ip
            hostname = m.group("hostname") or hostname
            yield TraceElem(hostname, ip, timedelta(milliseconds=float(latency)), rank)
|
||||||
|
|
||||||
|
|
||||||
|
def traceroute(host: str, icmp=True, timeout=timedelta(seconds=5)) -> Iterator[TraceElem]:
    """Run the system ``traceroute`` against ``host`` and yield parsed replies.

    Args:
        host: Destination handed to ``traceroute``.
        icmp: When true (the default) pass ``-I`` so ICMP probes are used,
            the same as ping. All probed hosts will then be pingable, but
            parts of the topology may be missed as a result. When false the
            flag is omitted and traceroute uses its own default probe type.
        timeout: Value for ``-w``, passed as (possibly fractional) seconds.

    Yields:
        ``TraceElem`` records produced by ``_parse_traceroute``.

    Raises:
        CalledProcessError: if ``traceroute`` exits non-zero. Because this is
            a generator, the command only runs once iteration starts.
    """
    command = [
        "traceroute",
        # Set wait; note use of total_seconds which is "real" valued.
        "-w", str(timeout.total_seconds()),
    ]
    if icmp:
        command.append("-I")
    command.append(host)

    output = check_output(command, stderr=DEVNULL)
    yield from _parse_traceroute(output.decode("utf-8").splitlines())
|
31
projects/aloe/test/python/test_traceroute.py
Normal file
31
projects/aloe/test/python/test_traceroute.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
from traceroute import _parse_traceroute, TraceElem
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def parse_traceroute(lines):
    """Testing helper: run the parser over ``lines`` and collect the results in a list."""
    return [elem for elem in _parse_traceroute(lines)]
|
||||||
|
|
||||||
|
|
||||||
|
# Every example is a single line, and the parser ranks by line position,
# so each expected element carries rank 1 regardless of the hop number
# printed at the start of the line.
@pytest.mark.parametrize("example, expected", [
    # Basic case, one match
    ("3 10.60.142.2 (10.60.142.2) 117.502 ms",
     [TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.502), 1)]),
    # Multiple matches on one line
    ("3 10.60.142.2 (10.60.142.2) 117.502 ms 10.60.142.3 (10.60.142.3) 75.624 ms 10.60.142.2 (10.60.142.2) 117.709 ms",
     [TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.502), 1),
      TraceElem("10.60.142.3", "10.60.142.3", timedelta(milliseconds=75.624), 1),
      TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.709), 1)]),
    # Context sensitive case - traceroute doesn't always print the host & IP.
    ("7 ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130) 41.920 ms 41.893 ms 74.385 ms",
     [TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=41.920), 1),
      TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=41.893), 1),
      TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=74.385), 1)]),
])
def test_examples(example: str, expected: List[TraceElem]):
    """The parser must turn each example line into exactly the expected elements."""
    assert parse_traceroute(example.splitlines()) == expected
|
Loading…
Reference in a new issue