WIP on Aloe
This commit is contained in:
parent
d49034c0fa
commit
8469ab7758
6 changed files with 259 additions and 0 deletions
11
projects/aloe/BUILD
Normal file
11
projects/aloe/BUILD
Normal file
|
@ -0,0 +1,11 @@
|
|||
# Declares the ":lib" target for this project.
# NOTE(review): py_project is a macro defined elsewhere in the repository;
# presumably it collects the sources under src/python and test/python - confirm.
py_project(
    name = "lib"
)

# Declares the "aloe" binary target: its entry point is aloe's __main__ module
# and it depends on the ":lib" target declared above.
zapp_binary(
    name = "aloe",
    main = "src/python/aloe/__main__.py",
    deps = [
        ":lib",
    ],
)
|
14
projects/aloe/NOTES.md
Normal file
14
projects/aloe/NOTES.md
Normal file
|
@ -0,0 +1,14 @@
|
|||
|
||||
traceroute to twitter.com (104.244.42.129), 30 hops max, 60 byte packets
|
||||
3 10.60.142.2 (10.60.142.2) 117.502 ms 10.60.142.3 (10.60.142.3) 75.624 ms 10.60.142.2 (10.60.142.2) 117.709 ms
|
||||
4 68.85.107.81 (68.85.107.81) 74.019 ms 75.439 ms 68.85.107.85 (68.85.107.85) 75.275 ms
|
||||
5 68.86.103.9 (68.86.103.9) 75.305 ms 75.333 ms 75.308 ms
|
||||
6 24.124.155.129 (24.124.155.129) 75.281 ms 32.947 ms 35.459 ms
|
||||
7 ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130) 41.920 ms 41.893 ms 74.385 ms
|
||||
8 ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130) 41.818 ms 41.792 ms be-36031-cs03.1601milehigh.co.ibone.comcast.net (96.110.43.249) 41.765 ms
|
||||
9 be-3202-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.118) 43.861 ms 45.557 ms be-36041-cs04.1601milehigh.co.ibone.comcast.net (96.110.43.253) 39.431 ms
|
||||
10 be-3102-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.114) 39.019 ms be-3402-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.126) 39.033 ms be-3302-pe02.910fifteenth.co.ibone.comcast.net (96.110.38.122) 38.965 ms
|
||||
11 sjo-b23-link.ip.twelve99.net (213.155.133.171) 60.909 ms 60.630 ms 173.167.57.142 (173.167.57.142) 38.432 ms
|
||||
12 twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193) 60.311 ms sjo-b23-link.ip.twelve99.net (213.155.133.171) 59.480 ms twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193) 60.263 ms
|
||||
13 twitter-ic322868-sjo-b21.ip.twelve99-cust.net (62.115.49.193) 65.467 ms 65.687 ms *
|
||||
14 * * 104.244.42.129 (104.244.42.129) 63.604 ms
|
137
projects/aloe/src/python/aloe/__main__.py
Normal file
137
projects/aloe/src/python/aloe/__main__.py
Normal file
|
@ -0,0 +1,137 @@
|
|||
"""Aloe - A shitty weathermapping tool.
|
||||
|
||||
Periodically traceroutes the egress network, and then pings each hop along the egress path in turn, recording times and
hosts which failed to respond. Expects a network with in excess of 90% packet delivery, but with variable timings.
Intended to probe for when packet delivery latencies radically degrade, and to maintain a report file.
|
||||
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from ping import ping
|
||||
from traceroute import TraceElem, traceroute
|
||||
from subprocess import CalledProcessError
|
||||
from typing import NamedTuple
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Command line interface: one or more positional `hosts` (nargs="+").
# The script entry point below hands `opts.hosts` to compute_topology().
parser = argparse.ArgumentParser()
parser.add_argument("hosts", nargs="+")
|
||||
|
||||
|
||||
def distinct(iter):
    """Return the elements of `iter` as a list with repeated values dropped.

    Only the first occurrence of each value is kept, and the values appear in
    the order in which they were first seen. Elements must be hashable.
    """

    # A dict keeps its keys in insertion order and ignores repeated keys,
    # which is exactly "first occurrence wins, order preserved".
    return list(dict.fromkeys(iter))
|
||||
|
||||
|
||||
class Host(NamedTuple):
    """A network node together with its accumulated latency measurements."""

    hostname: str  # name reported for the node
    ip: str  # address of the node
    rank: int  # ordering position of the node; localhost uses 0
    latency: timedelta  # total latency accumulated over `samples` measurements
    samples: int = 1  # how many measurements `latency` covers

    def mean_latency(self):
        """Return the average latency per sample, i.e. `latency / samples`."""

        total, count = self.latency, self.samples
        return total / count
|
||||
|
||||
|
||||
class Topology(object):
    """A directed graph of the hosts seen across traceroutes, rooted at localhost.

    `_nodes` maps an IP to the Host record accumulated for it, and `_graph`
    maps an IP to the set of IPs observed at the next rank of some trace.
    """

    LOCALHOST = Host("localhost", "127.0.0.1", 0, timedelta(seconds=0.1))

    def __init__(self):
        self._graph = defaultdict(set)  # Dict[ip, Set[ip]]
        self._nodes = {self.LOCALHOST.ip: self.LOCALHOST}  # Dict[ip, Host]

    def _link(self, sources, targets):
        """Add an edge from every IP in `sources` to every IP in `targets`."""

        if not targets:
            return
        for src in sources:
            self._graph[src].update(targets)

    def add_traceroute(self, trace):
        """Fold one traceroute into the topology.

        `trace` is an iterable of elements exposing `hostname`, `ip`, `rank`
        and `latency`, expected in non-decreasing rank order. Each element
        updates the Host stored for its IP (latency is summed and the sample
        count bumped), and all hosts of one rank are linked to all hosts of
        the next rank seen, starting from localhost.
        """

        # Materialize first so a failure while producing the trace cannot
        # leave the topology half-updated.
        trace = list(trace)
        previous = []
        current = [self.LOCALHOST.ip]
        rank = 0
        for e in trace:
            known = self._nodes.get(e.ip)
            if known is None:
                self._nodes[e.ip] = Host(e.hostname, e.ip, e.rank, e.latency, 1)
            else:
                self._nodes[e.ip] = Host(e.hostname, e.ip, e.rank, e.latency + known.latency, known.samples + 1)

            if e.rank > rank:
                # A new rank begins: the rank just completed hangs off the one before it.
                self._link(previous, current)
                previous, current = current, []
                rank = e.rank

            if e.rank == rank:
                current.append(e.ip)

        # Flush the final rank. Without this the last hop(s) of the trace
        # would never receive an edge from the hop before them.
        self._link(previous, current)

    def render(self):
        """Print every known host, in rank order, with the set of IPs it links to."""

        for n in sorted(self._nodes.values(), key=lambda n: n.rank):
            # .get() rather than indexing, so that rendering does not insert
            # empty entries into the defaultdict.
            print(f"{n.hostname} ({n.ip}) => {self._graph.get(n.ip, set())}")
|
||||
|
||||
|
||||
def compute_topology(hostlist):
    """Traceroute every host in `hostlist` and return the hosts seen, ordered by rank.

    All traces are folded into a single Topology; the result is the list of
    that topology's Host records sorted by ascending `rank`.
    """

    graph = Topology()
    for destination in hostlist:
        route = traceroute(destination)
        graph.add_traceroute(route)

    def by_rank(node):
        return node.rank

    nodes = list(graph._nodes.values())
    nodes.sort(key=by_rank)
    return nodes
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: rediscover the topology every `reconfigure_delay`,
    # and in between ping every known hop, appending unreachable hosts to
    # incidents.txt.
    import time

    logging.basicConfig(level=logging.INFO)
    opts, _unknown = parser.parse_known_args()

    reconfigure_delay = timedelta(minutes=5)
    # Backdate the deadline so the first pass through the loop discovers the topology.
    configure_at = datetime.now() - reconfigure_delay

    topology = []

    with open("incidents.txt", "a") as fp:
        while True:
            now = datetime.now()

            if configure_at <= now:
                log.info("Attempting to reconfigure network topology...")
                try:
                    topology = compute_topology(opts.hosts)
                except CalledProcessError as e:
                    # Best effort: keep probing the previous topology and try
                    # again on the next pass, but say so instead of hiding it.
                    log.warning("traceroute failed (%s); keeping the previous topology", e)
                else:
                    configure_at = now + reconfigure_delay
                    for h in topology:
                        log.info(f"in topology {h}")

            if not topology:
                # Nothing to probe yet (the initial traceroute failed); back
                # off briefly rather than spinning on traceroute.
                time.sleep(1)
                continue

            for h in topology:
                if h.rank == 0:
                    # Rank 0 is localhost; there is nothing to learn from pinging it.
                    continue

                fail = False
                try:
                    if ping(h.ip, timeout=h.mean_latency() * 2) != 0:
                        fail = True
                except CalledProcessError:
                    # ping exiting non-zero is the expected "host did not
                    # answer" signal, not a crash worth a traceback.
                    fail = True
                except Exception as e:
                    fail = True
                    log.exception(e)

                if fail:
                    msg = f"{datetime.now()} failed to reach {h.hostname} ({h.ip})"
                    log.warning(msg)
                    fp.write(msg + "\n")
                    # Flush so the report survives the process being killed.
                    fp.flush()

                else:
                    sys.stderr.write('.')
                    sys.stderr.flush()
|
18
projects/aloe/src/python/ping.py
Normal file
18
projects/aloe/src/python/ping.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
"""A shitty ping wrapper."""
|
||||
|
||||
|
||||
from datetime import timedelta
|
||||
from subprocess import check_call, DEVNULL
|
||||
|
||||
|
||||
def ping(host: str,
         count: int = 3,
         interval: float = 0.3,
         timeout: timedelta = timedelta(seconds=3)):
    """Run the system `ping` command against `host`, discarding its output.

    `interval` is passed as `-i`, `count` as `-c` and `timeout.total_seconds()`
    as `-W`. Returns the exit status from `check_call` (0 on success); a
    non-zero exit raises `CalledProcessError`.
    """

    wait_seconds = timeout.total_seconds()
    argv = ["ping", "-q"]
    argv += ["-i", str(interval)]
    argv += ["-c", str(count)]
    argv += ["-W", str(wait_seconds)]
    argv.append(host)
    return check_call(argv, stdout=DEVNULL, stderr=DEVNULL)
|
48
projects/aloe/src/python/traceroute.py
Normal file
48
projects/aloe/src/python/traceroute.py
Normal file
|
@ -0,0 +1,48 @@
|
|||
"""A shitty traceroute wrapper."""
|
||||
|
||||
from datetime import timedelta
|
||||
import re
|
||||
from subprocess import (
|
||||
CalledProcessError,
|
||||
check_call,
|
||||
check_output,
|
||||
DEVNULL,
|
||||
)
|
||||
from typing import Iterator, List, NamedTuple
|
||||
|
||||
|
||||
class TraceElem(NamedTuple):
    """One latency sample parsed from a line of traceroute output."""

    hostname: str  # host name printed before the parenthesised address
    ip: str  # address printed in parentheses after the host name
    latency: timedelta  # the sample's "<n> ms" value
    rank: int  # 1-based index of the output line the sample was parsed from
|
||||
|
||||
|
||||
# Matches one probe result within a line of traceroute output: either a bare
# "*", or a "<latency> ms" sample optionally preceded by "<hostname> (<ip>)".
_LINE = re.compile(r"\*|(((?P<hostname>[-_\w\d\.]*)\s+\((?P<ip>[a-f\d\.:]*)\)\s+)?(?P<latency>[\d\.]*) ms)")
|
||||
|
||||
|
||||
def _parse_traceroute(lines: List[str]) -> Iterator[TraceElem]:
    """Yield a TraceElem for every latency sample found in `lines`.

    Each line is scanned with `_LINE`. The rank of a sample is the 1-based
    index of the line it appears on. Matches without a latency (such as "*")
    produce nothing.
    """

    for rank, text in enumerate(lines, start=1):
        # A sample may omit "host (ip)"; in that case the most recent host
        # and address seen on the same line are reused.
        current_ip = current_hostname = None
        for match in _LINE.finditer(text):
            latency_ms = match.group("latency")
            if not latency_ms:
                continue
            current_ip = match.group("ip") or current_ip
            current_hostname = match.group("hostname") or current_hostname
            yield TraceElem(current_hostname, current_ip, timedelta(milliseconds=float(latency_ms)), rank)
|
||||
|
||||
|
||||
def traceroute(host: str, icmp=True, timeout=timedelta(seconds=5)) -> Iterator[TraceElem]:
    """Run the system `traceroute` against `host` and yield its parsed samples.

    Args:
        host: destination passed to traceroute.
        icmp: when true (the default) pass `-I` so probes are ICMP, same as
            ping. This means all probed hosts will be pingable/ping compliant,
            but may miss parts of the topology as a result. When false, `-I`
            is omitted and traceroute uses its own default probe method.
        timeout: passed as `-w`; note use of total_seconds which is "real" valued.

    Raises:
        CalledProcessError: if traceroute exits non-zero. As this is a
            generator, the command only runs once iteration starts.
    """

    argv = ["traceroute", "-w", str(timeout.total_seconds())]
    if icmp:
        argv.append("-I")
    argv.append(host)

    output = check_output(argv, stderr=DEVNULL)
    yield from _parse_traceroute(output.decode("utf-8").splitlines())
|
31
projects/aloe/test/python/test_traceroute.py
Normal file
31
projects/aloe/test/python/test_traceroute.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from typing import List
|
||||
from traceroute import _parse_traceroute, TraceElem
|
||||
from datetime import timedelta
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def parse_traceroute(lines):
    """Testing helper: run the lazy parser to completion and return a list."""

    return [*_parse_traceroute(lines)]
|
||||
|
||||
|
||||
# Every example is a single line of output, and the parser ranks samples by
# 1-based line index, so each expected TraceElem carries rank 1. TraceElem has
# no default for `rank`, so it must be given explicitly.
@pytest.mark.parametrize("example, expected", [
    # Basic case, one match
    ("3 10.60.142.2 (10.60.142.2) 117.502 ms",
     [TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.502), 1)]),
    # Multiple matches on one line
    ("3 10.60.142.2 (10.60.142.2) 117.502 ms 10.60.142.3 (10.60.142.3) 75.624 ms 10.60.142.2 (10.60.142.2) 117.709 ms",
     [TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.502), 1),
      TraceElem("10.60.142.3", "10.60.142.3", timedelta(milliseconds=75.624), 1),
      TraceElem("10.60.142.2", "10.60.142.2", timedelta(milliseconds=117.709), 1)]),
    # Context sensitive case - traceroute doesn't always print the host & IP.
    ("7 ae-501-ar01.denver.co.denver.comcast.net (96.216.22.130) 41.920 ms 41.893 ms 74.385 ms",
     [TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=41.920), 1),
      TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=41.893), 1),
      TraceElem("ae-501-ar01.denver.co.denver.comcast.net", "96.216.22.130", timedelta(milliseconds=74.385), 1)]),
])
def test_examples(example: str, expected: List[TraceElem]):
    """The parser yields one TraceElem per latency sample, in order of appearance."""

    assert parse_traceroute(example.splitlines()) == expected
|
Loading…
Reference in a new issue