From 74d543824b16106ce7e0425b132f72e0f0f64919 Mon Sep 17 00:00:00 2001 From: Reid 'arrdem' McKenzie Date: Sat, 20 Nov 2021 23:06:26 -0700 Subject: [PATCH] And done --- projects/aloe/README.md | 46 +++++++++++++++++++++++ projects/aloe/src/python/aloe/__main__.py | 15 +++++--- 2 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 projects/aloe/README.md diff --git a/projects/aloe/README.md b/projects/aloe/README.md new file mode 100644 index 0000000..bdbb7d5 --- /dev/null +++ b/projects/aloe/README.md @@ -0,0 +1,46 @@ +# Aloe + +> - A [cactus](https://www.cacti.net/)-like plant +> - Traditionally used for a variety of skin conditions + +Aloe is a quick and dirty network weathermapping tool, much like MTR or Cacti. +Aloe uses multiple threads to first establish a rough network topology via ICMP traceroutes, and then monitor it with ICMP pings. + +## Usage + +``` sh +$ bazel build //projects/aloe +$ sudo ./bazel-bin/projects/aloe/aloe twitter.com google.com +``` + +If hosts in topology miss a poll but have been unresponsive for less than 5s (the polling interval is ~1s), they are declared to be warning. +If hosts in topology stop responding for more than 5s, they are declared down. +If hosts in topology resume responding after 5s or more, they are declared to have recovered. +If hosts in topology stop responding for 30 min, they are declared dead and monitoring is stopped. +The topology is reconfigured every 5 min to account for DHCP and upstream changes. + +A log of all these events is written in a plain-text format to `incidents.txt`. 
+The format of this file is - + +``` +UP +WARN +DOWN +RECOVERED +DEAD +``` + +## Future work + +- [ ] Log topology +- [ ] Attempt to identify "root cause" incidents in the route graph which explain downstream failures +- [ ] Use sqlite3 for aggregation instead of a plain text file +- [ ] Use more sophisticated counters and a debounce model of host state in the main thread +- [ ] Find some way to incorporate rolling counters (mean, median, stddev, deciles, max) into the UI +- [ ] FFS find some way NOT to depend on that nonsense box-diagram service + +## License + +Copyright Reid 'arrdem' McKenzie, 11/20/2021. + +Published under the terms of the MIT license. diff --git a/projects/aloe/src/python/aloe/__main__.py b/projects/aloe/src/python/aloe/__main__.py index 2bde754..fb8e4ea 100644 --- a/projects/aloe/src/python/aloe/__main__.py +++ b/projects/aloe/src/python/aloe/__main__.py @@ -119,7 +119,7 @@ if __name__ == "__main__": flush_at = now + flush_delay recovered_duration = timedelta(seconds=5) - dead_duration = timedelta(seconds=30) + dead_duration = timedelta(minutes=30) topology = None id = unique_identifier() @@ -159,29 +159,34 @@ if __name__ == "__main__": try: timestamp, res = q.get(timeout=0.1) last = last_seen.get(res.address) + delta = timestamp - last if last else None if res.address not in workers: pass elif res.is_alive: - if last and (delta := timestamp - last) > recovered_duration: + last_seen[res.address] = timestamp + if last and delta > recovered_duration: fp.write( f"RECOVERED\t{res.address}\t{timestamp.isoformat()}\t{delta.total_seconds()}\n" ) elif not last: fp.write(f"UP\t{res.address}\t{timestamp.isoformat()}\n") - last_seen[res.address] = timestamp elif not res.is_alive: - if last and (delta := timestamp - last) > dead_duration: + if last and delta > dead_duration: workers[h.address].terminate() del workers[h.address] del topology[h.address] + del last_seen[h.address] fp.write( f"DEAD\t{res.address}\t{timestamp.isoformat()}\t{delta.total_seconds()}\n" 
) - else: + elif last and delta < recovered: + fp.write(f"WARN\t{res.address}\t{timestamp.isoformat()}\n") + + elif last and delta > recovered: fp.write(f"DOWN\t{res.address}\t{timestamp.isoformat()}\n") except queue.Empty: