And done
This commit is contained in:
parent
0016eaa63d
commit
6c0d3dc535
2 changed files with 56 additions and 5 deletions
46
projects/aloe/README.md
Normal file
46
projects/aloe/README.md
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
# Aloe
|
||||||
|
|
||||||
|
> - A [cactus](https://www.cacti.net/)-like plant
|
||||||
|
> - Traditionally used for a variety of skin conditions
|
||||||
|
|
||||||
|
Aloe is a quick and dirty network weathermapping tool, much like MTR or Cacti.
|
||||||
|
Aloe uses multiple threads to first establish a rough network topology via ICMP traceroutes, and then monitor it with ICMP pings.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
``` sh
|
||||||
|
$ bazel build //projects/aloe
|
||||||
|
$ sudo ./bazel-bin/projects/aloe/aloe twitter.com google.com
|
||||||
|
```
|
||||||
|
|
||||||
|
If a host in the topology misses a ping but has been unresponsive for less than 5s (the polling interval is ~1s), it is declared to be in a warning state.
|
||||||
|
If hosts in the topology stop responding for 5s or more, they are declared down.
|
||||||
|
If a host in the topology resumes responding after 5s or more, it is declared to have recovered.
|
||||||
|
If hosts in topology stop responding for 30 min, they are declared dead and monitoring is stopped.
|
||||||
|
The topology is reconfigured every 5 min to account for DHCP and upstream changes.
|
||||||
|
|
||||||
|
A log of all these events is built in a plain-text format to `incidents.txt`.
|
||||||
|
The format of this file is:
|
||||||
|
|
||||||
|
```
|
||||||
|
UP <ip> <date>
|
||||||
|
WARN <ip> <date>
|
||||||
|
DOWN <ip> <date>
|
||||||
|
RECOVERED <ip> <date> <duration>
|
||||||
|
DEAD <ip> <date> <duration>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Future work
|
||||||
|
|
||||||
|
- [ ] Log topology
|
||||||
|
- [ ] Attempt to identify "root cause" incidents in the route graph which explain downstream failures
|
||||||
|
- [ ] Use sqlite3 for aggregation not a plain text file
|
||||||
|
- [ ] Use a more sophisticated counter-and-debounce model of host state in the main thread
|
||||||
|
- [ ] Find some way to incorporate rolling counters (mean, median, stddev, deciles, max) into the UI
|
||||||
|
- [ ] FFS find some way NOT to depend on that nonsense box-diagram service
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
Copyright Reid 'arrdem' McKenzie, 11/20/2021.
|
||||||
|
|
||||||
|
Published under the terms of the MIT license.
|
|
@ -119,7 +119,7 @@ if __name__ == "__main__":
|
||||||
flush_at = now + flush_delay
|
flush_at = now + flush_delay
|
||||||
|
|
||||||
recovered_duration = timedelta(seconds=5)
|
recovered_duration = timedelta(seconds=5)
|
||||||
dead_duration = timedelta(seconds=30)
|
dead_duration = timedelta(minutes=30)
|
||||||
|
|
||||||
topology = None
|
topology = None
|
||||||
id = unique_identifier()
|
id = unique_identifier()
|
||||||
|
@ -159,29 +159,34 @@ if __name__ == "__main__":
|
||||||
try:
|
try:
|
||||||
timestamp, res = q.get(timeout=0.1)
|
timestamp, res = q.get(timeout=0.1)
|
||||||
last = last_seen.get(res.address)
|
last = last_seen.get(res.address)
|
||||||
|
delta = timestamp - last if last else None
|
||||||
|
|
||||||
if res.address not in workers:
|
if res.address not in workers:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
elif res.is_alive:
|
elif res.is_alive:
|
||||||
if last and (delta := timestamp - last) > recovered_duration:
|
last_seen[res.address] = timestamp
|
||||||
|
if last and delta > recovered_duration:
|
||||||
fp.write(
|
fp.write(
|
||||||
f"RECOVERED\t{res.address}\t{timestamp.isoformat()}\t{delta.total_seconds()}\n"
|
f"RECOVERED\t{res.address}\t{timestamp.isoformat()}\t{delta.total_seconds()}\n"
|
||||||
)
|
)
|
||||||
elif not last:
|
elif not last:
|
||||||
fp.write(f"UP\t{res.address}\t{timestamp.isoformat()}\n")
|
fp.write(f"UP\t{res.address}\t{timestamp.isoformat()}\n")
|
||||||
last_seen[res.address] = timestamp
|
|
||||||
|
|
||||||
elif not res.is_alive:
|
elif not res.is_alive:
|
||||||
if last and (delta := timestamp - last) > dead_duration:
|
if last and delta > dead_duration:
|
||||||
workers[h.address].terminate()
|
workers[h.address].terminate()
|
||||||
del workers[h.address]
|
del workers[h.address]
|
||||||
del topology[h.address]
|
del topology[h.address]
|
||||||
|
del last_seen[h.address]
|
||||||
fp.write(
|
fp.write(
|
||||||
f"DEAD\t{res.address}\t{timestamp.isoformat()}\t{delta.total_seconds()}\n"
|
f"DEAD\t{res.address}\t{timestamp.isoformat()}\t{delta.total_seconds()}\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
elif last and delta < recovered:
|
||||||
|
fp.write(f"WARN\t{res.address}\t{timestamp.isoformat()}\n")
|
||||||
|
|
||||||
|
elif last and delta > recovered:
|
||||||
fp.write(f"DOWN\t{res.address}\t{timestamp.isoformat()}\n")
|
fp.write(f"DOWN\t{res.address}\t{timestamp.isoformat()}\n")
|
||||||
|
|
||||||
except queue.Empty:
|
except queue.Empty:
|
||||||
|
|
Loading…
Reference in a new issue