And done
This commit is contained in:
parent
0016eaa63d
commit
6c0d3dc535
2 changed files with 56 additions and 5 deletions
46
projects/aloe/README.md
Normal file
46
projects/aloe/README.md
Normal file
|
@ -0,0 +1,46 @@
|
|||
# Aloe
|
||||
|
||||
> - A [cactus](https://www.cacti.net/)-like plant
|
||||
> - Traditionally used for a variety of skin conditions
|
||||
|
||||
Aloe is a quick and dirty network weathermapping tool, much like MTR or Cacti.
|
||||
Aloe uses multiple threads to first establish a rough network topology via ICMP traceroutes, and then monitor it with ICMP pings.
|
||||
|
||||
## Usage
|
||||
|
||||
``` sh
|
||||
$ bazel build //projects/aloe
|
||||
$ sudo ./bazel-bin/projects/aloe/aloe twitter.com google.com
|
||||
```
|
||||
|
||||
If hosts in the topology stop responding for 1s or more (the polling interval is ~1s), they are declared to be in a warning state.
|
||||
If hosts in the topology stop responding for 5s or more, they are declared down.
|
||||
If a host in the topology resumes responding after being unresponsive for 5s or more, it is declared to have recovered.
|
||||
If hosts in the topology stop responding for 30 min, they are declared dead and monitoring is stopped.
|
||||
The topology is reconfigured every 5 min to account for DHCP and upstream changes.
|
||||
|
||||
A log of all these events is built in a plain-text format to `incidents.txt`.
|
||||
The format of this file is -
|
||||
|
||||
```
|
||||
UP <ip> <date>
|
||||
WARN <ip> <date>
|
||||
DOWN <ip> <date>
|
||||
RECOVERED <ip> <date> <duration>
|
||||
DEAD <ip> <date> <duration>
|
||||
```
|
||||
|
||||
## Future work
|
||||
|
||||
- [ ] Log topology
|
||||
- [ ] Attempt to identify "root cause" incidents in the route graph which explain downstream failures
|
||||
- [ ] Use sqlite3 for aggregation instead of a plain text file
|
||||
- [ ] Use a more sophisticated counter and debounce model of host state in the main thread
|
||||
- [ ] Find some way to incorporate rolling counters (mean, median, stddev, deciles, max) into the UI
|
||||
- [ ] FFS find some way NOT to depend on that nonsense box-diagram service
|
||||
|
||||
## License
|
||||
|
||||
Copyright Reid 'arrdem' McKenzie, 11/20/2021.
|
||||
|
||||
Published under the terms of the MIT license.
|
|
@ -119,7 +119,7 @@ if __name__ == "__main__":
|
|||
flush_at = now + flush_delay
|
||||
|
||||
recovered_duration = timedelta(seconds=5)
|
||||
dead_duration = timedelta(seconds=30)
|
||||
dead_duration = timedelta(minutes=30)
|
||||
|
||||
topology = None
|
||||
id = unique_identifier()
|
||||
|
@ -159,29 +159,34 @@ if __name__ == "__main__":
|
|||
try:
|
||||
timestamp, res = q.get(timeout=0.1)
|
||||
last = last_seen.get(res.address)
|
||||
delta = timestamp - last if last else None
|
||||
|
||||
if res.address not in workers:
|
||||
pass
|
||||
|
||||
elif res.is_alive:
|
||||
if last and (delta := timestamp - last) > recovered_duration:
|
||||
last_seen[res.address] = timestamp
|
||||
if last and delta > recovered_duration:
|
||||
fp.write(
|
||||
f"RECOVERED\t{res.address}\t{timestamp.isoformat()}\t{delta.total_seconds()}\n"
|
||||
)
|
||||
elif not last:
|
||||
fp.write(f"UP\t{res.address}\t{timestamp.isoformat()}\n")
|
||||
last_seen[res.address] = timestamp
|
||||
|
||||
elif not res.is_alive:
|
||||
if last and (delta := timestamp - last) > dead_duration:
|
||||
if last and delta > dead_duration:
|
||||
workers[h.address].terminate()
|
||||
del workers[h.address]
|
||||
del topology[h.address]
|
||||
del last_seen[h.address]
|
||||
fp.write(
|
||||
f"DEAD\t{res.address}\t{timestamp.isoformat()}\t{delta.total_seconds()}\n"
|
||||
)
|
||||
|
||||
else:
|
||||
elif last and delta < recovered:
|
||||
fp.write(f"WARN\t{res.address}\t{timestamp.isoformat()}\n")
|
||||
|
||||
elif last and delta > recovered:
|
||||
fp.write(f"DOWN\t{res.address}\t{timestamp.isoformat()}\n")
|
||||
|
||||
except queue.Empty:
|
||||
|
|
Loading…
Reference in a new issue