Add overwatch as-is
This commit is contained in:
parent
82d7966046
commit
cfedd826c9
3 changed files with 388 additions and 0 deletions
14
projects/overwatch/README.md
Normal file
14
projects/overwatch/README.md
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
# Overwatch
|
||||||
|
|
||||||
|
> Overwatch is a force protection tactic in modern warfare where one small unit or military vehicle supports another while it is executing fire and movement tactics.
|
||||||
|
> An overwatching unit takes a position where it can observe the terrain ahead, especially likely enemy positions.
|
||||||
|
> This allows it to provide effective covering fire for advancing friendly units.
|
||||||
|
> The term "overwatch" was coined in U.S. military doctrine in the 1950s.
|
||||||
|
|
||||||
|
Overwatch is a monitoring and intervention system.
|
||||||
|
|
||||||
|
Like its namesake, it seeks to provide a baseline of support for other automation - although initial versions will likely implement their own interventions.
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT
|
343
projects/overwatch/bin/overwatchd
Executable file
343
projects/overwatch/bin/overwatchd
Executable file
|
@ -0,0 +1,343 @@
|
||||||
|
#!python3
|
||||||
|
|
||||||
|
"""Evil monitoring.
|
||||||
|
|
||||||
|
Ping hosts, syslogging at INFO if they're up and happy, otherwise using Telnet scripting to force
|
||||||
|
reset them and syslogging at CRIT with what the uptime was prior to forced reboot.
|
||||||
|
|
||||||
|
Hosts are debounced, so that they have a chance to return before monitoring resumes.
|
||||||
|
|
||||||
|
No effort is made to detect network conditions or poweroffs.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import logging
|
||||||
|
from logging.handlers import SysLogHandler
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import signal
|
||||||
|
import socket
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from sys import exit
|
||||||
|
import syslog
|
||||||
|
from telnetlib import Telnet
|
||||||
|
from threading import Event, Lock, Thread
|
||||||
|
from time import sleep
|
||||||
|
|
||||||
|
from kazoo.client import KazooClient
|
||||||
|
from kazoo.exceptions import (
|
||||||
|
ConnectionLoss,
|
||||||
|
LockTimeout,
|
||||||
|
NodeExistsError,
|
||||||
|
NoNodeError,
|
||||||
|
SessionExpiredError
|
||||||
|
)
|
||||||
|
from kazoo.handlers.threading import KazooTimeoutError
|
||||||
|
from kazoo.recipe.lock import Lock as KazooLock
|
||||||
|
from kook.client import KookClient, lock
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
|
log = logging.getLogger("arrdem.overwatchd")
|
||||||
|
|
||||||
|
|
||||||
|
CONFIG = {
|
||||||
|
# APC PDU credentials
|
||||||
|
"pdu_username": "apc",
|
||||||
|
"pdu_password": "debda7f140 -c",
|
||||||
|
|
||||||
|
# Hosts recover in about 40s,
|
||||||
|
# But only stop responding to pings for about 6-8s.
|
||||||
|
"debounce": 40,
|
||||||
|
|
||||||
|
# Once a host is up, 5s of no ping is indicative.
|
||||||
|
"threshold": 5,
|
||||||
|
|
||||||
|
# Number of seconds to wait before refreshing inventory.
|
||||||
|
"loop_trip": 60.0,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def zdec(i: int):
|
||||||
|
"""Decrement, stopping at 0."""
|
||||||
|
|
||||||
|
if i <= 1:
|
||||||
|
return 0
|
||||||
|
else:
|
||||||
|
return i - 1
|
||||||
|
|
||||||
|
|
||||||
|
def ping(hostname: str,
|
||||||
|
count: int = 2,
|
||||||
|
timeout: int = 1):
|
||||||
|
"""Send count packets to a hostname, with a timeout of timeout"""
|
||||||
|
|
||||||
|
try:
|
||||||
|
return subprocess.check_call(["ping", "-q", "-c", str(count), "-W", str(timeout), hostname],
|
||||||
|
stderr=subprocess.DEVNULL,
|
||||||
|
stdout=subprocess.DEVNULL) == 0
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def check_port(hostname: str,
|
||||||
|
timeout: int = 1,
|
||||||
|
port: int = 22,
|
||||||
|
banner: bytes = b"SSH"):
|
||||||
|
try:
|
||||||
|
conn = Telnet(hostname, port)
|
||||||
|
offset, match, data = conn.expect([banner], timeout=timeout)
|
||||||
|
conn.close()
|
||||||
|
return match is not None
|
||||||
|
except ConnectionRefusedError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def do_reboot(client: KookClient, pdu_uri: str, port: int):
|
||||||
|
"""Get a shared lock, telnet to the PDU, reset the port and log out."""
|
||||||
|
|
||||||
|
pdu_hostname, pdu_port = pdu_uri.split(":", 1)
|
||||||
|
pdu_host = client.host(pdu_hostname)
|
||||||
|
log.info(f"Attempting to force a reset port {port} using {pdu_uri}")
|
||||||
|
|
||||||
|
def l(text):
|
||||||
|
return (text + "\r").encode("utf-8")
|
||||||
|
|
||||||
|
def expect(conn, text):
|
||||||
|
offset, match, data = conn.expect([bytes(text)], timeout=1)
|
||||||
|
if offset is None:
|
||||||
|
raise Exception(f"Unable to match pattern {text} in conn {conn}")
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
|
||||||
|
def apc_login(conn):
|
||||||
|
expect(conn, b"User Name")
|
||||||
|
conn.write(l(CONFIG["pdu_username"]))
|
||||||
|
expect(conn, b"Password")
|
||||||
|
conn.write(l(CONFIG["pdu_password"]))
|
||||||
|
|
||||||
|
def apc_command(conn, cmd):
|
||||||
|
expect(conn, b"APC>")
|
||||||
|
conn.write(l(cmd))
|
||||||
|
|
||||||
|
# To ensure only one process logs into the PDU at once
|
||||||
|
with lock(pdu_host.lock):
|
||||||
|
conn = Telnet(pdu_hostname, int(pdu_port))
|
||||||
|
apc_login(conn)
|
||||||
|
apc_command(conn, "reboot " + str(port))
|
||||||
|
apc_command(conn, "quit")
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def host_ignored(host):
|
||||||
|
return {"true": True, "false": False}.get(host.canonicalized_vars().get("overwatchd_ignored"))
|
||||||
|
|
||||||
|
|
||||||
|
def monitor_lock(client, hostname):
|
||||||
|
return KazooLock(client.client, f"/overwatch/host/{hostname}", f"{os.getpid()}@{socket.getfqdn()}")
|
||||||
|
|
||||||
|
|
||||||
|
def monitor(client: KookClient, shutdown: Event, hostname: str):
|
||||||
|
# FIXME (arrdem 2019-06-25):
|
||||||
|
# Attrs could change / host could move and we wouldn't handle that.
|
||||||
|
|
||||||
|
log.info(f"Monitoring {hostname}")
|
||||||
|
|
||||||
|
threshold = CONFIG["threshold"]
|
||||||
|
debounce = timedelta(seconds=CONFIG["debounce"])
|
||||||
|
loop_time = CONFIG["loop_trip"]
|
||||||
|
|
||||||
|
_monitor_lock = monitor_lock(client, hostname)
|
||||||
|
while not shutdown.is_set():
|
||||||
|
try:
|
||||||
|
# Rather tha directly contending forever, we contend on a loop so that
|
||||||
|
# we'll shut down workers in a reasonable amount of time.
|
||||||
|
with lock(_monitor_lock, timeout=2):
|
||||||
|
# Loop variables
|
||||||
|
now = start = datetime.today()
|
||||||
|
counter = 0
|
||||||
|
|
||||||
|
while not shutdown.is_set():
|
||||||
|
host = client.host(hostname)
|
||||||
|
|
||||||
|
# The host has been deleted, abort
|
||||||
|
if host is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
# FIXME (arrdem 2019-08-04):
|
||||||
|
# This could be slow???
|
||||||
|
attrs = host.canonicalized_vars()
|
||||||
|
host_address = attrs.get("host_address")
|
||||||
|
|
||||||
|
# Update the date info
|
||||||
|
now = datetime.today()
|
||||||
|
delta = now - start
|
||||||
|
|
||||||
|
if delta < debounce:
|
||||||
|
sleep(2)
|
||||||
|
|
||||||
|
elif counter >= threshold:
|
||||||
|
# Bounce the box, wait for it to become healthy again and hand off
|
||||||
|
uptime = delta.total_seconds() - counter
|
||||||
|
|
||||||
|
# Use a timeout so that contending host(s) give up.
|
||||||
|
with lock(host.lock):
|
||||||
|
log.critical(f"{hostname} detected unhealthy for {counter}s after {uptime}s up, forcing reboot!")
|
||||||
|
do_reboot(client, attrs.get("pdu_uri"), attrs.get("pdu_outlet"))
|
||||||
|
# Hand off to another worker, who will have to debounce first
|
||||||
|
break
|
||||||
|
|
||||||
|
elif not host.is_locked() and not host_ignored(host):
|
||||||
|
healthy = True
|
||||||
|
|
||||||
|
# FIXME (arrdem 2019-06-26):
|
||||||
|
# Factor these healthchecks out - ideally into a separate system
|
||||||
|
if not ping(host_address):
|
||||||
|
log.warning(f"{hostname} did not return ping")
|
||||||
|
healthy = False
|
||||||
|
|
||||||
|
elif not check_port(host_address, port=22, banner=b"SSH"):
|
||||||
|
log.warning(f"{hostname} did not respond to ssh port knock")
|
||||||
|
healthy = False
|
||||||
|
|
||||||
|
if not healthy:
|
||||||
|
log.warning(f"{hostname} has no active lock, incrementing")
|
||||||
|
counter += 1
|
||||||
|
|
||||||
|
else:
|
||||||
|
log.debug(f"{hostname} seems healthy...")
|
||||||
|
# Otherwise we zdec the score.
|
||||||
|
counter = zdec(counter)
|
||||||
|
|
||||||
|
# delta > debounce implied by if ordering
|
||||||
|
if delta.total_seconds() % 60 // 1 == 0:
|
||||||
|
log.info(f"{hostname} healthy for {delta.total_seconds()}s")
|
||||||
|
|
||||||
|
# If we've taken a full round of keeping an eye on this host and someone else is ready hand off
|
||||||
|
if counter == 0 and delta.total_seconds() > loop_time and len(_monitor_lock.contenders()) > 0:
|
||||||
|
log.info(f"Letting someone else keep an eye on {hostname}")
|
||||||
|
break
|
||||||
|
|
||||||
|
sleep(2)
|
||||||
|
|
||||||
|
sleep(2)
|
||||||
|
|
||||||
|
except (SessionExpiredError, ConnectionLoss):
|
||||||
|
log.critical(f"Monitoring for host {hostname} lost ZK, retrying....")
|
||||||
|
while not shutdown.is_set():
|
||||||
|
try:
|
||||||
|
client.restart()
|
||||||
|
break
|
||||||
|
except KazooTimeoutError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
except LockTimeout:
|
||||||
|
sleep(random.random())
|
||||||
|
|
||||||
|
|
||||||
|
def cli_monitor(client):
|
||||||
|
"""CLI entry point. Monitor all available inventory."""
|
||||||
|
|
||||||
|
children_lock = Lock()
|
||||||
|
children = {}
|
||||||
|
shutdown = Event()
|
||||||
|
|
||||||
|
def _shutdown(*args):
|
||||||
|
shutdown.set()
|
||||||
|
with children_lock:
|
||||||
|
for c in children.values():
|
||||||
|
c.join()
|
||||||
|
exit(0)
|
||||||
|
|
||||||
|
signal.signal(signal.SIGINT, _shutdown)
|
||||||
|
signal.signal(signal.SIGQUIT, _shutdown)
|
||||||
|
|
||||||
|
def monitor_maybe(host):
|
||||||
|
attrs = host.canonicalized_vars()
|
||||||
|
if attrs.get("host_address") and attrs.get("pdu_outlet") and attrs.get("pdu_uri"):
|
||||||
|
log.info(f"Host {host} has requisite k/vs, monitoring...")
|
||||||
|
t = Thread(target=monitor,
|
||||||
|
args=(client, shutdown, host.name,))
|
||||||
|
t.start()
|
||||||
|
children[host.name] = t
|
||||||
|
return True
|
||||||
|
|
||||||
|
def intake_hosts(*args, **kwargs):
|
||||||
|
with children_lock:
|
||||||
|
hosts = list(client.hosts(watch=intake_hosts))
|
||||||
|
random.shuffle(hosts)
|
||||||
|
for host in hosts:
|
||||||
|
if host.name not in children:
|
||||||
|
monitor_maybe(host)
|
||||||
|
|
||||||
|
while not shutdown.is_set():
|
||||||
|
with children_lock:
|
||||||
|
for hostname, thread in list(children.items()):
|
||||||
|
if not thread.is_alive():
|
||||||
|
thread.join()
|
||||||
|
del children[hostname]
|
||||||
|
|
||||||
|
intake_hosts()
|
||||||
|
|
||||||
|
sleep(CONFIG.get("loop_trip"))
|
||||||
|
|
||||||
|
|
||||||
|
def cli_spy(client):
|
||||||
|
"""CLI entry point. Spy on who's doing the monitoring."""
|
||||||
|
|
||||||
|
tree = "/overwatch/host"
|
||||||
|
lock = Lock()
|
||||||
|
locks = {}
|
||||||
|
|
||||||
|
def _locks_thread(*args, **kwargs):
|
||||||
|
with lock:
|
||||||
|
for hostname in client.client.get_children(tree, watch=_locks_thread):
|
||||||
|
if hostname not in locks:
|
||||||
|
locks[hostname] = monitor_lock(client, hostname)
|
||||||
|
|
||||||
|
_locks_thread()
|
||||||
|
|
||||||
|
while True:
|
||||||
|
with lock:
|
||||||
|
for hostname, host_lock in locks.items():
|
||||||
|
contenders = host_lock.contenders()
|
||||||
|
if contenders:
|
||||||
|
holder, *contenders = contenders
|
||||||
|
if contenders:
|
||||||
|
print(f"{hostname: <16} is monitored by {holder}, with {len(contenders)} contenders")
|
||||||
|
else:
|
||||||
|
print(f"{hostname: <16} is monitored by {holder} uncontended")
|
||||||
|
else:
|
||||||
|
print(f"{hostname: <16} is unmonitored")
|
||||||
|
|
||||||
|
sleep(2)
|
||||||
|
os.system("clear")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# Make kazoo shut up some
|
||||||
|
logging.getLogger("kazoo").setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
# We'll be quiet too some
|
||||||
|
log.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
# And forward to syslog
|
||||||
|
fmt = logging.Formatter("%(asctime)s %(relativeCreated)6d %(threadName)s - %(name)s - %(levelname)s - %(message)s")
|
||||||
|
slh = SysLogHandler("/dev/log")
|
||||||
|
#slh = logging.StreamHandler()
|
||||||
|
slh.setFormatter(fmt)
|
||||||
|
log.addHandler(slh)
|
||||||
|
|
||||||
|
client = KookClient()
|
||||||
|
|
||||||
|
if "spy" in sys.argv:
|
||||||
|
cli_spy(client)
|
||||||
|
|
||||||
|
elif "monitor" in sys.argv:
|
||||||
|
cli_monitor(client)
|
||||||
|
|
||||||
|
else:
|
||||||
|
print("Usage: overwatchd [monitor | spy]")
|
||||||
|
exit(1)
|
31
projects/overwatch/setup.py
Normal file
31
projects/overwatch/setup.py
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
from setuptools import setup
|
||||||
|
|
||||||
|
|
||||||
|
setup(
|
||||||
|
name="arrdem.overwatch",
|
||||||
|
# Package metadata
|
||||||
|
version="0.0.7",
|
||||||
|
license="MIT",
|
||||||
|
description="A Kook backed inventory monitoring syste",
|
||||||
|
long_description=open("README.md").read(),
|
||||||
|
long_description_content_type="text/markdown",
|
||||||
|
author="Reid 'arrdem' McKenzie",
|
||||||
|
author_email="me@arrdem.com",
|
||||||
|
url="https://git.arrdem.com/arrdem/overwatch",
|
||||||
|
classifiers=[
|
||||||
|
"License :: OSI Approved :: MIT License",
|
||||||
|
"Development Status :: 3 - Alpha",
|
||||||
|
"Intended Audience :: Developers",
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"Programming Language :: Python :: 3.7",
|
||||||
|
],
|
||||||
|
# Package setup
|
||||||
|
scripts=[
|
||||||
|
"bin/overwatchd",
|
||||||
|
],
|
||||||
|
install_requires=[
|
||||||
|
"arrdem.kook>=0.1.0",
|
||||||
|
"kazoo>=2.6.1",
|
||||||
|
"PyYaml>=5.1.2",
|
||||||
|
],
|
||||||
|
)
|
Loading…
Reference in a new issue