#!python3

"""Evil monitoring.

Ping hosts and knock on their SSH port, syslogging at INFO while they're up and happy. Otherwise
use Telnet scripting against the PDU to force-reset them, syslogging at CRIT with what the uptime
was prior to the forced reboot.

Hosts are debounced, so that they have a chance to return before monitoring resumes.

No effort is made to detect network conditions or poweroffs.

"""

from datetime import datetime, timedelta
import logging
from logging.handlers import SysLogHandler
import os
import random
import signal
import socket
import subprocess
import sys
from sys import exit
from telnetlib import Telnet
from threading import Event, Lock, Thread
from time import sleep

from kazoo.exceptions import (
    ConnectionLoss,
    LockTimeout,
    SessionExpiredError,
)
from kazoo.handlers.threading import (
    KazooTimeoutError,
)
from kazoo.recipe.lock import Lock as KazooLock
from kook.client import KookClient, lock


log = logging.getLogger("arrdem.overwatchd")


# Process-wide settings read by the reboot and monitoring code below.
CONFIG = {
  # APC PDU credentials.
  # These may be supplied through the environment so that secrets need not live
  # in source control; the literals are only fallbacks kept for compatibility.
  "pdu_username": os.environ.get("OVERWATCHD_PDU_USERNAME", "apc"),
  "pdu_password": os.environ.get("OVERWATCHD_PDU_PASSWORD", "debda7f140 -c"),

  # Hosts recover in about 40s,
  # But only stop responding to pings for about 6-8s.
  "debounce": 40,

  # Once a host is up, 5s of no ping is indicative.
  "threshold": 5,

  # Number of seconds to wait before refreshing inventory.
  "loop_trip": 60.0,
}


def zdec(i: int):
  """Return one less than ``i``, but never anything below zero."""

  # Anything at or under 1 bottoms out at 0 rather than going negative.
  return 0 if i <= 1 else i - 1


def ping(hostname: str,
         count: int = 2,
         timeout: int = 1):
  """Run the system ``ping`` against hostname and report whether it succeeded.

  ``count`` is passed as ``-c`` and ``timeout`` as ``-W``; all output from the
  child process is discarded. Returns True only when ``ping`` exits with
  status 0, False for any other exit status.
  """

  command = ["ping", "-q", "-c", str(count), "-W", str(timeout), hostname]
  exit_status = subprocess.call(command,
                                stdout=subprocess.DEVNULL,
                                stderr=subprocess.DEVNULL)
  return exit_status == 0


def check_port(hostname: str,
               timeout: int = 1,
               port: int = 22,
               banner: bytes = b"SSH"):
  """Knock on a TCP port and report whether the expected banner shows up.

  Connects to ``hostname:port`` and waits up to ``timeout`` seconds for data
  matching ``banner`` (handed to ``Telnet.expect``, so it is treated as a
  regular expression over bytes).

  Returns True if the banner was seen. Returns False if it was not seen in
  time, or if the connection could not be made or was dropped -- a health
  check should report "unhealthy" rather than raise for refused connections,
  timeouts, unreachable hosts, name resolution failures or an immediate close.
  """

  try:
    # The timeout is applied to the connect too, so a filtered port cannot
    # stall the caller; the context manager closes the socket even when
    # expect() raises.
    with Telnet(hostname, port, timeout) as conn:
      _offset, match, _data = conn.expect([banner], timeout=timeout)
      return match is not None
  except (OSError, EOFError):
    # OSError covers ConnectionRefusedError, socket timeouts and resolver
    # errors; EOFError is what expect() raises when the peer closes the
    # connection before sending anything.
    return False


def do_reboot(client: KookClient, pdu_uri: str, port: int):
  """Get a shared lock, telnet to the PDU, reset the port and log out.

  ``pdu_uri`` must be of the form ``hostname:port``; ``port`` is the PDU outlet
  to reboot. The PDU's own host lock is held for the whole Telnet session.

  Raises an Exception if an expected prompt ("User Name", "Password", "APC>")
  is not seen within one second, and ValueError if ``pdu_uri`` has no
  ``:port`` part. The Telnet connection is closed in every case.
  """

  pdu_hostname, pdu_port = pdu_uri.split(":", 1)
  pdu_host = client.host(pdu_hostname)
  log.info(f"Attempting to force a reset port {port} using {pdu_uri}")

  def l(text):
    # The PDU console wants carriage-return terminated lines.
    return (text + "\r").encode("utf-8")

  def expect(conn, text):
    offset, match, data = conn.expect([text], timeout=1)
    # Telnet.expect() reports "no match" as (-1, None, data); the offset is
    # never None, so the match object is what has to be tested.
    if match is None:
      raise Exception(f"Unable to match pattern {text} in conn {conn}")

  def apc_login(conn):
    expect(conn, b"User Name")
    conn.write(l(CONFIG["pdu_username"]))
    expect(conn, b"Password")
    conn.write(l(CONFIG["pdu_password"]))

  def apc_command(conn, cmd):
    # Wait for the prompt before typing, so commands are not sent blind.
    expect(conn, b"APC>")
    conn.write(l(cmd))

  # To ensure only one process logs into the PDU at once
  with lock(pdu_host.lock):
    conn = Telnet(pdu_hostname, int(pdu_port))
    try:
      apc_login(conn)
      apc_command(conn, "reboot " + str(port))
      apc_command(conn, "quit")
    finally:
      # Never leave a PDU console session dangling, even if a prompt was missed.
      conn.close()


def host_ignored(host):
  """Translate the host's ``overwatchd_ignored`` var into a boolean.

  Returns True for the string "true", False for "false", and None when the
  var is absent or holds any other value.
  """

  setting = host.canonicalized_vars().get("overwatchd_ignored")
  truth_table = {"true": True, "false": False}
  return truth_table.get(setting)


def monitor_lock(client, hostname):
  """Build the ZooKeeper lock that workers contend on to monitor hostname.

  The lock lives at ``/overwatch/host/<hostname>`` and is tagged with this
  process's ``pid@fqdn``.
  """

  zk = client.client
  znode = f"/overwatch/host/{hostname}"
  whoami = f"{os.getpid()}@{socket.getfqdn()}"
  return KazooLock(zk, znode, whoami)


def monitor(client: KookClient, shutdown: Event, hostname: str):
  """Worker loop: watch one host and power-cycle it when it stays unhealthy.

  Runs until ``shutdown`` is set or the host disappears from inventory. While
  holding the per-host monitor lock the worker:

  - waits out the ``debounce`` window after taking over,
  - then health-checks the host (ping, then an SSH banner knock) every ~2s,
    counting failures up and successes down (never below zero),
  - forces a PDU reboot once the failure counter reaches ``threshold``,
  - gives the lock up after ``loop_trip`` seconds of good health if another
    worker is contending for it.

  Hosts that are locked or flagged as ignored are left alone, but still polled
  at the same ~2s cadence. ZooKeeper session loss is handled by restarting the
  client and starting over.
  """
  # FIXME (arrdem 2019-06-25):
  #   Attrs could change / host could move and we wouldn't handle that.

  log.info(f"Monitoring {hostname}")

  threshold = CONFIG["threshold"]
  debounce = timedelta(seconds=CONFIG["debounce"])
  loop_time = CONFIG["loop_trip"]

  _monitor_lock = monitor_lock(client, hostname)
  while not shutdown.is_set():
    try:
      # Rather than directly contending forever, we contend on a loop so that
      # we'll shut down workers in a reasonable amount of time.
      with lock(_monitor_lock, timeout=2):
        # Loop variables
        now = start = datetime.today()
        counter = 0

        while not shutdown.is_set():
          host = client.host(hostname)

          # The host has been deleted, abort
          if host is None:
            return

          # FIXME (arrdem 2019-08-04):
          #   This could be slow???
          attrs = host.canonicalized_vars()
          host_address = attrs.get("host_address")

          # Update the date info
          now = datetime.today()
          delta = now - start

          if delta < debounce:
            sleep(2)

          elif counter >= threshold:
            # Bounce the box, wait for it to become healthy again and hand off
            uptime = delta.total_seconds() - counter

            # NOTE(review): no timeout is passed here, so this waits on the
            # host lock for as long as lock() blocks by default.
            with lock(host.lock):
              log.critical(f"{hostname} detected unhealthy for {counter}s after {uptime}s up, forcing reboot!")
              do_reboot(client, attrs.get("pdu_uri"), attrs.get("pdu_outlet"))
              # Hand off to another worker, who will have to debounce first
              break

          elif not host.is_locked() and not host_ignored(host):
            healthy = True

            # FIXME (arrdem 2019-06-26):
            #   Factor these healthchecks out - ideally into a separate system
            if not ping(host_address):
              log.warning(f"{hostname} did not return ping")
              healthy = False

            elif not check_port(host_address, port=22, banner=b"SSH"):
              log.warning(f"{hostname} did not respond to ssh port knock")
              healthy = False

            if not healthy:
              log.warning(f"{hostname} has no active lock, incrementing")
              counter += 1

            else:
              log.debug(f"{hostname} seems healthy...")
              # Otherwise we zdec the score.
              counter = zdec(counter)

              # delta > debounce implied by if ordering
              if delta.total_seconds() % 60 // 1 == 0:
                log.info(f"{hostname} healthy for {delta.total_seconds()}s")

              # If we've taken a full round of keeping an eye on this host and someone else is ready hand off
              if counter == 0 and delta.total_seconds() > loop_time and len(_monitor_lock.contenders()) > 0:
                log.info(f"Letting someone else keep an eye on {hostname}")
                break

            sleep(2)

          else:
            # The host is locked or ignored: leave it alone, but still pause
            # so this loop does not spin flat out re-reading inventory.
            sleep(2)

      sleep(2)

    except (SessionExpiredError, ConnectionLoss):
      log.critical(f"Monitoring for host {hostname} lost ZK, retrying....")
      # Keep trying to bring the client back until it works or we're told to stop.
      while not shutdown.is_set():
        try:
          client.restart()
          break
        except KazooTimeoutError:
          continue

    except LockTimeout:
      # Someone else holds the monitor lock; back off briefly with jitter.
      sleep(random.random())


def cli_monitor(client):
  """CLI entry point. Monitor all available inventory.

  Starts one ``monitor`` thread per host that has ``host_address``,
  ``pdu_outlet`` and ``pdu_uri`` set, then loops: reap dead workers, pick up
  new hosts, sleep ``loop_trip`` seconds. Inventory changes also trigger an
  intake through the watch registered on ``client.hosts``.

  SIGINT / SIGQUIT set the shutdown flag and exit with status 0; on the way out
  (for any reason) the workers are told to stop and are joined.
  """

  children_lock = Lock()
  children = {}
  shutdown = Event()

  def _shutdown(*args):
    # Signal handlers run on the main thread, which may be holding
    # children_lock at that very moment. Taking the (non-reentrant) lock here
    # would deadlock, so only flag the shutdown and unwind; the workers are
    # joined by the finally block below.
    shutdown.set()
    exit(0)

  signal.signal(signal.SIGINT, _shutdown)
  signal.signal(signal.SIGQUIT, _shutdown)

  def monitor_maybe(host):
    # Only hosts with an address and PDU coordinates can be checked and rebooted.
    attrs = host.canonicalized_vars()
    if attrs.get("host_address") and attrs.get("pdu_outlet") and attrs.get("pdu_uri"):
      log.info(f"Host {host} has requisite k/vs, monitoring...")
      t = Thread(target=monitor,
                 args=(client, shutdown, host.name,))
      t.start()
      children[host.name] = t
      return True

  def intake_hosts(*args, **kwargs):
    # Doubles as the inventory watch callback, hence the catch-all signature.
    with children_lock:
      hosts = list(client.hosts(watch=intake_hosts))
      # Shuffle so that concurrent daemons don't all start on the same hosts.
      random.shuffle(hosts)
      for host in hosts:
        if host.name not in children:
          monitor_maybe(host)

  try:
    while not shutdown.is_set():
      # Reap workers that have exited so their hosts can be picked up again.
      with children_lock:
        for hostname, thread in list(children.items()):
          if not thread.is_alive():
            thread.join()
            del children[hostname]

      intake_hosts()

      sleep(CONFIG.get("loop_trip"))

  finally:
    # Covers both the SystemExit raised by the signal handler and unexpected
    # errors: stop the workers and wait for them. Join outside the lock so a
    # late watch callback is never blocked behind us.
    shutdown.set()
    with children_lock:
      workers = list(children.values())
    for worker in workers:
      worker.join()


def cli_spy(client):
  """CLI entry point. Spy on who's doing the monitoring.

  Every two seconds, prints one line per host found under ``/overwatch/host``
  naming the current lock holder and the number of other contenders, then
  clears the terminal. Runs until interrupted.
  """

  guard = Lock()
  tracked = {}

  def _refresh(*args, **kwargs):
    # Also installed as the children watch, so newly appearing hosts are picked up.
    with guard:
      for name in client.client.get_children("/overwatch/host", watch=_refresh):
        if name not in tracked:
          tracked[name] = monitor_lock(client, name)

  _refresh()

  while True:
    with guard:
      for hostname, zk_lock in tracked.items():
        queue = zk_lock.contenders()

        if not queue:
          print(f"{hostname: <16} is unmonitored")
          continue

        # The head of the queue holds the lock; everyone after it is waiting.
        holder, *contenders = queue
        if contenders:
          print(f"{hostname: <16} is monitored by {holder}, with {len(contenders)} contenders")
        else:
          print(f"{hostname: <16} is monitored by {holder} uncontended")

    sleep(2)
    os.system("clear")


if __name__ == "__main__":
  # Kazoo is chatty; only let its warnings and worse through.
  logging.getLogger("kazoo").setLevel(logging.WARNING)

  # Our own logger reports INFO and up, forwarded to the local syslog socket.
  log.setLevel(logging.INFO)
  syslog = SysLogHandler("/dev/log")
  syslog.setFormatter(
    logging.Formatter("%(asctime)s %(relativeCreated)6d %(threadName)s - %(name)s - %(levelname)s -  %(message)s"))
  log.addHandler(syslog)

  client = KookClient()

  # The first subcommand found anywhere on the command line wins; "spy" has priority.
  for subcommand, entry_point in (("spy", cli_spy), ("monitor", cli_monitor)):
    if subcommand in sys.argv:
      entry_point(client)
      break
  else:
    print("Usage: overwatchd [monitor | spy]")
    exit(1)