Add the `watchdog/` package (thin Python-3.12 stdlib-only daemon) and the `orchestrator-watchdog` compose service — the brain half of the domain-0 observability pair. F1a (ORCH-099) exposes GET /metrics raw signal; F1b reads it, augments with host / container / dependency probes, runs each signal through a generalised pure decision function (decide(signal_active, prev, now, cooldown), a strict superset of disk_watchdog.decide_action) with per-signal in-memory dedup/throttle/recovery, and alerts over its OWN independent Telegram channel. Key properties (ADR-001): - Observer separated from observed: separate container; /metrics not answering is itself the master `orch_down` alarm (debounced K ticks — no flap on a hiccup). - Strictly read-only: docker.sock GET-only + mounted :ro (double guard), host paths :ro, no DB/disk writes, no process control — self-hosting-safe. - never-raise on three levels (per-source/per-tick/per-send) + WATCHDOG_ENABLED kill-switch (disabled -> inert idle-loop, not exit). - Disk anti-duplicate (D6): disk_watchdog (ORCH-063) stays sole owner of the 85% alert; sidecar carries orch_down + an opt-in 97% ceiling (default off). - NO import from src/** (C-1); src/**, STAGE_TRANSITIONS, QG_CHECKS, check_*, DB schema — untouched. env_file optional so a missing .env.watchdog never breaks `docker compose up` for the prod orchestrator. Tests: tests/watchdog/ (TC-01…TC-13) + full tests/ regression green (TC-14). Docs: CHANGELOG, .env.example canon (WATCHDOG_*); architecture README + adr-0033 authored at the architecture stage. Refs: ORCH-100 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
120 lines
4.7 KiB
Python
120 lines
4.7 KiB
Python
"""Collector: container statuses over a READ-ONLY ``docker.sock`` (D1, D2, FR-5).
|
|
|
|
Raw HTTP-over-unix-socket via stdlib (``socket.AF_UNIX`` +
|
|
``http.client.HTTPConnection`` subclass) — NO ``docker`` pip package. The client
|
|
issues ``GET`` requests ONLY (``GET /containers/json``,
|
|
``GET /containers/<name>/json``) — it is read-only **by construction**: there is
|
|
no method that POSTs / starts / stops / restarts / execs (AC-6, TC-09). The
|
|
mount is additionally ``:ro``, a second guarantee.
|
|
|
|
``classify_container`` is a pure function (Up / healthy / restarting / exited /
|
|
unhealthy) and ``container_alarm`` decides whether the status is alerting — both
|
|
testable without a live Docker.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import http.client
|
|
import json
|
|
import logging
|
|
import socket
|
|
|
|
# Module-level logger shared by every collector helper in this file. The logger
# name is a fixed string rather than ``__name__``.
logger = logging.getLogger("watchdog.collectors.containers")

# A container is "healthy" (no alarm) only in these states. Any other status
# token is reported as alerting by ``container_alarm``, which tests membership
# in this set. A frozenset keeps the set immutable.
_OK_STATES = frozenset({"running", "healthy"})
|
|
|
|
|
|
class _UnixHTTPConnection(http.client.HTTPConnection):
|
|
"""``HTTPConnection`` over an ``AF_UNIX`` socket (stdlib only, GET-only use)."""
|
|
|
|
def __init__(self, sock_path: str, timeout: float):
|
|
super().__init__("localhost", timeout=timeout)
|
|
self._sock_path = sock_path
|
|
|
|
def connect(self) -> None: # noqa: D401 - override
|
|
sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
|
sock.settimeout(self.timeout)
|
|
sock.connect(self._sock_path)
|
|
self.sock = sock
|
|
|
|
|
|
class DockerSockReader:
|
|
"""Read-only Docker API client over the unix socket.
|
|
|
|
EXPOSES READ METHODS ONLY (``list_containers`` / ``inspect``); the single
|
|
private primitive ``_get`` hard-codes the ``GET`` HTTP method, so no caller
|
|
can ever mutate the Docker state (AC-6 / TC-09). never-raise: any socket /
|
|
HTTP / parse error degrades to ``None`` / ``[]``.
|
|
"""
|
|
|
|
def __init__(self, sock_path: str = "/var/run/docker.sock", timeout_s: float = 3.0):
|
|
self._sock_path = sock_path
|
|
self._timeout = timeout_s
|
|
|
|
def _get(self, path: str) -> object | None:
|
|
"""Issue a single ``GET <path>`` over the socket. never-raise.
|
|
|
|
This is the ONLY request primitive and it is GET-only — the read-only
|
|
guarantee is structural, not policy.
|
|
"""
|
|
conn = None
|
|
try:
|
|
conn = _UnixHTTPConnection(self._sock_path, self._timeout)
|
|
conn.request("GET", path)
|
|
resp = conn.getresponse()
|
|
body = resp.read()
|
|
if resp.status >= 400:
|
|
logger.warning("watchdog: docker GET %s -> %s", path, resp.status)
|
|
return None
|
|
return json.loads(body.decode("utf-8", errors="replace"))
|
|
except Exception as e: # noqa: BLE001 - docker unreachable -> degrade
|
|
logger.warning("watchdog: docker GET %s failed: %s", path, e)
|
|
return None
|
|
finally:
|
|
if conn is not None:
|
|
try:
|
|
conn.close()
|
|
except Exception: # noqa: BLE001
|
|
pass
|
|
|
|
def list_containers(self) -> list:
|
|
"""``GET /containers/json?all=1`` — every container (read-only)."""
|
|
data = self._get("/containers/json?all=1")
|
|
return data if isinstance(data, list) else []
|
|
|
|
def inspect(self, name: str) -> dict | None:
|
|
"""``GET /containers/<name>/json`` — one container's detail (read-only)."""
|
|
data = self._get(f"/containers/{name}/json")
|
|
return data if isinstance(data, dict) else None
|
|
|
|
|
|
def classify_container(inspect: dict | None) -> str:
|
|
"""Pure classifier: inspect-JSON -> a coarse status token (D5).
|
|
|
|
Returns one of ``running`` / ``healthy`` / ``unhealthy`` / ``restarting`` /
|
|
``exited`` / ``created`` / ``paused`` / ``dead`` / ``unknown``. When a
|
|
healthcheck is present its verdict (``healthy`` / ``unhealthy``) takes
|
|
precedence over the bare ``running`` state. Never raises.
|
|
"""
|
|
try:
|
|
if not inspect:
|
|
return "unknown"
|
|
state = inspect.get("State")
|
|
if not isinstance(state, dict):
|
|
return "unknown"
|
|
status = (state.get("Status") or "").strip().lower()
|
|
health = state.get("Health")
|
|
if isinstance(health, dict):
|
|
hstatus = (health.get("Status") or "").strip().lower()
|
|
if hstatus in ("healthy", "unhealthy"):
|
|
return hstatus
|
|
return status or "unknown"
|
|
except Exception as e: # noqa: BLE001 - classification must never crash
|
|
logger.warning("watchdog: classify_container error: %s", e)
|
|
return "unknown"
|
|
|
|
|
|
def container_alarm(status: str) -> bool:
    """Return True when ``status`` is NOT a healthy state (restarting/exited/unhealthy/...).

    The token is compared after trimming whitespace and lower-casing; an
    empty or ``None`` status counts as alerting.
    """
    token = (status or "").strip().lower()
    return token not in _OK_STATES
|