Files
orchestrator/watchdog/collectors/containers.py
claude-bot 259b507906 feat(watchdog): sidecar-watchdog F1b — monitoring brain in a separate container (ORCH-100)
Add the `watchdog/` package (thin Python-3.12 stdlib-only daemon) and the
`orchestrator-watchdog` compose service — the brain half of the domain-0
observability pair. F1a (ORCH-099) exposes GET /metrics raw signal; F1b reads it,
augments with host / container / dependency probes, runs each signal through a
generalised pure decision function (decide(signal_active, prev, now, cooldown),
a strict superset of disk_watchdog.decide_action) with per-signal in-memory
dedup/throttle/recovery, and alerts over its OWN independent Telegram channel.

Key properties (ADR-001):
- Observer separated from observed: separate container; /metrics not answering is
  itself the master `orch_down` alarm (debounced K ticks — no flap on a hiccup).
- Strictly read-only: docker.sock GET-only + mounted :ro (double guard), host
  paths :ro, no DB/disk writes, no process control — self-hosting-safe.
- never-raise on three levels (per-source/per-tick/per-send) + WATCHDOG_ENABLED
  kill-switch (disabled -> inert idle-loop, not exit).
- Disk anti-duplicate (D6): disk_watchdog (ORCH-063) stays sole owner of the 85%
  alert; sidecar carries orch_down + an opt-in 97% ceiling (default off).
- NO import from src/** (C-1); src/**, STAGE_TRANSITIONS, QG_CHECKS, check_*, DB
  schema — untouched. env_file optional so a missing .env.watchdog never breaks
  `docker compose up` for the prod orchestrator.

Tests: tests/watchdog/ (TC-01…TC-13) + full tests/ regression green (TC-14).
Docs: CHANGELOG, .env.example canon (WATCHDOG_*); architecture README + adr-0033
authored at the architecture stage.

Refs: ORCH-100

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-10 09:36:02 +03:00

120 lines
4.7 KiB
Python

"""Collector: container statuses over a READ-ONLY ``docker.sock`` (D1, D2, FR-5).
Raw HTTP-over-unix-socket via stdlib (``socket.AF_UNIX`` +
``http.client.HTTPConnection`` subclass) — NO ``docker`` pip package. The client
issues ``GET`` requests ONLY (``GET /containers/json``,
``GET /containers/<name>/json``) — it is read-only **by construction**: there is
no method that POSTs / starts / stops / restarts / execs (AC-6, TC-09). The
mount is additionally ``:ro``, a second guarantee.
``classify_container`` is a pure function (Up / healthy / restarting / exited /
unhealthy) and ``container_alarm`` decides whether the status is alerting — both
testable without a live Docker.
"""
from __future__ import annotations

import http.client
import json
import logging
import socket
import urllib.parse
# Module-scoped logger used by the Docker reader and the classifier in this file.
logger = logging.getLogger("watchdog.collectors.containers")

# Status tokens that do NOT alert; any other token is treated as an alarm.
_OK_STATES = frozenset(("running", "healthy"))
class _UnixHTTPConnection(http.client.HTTPConnection):
    """``HTTPConnection`` over an ``AF_UNIX`` socket (stdlib only, GET-only use).

    The base class is initialised with the placeholder host ``"localhost"``;
    the overridden ``connect`` ignores host/port and dials the unix socket at
    ``sock_path`` instead.
    """

    def __init__(self, sock_path: str, timeout: float):
        """Remember the socket path; ``timeout`` (seconds) is applied on connect."""
        super().__init__("localhost", timeout=timeout)
        self._sock_path = sock_path

    def connect(self) -> None:  # noqa: D401 - override
        """Open the unix-socket transport and install it as ``self.sock``.

        Raises whatever ``socket`` raises (e.g. ``FileNotFoundError`` when the
        socket path does not exist). On failure the half-open socket is closed
        before re-raising so no file descriptor is leaked and ``self.sock`` is
        left untouched.
        """
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            sock.settimeout(self.timeout)
            sock.connect(self._sock_path)
        except BaseException:
            # Do not leave a dangling descriptor behind on a failed connect.
            sock.close()
            raise
        self.sock = sock
class DockerSockReader:
    """Read-only Docker API client over the unix socket.

    EXPOSES READ METHODS ONLY (``list_containers`` / ``inspect``); the single
    private primitive ``_get`` hard-codes the ``GET`` HTTP method, so no caller
    can ever mutate the Docker state (AC-6 / TC-09). never-raise: any socket /
    HTTP / parse error degrades to ``None`` / ``[]``.
    """

    def __init__(self, sock_path: str = "/var/run/docker.sock", timeout_s: float = 3.0):
        """Store the socket path and the per-request timeout (seconds).

        No connection is opened here; each ``_get`` call opens and closes its own.
        """
        self._sock_path = sock_path
        self._timeout = timeout_s

    def _get(self, path: str) -> object | None:
        """Issue a single ``GET <path>`` over the socket. never-raise.

        This is the ONLY request primitive and it is GET-only — the read-only
        guarantee is structural, not policy.

        Returns the decoded JSON body, or ``None`` when the response status is
        ``>= 400`` or when connecting, reading or parsing fails (each case is
        logged as a warning).
        """
        conn = None
        try:
            conn = _UnixHTTPConnection(self._sock_path, self._timeout)
            conn.request("GET", path)
            resp = conn.getresponse()
            body = resp.read()
            if resp.status >= 400:
                logger.warning("watchdog: docker GET %s -> %s", path, resp.status)
                return None
            return json.loads(body.decode("utf-8", errors="replace"))
        except Exception as e:  # noqa: BLE001 - docker unreachable -> degrade
            logger.warning("watchdog: docker GET %s failed: %s", path, e)
            return None
        finally:
            # Best-effort cleanup: a failing close must not break never-raise.
            if conn is not None:
                try:
                    conn.close()
                except Exception:  # noqa: BLE001
                    pass

    def list_containers(self) -> list:
        """``GET /containers/json?all=1`` — every container (read-only).

        Returns ``[]`` when the request fails or the payload is not a list.
        """
        data = self._get("/containers/json?all=1")
        return data if isinstance(data, list) else []

    def inspect(self, name: str) -> dict | None:
        """``GET /containers/<name>/json`` — one container's detail (read-only).

        ``name`` is percent-encoded before it is placed in the request path, so
        characters such as ``/``, ``?``, ``#`` or spaces cannot redirect the
        request to another endpoint or inject a query string. Ordinary
        container names and ids are unaffected by the encoding.

        Returns ``None`` for an empty name (no request is sent), when the
        request fails, or when the payload is not a JSON object.
        """
        if not name:
            return None
        # errors="replace" keeps the never-raise contract for un-encodable text.
        quoted = urllib.parse.quote(str(name), safe="", errors="replace")
        data = self._get(f"/containers/{quoted}/json")
        return data if isinstance(data, dict) else None
def classify_container(inspect: dict | None) -> str:
    """Pure classifier: inspect-JSON -> a coarse status token (D5).

    Returns one of ``running`` / ``healthy`` / ``unhealthy`` / ``restarting`` /
    ``exited`` / ``created`` / ``paused`` / ``dead`` / ``unknown`` (the
    lower-cased ``State.Status`` is passed through as-is, ``unknown`` when it
    is missing or the input is not usable).

    When a healthcheck verdict is present:

    * ``unhealthy`` always wins — it is alerting whatever the lifecycle state.
    * ``healthy`` wins only over the bare ``running`` state (or when no
      ``State.Status`` is reported at all). A ``healthy`` verdict paired with a
      non-running state such as ``exited`` / ``restarting`` / ``paused`` is
      treated as stale and the lifecycle state is returned instead, so it
      cannot mask a container that is not actually running.

    Any other health value (e.g. ``starting``) falls back to ``State.Status``.
    Never raises: unexpected input shapes are logged and yield ``unknown``.
    """
    try:
        if not inspect:
            return "unknown"
        state = inspect.get("State")
        if not isinstance(state, dict):
            return "unknown"
        status = (state.get("Status") or "").strip().lower()
        health = state.get("Health")
        if isinstance(health, dict):
            hstatus = (health.get("Status") or "").strip().lower()
            if hstatus == "unhealthy":
                return hstatus
            # Only trust "healthy" while the container is (or may be) running.
            if hstatus == "healthy" and status in ("", "running"):
                return hstatus
        return status or "unknown"
    except Exception as e:  # noqa: BLE001 - classification must never crash
        logger.warning("watchdog: classify_container error: %s", e)
        return "unknown"
def container_alarm(status: str) -> bool:
    """Tell whether ``status`` should alert.

    The token is normalised (``None``/empty -> ``""``, trimmed, lower-cased)
    and compared against ``_OK_STATES``; anything outside that set
    (restarting / exited / unhealthy / unknown / empty / ...) is an alarm.
    """
    token = (status or "").strip().lower()
    is_ok = token in _OK_STATES
    return not is_ok