Add the `watchdog/` package (thin Python-3.12 stdlib-only daemon) and the `orchestrator-watchdog` compose service — the brain half of the domain-0 observability pair. F1a (ORCH-099) exposes GET /metrics raw signal; F1b reads it, augments with host / container / dependency probes, runs each signal through a generalised pure decision function (decide(signal_active, prev, now, cooldown), a strict superset of disk_watchdog.decide_action) with per-signal in-memory dedup/throttle/recovery, and alerts over its OWN independent Telegram channel. Key properties (ADR-001): - Observer separated from observed: separate container; /metrics not answering is itself the master `orch_down` alarm (debounced K ticks — no flap on a hiccup). - Strictly read-only: docker.sock GET-only + mounted :ro (double guard), host paths :ro, no DB/disk writes, no process control — self-hosting-safe. - never-raise on three levels (per-source/per-tick/per-send) + WATCHDOG_ENABLED kill-switch (disabled -> inert idle-loop, not exit). - Disk anti-duplicate (D6): disk_watchdog (ORCH-063) stays sole owner of the 85% alert; sidecar carries orch_down + an opt-in 97% ceiling (default off). - NO import from src/** (C-1); src/**, STAGE_TRANSITIONS, QG_CHECKS, check_*, DB schema — untouched. env_file optional so a missing .env.watchdog never breaks `docker compose up` for the prod orchestrator. Tests: tests/watchdog/ (TC-01…TC-13) + full tests/ regression green (TC-14). Docs: CHANGELOG, .env.example canon (WATCHDOG_*); architecture README + adr-0033 authored at the architecture stage. Refs: ORCH-100 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
76 lines
2.7 KiB
Python
76 lines
2.7 KiB
Python
"""Collector: host metrics — memory (/proc/meminfo), disk (shutil.disk_usage).
|
|
|
|
stdlib-only, the same primitives ``disk_watchdog`` uses (D1). Every reader is
|
|
never-raise: a missing path / unreadable proc-file degrades to ``None`` (one
|
|
signal skipped), never a tick crash (D8). CPU "hung agent" liveness is computed
|
|
from the ``/metrics`` envelope (cpu_ticks), not here.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import shutil
|
|
|
|
logger = logging.getLogger("watchdog.collectors.host")
|
|
|
|
|
|
def read_mem_used_pct(meminfo_path: str = "/proc/meminfo") -> float | None:
    """Return host memory used-% parsed from a ``/proc/meminfo``-style file.

    ``used_pct = (1 - MemAvailable / MemTotal) * 100``, rounded to one decimal.

    Args:
        meminfo_path: File to parse. Relevant lines have the form
            ``MemTotal:       16384 kB``; all other keys are ignored.

    Returns:
        The used percentage, or ``None`` when the file is missing or
        unreadable, either field is absent or unparseable, or ``MemTotal`` is
        not positive. Never raises: a failure costs one signal, not the tick.
    """
    wanted = ("MemTotal", "MemAvailable")
    try:
        fields: dict[str, int] = {}
        # errors="replace": an undecodable byte in an unrelated line must not
        # discard the two fields that are actually needed.
        with open(meminfo_path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                parts = line.split(":")
                if len(parts) != 2:
                    continue
                key = parts[0].strip()
                if key not in wanted:
                    continue
                tokens = parts[1].split()
                if not tokens:
                    continue
                try:
                    fields[key] = int(tokens[0])  # value is in kB
                except ValueError:
                    continue
        total = fields.get("MemTotal")
        avail = fields.get("MemAvailable")
        # Same guard as read_disk_used_pct: a zero or negative total cannot
        # produce a meaningful percentage.
        if total is None or avail is None or total <= 0:
            return None
        return round((1.0 - avail / total) * 100.0, 1)
    except Exception as e:  # noqa: BLE001 - degrade one signal, keep the tick
        logger.warning("watchdog: cannot read memory: %s", e)
        return None
|
|
|
|
|
|
def read_disk_used_pct(path: str) -> float | None:
    """Return the used-% of the filesystem that holds ``path``.

    Computed as ``used / total * 100`` from ``shutil.disk_usage`` and rounded
    to one decimal. The formula is intentionally left untouched: it is meant
    to stay 1:1 with the one ``disk_watchdog`` uses.

    Args:
        path: Any path on the filesystem to measure.

    Returns:
        The used percentage, or ``None`` when the path is missing or
        unreadable, or the reported total size is not positive. Never raises.
    """
    try:
        usage = shutil.disk_usage(path)
        total = int(usage.total)
        if total <= 0:
            # Nothing sensible to divide by; skip this path.
            return None
        return round(int(usage.used) / total * 100.0, 1)
    except Exception as e:  # noqa: BLE001 - skip this path, keep the tick
        logger.warning("watchdog: cannot measure disk %s: %s", path, e)
        return None
|
|
|
|
|
|
def max_disk_used_pct(paths: list[str]) -> tuple[str, float] | None:
    """Return the fullest of ``paths`` as ``(path, used_pct)``.

    This is the worst-case ceiling across all monitored paths. A path that
    cannot be measured (``read_disk_used_pct`` returns ``None``) is skipped.

    Args:
        paths: Paths to measure. ``None`` is treated as "no paths" and a bare
            string as a single path, so a mis-shaped config value neither
            raises nor gets measured character by character.

    Returns:
        ``(path, used_pct)`` for the fullest measurable path (the first one
        wins on a tie), or ``None`` if no path could be measured.
    """
    if paths is None:
        return None
    if isinstance(paths, str):
        # Iterating a str would probe each character ("/", "d", ...) as a path.
        paths = [paths]
    measured = (
        (p, pct)
        for p in paths
        if (pct := read_disk_used_pct(p)) is not None
    )
    # max() returns the first maximal item, matching a strict ``>`` scan.
    return max(measured, key=lambda entry: entry[1], default=None)
|