Add the `watchdog/` package (thin Python-3.12 stdlib-only daemon) and the `orchestrator-watchdog` compose service — the brain half of the domain-0 observability pair.

F1a (ORCH-099) exposes the raw signal via GET /metrics; F1b reads it, augments it with host / container / dependency probes, runs each signal through a generalised pure decision function (decide(signal_active, prev, now, cooldown), a strict superset of disk_watchdog.decide_action) with per-signal in-memory dedup/throttle/recovery, and alerts over its OWN independent Telegram channel.

Key properties (ADR-001):
- Observer separated from observed: separate container; /metrics not answering is itself the master `orch_down` alarm (debounced K ticks — no flap on a hiccup).
- Strictly read-only: docker.sock GET-only + mounted :ro (double guard), host paths :ro, no DB/disk writes, no process control — self-hosting-safe.
- never-raise on three levels (per-source/per-tick/per-send) + WATCHDOG_ENABLED kill-switch (disabled -> inert idle-loop, not exit).
- Disk anti-duplicate (D6): disk_watchdog (ORCH-063) stays sole owner of the 85% alert; sidecar carries orch_down + an opt-in 97% ceiling (default off).
- NO import from src/** (C-1); src/**, STAGE_TRANSITIONS, QG_CHECKS, check_*, DB schema — untouched.

env_file is optional, so a missing .env.watchdog never breaks `docker compose up` for the prod orchestrator.

Tests: tests/watchdog/ (TC-01…TC-13) + full tests/ regression green (TC-14).
Docs: CHANGELOG, .env.example canon (WATCHDOG_*); architecture README + adr-0033 authored at the architecture stage.

Refs: ORCH-100
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
70 lines
2.3 KiB
Python
70 lines
2.3 KiB
Python
"""TC-07: kill-switch + env-driven config (no hardcoded thresholds).
|
|
|
|
``WATCHDOG_ENABLED=false`` -> the daemon is inert (idle, no ticks). Thresholds /
|
|
intervals / timeouts come from env, not constants.
|
|
"""
|
|
from watchdog.config import Config
|
|
|
|
|
|
def test_killswitch_off_is_inert(monkeypatch):
    """With ``WATCHDOG_ENABLED=false`` the entry point must never tick.

    ``Watchdog`` on the entry module is swapped for a stand-in whose ``tick``
    bumps a counter, and ``entry.time.sleep`` is replaced with a no-op. After
    ``run()`` returns, the counter must still be zero.
    """
    from watchdog import __main__ as entry

    env = {"WATCHDOG_ENABLED": "false", "WATCHDOG_INTERVAL_S": "0"}
    cfg = Config.from_env(env)
    assert cfg.enabled is False

    # Mutable cell so the nested class can record ticks without ``nonlocal``.
    ticks = {"n": 0}

    class _CountingDog:
        def tick(self):
            ticks["n"] = ticks["n"] + 1

    def _make_dog(c):
        # Stand-in for the real constructor; the config argument is ignored.
        return _CountingDog()

    def _skip_sleep(*_):
        return None

    # Any tick issued through the stand-in while disabled shows up in ``ticks``.
    monkeypatch.setattr(entry, "Watchdog", _make_dog)
    monkeypatch.setattr(entry.time, "sleep", _skip_sleep)

    entry.run(cfg=cfg, max_ticks=3)

    assert ticks["n"] == 0  # inert: never ticked
|
|
|
|
|
|
def test_thresholds_read_from_env():
    """Each tunable passed in the env mapping is reflected on the ``Config``.

    The two ``*_MIN`` inputs are expected to surface multiplied by 60 on the
    matching ``*_s`` attributes; list and mapping settings are expected to be
    parsed from their comma-separated string form.
    """
    env = {
        "WATCHDOG_INTERVAL_S": "7",
        "WATCHDOG_MEM_PCT": "77",
        "WATCHDOG_QUEUE_DEPTH": "9",
        "WATCHDOG_AGENT_HUNG_MIN": "5",
        "WATCHDOG_STAGE_STUCK_MIN": "11",
        "WATCHDOG_ORCH_DOWN_TICKS": "4",
        "WATCHDOG_COOLDOWN_S": "60",
        "WATCHDOG_HTTP_TIMEOUT_S": "2",
        "WATCHDOG_CONTAINERS": "orchestrator,plane-app",
        "WATCHDOG_DEPS": "gitea=http://g/healthz,plane=http://p/",
    }
    cfg = Config.from_env(env)

    # (attribute, expected value), checked in the same order as the env keys.
    expected = (
        ("interval_s", 7.0),
        ("mem_pct", 77.0),
        ("queue_depth", 9),
        ("agent_hung_s", 5 * 60.0),
        ("stage_stuck_s", 11 * 60.0),
        ("orch_down_ticks", 4),
        ("cooldown_s", 60.0),
        ("http_timeout_s", 2.0),
        ("containers", ["orchestrator", "plane-app"]),
        ("deps", {"gitea": "http://g/healthz", "plane": "http://p/"}),
    )
    for attr, want in expected:
        assert getattr(cfg, attr) == want
|
|
|
|
|
|
def test_defaults_when_env_absent():
    """An empty env mapping produces the default configuration checked below."""
    defaults = Config.from_env({})

    # Scalar settings.
    assert defaults.enabled is True
    assert defaults.interval_s == 30.0
    assert defaults.metrics_url.endswith(":8500/metrics")
    assert defaults.disk_crit_enabled is False

    # Collection settings.
    assert defaults.containers == ["orchestrator"]
    assert defaults.deps == {}
|
|
|
|
|
|
def test_malformed_env_degrades_to_default():
    """Garbage or empty numeric settings yield the defaults rather than an error."""
    # One non-numeric string and one empty string; neither may crash config.
    malformed = {"WATCHDOG_INTERVAL_S": "abc", "WATCHDOG_MEM_PCT": ""}

    parsed = Config.from_env(malformed)

    assert parsed.interval_s == 30.0
    assert parsed.mem_pct == 90.0
|