Add the `watchdog/` package (thin Python-3.12 stdlib-only daemon) and the `orchestrator-watchdog` compose service — the brain half of the domain-0 observability pair. F1a (ORCH-099) exposes GET /metrics raw signal; F1b reads it, augments with host / container / dependency probes, runs each signal through a generalised pure decision function (decide(signal_active, prev, now, cooldown), a strict superset of disk_watchdog.decide_action) with per-signal in-memory dedup/throttle/recovery, and alerts over its OWN independent Telegram channel. Key properties (ADR-001): - Observer separated from observed: separate container; /metrics not answering is itself the master `orch_down` alarm (debounced K ticks — no flap on a hiccup). - Strictly read-only: docker.sock GET-only + mounted :ro (double guard), host paths :ro, no DB/disk writes, no process control — self-hosting-safe. - never-raise on three levels (per-source/per-tick/per-send) + WATCHDOG_ENABLED kill-switch (disabled -> inert idle-loop, not exit). - Disk anti-duplicate (D6): disk_watchdog (ORCH-063) stays sole owner of the 85% alert; sidecar carries orch_down + an opt-in 97% ceiling (default off). - NO import from src/** (C-1); src/**, STAGE_TRANSITIONS, QG_CHECKS, check_*, DB schema — untouched. env_file optional so a missing .env.watchdog never breaks `docker compose up` for the prod orchestrator. Tests: tests/watchdog/ (TC-01…TC-13) + full tests/ regression green (TC-14). Docs: CHANGELOG, .env.example canon (WATCHDOG_*); architecture README + adr-0033 authored at the architecture stage. Refs: ORCH-100 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
89 lines
2.8 KiB
Python
89 lines
2.8 KiB
Python
"""TC-06: three-level never-raise.
|
|
|
|
A raising collector (host / containers / deps) degrades ONE signal and the tick
|
|
reaches the end collecting the rest; a raising send is swallowed; the daemon
|
|
loop survives a raising tick.
|
|
"""
|
|
from watchdog.config import Config
|
|
from watchdog.core import Watchdog
|
|
|
|
|
|
class _BoomDocker:
|
|
def inspect(self, name):
|
|
raise RuntimeError("docker socket blew up")
|
|
|
|
|
|
class _Notifier:
|
|
def __init__(self):
|
|
self.sent = []
|
|
|
|
def send(self, text):
|
|
self.sent.append(text)
|
|
return True
|
|
|
|
|
|
class _BoomNotifier:
|
|
def send(self, text):
|
|
raise RuntimeError("telegram blew up")
|
|
|
|
|
|
def _cfg(**kw):
|
|
base = {
|
|
"WATCHDOG_TG_BOT_TOKEN": "t",
|
|
"WATCHDOG_TG_CHAT_ID": "c",
|
|
"WATCHDOG_CONTAINERS": "orchestrator",
|
|
}
|
|
return Config.from_env({**base, **kw})
|
|
|
|
|
|
def _good_fetch_patch(dog, monkeypatch):
|
|
from watchdog.collectors import orch as orch_mod
|
|
|
|
env = {"schema_version": 1, "generated_at": "2026-06-10T00:00:00Z",
|
|
"clk_tck": 100, "agents": [], "stages": [],
|
|
"queue": {"depth": 0, "counts": {"failed": 0}}}
|
|
monkeypatch.setattr(
|
|
orch_mod, "fetch_metrics",
|
|
lambda *a, **k: orch_mod.FetchResult(ok=True, envelope=env),
|
|
)
|
|
|
|
|
|
def test_per_source_broken_container_degrades_one_signal(monkeypatch):
|
|
notifier = _Notifier()
|
|
dog = Watchdog(_cfg(), notifier=notifier, docker=_BoomDocker())
|
|
_good_fetch_patch(dog, monkeypatch)
|
|
# Should not raise; tick completes and produces results for other sources.
|
|
results = dog.tick()
|
|
keys = [getattr(s, "key", None) for _, s in results]
|
|
# orch_down evaluated (orch was up -> not active) and container evaluated.
|
|
assert "orch_down" in keys
|
|
assert ("container_down", "orchestrator") in keys
|
|
|
|
|
|
def test_per_send_failure_is_swallowed(monkeypatch):
|
|
# A raising notifier must not break the tick (per-send never-raise).
|
|
cfg = _cfg(WATCHDOG_MEM_PCT="0") # mem >= 0 always -> force an alert send
|
|
dog = Watchdog(cfg, notifier=_BoomNotifier(), docker=_BoomDocker())
|
|
_good_fetch_patch(dog, monkeypatch)
|
|
monkeypatch.setattr(
|
|
"watchdog.collectors.host.read_mem_used_pct", lambda *a, **k: 50.0
|
|
)
|
|
# Must not raise despite the notifier exploding on a triggered alert.
|
|
dog.tick()
|
|
|
|
|
|
def test_per_tick_loop_survives_raising_tick(monkeypatch):
|
|
# The __main__ run loop must survive a tick that raises (outer never-raise).
|
|
from watchdog import __main__ as entry
|
|
|
|
cfg = _cfg(WATCHDOG_INTERVAL_S="0")
|
|
|
|
class _BoomDog:
|
|
def tick(self):
|
|
raise RuntimeError("tick blew up")
|
|
|
|
monkeypatch.setattr(entry, "Watchdog", lambda c: _BoomDog())
|
|
monkeypatch.setattr(entry.time, "sleep", lambda *_: None)
|
|
# max_ticks bounds the loop; it must return cleanly, not propagate.
|
|
entry.run(cfg=cfg, max_ticks=3)
|