Files
orchestrator/tests/watchdog/test_never_raise.py
claude-bot 259b507906 feat(watchdog): sidecar-watchdog F1b — monitoring brain in a separate container (ORCH-100)
Add the `watchdog/` package (thin Python-3.12 stdlib-only daemon) and the
`orchestrator-watchdog` compose service — the brain half of the domain-0
observability pair. F1a (ORCH-099) exposes GET /metrics raw signal; F1b reads it,
augments with host / container / dependency probes, runs each signal through a
generalised pure decision function (decide(signal_active, prev, now, cooldown),
a strict superset of disk_watchdog.decide_action) with per-signal in-memory
dedup/throttle/recovery, and alerts over its OWN independent Telegram channel.

Key properties (ADR-001):
- Observer separated from observed: separate container; /metrics not answering is
  itself the master `orch_down` alarm (debounced K ticks — no flap on a hiccup).
- Strictly read-only: docker.sock GET-only + mounted :ro (double guard), host
  paths :ro, no DB/disk writes, no process control — self-hosting-safe.
- never-raise on three levels (per-source/per-tick/per-send) + WATCHDOG_ENABLED
  kill-switch (disabled -> inert idle-loop, not exit).
- Disk anti-duplicate (D6): disk_watchdog (ORCH-063) stays sole owner of the 85%
  alert; sidecar carries orch_down + an opt-in 97% ceiling (default off).
- NO import from src/** (C-1); src/**, STAGE_TRANSITIONS, QG_CHECKS, check_*, DB
  schema — untouched. env_file optional so a missing .env.watchdog never breaks
  `docker compose up` for the prod orchestrator.

Tests: tests/watchdog/ (TC-01…TC-13) + full tests/ regression green (TC-14).
Docs: CHANGELOG, .env.example canon (WATCHDOG_*); architecture README + adr-0033
authored at the architecture stage.

Refs: ORCH-100

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-10 09:36:02 +03:00

89 lines
2.8 KiB
Python

"""TC-06: three-level never-raise.
A raising collector (host / containers / deps) degrades ONE signal and the tick
reaches the end collecting the rest; a raising send is swallowed; the daemon
loop survives a raising tick.
"""
from watchdog.config import Config
from watchdog.core import Watchdog
class _BoomDocker:
def inspect(self, name):
raise RuntimeError("docker socket blew up")
class _Notifier:
def __init__(self):
self.sent = []
def send(self, text):
self.sent.append(text)
return True
class _BoomNotifier:
def send(self, text):
raise RuntimeError("telegram blew up")
def _cfg(**kw):
    """Build a ``Config`` from a minimal env mapping.

    ``kw`` entries are merged after the base mapping, so a keyword with the
    same name as a base key replaces the base value.
    """
    base = {
        "WATCHDOG_TG_BOT_TOKEN": "t",
        "WATCHDOG_TG_CHAT_ID": "c",
        "WATCHDOG_CONTAINERS": "orchestrator",
    }
    return Config.from_env({**base, **kw})
def _good_fetch_patch(dog, monkeypatch):
    """Patch ``orch.fetch_metrics`` to return a successful, empty envelope.

    ``dog`` is accepted but not used by this helper; only ``monkeypatch`` is
    needed to install the replacement on the collector module.
    """
    from watchdog.collectors import orch as orch_mod

    # Envelope with no agents, no stages and an empty queue.
    env = {
        "schema_version": 1,
        "generated_at": "2026-06-10T00:00:00Z",
        "clk_tck": 100,
        "agents": [],
        "stages": [],
        "queue": {"depth": 0, "counts": {"failed": 0}},
    }
    monkeypatch.setattr(
        orch_mod,
        "fetch_metrics",
        lambda *a, **k: orch_mod.FetchResult(ok=True, envelope=env),
    )
def test_per_source_broken_container_degrades_one_signal(monkeypatch):
    """Per-source never-raise: a raising docker client does not abort the tick."""
    notifier = _Notifier()
    dog = Watchdog(_cfg(), notifier=notifier, docker=_BoomDocker())
    _good_fetch_patch(dog, monkeypatch)

    # Should not raise; tick completes and produces results for other sources.
    results = dog.tick()

    # Each result is unpacked as a pair; the second element's ``key`` is
    # collected (``None`` when the attribute is absent).
    keys = [getattr(s, "key", None) for _, s in results]

    # orch_down evaluated (orch was up -> not active) and container evaluated.
    assert "orch_down" in keys
    assert ("container_down", "orchestrator") in keys
def test_per_send_failure_is_swallowed(monkeypatch):
    """Per-send never-raise: a raising notifier must not break the tick."""
    cfg = _cfg(WATCHDOG_MEM_PCT="0")  # mem >= 0 always -> force an alert send
    dog = Watchdog(cfg, notifier=_BoomNotifier(), docker=_BoomDocker())
    _good_fetch_patch(dog, monkeypatch)

    # Pin the host memory reading so the result does not depend on the machine.
    monkeypatch.setattr(
        "watchdog.collectors.host.read_mem_used_pct", lambda *a, **k: 50.0
    )

    # Must not raise despite the notifier exploding on a triggered alert.
    dog.tick()
def test_per_tick_loop_survives_raising_tick(monkeypatch):
    """Per-tick never-raise: the ``__main__`` run loop survives a raising tick."""
    from watchdog import __main__ as entry

    cfg = _cfg(WATCHDOG_INTERVAL_S="0")

    class _BoomDog:
        """Watchdog stand-in whose every tick fails."""

        def tick(self):
            raise RuntimeError("tick blew up")

    # Swap in the failing watchdog and make sleeping a no-op.
    monkeypatch.setattr(entry, "Watchdog", lambda c: _BoomDog())
    monkeypatch.setattr(entry.time, "sleep", lambda *_: None)

    # max_ticks bounds the loop; it must return cleanly, not propagate.
    entry.run(cfg=cfg, max_ticks=3)