Add the `watchdog/` package (thin Python-3.12 stdlib-only daemon) and the `orchestrator-watchdog` compose service — the brain half of the domain-0 observability pair. F1a (ORCH-099) exposes GET /metrics raw signal; F1b reads it, augments with host / container / dependency probes, runs each signal through a generalised pure decision function (decide(signal_active, prev, now, cooldown), a strict superset of disk_watchdog.decide_action) with per-signal in-memory dedup/throttle/recovery, and alerts over its OWN independent Telegram channel. Key properties (ADR-001): - Observer separated from observed: separate container; /metrics not answering is itself the master `orch_down` alarm (debounced K ticks — no flap on a hiccup). - Strictly read-only: docker.sock GET-only + mounted :ro (double guard), host paths :ro, no DB/disk writes, no process control — self-hosting-safe. - never-raise on three levels (per-source/per-tick/per-send) + WATCHDOG_ENABLED kill-switch (disabled -> inert idle-loop, not exit). - Disk anti-duplicate (D6): disk_watchdog (ORCH-063) stays sole owner of the 85% alert; sidecar carries orch_down + an opt-in 97% ceiling (default off). - NO import from src/** (C-1); src/**, STAGE_TRANSITIONS, QG_CHECKS, check_*, DB schema — untouched. env_file optional so a missing .env.watchdog never breaks `docker compose up` for the prod orchestrator. Tests: tests/watchdog/ (TC-01…TC-13) + full tests/ regression green (TC-14). Docs: CHANGELOG, .env.example canon (WATCHDOG_*); architecture README + adr-0033 authored at the architecture stage. Refs: ORCH-100 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
119 lines
4.6 KiB
Python
119 lines
4.6 KiB
Python
"""TC-11: tolerance to the /metrics contract.
|
|
|
|
Unknown fields are ignored, a missing optional does not crash, and a
|
|
schema_version above the known one logs a warning (no crash). Also covers the
|
|
envelope-derived signal evaluation (agent_hung / stage_stuck / job_failed /
|
|
queue_depth).
|
|
"""
|
|
import logging
|
|
|
|
from watchdog.collectors import orch as orch_mod
|
|
from watchdog.config import Config
|
|
from watchdog.signals import AgentSample, eval_envelope
|
|
|
|
|
|
def _cfg(**kw):
    """Build a ``Config`` for a test.

    The keyword arguments are collected into one dict and handed to
    ``Config.from_env`` unchanged, so a test can spell its settings inline,
    e.g. ``_cfg(WATCHDOG_QUEUE_DEPTH="20")``.
    """
    overrides = kw
    return Config.from_env(overrides)
|
|
|
|
|
|
def test_unknown_field_ignored():
    """An unrecognised top-level key does not make ``parse_envelope`` fail."""
    payload = '{"schema_version":1,"stages":[],"brand_new_field":42}'
    parsed = orch_mod.parse_envelope(payload)
    # The extra key is tolerated: it is still readable on the parsed result.
    assert parsed["brand_new_field"] == 42
|
|
|
|
|
|
def test_missing_optional_not_an_error():
    """An envelope holding only ``schema_version`` evaluates to zero signals."""
    minimal = orch_mod.parse_envelope('{"schema_version":1}')
    result = eval_envelope(minimal, _cfg(), prev_agents={}, prev_failed=None)
    # No stages / agents / queue sections -> nothing to report, and no crash.
    assert result.signals == []
|
|
|
|
|
|
def test_non_object_body_raises_valueerror():
    """A JSON array body (not an object) is rejected with ``ValueError``."""
    # Function-local import: no other test visible in this module uses pytest.
    import pytest

    array_body = "[1,2,3]"
    with pytest.raises(ValueError):
        orch_mod.parse_envelope(array_body)
|
|
|
|
|
|
def test_schema_version_bump_warns(caplog):
    """A ``schema_version`` of 999 is reported at WARNING severity or above.

    ``check_schema_version`` must not raise; it is expected to log a record
    that mentions ``schema_version``.

    Args:
        caplog: pytest's log-capture fixture.
    """
    env = {"schema_version": 999}
    with caplog.at_level(logging.WARNING):
        orch_mod.check_schema_version(env)
    # getMessage() is always available on a LogRecord, whereas the ``message``
    # attribute only exists once a Formatter has processed the record. The
    # explicit level check pins the "warns" part of the contract instead of
    # relying on capture-level filtering alone.
    assert any(
        r.levelno >= logging.WARNING and "schema_version" in r.getMessage()
        for r in caplog.records
    )
|
|
|
|
|
|
def test_parse_generated_at_roundtrip_and_tolerant():
    """``parse_generated_at`` is truthy for a valid stamp and ``None`` otherwise."""
    parse = orch_mod.parse_generated_at

    well_formed = parse({"generated_at": "2026-06-10T00:00:00Z"})
    assert well_formed

    malformed = parse({"generated_at": "garbage"})
    assert malformed is None

    absent = parse({})
    assert absent is None
|
|
|
|
|
|
def test_queue_depth_and_job_failed_signals():
    """Queue depth fires immediately; ``job_failed`` fires only on a later rise.

    Two consecutive evaluations are run. The second receives the first one's
    ``failed_count`` as ``prev_failed``.
    """
    cfg = _cfg(WATCHDOG_QUEUE_DEPTH="20")

    # Tick 1: depth 25 exceeds the configured 20; the failed count has no
    # earlier baseline to compare against, so no job_failed edge yet.
    first_env = {
        "schema_version": 1,
        "queue": {"depth": 25, "counts": {"failed": 5}},
    }
    first = eval_envelope(first_env, cfg, prev_agents={}, prev_failed=None)
    seen_keys = {sig.key for sig in first.signals}
    assert "queue_depth" in seen_keys
    assert "job_failed" not in seen_keys
    assert first.failed_count == 5

    # Tick 2: failed count grew 5 -> 7, which must yield one job_failed edge.
    second_env = {"queue": {"depth": 0, "counts": {"failed": 7}}}
    second = eval_envelope(
        second_env, cfg, prev_agents={}, prev_failed=first.failed_count
    )
    job_failed = [sig for sig in second.signals if sig.key == "job_failed"]
    assert len(job_failed) == 1
    assert job_failed[0].edge is True
    assert job_failed[0].active is True
|
|
|
|
|
|
def test_stage_stuck_signal():
    """A stage whose age exceeds the stuck threshold yields one active signal."""
    cfg = _cfg(WATCHDOG_STAGE_STUCK_MIN="1")  # 60s threshold
    stage_row = {"work_item": "ORCH-1", "stage": "review", "age_in_stage_s": 9999}
    result = eval_envelope(
        {"stages": [stage_row]}, cfg, prev_agents={}, prev_failed=None
    )
    # Stage-stuck signals are keyed by (signal name, work item).
    wanted_key = ("stage_stuck", "ORCH-1")
    matches = [sig for sig in result.signals if sig.key == wanted_key]
    assert len(matches) == 1
    assert matches[0].active is True
|
|
|
|
|
|
def test_agent_hung_needs_two_polls_and_low_cpu():
    """With a previous sample and a tiny CPU delta, the agent is flagged hung."""
    cfg = _cfg(WATCHDOG_AGENT_HUNG_MIN="1", WATCHDOG_AGENT_CPU_FLOOR="0.01")

    earlier = orch_mod.parse_generated_at({"generated_at": "2026-06-10T00:00:00Z"})
    previous_samples = {7: AgentSample(cpu_ticks=40, generated_at=earlier)}

    agent_row = {"run_id": 7, "agent": "developer", "runtime_s": 999, "cpu_ticks": 50}
    envelope = {
        "schema_version": 1,
        "generated_at": "2026-06-10T00:01:40Z",  # 100s after the previous sample
        "clk_tck": 100,
        "agents": [agent_row],
    }

    # Delta is 50 - 40 = 10 ticks; at clk_tck=100 that is 0.1 CPU-seconds
    # spread over 100s, i.e. a fraction of 0.001, below the 0.01 floor.
    result = eval_envelope(
        envelope, cfg, prev_agents=previous_samples, prev_failed=None
    )
    hung = [sig for sig in result.signals if sig.key == ("agent_hung", 7)]
    assert len(hung) == 1
    assert hung[0].active is True
|
|
|
|
|
|
def test_agent_hung_skipped_when_cpu_ticks_null():
    """A ``None`` ``cpu_ticks`` reading produces no ``agent_hung`` signal.

    A previous sample for the same run id is supplied, so the only thing
    missing is the current tick count.
    """
    cfg = _cfg(WATCHDOG_AGENT_HUNG_MIN="1")
    agent_row = {"run_id": 8, "runtime_s": 999, "cpu_ticks": None}
    envelope = {
        "generated_at": "2026-06-10T00:01:40Z",
        "clk_tck": 100,
        "agents": [agent_row],
    }
    previous_samples = {8: AgentSample(cpu_ticks=10, generated_at=0.0)}

    result = eval_envelope(
        envelope, cfg, prev_agents=previous_samples, prev_failed=None
    )
    assert not any(sig.key == ("agent_hung", 8) for sig in result.signals)
|
|
|
|
|
|
def test_agent_busy_not_hung():
    """An agent that burned plenty of CPU between polls is not flagged hung."""
    cfg = _cfg(WATCHDOG_AGENT_HUNG_MIN="1", WATCHDOG_AGENT_CPU_FLOOR="0.01")

    earlier = orch_mod.parse_generated_at({"generated_at": "2026-06-10T00:00:00Z"})
    previous_samples = {9: AgentSample(cpu_ticks=40, generated_at=earlier)}

    agent_row = {"run_id": 9, "runtime_s": 999, "cpu_ticks": 5000}
    envelope = {
        "generated_at": "2026-06-10T00:01:40Z",
        "clk_tck": 100,
        "agents": [agent_row],
    }

    # Large tick delta (40 -> 5000) means a high CPU fraction, so not hung.
    result = eval_envelope(
        envelope, cfg, prev_agents=previous_samples, prev_failed=None
    )
    assert not any(sig.key == ("agent_hung", 9) for sig in result.signals)
|