Close the observability gap between agent_hung (which only tracks jobs by jobs.pid)
and orphaned pytest subprocesses that the orchestrator launches itself
(merge_gate.retest_branch / coverage_gate.measure_coverage). On a timeout-kill of
the agent (-9, ORCH-109) the grand-child pytest reparents onto tini and keeps
running for days, starving CPU and failing merge-gate re-test — with no alert.
Strictly inside the observer (watchdog/** + the watchdog compose service):
- watchdog/collectors/proc.py: stdlib-only /proc scan (under pid: host),
read-only, never-raise -> []; pure parsers split from I/O (tested on a fake
/proc tree). Never reads /proc/<pid>/environ.
- watchdog/signals.py: pure proc_signals builder, per-entity
("proc_blocking", pid), active iff age_s > proc_age_s; actionable RU detail.
- watchdog/core.py: opt-in tick block (gated on proc_enabled -> zero overhead /
byte-for-byte when off) + RECOVERY synthesis for a vanished process through the
existing decide()/AlertState (no new anti-spam logic).
- watchdog/config.py: WATCHDOG_PROC_{ENABLED(false),AGE_MIN(60),PATTERNS(pytest),
COOLDOWN_S(1800)}; default threshold > max(merge_retest_timeout_s=600,
coverage_run_timeout_s=900) so a legit in-flight run never crosses it.
- docker-compose.yml: pid: host on orchestrator-watchdog ONLY (read-only privilege).
Anti-false-positive and no overlap with agent_hung are by construction (cmdline
scope + age threshold), not fragile cross-namespace PID matching.
Canon synced: WATCHDOG_PROC_* in .env.watchdog.example <-> .env.example block;
documented in LITE_SETUP.md and docs/architecture/README.md (architect). src/**,
/metrics, schema_version, STAGE_TRANSITIONS, QG_CHECKS, check_*, machine-verdict
and the DB schema are untouched; deploy rebuilds only the sidecar, prod
orchestrator is not restarted (NFR-3).
Tests: tests/watchdog/test_proc_blocking_signal.py (TC-01..TC-06),
test_proc_collector.py (/proc parsing), test_tick_proc_blocking_integration.py
(TC-07), plus pid: host and proc-config assertions. Full pytest tests/ green (1930).
Refs: ORCH-111
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
102 lines
3.5 KiB
Python
102 lines
3.5 KiB
Python
"""TC-07: kill-switch + env-driven config (no hardcoded thresholds).

``WATCHDOG_ENABLED=false`` -> the daemon is inert (idle, no ticks). Thresholds /
intervals / timeouts come from env, not constants.
"""
|
|
from watchdog.config import Config
|
|
|
|
|
|
def test_killswitch_off_is_inert(monkeypatch):
    """With ``WATCHDOG_ENABLED=false`` the entry point's run() must never tick.

    ``Watchdog`` and ``time.sleep`` on the entry module are replaced with
    stubs, so any tick performed while disabled is recorded and the test
    does not wait.
    """
    from watchdog import __main__ as entry

    disabled = Config.from_env(
        {"WATCHDOG_ENABLED": "false", "WATCHDOG_INTERVAL_S": "0"}
    )
    assert disabled.enabled is False

    tick_calls = []

    class _Dog:
        def tick(self):
            tick_calls.append(1)

    def _make_dog(c):
        # Stand-in for the real Watchdog constructor; ignores the config.
        return _Dog()

    def _skip_sleep(*_):
        return None

    # If run() ticked a Watchdog while disabled, tick_calls would be non-empty.
    monkeypatch.setattr(entry, "Watchdog", _make_dog)
    monkeypatch.setattr(entry.time, "sleep", _skip_sleep)

    entry.run(cfg=disabled, max_ticks=3)

    assert len(tick_calls) == 0  # inert: never ticked
|
|
|
|
|
|
def test_thresholds_read_from_env():
    """Each tunable supplied in the env mapping is reflected on the Config."""
    env = {
        "WATCHDOG_INTERVAL_S": "7",
        "WATCHDOG_MEM_PCT": "77",
        "WATCHDOG_QUEUE_DEPTH": "9",
        "WATCHDOG_AGENT_HUNG_MIN": "5",
        "WATCHDOG_STAGE_STUCK_MIN": "11",
        "WATCHDOG_ORCH_DOWN_TICKS": "4",
        "WATCHDOG_COOLDOWN_S": "60",
        "WATCHDOG_HTTP_TIMEOUT_S": "2",
        "WATCHDOG_CONTAINERS": "orchestrator,plane-app",
        "WATCHDOG_DEPS": "gitea=http://g/healthz,plane=http://p/",
    }
    cfg = Config.from_env(env)

    # Config attribute -> value expected after parsing. The *_MIN inputs are
    # checked against the *_s attributes as the input multiplied by 60.
    expected = {
        "interval_s": 7.0,
        "mem_pct": 77.0,
        "queue_depth": 9,
        "agent_hung_s": 5 * 60.0,
        "stage_stuck_s": 11 * 60.0,
        "orch_down_ticks": 4,
        "cooldown_s": 60.0,
        "http_timeout_s": 2.0,
        "containers": ["orchestrator", "plane-app"],
        "deps": {"gitea": "http://g/healthz", "plane": "http://p/"},
    }
    for attr, want in expected.items():
        assert getattr(cfg, attr) == want, attr
|
|
|
|
|
|
def test_defaults_when_env_absent():
    """An empty env mapping yields the built-in defaults checked below."""
    defaults = Config.from_env({})

    assert defaults.enabled is True
    assert defaults.interval_s == 30.0

    metrics_suffix = ":8500/metrics"
    assert defaults.metrics_url.endswith(metrics_suffix)

    assert defaults.disk_crit_enabled is False
    assert defaults.containers == ["orchestrator"]
    assert defaults.deps == {}
|
|
|
|
|
|
def test_malformed_env_degrades_to_default():
    """Unparseable or empty numeric values fall back to defaults, not errors."""
    # Garbage ("abc") and empty ("") numerics must not crash config parsing.
    bad_env = {"WATCHDOG_INTERVAL_S": "abc", "WATCHDOG_MEM_PCT": ""}
    cfg = Config.from_env(bad_env)

    observed = (cfg.interval_s, cfg.mem_pct)
    assert observed[0] == 30.0
    assert observed[1] == 90.0
|
|
|
|
|
|
# -- ORCH-111: proc_blocking config (kill-switch default-off + safe threshold) --
|
|
def test_proc_blocking_defaults_off_and_safe():
    """proc_blocking is opt-in by default and its age threshold is safe."""
    defaults = Config.from_env({})

    assert defaults.proc_enabled is False  # opt-in (needs `pid: host`)
    assert defaults.proc_patterns == ["pytest"]
    assert defaults.proc_cooldown_s == 1800.0

    # Cross-invariant (adr-0041 D2): the default age threshold MUST exceed the
    # longest legitimate test-run budget, i.e.
    # max(merge_retest_timeout_s=600, coverage=900).
    longest_legit_run_s = max(600, 900)
    assert defaults.proc_age_s > longest_legit_run_s
|
|
|
|
|
|
def test_proc_blocking_thresholds_read_from_env():
    """All four WATCHDOG_PROC_* settings are taken from the env mapping."""
    env = {
        "WATCHDOG_PROC_ENABLED": "true",
        "WATCHDOG_PROC_AGE_MIN": "45",
        "WATCHDOG_PROC_PATTERNS": "pytest,coverage run",
        "WATCHDOG_PROC_COOLDOWN_S": "900",
    }
    cfg = Config.from_env(env)

    assert cfg.proc_enabled is True
    # AGE_MIN is supplied in minutes; proc_age_s is checked in seconds.
    age_minutes = 45
    assert cfg.proc_age_s == age_minutes * 60.0
    assert cfg.proc_patterns == ["pytest", "coverage run"]
    assert cfg.proc_cooldown_s == 900.0
|
|
|
|
|
|
def test_proc_blocking_malformed_env_degrades():
    """Malformed WATCHDOG_PROC_* values degrade to the defaults.

    A non-numeric ``WATCHDOG_PROC_AGE_MIN`` must fall back to the default of
    60 minutes, and an empty ``WATCHDOG_PROC_ENABLED`` must leave the feature
    off.
    """
    cfg = Config.from_env(
        {"WATCHDOG_PROC_AGE_MIN": "nope", "WATCHDOG_PROC_ENABLED": ""}
    )
    # The age threshold is read from `proc_age_s` (seconds), the same
    # attribute the other proc_blocking tests in this file assert on, so the
    # 60-minute default is checked as 60 * 60 seconds.
    assert cfg.proc_age_s == 60 * 60.0
    assert cfg.proc_enabled is False
|