Close the observability gap between agent_hung (which covers only jobs tracked
by jobs.pid) and orphaned pytest subprocesses the orchestrator launches itself
(merge_gate.retest_branch / coverage_gate.measure_coverage). On a timeout-kill of
the agent (-9, ORCH-109) the grand-child pytest reparents onto tini and keeps
running for days, starving CPU and failing merge-gate re-test — with no alert.
Strictly inside the observer (watchdog/** + the watchdog compose service):
- watchdog/collectors/proc.py: stdlib-only /proc scan (under pid: host),
read-only, never-raise -> []; pure parsers split from I/O (tested on a fake
/proc tree). Never reads /proc/<pid>/environ.
- watchdog/signals.py: pure proc_signals builder, per-entity
("proc_blocking", pid), active iff age_s > proc_age_s; actionable RU detail.
- watchdog/core.py: opt-in tick block (gated on proc_enabled -> zero overhead /
byte-for-byte when off) + RECOVERY synthesis for a vanished process through the
existing decide()/AlertState (no new anti-spam logic).
- watchdog/config.py: WATCHDOG_PROC_{ENABLED(false),AGE_MIN(60),PATTERNS(pytest),
COOLDOWN_S(1800)}; default threshold > max(merge_retest_timeout_s=600,
coverage_run_timeout_s=900) so a legit in-flight run never crosses it.
- docker-compose.yml: pid: host on orchestrator-watchdog ONLY (read-only privilege).
Anti-false-positive and no overlap with agent_hung are by construction (cmdline
scope + age threshold), not fragile cross-namespace PID matching.
Canon synced: WATCHDOG_PROC_* in .env.watchdog.example <-> .env.example block;
documented in LITE_SETUP.md and docs/architecture/README.md (architect). src/**,
/metrics, schema_version, STAGE_TRANSITIONS, QG_CHECKS, check_*, machine-verdict
and the DB schema are untouched; deploy rebuilds only the sidecar, prod
orchestrator is not restarted (NFR-3).
Tests: tests/watchdog/test_proc_blocking_signal.py (TC-01..TC-06),
test_proc_collector.py (/proc parsing), test_tick_proc_blocking_integration.py
(TC-07), plus pid: host and proc-config assertions. Full pytest tests/ green (1930).
Refs: ORCH-111
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
129 lines
4.1 KiB
Python
129 lines
4.1 KiB
Python
"""ORCH-111 TC-07: full tick -> dispatch of the proc_blocking alert (integration).

REGRESS: ``Watchdog.tick()`` with a collector that returns a long-lived blocking
process must dispatch exactly one ``proc_blocking`` alert through the fake
Notifier — even though ``/metrics`` reports no ``stuck`` stage and no hung agent.
With the kill-switch OFF the path is inert (byte-for-byte as before ORCH-111).

The orchestrator ``/metrics`` envelope is stubbed healthy so ONLY the new signal
can fire; the proc collector is stubbed at the module boundary so the real
``_collect_proc`` gate + wrapper still execute.
"""
|
|
from watchdog.collectors import orch as orch_mod
|
|
from watchdog.collectors import proc as proc_mod
|
|
from watchdog.config import Config
|
|
from watchdog.core import Watchdog
|
|
|
|
|
|
class _Notifier:
|
|
def __init__(self):
|
|
self.sent = []
|
|
|
|
def send(self, text):
|
|
self.sent.append(text)
|
|
return True
|
|
|
|
|
|
class _StubDocker:
|
|
def inspect(self, name):
|
|
return {"State": {"Status": "running"}}
|
|
|
|
|
|
def _healthy_metrics(monkeypatch):
    """Replace ``orch_mod.fetch_metrics`` with a stub returning a fixed envelope.

    The envelope carries no agents, no stages, an empty queue and zero failed
    jobs, wrapped in a successful ``FetchResult``. The stub accepts and
    ignores any arguments.
    """
    envelope = dict(
        schema_version=1,
        generated_at="2026-06-15T00:00:00Z",
        clk_tck=100,
        agents=[],
        stages=[],
        queue={"depth": 0, "counts": {"failed": 0}},
    )

    def _fetch(*args, **kwargs):
        # The same envelope object is handed back on every call.
        return orch_mod.FetchResult(ok=True, envelope=envelope)

    monkeypatch.setattr(orch_mod, "fetch_metrics", _fetch)
|
|
|
|
|
|
def _cfg(**kw):
    """Build a ``Config`` from a baseline test environment.

    Keyword arguments override (or extend) the baseline entries before the
    mapping is passed to ``Config.from_env``.
    """
    env = dict(
        WATCHDOG_TG_BOT_TOKEN="t",
        WATCHDOG_TG_CHAT_ID="c",
        WATCHDOG_PROC_ENABLED="true",
        WATCHDOG_PROC_AGE_MIN="60",  # proc_age_s == 3600
        WATCHDOG_CONTAINERS="orchestrator",
    )
    env.update(kw)
    return Config.from_env(env)
|
|
|
|
|
|
def _blocking(monkeypatch, age_s=7200.0):
    """Stub ``proc_mod.collect_candidates`` to return one pytest process record.

    ``age_s`` sets the record's age in seconds; the remaining fields are fixed.
    The stub accepts and ignores any arguments.
    """
    record = dict(
        pid=4242,
        cmdline="python3 -m pytest tests/test_install_lite_script.py",
        age_s=age_s,
        cpu_s=99999.0,
        start_ticks=1,
    )

    def _candidates(*args, **kwargs):
        # A fresh list per call, always wrapping the same record.
        return [record]

    monkeypatch.setattr(proc_mod, "collect_candidates", _candidates)
|
|
|
|
|
|
def _proc_alerts(notifier):
|
|
return [m for m in notifier.sent if "Блокирующий процесс" in m]
|
|
|
|
|
|
def test_tc07_tick_dispatches_proc_blocking_alert(monkeypatch):
    """One tick with a 7200 s old pytest process sends exactly one proc alert."""
    _healthy_metrics(monkeypatch)
    _blocking(monkeypatch)
    sink = _Notifier()
    watchdog = Watchdog(
        _cfg(),
        notifier=sink,
        docker=_StubDocker(),
        now_provider=lambda: 0.0,
    )

    watchdog.tick()

    matched = _proc_alerts(sink)
    assert len(matched) == 1
    alert = matched[0]
    # The alert must name both the pid and the matched command.
    for needle in ("4242", "pytest"):
        assert needle in alert
    assert alert.startswith("\U0001f534")  # red ALERT prefix
|
|
|
|
|
|
def test_tc07_killswitch_off_dispatches_nothing(monkeypatch):
    """With ``WATCHDOG_PROC_ENABLED=false`` the collector is never called and nothing is sent."""
    _healthy_metrics(monkeypatch)
    # Even if the collector WOULD return a blocking process, the gate skips it.
    invocations = []

    def _boom(*a, **k):
        invocations.append(1)
        return [{"pid": 1, "cmdline": "pytest", "age_s": 9e9, "cpu_s": 0.0}]

    monkeypatch.setattr(proc_mod, "collect_candidates", _boom)
    sink = _Notifier()
    disabled_cfg = _cfg(WATCHDOG_PROC_ENABLED="false")
    watchdog = Watchdog(
        disabled_cfg,
        notifier=sink,
        docker=_StubDocker(),
        now_provider=lambda: 0.0,
    )

    watchdog.tick()

    assert _proc_alerts(sink) == []
    # collector never invoked when disabled (zero overhead)
    assert len(invocations) == 0
|
|
|
|
|
|
def test_tc07_in_budget_process_does_not_alert(monkeypatch):
    """A 600 s old process, with ``WATCHDOG_PROC_AGE_MIN`` at 60, raises no alert (AC-4)."""
    # A process below the threshold (legitimate in-flight run) -> no alert.
    _healthy_metrics(monkeypatch)
    _blocking(monkeypatch, age_s=600.0)
    sink = _Notifier()
    watchdog = Watchdog(
        _cfg(),
        notifier=sink,
        docker=_StubDocker(),
        now_provider=lambda: 0.0,
    )

    watchdog.tick()

    assert _proc_alerts(sink) == []
|
|
|
|
|
|
def test_tc07_tick_never_raises_when_collector_explodes(monkeypatch):
    """A collector that raises must not propagate out of ``tick()`` nor produce a proc alert."""
    _healthy_metrics(monkeypatch)

    def _explode(*a, **k):
        raise RuntimeError("boom")

    monkeypatch.setattr(proc_mod, "collect_candidates", _explode)
    sink = _Notifier()
    watchdog = Watchdog(
        _cfg(),
        notifier=sink,
        docker=_StubDocker(),
        now_provider=lambda: 0.0,
    )

    # must not raise — collector error degrades to one skipped signal
    watchdog.tick()

    assert _proc_alerts(sink) == []
|