feat(watchdog): proc_blocking alert for orphaned long-lived test processes
Close the observability gap between agent_hung (which covers only jobs tracked via jobs.pid)
and orphaned pytest subprocesses the orchestrator launches itself
(merge_gate.retest_branch / coverage_gate.measure_coverage). On a timeout-kill of
the agent (-9, ORCH-109) the grand-child pytest reparents onto tini and keeps
running for days, starving CPU and failing merge-gate re-test — with no alert.
Strictly inside the observer (watchdog/** + the watchdog compose service):
- watchdog/collectors/proc.py: stdlib-only /proc scan (under pid: host),
read-only, never-raise -> []; pure parsers split from I/O (tested on a fake
/proc tree). Never reads /proc/<pid>/environ.
- watchdog/signals.py: pure proc_signals builder, per-entity
("proc_blocking", pid), active iff age_s > proc_age_s; actionable RU detail.
- watchdog/core.py: opt-in tick block (gated on proc_enabled -> zero overhead /
byte-for-byte when off) + RECOVERY synthesis for a vanished process through the
existing decide()/AlertState (no new anti-spam logic).
- watchdog/config.py: WATCHDOG_PROC_{ENABLED(false),AGE_MIN(60),PATTERNS(pytest),
COOLDOWN_S(1800)}; default threshold > max(merge_retest_timeout_s=600,
coverage_run_timeout_s=900) so a legit in-flight run never crosses it.
- docker-compose.yml: pid: host on orchestrator-watchdog ONLY (read-only privilege).
Anti-false-positive and no overlap with agent_hung are by construction (cmdline
scope + age threshold), not fragile cross-namespace PID matching.
Canon synced: WATCHDOG_PROC_* in .env.watchdog.example <-> .env.example block;
documented in LITE_SETUP.md and docs/architecture/README.md (architect). src/**,
/metrics, schema_version, STAGE_TRANSITIONS, QG_CHECKS, check_*, machine-verdict
and the DB schema are untouched; deploy rebuilds only the sidecar, prod
orchestrator is not restarted (NFR-3).
Tests: tests/watchdog/test_proc_blocking_signal.py (TC-01..TC-06),
test_proc_collector.py (/proc parsing), test_tick_proc_blocking_integration.py
(TC-07), plus pid: host and proc-config assertions. Full pytest tests/ green (1930).
Refs: ORCH-111
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -48,6 +48,17 @@ def test_host_paths_mounted_read_only():
|
||||
assert v.endswith(":ro"), f"watchdog mount must be :ro: {v}"
|
||||
|
||||
|
||||
def test_watchdog_shares_host_pid_namespace():
    """ORCH-111 (adr-0041 D6): only the watchdog sidecar declares ``pid: host``.

    Sharing the host PID-namespace makes the sidecar's /proc reflect the host
    (needed by the proc_blocking collector). ``pid: host`` is not a volume, so
    the read-only-mounts invariant is unaffected.
    """
    watchdog_service = _compose()["services"]["orchestrator-watchdog"]
    assert watchdog_service.get("pid") == "host", "orchestrator-watchdog must declare `pid: host`"
    # The privilege belongs to the observer alone; the prod orchestrator must not get it.
    orchestrator_service = _compose()["services"]["orchestrator"]
    assert "pid" not in orchestrator_service, "the prod orchestrator service must not share the host PID-namespace"
|
||||
|
||||
|
||||
def test_env_file_is_optional():
|
||||
# A missing .env.watchdog must not break `docker compose up` (self-hosting).
|
||||
wd = _compose()["services"]["orchestrator-watchdog"]
|
||||
|
||||
@@ -67,3 +67,35 @@ def test_malformed_env_degrades_to_default():
|
||||
cfg = Config.from_env({"WATCHDOG_INTERVAL_S": "abc", "WATCHDOG_MEM_PCT": ""})
|
||||
assert cfg.interval_s == 30.0
|
||||
assert cfg.mem_pct == 90.0
|
||||
|
||||
|
||||
# -- ORCH-111: proc_blocking config (kill-switch default-off + safe threshold) --
|
||||
def test_proc_blocking_defaults_off_and_safe():
    """With an empty environment proc_blocking is disabled and its defaults are safe."""
    defaults = Config.from_env({})
    # Opt-in feature (it needs `pid: host`), so the switch defaults to off.
    assert defaults.proc_enabled is False
    assert defaults.proc_patterns == ["pytest"]
    assert defaults.proc_cooldown_s == 1800.0
    # Cross-invariant (adr-0041 D2): the default age threshold MUST exceed the
    # largest legitimate test-run budget,
    # max(merge_retest_timeout_s=600, coverage=900).
    assert defaults.proc_age_s > 900.0
|
||||
|
||||
|
||||
def test_proc_blocking_thresholds_read_from_env():
    """Each WATCHDOG_PROC_* variable overrides its default when it is set."""
    env = {
        "WATCHDOG_PROC_ENABLED": "true",
        "WATCHDOG_PROC_AGE_MIN": "45",
        "WATCHDOG_PROC_PATTERNS": "pytest,coverage run",
        "WATCHDOG_PROC_COOLDOWN_S": "900",
    }
    loaded = Config.from_env(env)
    assert loaded.proc_enabled is True
    # AGE_MIN=45 is expected to surface as 45 * 60 seconds in proc_age_s.
    assert loaded.proc_age_s == 45 * 60.0
    # The comma-separated pattern string becomes a list of patterns.
    assert loaded.proc_patterns == ["pytest", "coverage run"]
    assert loaded.proc_cooldown_s == 900.0
|
||||
|
||||
|
||||
def test_proc_blocking_malformed_env_degrades():
    """Unparseable or empty proc_blocking values fall back to the defaults."""
    malformed = {"WATCHDOG_PROC_AGE_MIN": "nope", "WATCHDOG_PROC_ENABLED": ""}
    cfg = Config.from_env(malformed)
    assert cfg.proc_age_min == 60.0
    assert cfg.proc_enabled is False
|
||||
|
||||
256
tests/watchdog/test_proc_blocking_signal.py
Normal file
256
tests/watchdog/test_proc_blocking_signal.py
Normal file
@@ -0,0 +1,256 @@
|
||||
"""ORCH-111 TC-01…TC-06: the proc_blocking signal builder + decision surface.
|
||||
|
||||
Pure / deterministic — no real ``/proc``, no container, no socket, no timer. The
|
||||
collector is exercised here only for its never-raise / read-only contract
|
||||
(TC-04); its ``/proc`` parsing fixtures live in ``test_proc_collector.py``.
|
||||
|
||||
TC-01 is the REGRESS anchor: before ORCH-111 there was no ``proc_blocking``
|
||||
builder/dispatch at all, so a long-lived orphaned pytest raised no alert; this
|
||||
asserts the active signal is now produced (red→green).
|
||||
"""
|
||||
import ast as _ast
|
||||
import inspect as _inspect
|
||||
|
||||
from watchdog.collectors import proc as proc_mod
|
||||
from watchdog.config import Config
|
||||
from watchdog.core import Watchdog
|
||||
from watchdog.decision import (
|
||||
ACTION_ALERT,
|
||||
ACTION_NONE,
|
||||
ACTION_REALERT,
|
||||
ACTION_RECOVERY,
|
||||
)
|
||||
from watchdog.signals import proc_signals
|
||||
|
||||
|
||||
def _cfg(**kw) -> Config:
    """Build a Config with proc_blocking enabled and AGE_MIN=60.

    Keyword arguments are environment-variable names; they override the base
    values.
    """
    env = {"WATCHDOG_PROC_ENABLED": "true", "WATCHDOG_PROC_AGE_MIN": "60"}
    env.update(kw)
    return Config.from_env(env)
|
||||
|
||||
|
||||
def _candidate(pid=4242, age_s=7200.0, cmdline="python3 -m pytest tests/", cpu_s=1234.0):
    """Return a collector-style candidate record (a plain dict) for the builder tests."""
    record = dict(pid=pid, cmdline=cmdline, age_s=age_s, cpu_s=cpu_s)
    record["start_ticks"] = 1
    return record
|
||||
|
||||
|
||||
# -- TC-01: regress — active signal for a long-lived blocking process ---------
|
||||
def test_tc01_builder_emits_active_proc_blocking_signal():
    """Regression anchor: a candidate older than the threshold yields one active signal."""
    cfg = _cfg()  # proc_age_s == 3600
    emitted_signals = proc_signals(cfg, [_candidate(pid=4242, age_s=7200.0)])
    assert len(emitted_signals) == 1
    emitted = emitted_signals[0]
    assert emitted.key == ("proc_blocking", 4242)
    assert emitted.active is True  # 7200 > 3600
    # AC-2: the detail must be actionable — PID, age in seconds, a cmdline
    # fragment and the CPU time all have to appear in it.
    for fragment in ("4242", "7200", "pytest", "CPU"):
        assert fragment in emitted.detail
    assert emitted.cooldown_s == cfg.proc_cooldown_s
|
||||
|
||||
|
||||
# -- TC-02: anti-false-positive — below the threshold -> inactive -------------
|
||||
def test_tc02_below_threshold_is_inactive():
    """A candidate younger than the threshold produces a signal that is not active."""
    cfg = _cfg()  # proc_age_s == 3600
    in_budget = [_candidate(age_s=600.0)]  # within a 600s test budget
    produced = proc_signals(cfg, in_budget)
    assert len(produced) == 1
    # 600 < 3600 -> no alert (BR-4 / AC-4)
    assert produced[0].active is False
|
||||
|
||||
|
||||
def test_tc02_boundary_is_strict_greater_than():
    """The comparison is strict: age == threshold is inactive, threshold + 1 is active."""
    cfg = _cfg()
    threshold = cfg.proc_age_s
    exactly_at = proc_signals(cfg, [_candidate(age_s=threshold)])
    # Strict `>`: a process exactly at the threshold is still OK.
    assert exactly_at[0].active is False
    just_over = proc_signals(cfg, [_candidate(age_s=threshold + 1)])
    assert just_over[0].active is True
|
||||
|
||||
|
||||
# -- TC-03: config / kill-switch + default threshold > test-run budget --------
|
||||
def test_tc03_defaults_are_off_and_safe():
    """Defaults: feature disabled, pytest pattern, 1800s cooldown, threshold above 900s."""
    defaults = Config.from_env({})
    # default-OFF (opt-in, D5)
    assert defaults.proc_enabled is False
    assert defaults.proc_patterns == ["pytest"]
    assert defaults.proc_cooldown_s == 1800.0
    # Cross-invariant (D2): the default age threshold MUST exceed the largest
    # legitimate test-run budget,
    # max(merge_retest_timeout_s=600, coverage_run_timeout_s=900).
    assert defaults.proc_age_s > 900.0
|
||||
|
||||
|
||||
def test_tc03_env_overrides_and_malformed_degrade():
    """Valid env values override the defaults; malformed numerics degrade to them."""
    overrides = {
        "WATCHDOG_PROC_ENABLED": "true",
        "WATCHDOG_PROC_AGE_MIN": "30",
        "WATCHDOG_PROC_PATTERNS": "pytest,coverage run",
        "WATCHDOG_PROC_COOLDOWN_S": "600",
    }
    cfg = Config.from_env(overrides)
    assert cfg.proc_enabled is True
    assert cfg.proc_age_s == 30 * 60.0
    assert cfg.proc_patterns == ["pytest", "coverage run"]
    assert cfg.proc_cooldown_s == 600.0

    # Malformed numerics degrade to the defaults (never-raise config).
    malformed = {"WATCHDOG_PROC_AGE_MIN": "abc", "WATCHDOG_PROC_COOLDOWN_S": ""}
    bad = Config.from_env(malformed)
    assert bad.proc_age_min == 60.0
    assert bad.proc_cooldown_s == 1800.0
|
||||
|
||||
|
||||
def test_tc03_killswitch_off_makes_collector_inert():
    """With WATCHDOG_PROC_ENABLED=false the gated collector yields an empty list."""
    disabled = Config.from_env({"WATCHDOG_PROC_ENABLED": "false"})
    dog = Watchdog(
        disabled,
        notifier=_Notifier(),
        docker=_StubDocker(),
        now_provider=lambda: 0.0,
    )
    # The gate is expected to return [] without touching /proc (zero overhead).
    collected = dog._collect_proc(now=0.0)
    assert collected == []
|
||||
|
||||
|
||||
# -- TC-04: collector never-raise / read-only ---------------------------------
|
||||
def test_tc04_collector_degrades_to_empty_on_broken_source():
    """A missing /proc root yields [] (one skipped signal) instead of an exception."""
    collected = proc_mod.collect_candidates(
        ["pytest"],
        now=0.0,
        proc_root="/no/such/proc",
    )
    assert collected == []
|
||||
|
||||
|
||||
def test_tc04_collector_empty_when_btime_unreadable(tmp_path):
    """A /proc whose ``stat`` has no btime line yields [] — age cannot be computed."""
    stat_file = tmp_path / "stat"
    stat_file.write_text("cpu 1 2 3\nintr 0\n")
    collected = proc_mod.collect_candidates(["pytest"], now=0.0, proc_root=str(tmp_path))
    # No parseable btime -> no bogus signal.
    assert collected == []
|
||||
|
||||
|
||||
def _docstring_node_ids(tree) -> set:
    """Return ``id()`` of every Constant node acting as a module/func/class docstring.

    A docstring is a string Constant wrapped in an Expr that is the first
    statement of a Module, FunctionDef, AsyncFunctionDef or ClassDef body.
    """
    owners = (_ast.Module, _ast.FunctionDef, _ast.AsyncFunctionDef, _ast.ClassDef)
    found = set()
    for node in _ast.walk(tree):
        if not isinstance(node, owners):
            continue
        body = getattr(node, "body", [])
        if not body:
            continue
        first_stmt = body[0]
        if not isinstance(first_stmt, _ast.Expr):
            continue
        literal = first_stmt.value
        if isinstance(literal, _ast.Constant) and isinstance(literal.value, str):
            found.add(id(literal))
    return found
|
||||
|
||||
|
||||
def test_tc04_collector_source_is_read_only():
    """AC-3 / NFR-2: the collector's executable code is read-only.

    The AST of ``proc_mod`` is scanned (rather than its raw text) so that the
    docstrings documenting the ban do not trip the check. A violation is any
    import of ``subprocess``/``signal``, any attribute access whose name is in
    the mutating set, or any non-docstring string literal containing
    "environ".
    """
    tree = _ast.parse(_inspect.getsource(proc_mod))
    docstrings = _docstring_node_ids(tree)
    banned_modules = {"subprocess", "signal"}
    _MUTATING_ATTRS = {"kill", "system", "Popen", "popen", "run", "send_signal", "terminate"}
    violations: list[str] = []
    for node in _ast.walk(tree):
        if isinstance(node, _ast.Import):
            violations.extend(
                f"import {alias.name}"
                for alias in node.names
                if alias.name.split(".")[0] in banned_modules
            )
        elif isinstance(node, _ast.ImportFrom):
            top_level = (node.module or "").split(".")[0]
            if top_level in banned_modules:
                violations.append(f"from {node.module}")
        elif isinstance(node, _ast.Attribute) and node.attr in _MUTATING_ATTRS:
            violations.append(f".{node.attr}")
        elif isinstance(node, _ast.Constant) and isinstance(node.value, str):
            is_prose = id(node) in docstrings
            if not is_prose and "environ" in node.value:
                violations.append("reads /proc/<pid>/environ")
    assert not violations, f"read-only contract violated in proc.py: {violations}"
|
||||
|
||||
|
||||
def test_tc04_builder_skips_records_missing_fields():
    """Records lacking required fields are dropped; the valid one still produces a signal."""
    cfg = _cfg()
    records = [{"pid": None}, {"cmdline": "pytest"}, _candidate()]
    keys = [sig.key for sig in proc_signals(cfg, records)]
    # Only the valid record survives.
    assert keys == [("proc_blocking", 4242)]
|
||||
|
||||
|
||||
# -- TC-05: anti-spam / recovery through decide()/AlertState ------------------
|
||||
def test_tc05_alert_throttle_realert_then_recovery():
    """ALERT -> throttled NONE -> REALERT -> one RECOVERY, driven through ``tick()``.

    The clock and the collector output are both mutable so each tick can be
    staged explicitly; the cooldown is set to 1000s.
    """
    feed = {"candidates": [_candidate(pid=7, age_s=7200.0)]}
    cfg = _cfg(WATCHDOG_PROC_COOLDOWN_S="1000")
    clock = {"v": 0.0}
    notifier = _Notifier()
    dog = Watchdog(cfg, notifier=notifier, docker=_StubDocker(), now_provider=lambda: clock["v"])
    # Inject the collector: every tick sees a copy of whatever the feed holds.
    dog._collect_proc = lambda now: list(feed["candidates"])

    def proc_alerts():
        return [msg for msg in notifier.sent if "Блокирующий процесс" in msg]

    def recoveries():
        return [msg for msg in notifier.sent if "восстановление" in msg and "Блокирующий" in msg]

    def actions():
        # Runs one tick and keeps only the actions decided for proc_blocking signals.
        return [
            action
            for action, sig in dog.tick()
            if getattr(sig, "key", (None,))[0] == "proc_blocking"
        ]

    # tick 1: threshold crossed -> exactly one ALERT.
    assert ACTION_ALERT in actions()
    assert len(proc_alerts()) == 1

    # tick 2: still alive, within cooldown -> NONE (anti-spam, no new alert).
    clock["v"] = 100.0
    assert actions() == [ACTION_NONE]
    assert len(proc_alerts()) == 1

    # tick 3: cooldown elapsed -> REALERT.
    clock["v"] = 1100.0
    assert ACTION_REALERT in actions()
    assert len(proc_alerts()) == 2

    # tick 4: the process vanished -> exactly one RECOVERY (synthesised, D4).
    feed["candidates"] = []
    clock["v"] = 1200.0
    assert ACTION_RECOVERY in actions()
    assert len(recoveries()) == 1

    # tick 5: still gone -> no repeated recovery (state cleared).
    clock["v"] = 1300.0
    dog.tick()
    assert len(recoveries()) == 1
|
||||
|
||||
|
||||
# -- TC-06: no duplicate with agent_hung (cmdline partition) ------------------
|
||||
def test_tc06_claude_agent_cmdline_never_matches_pytest_pattern():
    """A claude agent cmdline does not match the pytest pattern; a pytest cmdline does.

    The agent process is the domain of agent_hung, so the pattern scope keeps
    proc_blocking from firing for it (NFR-4 / AC-5, by construction).
    """
    patterns = ["pytest"]
    agent_cmdline = "claude --model claude-opus-4-8 -p ..."
    pytest_cmdline = "python3 -m pytest tests/"
    assert proc_mod.matches_patterns(agent_cmdline, patterns) is False
    assert proc_mod.matches_patterns(pytest_cmdline, patterns) is True
|
||||
|
||||
|
||||
def test_tc06_collector_excludes_non_matching_processes(tmp_path):
    """On a fake /proc holding a claude and a pytest process, only pytest is collected."""
    fake_processes = {
        "100": ("claude --model claude-opus-4-8", _stat_line(start_ticks=0)),
        "200": ("python3 -m pytest tests/test_x.py", _stat_line(start_ticks=0)),
    }
    _write_fake_proc(tmp_path, btime=1_000_000, procs=fake_processes)
    recs = proc_mod.collect_candidates(
        ["pytest"],
        now=1_010_000.0,
        proc_root=str(tmp_path),
        clk_tck=100,
    )
    collected_pids = [rec["pid"] for rec in recs]
    assert collected_pids == [200]  # only the pytest process
|
||||
|
||||
|
||||
# -- shared fakes -------------------------------------------------------------
|
||||
class _Notifier:
    """Fake notifier: remembers every message passed to ``send`` in ``sent``."""

    def __init__(self):
        # Messages in the order they were sent.
        self.sent = list()

    def send(self, text):
        """Record ``text`` and report the delivery as successful."""
        self.sent.append(text)
        return True
|
||||
|
||||
|
||||
class _StubDocker:
    """Docker stub whose ``inspect`` reports a running container for any name."""

    def inspect(self, name):
        """Return a minimal inspect payload with ``State.Status == "running"``."""
        state = {"Status": "running"}
        return {"State": state}
|
||||
|
||||
|
||||
def _stat_line(start_ticks: int, utime: int = 0, stime: int = 0) -> str:
    """Render a fake 52-field ``/proc/<pid>/stat`` line.

    Layout: pid (comm) state ppid ... utime(14) stime(15) ... starttime(22) ...
    Every field not set below is the string "0".
    """
    # Keys are 0-based indexes, i.e. the 1-based stat field number minus one.
    overrides = {
        0: "999",
        1: "(python3)",
        2: "S",
        13: str(utime),        # field 14
        14: str(stime),        # field 15
        21: str(start_ticks),  # field 22
    }
    return " ".join(overrides.get(index, "0") for index in range(52))
|
||||
|
||||
|
||||
def _write_fake_proc(root, *, btime: int, procs: dict):
    """Populate ``root`` as a fake /proc tree.

    Writes a top-level ``stat`` carrying the ``btime`` line, then one directory
    per key of ``procs`` holding a NUL-separated ``cmdline`` and a ``stat``
    file. ``procs`` maps a pid string to a ``(cmdline, stat_line)`` pair.
    """
    (root / "stat").write_text(f"cpu 1 2 3\nbtime {btime}\nintr 0\n")
    for pid in procs:
        cmdline, stat_line = procs[pid]
        pid_dir = root / pid
        pid_dir.mkdir()
        # The kernel format separates argv entries with NUL and ends with one.
        nul_joined = "\x00".join(cmdline.split(" "))
        (pid_dir / "cmdline").write_bytes(nul_joined.encode() + b"\x00")
        (pid_dir / "stat").write_text(stat_line)
|
||||
148
tests/watchdog/test_proc_collector.py
Normal file
148
tests/watchdog/test_proc_collector.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""ORCH-111: the /proc collector — pure parsing + a fake /proc tree (never-raise).
|
||||
|
||||
Mirrors ``test_host_collector.py``: the pure parsers are unit-tested on text
|
||||
fixtures and ``collect_candidates`` is driven against a temporary ``/proc`` tree,
|
||||
so no real host / Linux kernel is required.
|
||||
"""
|
||||
from watchdog.collectors import proc as proc_mod
|
||||
|
||||
|
||||
# -- parse_btime --------------------------------------------------------------
|
||||
def test_parse_btime_reads_the_btime_line():
    """``parse_btime`` returns the integer from the ``btime`` line of /proc/stat text."""
    stat_text = "\n".join(["cpu 1 2 3 4", "btime 1700000000", "processes 99"]) + "\n"
    parsed = proc_mod.parse_btime(stat_text)
    assert parsed == 1700000000
|
||||
|
||||
|
||||
def test_parse_btime_absent_is_none():
    """Text without a ``btime`` line parses to None."""
    without_btime = "cpu 1 2 3\nintr 0\n"
    parsed = proc_mod.parse_btime(without_btime)
    assert parsed is None
|
||||
|
||||
|
||||
def test_parse_btime_garbage_is_none():
    """A non-numeric btime value and the empty string both parse to None."""
    for garbage in ("btime not-a-number\n", ""):
        assert proc_mod.parse_btime(garbage) is None
|
||||
|
||||
|
||||
# -- parse_pid_stat (comm may contain spaces/parens) --------------------------
|
||||
def test_parse_pid_stat_simple():
    """A well-formed stat line yields utime (field 14), stime (15) and starttime (22)."""
    # Keys are 0-based indexes into the 52 space-separated fields.
    overrides = {0: "1234", 1: "(python3)", 2: "R", 13: "50", 14: "25", 21: "9000"}
    stat_text = " ".join(overrides.get(index, "0") for index in range(52))
    parsed = proc_mod.parse_pid_stat(stat_text)
    assert parsed == {"utime": 50, "stime": 25, "starttime": 9000}
|
||||
|
||||
|
||||
def test_parse_pid_stat_comm_with_spaces_and_parens():
    """A pathological comm such as "(py (test) x)" must not shift the field indexes.

    The parser is expected to split after the LAST ')'.
    """
    overrides = {13: "7", 14: "3", 21: "4242"}
    fields = [overrides.get(index, "0") for index in range(52)]
    # Replace the pid and comm fields with a comm containing spaces and parens.
    line = "1234 (py (test) x) " + " ".join(fields[2:])
    parsed = proc_mod.parse_pid_stat(line)
    assert parsed == {"utime": 7, "stime": 3, "starttime": 4242}
|
||||
|
||||
|
||||
def test_parse_pid_stat_truncated_is_none():
    """Truncated, paren-less and empty stat text all parse to None."""
    unparseable = (
        "1234 (python3) R 1 2 3",
        "no parens here",
        "",
    )
    for text in unparseable:
        assert proc_mod.parse_pid_stat(text) is None
|
||||
|
||||
|
||||
# -- decode_cmdline -----------------------------------------------------------
|
||||
def test_decode_cmdline_nul_separated():
    """NUL-separated argv bytes decode to a space-joined command line."""
    raw = b"python3\x00-m\x00pytest\x00tests/test_x.py\x00"
    decoded = proc_mod.decode_cmdline(raw)
    assert decoded == "python3 -m pytest tests/test_x.py"
|
||||
|
||||
|
||||
def test_decode_cmdline_empty_for_kernel_thread():
    """Empty bytes and None both decode to the empty string."""
    for raw in (b"", None):
        assert proc_mod.decode_cmdline(raw) == ""
|
||||
|
||||
|
||||
# -- matches_patterns ---------------------------------------------------------
|
||||
def test_matches_patterns_substring_any():
    """A cmdline matches when any pattern is a substring; empty inputs never match."""
    cases = [
        ("python3 -m pytest x", ["pytest"], True),
        ("python3 -m coverage run", ["pytest", "coverage run"], True),
        ("bash -c sleep", ["pytest"], False),
        ("", ["pytest"], False),
        ("pytest", [], False),
    ]
    for cmdline, patterns, expected in cases:
        assert proc_mod.matches_patterns(cmdline, patterns) is expected
|
||||
|
||||
|
||||
# -- collect_candidates (fake /proc tree) -------------------------------------
|
||||
def _stat_line(start_ticks, utime=0, stime=0):
    """Render a fake 52-field /proc/<pid>/stat line; unset fields are "0"."""
    # 0-based indexes: 13 -> utime, 14 -> stime, 21 -> starttime.
    overrides = {
        0: "999",
        1: "(python3)",
        2: "S",
        13: str(utime),
        14: str(stime),
        21: str(start_ticks),
    }
    return " ".join(overrides.get(index, "0") for index in range(52))
|
||||
|
||||
|
||||
def _write_proc(root, btime, procs):
    """Build a fake /proc under ``root``.

    Writes a top-level ``stat`` with the ``btime`` line, then one directory
    per pid key of ``procs`` containing a NUL-separated ``cmdline`` and a
    ``stat`` file. ``procs`` maps a pid string to ``(cmdline, stat)``.
    """
    (root / "stat").write_text(f"cpu 1 2 3\nbtime {btime}\n")
    for pid in procs:
        cmdline, stat = procs[pid]
        pid_dir = root / pid
        pid_dir.mkdir()
        # argv entries are NUL-separated and NUL-terminated.
        nul_joined = "\x00".join(cmdline.split(" "))
        (pid_dir / "cmdline").write_bytes(nul_joined.encode() + b"\x00")
        (pid_dir / "stat").write_text(stat)
|
||||
|
||||
|
||||
def test_collect_candidates_computes_age_and_cpu(tmp_path):
    """Age and CPU seconds are derived from btime, starttime and clock ticks.

    btime=1_000_000 and starttime=200_000 ticks @ 100 Hz give a start epoch of
    1_002_000; with now=1_010_000 the age is 8000s. utime+stime = 300 ticks
    @ 100 Hz is 3s of CPU.
    """
    stat = _stat_line(200_000, utime=200, stime=100)
    _write_proc(tmp_path, btime=1_000_000, procs={"200": ("python3 -m pytest tests/", stat)})
    recs = proc_mod.collect_candidates(
        ["pytest"],
        now=1_010_000.0,
        proc_root=str(tmp_path),
        clk_tck=100,
    )
    assert len(recs) == 1
    record = recs[0]
    assert record["pid"] == 200
    assert record["cmdline"] == "python3 -m pytest tests/"
    assert abs(record["age_s"] - 8000.0) < 1e-6
    assert abs(record["cpu_s"] - 3.0) < 1e-6
|
||||
|
||||
|
||||
def test_collect_candidates_filters_by_pattern(tmp_path):
    """Only the process whose cmdline matches the pattern is returned."""
    fake_processes = {
        "100": ("claude --model x", _stat_line(0)),
        "200": ("python3 -m pytest a", _stat_line(0)),
        "300": ("/usr/bin/dockerd", _stat_line(0)),
    }
    _write_proc(tmp_path, btime=1_000_000, procs=fake_processes)
    recs = proc_mod.collect_candidates(
        ["pytest"],
        now=1_010_000.0,
        proc_root=str(tmp_path),
        clk_tck=100,
    )
    collected_pids = [rec["pid"] for rec in recs]
    assert collected_pids == [200]
|
||||
|
||||
|
||||
def test_collect_candidates_skips_unreadable_pid(tmp_path):
    """A matching pid with an unparseable stat is skipped; the others are still returned.

    This models the race where a process vanishes mid-scan.
    """
    fake_processes = {
        "200": ("python3 -m pytest a", "garbage no parens"),
        "201": ("python3 -m pytest b", _stat_line(0)),
    }
    _write_proc(tmp_path, btime=1_000_000, procs=fake_processes)
    recs = proc_mod.collect_candidates(
        ["pytest"],
        now=1_010_000.0,
        proc_root=str(tmp_path),
        clk_tck=100,
    )
    collected_pids = [rec["pid"] for rec in recs]
    assert collected_pids == [201]
|
||||
|
||||
|
||||
def test_collect_candidates_ignores_non_numeric_entries(tmp_path):
    """Non-numeric /proc entries (a ``self`` dir, a ``meminfo`` file) are ignored."""
    _write_proc(tmp_path, btime=1_000_000, procs={"200": ("pytest", _stat_line(0))})
    # Noise that is not a pid directory.
    (tmp_path / "self").mkdir()
    (tmp_path / "meminfo").write_text("noise")
    recs = proc_mod.collect_candidates(
        ["pytest"],
        now=1_000_000.0,
        proc_root=str(tmp_path),
        clk_tck=100,
    )
    collected_pids = [rec["pid"] for rec in recs]
    assert collected_pids == [200]
|
||||
128
tests/watchdog/test_tick_proc_blocking_integration.py
Normal file
128
tests/watchdog/test_tick_proc_blocking_integration.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""ORCH-111 TC-07: full tick -> dispatch of the proc_blocking alert (integration).
|
||||
|
||||
REGRESS: ``Watchdog.tick()`` with a collector that returns a long-lived blocking
|
||||
process must dispatch exactly one ``proc_blocking`` alert through the fake
|
||||
Notifier — even though ``/metrics`` reports no ``stuck`` stage and no hung agent.
|
||||
With the kill-switch OFF the path is inert (byte-for-byte as before ORCH-111).
|
||||
|
||||
The orchestrator ``/metrics`` envelope is stubbed healthy so ONLY the new signal
|
||||
can fire; the proc collector is stubbed at the module boundary so the real
|
||||
``_collect_proc`` gate + wrapper still execute.
|
||||
"""
|
||||
from watchdog.collectors import orch as orch_mod
|
||||
from watchdog.collectors import proc as proc_mod
|
||||
from watchdog.config import Config
|
||||
from watchdog.core import Watchdog
|
||||
|
||||
|
||||
class _Notifier:
    """Fake notifier that keeps every sent message in ``sent`` for later inspection."""

    def __init__(self):
        # Messages in dispatch order.
        self.sent = list()

    def send(self, text):
        """Store ``text`` and report success."""
        self.sent.append(text)
        return True
|
||||
|
||||
|
||||
class _StubDocker:
    """Docker stub: ``inspect`` always answers with a running container state."""

    def inspect(self, name):
        """Return ``{"State": {"Status": "running"}}`` regardless of ``name``."""
        state = {"Status": "running"}
        return {"State": state}
|
||||
|
||||
|
||||
def _healthy_metrics(monkeypatch):
    """Patch ``orch_mod.fetch_metrics`` to return a successful envelope.

    The envelope has no agents, no stages and an empty queue.
    """
    envelope = {
        "schema_version": 1,
        "generated_at": "2026-06-15T00:00:00Z",
        "clk_tck": 100,
        "agents": [],
        "stages": [],
        "queue": {"depth": 0, "counts": {"failed": 0}},
    }

    def _fake_fetch(*args, **kwargs):
        # Accept any call shape; always answer with the same healthy envelope.
        return orch_mod.FetchResult(ok=True, envelope=envelope)

    monkeypatch.setattr(orch_mod, "fetch_metrics", _fake_fetch)
|
||||
|
||||
|
||||
def _cfg(**kw):
    """Build the integration Config: proc_blocking enabled, AGE_MIN=60.

    Keyword arguments are environment-variable names and override the base
    values.
    """
    env = {
        "WATCHDOG_TG_BOT_TOKEN": "t",
        "WATCHDOG_TG_CHAT_ID": "c",
        "WATCHDOG_PROC_ENABLED": "true",
        "WATCHDOG_PROC_AGE_MIN": "60",  # proc_age_s == 3600
        "WATCHDOG_CONTAINERS": "orchestrator",
    }
    env.update(kw)
    return Config.from_env(env)
|
||||
|
||||
|
||||
def _blocking(monkeypatch, age_s=7200.0):
    """Patch ``proc_mod.collect_candidates`` to report one pytest process of age ``age_s``."""
    record = {
        "pid": 4242,
        "cmdline": "python3 -m pytest tests/test_install_lite_script.py",
        "age_s": age_s,
        "cpu_s": 99999.0,
        "start_ticks": 1,
    }

    def _fake_collect(*args, **kwargs):
        # Same single record on every call, whatever the arguments.
        return [record]

    monkeypatch.setattr(proc_mod, "collect_candidates", _fake_collect)
|
||||
|
||||
|
||||
def _proc_alerts(notifier):
    """Return the sent messages that contain the proc_blocking alert marker text."""
    marker = "Блокирующий процесс"
    return list(filter(lambda message: marker in message, notifier.sent))
|
||||
|
||||
|
||||
def test_tc07_tick_dispatches_proc_blocking_alert(monkeypatch):
    """One tick with a long-lived blocking process dispatches exactly one alert."""
    _healthy_metrics(monkeypatch)
    _blocking(monkeypatch)
    notifier = _Notifier()
    dog = Watchdog(
        _cfg(),
        notifier=notifier,
        docker=_StubDocker(),
        now_provider=lambda: 0.0,
    )

    dog.tick()

    alerts = _proc_alerts(notifier)
    assert len(alerts) == 1
    message = alerts[0]
    assert "4242" in message
    assert "pytest" in message
    assert message.startswith("\U0001f534")  # red ALERT prefix
|
||||
|
||||
|
||||
def test_tc07_killswitch_off_dispatches_nothing(monkeypatch):
    """With the feature disabled, no alert is sent and the collector is never called."""
    _healthy_metrics(monkeypatch)
    # Even though this collector WOULD report a blocking process, the gate
    # must skip it entirely.
    invocations = []

    def _would_alert(*args, **kwargs):
        invocations.append(1)
        return [{"pid": 1, "cmdline": "pytest", "age_s": 9e9, "cpu_s": 0.0}]

    monkeypatch.setattr(proc_mod, "collect_candidates", _would_alert)
    notifier = _Notifier()
    disabled_cfg = _cfg(WATCHDOG_PROC_ENABLED="false")
    dog = Watchdog(
        disabled_cfg,
        notifier=notifier,
        docker=_StubDocker(),
        now_provider=lambda: 0.0,
    )

    dog.tick()

    assert _proc_alerts(notifier) == []
    # The collector is never invoked when disabled (zero overhead).
    assert len(invocations) == 0
|
||||
|
||||
|
||||
def test_tc07_in_budget_process_does_not_alert(monkeypatch):
    """A process below the age threshold (legitimate in-flight run) raises no alert (AC-4)."""
    _healthy_metrics(monkeypatch)
    _blocking(monkeypatch, age_s=600.0)
    notifier = _Notifier()
    dog = Watchdog(
        _cfg(),
        notifier=notifier,
        docker=_StubDocker(),
        now_provider=lambda: 0.0,
    )

    dog.tick()

    sent_alerts = _proc_alerts(notifier)
    assert sent_alerts == []
|
||||
|
||||
|
||||
def test_tc07_tick_never_raises_when_collector_explodes(monkeypatch):
    """A collector that raises must not break ``tick()``; no alert is sent."""
    _healthy_metrics(monkeypatch)

    def _raising_collector(*args, **kwargs):
        raise RuntimeError("boom")

    monkeypatch.setattr(proc_mod, "collect_candidates", _raising_collector)
    notifier = _Notifier()
    dog = Watchdog(
        _cfg(),
        notifier=notifier,
        docker=_StubDocker(),
        now_provider=lambda: 0.0,
    )

    # Must not raise: a collector error degrades to one skipped signal.
    dog.tick()

    assert _proc_alerts(notifier) == []
|
||||
Reference in New Issue
Block a user