Files
orchestrator/tests/watchdog/test_proc_collector.py
claude-bot 2e73ccf090 feat(watchdog): proc_blocking alert for orphaned long-lived test processes
Close the observability gap between agent_hung (which covers only jobs tracked
via jobs.pid) and orphaned pytest subprocesses that the orchestrator launches
itself (merge_gate.retest_branch / coverage_gate.measure_coverage). On a
timeout-kill of the agent (-9, ORCH-109) the grandchild pytest is reparented onto
tini and keeps running for days, starving the CPU and failing the merge-gate
re-test — with no alert.

Strictly inside the observer (watchdog/** + the watchdog compose service):
- watchdog/collectors/proc.py: stdlib-only /proc scan (under pid: host),
  read-only, never-raise -> []; pure parsers split from I/O (tested on a fake
  /proc tree). Never reads /proc/<pid>/environ.
- watchdog/signals.py: pure proc_signals builder, per-entity
  ("proc_blocking", pid), active iff age_s > proc_age_s; actionable RU detail.
- watchdog/core.py: opt-in tick block (gated on proc_enabled -> zero overhead /
  byte-for-byte when off) + RECOVERY synthesis for a vanished process through the
  existing decide()/AlertState (no new anti-spam logic).
- watchdog/config.py: WATCHDOG_PROC_{ENABLED(false),AGE_MIN(60),PATTERNS(pytest),
  COOLDOWN_S(1800)}; default threshold > max(merge_retest_timeout_s=600,
  coverage_run_timeout_s=900) so a legit in-flight run never crosses it.
- docker-compose.yml: pid: host on orchestrator-watchdog ONLY (read-only privilege).

Anti-false-positive and no overlap with agent_hung are by construction (cmdline
scope + age threshold), not fragile cross-namespace PID matching.

Canon synced: WATCHDOG_PROC_* in .env.watchdog.example <-> .env.example block;
documented in LITE_SETUP.md and docs/architecture/README.md (architect). src/**,
/metrics, schema_version, STAGE_TRANSITIONS, QG_CHECKS, check_*, machine-verdict
and the DB schema are untouched; deploy rebuilds only the sidecar, prod
orchestrator is not restarted (NFR-3).

Tests: tests/watchdog/test_proc_blocking_signal.py (TC-01..TC-06),
test_proc_collector.py (/proc parsing), test_tick_proc_blocking_integration.py
(TC-07), plus pid: host and proc-config assertions. Full pytest tests/ green (1930).

Refs: ORCH-111
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 02:14:17 +03:00

149 lines
5.5 KiB
Python

"""ORCH-111: the /proc collector — pure parsing + a fake /proc tree (never-raise).
Mirrors ``test_host_collector.py``: the pure parsers are unit-tested on text
fixtures and ``collect_candidates`` is driven against a temporary ``/proc`` tree,
so no real host / Linux kernel is required.
"""
from watchdog.collectors import proc as proc_mod
# -- parse_btime --------------------------------------------------------------
def test_parse_btime_reads_the_btime_line():
    """``parse_btime`` returns the integer carried by the ``btime`` line."""
    # The btime line sits between unrelated lines of the stat dump.
    stat_dump = "cpu 1 2 3 4\nbtime 1700000000\nprocesses 99\n"
    boot_epoch = proc_mod.parse_btime(stat_dump)
    assert boot_epoch == 1700000000
def test_parse_btime_absent_is_none():
    """A stat dump with no ``btime`` line parses to ``None``."""
    without_btime = "cpu 1 2 3\nintr 0\n"
    result = proc_mod.parse_btime(without_btime)
    assert result is None
def test_parse_btime_garbage_is_none():
    """A non-numeric ``btime`` value and an empty text both parse to ``None``."""
    for malformed in ("btime not-a-number\n", ""):
        assert proc_mod.parse_btime(malformed) is None
# -- parse_pid_stat (comm may contain spaces/parens) --------------------------
def test_parse_pid_stat_simple():
    """A plain 52-field stat line yields utime, stime and starttime as ints."""
    # 1-based stat fields 14 (utime), 15 (stime) and 22 (starttime) live at
    # 0-based indices 13, 14 and 21 of the space-split line.
    columns = ["1234", "(python3)", "R"] + ["0"] * 49
    columns[13] = "50"
    columns[14] = "25"
    columns[21] = "9000"
    parsed = proc_mod.parse_pid_stat(" ".join(columns))
    assert parsed == {"utime": 50, "stime": 25, "starttime": 9000}
def test_parse_pid_stat_comm_with_spaces_and_parens():
    """A comm with spaces and nested parentheses must not shift field indexing."""
    # The comm "(py (test) x)" is pathological: the expected values are only
    # reachable when the line is split after the LAST ')'.
    columns = ["0"] * 52
    columns[13], columns[14], columns[21] = "7", "3", "4242"
    after_comm = " ".join(columns[2:])
    stat_line = "1234 (py (test) x) " + after_comm
    parsed = proc_mod.parse_pid_stat(stat_line)
    assert parsed == {"utime": 7, "stime": 3, "starttime": 4242}
def test_parse_pid_stat_truncated_is_none():
    """Too-short, parenthesis-free and empty stat lines all parse to ``None``."""
    malformed_lines = (
        "1234 (python3) R 1 2 3",  # too few fields after the comm
        "no parens here",  # no comm delimiters at all
        "",
    )
    for stat_line in malformed_lines:
        assert proc_mod.parse_pid_stat(stat_line) is None
# -- decode_cmdline -----------------------------------------------------------
def test_decode_cmdline_nul_separated():
    """NUL-separated argv bytes decode to a single space-joined command line."""
    argv_bytes = b"python3\x00-m\x00pytest\x00tests/test_x.py\x00"
    decoded = proc_mod.decode_cmdline(argv_bytes)
    assert decoded == "python3 -m pytest tests/test_x.py"
def test_decode_cmdline_empty_for_kernel_thread():
    """Empty bytes and ``None`` both decode to the empty string."""
    for blank in (b"", None):
        assert proc_mod.decode_cmdline(blank) == ""
# -- matches_patterns ---------------------------------------------------------
def test_matches_patterns_substring_any():
    """``matches_patterns`` is True iff any pattern is a substring of the cmdline."""
    # (cmdline, patterns, expected verdict)
    cases = (
        ("python3 -m pytest x", ["pytest"], True),
        ("python3 -m coverage run", ["pytest", "coverage run"], True),
        ("bash -c sleep", ["pytest"], False),
        ("", ["pytest"], False),  # empty cmdline never matches
        ("pytest", [], False),  # no patterns -> nothing can match
    )
    for cmdline, patterns, expected in cases:
        assert proc_mod.matches_patterns(cmdline, patterns) is expected
# -- collect_candidates (fake /proc tree) -------------------------------------
def _stat_line(start_ticks, utime=0, stime=0):
    """Build a synthetic 52-field ``/proc/<pid>/stat`` line for the fake tree.

    The line always carries pid ``999``, comm ``(python3)`` and state ``S``;
    every other field is ``0`` except utime (index 13), stime (index 14) and
    starttime (index 21), which take the stringified arguments.
    """
    columns = ["999", "(python3)", "S"] + ["0"] * 49
    overrides = {13: utime, 14: stime, 21: start_ticks}
    for index, value in overrides.items():
        columns[index] = str(value)
    return " ".join(columns)
def _write_proc(root, btime, procs):
    """Populate *root* as a minimal fake ``/proc`` tree.

    Writes ``root/stat`` containing a ``cpu`` line and a ``btime`` line, then
    one directory per entry of *procs*.

    root  -- path object supporting ``/``, ``write_text`` and ``mkdir``
    btime -- value interpolated into the ``btime`` line
    procs -- mapping of pid name -> ``(cmdline, stat)``; *cmdline* is stored
             with spaces turned into NULs plus a trailing NUL, *stat* is
             written verbatim
    """
    stat_file = root / "stat"
    stat_file.write_text(f"cpu 1 2 3\nbtime {btime}\n")
    for pid in procs:
        cmdline, stat = procs[pid]
        pid_dir = root / pid
        pid_dir.mkdir()
        # Same byte layout as a real cmdline file: NUL-separated, NUL-terminated.
        nul_joined = "\x00".join(cmdline.split(" ")) + "\x00"
        (pid_dir / "cmdline").write_bytes(nul_joined.encode())
        (pid_dir / "stat").write_text(stat)
def test_collect_candidates_computes_age_and_cpu(tmp_path):
    """Age and CPU seconds are derived from btime, starttime, ticks and clk_tck.

    Fixture arithmetic: btime=1_000_000 plus starttime=200_000 ticks at 100 Hz
    gives a start epoch of 1_002_000, so with now=1_010_000 the age is 8000 s.
    utime+stime = 300 ticks at 100 Hz is 3 s of CPU.
    """
    stat = _stat_line(200_000, utime=200, stime=100)
    fake_procs = {"200": ("python3 -m pytest tests/", stat)}
    _write_proc(tmp_path, btime=1_000_000, procs=fake_procs)
    found = proc_mod.collect_candidates(
        ["pytest"], now=1_010_000.0, proc_root=str(tmp_path), clk_tck=100
    )
    assert len(found) == 1
    record = found[0]
    assert record["pid"] == 200
    assert record["cmdline"] == "python3 -m pytest tests/"
    # Float results: compare against the expected value with a tiny tolerance.
    assert abs(record["age_s"] - 8000.0) < 1e-6
    assert abs(record["cpu_s"] - 3.0) < 1e-6
def test_collect_candidates_filters_by_pattern(tmp_path):
    """Only the process whose cmdline contains a configured pattern is returned."""
    idle_stat = _stat_line(0)
    fake_procs = {
        "100": ("claude --model x", idle_stat),
        "200": ("python3 -m pytest a", idle_stat),  # the only pattern match
        "300": ("/usr/bin/dockerd", idle_stat),
    }
    _write_proc(tmp_path, btime=1_000_000, procs=fake_procs)
    found = proc_mod.collect_candidates(
        ["pytest"], now=1_010_000.0, proc_root=str(tmp_path), clk_tck=100
    )
    pids = [record["pid"] for record in found]
    assert pids == [200]
def test_collect_candidates_skips_unreadable_pid(tmp_path):
    """A matching pid with an unparseable stat is skipped; the rest survive."""
    # Pid 200 stands in for a process that vanished mid-scan (a race): its
    # cmdline matches but its stat cannot be parsed. Pid 201 is well-formed
    # and must still be reported.
    fake_procs = {
        "200": ("python3 -m pytest a", "garbage no parens"),
        "201": ("python3 -m pytest b", _stat_line(0)),
    }
    _write_proc(tmp_path, btime=1_000_000, procs=fake_procs)
    found = proc_mod.collect_candidates(
        ["pytest"], now=1_010_000.0, proc_root=str(tmp_path), clk_tck=100
    )
    pids = [record["pid"] for record in found]
    assert pids == [201]
def test_collect_candidates_ignores_non_numeric_entries(tmp_path):
    """Entries of the proc root with non-numeric names are not reported."""
    fake_procs = {"200": ("pytest", _stat_line(0))}
    _write_proc(tmp_path, btime=1_000_000, procs=fake_procs)
    # Non-numeric siblings of the pid directory: one directory, one plain file.
    (tmp_path / "self").mkdir()
    (tmp_path / "meminfo").write_text("noise")
    found = proc_mod.collect_candidates(
        ["pytest"], now=1_000_000.0, proc_root=str(tmp_path), clk_tck=100
    )
    pids = [record["pid"] for record in found]
    assert pids == [200]