feat(watchdog): proc_blocking alert for orphaned long-lived test processes
Close the observability gap between agent_hung (which tracks only jobs by jobs.pid)
and orphaned pytest subprocesses the orchestrator launches itself
(merge_gate.retest_branch / coverage_gate.measure_coverage). On a timeout-kill of
the agent (-9, ORCH-109) the grand-child pytest reparents onto tini and keeps
running for days, starving CPU and failing merge-gate re-test — with no alert.
Strictly inside the observer (watchdog/** + the watchdog compose service):
- watchdog/collectors/proc.py: stdlib-only /proc scan (under pid: host),
read-only, never-raise -> []; pure parsers split from I/O (tested on a fake
/proc tree). Never reads /proc/<pid>/environ.
- watchdog/signals.py: pure proc_signals builder, per-entity
("proc_blocking", pid), active iff age_s > proc_age_s; actionable RU detail.
- watchdog/core.py: opt-in tick block (gated on proc_enabled -> zero overhead /
byte-for-byte when off) + RECOVERY synthesis for a vanished process through the
existing decide()/AlertState (no new anti-spam logic).
- watchdog/config.py: WATCHDOG_PROC_{ENABLED(false),AGE_MIN(60),PATTERNS(pytest),
COOLDOWN_S(1800)}; default threshold > max(merge_retest_timeout_s=600,
coverage_run_timeout_s=900) so a legit in-flight run never crosses it.
- docker-compose.yml: pid: host on orchestrator-watchdog ONLY (read-only privilege).
Anti-false-positive and no overlap with agent_hung are by construction (cmdline
scope + age threshold), not fragile cross-namespace PID matching.
Canon synced: WATCHDOG_PROC_* in .env.watchdog.example <-> .env.example block;
documented in LITE_SETUP.md and docs/architecture/README.md (architect). src/**,
/metrics, schema_version, STAGE_TRANSITIONS, QG_CHECKS, check_*, machine-verdict
and the DB schema are untouched; deploy rebuilds only the sidecar, prod
orchestrator is not restarted (NFR-3).
Tests: tests/watchdog/test_proc_blocking_signal.py (TC-01..TC-06),
test_proc_collector.py (/proc parsing), test_tick_proc_blocking_integration.py
(TC-07), plus pid: host and proc-config assertions. Full pytest tests/ green (1930).
Refs: ORCH-111
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
205
watchdog/collectors/proc.py
Normal file
205
watchdog/collectors/proc.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""Collector: long-lived host processes whose cmdline matches a test-class (ORCH-111).
|
||||
|
||||
stdlib-only ``/proc`` scan (ADR-001 D3). Under ``pid: host`` (D6) the container's
|
||||
``/proc`` reflects the host PID-namespace, so the sidecar sees the orphaned
|
||||
``pytest`` subprocess regardless of which container spawned it (the merge-gate /
|
||||
coverage-gate re-test the orchestrator launches itself; on a timeout-kill of the
|
||||
agent — ``exit_code=-9``, ORCH-109 — the grand-child ``pytest`` reparents onto
|
||||
tini and keeps running for days).
|
||||
|
||||
Strictly **READ-ONLY** (BR-3 / NFR-2): opens only ``/proc/stat``,
|
||||
``/proc/<pid>/stat`` and ``/proc/<pid>/cmdline`` for reading. There is **no**
|
||||
``os.kill``, signal-send, ``subprocess`` or any mutation on this path, and it
|
||||
**never** reads ``/proc/<pid>/environ`` (secrets, ADR-001 D3 / R-2).
|
||||
|
||||
**never-raise** (NFR-1): a per-pid race — the process died between ``listdir``
|
||||
and ``read`` — skips that pid without breaking the list; any top-level failure
|
||||
(non-Linux / missing ``/proc`` / unreadable ``/proc/stat``) degrades the whole
|
||||
scan to ``[]`` (one signal skipped, the tick lives, D8).
|
||||
|
||||
Pure parsing (``parse_btime`` / ``parse_pid_stat`` / ``decode_cmdline`` /
|
||||
``matches_patterns``) is split from the I/O orchestration (``collect_candidates``)
|
||||
so the scan is testable against a fake ``/proc`` tree, no real host needed.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
logger = logging.getLogger("watchdog.collectors.proc")
|
||||
|
||||
# Indices into /proc/<pid>/stat, 0-based and counted AFTER the closing ')' of `comm`.
|
||||
# Line layout: `pid (comm) state ppid ... utime stime ... starttime ...`.
|
||||
# proc(5) numbers fields from 1; field 3 (state) is the first token after the
|
||||
# last ')'. Hence field N (>=3) sits at index N-3 of the post-')' split:
|
||||
# utime = field 14 -> index 11
|
||||
# stime = field 15 -> index 12
|
||||
# starttime = field 22 -> index 19
|
||||
_STAT_UTIME_IDX = 11
|
||||
_STAT_STIME_IDX = 12
|
||||
_STAT_STARTTIME_IDX = 19
|
||||
_STAT_MIN_FIELDS = _STAT_STARTTIME_IDX + 1 # fewest post-')' tokens for which the starttime index exists
|
||||
|
||||
_DEFAULT_CLK_TCK = 100 # ticks per second assumed by _clk_tck() when os.sysconf gives no usable value
|
||||
|
||||
|
||||
def parse_btime(stat_text: str) -> int | None:
    """Extract the boot timestamp (epoch seconds) from ``/proc/stat`` text.

    Scans for the first line that starts with ``btime `` and carries a value
    token, and returns that value as an ``int``.

    Returns ``None`` when no such line exists or the value cannot be parsed.
    Never raises: a parse failure is logged as a warning, which lets the caller
    drop the whole scan instead of computing ages from a bogus boot time.
    """
    try:
        for row in stat_text.splitlines():
            if not row.startswith("btime "):
                continue
            tokens = row.split()
            if len(tokens) < 2:
                # "btime" with no value: keep looking at later lines.
                continue
            return int(tokens[1])
    except Exception as e:  # noqa: BLE001 - tolerant: no btime -> no scan
        logger.warning("watchdog: cannot parse /proc/stat btime: %s", e)
    return None
|
||||
|
||||
|
||||
def parse_pid_stat(stat_text: str) -> dict | None:
    """Pull ``utime``, ``stime`` and ``starttime`` (clock ticks) out of a stat line.

    ``comm`` (field 2) is parenthesised and may itself contain spaces or
    parentheses, so everything up to and including the LAST ``')'`` is
    discarded and the remainder is split on whitespace; the ``_STAT_*_IDX``
    constants index into that remainder.

    Returns ``{"utime", "stime", "starttime"}`` as ints, or ``None`` when the
    line has no ``')'``, is too short, or holds non-integer values. Never raises.
    """
    try:
        _head, paren, tail = stat_text.rpartition(")")
        if not paren:
            # No closing paren at all -> not a stat line we can interpret.
            return None
        fields = tail.split()
        if len(fields) < _STAT_MIN_FIELDS:
            return None
        wanted = (
            ("utime", _STAT_UTIME_IDX),
            ("stime", _STAT_STIME_IDX),
            ("starttime", _STAT_STARTTIME_IDX),
        )
        return {label: int(fields[pos]) for label, pos in wanted}
    except Exception as e:  # noqa: BLE001 - one bad line, skip this pid
        logger.debug("watchdog: cannot parse pid stat: %s", e)
        return None
|
||||
|
||||
|
||||
def decode_cmdline(raw: bytes | str | None) -> str:
    """Turn a NUL-separated ``/proc/<pid>/cmdline`` payload into one string.

    Arguments are joined with single spaces and empty segments (including the
    trailing NUL) are dropped. ``None`` and empty payloads give ``""``, which
    can never match a pattern. ``str`` input is normalised through UTF-8 first;
    undecodable bytes become U+FFFD. Never raises: anything unexpected -> ``""``.
    """
    try:
        if raw is None:
            return ""
        blob = raw.encode("utf-8", "replace") if isinstance(raw, str) else raw
        segments = blob.decode("utf-8", "replace").split("\x00")
        return " ".join(filter(None, segments))
    except Exception:  # noqa: BLE001 - undecodable cmdline -> treat as empty
        return ""
|
||||
|
||||
|
||||
def matches_patterns(cmdline: str, patterns: list[str]) -> bool:
    """Report whether ``cmdline`` contains at least one pattern as a substring.

    An empty ``cmdline`` never matches, and empty patterns are ignored (they
    would otherwise match everything). A falsy ``patterns`` means no match.

    This is where the scan is narrowed to the test-class: the collector
    filters by pattern, and the signal builder applies only the age threshold.
    """
    if not cmdline:
        return False
    return any(needle and needle in cmdline for needle in (patterns or []))
|
||||
|
||||
|
||||
def _clk_tck() -> int:
    """Clock ticks per second from ``os.sysconf('SC_CLK_TCK')``.

    Falls back to ``_DEFAULT_CLK_TCK`` when the call fails or reports a falsy
    or non-positive value. Never raises.
    """
    try:
        reported = os.sysconf("SC_CLK_TCK")
        if not reported:
            return _DEFAULT_CLK_TCK
        hz = int(reported)
        return hz if hz > 0 else _DEFAULT_CLK_TCK
    except Exception:  # noqa: BLE001 - non-Linux / unsupported -> conventional 100
        return _DEFAULT_CLK_TCK
|
||||
|
||||
|
||||
def _read_text(path: str) -> str | None:
    """Read ``path`` as text; ``None`` when it is missing or unreadable.

    The file is decoded as UTF-8 with ``errors="replace"`` rather than with the
    locale default in strict mode. The ``comm`` field inside
    ``/proc/<pid>/stat`` is process-controlled and is not guaranteed to be
    valid text; a strict decode would raise, be swallowed by the handler below,
    and silently drop that pid from the scan. Replacement characters are
    harmless here because the parser discards everything up to the last ``')'``.

    Never raises: any failure (per-pid race, permissions, no ``/proc``) -> ``None``.
    """
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except Exception:  # noqa: BLE001 - missing / unreadable -> None (per-pid race)
        return None
|
||||
|
||||
|
||||
def _read_bytes(path: str) -> bytes:
    """Return the raw contents of ``path``, or ``b""`` if it cannot be read.

    Never raises: a missing or unreadable file is treated as an empty cmdline.
    """
    try:
        with open(path, "rb") as handle:
            payload = handle.read()
    except Exception:  # noqa: BLE001 - missing / unreadable -> empty cmdline
        payload = b""
    return payload
|
||||
|
||||
|
||||
def collect_candidates(
    patterns: list[str],
    *,
    now: float,
    proc_root: str = "/proc",
    clk_tck: int | None = None,
    read_text=None,
    read_bytes=None,
) -> list[dict]:
    """List live processes under ``proc_root`` whose cmdline matches ``patterns``.

    Each match yields ``{pid, cmdline, age_s, cpu_s, start_ticks}`` where
    ``age_s = now - (btime + starttime/clk_tck)`` and
    ``cpu_s = (utime + stime)/clk_tck``. Pattern filtering is done here; no age
    threshold is applied in this function.

    Args:
        patterns: substrings tested against the decoded cmdline.
        now: current epoch seconds used as the reference for ``age_s``.
        proc_root: directory to scan (injectable for a fake ``/proc`` tree).
        clk_tck: ticks per second; a falsy or non-positive value means
            "ask ``_clk_tck()``".
        read_text / read_bytes: optional replacements for the file readers.

    Returns:
        The matching records, or ``[]`` when boot time is unavailable or any
        top-level step fails. Never raises; a pid whose files vanish or fail to
        parse is skipped without affecting the rest.
    """
    found: list[dict] = []
    try:
        load_text = read_text or _read_text
        load_bytes = read_bytes or _read_bytes
        hz = clk_tck if (clk_tck and clk_tck > 0) else _clk_tck()
        boot_epoch = parse_btime(load_text(os.path.join(proc_root, "stat")) or "")
        if boot_epoch is None:
            return []
        for entry in os.listdir(proc_root):
            if not entry.isdigit():
                # Only numeric directory names are pids.
                continue
            try:
                argv = decode_cmdline(
                    load_bytes(os.path.join(proc_root, entry, "cmdline"))
                )
                if not matches_patterns(argv, patterns):
                    continue
                # The stat file is read only for pattern matches.
                fields = parse_pid_stat(
                    load_text(os.path.join(proc_root, entry, "stat")) or ""
                )
                if fields is None:
                    continue
                started = fields["starttime"]
                found.append(
                    {
                        "pid": int(entry),
                        "cmdline": argv,
                        "age_s": now - (boot_epoch + started / hz),
                        "cpu_s": (fields["utime"] + fields["stime"]) / hz,
                        "start_ticks": started,
                    }
                )
            except Exception as e:  # noqa: BLE001 - per-pid race, skip and continue
                logger.debug("watchdog: skip /proc/%s: %s", entry, e)
                continue
    except Exception as e:  # noqa: BLE001 - non-Linux / no /proc -> whole scan yields []
        logger.warning("watchdog: proc scan error: %s", e)
        return []
    return found
|
||||
@@ -116,6 +116,16 @@ class Config:
|
||||
containers: list[str] = field(default_factory=lambda: ["orchestrator"])
|
||||
docker_sock: str = "/var/run/docker.sock"
|
||||
|
||||
# -- blocking test/child processes (opt-in; pid: host /proc scan, D5) --
|
||||
# Default-OFF: the signal needs the `pid: host` privilege (D6) and a
|
||||
# conscious opt-in (mirror of disk_crit_enabled). proc_age_min MUST exceed
|
||||
# max(merge_retest_timeout_s, coverage_run_timeout_s)/60 so a legitimate
|
||||
# in-flight test run never crosses the threshold (D2 / AC-4).
|
||||
proc_enabled: bool = False
|
||||
proc_age_min: float = 60.0 # minutes a test process may live before alerting
|
||||
proc_patterns: list[str] = field(default_factory=lambda: ["pytest"])
|
||||
proc_cooldown_s: float = 1800.0 # per-signal re-alert throttle
|
||||
|
||||
# -- external dependencies -------------------------------------------
|
||||
deps: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
@@ -132,6 +142,10 @@ class Config:
|
||||
def stage_stuck_s(self) -> float:
|
||||
return self.stage_stuck_min * 60.0
|
||||
|
||||
@property
def proc_age_s(self) -> float:
    """``proc_age_min`` converted from minutes to seconds."""
    minutes = self.proc_age_min
    return minutes * 60.0
|
||||
|
||||
@classmethod
|
||||
def from_env(cls, env: dict | None = None) -> "Config":
|
||||
"""Build a Config from ``env`` (defaults to ``os.environ``). never-raise."""
|
||||
@@ -153,6 +167,10 @@ class Config:
|
||||
queue_depth=_int(e, "WATCHDOG_QUEUE_DEPTH", 20),
|
||||
containers=_csv(e, "WATCHDOG_CONTAINERS", ["orchestrator"]),
|
||||
docker_sock=_str(e, "WATCHDOG_DOCKER_SOCK", "/var/run/docker.sock"),
|
||||
proc_enabled=_bool(e, "WATCHDOG_PROC_ENABLED", False),
|
||||
proc_age_min=_float(e, "WATCHDOG_PROC_AGE_MIN", 60.0),
|
||||
proc_patterns=_csv(e, "WATCHDOG_PROC_PATTERNS", ["pytest"]),
|
||||
proc_cooldown_s=_float(e, "WATCHDOG_PROC_COOLDOWN_S", 1800.0),
|
||||
deps=_deps(e, "WATCHDOG_DEPS"),
|
||||
tg_bot_token=_str(e, "WATCHDOG_TG_BOT_TOKEN", ""),
|
||||
tg_chat_id=_str(e, "WATCHDOG_TG_CHAT_ID", ""),
|
||||
|
||||
@@ -19,6 +19,7 @@ from .collectors import containers as containers_mod
|
||||
from .collectors import deps as deps_mod
|
||||
from .collectors import host as host_mod
|
||||
from .collectors import orch as orch_mod
|
||||
from .collectors import proc as proc_mod
|
||||
from .config import Config
|
||||
from .notify import Notifier
|
||||
from . import signals as signals_mod
|
||||
@@ -93,6 +94,18 @@ class Watchdog:
|
||||
logger.warning("watchdog: deps collect error: %s", e)
|
||||
return {}
|
||||
|
||||
def _collect_proc(self, now: float) -> list:
    """Run the ``/proc`` candidate scan when the feature is switched on.

    With ``cfg.proc_enabled`` false the collector is not called at all and
    ``[]`` is returned. Otherwise the configured patterns are handed to
    ``proc_mod.collect_candidates``; any exception from it is logged as a
    warning and also yields ``[]`` (never raises).
    """
    if self.cfg.proc_enabled:
        try:
            return proc_mod.collect_candidates(self.cfg.proc_patterns, now=now)
        except Exception as e:  # noqa: BLE001 - never-raise: one signal skipped
            logger.warning("watchdog: proc collect error: %s", e)
    return []
|
||||
|
||||
# -- one tick ---------------------------------------------------------
|
||||
def tick(self) -> list:
|
||||
"""Run one full pass; returns the dispatched ``(action, Signal)`` list.
|
||||
@@ -134,10 +147,53 @@ class Watchdog:
|
||||
# 4) external dependency pings
|
||||
built.extend(signals_mod.dep_signals(self._collect_deps()))
|
||||
|
||||
# 5) long-lived blocking test/child processes (opt-in; pid: host /proc).
|
||||
# Gated entirely on proc_enabled so a disabled sidecar is byte-for-byte
|
||||
# as before ORCH-111 (D5/AC-7); RECOVERY for a vanished process is
|
||||
# synthesised through the SAME decide()/AlertState machinery (D4).
|
||||
if self.cfg.proc_enabled:
|
||||
proc_sigs = signals_mod.proc_signals(self.cfg, self._collect_proc(now))
|
||||
proc_sigs.extend(self._synthesize_proc_recoveries(proc_sigs))
|
||||
built.extend(proc_sigs)
|
||||
|
||||
dispatched = self._dispatch(built, now)
|
||||
self.last_run_ts = now
|
||||
return dispatched
|
||||
|
||||
def _synthesize_proc_recoveries(self, current_sigs: list) -> list:
    """Build ``active=False`` signals for alerting proc_blocking keys that vanished.

    A key qualifies when it is a non-empty tuple whose first element is
    ``"proc_blocking"``, its entry in ``self._states`` has ``alerting`` set,
    and no signal in ``current_sigs`` carries the same key. For each such key
    one inactive ``Signal`` is produced so the regular dispatch path can treat
    the disappearance as a recovery.

    Never raises: on error a warning is logged and whatever was collected so
    far is returned.
    """
    recoveries: list = []
    try:
        seen_keys = {sig.key for sig in current_sigs}
        # Iterate a snapshot of the state items.
        for key, state in list(self._states.items()):
            if not (isinstance(key, tuple) and key):
                continue
            if key[0] != "proc_blocking":
                continue
            if not state.alerting or key in seen_keys:
                continue
            recoveries.append(
                signals_mod.Signal(
                    key=key,
                    active=False,
                    title="Блокирующий процесс",
                    detail=f"процесс PID {key[1]} завершился",
                )
            )
    except Exception as e:  # noqa: BLE001 - never-raise: skip recovery synthesis
        logger.warning("watchdog: proc recovery synth error: %s", e)
    return recoveries
|
||||
|
||||
# -- decision + dispatch ----------------------------------------------
|
||||
def _dispatch(self, built: list, now: float) -> list:
|
||||
"""Run each signal through ``decide`` and send alert/realert/recovery."""
|
||||
|
||||
@@ -246,6 +246,54 @@ def container_signals(cfg: Config, statuses: dict) -> list:
|
||||
return sigs
|
||||
|
||||
|
||||
# Upper bound on the cmdline fragment shown in an alert, so that a long
# argument list is not dumped wholesale into the alert channel.
_PROC_CMDLINE_MAX = 120


def proc_signals(cfg: Config, candidates: list) -> list:
    """Turn candidate process records into per-process ``proc_blocking`` signals.

    Each record is expected to look like
    ``{pid, cmdline, age_s, cpu_s?, start_ticks?}``. Records without ``pid`` or
    ``age_s`` are ignored. For the rest, one ``Signal`` keyed
    ``("proc_blocking", pid)`` is emitted; it is ``active`` iff
    ``age_s > cfg.proc_age_s`` and carries ``cfg.proc_cooldown_s`` as cooldown.

    The detail text contains the PID, age and threshold in whole seconds, the
    cmdline cut to ``_PROC_CMDLINE_MAX`` characters (with an ellipsis when
    cut), and accumulated CPU seconds when ``cpu_s`` is present.

    A record that fails to build is logged as a warning and skipped; the
    remaining records are still processed.
    """
    built: list = []
    for rec in candidates or []:
        try:
            pid, age_s = rec.get("pid"), rec.get("age_s")
            if pid is None or age_s is None:
                continue
            full_cmd = (rec.get("cmdline") or "").strip()
            if len(full_cmd) > _PROC_CMDLINE_MAX:
                shown = full_cmd[:_PROC_CMDLINE_MAX] + "…"
            else:
                shown = full_cmd
            pieces = [
                f"PID {pid} живёт {int(age_s)}s ",
                f"(порог {int(cfg.proc_age_s)}s): {shown}",
            ]
            cpu_s = rec.get("cpu_s")
            if cpu_s is not None:
                pieces.append(f" · CPU {int(cpu_s)}s")
            built.append(
                Signal(
                    key=("proc_blocking", pid),
                    active=age_s > cfg.proc_age_s,
                    title="Блокирующий процесс",
                    detail="".join(pieces),
                    cooldown_s=cfg.proc_cooldown_s,
                )
            )
        except Exception as e:  # noqa: BLE001 - one bad record, others still build
            logger.warning("watchdog: proc signal build error: %s", e)
    return built
|
||||
|
||||
|
||||
def dep_signals(reachability: dict) -> list:
|
||||
"""Build per-dependency down signals from ``{name: reachable}``. Pure."""
|
||||
sigs: list = []
|
||||
|
||||
Reference in New Issue
Block a user