"""Collector: long-lived host processes whose cmdline matches a test-class (ORCH-111). stdlib-only ``/proc`` scan (ADR-001 D3). Under ``pid: host`` (D6) the container's ``/proc`` reflects the host PID-namespace, so the sidecar sees the orphaned ``pytest`` subprocess regardless of which container spawned it (the merge-gate / coverage-gate re-test the orchestrator launches itself; on a timeout-kill of the agent — ``exit_code=-9``, ORCH-109 — the grand-child ``pytest`` reparents onto tini and keeps running for days). Strictly **READ-ONLY** (BR-3 / NFR-2): opens only ``/proc/stat``, ``/proc//stat`` and ``/proc//cmdline`` for reading. There is **no** ``os.kill``, signal-send, ``subprocess`` or any mutation on this path, and it **never** reads ``/proc//environ`` (secrets, ADR-001 D3 / R-2). **never-raise** (NFR-1): a per-pid race — the process died between ``listdir`` and ``read`` — skips that pid without breaking the list; any top-level failure (non-Linux / missing ``/proc`` / unreadable ``/proc/stat``) degrades the whole scan to ``[]`` (one signal skipped, the tick lives, D8). Pure parsing (``parse_btime`` / ``parse_pid_stat`` / ``decode_cmdline`` / ``matches_patterns``) is split from the I/O orchestration (``collect_candidates``) so the scan is testable against a fake ``/proc`` tree, no real host needed. """ from __future__ import annotations import logging import os logger = logging.getLogger("watchdog.collectors.proc") # /proc//stat field indices, 0-based AFTER the trailing ')' of `comm`. # /proc//stat is: `pid (comm) state ppid ... utime stime ... starttime ...`. # Fields are 1-based in proc(5); field 3 (state) is the first token after the # last ')'. So field N (>=3) lives at index N-3 of the post-')'-split: # utime = field 14 -> index 11 # stime = field 15 -> index 12 # starttime = field 22 -> index 19 _STAT_UTIME_IDX = 11 _STAT_STIME_IDX = 12 _STAT_STARTTIME_IDX = 19 _STAT_MIN_FIELDS = _STAT_STARTTIME_IDX + 1 # need starttime present _DEFAULT_CLK_TCK = 100 def parse_btime(stat_text: str) -> int | None: """Boot time (epoch seconds) from the ``btime `` line of ``/proc/stat``. Returns ``None`` when the line is absent / unparseable (never raises) so the caller degrades the whole scan to ``[]`` rather than emitting a bogus age. """ try: for line in stat_text.splitlines(): if line.startswith("btime "): parts = line.split() if len(parts) >= 2: return int(parts[1]) except Exception as e: # noqa: BLE001 - tolerant: no btime -> no scan logger.warning("watchdog: cannot parse /proc/stat btime: %s", e) return None def parse_pid_stat(stat_text: str) -> dict | None: """Parse ``/proc//stat`` -> ``{utime, stime, starttime}`` (clock ticks). The ``comm`` field (2) is wrapped in parens and may itself contain spaces or parens (e.g. ``(python -m) ()``), so we split AFTER the **last** ``')'`` and index the remaining space-separated fields. Returns ``None`` on a malformed / truncated line (never raises). """ try: rparen = stat_text.rfind(")") if rparen < 0: return None rest = stat_text[rparen + 1:].split() if len(rest) < _STAT_MIN_FIELDS: return None return { "utime": int(rest[_STAT_UTIME_IDX]), "stime": int(rest[_STAT_STIME_IDX]), "starttime": int(rest[_STAT_STARTTIME_IDX]), } except Exception as e: # noqa: BLE001 - one bad line, skip this pid logger.debug("watchdog: cannot parse pid stat: %s", e) return None def decode_cmdline(raw: bytes | str | None) -> str: """NUL-separated ``/proc//cmdline`` -> a space-joined string. Empty for kernel threads (they carry no cmdline) -> never matches a pattern. Tolerant of bytes / str / ``None`` and undecodable bytes (never raises). """ try: if raw is None: return "" if isinstance(raw, str): raw = raw.encode("utf-8", "replace") text = raw.decode("utf-8", "replace") parts = [p for p in text.split("\x00") if p] return " ".join(parts) except Exception: # noqa: BLE001 - undecodable cmdline -> treat as empty return "" def matches_patterns(cmdline: str, patterns: list[str]) -> bool: """``True`` iff ``cmdline`` contains ANY pattern as a substring. This is the test-class scope (ADR-001 D4): pattern-filtering happens in the collector, so the signal builder only applies the age threshold. The default pattern ``pytest`` never matches a ``claude`` agent cmdline -> zero overlap with ``agent_hung`` by construction (NFR-4 / AC-5). """ if not cmdline: return False for p in patterns or []: if p and p in cmdline: return True return False def _clk_tck() -> int: """``os.sysconf('SC_CLK_TCK')`` with a safe fallback (never raises).""" try: v = os.sysconf("SC_CLK_TCK") return int(v) if v and int(v) > 0 else _DEFAULT_CLK_TCK except Exception: # noqa: BLE001 - non-Linux / unsupported -> conventional 100 return _DEFAULT_CLK_TCK def _read_text(path: str) -> str | None: try: with open(path, "r") as f: return f.read() except Exception: # noqa: BLE001 - missing / unreadable -> None (per-pid race) return None def _read_bytes(path: str) -> bytes: try: with open(path, "rb") as f: return f.read() except Exception: # noqa: BLE001 - missing / unreadable -> empty cmdline return b"" def collect_candidates( patterns: list[str], *, now: float, proc_root: str = "/proc", clk_tck: int | None = None, read_text=None, read_bytes=None, ) -> list[dict]: """Scan ``/proc`` for live processes whose cmdline matches ``patterns``. Returns one ``{pid, cmdline, age_s, cpu_s, start_ticks}`` record per matching live process. Pattern-filtering happens HERE (D4); the builder applies the age threshold. ``age_s = now - (btime + starttime/clk_tck)``; ``cpu_s = (utime + stime)/clk_tck`` (accumulated CPU time — informational for BR-2, NOT part of activation). never-raise (D8): a top-level failure -> ``[]``; a per-pid race (vanished process / unreadable file) is skipped silently. ``proc_root`` / ``now`` / ``clk_tck`` / ``read_*`` are injectable so the scan is unit-testable against a fake ``/proc`` tree with no real host. """ out: list[dict] = [] try: rt = read_text or _read_text rb = read_bytes or _read_bytes ticks = clk_tck if (clk_tck and clk_tck > 0) else _clk_tck() btime = parse_btime(rt(os.path.join(proc_root, "stat")) or "") if btime is None: return [] for entry in os.listdir(proc_root): if not entry.isdigit(): continue try: cmdline = decode_cmdline(rb(os.path.join(proc_root, entry, "cmdline"))) if not matches_patterns(cmdline, patterns): continue st = parse_pid_stat(rt(os.path.join(proc_root, entry, "stat")) or "") if st is None: continue start_ticks = st["starttime"] age_s = now - (btime + start_ticks / ticks) cpu_s = (st["utime"] + st["stime"]) / ticks out.append( { "pid": int(entry), "cmdline": cmdline, "age_s": age_s, "cpu_s": cpu_s, "start_ticks": start_ticks, } ) except Exception as e: # noqa: BLE001 - per-pid race, skip and continue logger.debug("watchdog: skip /proc/%s: %s", entry, e) continue except Exception as e: # noqa: BLE001 - non-Linux / no /proc -> one signal tih logger.warning("watchdog: proc scan error: %s", e) return [] return out