# orchestrator/watchdog/collectors/proc.py

"""Collector: long-lived host processes whose cmdline matches a test-class (ORCH-111).

stdlib-only ``/proc`` scan (ADR-001 D3). Under ``pid: host`` (D6) the container's
``/proc`` reflects the host PID-namespace, so the sidecar sees the orphaned
``pytest`` subprocess regardless of which container spawned it (the merge-gate /
coverage-gate re-test the orchestrator launches itself; on a timeout-kill of the
agent — ``exit_code=-9``, ORCH-109 — the grand-child ``pytest`` reparents onto
tini and keeps running for days).

Strictly **READ-ONLY** (BR-3 / NFR-2): opens only ``/proc/stat``,
``/proc/<pid>/stat`` and ``/proc/<pid>/cmdline`` for reading. There is **no**
``os.kill``, signal-send, ``subprocess`` or any mutation on this path, and it
**never** reads ``/proc/<pid>/environ`` (secrets, ADR-001 D3 / R-2).

**never-raise** (NFR-1): a per-pid race — the process died between ``listdir``
and ``read`` — skips that pid without breaking the list; any top-level failure
(non-Linux / missing ``/proc`` / unreadable ``/proc/stat``) degrades the whole
scan to ``[]`` (one signal skipped, the tick lives, D8).

Pure parsing (``parse_btime`` / ``parse_pid_stat`` / ``decode_cmdline`` /
``matches_patterns``) is split from the I/O orchestration (``collect_candidates``)
so the scan is testable against a fake ``/proc`` tree, no real host needed.
"""
from __future__ import annotations

import logging
import os

# Module logger. The name is a fixed string rather than ``__name__``.
logger = logging.getLogger("watchdog.collectors.proc")

# /proc/<pid>/stat field indices, 0-based AFTER the trailing ')' of `comm`.
# /proc/<pid>/stat is: `pid (comm) state ppid ... utime stime ... starttime ...`.
# Fields are 1-based in proc(5); field 3 (state) is the first token after the
# last ')'. So field N (>=3) lives at index N-3 of the post-')'-split:
#   utime     = field 14 -> index 11
#   stime     = field 15 -> index 12
#   starttime = field 22 -> index 19
_STAT_UTIME_IDX = 11
_STAT_STIME_IDX = 12
_STAT_STARTTIME_IDX = 19
# Shortest post-')' token list that still contains `starttime`; anything
# shorter is treated as a truncated line by `parse_pid_stat`.
_STAT_MIN_FIELDS = _STAT_STARTTIME_IDX + 1  # need starttime present

# Fallback clock-ticks-per-second returned by `_clk_tck` when
# `os.sysconf("SC_CLK_TCK")` raises or yields a non-positive / empty value.
_DEFAULT_CLK_TCK = 100


def parse_btime(stat_text: str) -> int | None:
    """Extract the boot time (epoch seconds) from ``/proc/stat`` content.

    Looks for the first ``btime <N>`` line that carries a value token and
    returns ``N`` as an ``int``. A ``btime`` line without a value is passed
    over and the search continues with the following lines.

    Returns ``None`` when no usable line exists. Any exception raised while
    scanning (non-string input, non-numeric value) is logged at WARNING level
    and also yields ``None`` — this function never raises, so the caller can
    drop the whole scan instead of computing a bogus process age.
    """
    try:
        for row in stat_text.splitlines():
            if not row.startswith("btime "):
                continue
            tokens = row.split()
            if len(tokens) < 2:
                # "btime" with no value: keep looking.
                continue
            return int(tokens[1])
    except Exception as e:  # noqa: BLE001 - tolerant: no btime -> no scan
        logger.warning("watchdog: cannot parse /proc/stat btime: %s", e)
    return None


def parse_pid_stat(stat_text: str) -> dict | None:
    """Pull ``utime``, ``stime`` and ``starttime`` out of a ``/proc/<pid>/stat`` line.

    The parenthesised ``comm`` field may itself contain spaces and parens, so
    the line is cut at its **last** ``')'`` and only the whitespace-separated
    tokens after that point are indexed (see the ``_STAT_*_IDX`` constants).

    Returns ``{"utime", "stime", "starttime"}`` with integer clock-tick values,
    or ``None`` when the line has no ``')'``, carries fewer than
    ``_STAT_MIN_FIELDS`` trailing tokens, or cannot be parsed. Parse failures
    are logged at DEBUG level; the function never raises.
    """
    try:
        _, paren, tail = stat_text.rpartition(")")
        if not paren:
            # No closing paren at all: not a stat line.
            return None
        fields = tail.split()
        if len(fields) < _STAT_MIN_FIELDS:
            return None
        utime = int(fields[_STAT_UTIME_IDX])
        stime = int(fields[_STAT_STIME_IDX])
        starttime = int(fields[_STAT_STARTTIME_IDX])
        return {"utime": utime, "stime": stime, "starttime": starttime}
    except Exception as e:  # noqa: BLE001 - one bad line, skip this pid
        logger.debug("watchdog: cannot parse pid stat: %s", e)
        return None


def decode_cmdline(raw: bytes | str | None) -> str:
    """Turn a NUL-separated ``/proc/<pid>/cmdline`` blob into one space-joined string.

    ``None`` yields ``""``. A ``str`` input is first encoded to UTF-8 (with
    replacement) so both input kinds go through the same decode step; bytes are
    decoded as UTF-8 with undecodable sequences replaced. Empty segments
    (trailing / doubled NULs) are dropped before joining.

    An empty cmdline therefore produces ``""``, which never matches a pattern.
    Any unexpected failure (e.g. an input type without ``decode``) also
    produces ``""`` — this function never raises.
    """
    try:
        if raw is None:
            return ""
        payload = raw.encode("utf-8", "replace") if isinstance(raw, str) else raw
        argv = payload.decode("utf-8", "replace").split("\x00")
        return " ".join(filter(None, argv))
    except Exception:  # noqa: BLE001 - undecodable cmdline -> treat as empty
        return ""


def matches_patterns(cmdline: str, patterns: list[str]) -> bool:
    """Report whether ``cmdline`` contains at least one pattern as a substring.

    An empty ``cmdline`` never matches. ``patterns`` may be ``None`` or empty
    (no match), and empty pattern strings are ignored so that ``""`` does not
    match everything.

    This substring filter defines the collector's test-class scope (ADR-001 D4);
    the signal builder downstream only applies the age threshold.
    """
    if not cmdline:
        return False
    return any(needle in cmdline for needle in (patterns or []) if needle)


def _clk_tck() -> int:
    """Return the kernel clock-tick rate, falling back to ``_DEFAULT_CLK_TCK``.

    Queries ``os.sysconf("SC_CLK_TCK")``. A falsy or non-positive answer, or any
    exception from the call (e.g. ``sysconf`` missing on this platform), yields
    the default instead. Never raises.
    """
    try:
        raw = os.sysconf("SC_CLK_TCK")
        if not raw:
            return _DEFAULT_CLK_TCK
        rate = int(raw)
        if rate > 0:
            return rate
        return _DEFAULT_CLK_TCK
    except Exception:  # noqa: BLE001 - non-Linux / unsupported -> conventional 100
        return _DEFAULT_CLK_TCK


def _read_text(path: str) -> str | None:
    """Read ``path`` as text; return ``None`` if it cannot be opened or read.

    The file is decoded as UTF-8 with ``errors="replace"`` rather than with the
    locale default and strict errors. ``/proc/<pid>/stat`` embeds the process
    ``comm`` verbatim, and that name may hold bytes that are not valid in the
    active locale encoding; with strict decoding such a process would raise
    ``UnicodeDecodeError`` here and be silently dropped from the scan. The
    numeric fields consumed downstream are plain ASCII, so replacing stray
    bytes inside ``comm`` does not affect parsing.

    Never raises: a missing / unreadable file (typically a process that exited
    between ``listdir`` and the read) maps to ``None``.
    """
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except Exception:  # noqa: BLE001 - missing / unreadable -> None (per-pid race)
        return None


def _read_bytes(path: str) -> bytes:
    """Read ``path`` as raw bytes; return ``b""`` if it cannot be opened or read.

    Never raises. An unreadable or vanished file is reported as an empty
    payload, which the cmdline decoder turns into an empty (non-matching)
    command line.
    """
    payload = b""
    try:
        with open(path, mode="rb") as handle:
            payload = handle.read()
    except Exception:  # noqa: BLE001 - missing / unreadable -> empty cmdline
        return b""
    return payload


def collect_candidates(
    patterns: list[str],
    *,
    now: float,
    proc_root: str = "/proc",
    clk_tck: int | None = None,
    read_text=None,
    read_bytes=None,
) -> list[dict]:
    """List live processes under ``proc_root`` whose cmdline matches ``patterns``.

    For every numeric directory entry the cmdline is read and decoded first;
    only when it matches a pattern is the pid's ``stat`` file read and parsed.
    Each match produces one record::

        {"pid", "cmdline", "age_s", "cpu_s", "start_ticks"}

    where ``age_s = now - (btime + starttime / ticks)`` and
    ``cpu_s = (utime + stime) / ticks``. Pattern filtering is done here (D4);
    no age threshold is applied in this function.

    Args:
        patterns: substrings to look for in the decoded cmdline.
        now: current time in epoch seconds, used as the age reference.
        proc_root: root of the proc tree to scan.
        clk_tck: clock ticks per second; when falsy or non-positive the value
            from ``_clk_tck()`` is used instead.
        read_text: optional replacement for ``_read_text`` (path -> str | None).
        read_bytes: optional replacement for ``_read_bytes`` (path -> bytes).

    Returns:
        The list of records, possibly empty. Never raises (D8): if boot time
        cannot be determined or anything fails at the top level (e.g.
        ``proc_root`` cannot be listed) the result is ``[]``; a failure on a
        single pid is logged at DEBUG level and that pid is skipped.
    """
    try:
        text_reader = read_text or _read_text
        bytes_reader = read_bytes or _read_bytes
        if clk_tck and clk_tck > 0:
            tick_rate = clk_tck
        else:
            tick_rate = _clk_tck()

        # Without a boot time no absolute start time can be derived.
        boot_epoch = parse_btime(text_reader(os.path.join(proc_root, "stat")) or "")
        if boot_epoch is None:
            return []

        found: list[dict] = []
        for name in os.listdir(proc_root):
            if not name.isdigit():
                # Only numeric entries are per-process directories.
                continue
            try:
                cmdline = decode_cmdline(
                    bytes_reader(os.path.join(proc_root, name, "cmdline"))
                )
                if not matches_patterns(cmdline, patterns):
                    continue
                fields = parse_pid_stat(
                    text_reader(os.path.join(proc_root, name, "stat")) or ""
                )
                if fields is None:
                    continue
                started = fields["starttime"]
                record = {
                    "pid": int(name),
                    "cmdline": cmdline,
                    "age_s": now - (boot_epoch + started / tick_rate),
                    "cpu_s": (fields["utime"] + fields["stime"]) / tick_rate,
                    "start_ticks": started,
                }
                found.append(record)
            except Exception as e:  # noqa: BLE001 - per-pid race, skip and continue
                logger.debug("watchdog: skip /proc/%s: %s", name, e)
        return found
    except Exception as e:  # noqa: BLE001 - non-Linux / no /proc -> whole scan degrades to []
        logger.warning("watchdog: proc scan error: %s", e)
        return []