# orchestrator/src/proc_group.py

"""Process-group runner with tree-kill on timeout (ORCH-110 D1).

Background
----------
The orchestrator spawns its OWN pytest subprocesses OUTSIDE the LLM-agent path:
``merge_gate.retest_branch`` (the merge-gate re-test) and
``coverage_gate.measure_coverage`` (the coverage measurement). Both used a plain
``subprocess.run(..., timeout=...)``. On ``TimeoutExpired`` CPython kills only the
DIRECT child (``proc.kill()``); any grandchildren (pytest-xdist workers, tests that
spawn their own processes) are re-parented to PID 1 and keep running — for days,
starving the host CPU. That orphan leak is a ROOT cause of the ORCH-109 / PR #129
incident (a 516s suite blew its 600s budget under CPU starvation -> a false
merge-gate rollback after a green CI).

This leaf gives those orchestrator-spawned runs the SAME SIGTERM->grace->SIGKILL
cascade the LLM agents already get from ``launcher.stop_process`` — but at the
process-GROUP level so the WHOLE subtree dies on a timeout:

  * the child becomes the leader of a NEW session / process group
    (``start_new_session=True`` -> ``setsid``), so every descendant shares
    ``pgid == child.pid``;
  * on timeout the whole group is killed with ``os.killpg`` (SIGTERM, grace poll,
    then SIGKILL), then reaped (no zombies).

Invariants:
  * **stdlib-only leaf** — imports only ``os`` / ``signal`` / ``subprocess`` /
    ``time`` / ``logging`` / ``dataclasses``; NEVER another ``src/*`` module
    (mirrors the purity of ``serial_gate`` / ``staging_verdict``).
  * **never-raise** — any OS error degrades to a safe ``ProcResult`` (the caller maps
    it to its existing contract); an exception never escapes.
  * **never-break fallback** — with ``tree_kill=False`` OR on a platform without
    ``os.killpg`` / ``os.setsid`` (non-POSIX) it degrades to the prior
    ``subprocess.run(cmd, ..., timeout=timeout)`` (byte-for-byte the old behaviour;
    prod is Linux and never hits the fallback).
"""

import logging
import os
import signal
import subprocess
import time
from dataclasses import dataclass

logger = logging.getLogger("orchestrator.proc_group")


@dataclass
class ProcResult:
    """Outcome of one orchestrator-spawned command run (ORCH-110 D1).

    Produced by ``run_in_process_group`` and by its ``_run_plain`` fallback. The
    producers degrade OS-level failures into a ``ProcResult`` instead of raising,
    so callers inspect the fields below rather than catching exceptions.
    """

    # Child's exit status as reported by ``subprocess``. ``None`` when no status
    # was obtained: a spawn error, a timeout on the ``_run_plain`` fallback, or a
    # child that was never reaped. On the tree-kill path a timed-out child that WAS
    # reaped carries whatever ``Popen.returncode`` holds after the group kill, so
    # use ``timed_out`` (not ``returncode is None``) to detect a timeout.
    returncode: int | None
    # Captured standard output as text; ``""`` when nothing was captured.
    stdout: str
    # Captured standard error as text; on spawn/communicate errors this holds
    # ``str(exception)`` instead of the child's own stderr.
    stderr: str
    # True iff the command exceeded its wall-clock budget (``TimeoutExpired``).
    timed_out: bool


def _tree_kill_supported() -> bool:
    """Report whether ``os`` exposes the process-group calls tree-kill relies on.

    Returns ``False`` when any of ``os.killpg`` / ``os.getpgid`` / ``os.setsid`` is
    missing (e.g. a non-POSIX platform); the caller then falls back to a plain
    ``subprocess.run``.
    """
    required = ("killpg", "getpgid", "setsid")
    return all(hasattr(os, attr) for attr in required)


def _kill_group(pid: int, grace_s: float) -> None:
    """Terminate every process in the process group that ``pid`` belongs to.

    Sequence: SIGTERM the group, probe it every 0.2s for up to ``grace_s`` seconds
    (negative values are treated as 0, i.e. no grace), then SIGKILL whatever is
    left. Same cascade as ``launcher.stop_process``, but aimed at the GROUP via
    ``os.killpg`` so grandchildren are hit as well.

    ``ProcessLookupError`` at any step means the group is already gone and ends the
    routine quietly; other ``OSError``s are logged (SIGTERM / SIGKILL) or cut the
    grace window short (probe). No ``OSError`` escapes.

    NOTE(review): the signal-0 probe presumably also counts a member that has
    exited but has not been waited on yet (e.g. the group leader when the caller
    reaps only afterwards), in which case the early return would rarely fire and
    the full grace window is spent — confirm against the caller's reap order.
    """
    try:
        group_id = os.getpgid(pid)
    except OSError:
        # ProcessLookupError is an OSError: either way there is nothing to signal.
        return

    # Step 1 - polite request: SIGTERM to the whole group.
    try:
        os.killpg(group_id, signal.SIGTERM)
    except ProcessLookupError:
        return
    except OSError as exc:  # noqa: BLE001 - never-raise
        logger.warning("proc_group: SIGTERM killpg(%s) error: %s", group_id, exc)

    # Step 2 - give the group up to ``grace_s`` seconds to exit on its own.
    step = 0.2
    budget = max(0.0, grace_s)
    elapsed = 0.0
    while elapsed < budget:
        time.sleep(step)
        elapsed += step
        try:
            os.killpg(group_id, 0)  # signal 0: existence check only, nothing is sent
        except ProcessLookupError:
            return  # nobody left in the group
        except OSError:
            break  # probe unusable -> stop waiting, go straight to SIGKILL

    # Step 3 - grace exhausted (or probe failed): SIGKILL the group.
    try:
        os.killpg(group_id, signal.SIGKILL)
    except ProcessLookupError:
        pass
    except OSError as exc:  # noqa: BLE001 - never-raise
        logger.warning("proc_group: SIGKILL killpg(%s) error: %s", group_id, exc)


def _run_plain(cmd, *, cwd, timeout, env=None) -> ProcResult:
    """Fallback path: plain ``subprocess.run`` with a timeout (no tree-kill).

    Args:
        cmd: argv handed to ``subprocess.run`` unchanged.
        cwd: working directory for the child.
        timeout: wall-clock budget in seconds, forwarded to ``subprocess.run``.
        env: environment mapping for the child, or ``None`` to inherit.

    Returns:
        A ``ProcResult``:
          * normal completion -> the child's return code and captured text
            (``None`` output is normalised to ``""``), ``timed_out=False``;
          * ``TimeoutExpired`` -> ``returncode=None``, empty output,
            ``timed_out=True`` (output captured before the timeout is discarded);
          * any other spawn/run failure -> ``returncode=None``,
            ``stderr=str(error)``, ``timed_out=False``.

    Never raises for ``SubprocessError`` / ``OSError`` / ``ValueError``.
    """
    try:
        completed = subprocess.run(
            cmd, cwd=cwd, env=env, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return ProcResult(returncode=None, stdout="", stderr="", timed_out=True)
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # ValueError must be caught to honour the never-raise contract: Popen raises
        # it for invalid arguments (e.g. an embedded NUL in argv) and text-mode
        # decoding raises UnicodeDecodeError (a ValueError) on undecodable output.
        logger.warning("proc_group(_run_plain): error for %s: %s", cmd, e)
        return ProcResult(returncode=None, stdout="", stderr=str(e), timed_out=False)

    return ProcResult(
        returncode=completed.returncode,
        stdout=completed.stdout or "",
        stderr=completed.stderr or "",
        timed_out=False,
    )


def _reap_child(proc, wait_s: float) -> None:
    """Bounded ``wait`` on a killed child so it is not left as a zombie. Never raises."""
    try:
        proc.wait(timeout=wait_s)
    except subprocess.TimeoutExpired:
        logger.warning(
            "proc_group: pid %s not reaped within %ss after group kill", proc.pid, wait_s
        )
    except (OSError, ValueError) as e:  # noqa: BLE001 - never-raise
        logger.warning("proc_group: wait error for pid %s: %s", proc.pid, e)


def _close_pipes(proc) -> None:
    """Close the parent-side stdout/stderr pipe objects. Idempotent; never raises."""
    for stream in (proc.stdout, proc.stderr):
        if stream is None:
            continue
        try:
            stream.close()
        except (OSError, ValueError):
            # Best-effort cleanup: a failed close must not mask the run's result.
            pass


def run_in_process_group(
    cmd, *, cwd, timeout, env=None, grace_s: float = 5.0, tree_kill: bool = True
) -> ProcResult:
    """Run ``cmd`` in its own session / process group; kill the whole group on timeout.

    The child is started with ``start_new_session=True`` so it leads a new process
    group; when ``timeout`` expires the group is torn down through ``_kill_group``
    (SIGTERM -> grace -> SIGKILL), remaining output is drained and the child reaped.

    Args:
        cmd: argv handed to ``subprocess.Popen`` unchanged.
        cwd: working directory for the child.
        timeout: wall-clock budget in seconds for the first ``communicate``.
        env: environment mapping for the child, or ``None`` to inherit.
        grace_s: seconds ``_kill_group`` allows between SIGTERM and SIGKILL; the
            post-kill drain / reap windows are ``grace_s + 5.0`` seconds each.
        tree_kill: ``False`` forces the plain ``subprocess.run`` fallback.

    Returns:
        A ``ProcResult``. ``timed_out=True`` iff the first ``communicate`` raised
        ``TimeoutExpired``. Spawn and communicate errors yield ``timed_out=False``
        with ``stderr=str(error)`` and empty ``stdout``.

    Never raises for ``OSError`` / ``ValueError`` / ``TimeoutExpired``.

    Fallback (never-break): ``tree_kill=False`` or a platform without the required
    ``os`` calls -> ``_run_plain`` (the prior ``subprocess.run`` behaviour).
    """
    if not tree_kill or not _tree_kill_supported():
        return _run_plain(cmd, cwd=cwd, timeout=timeout, env=env)

    try:
        proc = subprocess.Popen(  # noqa: S603 - cmd is an internal, fixed argv
            cmd,
            cwd=cwd,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            start_new_session=True,  # setsid -> child leads a new process group
        )
    except (OSError, ValueError) as e:
        logger.warning("proc_group: spawn error for %s: %s", cmd, e)
        return ProcResult(returncode=None, stdout="", stderr=str(e), timed_out=False)

    try:
        out, err = proc.communicate(timeout=timeout)
        return ProcResult(
            returncode=proc.returncode, stdout=out or "", stderr=err or "",
            timed_out=False,
        )
    except subprocess.TimeoutExpired:
        # Tree-kill the WHOLE group (children + grandchildren), then drain what is
        # left in the pipes; a retried communicate() keeps the output already read.
        _kill_group(proc.pid, grace_s)
        out, err = "", ""
        try:
            out, err = proc.communicate(timeout=grace_s + 5.0)
        except (subprocess.TimeoutExpired, OSError, ValueError):
            # Drain failed (e.g. something outside the group still holds the pipe):
            # at least reap the direct child so no zombie is left behind.
            _reap_child(proc, grace_s + 5.0)
        return ProcResult(
            returncode=proc.returncode, stdout=out or "", stderr=err or "",
            timed_out=True,
        )
    except (OSError, ValueError) as e:  # noqa: BLE001 - never-raise
        logger.warning("proc_group: communicate error for %s: %s", cmd, e)
        # Only signal while the child is still un-reaped: once it has been waited
        # on (e.g. a decode error raised after the wait) its pid may be recycled.
        if proc.returncode is None:
            _kill_group(proc.pid, grace_s)
            _reap_child(proc, grace_s + 5.0)
        return ProcResult(
            returncode=proc.returncode, stdout="", stderr=str(e), timed_out=False
        )
    finally:
        # communicate() closes the pipes only when it completes; on every other
        # path close them here so the file descriptors are not leaked.
        _close_pipes(proc)