"""Process-group runner with tree-kill on timeout (ORCH-110 D1). Background ---------- The orchestrator spawns its OWN pytest subprocesses OUTSIDE the LLM-agent path: ``merge_gate.retest_branch`` (the merge-gate re-test) and ``coverage_gate.measure_coverage`` (the coverage measurement). Both used a plain ``subprocess.run(..., timeout=...)``. On ``TimeoutExpired`` CPython kills only the DIRECT child (``proc.kill()``); any grandchildren (pytest-xdist workers, tests that spawn their own processes) are re-parented to PID 1 and keep running — for days, starving the host CPU. That orphan leak is a ROOT cause of the ORCH-109 / PR #129 incident (a 516s suite blew its 600s budget under CPU starvation -> a false merge-gate rollback after a green CI). This leaf gives those orchestrator-spawned runs the SAME SIGTERM->grace->SIGKILL cascade the LLM agents already get from ``launcher.stop_process`` — but at the process-GROUP level so the WHOLE subtree dies on a timeout: * the child becomes the leader of a NEW session / process group (``start_new_session=True`` -> ``setsid``), so every descendant shares ``pgid == child.pid``; * on timeout the whole group is killed with ``os.killpg`` (SIGTERM, grace poll, then SIGKILL), then reaped (no zombies). Invariants: * **stdlib-only leaf** — imports only ``os`` / ``signal`` / ``subprocess`` / ``time`` / ``logging`` / ``dataclasses``; NEVER another ``src/*`` module (mirrors the purity of ``serial_gate`` / ``staging_verdict``). * **never-raise** — any OS error degrades to a safe ``ProcResult`` (the caller maps it to its existing contract); an exception never escapes. * **never-break fallback** — with ``tree_kill=False`` OR on a platform without ``os.killpg`` / ``os.setsid`` (non-POSIX) it degrades to the prior ``subprocess.run(cmd, ..., timeout=timeout)`` (byte-for-byte the old behaviour; prod is Linux and never hits the fallback). """ import logging import os import signal import subprocess import time from dataclasses import dataclass logger = logging.getLogger("orchestrator.proc_group") @dataclass class ProcResult: """Outcome of a process-group run (ORCH-110 D1). ``returncode`` is the child's exit code, or ``None`` when the process could not be reaped with a real code (timeout-kill / spawn error). ``timed_out`` is True iff the command exceeded its wall-clock budget (and the whole group was killed). """ returncode: int | None stdout: str stderr: str timed_out: bool def _tree_kill_supported() -> bool: """POSIX process-group ops available? (non-POSIX -> fall back to subprocess.run).""" return hasattr(os, "killpg") and hasattr(os, "getpgid") and hasattr(os, "setsid") def _kill_group(pid: int, grace_s: float) -> None: """SIGTERM -> grace -> SIGKILL the whole process GROUP led by ``pid``. Mirrors ``launcher.stop_process`` but targets the process GROUP (``os.killpg``) so grandchildren die too. ``ProcessLookupError`` is tolerated at every step (the group may already be gone). Never raises. """ try: pgid = os.getpgid(pid) except (ProcessLookupError, OSError): return # Phase 1: SIGTERM the whole group (graceful). try: os.killpg(pgid, signal.SIGTERM) except ProcessLookupError: return except OSError as e: # noqa: BLE001 - never-raise logger.warning("proc_group: SIGTERM killpg(%s) error: %s", pgid, e) # Phase 2: poll for graceful group exit within the grace window. poll = 0.2 waited = 0.0 while waited < max(0.0, grace_s): time.sleep(poll) waited += poll try: os.killpg(pgid, 0) # signal 0 = liveness probe of the group except ProcessLookupError: return # whole group gone except OSError: break # Phase 3: still alive -> hard SIGKILL the group. try: os.killpg(pgid, signal.SIGKILL) except ProcessLookupError: return except OSError as e: # noqa: BLE001 - never-raise logger.warning("proc_group: SIGKILL killpg(%s) error: %s", pgid, e) def _run_plain(cmd, *, cwd, timeout, env=None) -> ProcResult: """Fallback path: prior ``subprocess.run`` semantics (never-break).""" try: r = subprocess.run( cmd, cwd=cwd, env=env, capture_output=True, text=True, timeout=timeout ) return ProcResult( returncode=r.returncode, stdout=r.stdout or "", stderr=r.stderr or "", timed_out=False, ) except subprocess.TimeoutExpired: return ProcResult(returncode=None, stdout="", stderr="", timed_out=True) except (subprocess.SubprocessError, OSError) as e: logger.warning("proc_group(_run_plain): error for %s: %s", cmd, e) return ProcResult(returncode=None, stdout="", stderr=str(e), timed_out=False) def run_in_process_group( cmd, *, cwd, timeout, env=None, grace_s: float = 5.0, tree_kill: bool = True ) -> ProcResult: """Run ``cmd`` in its own process group; tree-kill the whole group on timeout. See the module docstring. Returns a ``ProcResult``; ``timed_out=True`` iff the command exceeded ``timeout`` (and the whole group was killed). Never raises. Fallback (never-break): ``tree_kill=False`` or a non-POSIX platform -> a plain ``subprocess.run(cmd, ..., timeout=timeout)`` (the prior behaviour). """ if not tree_kill or not _tree_kill_supported(): return _run_plain(cmd, cwd=cwd, timeout=timeout, env=env) try: proc = subprocess.Popen( # noqa: S603 - cmd is an internal, fixed argv cmd, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, start_new_session=True, # setsid -> child leads a new process group ) except (OSError, ValueError) as e: logger.warning("proc_group: spawn error for %s: %s", cmd, e) return ProcResult(returncode=None, stdout="", stderr=str(e), timed_out=False) try: out, err = proc.communicate(timeout=timeout) return ProcResult( returncode=proc.returncode, stdout=out or "", stderr=err or "", timed_out=False, ) except subprocess.TimeoutExpired: # Tree-kill the WHOLE group (children + grandchildren), then reap to avoid # leaving a zombie behind. _kill_group(proc.pid, grace_s) out, err = "", "" try: out, err = proc.communicate(timeout=grace_s + 5.0) except (subprocess.TimeoutExpired, OSError, ValueError): try: proc.wait(timeout=grace_s + 5.0) except (subprocess.TimeoutExpired, OSError, ValueError): pass return ProcResult( returncode=proc.returncode, stdout=out or "", stderr=err or "", timed_out=True, ) except (OSError, ValueError) as e: # noqa: BLE001 - never-raise logger.warning("proc_group: communicate error for %s: %s", cmd, e) _kill_group(proc.pid, grace_s) return ProcResult(returncode=proc.returncode, stdout="", stderr=str(e), timed_out=False)