Files
orchestrator/src/proc_group.py
claude-bot 651b9af7c3 fix(merge-gate): tolerate re-test infra-timeout + tree-kill spawned pytest
Eliminate the false `deploy-staging -> development` rollback that fired when the
merge-gate local re-test timed out (infra/resource) on a green CI + tester +
staging branch (incident ORCH-109/PR #129: a 516.7s suite blew its 600s budget
under CPU starvation from orphaned pytest processes -> timeout misrouted as a
code fault -> developer-retry loop -> manual gate).

Additive, 5 independent kill-switches, never-raise, self-hosting scope. Untouched
byte-for-byte: STAGE_TRANSITIONS, the QG_CHECKS registry, check_branch_mergeable
name/semantics, machine-verdict keys, the DB schema. INV-4 (never push/force-push
main) and the no-prod-restart rule are preserved.

- D1: new stdlib-only leaf src/proc_group.py runs the spawned re-test/coverage
  pytest in its own process group (start_new_session) and tree-kills the WHOLE
  group on timeout (os.killpg SIGTERM->grace->SIGKILL); used by
  merge_gate.retest_branch and coverage_gate.measure_coverage. No orphan leak.
  Fallback never-break: subprocess_tree_kill_enabled=False / non-POSIX -> the
  prior subprocess.run.
- D2/D3: merge_gate.classify_retest_failure distinguishes timeout/red/lock-busy/
  other; an infra timeout routes to _handle_merge_gate_infra_retry (bounded
  re-queue, task stays on deploy-staging, no rollback / no developer-retry); a
  red re-test / conflict still rolls back (BR-6). Exhaustion -> one infra alert.
- D4: skip the local re-test when the pre-merge rebase was a proven no-op (HEAD
  already CI/tester/staging-validated); fail-safe runs the re-test on any
  uncertainty. Flag merge_retest_skip_when_current_enabled.
- D5: merge_retest_timeout_s 600 -> 900 + _resolve_retest_timeout validation;
  reaper_max_running_s invariant preserved without change.
- D6: in-process counters + read-only merge_gate block in GET /queue; appended
  ("ORCH-110","classify_retest_failure","src/merge_gate.py") to
  MAIN_REGRESSION_MARKERS. Docs (README/internals overview/CLAUDE/CHANGELOG/
  .env.example) updated in the same PR.

Tests: tests/test_orch110_*.py (TC-01..TC-12, incl. the red-before/green-after
incident regression). Full suite green (1988 passed).

Refs: ORCH-110

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 10:42:34 +03:00

180 lines
7.1 KiB
Python

"""Process-group runner with tree-kill on timeout (ORCH-110 D1).
Background
----------
The orchestrator spawns its OWN pytest subprocesses OUTSIDE the LLM-agent path:
``merge_gate.retest_branch`` (the merge-gate re-test) and
``coverage_gate.measure_coverage`` (the coverage measurement). Both used a plain
``subprocess.run(..., timeout=...)``. On ``TimeoutExpired`` CPython kills only the
DIRECT child (``proc.kill()``); any grandchildren (pytest-xdist workers, tests that
spawn their own processes) are re-parented to PID 1 and keep running — for days,
starving the host CPU. That orphan leak is a ROOT cause of the ORCH-109 / PR #129
incident (a 516s suite blew its 600s budget under CPU starvation -> a false
merge-gate rollback after a green CI).
This leaf gives those orchestrator-spawned runs the SAME SIGTERM->grace->SIGKILL
cascade the LLM agents already get from ``launcher.stop_process`` — but at the
process-GROUP level so the WHOLE subtree dies on a timeout:
* the child becomes the leader of a NEW session / process group
(``start_new_session=True`` -> ``setsid``), so every descendant shares
``pgid == child.pid``;
* on timeout the whole group is killed with ``os.killpg`` (SIGTERM, grace poll,
then SIGKILL), then reaped (no zombies).
Invariants:
* **stdlib-only leaf** — imports only ``os`` / ``signal`` / ``subprocess`` /
``time`` / ``logging`` / ``dataclasses``; NEVER another ``src/*`` module
(mirrors the purity of ``serial_gate`` / ``staging_verdict``).
* **never-raise** — any OS error degrades to a safe ``ProcResult`` (the caller maps
it to its existing contract); an exception never escapes.
* **never-break fallback** — with ``tree_kill=False`` OR on a platform without
``os.killpg`` / ``os.setsid`` (non-POSIX) it degrades to the prior
``subprocess.run(cmd, ..., timeout=timeout)`` (byte-for-byte the old behaviour;
prod is Linux and never hits the fallback).
"""
import logging
import os
import signal
import subprocess
import time
from dataclasses import dataclass
logger = logging.getLogger("orchestrator.proc_group")
@dataclass
class ProcResult:
"""Outcome of a process-group run (ORCH-110 D1).
``returncode`` is the child's exit code, or ``None`` when the process could not
be reaped with a real code (timeout-kill / spawn error). ``timed_out`` is True
iff the command exceeded its wall-clock budget (and the whole group was killed).
"""
returncode: int | None
stdout: str
stderr: str
timed_out: bool
def _tree_kill_supported() -> bool:
"""POSIX process-group ops available? (non-POSIX -> fall back to subprocess.run)."""
return hasattr(os, "killpg") and hasattr(os, "getpgid") and hasattr(os, "setsid")
def _kill_group(pid: int, grace_s: float) -> None:
"""SIGTERM -> grace -> SIGKILL the whole process GROUP led by ``pid``.
Mirrors ``launcher.stop_process`` but targets the process GROUP (``os.killpg``)
so grandchildren die too. ``ProcessLookupError`` is tolerated at every step (the
group may already be gone). Never raises.
"""
try:
pgid = os.getpgid(pid)
except (ProcessLookupError, OSError):
return
# Phase 1: SIGTERM the whole group (graceful).
try:
os.killpg(pgid, signal.SIGTERM)
except ProcessLookupError:
return
except OSError as e: # noqa: BLE001 - never-raise
logger.warning("proc_group: SIGTERM killpg(%s) error: %s", pgid, e)
# Phase 2: poll for graceful group exit within the grace window.
poll = 0.2
waited = 0.0
while waited < max(0.0, grace_s):
time.sleep(poll)
waited += poll
try:
os.killpg(pgid, 0) # signal 0 = liveness probe of the group
except ProcessLookupError:
return # whole group gone
except OSError:
break
# Phase 3: still alive -> hard SIGKILL the group.
try:
os.killpg(pgid, signal.SIGKILL)
except ProcessLookupError:
return
except OSError as e: # noqa: BLE001 - never-raise
logger.warning("proc_group: SIGKILL killpg(%s) error: %s", pgid, e)
def _partial_text(raw) -> str:
    """Normalise output attached to ``TimeoutExpired`` (bytes, str or None) to str."""
    if raw is None:
        return ""
    if isinstance(raw, bytes):
        return raw.decode(errors="replace")
    return str(raw)


def _run_plain(cmd, *, cwd, timeout, env=None) -> ProcResult:
    """Fallback path: run ``cmd`` with a plain ``subprocess.run`` (never raises).

    Args:
        cmd: Argument vector passed to ``subprocess.run``.
        cwd: Working directory for the child.
        timeout: Wall-clock budget in seconds (``None`` = unlimited).
        env: Optional environment mapping for the child.

    Returns:
        A ``ProcResult``:
        * normal exit -> the child's return code and captured text;
        * timeout -> ``returncode=None``, ``timed_out=True`` and whatever output
          was captured before the timeout;
        * spawn/argument error -> ``returncode=None``, ``timed_out=False`` and
          the error message in ``stderr``.
    """
    try:
        done = subprocess.run(
            cmd,
            cwd=cwd,
            env=env,
            capture_output=True,
            text=True,
            # Undecodable bytes in the child's output must not raise
            # UnicodeDecodeError out of a never-raise helper.
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as e:
        # The exception carries the output read so far; it may be bytes even in
        # text mode, hence the normalisation.
        return ProcResult(
            returncode=None,
            stdout=_partial_text(e.stdout),
            stderr=_partial_text(e.stderr),
            timed_out=True,
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # ValueError covers invalid Popen arguments (e.g. an embedded NUL byte).
        logger.warning("proc_group(_run_plain): error for %s: %s", cmd, e)
        return ProcResult(returncode=None, stdout="", stderr=str(e), timed_out=False)
    return ProcResult(
        returncode=done.returncode,
        stdout=done.stdout or "",
        stderr=done.stderr or "",
        timed_out=False,
    )
def _reap_after_kill(proc, grace_s: float) -> tuple[str, str]:
    """Drain and reap a child whose group was just signalled; close its pipes.

    Returns the ``(stdout, stderr)`` text collected, or empty strings when the
    output could not be collected. Never raises.
    """
    wait_s = max(0.0, grace_s) + 5.0
    out, err = "", ""
    try:
        out, err = proc.communicate(timeout=wait_s)
    except (subprocess.TimeoutExpired, OSError, ValueError):
        # Pipes may still be held open or already closed: at least try to
        # collect the exit status so no zombie is left behind.
        try:
            proc.wait(timeout=wait_s)
        except (subprocess.TimeoutExpired, OSError, ValueError):
            pass
    finally:
        # Release our pipe ends explicitly so no file descriptor leaks when
        # communicate() did not finish.
        for stream in (proc.stdout, proc.stderr):
            if stream is None:
                continue
            try:
                stream.close()
            except (OSError, ValueError):
                pass
    return out or "", err or ""


def run_in_process_group(
    cmd, *, cwd, timeout, env=None, grace_s: float = 5.0, tree_kill: bool = True
) -> ProcResult:
    """Run ``cmd`` in its own process group; tree-kill the whole group on timeout.

    Args:
        cmd: Argument vector to execute.
        cwd: Working directory for the child.
        timeout: Wall-clock budget in seconds for the command.
        env: Optional environment mapping for the child.
        grace_s: Seconds between SIGTERM and SIGKILL when the group is killed.
        tree_kill: When False, or when the platform lacks process-group
            support, fall back to a plain ``subprocess.run`` via ``_run_plain``.

    Returns:
        A ``ProcResult``. ``timed_out=True`` iff the command exceeded
        ``timeout`` (the whole group is then killed and the child reaped).
        Spawn and communicate errors are reported with ``timed_out=False`` and
        the error message in ``stderr``. Never raises for OS/argument errors.
    """
    if not tree_kill or not _tree_kill_supported():
        return _run_plain(cmd, cwd=cwd, timeout=timeout, env=env)

    try:
        proc = subprocess.Popen(  # noqa: S603 - cmd is an internal, fixed argv
            cmd,
            cwd=cwd,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            # Undecodable bytes in the child's output must not turn a finished
            # run into a decode error that discards everything captured.
            errors="replace",
            start_new_session=True,  # setsid -> child leads a new process group
        )
    except (OSError, ValueError) as e:
        logger.warning("proc_group: spawn error for %s: %s", cmd, e)
        return ProcResult(returncode=None, stdout="", stderr=str(e), timed_out=False)

    try:
        out, err = proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        # Tree-kill the WHOLE group (children + grandchildren), then reap to
        # avoid leaving a zombie behind.
        _kill_group(proc.pid, grace_s)
        out, err = _reap_after_kill(proc, grace_s)
        return ProcResult(
            returncode=proc.returncode, stdout=out, stderr=err, timed_out=True,
        )
    except (OSError, ValueError) as e:  # noqa: BLE001 - never-raise
        logger.warning("proc_group: communicate error for %s: %s", cmd, e)
        # Remember the status as it was BEFORE we kill anything, so a
        # kill-induced signal status is never reported with timed_out=False.
        rc_before_kill = proc.returncode
        _kill_group(proc.pid, grace_s)
        _reap_after_kill(proc, grace_s)  # no zombie, no leaked pipe fds
        return ProcResult(
            returncode=rc_before_kill, stdout="", stderr=str(e), timed_out=False,
        )
    return ProcResult(
        returncode=proc.returncode, stdout=out or "", stderr=err or "",
        timed_out=False,
    )