Eliminate the false `deploy-staging -> development` rollback that fired when the merge-gate local re-test timed out (infra/resource) on a green CI + tester + staging branch (incident ORCH-109/PR #129: a 516.7s suite blew its 600s budget under CPU starvation from orphaned pytest processes -> timeout misrouted as a code fault -> developer-retry loop -> manual gate). Additive, 5 independent kill-switches, never-raise, self-hosting scope. Untouched byte-for-byte: STAGE_TRANSITIONS, the QG_CHECKS registry, check_branch_mergeable name/semantics, machine-verdict keys, the DB schema. INV-4 (never push/force-push main) and the no-prod-restart rule are preserved.
- D1: new stdlib-only leaf src/proc_group.py runs the spawned re-test/coverage pytest in its own process group (start_new_session) and tree-kills the WHOLE group on timeout (os.killpg SIGTERM->grace->SIGKILL); used by merge_gate.retest_branch and coverage_gate.measure_coverage. No orphan leak. Fallback never-break: subprocess_tree_kill_enabled=False / non-POSIX -> the prior subprocess.run.
- D2/D3: merge_gate.classify_retest_failure distinguishes timeout/red/lock-busy/other; an infra timeout routes to _handle_merge_gate_infra_retry (bounded re-queue, task stays on deploy-staging, no rollback / no developer-retry); a red re-test / conflict still rolls back (BR-6). Exhaustion -> one infra alert.
- D4: skip the local re-test when the pre-merge rebase was a proven no-op (HEAD already CI/tester/staging-validated); fail-safe runs the re-test on any uncertainty. Flag merge_retest_skip_when_current_enabled.
- D5: merge_retest_timeout_s 600 -> 900 + _resolve_retest_timeout validation; reaper_max_running_s invariant preserved without change.
- D6: in-process counters + read-only merge_gate block in GET /queue; appended ("ORCH-110","classify_retest_failure","src/merge_gate.py") to MAIN_REGRESSION_MARKERS.
Docs (README/internals overview/CLAUDE/CHANGELOG/.env.example) updated in the same PR.
Tests: tests/test_orch110_*.py (TC-01..TC-12, incl. the red-before/green-after incident regression). Full suite green (1988 passed).
Refs: ORCH-110
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
180 lines
7.1 KiB
Python
180 lines
7.1 KiB
Python
"""Process-group runner with tree-kill on timeout (ORCH-110 D1).
|
|
|
|
Background
|
|
----------
|
|
The orchestrator spawns its OWN pytest subprocesses OUTSIDE the LLM-agent path:
|
|
``merge_gate.retest_branch`` (the merge-gate re-test) and
|
|
``coverage_gate.measure_coverage`` (the coverage measurement). Both used a plain
|
|
``subprocess.run(..., timeout=...)``. On ``TimeoutExpired`` CPython kills only the
|
|
DIRECT child (``proc.kill()``); any grandchildren (pytest-xdist workers, tests that
|
|
spawn their own processes) are re-parented to PID 1 and keep running — for days,
|
|
starving the host CPU. That orphan leak is a ROOT cause of the ORCH-109 / PR #129
|
|
incident (a 516s suite blew its 600s budget under CPU starvation -> a false
|
|
merge-gate rollback after a green CI).
|
|
|
|
This leaf gives those orchestrator-spawned runs the SAME SIGTERM->grace->SIGKILL
|
|
cascade the LLM agents already get from ``launcher.stop_process`` — but at the
|
|
process-GROUP level so the WHOLE subtree dies on a timeout:
|
|
|
|
* the child becomes the leader of a NEW session / process group
|
|
(``start_new_session=True`` -> ``setsid``), so every descendant shares
|
|
``pgid == child.pid``;
|
|
* on timeout the whole group is killed with ``os.killpg`` (SIGTERM, grace poll,
|
|
then SIGKILL), then reaped (no zombies).
|
|
|
|
Invariants:
|
|
* **stdlib-only leaf** — imports only ``os`` / ``signal`` / ``subprocess`` /
|
|
``time`` / ``logging`` / ``dataclasses``; NEVER another ``src/*`` module
|
|
(mirrors the purity of ``serial_gate`` / ``staging_verdict``).
|
|
* **never-raise** — any OS error degrades to a safe ``ProcResult`` (the caller maps
|
|
it to its existing contract); an exception never escapes.
|
|
* **never-break fallback** — with ``tree_kill=False`` OR on a platform without
|
|
``os.killpg`` / ``os.setsid`` (non-POSIX) it degrades to the prior
|
|
``subprocess.run(cmd, ..., timeout=timeout)`` (byte-for-byte the old behaviour;
|
|
prod is Linux and never hits the fallback).
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import signal
|
|
import subprocess
|
|
import time
|
|
from dataclasses import dataclass
|
|
|
|
# Module-level logger, registered under the fixed name "orchestrator.proc_group".
logger = logging.getLogger("orchestrator.proc_group")
|
|
|
|
|
|
@dataclass
|
|
class ProcResult:
|
|
"""Outcome of a process-group run (ORCH-110 D1).
|
|
|
|
``returncode`` is the child's exit code, or ``None`` when the process could not
|
|
be reaped with a real code (timeout-kill / spawn error). ``timed_out`` is True
|
|
iff the command exceeded its wall-clock budget (and the whole group was killed).
|
|
"""
|
|
|
|
returncode: int | None
|
|
stdout: str
|
|
stderr: str
|
|
timed_out: bool
|
|
|
|
|
|
def _tree_kill_supported() -> bool:
    """Report whether the POSIX process-group primitives exist on this platform.

    Returns True only when ``os`` exposes all of ``killpg``, ``getpgid`` and
    ``setsid``; when any is missing the runner uses its plain
    ``subprocess.run`` fallback instead.
    """
    required = ("killpg", "getpgid", "setsid")
    return all(hasattr(os, attr) for attr in required)
|
|
|
|
|
|
def _kill_group(pid: int, grace_s: float) -> None:
    """SIGTERM -> grace -> SIGKILL the whole process GROUP that ``pid`` belongs to.

    The group id is resolved with ``os.getpgid(pid)`` and every signal is sent
    with ``os.killpg`` so that all members of the group (not only ``pid``)
    receive it. (The original author notes this mirrors
    ``launcher.stop_process`` at the group level.)

    Args:
        pid: a process whose group should be terminated.
        grace_s: how long (seconds, wall-clock) to wait after SIGTERM before
            escalating to SIGKILL; values <= 0 escalate immediately.

    Returns:
        None. ``ProcessLookupError`` is tolerated at every step (the group may
        already be gone) and other ``OSError``s are logged, not raised.

    NOTE(review): the signal-0 probe reports the group as alive while ANY
    member still exists. A leader that has exited but has not yet been reaped
    by its parent presumably still counts, in which case the early return is
    not taken until the grace window elapses — confirm on the target kernel.
    """
    try:
        pgid = os.getpgid(pid)
    except OSError:  # includes ProcessLookupError: nothing left to signal
        return

    # Phase 1: SIGTERM the whole group (graceful).
    try:
        os.killpg(pgid, signal.SIGTERM)
    except ProcessLookupError:
        return
    except OSError as e:  # noqa: BLE001 - never-raise
        logger.warning("proc_group: SIGTERM killpg(%s) error: %s", pgid, e)

    # Phase 2: poll for graceful group exit within the grace window.
    # The window is tracked against the monotonic clock rather than by summing
    # nominal sleep intervals: sleeps can overrun badly on a CPU-starved host,
    # and a grace shorter than one poll interval must not be overshot.
    poll_s = 0.2
    deadline = time.monotonic() + max(0.0, grace_s)
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(poll_s, remaining))
        try:
            os.killpg(pgid, 0)  # signal 0 = liveness probe of the group
        except ProcessLookupError:
            return  # whole group gone
        except OSError:
            break  # cannot probe; fall through to the hard kill

    # Phase 3: still alive (or unknown) -> hard SIGKILL the group.
    try:
        os.killpg(pgid, signal.SIGKILL)
    except ProcessLookupError:
        return
    except OSError as e:  # noqa: BLE001 - never-raise
        logger.warning("proc_group: SIGKILL killpg(%s) error: %s", pgid, e)
|
|
|
|
|
|
def _run_plain(cmd, *, cwd, timeout, env=None) -> ProcResult:
    """Fallback path: run ``cmd`` with plain ``subprocess.run`` semantics.

    Args:
        cmd: argv passed straight to ``subprocess.run``.
        cwd: working directory for the child.
        timeout: wall-clock budget in seconds, forwarded to ``subprocess.run``.
        env: optional environment mapping for the child.

    Returns:
        A ``ProcResult``. On timeout ``timed_out`` is True and no output or
        exit code is reported. On any other failure the exception text is
        placed in ``stderr`` with ``returncode=None``.

    Never raises for the error classes below. ``ValueError`` is included so an
    invalid argv (e.g. an embedded NUL byte) or an output decode error
    degrades to a result instead of escaping, matching the tree-kill path.
    """
    try:
        completed = subprocess.run(
            cmd, cwd=cwd, env=env, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return ProcResult(returncode=None, stdout="", stderr="", timed_out=True)
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        logger.warning("proc_group(_run_plain): error for %s: %s", cmd, e)
        return ProcResult(returncode=None, stdout="", stderr=str(e), timed_out=False)

    return ProcResult(
        returncode=completed.returncode,
        stdout=completed.stdout or "",
        stderr=completed.stderr or "",
        timed_out=False,
    )
|
|
|
|
|
|
def _reap_quietly(proc, wait_s: float) -> None:
    """Wait up to ``wait_s`` seconds for ``proc`` to be reaped; swallow failures."""
    try:
        proc.wait(timeout=wait_s)
    except (subprocess.TimeoutExpired, OSError, ValueError):
        pass


def run_in_process_group(
    cmd, *, cwd, timeout, env=None, grace_s: float = 5.0, tree_kill: bool = True
) -> ProcResult:
    """Run ``cmd`` in its own process group; tree-kill the whole group on timeout.

    Args:
        cmd: argv for the child process.
        cwd: working directory for the child.
        timeout: wall-clock budget in seconds for the command.
        env: optional environment mapping for the child.
        grace_s: seconds granted between SIGTERM and SIGKILL when the group is
            killed; also sizes the post-kill reap window (``grace_s + 5.0``).
        tree_kill: when False, use the plain ``subprocess.run`` fallback.

    Returns:
        A ``ProcResult``. ``timed_out=True`` iff the command exceeded
        ``timeout`` (the group was then signalled via ``_kill_group``). Spawn
        and communicate errors are reported through ``stderr`` with
        ``timed_out=False``.

    Fallback (never-break): ``tree_kill=False`` or a platform without the
    process-group primitives -> ``_run_plain`` (plain ``subprocess.run``).
    """
    if not tree_kill or not _tree_kill_supported():
        return _run_plain(cmd, cwd=cwd, timeout=timeout, env=env)

    try:
        proc = subprocess.Popen(  # noqa: S603 - cmd is an internal, fixed argv
            cmd,
            cwd=cwd,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            start_new_session=True,  # setsid -> child leads a new process group
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # SubprocessError is caught too, for parity with the plain fallback.
        logger.warning("proc_group: spawn error for %s: %s", cmd, e)
        return ProcResult(returncode=None, stdout="", stderr=str(e), timed_out=False)

    try:
        out, err = proc.communicate(timeout=timeout)
        return ProcResult(
            returncode=proc.returncode, stdout=out or "", stderr=err or "",
            timed_out=False,
        )
    except subprocess.TimeoutExpired:
        # Tree-kill the WHOLE group (children + grandchildren), then reap to
        # avoid leaving a zombie behind.
        _kill_group(proc.pid, grace_s)
        out, err = "", ""
        try:
            # Retrying communicate() after a timeout collects the output
            # gathered so far and reaps the child.
            out, err = proc.communicate(timeout=grace_s + 5.0)
        except (subprocess.TimeoutExpired, OSError, ValueError):
            _reap_quietly(proc, grace_s + 5.0)
        return ProcResult(
            returncode=proc.returncode, stdout=out or "", stderr=err or "",
            timed_out=True,
        )
    except (OSError, ValueError) as e:  # noqa: BLE001 - never-raise
        logger.warning("proc_group: communicate error for %s: %s", cmd, e)
        _kill_group(proc.pid, grace_s)
        # Reap here as well, so this path does not leave a zombie behind and
        # reports the collected exit status when there is one.
        _reap_quietly(proc, grace_s + 5.0)
        return ProcResult(returncode=proc.returncode, stdout="", stderr=str(e), timed_out=False)
|