fix(merge-gate): tolerate re-test infra-timeout + tree-kill spawned pytest
Eliminate the false `deploy-staging -> development` rollback that fired when the merge-gate local re-test timed out (infra/resource) on a green CI + tester + staging branch (incident ORCH-109/PR #129: a 516.7s suite blew its 600s budget under CPU starvation from orphaned pytest processes -> timeout misrouted as a code fault -> developer-retry loop -> manual gate).

Additive, 5 independent kill-switches, never-raise, self-hosting scope. Untouched byte-for-byte: STAGE_TRANSITIONS, the QG_CHECKS registry, check_branch_mergeable name/semantics, machine-verdict keys, the DB schema. INV-4 (never push/force-push main) and the no-prod-restart rule are preserved.

- D1: new stdlib-only leaf src/proc_group.py runs the spawned re-test/coverage pytest in its own process group (start_new_session) and tree-kills the WHOLE group on timeout (os.killpg SIGTERM->grace->SIGKILL); used by merge_gate.retest_branch and coverage_gate.measure_coverage. No orphan leak. Fallback never-break: subprocess_tree_kill_enabled=False / non-POSIX -> the prior subprocess.run.
- D2/D3: merge_gate.classify_retest_failure distinguishes timeout/red/lock-busy/other; an infra timeout routes to _handle_merge_gate_infra_retry (bounded re-queue, task stays on deploy-staging, no rollback / no developer-retry); a red re-test / conflict still rolls back (BR-6). Exhaustion -> one infra alert.
- D4: skip the local re-test when the pre-merge rebase was a proven no-op (HEAD already CI/tester/staging-validated); fail-safe runs the re-test on any uncertainty. Flag merge_retest_skip_when_current_enabled.
- D5: merge_retest_timeout_s 600 -> 900 + _resolve_retest_timeout validation; reaper_max_running_s invariant preserved without change.
- D6: in-process counters + read-only merge_gate block in GET /queue; appended ("ORCH-110","classify_retest_failure","src/merge_gate.py") to MAIN_REGRESSION_MARKERS.

Docs (README/internals overview/CLAUDE/CHANGELOG/.env.example) updated in the same PR.

Tests: tests/test_orch110_*.py (TC-01..TC-12, incl. the red-before/green-after incident regression). Full suite green (1988 passed).

Refs: ORCH-110
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
179
src/proc_group.py
Normal file
179
src/proc_group.py
Normal file
@@ -0,0 +1,179 @@
|
||||
"""Process-group runner with tree-kill on timeout (ORCH-110 D1).
|
||||
|
||||
Background
|
||||
----------
|
||||
The orchestrator spawns its OWN pytest subprocesses OUTSIDE the LLM-agent path:
|
||||
``merge_gate.retest_branch`` (the merge-gate re-test) and
|
||||
``coverage_gate.measure_coverage`` (the coverage measurement). Both used a plain
|
||||
``subprocess.run(..., timeout=...)``. On ``TimeoutExpired`` CPython kills only the
|
||||
DIRECT child (``proc.kill()``); any grandchildren (pytest-xdist workers, tests that
|
||||
spawn their own processes) are re-parented to PID 1 and keep running — for days,
|
||||
starving the host CPU. That orphan leak is a ROOT cause of the ORCH-109 / PR #129
|
||||
incident (a 516s suite blew its 600s budget under CPU starvation -> a false
|
||||
merge-gate rollback after a green CI).
|
||||
|
||||
This leaf gives those orchestrator-spawned runs the SAME SIGTERM->grace->SIGKILL
|
||||
cascade the LLM agents already get from ``launcher.stop_process`` — but at the
|
||||
process-GROUP level so the WHOLE subtree dies on a timeout:
|
||||
|
||||
* the child becomes the leader of a NEW session / process group
|
||||
(``start_new_session=True`` -> ``setsid``), so every descendant shares
|
||||
``pgid == child.pid``;
|
||||
* on timeout the whole group is killed with ``os.killpg`` (SIGTERM, grace poll,
|
||||
then SIGKILL), then reaped (no zombies).
|
||||
|
||||
Invariants:
|
||||
* **stdlib-only leaf** — imports only ``os`` / ``signal`` / ``subprocess`` /
|
||||
``time`` / ``logging`` / ``dataclasses``; NEVER another ``src/*`` module
|
||||
(mirrors the purity of ``serial_gate`` / ``staging_verdict``).
|
||||
* **never-raise** — any OS error degrades to a safe ``ProcResult`` (the caller maps
|
||||
it to its existing contract); an exception never escapes.
|
||||
* **never-break fallback** — with ``tree_kill=False`` OR on a platform without
|
||||
``os.killpg`` / ``os.setsid`` (non-POSIX) it degrades to the prior
|
||||
``subprocess.run(cmd, ..., timeout=timeout)`` (byte-for-byte the old behaviour;
|
||||
prod is Linux and never hits the fallback).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger("orchestrator.proc_group")
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProcResult:
|
||||
"""Outcome of a process-group run (ORCH-110 D1).
|
||||
|
||||
``returncode`` is the child's exit code, or ``None`` when the process could not
|
||||
be reaped with a real code (timeout-kill / spawn error). ``timed_out`` is True
|
||||
iff the command exceeded its wall-clock budget (and the whole group was killed).
|
||||
"""
|
||||
|
||||
returncode: int | None
|
||||
stdout: str
|
||||
stderr: str
|
||||
timed_out: bool
|
||||
|
||||
|
||||
def _tree_kill_supported() -> bool:
|
||||
"""POSIX process-group ops available? (non-POSIX -> fall back to subprocess.run)."""
|
||||
return hasattr(os, "killpg") and hasattr(os, "getpgid") and hasattr(os, "setsid")
|
||||
|
||||
|
||||
def _kill_group(pid: int, grace_s: float) -> None:
|
||||
"""SIGTERM -> grace -> SIGKILL the whole process GROUP led by ``pid``.
|
||||
|
||||
Mirrors ``launcher.stop_process`` but targets the process GROUP (``os.killpg``)
|
||||
so grandchildren die too. ``ProcessLookupError`` is tolerated at every step (the
|
||||
group may already be gone). Never raises.
|
||||
"""
|
||||
try:
|
||||
pgid = os.getpgid(pid)
|
||||
except (ProcessLookupError, OSError):
|
||||
return
|
||||
|
||||
# Phase 1: SIGTERM the whole group (graceful).
|
||||
try:
|
||||
os.killpg(pgid, signal.SIGTERM)
|
||||
except ProcessLookupError:
|
||||
return
|
||||
except OSError as e: # noqa: BLE001 - never-raise
|
||||
logger.warning("proc_group: SIGTERM killpg(%s) error: %s", pgid, e)
|
||||
|
||||
# Phase 2: poll for graceful group exit within the grace window.
|
||||
poll = 0.2
|
||||
waited = 0.0
|
||||
while waited < max(0.0, grace_s):
|
||||
time.sleep(poll)
|
||||
waited += poll
|
||||
try:
|
||||
os.killpg(pgid, 0) # signal 0 = liveness probe of the group
|
||||
except ProcessLookupError:
|
||||
return # whole group gone
|
||||
except OSError:
|
||||
break
|
||||
|
||||
# Phase 3: still alive -> hard SIGKILL the group.
|
||||
try:
|
||||
os.killpg(pgid, signal.SIGKILL)
|
||||
except ProcessLookupError:
|
||||
return
|
||||
except OSError as e: # noqa: BLE001 - never-raise
|
||||
logger.warning("proc_group: SIGKILL killpg(%s) error: %s", pgid, e)
|
||||
|
||||
|
||||
def _run_plain(cmd, *, cwd, timeout, env=None) -> ProcResult:
    """Run ``cmd`` with a plain ``subprocess.run`` (the never-break fallback path).

    Output is captured as text. The outcome is mapped onto a ``ProcResult``:

    * normal completion -> the child's return code and captured streams;
    * ``subprocess.TimeoutExpired`` -> ``timed_out=True`` with empty streams and
      ``returncode=None`` (any partial output is not forwarded);
    * any other ``subprocess.SubprocessError`` or ``OSError`` -> logged as a
      warning and reported with ``returncode=None`` and the error text in
      ``stderr``.

    Only those exception types are handled here; anything else propagates.
    """
    try:
        completed = subprocess.run(
            cmd, cwd=cwd, env=env, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        # Must be caught before SubprocessError: TimeoutExpired is a subclass of it.
        return ProcResult(returncode=None, stdout="", stderr="", timed_out=True)
    except (subprocess.SubprocessError, OSError) as exc:
        logger.warning("proc_group(_run_plain): error for %s: %s", cmd, exc)
        return ProcResult(returncode=None, stdout="", stderr=str(exc), timed_out=False)

    return ProcResult(
        returncode=completed.returncode,
        stdout=completed.stdout or "",
        stderr=completed.stderr or "",
        timed_out=False,
    )
|
||||
|
||||
|
||||
def run_in_process_group(
    cmd, *, cwd, timeout, env=None, grace_s: float = 5.0, tree_kill: bool = True
) -> ProcResult:
    """Run ``cmd`` as the leader of a new session and kill its whole group on timeout.

    Parameters:

    * ``cmd`` / ``cwd`` / ``env`` — forwarded unchanged to ``subprocess.Popen``
      (or to ``_run_plain`` on the fallback path).
    * ``timeout`` — wall-clock budget, in seconds, for the first ``communicate``.
    * ``grace_s`` — grace window handed to ``_kill_group``; the follow-up reap is
      allowed ``grace_s + 5.0`` seconds.
    * ``tree_kill`` — ``False`` forces the plain ``subprocess.run`` fallback.

    Returns a ``ProcResult``:

    * finished in time -> the child's return code and captured text streams;
    * timed out -> ``timed_out=True``, the group is killed and the child reaped
      best-effort; streams hold whatever the second ``communicate`` returned;
    * spawn or communicate failure (``OSError`` / ``ValueError``) -> logged as a
      warning and reported with the error text in ``stderr``.

    The fallback is also taken when ``_tree_kill_supported()`` is false.
    """
    use_group = tree_kill and _tree_kill_supported()
    if not use_group:
        return _run_plain(cmd, cwd=cwd, timeout=timeout, env=env)

    try:
        child = subprocess.Popen(  # noqa: S603 - cmd is an internal, fixed argv
            cmd,
            cwd=cwd,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            start_new_session=True,  # setsid -> child leads a new process group
        )
    except (OSError, ValueError) as exc:
        logger.warning("proc_group: spawn error for %s: %s", cmd, exc)
        return ProcResult(returncode=None, stdout="", stderr=str(exc), timed_out=False)

    try:
        captured_out, captured_err = child.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        # Budget exceeded: take down the entire group, then collect the child so
        # it does not linger as a zombie.
        _kill_group(child.pid, grace_s)
        reap_budget = grace_s + 5.0
        captured_out, captured_err = "", ""
        try:
            captured_out, captured_err = child.communicate(timeout=reap_budget)
        except (subprocess.TimeoutExpired, OSError, ValueError):
            # Draining the pipes failed; still try to pick up the exit status.
            try:
                child.wait(timeout=reap_budget)
            except (subprocess.TimeoutExpired, OSError, ValueError):
                pass  # best effort only — report whatever returncode we have
        return ProcResult(
            returncode=child.returncode,
            stdout=captured_out or "",
            stderr=captured_err or "",
            timed_out=True,
        )
    except (OSError, ValueError) as exc:  # noqa: BLE001 - never-raise
        logger.warning("proc_group: communicate error for %s: %s", cmd, exc)
        _kill_group(child.pid, grace_s)
        return ProcResult(
            returncode=child.returncode, stdout="", stderr=str(exc), timed_out=False
        )

    return ProcResult(
        returncode=child.returncode,
        stdout=captured_out or "",
        stderr=captured_err or "",
        timed_out=False,
    )
|
||||
Reference in New Issue
Block a user