Eliminate the false `deploy-staging -> development` rollback that fired when the merge-gate local re-test timed out (infra/resource) on a green CI + tester + staging branch (incident ORCH-109/PR #129: a 516.7s suite blew its 600s budget under CPU starvation from orphaned pytest processes -> timeout misrouted as a code fault -> developer-retry loop -> manual gate). Additive, 5 independent kill-switches, never-raise, self-hosting scope. Untouched byte-for-byte: STAGE_TRANSITIONS, the QG_CHECKS registry, check_branch_mergeable name/semantics, machine-verdict keys, the DB schema. INV-4 (never push/force-push main) and the no-prod-restart rule are preserved.
- D1: new stdlib-only leaf src/proc_group.py runs the spawned re-test/coverage pytest in its own process group (start_new_session) and tree-kills the WHOLE group on timeout (os.killpg SIGTERM->grace->SIGKILL); used by merge_gate.retest_branch and coverage_gate.measure_coverage. No orphan leak. Fallback never-break: subprocess_tree_kill_enabled=False / non-POSIX -> the prior subprocess.run.
- D2/D3: merge_gate.classify_retest_failure distinguishes timeout/red/lock-busy/other; an infra timeout routes to _handle_merge_gate_infra_retry (bounded re-queue, task stays on deploy-staging, no rollback / no developer-retry); a red re-test / conflict still rolls back (BR-6). Exhaustion -> one infra alert.
- D4: skip the local re-test when the pre-merge rebase was a proven no-op (HEAD already CI/tester/staging-validated); fail-safe runs the re-test on any uncertainty. Flag merge_retest_skip_when_current_enabled.
- D5: merge_retest_timeout_s 600 -> 900 + _resolve_retest_timeout validation; reaper_max_running_s invariant preserved without change.
- D6: in-process counters + read-only merge_gate block in GET /queue; appended ("ORCH-110","classify_retest_failure","src/merge_gate.py") to MAIN_REGRESSION_MARKERS.
Docs (README/internals overview/CLAUDE/CHANGELOG/.env.example) updated in the same PR.
Tests: tests/test_orch110_*.py (TC-01..TC-12, incl. the red-before/green-after incident regression). Full suite green (1988 passed).
Refs: ORCH-110
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
186 lines
8.3 KiB
Python
186 lines
8.3 KiB
Python
"""ORCH-110 TC-01 / TC-02: process-group tree-kill of orchestrator-spawned pytest.
|
|
|
|
Covers D1 (FR-2 / BR-3 / AC-4): a timeout in ``merge_gate.retest_branch`` /
|
|
``coverage_gate.measure_coverage`` must tree-kill the WHOLE subprocess subtree
|
|
(children + grandchildren) — the orphan-leak root cause of the ORCH-109 incident —
|
|
not just the direct child.
|
|
|
|
Fully deterministic and offline:
|
|
* TC-01 runs a REAL throwaway process that spawns a long-sleeping grandchild via
|
|
``sys.executable`` (always present), then asserts the grandchild is dead after a
|
|
``run_in_process_group`` timeout — the concrete "no orphan survives" proof.
|
|
* TC-01b / TC-02 assert the two call sites delegate to ``run_in_process_group``
|
|
with the right tree-kill kwargs and map a timed-out ProcResult to their existing
|
|
contract (``(False, "re-test timeout after <T>s")`` / ``None``).
|
|
No network, no Plane/Gitea, no Claude CLI.
|
|
"""
|
|
import os
|
|
import tempfile
|
|
import time
|
|
|
|
# Seed the ORCH_* variables with throwaway test values unless the caller has
# already exported them (``setdefault`` never overwrites an existing value).
# NOTE(review): these sit before the ``src`` imports, presumably because the
# project settings read the environment at import time — confirm.
for _env_name, _env_default in (
    ("ORCH_DB_PATH", os.path.join(tempfile.gettempdir(), "test_orch110_lifecycle.db")),
    ("ORCH_REPOS_DIR", tempfile.gettempdir()),
    ("ORCH_GITEA_TOKEN", "test-token"),
    ("ORCH_PLANE_API_TOKEN", "test-token"),
):
    os.environ.setdefault(_env_name, _env_default)
del _env_name, _env_default  # keep the module namespace as the original left it
|
|
|
|
import sys # noqa: E402
|
|
|
|
import pytest # noqa: E402
|
|
|
|
from src import coverage_gate as cg # noqa: E402
|
|
from src import merge_gate # noqa: E402
|
|
from src import proc_group # noqa: E402
|
|
from src.proc_group import ProcResult, run_in_process_group # noqa: E402
|
|
|
|
# True only when this platform exposes both process-group primitives that the
# skipif markers below use to gate the POSIX-only tests.
_POSIX = all(hasattr(os, primitive) for primitive in ("killpg", "setsid"))
|
|
|
|
|
|
def _alive(pid: int) -> bool:
    """Return True iff ``pid`` names a process that still exists (is not yet reaped).

    The probe is ``os.kill(pid, 0)``: signal 0 delivers nothing and only runs
    the existence / permission check.  A zombie (exited but not yet reaped by
    its parent) still exists at that level, so it counts as alive here.

    Args:
        pid: A concrete, positive process id.

    Returns:
        ``True`` when the process exists (including when it exists but belongs
        to another user), ``False`` when no such process exists or when ``pid``
        is not a positive process id.
    """
    if pid <= 0:
        # kill(2) interprets 0 and negative values as process-GROUP / broadcast
        # selectors, not as a single process; probing them would report "alive"
        # for something that is not a pid at all.
        return False
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process exists; we simply are not allowed to signal it.
        return True
    return True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# TC-01: real grandchild is tree-killed on timeout (the FR-2 / AC-4 proof).
# ---------------------------------------------------------------------------
|
|
@pytest.mark.skipif(not _POSIX, reason="process-group tree-kill is POSIX-only")
def test_tc01_grandchild_orphan_is_tree_killed_on_timeout(tmp_path):
    """TC-01: a timed-out ``run_in_process_group`` call leaves no grandchild behind.

    A throwaway "leader" script spawns a long-sleeping grandchild, records its
    pid in a file and then sleeps far past the timeout.  After the runner
    reports ``timed_out`` the recorded grandchild pid must stop existing within
    a short window.

    Args:
        tmp_path: pytest fixture; holds the spawner script and the pid file.
    """
    import signal  # local import: only the cleanup path below needs it

    pidfile = tmp_path / "grandchild.pid"
    script = tmp_path / "spawner.py"
    # The leader spawns a long-sleeping grandchild, records its pid, then sleeps far
    # past the timeout. A naive subprocess.run(timeout=) would kill ONLY the leader
    # and leak the grandchild (the ORCH-109 bug). The process-group tree-kill must
    # reap the grandchild too.
    script.write_text(
        "import subprocess, sys, time\n"
        "gc = subprocess.Popen([sys.executable, '-c', 'import time; time.sleep(120)'])\n"
        "open(sys.argv[1], 'w').write(str(gc.pid))\n"
        "time.sleep(120)\n",
        encoding="utf-8",
    )

    gc_pid = None
    try:
        res = run_in_process_group(
            [sys.executable, str(script), str(pidfile)],
            cwd=str(tmp_path), timeout=2, grace_s=1, tree_kill=True,
        )
        assert res.timed_out is True

        # Wait for the leader to actually have written the grandchild pid.
        # Monotonic clock: a wall-clock jump must not stretch or cut the wait.
        deadline = time.monotonic() + 5
        while time.monotonic() < deadline and not pidfile.exists():
            time.sleep(0.05)
        assert pidfile.exists(), "spawner never recorded the grandchild pid"
        gc_pid = int(pidfile.read_text().strip())

        # The grandchild must die (be reaped) within a short window after the tree-kill.
        deadline = time.monotonic() + 8
        while time.monotonic() < deadline and _alive(gc_pid):
            time.sleep(0.1)
        assert not _alive(gc_pid), "grandchild orphan survived the timeout tree-kill (FR-2/AC-4)"
    finally:
        # If the assertion above failed, the grandchild would otherwise keep
        # sleeping for up to 120s — this test must not leak the very orphan it
        # is checking for.  Best-effort: it may vanish between check and kill.
        if gc_pid is not None and _alive(gc_pid):
            try:
                os.kill(gc_pid, signal.SIGKILL)
            except (ProcessLookupError, PermissionError):
                pass
|
|
|
|
|
|
@pytest.mark.skipif(not _POSIX, reason="process-group runner is POSIX-only")
def test_tc01_green_and_red_returncodes_passthrough(tmp_path):
    """A child that exits on its own has its exit status reported unchanged.

    Runs one child that exits 0 ("green") and one that exits 3 ("red"), each
    with a generous timeout, and checks that neither is flagged as timed out.
    """
    workdir = str(tmp_path)
    for exit_code in (0, 3):
        outcome = run_in_process_group(
            [sys.executable, "-c", f"import sys; sys.exit({exit_code})"],
            cwd=workdir,
            timeout=30,
            tree_kill=True,
        )
        assert outcome.timed_out is False
        assert outcome.returncode == exit_code
|
|
|
|
|
|
def test_tc01_fallback_when_tree_kill_disabled(tmp_path, monkeypatch):
    """With ``tree_kill=False`` the runner still reports a timeout.

    This is the never-break fallback (plain ``subprocess.run``): a 30s sleeper
    given a 1s budget must come back flagged ``timed_out``.
    """
    sleeper_cmd = [sys.executable, "-c", "import time; time.sleep(30)"]
    outcome = run_in_process_group(
        sleeper_cmd,
        cwd=str(tmp_path),
        timeout=1,
        tree_kill=False,
    )
    assert outcome.timed_out is True
|
|
|
|
|
|
def test_tc01_fallback_on_non_posix(tmp_path, monkeypatch):
    """When tree-kill is reported unsupported, a normal run still succeeds.

    ``proc_group._tree_kill_supported`` is patched to return False, simulating
    a platform without os.killpg/setsid; a child that exits 0 must come back
    with that exit code and no timeout flag.
    """
    monkeypatch.setattr(proc_group, "_tree_kill_supported", lambda: False)
    quick_exit_cmd = [sys.executable, "-c", "import sys; sys.exit(0)"]
    outcome = run_in_process_group(quick_exit_cmd, cwd=str(tmp_path), timeout=30)
    assert outcome.timed_out is False
    assert outcome.returncode == 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# TC-01b: retest_branch delegates to the process-group runner with the right
# kwargs and maps a timeout to its existing contract.
# ---------------------------------------------------------------------------
|
|
def test_tc01b_retest_branch_uses_process_group_and_maps_timeout(tmp_path, monkeypatch):
    """``retest_branch`` calls the process-group runner and maps its timeout.

    The runner is replaced by a recording stub that always reports a timeout.
    The call must return ``(False, "re-test timeout after 900s")`` and must have
    passed tree-kill on, the configured grace period, and a pytest command.
    """
    worktree = tmp_path / "wt"
    worktree.mkdir()
    monkeypatch.setattr(merge_gate, "get_worktree_path", lambda repo, branch: str(worktree))
    for option, value in (
        ("merge_retest_timeout_s", 900),
        ("subprocess_tree_kill_enabled", True),
        ("agent_kill_grace_seconds", 7),
    ):
        monkeypatch.setattr(merge_gate.settings, option, value, raising=False)

    seen = {}

    def _recording_runner(cmd, *, cwd, timeout, env=None, grace_s=5.0, tree_kill=True):
        # Remember what the call site asked for, then pretend the run timed out.
        seen["cmd"] = cmd
        seen["cwd"] = cwd
        seen["timeout"] = timeout
        seen["grace_s"] = grace_s
        seen["tree_kill"] = tree_kill
        return ProcResult(returncode=None, stdout="", stderr="", timed_out=True)

    monkeypatch.setattr(merge_gate, "run_in_process_group", _recording_runner)

    ok, reason = merge_gate.retest_branch("orchestrator", "feature/x")
    assert ok is False
    assert reason == "re-test timeout after 900s"

    # FR-2: spawned via the process-group runner with tree-kill + the agent grace.
    assert seen["tree_kill"] is True
    assert seen["grace_s"] == 7
    assert seen["cmd"][:3] == ["python", "-m", "pytest"]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# TC-02: coverage_gate.measure_coverage — sibling orphan source (BR-3). Timeout
# tree-kills + returns None; uses the same runner with tree-kill from settings.
# ---------------------------------------------------------------------------
|
|
def test_tc02_measure_coverage_timeout_tree_kill_returns_none(tmp_path, monkeypatch):
    """``measure_coverage`` returns None on a runner timeout and forwards settings.

    The runner is replaced by a recording stub that always reports a timeout;
    the call site must have passed tree-kill on, grace 5 and timeout 900, as
    configured on ``cg.settings`` below.
    """
    monkeypatch.setattr(cg, "ensure_worktree", lambda r, b: str(tmp_path))
    for option, value in (
        ("subprocess_tree_kill_enabled", True),
        ("agent_kill_grace_seconds", 5),
        ("coverage_run_timeout_s", 900),
    ):
        monkeypatch.setattr(cg.settings, option, value, raising=False)

    seen = {}

    def _recording_runner(cmd, *, cwd, timeout, env=None, grace_s=5.0, tree_kill=True):
        # Record only the kwargs this test checks, then simulate a timeout.
        seen["tree_kill"] = tree_kill
        seen["grace_s"] = grace_s
        seen["timeout"] = timeout
        return ProcResult(returncode=None, stdout="", stderr="", timed_out=True)

    monkeypatch.setattr(cg, "run_in_process_group", _recording_runner)

    coverage = cg.measure_coverage("orchestrator", "feature/x")
    assert coverage is None
    assert seen["tree_kill"] is True
    assert seen["grace_s"] == 5
    assert seen["timeout"] == 900
|
|
|
|
|
|
def test_tc02_measure_coverage_respects_tree_kill_killswitch(tmp_path, monkeypatch):
    """Kill-switch off: ``measure_coverage`` asks the runner for ``tree_kill=False``.

    ``subprocess_tree_kill_enabled`` is forced to False and the runner is
    replaced by a stub that records the ``tree_kill`` kwarg and reports a
    timeout, so the call still returns None.
    """
    monkeypatch.setattr(cg, "ensure_worktree", lambda r, b: str(tmp_path))
    monkeypatch.setattr(cg.settings, "subprocess_tree_kill_enabled", False, raising=False)

    seen = {}

    def _recording_runner(cmd, *, cwd, timeout, env=None, grace_s=5.0, tree_kill=True):
        # Only the tree-kill flag matters for this test.
        seen["tree_kill"] = tree_kill
        return ProcResult(returncode=None, stdout="", stderr="", timed_out=True)

    monkeypatch.setattr(cg, "run_in_process_group", _recording_runner)

    coverage = cg.measure_coverage("orchestrator", "feature/x")
    assert coverage is None
    assert seen["tree_kill"] is False
|