"""ORCH-110 TC-01 / TC-02: process-group tree-kill of orchestrator-spawned pytest. Covers D1 (FR-2 / BR-3 / AC-4): a timeout in ``merge_gate.retest_branch`` / ``coverage_gate.measure_coverage`` must tree-kill the WHOLE subprocess subtree (children + grandchildren) — the orphan-leak root cause of the ORCH-109 incident — not just the direct child. Fully deterministic and offline: * TC-01 runs a REAL throwaway process that spawns a long-sleeping grandchild via ``sys.executable`` (always present), then asserts the grandchild is dead after a ``run_in_process_group`` timeout — the concrete "no orphan survives" proof. * TC-01b / TC-02 assert the two call sites delegate to ``run_in_process_group`` with the right tree-kill kwargs and map a timed-out ProcResult to their existing contract (``(False, "re-test timeout after s")`` / ``None``). No network, no Plane/Gitea, no Claude CLI. """ import os import tempfile import time os.environ.setdefault("ORCH_DB_PATH", os.path.join(tempfile.gettempdir(), "test_orch110_lifecycle.db")) os.environ.setdefault("ORCH_REPOS_DIR", tempfile.gettempdir()) os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token") import sys # noqa: E402 import pytest # noqa: E402 from src import coverage_gate as cg # noqa: E402 from src import merge_gate # noqa: E402 from src import proc_group # noqa: E402 from src.proc_group import ProcResult, run_in_process_group # noqa: E402 _POSIX = hasattr(os, "killpg") and hasattr(os, "setsid") def _alive(pid: int) -> bool: """True iff ``pid`` is a live (non-reaped) process.""" try: os.kill(pid, 0) return True except ProcessLookupError: return False except PermissionError: return True # --------------------------------------------------------------------------- # TC-01: real grandchild is tree-killed on timeout (the FR-2 / AC-4 proof). 
# ---------------------------------------------------------------------------


@pytest.mark.skipif(not _POSIX, reason="process-group tree-kill is POSIX-only")
def test_tc01_grandchild_orphan_is_tree_killed_on_timeout(tmp_path):
    """A grandchild spawned by the timed-out leader must not survive the tree-kill."""
    pidfile = tmp_path / "grandchild.pid"
    script = tmp_path / "spawner.py"
    # The leader spawns a long-sleeping grandchild, records its pid, then sleeps far
    # past the timeout. A naive subprocess.run(timeout=) would kill ONLY the leader
    # and leak the grandchild (the ORCH-109 bug). The process-group tree-kill must
    # reap the grandchild too.
    script.write_text(
        "import subprocess, sys, time\n"
        "gc = subprocess.Popen([sys.executable, '-c', 'import time; time.sleep(120)'])\n"
        "open(sys.argv[1], 'w').write(str(gc.pid))\n"
        "time.sleep(120)\n",
        encoding="utf-8",
    )

    res = run_in_process_group(
        [sys.executable, str(script), str(pidfile)],
        cwd=str(tmp_path),
        timeout=2,
        grace_s=1,
        tree_kill=True,
    )
    assert res.timed_out is True

    # Wait for the leader to actually have written the grandchild pid. The deadline
    # uses the monotonic clock so a wall-clock step cannot shorten or extend it.
    deadline = time.monotonic() + 5
    while time.monotonic() < deadline and not pidfile.exists():
        time.sleep(0.05)
    assert pidfile.exists(), "spawner never recorded the grandchild pid"
    gc_pid = int(pidfile.read_text().strip())

    # The grandchild must die (be reaped) within a short window after the tree-kill.
    deadline = time.monotonic() + 8
    while time.monotonic() < deadline and _alive(gc_pid):
        time.sleep(0.1)
    assert not _alive(gc_pid), "grandchild orphan survived the timeout tree-kill (FR-2/AC-4)"


@pytest.mark.skipif(not _POSIX, reason="process-group runner is POSIX-only")
def test_tc01_green_and_red_returncodes_passthrough(tmp_path):
    """The runner returns the real exit code on a normal (non-timeout) completion."""
    ok = run_in_process_group(
        [sys.executable, "-c", "import sys; sys.exit(0)"],
        cwd=str(tmp_path),
        timeout=30,
        tree_kill=True,
    )
    assert ok.timed_out is False and ok.returncode == 0

    red = run_in_process_group(
        [sys.executable, "-c", "import sys; sys.exit(3)"],
        cwd=str(tmp_path),
        timeout=30,
        tree_kill=True,
    )
    assert red.timed_out is False and red.returncode == 3


def test_tc01_fallback_when_tree_kill_disabled(tmp_path, monkeypatch):
    """tree_kill=False -> degrades to subprocess.run (never-break); still times out."""
    # The child sleeps far longer than the 1 s timeout, so only a working timeout
    # path lets this call return with ``timed_out`` set.
    res = run_in_process_group(
        [sys.executable, "-c", "import time; time.sleep(30)"],
        cwd=str(tmp_path),
        timeout=1,
        tree_kill=False,
    )
    assert res.timed_out is True


def test_tc01_fallback_on_non_posix(tmp_path, monkeypatch):
    """No os.killpg/setsid (non-POSIX) -> the plain subprocess.run fallback runs."""
    # Simulate a platform without process-group support by forcing the runner's
    # capability probe to report False.
    monkeypatch.setattr(proc_group, "_tree_kill_supported", lambda: False)
    res = run_in_process_group(
        [sys.executable, "-c", "import sys; sys.exit(0)"],
        cwd=str(tmp_path),
        timeout=30,
    )
    assert res.timed_out is False and res.returncode == 0


# ---------------------------------------------------------------------------
# TC-01b: retest_branch delegates to the process-group runner with the right
# kwargs and maps a timeout to its existing contract.
# ---------------------------------------------------------------------------


def test_tc01b_retest_branch_uses_process_group_and_maps_timeout(tmp_path, monkeypatch):
    """``retest_branch`` calls the patched runner and reports a timeout as a failure.

    The runner is replaced by a recorder that always returns a timed-out
    ``ProcResult``; the test then checks the ``(ok, reason)`` pair and the kwargs
    the recorder received.
    """
    worktree = tmp_path / "wt"
    worktree.mkdir()
    monkeypatch.setattr(merge_gate, "get_worktree_path", lambda repo, branch: str(worktree))

    # Pin the settings this test relies on; ``raising=False`` lets the attribute
    # be set even when the settings object does not define it yet.
    pinned_settings = (
        ("merge_retest_timeout_s", 900),
        ("subprocess_tree_kill_enabled", True),
        ("agent_kill_grace_seconds", 7),
    )
    for option, value in pinned_settings:
        monkeypatch.setattr(merge_gate.settings, option, value, raising=False)

    seen = {}

    def _recording_runner(cmd, *, cwd, timeout, env=None, grace_s=5.0, tree_kill=True):
        # Remember what the call site passed, then pretend the run timed out.
        seen["cmd"] = cmd
        seen["cwd"] = cwd
        seen["timeout"] = timeout
        seen["grace_s"] = grace_s
        seen["tree_kill"] = tree_kill
        return ProcResult(returncode=None, stdout="", stderr="", timed_out=True)

    monkeypatch.setattr(merge_gate, "run_in_process_group", _recording_runner)

    ok, reason = merge_gate.retest_branch("orchestrator", "feature/x")

    assert ok is False
    assert reason == "re-test timeout after 900s"
    # FR-2: spawned via the process-group runner with tree-kill + the agent grace.
    assert seen["tree_kill"] is True
    assert seen["grace_s"] == 7
    assert seen["cmd"][:3] == ["python", "-m", "pytest"]


# ---------------------------------------------------------------------------
# TC-02: coverage_gate.measure_coverage — sibling orphan source (BR-3). Timeout
# tree-kills + returns None; uses the same runner with tree-kill from settings.
# ---------------------------------------------------------------------------


def test_tc02_measure_coverage_timeout_tree_kill_returns_none(tmp_path, monkeypatch):
    """A timed-out coverage run yields ``None`` and carries the pinned settings.

    The runner is swapped for a recorder returning a timed-out ``ProcResult``;
    the recorded ``tree_kill``, ``grace_s`` and ``timeout`` kwargs must equal the
    settings values patched in below.
    """
    monkeypatch.setattr(cg, "ensure_worktree", lambda r, b: str(tmp_path))
    pinned_settings = (
        ("subprocess_tree_kill_enabled", True),
        ("agent_kill_grace_seconds", 5),
        ("coverage_run_timeout_s", 900),
    )
    for option, value in pinned_settings:
        monkeypatch.setattr(cg.settings, option, value, raising=False)

    seen = {}

    def _recording_runner(cmd, *, cwd, timeout, env=None, grace_s=5.0, tree_kill=True):
        # Record the kwargs under test, then report a timeout.
        seen["tree_kill"] = tree_kill
        seen["grace_s"] = grace_s
        seen["timeout"] = timeout
        return ProcResult(returncode=None, stdout="", stderr="", timed_out=True)

    monkeypatch.setattr(cg, "run_in_process_group", _recording_runner)

    measured = cg.measure_coverage("orchestrator", "feature/x")

    assert measured is None
    assert seen["tree_kill"] is True
    assert seen["grace_s"] == 5
    assert seen["timeout"] == 900


def test_tc02_measure_coverage_respects_tree_kill_killswitch(tmp_path, monkeypatch):
    """With the tree-kill kill-switch off, the runner is asked for the fallback path."""
    monkeypatch.setattr(cg, "ensure_worktree", lambda r, b: str(tmp_path))
    monkeypatch.setattr(cg.settings, "subprocess_tree_kill_enabled", False, raising=False)

    seen = {}

    def _recording_runner(cmd, *, cwd, timeout, env=None, grace_s=5.0, tree_kill=True):
        # Only the tree-kill flag matters for this test.
        seen["tree_kill"] = tree_kill
        return ProcResult(returncode=None, stdout="", stderr="", timed_out=True)

    monkeypatch.setattr(cg, "run_in_process_group", _recording_runner)

    measured = cg.measure_coverage("orchestrator", "feature/x")

    assert measured is None
    assert seen["tree_kill"] is False