Eliminate the false `deploy-staging -> development` rollback that fired when the merge-gate local re-test timed out (infra/resource) on a green CI + tester + staging branch (incident ORCH-109/PR #129: a 516.7s suite blew its 600s budget under CPU starvation from orphaned pytest processes -> timeout misrouted as a code fault -> developer-retry loop -> manual gate).

Additive, 5 independent kill-switches, never-raise, self-hosting scope. Untouched byte-for-byte: STAGE_TRANSITIONS, the QG_CHECKS registry, check_branch_mergeable name/semantics, machine-verdict keys, the DB schema. INV-4 (never push/force-push main) and the no-prod-restart rule are preserved.

- D1: new stdlib-only leaf src/proc_group.py runs the spawned re-test/coverage pytest in its own process group (start_new_session) and tree-kills the WHOLE group on timeout (os.killpg SIGTERM->grace->SIGKILL); used by merge_gate.retest_branch and coverage_gate.measure_coverage. No orphan leak. Fallback never-break: subprocess_tree_kill_enabled=False / non-POSIX -> the prior subprocess.run.
- D2/D3: merge_gate.classify_retest_failure distinguishes timeout/red/lock-busy/other; an infra timeout routes to _handle_merge_gate_infra_retry (bounded re-queue, task stays on deploy-staging, no rollback / no developer-retry); a red re-test / conflict still rolls back (BR-6). Exhaustion -> one infra alert.
- D4: skip the local re-test when the pre-merge rebase was a proven no-op (HEAD already CI/tester/staging-validated); fail-safe runs the re-test on any uncertainty. Flag merge_retest_skip_when_current_enabled.
- D5: merge_retest_timeout_s 600 -> 900 + _resolve_retest_timeout validation; reaper_max_running_s invariant preserved without change.
- D6: in-process counters + read-only merge_gate block in GET /queue; appended ("ORCH-110","classify_retest_failure","src/merge_gate.py") to MAIN_REGRESSION_MARKERS.

Docs (README/internals overview/CLAUDE/CHANGELOG/.env.example) updated in the same PR.

Tests: tests/test_orch110_*.py (TC-01..TC-12, incl. the red-before/green-after incident regression). Full suite green (1988 passed).

Refs: ORCH-110
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
104 lines
4.9 KiB
Python
104 lines
4.9 KiB
Python
"""ORCH-110 TC-07: kill-switches + non-self repo => byte-for-byte pre-ORCH-110.

Covers NFR-2 / FR-5 / AC-7. Each ORCH-110 behaviour is an INDEPENDENT kill-switch;
with a flag off the affected path reverts to the prior behaviour. A non-self-hosting
repo (enduro-trails) never reaches the infra-retry / tree-kill paths at all (the
merge-gate is N/A for it).
"""
|
|
import os
|
|
import tempfile
|
|
|
|
# Seed test-safe defaults for the ORCH_* environment variables. ``setdefault``
# leaves any value that is already exported untouched.
# NOTE(review): this runs ahead of the ``src`` imports (see their E402 noqa
# markers), presumably because ``src`` reads these at import time - confirm.
for _env_name, _env_default in (
    ("ORCH_DB_PATH", os.path.join(tempfile.gettempdir(), "test_orch110_killswitch.db")),
    ("ORCH_REPOS_DIR", tempfile.gettempdir()),
    ("ORCH_GITEA_TOKEN", "test-token"),
    ("ORCH_PLANE_API_TOKEN", "test-token"),
):
    os.environ.setdefault(_env_name, _env_default)
del _env_name, _env_default  # keep the module namespace free of loop leftovers
|
|
|
|
import pytest # noqa: E402
|
|
|
|
from src import merge_gate # noqa: E402
|
|
from src.qg import checks as qg # noqa: E402
|
|
from src.proc_group import ProcResult # noqa: E402
|
|
|
|
# Identifiers shared by the TC-07 cases in this module.
_REPO: str = "orchestrator"  # repo name handed to the merge-gate calls
_BRANCH: str = "feature/ORCH-110-x"  # branch handed to retest / mergeable checks
_WI: str = "ORCH-110"  # work-item id handed to check_branch_mergeable
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# D1 kill-switch: subprocess_tree_kill_enabled=False -> the fallback path.
# ---------------------------------------------------------------------------
|
|
def test_tc07_tree_kill_off_passes_tree_kill_false(tmp_path, monkeypatch):
    """D1 off: ``retest_branch`` must hand ``tree_kill=False`` to the runner.

    ``run_in_process_group`` is replaced by a recorder that notes the
    ``tree_kill`` keyword it receives and reports a green one-test run.
    """
    worktree = tmp_path / "wt"
    worktree.mkdir()
    recorded = {}

    def _recording_runner(cmd, *, cwd, timeout, env=None, grace_s=5.0, tree_kill=True):
        # Remember the flag the gate forwarded, then pretend pytest passed.
        recorded["tree_kill"] = tree_kill
        return ProcResult(returncode=0, stdout="1 passed", stderr="", timed_out=False)

    monkeypatch.setattr(merge_gate, "get_worktree_path", lambda r, b: str(worktree))
    monkeypatch.setattr(merge_gate.settings, "subprocess_tree_kill_enabled", False, raising=False)
    monkeypatch.setattr(merge_gate, "run_in_process_group", _recording_runner)

    passed, detail = merge_gate.retest_branch(_REPO, _BRANCH)

    assert passed is True
    assert detail == "re-test green"
    # -> run_in_process_group degrades to subprocess.run
    assert recorded["tree_kill"] is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# D4 kill-switch: merge_retest_skip_when_current_enabled=False -> always re-test
# after a rebase (even a proven no-op).
# ---------------------------------------------------------------------------
|
|
@pytest.fixture
def gate_primitives(monkeypatch):
    """Stub the merge-gate primitives and count ``retest_branch`` invocations.

    Returns the counter dict (key ``"retest"``) so a test can assert whether
    the local re-test ran. ``head_sha`` always yields the same value, which
    models a proven no-op rebase (HEAD identical before and after).
    """
    counters = {"retest": 0}

    def _counting_retest(r, b):
        counters["retest"] += 1
        return True, "re-test green"

    # Gate switched on, with an empty repo list and the rebase always forced.
    settings_overrides = {
        "merge_gate_enabled": True,
        "merge_gate_repos": "",
        "premerge_rebase_always": True,
    }
    for flag_name, flag_value in settings_overrides.items():
        monkeypatch.setattr(qg.settings, flag_name, flag_value, raising=False)

    # Stand-ins for the lease / rebase / re-test primitives, patched in order.
    primitive_stubs = {
        "acquire_merge_lease": lambda *a, **k: (True, "lease acquired"),
        "branch_is_behind_main": lambda r, b: True,
        "auto_rebase_onto_main": lambda r, b: (True, "rebased"),
        "release_merge_lease": lambda *a, **k: None,
        # A PROVEN no-op rebase: HEAD is identical before/after.
        "head_sha": lambda r, b: "deadbeefcafe",
        "retest_branch": _counting_retest,
    }
    for attr_name, stub in primitive_stubs.items():
        monkeypatch.setattr(merge_gate, attr_name, stub, raising=False)

    return counters
|
|
|
|
|
|
def test_tc07_skip_when_current_off_always_retests(gate_primitives, monkeypatch):
    """D4 off: the re-test runs even though the stubbed rebase is a no-op."""
    monkeypatch.setattr(
        qg.settings, "merge_retest_skip_when_current_enabled", False, raising=False
    )

    passed, detail = qg.check_branch_mergeable(_REPO, _WI, _BRANCH)
    retest_runs = gate_primitives["retest"]

    assert passed is True
    assert detail == "rebased onto main, re-test green"
    assert retest_runs == 1  # re-test STILL runs (D4 off)
|
|
|
|
|
|
def test_tc07_skip_when_current_on_skips_noop_retest(gate_primitives, monkeypatch):
    """Counterpart check: with D4 enabled, a proven no-op rebase skips the re-test."""
    monkeypatch.setattr(
        qg.settings, "merge_retest_skip_when_current_enabled", True, raising=False
    )

    passed, detail = qg.check_branch_mergeable(_REPO, _WI, _BRANCH)
    retest_runs = gate_primitives["retest"]

    assert passed is True
    assert "re-test skipped" in detail
    assert retest_runs == 0  # re-test NOT run on the no-op
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Non-self repo (enduro-trails): the merge-gate is N/A -> never the new paths.
# ---------------------------------------------------------------------------
|
|
def test_tc07_non_self_repo_is_noop(monkeypatch):
    """For ``enduro-trails`` the mergeable check passes with an ``N/A`` reason.

    ``acquire_merge_lease`` is replaced by a tripwire that fails the test, so
    the check must return without ever asking for the lease.
    """

    def _lease_tripwire(*a, **k):
        # Guard: if the gate wrongly engaged it would touch the lease -> fail loudly.
        pytest.fail("merge-gate must be N/A for enduro-trails")

    for flag_name, flag_value in (("merge_gate_enabled", True), ("merge_gate_repos", "")):
        monkeypatch.setattr(qg.settings, flag_name, flag_value, raising=False)
    monkeypatch.setattr(merge_gate, "acquire_merge_lease", _lease_tripwire, raising=False)

    passed, detail = qg.check_branch_mergeable("enduro-trails", "ET-1", "feature/ET-1-x")

    assert passed is True
    assert "N/A" in detail
|