Eliminate the false `deploy-staging -> development` rollback that fired when the merge-gate local re-test timed out (infra/resource) on a green CI + tester + staging branch (incident ORCH-109/PR #129: a 516.7s suite blew its 600s budget under CPU starvation from orphaned pytest processes -> timeout misrouted as a code fault -> developer-retry loop -> manual gate).

Additive, 5 independent kill-switches, never-raise, self-hosting scope. Untouched byte-for-byte: STAGE_TRANSITIONS, the QG_CHECKS registry, check_branch_mergeable name/semantics, machine-verdict keys, the DB schema. INV-4 (never push/force-push main) and the no-prod-restart rule are preserved.

- D1: new stdlib-only leaf src/proc_group.py runs the spawned re-test/coverage pytest in its own process group (start_new_session) and tree-kills the WHOLE group on timeout (os.killpg SIGTERM->grace->SIGKILL); used by merge_gate.retest_branch and coverage_gate.measure_coverage. No orphan leak. Fallback never-break: subprocess_tree_kill_enabled=False / non-POSIX -> the prior subprocess.run.
- D2/D3: merge_gate.classify_retest_failure distinguishes timeout/red/lock-busy/other; an infra timeout routes to _handle_merge_gate_infra_retry (bounded re-queue, task stays on deploy-staging, no rollback / no developer-retry); a red re-test / conflict still rolls back (BR-6). Exhaustion -> one infra alert.
- D4: skip the local re-test when the pre-merge rebase was a proven no-op (HEAD already CI/tester/staging-validated); fail-safe runs the re-test on any uncertainty. Flag merge_retest_skip_when_current_enabled.
- D5: merge_retest_timeout_s 600 -> 900 + _resolve_retest_timeout validation; reaper_max_running_s invariant preserved without change.
- D6: in-process counters + read-only merge_gate block in GET /queue; appended ("ORCH-110","classify_retest_failure","src/merge_gate.py") to MAIN_REGRESSION_MARKERS.

Docs (README/internals overview/CLAUDE/CHANGELOG/.env.example) updated in the same PR.

Tests: tests/test_orch110_*.py (TC-01..TC-12, incl. the red-before/green-after incident regression). Full suite green (1988 passed).

Refs: ORCH-110
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
214 lines
8.7 KiB
Python
214 lines
8.7 KiB
Python
"""ORCH-110 TC-10: regression of the ORCH-109 / PR #129 incident.

Incident: tester PASS + green CI + the branch not-behind, but the merge-gate local
re-test blew its wall-clock budget under CPU starvation -> ``check_branch_mergeable``
returned ``(False, "re-test timeout ...")`` -> the engine routed it to
``_handle_merge_gate_rollback`` (rollback deploy-staging -> development + a developer
retry) -> every retry timed out the same way -> "Merge-gate still failing after 3
developer retries" -> a stuck task needing manual intervention.

This drives the REAL ``check_branch_mergeable`` through ``advance_stage`` (only the
git/test primitives are mocked) and asserts the incident can no longer happen:

* Scenario A (D4) — a not-behind branch (no-op rebase) on a green-CI HEAD: the
  local re-test is SKIPPED entirely -> the gate passes -> advance to deploy.
* Scenario B (D3) — a real catch-up whose re-test times out: a bounded infra-retry
  (task stays on deploy-staging), NEVER a rollback to development and NEVER the
  "Merge-gate still failing after N developer retries" alert.

RED-before / GREEN-after: on pre-ORCH-110 code both scenarios would roll the task
back to development (Scenario A would even run the doomed re-test), so the
``rolled_back_to is None`` / ``stage == deploy-staging`` assertions below would FAIL.
"""
|
|
import os
|
|
import tempfile
|
|
|
|
import pytest
|
|
|
|
# Use a dedicated SQLite file under the system temp dir for this module's tests.
# NOTE(review): these variables are set ahead of the ``src.*`` imports that follow
# (those imports carry ``noqa: E402``), presumably so the project settings read
# them at import time — confirm against ``src.db``.
_test_db = os.path.join(tempfile.gettempdir(), "test_orch110_regression.db")
os.environ["ORCH_DB_PATH"] = _test_db
os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir()
# Placeholder credentials; ``setdefault`` leaves any value already in the environment alone.
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
|
|
|
|
from unittest.mock import MagicMock # noqa: E402
|
|
|
|
import src.db as _db # noqa: E402
|
|
from src.db import init_db, get_db # noqa: E402
|
|
from src import stage_engine # noqa: E402
|
|
from src import merge_gate # noqa: E402
|
|
from src.qg import checks as qg # noqa: E402
|
|
from src.stage_engine import advance_stage # noqa: E402
|
|
|
|
# Identity of the single task row these tests insert and drive through
# ``advance_stage`` (repo, work item id, feature branch).
_REPO = "orchestrator"
_WI = "ORCH-110"
_BRANCH = "feature/ORCH-110-x"
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def fresh_db(monkeypatch):
    """Give every test its own database file.

    Points ``src.db`` settings at the module's temp DB path, removes any file
    left behind by an earlier test, then calls ``init_db()`` before yielding.
    """
    db_file = _test_db
    monkeypatch.setattr(_db.settings, "db_path", db_file)
    # Start from a clean slate: drop whatever the previous test wrote.
    if os.path.exists(db_file):
        os.remove(db_file)
    init_db()
    yield
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def silence(monkeypatch):
    """Mute outbound notifications and pin the merge-gate settings under test.

    Every notifier hook on ``stage_engine`` is replaced with a ``MagicMock`` so
    tests can inspect calls without side effects; the settings flags below put
    the merge gate, the no-op-rebase skip and the infra-retry tolerance in the
    configuration this regression exercises.
    """
    notifier_hooks = (
        "notify_stage_change", "notify_qg_failure", "send_telegram",
        "plane_notify_stage", "plane_notify_qg", "plane_add_comment",
        "set_issue_in_progress", "set_issue_blocked", "notify_approve_requested",
        "set_issue_in_review", "set_issue_needs_input",
    )
    for hook in notifier_hooks:
        monkeypatch.setattr(stage_engine, hook, MagicMock())

    engine_settings = stage_engine.settings
    gate_settings = qg.settings

    # Keep the merge-gate the only intervening sub-gate on the edge; no Phase A.
    monkeypatch.setattr(engine_settings, "deploy_require_manual_approve", False)

    # Real gate scope: orchestrator self-hosting.
    gate_flags = (
        ("merge_gate_enabled", True),
        ("merge_gate_repos", ""),
        ("premerge_rebase_always", True),
        ("merge_retest_skip_when_current_enabled", True),
    )
    for flag, value in gate_flags:
        # raising=False: set the attribute even if the settings object lacks it.
        monkeypatch.setattr(gate_settings, flag, value, raising=False)

    infra_retry_knobs = (
        ("merge_retest_infra_tolerance_enabled", True),
        ("merge_retest_infra_max_retries", 2),
        ("merge_retest_infra_retry_delay_s", 120),
    )
    for knob, value in infra_retry_knobs:
        monkeypatch.setattr(engine_settings, knob, value)
|
|
|
|
|
|
def _pass(*a, **k):
|
|
return (True, "ok")
|
|
|
|
|
|
def _patch_edge_gates_except_merge(monkeypatch):
    """Force the sibling edge gates green while leaving the merge gate real.

    Installs a copy of ``stage_engine.QG_CHECKS`` in which the staging-status,
    security, coverage and staging-image checks are replaced by ``_pass``;
    ``check_branch_mergeable`` is deliberately not overridden.
    """
    always_green = dict.fromkeys(
        (
            "check_staging_status",
            "check_security_gate",
            "check_coverage_gate",
            "check_staging_image_fresh",
        ),
        _pass,
    )
    # Build a new dict so the shared registry object itself is never mutated.
    registry = {**stage_engine.QG_CHECKS, **always_green}
    monkeypatch.setattr(stage_engine, "QG_CHECKS", registry)
|
|
|
|
|
|
def _mock_merge_primitives(monkeypatch, *, head_shas, retest_result, retest_calls):
    """Stub the ``merge_gate`` primitives that the real mergeability check composes.

    Args:
        monkeypatch: pytest's monkeypatch fixture.
        head_shas: SHAs handed out by successive ``head_sha`` calls, in order;
            once exhausted the stub returns ``""``.
        retest_result: value the stubbed ``retest_branch`` returns.
        retest_calls: list that receives one ``(repo, branch)`` tuple per
            ``retest_branch`` invocation, so callers can tell whether it ran.
    """
    # Snapshot the sequence so the caller's list is never consumed.
    pending_shas = iter(list(head_shas))

    def _lease_acquired(*a, **k):
        return (True, "lease acquired")

    def _lease_released(*a, **k):
        return None

    def _is_behind(r, b):
        return True

    def _rebase_ok(r, b):
        return (True, "rebased onto origin/main")

    def _head_sha(r, b):
        return next(pending_shas, "")

    def _retest(r, b):
        retest_calls.append((r, b))
        return retest_result

    stubs = {
        "acquire_merge_lease": _lease_acquired,
        "release_merge_lease": _lease_released,
        "branch_is_behind_main": _is_behind,
        "auto_rebase_onto_main": _rebase_ok,
        "head_sha": _head_sha,
        "retest_branch": _retest,
    }
    for attr, stub in stubs.items():
        monkeypatch.setattr(merge_gate, attr, stub)
|
|
|
|
|
|
def _make_task():
    """Insert the test task on the ``deploy-staging`` stage and return its row id.

    The connection is closed in a ``finally`` block so a failing INSERT/commit
    cannot leave an open handle on the DB file that the next test deletes.
    """
    conn = get_db()
    try:
        cursor = conn.execute(
            "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) VALUES (?,?,?,?,?)",
            (f"plane-{_WI}", _WI, _REPO, _BRANCH, "deploy-staging"),
        )
        task_id = cursor.lastrowid
        conn.commit()
        return task_id
    finally:
        conn.close()
|
|
|
|
|
|
def _stage(task_id):
    """Return the current ``stage`` value stored for ``task_id``.

    The connection is always closed, even if the query raises. A missing row
    fails with an explicit assertion message instead of an opaque ``TypeError``
    from indexing ``None``.
    """
    conn = get_db()
    try:
        row = conn.execute("SELECT stage FROM tasks WHERE id=?", (task_id,)).fetchone()
    finally:
        conn.close()
    assert row is not None, f"no tasks row with id={task_id!r}"
    return row[0]
|
|
|
|
|
|
def _agents():
    """Return the ``agent`` column of every ``jobs`` row, ordered by job id.

    The connection is released in ``finally`` so a failing query does not leak
    an open handle on the test database file.
    """
    conn = get_db()
    try:
        rows = conn.execute("SELECT agent FROM jobs ORDER BY id").fetchall()
    finally:
        conn.close()
    return [row[0] for row in rows]
|
|
|
|
|
|
def _advance(task_id):
    """Run ``advance_stage`` for the test task from ``deploy-staging``.

    Uses the module's repo / work-item / branch constants and reports
    ``deployer`` as the agent that just finished.
    """
    positional = (task_id, "deploy-staging", _REPO, _WI, _BRANCH)
    return advance_stage(*positional, finished_agent="deployer")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Scenario A (D4): the literal incident — not-behind branch, green CI HEAD.
|
|
# The re-test is SKIPPED, so the timeout can never happen; advance to deploy.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc10_incident_noop_rebase_skips_retest_and_advances(monkeypatch):
    """Scenario A (D4): a no-op rebase skips the re-test and the task advances.

    ``head_sha`` reports the same SHA before and after the rebase, i.e. a proven
    no-op. The re-test stub is rigged to report a timeout if it were (wrongly)
    invoked, which shows that the skip is what avoids the false rollback.
    """
    _patch_edge_gates_except_merge(monkeypatch)
    observed_retests = []
    timeout_if_called = (False, "re-test timeout after 900s")
    _mock_merge_primitives(
        monkeypatch,
        head_shas=["abc123"] * 2,
        retest_result=timeout_if_called,
        retest_calls=observed_retests,
    )

    task_id = _make_task()
    outcome = _advance(task_id)

    assert observed_retests == [], "re-test must be SKIPPED on a no-op rebase (D4)"
    # The incident's false rollback is gone.
    assert outcome.rolled_back_to is None
    assert outcome.advanced is True
    assert outcome.to_stage == "deploy"
    assert _stage(task_id) == "deploy"
    # No developer-retry was burned and the issue was never flagged as blocked.
    assert "developer" not in _agents()
    assert not stage_engine.set_issue_blocked.called
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Scenario B (D3): a real catch-up whose re-test times out -> infra-retry, not
|
|
# rollback, and never the "still failing after N retries" manual gate.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc10_real_catchup_retest_timeout_infra_retries_not_rollback(monkeypatch):
    """Scenario B (D3): a timed-out re-test on a real catch-up is an infra retry.

    ``head_sha`` reports a different SHA after the rebase (HEAD moved), so the
    re-test runs and its stub reports a timeout. The task must stay on
    ``deploy-staging`` with the deployer re-queued, and the
    "developer retries" alert must not be sent.
    """
    _patch_edge_gates_except_merge(monkeypatch)
    retest_calls = []
    # HEAD moved (real catch-up) -> re-test runs -> times out.
    _mock_merge_primitives(
        monkeypatch,
        head_shas=["old111", "new222"],
        retest_result=(False, "re-test timeout after 900s"),
        retest_calls=retest_calls,
    )
    task_id = _make_task()
    res = _advance(task_id)

    assert retest_calls, "re-test SHOULD run on a real (HEAD-moving) catch-up"
    assert res.rolled_back_to is None  # NOT the code-fault rollback
    assert res.note == "merge-gate-infra-retry"
    assert _stage(task_id) == "deploy-staging"  # stays put for a bounded retry
    assert _agents() == ["deployer"]  # staging-deployer re-queued, NOT developer
    assert stage_engine.set_issue_blocked.called is False
    # The "Merge-gate still failing after N developer retries" manual gate never fires.
    # Inspect every positional AND keyword payload: indexing only the first
    # positional argument would raise IndexError on a keyword-only call and
    # would miss the text if it were passed in any other argument.
    for sent in stage_engine.send_telegram.call_args_list:
        positional, keyword = sent[0], sent[1]
        for payload in (*positional, *keyword.values()):
            assert "developer retries" not in str(payload)
|
|
|
|
|
|
def test_tc10_real_catchup_red_retest_still_rolls_back(monkeypatch):
    """Anti-over-tolerance guard inside the incident scenario: a genuinely RED
    re-test on a real catch-up STILL rolls back (BR-6 / AC-3)."""
    _patch_edge_gates_except_merge(monkeypatch)
    observed_retests = []
    red_result = (False, "re-test failed: ...1 failed")
    _mock_merge_primitives(
        monkeypatch,
        head_shas=["old111", "new222"],
        retest_result=red_result,
        retest_calls=observed_retests,
    )

    task_id = _make_task()
    outcome = _advance(task_id)

    # A red (non-timeout) failure takes the rollback path: back to development
    # with a developer job queued.
    assert outcome.rolled_back_to == "development"
    assert _stage(task_id) == "development"
    assert _agents() == ["developer"]
|