"""ORCH-110 TC-10: regression of the ORCH-109 / PR #129 incident. Incident: tester PASS + green CI + the branch not-behind, but the merge-gate local re-test blew its wall-clock budget under CPU starvation -> ``check_branch_mergeable`` returned ``(False, "re-test timeout ...")`` -> the engine routed it to ``_handle_merge_gate_rollback`` (rollback deploy-staging -> development + a developer retry) -> every retry timed out the same way -> "Merge-gate still failing after 3 developer retries" -> a stuck task needing manual intervention. This drives the REAL ``check_branch_mergeable`` through ``advance_stage`` (only the git/test primitives are mocked) and asserts the incident can no longer happen: * Scenario A (D4) — a not-behind branch (no-op rebase) on a green-CI HEAD: the local re-test is SKIPPED entirely -> the gate passes -> advance to deploy. * Scenario B (D3) — a real catch-up whose re-test times out: a bounded infra-retry (task stays on deploy-staging), NEVER a rollback to development and NEVER the "Merge-gate still failing after N developer retries" alert. RED-before / GREEN-after: on pre-ORCH-110 code both scenarios would roll the task back to development (Scenario A would even run the doomed re-test), so the ``rolled_back_to is None`` / ``stage == deploy-staging`` assertions below would FAIL. """ import os import tempfile import pytest _test_db = os.path.join(tempfile.gettempdir(), "test_orch110_regression.db") os.environ["ORCH_DB_PATH"] = _test_db os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir() os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token") from unittest.mock import MagicMock # noqa: E402 import src.db as _db # noqa: E402 from src.db import init_db, get_db # noqa: E402 from src import stage_engine # noqa: E402 from src import merge_gate # noqa: E402 from src.qg import checks as qg # noqa: E402 from src.stage_engine import advance_stage # noqa: E402 _REPO = "orchestrator" _WI = "ORCH-110" _BRANCH = "feature/ORCH-110-x" @pytest.fixture(autouse=True) def fresh_db(monkeypatch): monkeypatch.setattr(_db.settings, "db_path", _test_db) if os.path.exists(_test_db): os.unlink(_test_db) init_db() yield @pytest.fixture(autouse=True) def silence(monkeypatch): for name in ( "notify_stage_change", "notify_qg_failure", "send_telegram", "plane_notify_stage", "plane_notify_qg", "plane_add_comment", "set_issue_in_progress", "set_issue_blocked", "notify_approve_requested", "set_issue_in_review", "set_issue_needs_input", ): monkeypatch.setattr(stage_engine, name, MagicMock()) # Keep the merge-gate the only intervening sub-gate on the edge; no Phase A. monkeypatch.setattr(stage_engine.settings, "deploy_require_manual_approve", False) # Real gate scope: orchestrator self-hosting. monkeypatch.setattr(qg.settings, "merge_gate_enabled", True, raising=False) monkeypatch.setattr(qg.settings, "merge_gate_repos", "", raising=False) monkeypatch.setattr(qg.settings, "premerge_rebase_always", True, raising=False) monkeypatch.setattr(qg.settings, "merge_retest_skip_when_current_enabled", True, raising=False) monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_tolerance_enabled", True) monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_max_retries", 2) monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_retry_delay_s", 120) def _pass(*a, **k): return (True, "ok") def _patch_edge_gates_except_merge(monkeypatch): """All edge gates pass EXCEPT check_branch_mergeable, which stays the REAL one.""" patched = {**stage_engine.QG_CHECKS} patched["check_staging_status"] = _pass patched["check_security_gate"] = _pass patched["check_coverage_gate"] = _pass patched["check_staging_image_fresh"] = _pass monkeypatch.setattr(stage_engine, "QG_CHECKS", patched) def _mock_merge_primitives(monkeypatch, *, head_shas, retest_result, retest_calls): """Mock the git/test primitives the real check_branch_mergeable composes.""" monkeypatch.setattr(merge_gate, "acquire_merge_lease", lambda *a, **k: (True, "lease acquired")) monkeypatch.setattr(merge_gate, "release_merge_lease", lambda *a, **k: None) monkeypatch.setattr(merge_gate, "branch_is_behind_main", lambda r, b: True) monkeypatch.setattr(merge_gate, "auto_rebase_onto_main", lambda r, b: (True, "rebased onto origin/main")) shas = list(head_shas) def _head_sha(r, b): return shas.pop(0) if shas else "" monkeypatch.setattr(merge_gate, "head_sha", _head_sha) def _retest(r, b): retest_calls.append((r, b)) return retest_result monkeypatch.setattr(merge_gate, "retest_branch", _retest) def _make_task(): conn = get_db() cur = conn.execute( "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) VALUES (?,?,?,?,?)", (f"plane-{_WI}", _WI, _REPO, _BRANCH, "deploy-staging"), ) tid = cur.lastrowid conn.commit() conn.close() return tid def _stage(task_id): conn = get_db() row = conn.execute("SELECT stage FROM tasks WHERE id=?", (task_id,)).fetchone() conn.close() return row[0] def _agents(): conn = get_db() rows = conn.execute("SELECT agent FROM jobs ORDER BY id").fetchall() conn.close() return [r[0] for r in rows] def _advance(task_id): return advance_stage(task_id, "deploy-staging", _REPO, _WI, _BRANCH, finished_agent="deployer") # --------------------------------------------------------------------------- # Scenario A (D4): the literal incident — not-behind branch, green CI HEAD. # The re-test is SKIPPED, so the timeout can never happen; advance to deploy. # --------------------------------------------------------------------------- def test_tc10_incident_noop_rebase_skips_retest_and_advances(monkeypatch): _patch_edge_gates_except_merge(monkeypatch) retest_calls = [] # Same SHA before/after rebase -> proven no-op. retest is rigged to TIME OUT if # it were (wrongly) called — proving the skip is what avoids the false rollback. _mock_merge_primitives( monkeypatch, head_shas=["abc123", "abc123"], retest_result=(False, "re-test timeout after 900s"), retest_calls=retest_calls, ) task_id = _make_task() res = _advance(task_id) assert retest_calls == [], "re-test must be SKIPPED on a no-op rebase (D4)" assert res.rolled_back_to is None # the incident's false rollback is gone assert res.advanced is True assert res.to_stage == "deploy" assert _stage(task_id) == "deploy" assert "developer" not in _agents() # no developer-retry burned assert stage_engine.set_issue_blocked.called is False # --------------------------------------------------------------------------- # Scenario B (D3): a real catch-up whose re-test times out -> infra-retry, not # rollback, and never the "still failing after N retries" manual gate. # --------------------------------------------------------------------------- def test_tc10_real_catchup_retest_timeout_infra_retries_not_rollback(monkeypatch): _patch_edge_gates_except_merge(monkeypatch) retest_calls = [] # HEAD moved (real catch-up) -> re-test runs -> times out. _mock_merge_primitives( monkeypatch, head_shas=["old111", "new222"], retest_result=(False, "re-test timeout after 900s"), retest_calls=retest_calls, ) task_id = _make_task() res = _advance(task_id) assert retest_calls, "re-test SHOULD run on a real (HEAD-moving) catch-up" assert res.rolled_back_to is None # NOT the code-fault rollback assert res.note == "merge-gate-infra-retry" assert _stage(task_id) == "deploy-staging" # stays put for a bounded retry assert _agents() == ["deployer"] # staging-deployer re-queued, NOT developer assert stage_engine.set_issue_blocked.called is False # The "Merge-gate still failing after N developer retries" manual gate never fires. for call in stage_engine.send_telegram.call_args_list: assert "developer retries" not in call[0][0] def test_tc10_real_catchup_red_retest_still_rolls_back(monkeypatch): """Anti-over-tolerance guard inside the incident scenario: a genuinely RED re-test on a real catch-up STILL rolls back (BR-6 / AC-3).""" _patch_edge_gates_except_merge(monkeypatch) retest_calls = [] _mock_merge_primitives( monkeypatch, head_shas=["old111", "new222"], retest_result=(False, "re-test failed: ...1 failed"), retest_calls=retest_calls, ) task_id = _make_task() res = _advance(task_id) assert res.rolled_back_to == "development" assert _stage(task_id) == "development" assert _agents() == ["developer"]