# orchestrator/tests/test_orch110_false_rollback_regression.py

"""ORCH-110 TC-10: regression of the ORCH-109 / PR #129 incident.

Incident: tester PASS + green CI + the branch not-behind, but the merge-gate local
re-test blew its wall-clock budget under CPU starvation -> ``check_branch_mergeable``
returned ``(False, "re-test timeout ...")`` -> the engine routed it to
``_handle_merge_gate_rollback`` (rollback deploy-staging -> development + a developer
retry) -> every retry timed out the same way -> "Merge-gate still failing after 3
developer retries" -> a stuck task needing manual intervention.

This drives the REAL ``check_branch_mergeable`` through ``advance_stage`` (only the
git/test primitives are mocked) and asserts the incident can no longer happen:

  * Scenario A (D4) — a not-behind branch (no-op rebase) on a green-CI HEAD: the
    local re-test is SKIPPED entirely -> the gate passes -> advance to deploy.
  * Scenario B (D3) — a real catch-up whose re-test times out: a bounded infra-retry
    (task stays on deploy-staging), NEVER a rollback to development and NEVER the
    "Merge-gate still failing after N developer retries" alert.

RED-before / GREEN-after: on pre-ORCH-110 code both scenarios would roll the task
back to development (Scenario A would even run the doomed re-test), so the
``rolled_back_to is None`` / ``stage == deploy-staging`` assertions below would FAIL.
"""
import os
import tempfile

import pytest

# Per-module database file in the system temp dir; the ``fresh_db`` fixture
# deletes and re-initialises it before every test.
_test_db = os.path.join(tempfile.gettempdir(), "test_orch110_regression.db")
# The environment is populated BEFORE the ``src.*`` imports below (which is why
# those imports carry ``noqa: E402``) — presumably the application settings read
# these variables at import time; confirm against the settings module.
os.environ["ORCH_DB_PATH"] = _test_db
os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir()
# ``setdefault`` keeps a token that is already present in the environment and
# only falls back to a dummy value when none is set.
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")

from unittest.mock import MagicMock  # noqa: E402

import src.db as _db  # noqa: E402
from src.db import init_db, get_db  # noqa: E402
from src import stage_engine  # noqa: E402
from src import merge_gate  # noqa: E402
from src.qg import checks as qg  # noqa: E402
from src.stage_engine import advance_stage  # noqa: E402

# Identity of the single task each test inserts (``_make_task``) and then drives
# through ``advance_stage`` (``_advance``): repo, work-item id and branch name.
_REPO = "orchestrator"
_WI = "ORCH-110"
_BRANCH = "feature/ORCH-110-x"


@pytest.fixture(autouse=True)
def fresh_db(monkeypatch):
    """Give every test a freshly initialised, empty database.

    Points ``src.db.settings.db_path`` at this module's temp-dir database file,
    removes whatever an earlier test (or an aborted earlier run) left behind,
    and then rebuilds the schema with ``init_db()``.
    """
    monkeypatch.setattr(_db.settings, "db_path", _test_db)
    # Remove the main file AND its possible sidecar files. Assuming the store is
    # SQLite (the ``.db`` file and ``?`` placeholders suggest so — confirm in
    # src.db): deleting only the main file can leave a stale ``-wal`` / ``-shm``
    # / ``-journal`` next to the brand-new database, which SQLite may then try
    # to pair with it. When no sidecar exists this loop is a no-op.
    for suffix in ("", "-wal", "-shm", "-journal"):
        leftover = _test_db + suffix
        if os.path.exists(leftover):
            os.unlink(leftover)
    init_db()
    yield


@pytest.fixture(autouse=True)
def silence(monkeypatch):
    """Stub the stage-engine notifier hooks and pin the merge-gate settings.

    Every listed ``stage_engine`` attribute is replaced by its own ``MagicMock``
    so tests can inspect the calls afterwards. The settings below are then
    forced to fixed values for the duration of each test.
    """
    notifier_names = (
        "notify_stage_change", "notify_qg_failure", "send_telegram",
        "plane_notify_stage", "plane_notify_qg", "plane_add_comment",
        "set_issue_in_progress", "set_issue_blocked", "notify_approve_requested",
        "set_issue_in_review", "set_issue_needs_input",
    )
    for notifier in notifier_names:
        monkeypatch.setattr(stage_engine, notifier, MagicMock())

    # Keep the merge-gate the only intervening sub-gate on the edge; no Phase A.
    monkeypatch.setattr(stage_engine.settings, "deploy_require_manual_approve", False)

    # Real gate scope: orchestrator self-hosting. ``raising=False`` lets these be
    # set even when the attribute does not exist on the settings object yet.
    gate_flags = {
        "merge_gate_enabled": True,
        "merge_gate_repos": "",
        "premerge_rebase_always": True,
        "merge_retest_skip_when_current_enabled": True,
    }
    for flag, value in gate_flags.items():
        monkeypatch.setattr(qg.settings, flag, value, raising=False)

    # Infra-retry knobs: patched strictly, so a missing attribute is an error.
    infra_retry_knobs = {
        "merge_retest_infra_tolerance_enabled": True,
        "merge_retest_infra_max_retries": 2,
        "merge_retest_infra_retry_delay_s": 120,
    }
    for knob, value in infra_retry_knobs.items():
        monkeypatch.setattr(stage_engine.settings, knob, value)


def _pass(*a, **k):
    return (True, "ok")


def _patch_edge_gates_except_merge(monkeypatch):
    """Force the neighbouring edge gates green; leave check_branch_mergeable REAL.

    A copy of ``stage_engine.QG_CHECKS`` is installed in which the four checks
    named below map to ``_pass``. The ``check_branch_mergeable`` entry is not
    touched, so whatever the registry maps it to stays in effect.
    """
    forced_green = (
        "check_staging_status",
        "check_security_gate",
        "check_coverage_gate",
        "check_staging_image_fresh",
    )
    checks = dict(stage_engine.QG_CHECKS)
    checks.update(dict.fromkeys(forced_green, _pass))
    monkeypatch.setattr(stage_engine, "QG_CHECKS", checks)


def _mock_merge_primitives(monkeypatch, *, head_shas, retest_result, retest_calls):
    """Replace the ``merge_gate`` git/test primitives with deterministic stubs.

    Args:
        monkeypatch: pytest's monkeypatch fixture.
        head_shas: SHAs handed out by successive ``head_sha`` calls, in order;
            once they run out the stub returns ``""``.
        retest_result: value returned by every ``retest_branch`` call.
        retest_calls: list that receives one ``(r, b)`` tuple per
            ``retest_branch`` call, so tests can tell whether the re-test ran.
    """
    # Snapshot the sequence up front so later changes by the caller are ignored.
    remaining_shas = iter(list(head_shas))

    def _lease_granted(*a, **k):
        return (True, "lease acquired")

    def _lease_released(*a, **k):
        return None

    def _always_behind(r, b):
        return True

    def _rebase_ok(r, b):
        return (True, "rebased onto origin/main")

    def _head_sha(r, b):
        return next(remaining_shas, "")

    def _retest(r, b):
        retest_calls.append((r, b))
        return retest_result

    stubs = {
        "acquire_merge_lease": _lease_granted,
        "release_merge_lease": _lease_released,
        "branch_is_behind_main": _always_behind,
        "auto_rebase_onto_main": _rebase_ok,
        "head_sha": _head_sha,
        "retest_branch": _retest,
    }
    for primitive, stub in stubs.items():
        monkeypatch.setattr(merge_gate, primitive, stub)


def _make_task():
    """Insert one task row parked on ``deploy-staging`` and return its row id.

    The row uses the module-level ``_WI`` / ``_REPO`` / ``_BRANCH`` identity.
    The connection is closed even when the insert or commit raises, so a
    failing setup does not leave an open handle on the test database.
    """
    conn = get_db()
    try:
        cur = conn.execute(
            "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) VALUES (?,?,?,?,?)",
            (f"plane-{_WI}", _WI, _REPO, _BRANCH, "deploy-staging"),
        )
        tid = cur.lastrowid
        conn.commit()
        return tid
    finally:
        conn.close()


def _stage(task_id):
    """Return the current ``stage`` column of the task with id *task_id*.

    Fails with an explicit assertion (instead of an opaque ``TypeError`` on
    ``None``) when the task row does not exist, and always closes the
    connection, even if the query raises.
    """
    conn = get_db()
    try:
        row = conn.execute("SELECT stage FROM tasks WHERE id=?", (task_id,)).fetchone()
    finally:
        conn.close()
    assert row is not None, f"task {task_id!r} not found in tasks table"
    return row[0]


def _agents():
    """Return the ``agent`` column of every ``jobs`` row, ordered by job id.

    The connection is closed even when the query raises.
    """
    conn = get_db()
    try:
        rows = conn.execute("SELECT agent FROM jobs ORDER BY id").fetchall()
    finally:
        conn.close()
    return [r[0] for r in rows]


def _advance(task_id):
    """Run ``advance_stage`` for *task_id* from ``deploy-staging``.

    Uses the module's repo / work-item / branch identity and reports
    ``deployer`` as the agent that just finished. Returns whatever
    ``advance_stage`` returns.
    """
    outcome = advance_stage(
        task_id,
        "deploy-staging",
        _REPO,
        _WI,
        _BRANCH,
        finished_agent="deployer",
    )
    return outcome


# ---------------------------------------------------------------------------
# Scenario A (D4): the literal incident — not-behind branch, green CI HEAD.
# The re-test is SKIPPED, so the timeout can never happen; advance to deploy.
# ---------------------------------------------------------------------------
def test_tc10_incident_noop_rebase_skips_retest_and_advances(monkeypatch):
    """Scenario A: identical HEAD before/after the rebase -> re-test skipped, task advances."""
    _patch_edge_gates_except_merge(monkeypatch)
    observed_retests = []
    # The re-test is rigged to TIME OUT if it were (wrongly) called — so a green
    # outcome proves the skip is what avoids the false rollback.
    timeout_if_called = (False, "re-test timeout after 900s")
    _mock_merge_primitives(
        monkeypatch,
        # Same SHA before/after rebase -> proven no-op.
        head_shas=["abc123"] * 2,
        retest_result=timeout_if_called,
        retest_calls=observed_retests,
    )
    task_id = _make_task()
    outcome = _advance(task_id)

    assert not observed_retests, "re-test must be SKIPPED on a no-op rebase (D4)"
    # The incident's false rollback is gone.
    assert outcome.rolled_back_to is None
    assert outcome.advanced is True
    assert outcome.to_stage == "deploy"
    assert _stage(task_id) == "deploy"
    # No developer-retry burned, and the issue was never marked blocked.
    assert "developer" not in _agents()
    stage_engine.set_issue_blocked.assert_not_called()


# ---------------------------------------------------------------------------
# Scenario B (D3): a real catch-up whose re-test times out -> infra-retry, not
# rollback, and never the "still failing after N retries" manual gate.
# ---------------------------------------------------------------------------
def test_tc10_real_catchup_retest_timeout_infra_retries_not_rollback(monkeypatch):
    """Scenario B: a HEAD-moving catch-up whose re-test times out is infra-retried.

    The task must stay on ``deploy-staging`` with the deployer re-queued; it
    must not be rolled back to development, and no "developer retries" alert
    may be sent.
    """
    _patch_edge_gates_except_merge(monkeypatch)
    retest_calls = []
    # HEAD moved (real catch-up) -> re-test runs -> times out.
    _mock_merge_primitives(
        monkeypatch,
        head_shas=["old111", "new222"],
        retest_result=(False, "re-test timeout after 900s"),
        retest_calls=retest_calls,
    )
    task_id = _make_task()
    res = _advance(task_id)

    assert retest_calls, "re-test SHOULD run on a real (HEAD-moving) catch-up"
    assert res.rolled_back_to is None              # NOT the code-fault rollback
    assert res.note == "merge-gate-infra-retry"
    assert _stage(task_id) == "deploy-staging"     # stays put for a bounded retry
    assert _agents() == ["deployer"]               # staging-deployer re-queued, NOT developer
    assert stage_engine.set_issue_blocked.called is False
    # The "Merge-gate still failing after N developer retries" manual gate never fires.
    # Scan EVERY argument of every send_telegram call: indexing ``call[0][0]``
    # would raise IndexError on a keyword-only call, TypeError on a non-string
    # first argument, and would miss a message passed by keyword.
    for sent in stage_engine.send_telegram.call_args_list:
        positional, keyword = sent
        payload = [*positional, *keyword.values()]
        assert not any("developer retries" in str(part) for part in payload), (
            f"unexpected manual-gate alert: {sent!r}"
        )


def test_tc10_real_catchup_red_retest_still_rolls_back(monkeypatch):
    """Anti-over-tolerance guard inside the incident scenario: a genuinely RED
    re-test on a real catch-up STILL rolls back (BR-6 / AC-3)."""
    _patch_edge_gates_except_merge(monkeypatch)
    retest_calls = []
    _mock_merge_primitives(
        monkeypatch,
        head_shas=["old111", "new222"],
        retest_result=(False, "re-test failed: ...1 failed"),
        retest_calls=retest_calls,
    )
    task_id = _make_task()
    res = _advance(task_id)
    # Prove the rollback was caused by the RED re-test and not by some other
    # gate failure that happened before the re-test was ever reached.
    assert retest_calls, "re-test SHOULD run on a real (HEAD-moving) catch-up"
    assert res.rolled_back_to == "development"
    assert _stage(task_id) == "development"
    assert _agents() == ["developer"]