Eliminate the false `deploy-staging -> development` rollback that fired when the merge-gate local re-test timed out (infra/resource) on a green CI + tester + staging branch (incident ORCH-109/PR #129: a 516.7s suite blew its 600s budget under CPU starvation from orphaned pytest processes -> timeout misrouted as a code fault -> developer-retry loop -> manual gate). Additive, 5 independent kill-switches, never-raise, self-hosting scope. Untouched byte-for-byte: STAGE_TRANSITIONS, the QG_CHECKS registry, check_branch_mergeable name/semantics, machine-verdict keys, the DB schema. INV-4 (never push/force-push main) and the no-prod-restart rule are preserved. - D1: new stdlib-only leaf src/proc_group.py runs the spawned re-test/coverage pytest in its own process group (start_new_session) and tree-kills the WHOLE group on timeout (os.killpg SIGTERM->grace->SIGKILL); used by merge_gate.retest_branch and coverage_gate.measure_coverage. No orphan leak. Fallback never-break: subprocess_tree_kill_enabled=False / non-POSIX -> the prior subprocess.run. - D2/D3: merge_gate.classify_retest_failure distinguishes timeout/red/lock-busy/other; an infra timeout routes to _handle_merge_gate_infra_retry (bounded re-queue, task stays on deploy-staging, no rollback / no developer-retry); a red re-test / conflict still rolls back (BR-6). Exhaustion -> one infra alert. - D4: skip the local re-test when the pre-merge rebase was a proven no-op (HEAD already CI/tester/staging-validated); fail-safe runs the re-test on any uncertainty. Flag merge_retest_skip_when_current_enabled. - D5: merge_retest_timeout_s 600 -> 900 + _resolve_retest_timeout validation; reaper_max_running_s invariant preserved without change. - D6: in-process counters + read-only merge_gate block in GET /queue; appended ("ORCH-110","classify_retest_failure","src/merge_gate.py") to MAIN_REGRESSION_MARKERS. Docs (README/internals overview/CLAUDE/CHANGELOG/.env.example) updated in the same PR.
Tests: tests/test_orch110_*.py (TC-01..TC-12, incl. the red-before/green-after incident regression). Full suite green (1988 passed). Refs: ORCH-110 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
262 lines
9.8 KiB
Python
262 lines
9.8 KiB
Python
"""ORCH-110 TC-04 / TC-05 / TC-06 / TC-09: merge-gate infra-timeout routing.

Drives the engine (``stage_engine.advance_stage``) on the deploy-staging -> deploy
edge with ``check_branch_mergeable`` monkeypatched, exactly like the existing
``test_stage_engine.TestMergeGate`` suite, and asserts the NEW routing (D3):

* TC-04 — an INFRA re-test timeout -> bounded infra-retry (re-queue the
  staging-deployer with a delay, task STAYS on deploy-staging) — NOT a rollback to
  development and NOT a developer-retry.
* TC-05 — a deterministically RED re-test STILL rolls back to development +
  developer retry (BR-6 / AC-3 anti-over-tolerance).
* TC-06 — the infra-retry is bounded (anti-loop): after the budget it blocks with
  ONE infra-alert, no infinite bounce, no new job, task NOT in development.
* TC-09 — never-raise: an error in the transient path is swallowed (a WARNING) and
  never escapes into advance_stage.

Offline: isolated sqlite DB; Plane/Telegram/notifications mocked at stage_engine.
"""
|
||
import os
|
||
import tempfile
|
||
|
||
import pytest
|
||
|
||
# Environment bootstrap. These assignments sit above the ``src`` imports in this
# module, so they are in place before that package is imported.
# The sqlite file lives in the system temp dir under a fixed, test-specific name.
_test_db = os.path.join(tempfile.gettempdir(), "test_orch110_routing.db")
os.environ["ORCH_DB_PATH"] = _test_db
os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir()
# Dummy credentials: only filled in when the caller has not already exported them.
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
|
||
|
||
from unittest.mock import MagicMock # noqa: E402
|
||
|
||
import src.db as _db # noqa: E402
|
||
from src.db import init_db, get_db # noqa: E402
|
||
from src import stage_engine # noqa: E402
|
||
from src.stage_engine import advance_stage # noqa: E402
|
||
|
||
# Identity of the single task every test in this module drives through the engine.
_REPO = "orchestrator"
_WI = "ORCH-110"
_BRANCH = "feature/ORCH-110-x"
# Failure reason handed back by the stubbed ``check_branch_mergeable`` to simulate
# an infra re-test timeout.
_TIMEOUT_REASON = "re-test timeout after 900s"
|
||
|
||
|
||
@pytest.fixture(autouse=True)
def fresh_db(monkeypatch):
    """Start every test from an empty, freshly initialised sqlite database.

    Points ``src.db``'s settings at the module's temp DB file, removes any file
    left over from a previous test, and runs ``init_db()`` before the test body.
    """
    monkeypatch.setattr(_db.settings, "db_path", _test_db)
    if os.path.exists(_test_db):
        os.unlink(_test_db)
    init_db()
    yield
|
||
|
||
|
||
@pytest.fixture(autouse=True)
def silence_side_effects(monkeypatch):
    """Replace stage_engine's outbound notification hooks with ``MagicMock`` objects.

    The mocks stay reachable as ``stage_engine.<name>``, so tests can assert on
    whether a given hook was called. Manual deploy approval is also switched off.
    """
    for name in (
        "notify_stage_change", "notify_qg_failure", "send_telegram",
        "plane_notify_stage", "plane_notify_qg", "plane_add_comment",
        "set_issue_in_progress", "set_issue_blocked",
    ):
        monkeypatch.setattr(stage_engine, name, MagicMock())
    # The merge-gate sub-gate runs only AFTER the stage QG + security + coverage pass;
    # the self-deploy Phase A interception is irrelevant (merge-gate intervenes first).
    monkeypatch.setattr(stage_engine.settings, "deploy_require_manual_approve", False)
|
||
|
||
|
||
@pytest.fixture(autouse=True)
def tolerance_on(monkeypatch):
    """Enable infra-timeout tolerance with a retry budget of 2 and a 120 s delay.

    Individual tests override these settings again when they need another value.
    """
    monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_tolerance_enabled", True)
    monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_max_retries", 2)
    monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_retry_delay_s", 120)
|
||
|
||
|
||
def _pass(*a, **k):
    """Stub quality-gate check: accept any arguments and report success."""
    return (True, "ok")
|
||
|
||
|
||
def _fail(reason):
    """Build a stub quality-gate check that always fails with ``reason``.

    The returned callable accepts any arguments and returns ``(False, reason)``.
    """
    def _f(*a, **k):
        return (False, reason)
    return _f
|
||
|
||
|
||
def _patch_gates(monkeypatch, merge_reason):
    """Swap ``stage_engine.QG_CHECKS`` for a copy with stubbed deploy-staging gates.

    Staging status, security, coverage and image-freshness are forced to pass;
    ``check_branch_mergeable`` is forced to fail with ``merge_reason``. All other
    registry entries are carried over unchanged, and monkeypatch restores the
    original registry after the test.
    """
    monkeypatch.setattr(
        stage_engine, "QG_CHECKS",
        {**stage_engine.QG_CHECKS,
         "check_staging_status": _pass,
         "check_security_gate": _pass,
         "check_coverage_gate": _pass,
         "check_branch_mergeable": _fail(merge_reason),
         "check_staging_image_fresh": _pass},
    )
|
||
|
||
|
||
def _make_task(stage="deploy-staging"):
    """Insert one task row for the module's work item and return its row id.

    ``stage`` is the stage the new task starts in. The connection is closed even
    when the insert or commit fails, so a broken test does not leak a handle.
    """
    conn = get_db()
    try:
        cur = conn.execute(
            "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) VALUES (?,?,?,?,?)",
            (f"plane-{_WI}", _WI, _REPO, _BRANCH, stage),
        )
        tid = cur.lastrowid
        conn.commit()
    finally:
        conn.close()
    return tid
|
||
|
||
|
||
def _stage(task_id):
    """Return the current ``stage`` value stored for ``task_id``.

    Fails with an explicit assertion when the task row does not exist, instead
    of an opaque ``TypeError`` from indexing ``None``.
    """
    conn = get_db()
    try:
        row = conn.execute("SELECT stage FROM tasks WHERE id=?", (task_id,)).fetchone()
    finally:
        conn.close()
    assert row is not None, f"task {task_id} not found"
    return row[0]
|
||
|
||
|
||
def _jobs():
    """Return every row of ``jobs`` (agent, task_content, available_at) as dicts.

    Rows are ordered by id. The connection is closed even if the query raises.
    """
    conn = get_db()
    try:
        rows = conn.execute(
            "SELECT agent, task_content, available_at FROM jobs ORDER BY id"
        ).fetchall()
    finally:
        conn.close()
    return [dict(r) for r in rows]
|
||
|
||
|
||
def _seed_infra_retry_jobs(task_id, n):
    """Insert ``n`` deployer jobs for ``task_id`` carrying the infra-retry note.

    Used to simulate retries that already happened before the test calls the
    engine. The connection is closed even if an insert or the commit raises.
    """
    conn = get_db()
    try:
        for _ in range(n):
            conn.execute(
                "INSERT INTO jobs (agent, repo, task_id, task_content) "
                "VALUES ('deployer','orchestrator',?, 'Note: merge-gate infra-timeout retry')",
                (task_id,),
            )
        conn.commit()
    finally:
        conn.close()
|
||
|
||
|
||
def _advance(task_id):
    """Call ``advance_stage`` for ``task_id`` from deploy-staging, as if the deployer just finished."""
    return advance_stage(
        task_id, "deploy-staging", _REPO, _WI, _BRANCH, finished_agent="deployer"
    )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# TC-04 — infra-timeout -> bounded infra-retry, NOT a rollback to development.
# ---------------------------------------------------------------------------
def test_tc04_infra_timeout_reschedules_not_rollback(monkeypatch):
    """A merge-gate timeout re-queues the deployer with a delay and leaves the task in place."""
    _patch_gates(monkeypatch, _TIMEOUT_REASON)
    task_id = _make_task()
    res = _advance(task_id)

    assert res.advanced is False
    assert res.rolled_back_to is None  # NOT a code-fault rollback
    assert res.note == "merge-gate-infra-retry"
    assert _stage(task_id) == "deploy-staging"  # stays put
    jobs = _jobs()
    assert len(jobs) == 1
    assert jobs[0]["agent"] == "deployer"  # re-queued staging-deployer, NOT developer
    assert "merge-gate infra-timeout retry" in jobs[0]["task_content"]
    assert jobs[0]["available_at"] is not None  # delayed re-pickup
    assert stage_engine.set_issue_blocked.called is False
    # No developer-retry semantics: the rollback comment / in-progress is never set.
    assert stage_engine.set_issue_in_progress.called is False
|
||
|
||
|
||
def test_tc04_killswitch_within_routing_is_observed(monkeypatch):
    """The infra-timeout always bumps the timeout counter (observability), even when
    routed to the retry path."""
    _patch_gates(monkeypatch, _TIMEOUT_REASON)
    # Compare against a baseline: the test only asserts a +1 delta, not an absolute value.
    before = stage_engine.merge_gate.merge_gate_status()["retest_timeout_total"]
    task_id = _make_task()
    _advance(task_id)
    after = stage_engine.merge_gate.merge_gate_status()["retest_timeout_total"]
    assert after == before + 1
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# TC-05 — a deterministically RED re-test STILL rolls back (BR-6 / AC-3).
# ---------------------------------------------------------------------------
def test_tc05_red_retest_still_rolls_back(monkeypatch):
    """A red re-test is a code fault: the task rolls back to development and the developer is re-queued."""
    _patch_gates(monkeypatch, "re-test failed after rebase: 1 failed, 5 passed")
    task_id = _make_task()
    res = _advance(task_id)

    assert res.advanced is False
    assert res.rolled_back_to == "development"
    assert _stage(task_id) == "development"
    jobs = _jobs()
    assert len(jobs) == 1
    assert jobs[0]["agent"] == "developer"  # developer re-queued (retry)
|
||
|
||
|
||
def test_tc05_conflict_still_rolls_back(monkeypatch):
    """A rebase conflict also rolls back to development with a developer job."""
    _patch_gates(monkeypatch, "rebase conflict: src/db.py")
    task_id = _make_task()
    res = _advance(task_id)
    assert res.rolled_back_to == "development"
    assert _stage(task_id) == "development"
    assert _jobs()[0]["agent"] == "developer"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# TC-06 — anti-loop: infra-retry is bounded; exhaustion -> ONE infra-alert.
# ---------------------------------------------------------------------------
def test_tc06_infra_retry_bounded_then_infra_alert(monkeypatch):
    """With the retry budget already used up, a timeout blocks the task and alerts.

    The task must stay on deploy-staging, no further job may be enqueued, and the
    Telegram text must identify the problem as infra rather than a code defect.
    """
    monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_max_retries", 2)
    _patch_gates(monkeypatch, _TIMEOUT_REASON)
    task_id = _make_task()
    _seed_infra_retry_jobs(task_id, 2)  # budget already spent

    res = _advance(task_id)
    assert res.advanced is False
    assert res.rolled_back_to is None  # NOT a rollback even at exhaustion
    assert res.note == "merge-gate-infra-retry-exhausted"
    assert res.alerted is True
    assert _stage(task_id) == "deploy-staging"  # NOT moved to development
    assert stage_engine.set_issue_blocked.called
    assert stage_engine.send_telegram.called
    # No NEW retry job past the cap (still only the 2 we seeded).
    assert len(_jobs()) == 2
    # The alert is INFRA-specific, not "developer must fix".
    msg = stage_engine.send_telegram.call_args[0][0]
    assert "infra" in msg.lower() or "ресурс" in msg.lower()
    assert "НЕ дефект кода" in msg
|
||
|
||
|
||
def test_tc06_below_budget_keeps_retrying(monkeypatch):
    """While retries remain in the budget, a timeout schedules another retry and does not alert."""
    monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_max_retries", 3)
    _patch_gates(monkeypatch, _TIMEOUT_REASON)
    task_id = _make_task()
    _seed_infra_retry_jobs(task_id, 1)  # one retry already done, budget 3

    res = _advance(task_id)
    assert res.note == "merge-gate-infra-retry"
    assert res.alerted is not True
    # The seeded job + the new retry job.
    assert len(_jobs()) == 2
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# TC-09 — never-raise: an error in the transient path is swallowed.
# ---------------------------------------------------------------------------
def test_tc09_infra_retry_never_raises(monkeypatch):
    """An exception from ``enqueue_job`` on the retry path must not escape ``advance_stage``."""
    _patch_gates(monkeypatch, _TIMEOUT_REASON)

    def _boom(*a, **k):
        raise RuntimeError("enqueue exploded")

    monkeypatch.setattr(stage_engine, "enqueue_job", _boom)
    task_id = _make_task()
    # Must NOT raise into advance_stage.
    res = _advance(task_id)
    assert res.note == "merge-gate-infra-retry-error"
    assert _stage(task_id) == "deploy-staging"  # left for the reconciler/reaper
|
||
|
||
|
||
def test_tc09_killswitch_off_falls_back_to_rollback(monkeypatch):
    """tolerance off -> a timeout takes the prior rollback path byte-for-byte (NFR-2)."""
    # Overrides the autouse fixture that switches tolerance on for every other test.
    monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_tolerance_enabled", False)
    _patch_gates(monkeypatch, _TIMEOUT_REASON)
    task_id = _make_task()
    res = _advance(task_id)
    assert res.rolled_back_to == "development"
    assert _stage(task_id) == "development"
    assert _jobs()[0]["agent"] == "developer"
|