Files
orchestrator/tests/test_orch110_merge_gate_routing.py
claude-bot 651b9af7c3 fix(merge-gate): tolerate re-test infra-timeout + tree-kill spawned pytest
Eliminate the false `deploy-staging -> development` rollback that fired when the
merge-gate local re-test timed out (infra/resource) on a green CI + tester +
staging branch (incident ORCH-109/PR #129: a 516.7s suite blew its 600s budget
under CPU starvation from orphaned pytest processes -> timeout misrouted as a
code fault -> developer-retry loop -> manual gate).

Additive, 5 independent kill-switches, never-raise, self-hosting scope. Untouched
byte-for-byte: STAGE_TRANSITIONS, the QG_CHECKS registry, check_branch_mergeable
name/semantics, machine-verdict keys, the DB schema. INV-4 (never push/force-push
main) and the no-prod-restart rule are preserved.

- D1: new stdlib-only leaf src/proc_group.py runs the spawned re-test/coverage
  pytest in its own process group (start_new_session) and tree-kills the WHOLE
  group on timeout (os.killpg SIGTERM->grace->SIGKILL); used by
  merge_gate.retest_branch and coverage_gate.measure_coverage. No orphan leak.
  Fallback never-break: subprocess_tree_kill_enabled=False / non-POSIX -> the
  prior subprocess.run.
- D2/D3: merge_gate.classify_retest_failure distinguishes timeout/red/lock-busy/
  other; an infra timeout routes to _handle_merge_gate_infra_retry (bounded
  re-queue, task stays on deploy-staging, no rollback / no developer-retry); a
  red re-test / conflict still rolls back (BR-6). Exhaustion -> one infra alert.
- D4: skip the local re-test when the pre-merge rebase was a proven no-op (HEAD
  already CI/tester/staging-validated); fail-safe runs the re-test on any
  uncertainty. Flag merge_retest_skip_when_current_enabled.
- D5: merge_retest_timeout_s 600 -> 900 + _resolve_retest_timeout validation;
  reaper_max_running_s invariant preserved without change.
- D6: in-process counters + read-only merge_gate block in GET /queue; appended
  ("ORCH-110","classify_retest_failure","src/merge_gate.py") to
  MAIN_REGRESSION_MARKERS. Docs (README/internals overview/CLAUDE/CHANGELOG/
  .env.example) updated in the same PR.

Tests: tests/test_orch110_*.py (TC-01..TC-12, incl. the red-before/green-after
incident regression). Full suite green (1988 passed).

Refs: ORCH-110

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 10:42:34 +03:00

262 lines
9.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""ORCH-110 TC-04 / TC-05 / TC-06 / TC-09: merge-gate infra-timeout routing.

Drives the engine (``stage_engine.advance_stage``) on the deploy-staging -> deploy
edge with ``check_branch_mergeable`` monkeypatched, exactly like the existing
``test_stage_engine.TestMergeGate`` suite, and asserts the NEW routing (D3):

* TC-04 — an INFRA re-test timeout -> bounded infra-retry (re-queue the
  staging-deployer with a delay, task STAYS on deploy-staging) — NOT a rollback to
  development and NOT a developer-retry.
* TC-05 — a deterministically RED re-test STILL rolls back to development +
  developer retry (BR-6 / AC-3 anti-over-tolerance).
* TC-06 — the infra-retry is bounded (anti-loop): after the budget it blocks with
  ONE infra-alert, no infinite bounce, no new job, task NOT in development.
* TC-09 — never-raise: an error in the transient path is swallowed (a WARNING) and
  never escapes into advance_stage.

Offline: isolated sqlite DB; Plane/Telegram/notifications mocked at stage_engine.
"""
import os
import tempfile
import pytest
# Point the orchestrator at an isolated sqlite file and dummy credentials.
# These assignments sit before the ``src`` imports that follow (which is why
# those imports carry ``noqa: E402``) — presumably so the settings object
# picks the values up at import time; confirm against the settings module.
_test_db = os.path.join(tempfile.gettempdir(), "test_orch110_routing.db")
os.environ["ORCH_DB_PATH"] = _test_db
os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir()
# setdefault: a token already present in the environment is left untouched.
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
from unittest.mock import MagicMock # noqa: E402
import src.db as _db # noqa: E402
from src.db import init_db, get_db # noqa: E402
from src import stage_engine # noqa: E402
from src.stage_engine import advance_stage # noqa: E402
# Identity of the single task that every test in this module creates and drives.
_REPO = "orchestrator"
_WI = "ORCH-110"
_BRANCH = "feature/ORCH-110-x"
# Failure reason returned by the patched ``check_branch_mergeable`` to simulate
# an infra timeout. NOTE(review): presumably this text is what the engine's
# timeout classification keys on — confirm against src/merge_gate.py.
_TIMEOUT_REASON = "re-test timeout after 900s"
@pytest.fixture(autouse=True)
def fresh_db(monkeypatch):
    """Give every test a brand-new database file.

    Redirects ``_db.settings.db_path`` to the module's test DB path, removes a
    file left over at that path if one exists, then calls ``init_db()``.
    Nothing is torn down after the test body runs.
    """
    monkeypatch.setattr(_db.settings, "db_path", _test_db)
    leftover = os.path.exists(_test_db)
    if leftover:
        os.unlink(_test_db)
    init_db()
    yield None
@pytest.fixture(autouse=True)
def silence_side_effects(monkeypatch):
    """Swap the engine's outbound hooks for MagicMocks and turn off manual approve.

    Each listed attribute of ``stage_engine`` gets its own fresh ``MagicMock``,
    so tests can inspect ``.called`` / ``.call_args`` on them afterwards.
    """
    outbound_hooks = (
        "notify_stage_change",
        "notify_qg_failure",
        "send_telegram",
        "plane_notify_stage",
        "plane_notify_qg",
        "plane_add_comment",
        "set_issue_in_progress",
        "set_issue_blocked",
    )
    for hook in outbound_hooks:
        monkeypatch.setattr(stage_engine, hook, MagicMock())
    # The merge-gate sub-gate runs only AFTER the stage QG + security + coverage pass;
    # the self-deploy Phase A interception is irrelevant (merge-gate intervenes first).
    monkeypatch.setattr(
        stage_engine.settings, "deploy_require_manual_approve", False
    )
@pytest.fixture(autouse=True)
def tolerance_on(monkeypatch):
    """Default every test to infra tolerance enabled, max_retries=2, retry_delay_s=120.

    Individual tests override single settings again via ``monkeypatch`` when
    they need a different budget or the kill-switch off.
    """
    overrides = {
        "merge_retest_infra_tolerance_enabled": True,
        "merge_retest_infra_max_retries": 2,
        "merge_retest_infra_retry_delay_s": 120,
    }
    for attr, value in overrides.items():
        monkeypatch.setattr(stage_engine.settings, attr, value)
def _pass(*a, **k):
return (True, "ok")
def _fail(reason):
def _f(*a, **k):
return (False, reason)
return _f
def _patch_gates(monkeypatch, merge_reason):
    """Replace ``stage_engine.QG_CHECKS`` so only the merge gate fails.

    The existing registry is copied; the staging-status, security, coverage and
    staging-image-fresh checks are forced to pass, and ``check_branch_mergeable``
    is forced to fail with ``merge_reason``.
    """
    patched = dict(stage_engine.QG_CHECKS)
    patched.update(
        check_staging_status=_pass,
        check_security_gate=_pass,
        check_coverage_gate=_pass,
        check_branch_mergeable=_fail(merge_reason),
        check_staging_image_fresh=_pass,
    )
    monkeypatch.setattr(stage_engine, "QG_CHECKS", patched)
def _make_task(stage="deploy-staging"):
    """Insert one task row for the module's work item and return its id.

    Args:
        stage: value stored in the ``stage`` column (default ``"deploy-staging"``).

    Returns:
        The ``lastrowid`` reported by the INSERT cursor.
    """
    conn = get_db()
    try:
        cur = conn.execute(
            "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) VALUES (?,?,?,?,?)",
            (f"plane-{_WI}", _WI, _REPO, _BRANCH, stage),
        )
        tid = cur.lastrowid
        conn.commit()
        return tid
    finally:
        # Close even when the INSERT/commit raises, so a failing test does not
        # leave an open handle on the DB file that per-test setup deletes.
        conn.close()
def _stage(task_id):
    """Return the current ``stage`` column of the task with id ``task_id``.

    Raises:
        AssertionError: when no task row with that id exists (instead of an
            opaque ``TypeError`` from subscripting ``None``).
    """
    conn = get_db()
    try:
        row = conn.execute(
            "SELECT stage FROM tasks WHERE id=?", (task_id,)
        ).fetchone()
    finally:
        # Always release the connection, including when the query raises.
        conn.close()
    assert row is not None, f"no task row with id={task_id!r}"
    return row[0]
def _jobs():
    """Return every row of ``jobs`` (ordered by id) as a list of dicts.

    Each dict carries the ``agent``, ``task_content`` and ``available_at``
    columns. NOTE(review): ``dict(row)`` assumes ``get_db()`` yields mapping-like
    rows (e.g. a ``sqlite3.Row`` row factory) — confirm in src.db.
    """
    conn = get_db()
    try:
        rows = conn.execute(
            "SELECT agent, task_content, available_at FROM jobs ORDER BY id"
        ).fetchall()
    finally:
        # Always release the connection, including when the query raises.
        conn.close()
    return [dict(r) for r in rows]
def _seed_infra_retry_jobs(task_id, n):
    """Insert ``n`` deployer jobs marked as merge-gate infra-timeout retries.

    Used to simulate retries that were already spent for ``task_id`` before the
    engine is driven. ``n <= 0`` inserts nothing.
    """
    conn = get_db()
    try:
        for _ in range(n):
            conn.execute(
                "INSERT INTO jobs (agent, repo, task_id, task_content) "
                "VALUES ('deployer','orchestrator',?, 'Note: merge-gate infra-timeout retry')",
                (task_id,),
            )
        conn.commit()
    finally:
        # Close even when an INSERT/commit raises, so a failing test does not
        # leave an open handle on the DB file that per-test setup deletes.
        conn.close()
def _advance(task_id):
    """Call ``advance_stage`` for ``task_id`` from stage ``deploy-staging``.

    Uses the module's repo / work-item / branch constants and
    ``finished_agent="deployer"``; returns whatever ``advance_stage`` returns.
    """
    outcome = advance_stage(
        task_id,
        "deploy-staging",
        _REPO,
        _WI,
        _BRANCH,
        finished_agent="deployer",
    )
    return outcome
# ---------------------------------------------------------------------------
# TC-04 — infra-timeout -> bounded infra-retry, NOT a rollback to development.
# ---------------------------------------------------------------------------
def test_tc04_infra_timeout_reschedules_not_rollback(monkeypatch):
    """A merge-gate timeout re-queues the deployer and leaves the task in place."""
    _patch_gates(monkeypatch, _TIMEOUT_REASON)
    task_id = _make_task()

    outcome = _advance(task_id)

    assert outcome.advanced is False
    assert outcome.rolled_back_to is None  # NOT a code-fault rollback
    assert outcome.note == "merge-gate-infra-retry"
    assert _stage(task_id) == "deploy-staging"  # stays put

    queued = _jobs()
    assert len(queued) == 1
    retry_job = queued[0]
    assert retry_job["agent"] == "deployer"  # re-queued staging-deployer, NOT developer
    assert "merge-gate infra-timeout retry" in retry_job["task_content"]
    assert retry_job["available_at"] is not None  # delayed re-pickup

    stage_engine.set_issue_blocked.assert_not_called()
    # No developer-retry semantics: the rollback comment / in-progress is never set.
    stage_engine.set_issue_in_progress.assert_not_called()
def test_tc04_killswitch_within_routing_is_observed(monkeypatch):
    """An infra timeout bumps ``retest_timeout_total`` by one even when it is
    routed to the retry path (observability)."""

    def _timeouts_seen():
        return stage_engine.merge_gate.merge_gate_status()["retest_timeout_total"]

    _patch_gates(monkeypatch, _TIMEOUT_REASON)
    baseline = _timeouts_seen()
    task_id = _make_task()
    _advance(task_id)
    assert _timeouts_seen() == baseline + 1
# ---------------------------------------------------------------------------
# TC-05 — a deterministically RED re-test STILL rolls back (BR-6 / AC-3).
# ---------------------------------------------------------------------------
def test_tc05_red_retest_still_rolls_back(monkeypatch):
    """A red re-test still rolls the task back to development and re-queues the developer."""
    red_reason = "re-test failed after rebase: 1 failed, 5 passed"
    _patch_gates(monkeypatch, red_reason)
    task_id = _make_task()

    outcome = _advance(task_id)

    assert outcome.advanced is False
    assert outcome.rolled_back_to == "development"
    assert _stage(task_id) == "development"

    queued = _jobs()
    assert len(queued) == 1
    assert queued[0]["agent"] == "developer"  # developer re-queued (retry)
def test_tc05_conflict_still_rolls_back(monkeypatch):
    """A rebase conflict still rolls the task back to development; the first job is the developer's."""
    conflict_reason = "rebase conflict: src/db.py"
    _patch_gates(monkeypatch, conflict_reason)
    task_id = _make_task()

    outcome = _advance(task_id)

    assert outcome.rolled_back_to == "development"
    assert _stage(task_id) == "development"
    first_job = _jobs()[0]
    assert first_job["agent"] == "developer"
# ---------------------------------------------------------------------------
# TC-06 — anti-loop: infra-retry is bounded; exhaustion -> ONE infra-alert.
# ---------------------------------------------------------------------------
def test_tc06_infra_retry_bounded_then_infra_alert(monkeypatch):
    """With the retry budget already spent, a timeout blocks the task with an infra alert."""
    budget = 2
    monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_max_retries", budget)
    _patch_gates(monkeypatch, _TIMEOUT_REASON)
    task_id = _make_task()
    _seed_infra_retry_jobs(task_id, budget)  # budget already spent

    outcome = _advance(task_id)

    assert outcome.advanced is False
    assert outcome.rolled_back_to is None  # NOT a rollback even at exhaustion
    assert outcome.note == "merge-gate-infra-retry-exhausted"
    assert outcome.alerted is True
    assert _stage(task_id) == "deploy-staging"  # NOT moved to development
    stage_engine.set_issue_blocked.assert_called()
    stage_engine.send_telegram.assert_called()
    # No NEW retry job past the cap (still only the 2 we seeded).
    assert len(_jobs()) == 2
    # The alert is INFRA-specific, not "developer must fix".
    positional_args = stage_engine.send_telegram.call_args[0]
    alert_text = positional_args[0]
    lowered = alert_text.lower()
    assert any(marker in lowered for marker in ("infra", "ресурс"))
    assert "НЕ дефект кода" in alert_text
def test_tc06_below_budget_keeps_retrying(monkeypatch):
    """With one of three retries used, a timeout schedules another retry and does not alert."""
    monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_max_retries", 3)
    _patch_gates(monkeypatch, _TIMEOUT_REASON)
    task_id = _make_task()
    already_used = 1  # one retry already done, budget 3
    _seed_infra_retry_jobs(task_id, already_used)

    outcome = _advance(task_id)

    assert outcome.note == "merge-gate-infra-retry"
    assert outcome.alerted is not True
    # The seeded job + the new retry job.
    queued = _jobs()
    assert len(queued) == 2
# ---------------------------------------------------------------------------
# TC-09 — never-raise: an error in the transient path is swallowed.
# ---------------------------------------------------------------------------
def test_tc09_infra_retry_never_raises(monkeypatch):
    """A failing ``enqueue_job`` on the retry path is reported via the result note, not raised."""
    _patch_gates(monkeypatch, _TIMEOUT_REASON)

    def _exploding_enqueue(*_args, **_kwargs):
        raise RuntimeError("enqueue exploded")

    monkeypatch.setattr(stage_engine, "enqueue_job", _exploding_enqueue)
    task_id = _make_task()

    # Must NOT raise into advance_stage.
    outcome = _advance(task_id)

    assert outcome.note == "merge-gate-infra-retry-error"
    assert _stage(task_id) == "deploy-staging"  # left for the reconciler/reaper
def test_tc09_killswitch_off_falls_back_to_rollback(monkeypatch):
    """With the tolerance kill-switch off, a timeout takes the prior rollback
    path: task back on development, developer re-queued (NFR-2)."""
    monkeypatch.setattr(
        stage_engine.settings, "merge_retest_infra_tolerance_enabled", False
    )
    _patch_gates(monkeypatch, _TIMEOUT_REASON)
    task_id = _make_task()

    outcome = _advance(task_id)

    assert outcome.rolled_back_to == "development"
    assert _stage(task_id) == "development"
    first_job = _jobs()[0]
    assert first_job["agent"] == "developer"