Eliminate the false `deploy-staging -> development` rollback that fired when the merge-gate local re-test timed out (infra/resource) on a green CI + tester + staging branch (incident ORCH-109/PR #129: a 516.7s suite blew its 600s budget under CPU starvation from orphaned pytest processes -> timeout misrouted as a code fault -> developer-retry loop -> manual gate).

Additive, 5 independent kill-switches, never-raise, self-hosting scope. Untouched byte-for-byte: STAGE_TRANSITIONS, the QG_CHECKS registry, check_branch_mergeable name/semantics, machine-verdict keys, the DB schema. INV-4 (never push/force-push main) and the no-prod-restart rule are preserved.

- D1: new stdlib-only leaf src/proc_group.py runs the spawned re-test/coverage pytest in its own process group (start_new_session) and tree-kills the WHOLE group on timeout (os.killpg SIGTERM->grace->SIGKILL); used by merge_gate.retest_branch and coverage_gate.measure_coverage. No orphan leak. Fallback never-break: subprocess_tree_kill_enabled=False / non-POSIX -> the prior subprocess.run.
- D2/D3: merge_gate.classify_retest_failure distinguishes timeout/red/lock-busy/other; an infra timeout routes to _handle_merge_gate_infra_retry (bounded re-queue, task stays on deploy-staging, no rollback / no developer-retry); a red re-test / conflict still rolls back (BR-6). Exhaustion -> one infra alert.
- D4: skip the local re-test when the pre-merge rebase was a proven no-op (HEAD already CI/tester/staging-validated); fail-safe runs the re-test on any uncertainty. Flag merge_retest_skip_when_current_enabled.
- D5: merge_retest_timeout_s 600 -> 900 + _resolve_retest_timeout validation; reaper_max_running_s invariant preserved without change.
- D6: in-process counters + read-only merge_gate block in GET /queue; appended ("ORCH-110","classify_retest_failure","src/merge_gate.py") to MAIN_REGRESSION_MARKERS.

Docs (README/internals overview/CLAUDE/CHANGELOG/.env.example) updated in the same PR.

Tests: tests/test_orch110_*.py (TC-01..TC-12, incl. the red-before/green-after incident regression). Full suite green (1988 passed).

Refs: ORCH-110
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
136 lines
5.8 KiB
Python
136 lines
5.8 KiB
Python
"""ORCH-110 TC-12: observability of the infra-timeout path (D6 / FR-6 / AC-9).

The infra-timeout state must be (a) reflected in read-only counters, (b) surfaced in
the additive ``merge_gate`` block of ``GET /queue``, and (c) distinguishable from a
code-fault rollback — with the exhaustion alert carrying the CLICKABLE issue number
and an explicitly infrastructural (NOT "developer must fix") wording. No dedup/overlap
with ORCH-111 (which only OBSERVES surviving processes; ORCH-110 prevents/tolerates).
"""
|
||
import asyncio
|
||
import os
|
||
import tempfile
|
||
|
||
# Test-environment bootstrap. The project imports that follow in this file carry
# E402 suppressions, i.e. they are deliberately placed after these assignments —
# presumably because src.* reads the ORCH_* variables at import time (confirm).
_test_db = os.path.join(tempfile.gettempdir(), "test_orch110_observability.db")

# These two are always overridden so the suite uses its own DB file and repos dir.
os.environ.update(ORCH_DB_PATH=_test_db, ORCH_REPOS_DIR=tempfile.gettempdir())

# The tokens only receive a placeholder when the caller has not exported one.
for _token_name in ("ORCH_GITEA_TOKEN", "ORCH_PLANE_API_TOKEN"):
    os.environ.setdefault(_token_name, "test-token")
|
||
|
||
from unittest.mock import MagicMock # noqa: E402
|
||
|
||
import pytest # noqa: E402
|
||
|
||
import src.db as _db # noqa: E402
|
||
from src.db import init_db, get_db # noqa: E402
|
||
from src import merge_gate # noqa: E402
|
||
from src import stage_engine # noqa: E402
|
||
from src.stage_engine import AdvanceResult # noqa: E402
|
||
|
||
|
||
@pytest.fixture(autouse=True)
def fresh_db(monkeypatch):
    """Give every test an empty database and zeroed merge-gate counters.

    Points ``_db.settings.db_path`` at the test database file, removes any
    file left over from an earlier test, re-creates it via ``init_db()`` and
    resets the in-process ``merge_gate`` counters before yielding.
    """
    monkeypatch.setattr(_db.settings, "db_path", _test_db)

    stale_db_present = os.path.exists(_test_db)
    if stale_db_present:
        os.unlink(_test_db)
    init_db()

    # Known baseline for the in-process counters, applied in one update call.
    counter_baseline = {
        "retest_timeout_total": 0,
        "retest_infra_retry_total": 0,
        "retest_infra_exhausted_total": 0,
        "retest_skipped_current_total": 0,
        "last_infra_timeout_wi": None,
    }
    merge_gate._MERGE_GATE_COUNTERS.update(**counter_baseline)

    yield
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# merge_gate_status() snapshot + counter increments.
# ---------------------------------------------------------------------------
|
||
def test_tc12_status_exposes_flags_and_counters():
    """``merge_gate_status()`` must expose every flag and counter key listed here."""
    flag_keys = (
        "infra_tolerance_enabled",
        "infra_max_retries",
        "infra_retry_delay_s",
        "skip_when_current_enabled",
        "tree_kill_enabled",
        "retest_timeout_s",
    )
    counter_keys = (
        "retest_timeout_total",
        "retest_infra_retry_total",
        "retest_infra_exhausted_total",
        "retest_skipped_current_total",
        "last_infra_timeout_wi",
    )
    status = merge_gate.merge_gate_status()
    # Checked one key at a time so the failure message names the first missing key.
    for key in flag_keys + counter_keys:
        assert key in status, f"missing /queue merge_gate key: {key}"
|
||
|
||
|
||
def test_tc12_counters_track_infra_timeout_distinctly():
    """After one call to each ``note_*`` helper, every total in the snapshot reads 1."""
    merge_gate.note_retest_timeout("ORCH-110")
    for bump in (
        merge_gate.note_retest_infra_retry,
        merge_gate.note_retest_infra_exhausted,
        merge_gate.note_retest_skipped_current,
    ):
        bump()

    status = merge_gate.merge_gate_status()
    for total_key in (
        "retest_timeout_total",
        "retest_infra_retry_total",
        "retest_infra_exhausted_total",
        "retest_skipped_current_total",
    ):
        assert status[total_key] == 1

    # Distinguishable from a code-fault: the last infra-timeout WI is tracked here,
    # NOT in the merge-verify (code/merge) counters.
    assert status["last_infra_timeout_wi"] == "ORCH-110"
|
||
|
||
|
||
def test_tc12_status_never_raises(monkeypatch):
    """With a settings attribute removed, the snapshot must still come back as a dict."""
    # raising=False: the removal itself must not fail if the attribute is absent.
    monkeypatch.delattr(
        merge_gate.settings,
        "merge_retest_infra_tolerance_enabled",
        raising=False,
    )
    # A broken settings attribute -> the snapshot degrades, never raises.
    assert isinstance(merge_gate.merge_gate_status(), dict)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# GET /queue carries the additive merge_gate block (and the legacy keys stay).
# ---------------------------------------------------------------------------
|
||
def test_tc12_queue_endpoint_includes_merge_gate_block():
    """The ``queue`` coroutine's result has a ``merge_gate`` block next to the older keys."""
    # Imported inside the test, as in the original, so src.main loads only here.
    from src.main import queue

    payload = asyncio.run(queue())

    assert "merge_gate" in payload
    assert "infra_tolerance_enabled" in payload["merge_gate"]
    # The pre-existing observability keys are untouched (additive only).
    for legacy_key in ("merge_verify", "coverage"):
        assert legacy_key in payload
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# The exhaustion alert is INFRA-specific + carries the clickable issue number,
# distinct from the code-fault "Merge-gate still failing after N developer retries".
# ---------------------------------------------------------------------------
|
||
def test_tc12_exhaustion_alert_is_infra_specific_and_clickable(monkeypatch):
    """Check the alert sent once the infra-retry budget is already used up.

    Seeds one task on ``deploy-staging`` plus as many prior infra-timeout
    retry jobs as the configured maximum, calls
    ``stage_engine._handle_merge_gate_infra_retry`` and asserts that:

    * the result note is ``"merge-gate-infra-retry-exhausted"``;
    * the captured Telegram message contains the work-item id;
    * the message does not use the code-fault "developer retries" wording;
    * the message contains the explicit infrastructural marker.
    """
    # One source of truth for the budget: used for both the settings patch and
    # the number of seeded retry jobs, so the two cannot drift apart.
    max_retries = 2
    sent = {}

    def _tg(msg):
        sent["msg"] = msg

    monkeypatch.setattr(stage_engine, "send_telegram", _tg)
    monkeypatch.setattr(stage_engine, "set_issue_blocked", MagicMock())
    monkeypatch.setattr(stage_engine, "plane_add_comment", MagicMock())
    # link_for builds the clickable number; use a recognisable sentinel.
    monkeypatch.setattr(stage_engine, "link_for", lambda wi, **k: f"<a href='x'>{wi}</a>")
    monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_max_retries", max_retries)

    # Seed an exhausted budget for the task.
    conn = get_db()
    try:
        cur = conn.execute(
            "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) VALUES (?,?,?,?,?)",
            ("plane-ORCH-110", "ORCH-110", "orchestrator", "feature/ORCH-110-x", "deploy-staging"),
        )
        task_id = cur.lastrowid
        for _ in range(max_retries):
            conn.execute(
                "INSERT INTO jobs (agent, repo, task_id, task_content) "
                "VALUES ('deployer','orchestrator',?, 'Note: merge-gate infra-timeout retry')",
                (task_id,),
            )
        conn.commit()
    finally:
        # Close even when seeding fails, so a broken insert does not leak the
        # connection into the tests that run afterwards.
        conn.close()

    res = AdvanceResult()
    stage_engine._handle_merge_gate_infra_retry(
        task_id, "deploy-staging", "orchestrator", "ORCH-110",
        "feature/ORCH-110-x", "re-test timeout after 900s", res,
    )

    assert res.note == "merge-gate-infra-retry-exhausted"
    # Fail with a readable message instead of a KeyError when no alert was sent.
    assert "msg" in sent, "exhaustion path did not send a Telegram alert"
    assert "ORCH-110" in sent["msg"]  # clickable issue number present
    assert "developer retries" not in sent["msg"]  # NOT the code-fault wording
    # Russian for "NOT a code defect": the explicitly infrastructural marker.
    assert "НЕ дефект кода" in sent["msg"]
|