Files
orchestrator/tests/test_orch110_observability.py
claude-bot 651b9af7c3 fix(merge-gate): tolerate re-test infra-timeout + tree-kill spawned pytest
Eliminate the false `deploy-staging -> development` rollback that fired when the
merge-gate local re-test timed out (infra/resource) on a green CI + tester +
staging branch (incident ORCH-109/PR #129: a 516.7s suite blew its 600s budget
under CPU starvation from orphaned pytest processes -> timeout misrouted as a
code fault -> developer-retry loop -> manual gate).

Additive, 5 independent kill-switches, never-raise, self-hosting scope. Untouched
byte-for-byte: STAGE_TRANSITIONS, the QG_CHECKS registry, check_branch_mergeable
name/semantics, machine-verdict keys, the DB schema. INV-4 (never push/force-push
main) and the no-prod-restart rule are preserved.

- D1: new stdlib-only leaf src/proc_group.py runs the spawned re-test/coverage
  pytest in its own process group (start_new_session) and tree-kills the WHOLE
  group on timeout (os.killpg SIGTERM->grace->SIGKILL); used by
  merge_gate.retest_branch and coverage_gate.measure_coverage. No orphan leak.
  Fallback never-break: subprocess_tree_kill_enabled=False / non-POSIX -> the
  prior subprocess.run.
- D2/D3: merge_gate.classify_retest_failure distinguishes timeout/red/lock-busy/
  other; an infra timeout routes to _handle_merge_gate_infra_retry (bounded
  re-queue, task stays on deploy-staging, no rollback / no developer-retry); a
  red re-test / conflict still rolls back (BR-6). Exhaustion -> one infra alert.
- D4: skip the local re-test when the pre-merge rebase was a proven no-op (HEAD
  already CI/tester/staging-validated); fail-safe runs the re-test on any
  uncertainty. Flag merge_retest_skip_when_current_enabled.
- D5: merge_retest_timeout_s 600 -> 900 + _resolve_retest_timeout validation;
  reaper_max_running_s invariant preserved without change.
- D6: in-process counters + read-only merge_gate block in GET /queue; appended
  ("ORCH-110","classify_retest_failure","src/merge_gate.py") to
  MAIN_REGRESSION_MARKERS. Docs (README/internals overview/CLAUDE/CHANGELOG/
  .env.example) updated in the same PR.

Tests: tests/test_orch110_*.py (TC-01..TC-12, incl. the red-before/green-after
incident regression). Full suite green (1988 passed).

Refs: ORCH-110

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 10:42:34 +03:00

136 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""ORCH-110 TC-12: observability of the infra-timeout path (D6 / FR-6 / AC-9).
The infra-timeout state must be (a) reflected in read-only counters, (b) surfaced in
the additive ``merge_gate`` block of ``GET /queue``, and (c) distinguishable from a
code-fault rollback — with the exhaustion alert carrying the CLICKABLE issue number
and an explicitly infrastructural (NOT "developer must fix") wording. No dedup/overlap
with ORCH-111 (which only OBSERVES surviving processes; ORCH-110 prevents/tolerates).
"""
import asyncio
import os
import tempfile
# Environment for this test module. These assignments run before the ``src.*``
# imports further down (which is why those imports carry ``noqa: E402``).
# NOTE(review): presumably the settings object reads the ORCH_* variables when
# ``src`` is first imported — confirm against the settings module.
_test_db = os.path.join(tempfile.gettempdir(), "test_orch110_observability.db")
os.environ["ORCH_DB_PATH"] = _test_db
os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir()
# setdefault: a token that is already present in the environment is left as-is.
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
from unittest.mock import MagicMock # noqa: E402
import pytest # noqa: E402
import src.db as _db # noqa: E402
from src.db import init_db, get_db # noqa: E402
from src import merge_gate # noqa: E402
from src import stage_engine # noqa: E402
from src.stage_engine import AdvanceResult # noqa: E402
@pytest.fixture(autouse=True)
def fresh_db(monkeypatch):
    """Start every test from an empty DB file and zeroed merge-gate counters.

    Points ``_db.settings.db_path`` at this module's temp database, removes any
    file already present at that path, calls ``init_db()`` and then overwrites
    the in-process ``merge_gate._MERGE_GATE_COUNTERS`` entries with a known
    baseline before yielding to the test.
    """
    monkeypatch.setattr(_db.settings, "db_path", _test_db)

    if os.path.exists(_test_db):
        os.unlink(_test_db)
    init_db()

    # Known baseline: every total at zero and no infra-timeout work item noted.
    baseline = {
        "retest_timeout_total": 0,
        "retest_infra_retry_total": 0,
        "retest_infra_exhausted_total": 0,
        "retest_skipped_current_total": 0,
        "last_infra_timeout_wi": None,
    }
    merge_gate._MERGE_GATE_COUNTERS.update(baseline)

    yield
# ---------------------------------------------------------------------------
# merge_gate_status() snapshot + counter increments.
# ---------------------------------------------------------------------------
def test_tc12_status_exposes_flags_and_counters():
    """``merge_gate_status()`` must contain every expected snapshot key."""
    expected_keys = (
        # flags / configuration values
        "infra_tolerance_enabled",
        "infra_max_retries",
        "infra_retry_delay_s",
        "skip_when_current_enabled",
        "tree_kill_enabled",
        "retest_timeout_s",
        # in-process counters
        "retest_timeout_total",
        "retest_infra_retry_total",
        "retest_infra_exhausted_total",
        "retest_skipped_current_total",
        "last_infra_timeout_wi",
    )

    snapshot = merge_gate.merge_gate_status()

    for key in expected_keys:
        assert key in snapshot, f"missing /queue merge_gate key: {key}"
def test_tc12_counters_track_infra_timeout_distinctly():
    """One call to each ``note_*`` helper shows up as a count of 1.

    Also checks that the work item passed to ``note_retest_timeout`` is
    reported back as ``last_infra_timeout_wi``.
    """
    merge_gate.note_retest_timeout("ORCH-110")
    merge_gate.note_retest_infra_retry()
    merge_gate.note_retest_infra_exhausted()
    merge_gate.note_retest_skipped_current()

    status = merge_gate.merge_gate_status()

    once_each = (
        "retest_timeout_total",
        "retest_infra_retry_total",
        "retest_infra_exhausted_total",
        "retest_skipped_current_total",
    )
    for counter in once_each:
        assert status[counter] == 1

    # What separates this from a code fault: the last infra-timeout work item
    # lives in this snapshot, not in the merge-verify (code/merge) counters.
    assert status["last_infra_timeout_wi"] == "ORCH-110"
def test_tc12_status_never_raises(monkeypatch):
    """With a settings attribute removed, the snapshot call still yields a dict."""
    # Simulate a broken settings object; raising=False tolerates the attribute
    # already being absent.
    monkeypatch.delattr(
        merge_gate.settings,
        "merge_retest_infra_tolerance_enabled",
        raising=False,
    )

    assert isinstance(merge_gate.merge_gate_status(), dict)
# ---------------------------------------------------------------------------
# GET /queue carries the additive merge_gate block (and the legacy keys stay).
# ---------------------------------------------------------------------------
def test_tc12_queue_endpoint_includes_merge_gate_block():
    """The ``queue()`` payload has a ``merge_gate`` block plus the older keys."""
    # Imported inside the test rather than at module top.
    from src.main import queue

    payload = asyncio.run(queue())

    assert "merge_gate" in payload
    assert "infra_tolerance_enabled" in payload["merge_gate"]

    # Additive only: the pre-existing observability blocks must still be there.
    assert "merge_verify" in payload
    assert "coverage" in payload
# ---------------------------------------------------------------------------
# The exhaustion alert is INFRA-specific + carries the clickable issue number,
# distinct from the code-fault "Merge-gate still failing after N developer retries".
# ---------------------------------------------------------------------------
def test_tc12_exhaustion_alert_is_infra_specific_and_clickable(monkeypatch):
    """An exhausted infra-retry budget produces an infra-worded, linked alert.

    Seeds a ``deploy-staging`` task that already has as many
    "merge-gate infra-timeout retry" deployer jobs as the patched
    ``merge_retest_infra_max_retries``, calls
    ``stage_engine._handle_merge_gate_infra_retry`` and then checks the result
    note and the text captured from the patched ``send_telegram``.
    """
    # Single source of truth for the budget: the patched setting and the number
    # of seeded retry jobs must agree for the budget to count as exhausted.
    max_retries = 2
    sent = {}

    def _tg(msg):
        sent["msg"] = msg

    monkeypatch.setattr(stage_engine, "send_telegram", _tg)
    monkeypatch.setattr(stage_engine, "set_issue_blocked", MagicMock())
    monkeypatch.setattr(stage_engine, "plane_add_comment", MagicMock())
    # link_for builds the clickable number; use a recognisable sentinel.
    monkeypatch.setattr(stage_engine, "link_for", lambda wi, **k: f"<a href='x'>{wi}</a>")
    monkeypatch.setattr(stage_engine.settings, "merge_retest_infra_max_retries", max_retries)

    # Seed an exhausted budget for the task. The connection is closed even when
    # a seed statement fails, so a broken schema does not leak the handle into
    # the following tests.
    conn = get_db()
    try:
        cur = conn.execute(
            "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) VALUES (?,?,?,?,?)",
            ("plane-ORCH-110", "ORCH-110", "orchestrator", "feature/ORCH-110-x", "deploy-staging"),
        )
        task_id = cur.lastrowid
        for _ in range(max_retries):
            conn.execute(
                "INSERT INTO jobs (agent, repo, task_id, task_content) "
                "VALUES ('deployer','orchestrator',?, 'Note: merge-gate infra-timeout retry')",
                (task_id,),
            )
        conn.commit()
    finally:
        conn.close()

    res = AdvanceResult()
    stage_engine._handle_merge_gate_infra_retry(
        task_id, "deploy-staging", "orchestrator", "ORCH-110",
        "feature/ORCH-110-x", "re-test timeout after 900s", res,
    )

    assert res.note == "merge-gate-infra-retry-exhausted"
    # Fail with a readable message (not a KeyError) when no alert went out.
    assert "msg" in sent, "exhaustion path sent no Telegram alert"
    alert = sent["msg"]
    assert "ORCH-110" in alert  # clickable issue number present
    assert "developer retries" not in alert  # NOT the code-fault wording
    assert "НЕ дефект кода" in alert  # explicitly infrastructural