Eliminate the false `deploy-staging -> development` rollback that fired when the merge-gate local re-test timed out (infra/resource) on a green CI + tester + staging branch (incident ORCH-109/PR #129: a 516.7s suite blew its 600s budget under CPU starvation from orphaned pytest processes -> timeout misrouted as a code fault -> developer-retry loop -> manual gate).

Additive, 5 independent kill-switches, never-raise, self-hosting scope. Untouched byte-for-byte: STAGE_TRANSITIONS, the QG_CHECKS registry, check_branch_mergeable name/semantics, machine-verdict keys, the DB schema. INV-4 (never push/force-push main) and the no-prod-restart rule are preserved.

- D1: new stdlib-only leaf src/proc_group.py runs the spawned re-test/coverage pytest in its own process group (start_new_session) and tree-kills the WHOLE group on timeout (os.killpg SIGTERM->grace->SIGKILL); used by merge_gate.retest_branch and coverage_gate.measure_coverage. No orphan leak. Fallback never-break: subprocess_tree_kill_enabled=False / non-POSIX -> the prior subprocess.run.
- D2/D3: merge_gate.classify_retest_failure distinguishes timeout/red/lock-busy/other; an infra timeout routes to _handle_merge_gate_infra_retry (bounded re-queue, task stays on deploy-staging, no rollback / no developer-retry); a red re-test / conflict still rolls back (BR-6). Exhaustion -> one infra alert.
- D4: skip the local re-test when the pre-merge rebase was a proven no-op (HEAD already CI/tester/staging-validated); fail-safe runs the re-test on any uncertainty. Flag merge_retest_skip_when_current_enabled.
- D5: merge_retest_timeout_s 600 -> 900 + _resolve_retest_timeout validation; reaper_max_running_s invariant preserved without change.
- D6: in-process counters + read-only merge_gate block in GET /queue; appended ("ORCH-110","classify_retest_failure","src/merge_gate.py") to MAIN_REGRESSION_MARKERS.

Docs (README/internals overview/CLAUDE/CHANGELOG/.env.example) updated in the same PR.

Tests: tests/test_orch110_*.py (TC-01..TC-12, incl. the red-before/green-after incident regression). Full suite green (1988 passed).

Refs: ORCH-110
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
83 lines
3.3 KiB
Python
83 lines
3.3 KiB
Python
"""ORCH-110 TC-08: re-test budget validation + cross-invariants (D5).
|
|
|
|
Covers FR-3 / AC-5 / NFR-6:
|
|
* ``_resolve_retest_timeout`` validates the config (malformed / non-positive ->
|
|
safe default 900 + WARNING; never reaches subprocess);
|
|
* the budget was bumped 600 -> 900;
|
|
* the cross-invariant ``reaper_max_running_s > Σ(deploy-staging gate-work) + grace``
|
|
(ORCH-065/109) still holds with the new 900s re-test budget — WITHOUT raising
|
|
``reaper_max_running_s``.
|
|
"""
|
|
import os
|
|
import tempfile
|
|
|
|
# Seed the environment variables below only when they are not already set
# (``setdefault`` never overwrites an existing value). This runs ahead of the
# ``src`` imports that follow — presumably because the project configuration
# reads these variables at import time; confirm against ``src.config``.
_ENV_DEFAULTS = (
    ("ORCH_DB_PATH", os.path.join(tempfile.gettempdir(), "test_orch110_budget.db")),
    ("ORCH_GITEA_TOKEN", "test-token"),
    ("ORCH_PLANE_API_TOKEN", "test-token"),
)
for _env_name, _env_value in _ENV_DEFAULTS:
    os.environ.setdefault(_env_name, _env_value)
|
|
|
|
import pytest # noqa: E402
|
|
|
|
from src import merge_gate # noqa: E402
|
|
from src.config import Settings, settings # noqa: E402
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _resolve_retest_timeout — validation (never-break).
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc08_resolve_uses_positive_config(monkeypatch):
    """A positive integer budget set on the settings object is returned as-is."""
    configured = 1234
    monkeypatch.setattr(
        merge_gate.settings,
        "merge_retest_timeout_s",
        configured,
        raising=False,
    )
    resolved = merge_gate._resolve_retest_timeout()
    assert resolved == configured
|
|
|
|
|
|
@pytest.mark.parametrize("bad", [0, -5, "abc", None, 3.0])
def test_tc08_resolve_bad_config_falls_back_to_default(monkeypatch, bad):
    """Check the resolved budget for each parametrized setting value.

    ``3.0`` is the one parametrized value that is accepted: it is a valid
    positive ``int(3)`` and stays 3. Every other value resolves to the
    default of 900.
    """
    monkeypatch.setattr(
        merge_gate.settings,
        "merge_retest_timeout_s",
        bad,
        raising=False,
    )
    resolved = merge_gate._resolve_retest_timeout()
    expected = 3 if bad == 3.0 else 900
    assert resolved == expected
|
|
|
|
|
|
def test_tc08_default_budget_bumped_to_900():
    """D5: a freshly constructed ``Settings`` carries the 900s re-test budget (raised from 600)."""
    shipped = Settings()
    assert shipped.merge_retest_timeout_s == 900
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Cross-invariant: reaper backstop covers the worst-case deploy-staging gate-work.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc08_reaper_covers_deploy_staging_worstcase():
    """ORCH-065/109 invariant with the new 900s re-test budget (ADR D5 table).

    Adds up the worst-case work charged to a deploy-staging-deployer job and
    asserts that ``reaper_max_running_s`` stays strictly above that total.
    """
    cfg = Settings()

    # Fixed step budgets that are not read from Settings (ADR D5).
    security_s, rebase_s, image_s = 120, 120, 600

    # Same terms, same order as the ADR D5 table.
    components = (
        cfg.agent_timeout_seconds,    # deployer agent (1800)
        security_s,
        rebase_s,
        cfg.merge_retest_timeout_s,   # re-test (900, new)
        cfg.coverage_run_timeout_s,   # coverage (900)
        image_s,
        cfg.agent_kill_grace_seconds,  # grace (20)
    )
    worst = sum(components)

    assert worst <= 4460  # matches the ADR D5 table
    assert cfg.reaper_max_running_s > worst, (
        f"reaper_max_running_s={cfg.reaper_max_running_s} must exceed "
        f"deploy-staging worst-case {worst}"
    )
|
|
|
|
|
|
def test_tc08_reaper_still_covers_max_agent_timeout():
    """ORCH-065/109: reaper_max_running_s > max(agent timeout) + grace (unchanged)."""
    cfg = Settings()
    reaper_budget = cfg.reaper_max_running_s
    agent_plus_grace = cfg.agent_timeout_developer_s + cfg.agent_kill_grace_seconds
    assert reaper_budget > agent_plus_grace
|
|
|
|
|
|
def test_tc08_reaper_max_running_s_unchanged():
    """D5 must NOT change reaper_max_running_s (stays 5400 from ORCH-109).

    Reads the module-level ``settings`` object imported from ``src.config``
    rather than constructing a fresh ``Settings``.
    """
    current = settings.reaper_max_running_s
    assert current == 5400
|