fix(merge_gate): retry transient Gitea merge errors + already-in-main guard

merge_pr now wraps ONLY the mutating POST /pulls/{n}/merge in a bounded
exponential-backoff retry-loop on TRANSIENT outcomes (405 "try again later",
408, any 5xx, network/timeout, and 409|422 while the PR is still mergeable);
TERMINAL outcomes (403/404/real conflict via mergeable==False) -> fast honest
False, so the ORCH-071/081 not-merged HOLD backstop is unchanged. Fixes the
ORCH-063 false HOLD + manual re-merge on Gitea's post-push mergeability hiccup.

ensure_open_pr gains an "already fully in main" guard (_branch_fully_in_main,
git merge-base --is-ancestor HEAD origin/main) BEFORE creating a PR -> new
"already-in-main" outcome avoids the garbage empty PR on a re-driven finalizer;
_handle_merge_verify skips merge_pr on that outcome and lets the authoritative
SHA-in-main check confirm -> done (not a HOLD). git error of the guard fails
OPEN to the create path.

New ORCH_MERGE_RETRY_* settings (kill-switch merge_retry_enabled -> one-shot,
max_attempts=3, backoff base=2/max=5). INV-4 (merge only via Gitea PR-merge API,
never push/force-push main), never-raise, STAGE_TRANSITIONS/QG_CHECKS/DB schema
unchanged. Docs (README merge-verify section, CLAUDE.md, CHANGELOG, .env.example)
updated in the same PR. Tests: test_merge_gate.py TC-01..12, test_config.py
TC-13, test_merge_verify.py TC-14..16; full suite green (1389).

Refs: ORCH-093

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-09 22:38:16 +03:00
committed by orchestrator-deployer
parent 3d0f51512b
commit 0b25fc1527
12 changed files with 651 additions and 21 deletions

View File

@@ -257,3 +257,38 @@ def test_tc19_check_branch_mergeable_signature_intact():
from src.qg.checks import check_branch_mergeable
params = list(inspect.signature(check_branch_mergeable).parameters)
assert params == ["repo", "work_item_id", "branch"]
# ---------------------------------------------------------------------------
# ORCH-093 / TC-13: merge_retry_* settings defaults + env override (AC-5).
# ---------------------------------------------------------------------------
_MERGE_RETRY_ENV = (
"ORCH_MERGE_RETRY_ENABLED",
"ORCH_MERGE_RETRY_MAX_ATTEMPTS",
"ORCH_MERGE_RETRY_BACKOFF_BASE_S",
"ORCH_MERGE_RETRY_BACKOFF_MAX_S",
)
def test_merge_retry_settings_defaults(monkeypatch):
"""Documented defaults when no ORCH_MERGE_RETRY_* env is set."""
for name in _MERGE_RETRY_ENV:
monkeypatch.delenv(name, raising=False)
s = Settings()
assert s.merge_retry_enabled is True
assert s.merge_retry_max_attempts == 3
assert s.merge_retry_backoff_base_s == 2
assert s.merge_retry_backoff_max_s == 5
def test_merge_retry_settings_env_override(monkeypatch):
"""Each field is read from its ORCH_MERGE_RETRY_* env var."""
monkeypatch.setenv("ORCH_MERGE_RETRY_ENABLED", "false")
monkeypatch.setenv("ORCH_MERGE_RETRY_MAX_ATTEMPTS", "5")
monkeypatch.setenv("ORCH_MERGE_RETRY_BACKOFF_BASE_S", "1")
monkeypatch.setenv("ORCH_MERGE_RETRY_BACKOFF_MAX_S", "8")
s = Settings()
assert s.merge_retry_enabled is False
assert s.merge_retry_max_attempts == 5
assert s.merge_retry_backoff_base_s == 1
assert s.merge_retry_backoff_max_s == 8

View File

@@ -389,3 +389,207 @@ def test_tc16_deployer_prompt_consults_guard():
assert "no second merge" in lowered, (
"deployer prompt must document the already-merged no-op (AC-11)"
)
# ===========================================================================
# ORCH-093: merge_pr transient-retry + ensure_open_pr already-in-main guard.
# TC-01..TC-12 — httpx mocked; time.sleep no-op so backoff never slows tests.
# ===========================================================================
ORCH093_BRANCH = "feature/ORCH-093-x"
class _Resp093:
"""Response stand-in with status_code / json() / text (merge_pr reads .text)."""
def __init__(self, status_code, payload=None, text=""):
self.status_code = status_code
self._payload = payload if payload is not None else []
self.text = text
def json(self):
return self._payload
@pytest.fixture
def merge093(monkeypatch):
"""Wire Gitea settings + retry defaults; no-op backoff; PR not-already-merged."""
monkeypatch.setattr(merge_gate.settings, "gitea_url", "http://gitea.test")
monkeypatch.setattr(merge_gate.settings, "gitea_owner", "admin")
monkeypatch.setattr(merge_gate.settings, "gitea_token", "tok")
monkeypatch.setattr(merge_gate.settings, "merge_pr_timeout_s", 5)
monkeypatch.setattr(merge_gate.settings, "merge_retry_enabled", True)
monkeypatch.setattr(merge_gate.settings, "merge_retry_max_attempts", 3)
monkeypatch.setattr(merge_gate.settings, "merge_retry_backoff_base_s", 2)
monkeypatch.setattr(merge_gate.settings, "merge_retry_backoff_max_s", 5)
monkeypatch.setattr(merge_gate.time, "sleep", lambda *a, **k: None)
monkeypatch.setattr(merge_gate, "pr_already_merged", lambda r, b: False)
def _open_code_pr_get(number=7):
"""A list-PRs GET returning exactly one open code-PR (head==branch, base==main)."""
return lambda *a, **k: _Resp093(
200, [{"head": {"ref": ORCH093_BRANCH}, "base": {"ref": "main"}, "number": number}]
)
class _PostSeq:
"""Returns queued responses (or raises queued exceptions) on each POST call."""
def __init__(self, items):
self._items = list(items)
self.calls = 0
def __call__(self, *a, **k):
self.calls += 1
item = self._items.pop(0) if self._items else self._items_last
self._items_last = item
if isinstance(item, Exception):
raise item
return item
# --- TC-01: 405, 405, 200 -> (True, ...); exactly 3 POST; no false False (AC-1) ---
def test_tc01_merge_retries_405_then_succeeds(merge093, monkeypatch):
monkeypatch.setattr(httpx, "get", _open_code_pr_get(7))
seq = _PostSeq([_Resp093(405, text="try again later"),
_Resp093(405, text="try again later"),
_Resp093(200)])
monkeypatch.setattr(httpx, "post", seq)
ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH)
assert ok is True and "PR #7" in msg
assert seq.calls == 3
# --- TC-02: 503 (5xx) then 200 -> retry -> (True, ...) (AC-1) ---
def test_tc02_merge_retries_5xx_then_succeeds(merge093, monkeypatch):
monkeypatch.setattr(httpx, "get", _open_code_pr_get(7))
seq = _PostSeq([_Resp093(503, text="bad gateway"), _Resp093(200)])
monkeypatch.setattr(httpx, "post", seq)
ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH)
assert ok is True and seq.calls == 2
# --- TC-03: httpx Timeout in attempt 1, then 200 -> retry; never-raise (AC-1/AC-6) ---
def test_tc03_merge_retries_network_error_then_succeeds(merge093, monkeypatch):
monkeypatch.setattr(httpx, "get", _open_code_pr_get(7))
seq = _PostSeq([httpx.ConnectTimeout("timed out"), _Resp093(200)])
monkeypatch.setattr(httpx, "post", seq)
ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH)
assert ok is True and seq.calls == 2
# --- TC-04: real conflict 409 + mergeable=False -> (False, ...), no extra POST (AC-2) ---
def test_tc04_real_conflict_terminal_no_retry(merge093, monkeypatch):
monkeypatch.setattr(httpx, "get", _open_code_pr_get(7))
monkeypatch.setattr(merge_gate, "_pr_mergeable", lambda r, i: False)
seq = _PostSeq([_Resp093(409, text="conflict")])
monkeypatch.setattr(httpx, "post", seq)
ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH)
assert ok is False and "HTTP 409" in msg
assert seq.calls == 1 # terminal -> no retry
# --- TC-05: ambiguous 409 + mergeable=True -> transient -> retry -> 200 (AC-2) ---
def test_tc05_ambiguous_409_mergeable_true_retries(merge093, monkeypatch):
monkeypatch.setattr(httpx, "get", _open_code_pr_get(7))
monkeypatch.setattr(merge_gate, "_pr_mergeable", lambda r, i: True)
seq = _PostSeq([_Resp093(409, text="recomputing"), _Resp093(200)])
monkeypatch.setattr(httpx, "post", seq)
ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH)
assert ok is True and seq.calls == 2
# --- TC-06: 403 (no rights) -> immediate (False, ...) without retry (AC-2) ---
def test_tc06_403_terminal_no_retry(merge093, monkeypatch):
monkeypatch.setattr(httpx, "get", _open_code_pr_get(7))
seq = _PostSeq([_Resp093(403, text="forbidden")])
monkeypatch.setattr(httpx, "post", seq)
ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH)
assert ok is False and "HTTP 403" in msg and seq.calls == 1
# --- TC-07: 405 on all N attempts -> (False, "merge failed after N attempts: HTTP 405") (AC-3) ---
def test_tc07_exhausts_retries_clear_reason(merge093, monkeypatch):
monkeypatch.setattr(httpx, "get", _open_code_pr_get(7))
seq = _PostSeq([_Resp093(405), _Resp093(405), _Resp093(405)])
monkeypatch.setattr(httpx, "post", seq)
ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH)
assert ok is False
assert "after 3 attempts" in msg and "HTTP 405" in msg
assert seq.calls == 3
# --- TC-08: kill-switch off -> exactly one POST (one-shot) at 405 -> (False, ...) (AC-5/AC-3) ---
def test_tc08_killswitch_off_one_shot(merge093, monkeypatch):
monkeypatch.setattr(merge_gate.settings, "merge_retry_enabled", False)
monkeypatch.setattr(httpx, "get", _open_code_pr_get(7))
seq = _PostSeq([_Resp093(405), _Resp093(200)]) # 2nd would succeed if retried
monkeypatch.setattr(httpx, "post", seq)
ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH)
assert ok is False and seq.calls == 1 # one-shot: never retried
# --- TC-09: ensure_open_pr — no open PR, branch fully in main -> already-in-main, no POST (AC-4) ---
def test_tc09_ensure_already_in_main_no_post(merge093, monkeypatch):
monkeypatch.setattr(httpx, "get", lambda *a, **k: _Resp093(200, [])) # no open PR
monkeypatch.setattr(merge_gate, "_branch_fully_in_main", lambda r, b: True)
monkeypatch.setattr(httpx, "post", lambda *a, **k: (_ for _ in ()).throw(
AssertionError("must NOT POST /pulls for an already-in-main branch")))
status, detail = merge_gate.ensure_open_pr("orchestrator", ORCH093_BRANCH)
assert status == "already-in-main"
# --- TC-10: ensure_open_pr — no open PR, commits beyond main -> creates PR (regress) (AC-4) ---
def test_tc10_ensure_creates_when_commits_beyond_main(merge093, monkeypatch):
monkeypatch.setattr(httpx, "get", lambda *a, **k: _Resp093(200, []))
monkeypatch.setattr(merge_gate, "_branch_fully_in_main", lambda r, b: False)
post_calls = []
def fake_post(url, json=None, headers=None, timeout=None):
post_calls.append(url)
return _Resp093(201, {"number": 12})
monkeypatch.setattr(httpx, "post", fake_post)
status, detail = merge_gate.ensure_open_pr("orchestrator", ORCH093_BRANCH)
assert status == "created" and detail == "12"
assert len(post_calls) == 1
# --- TC-11: ensure_open_pr — git error in guard (None) -> fail-OPEN -> create path (AC-6) ---
def test_tc11_ensure_guard_git_error_fail_open(merge093, monkeypatch):
monkeypatch.setattr(httpx, "get", lambda *a, **k: _Resp093(200, []))
# None == git/OS error / ambiguous -> must NOT block; degrade to create.
monkeypatch.setattr(merge_gate, "_branch_fully_in_main", lambda r, b: None)
monkeypatch.setattr(httpx, "post", lambda *a, **k: _Resp093(201, {"number": 13}))
status, detail = merge_gate.ensure_open_pr("orchestrator", ORCH093_BRANCH)
assert status == "created" # fail-open: did not become a false no-op
def test_tc11_branch_fully_in_main_never_raises(monkeypatch):
"""_branch_fully_in_main: any git/OS error -> None (never-raise) (AC-6)."""
monkeypatch.setattr(merge_gate, "ensure_worktree", lambda r, b: "/wt")
def boom(*a, **k):
raise OSError("git exploded")
monkeypatch.setattr(merge_gate.subprocess, "run", boom)
assert merge_gate._branch_fully_in_main("orchestrator", ORCH093_BRANCH) is None
# --- TC-12: merge_pr / ensure_open_pr — uncaught httpx error -> safe tuple (never-raise) (AC-6) ---
def test_tc12_merge_pr_never_raises(merge093, monkeypatch):
def boom(*a, **k):
raise httpx.HTTPError("kaboom")
monkeypatch.setattr(httpx, "get", boom)
ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH)
assert ok is False and isinstance(msg, str)
def test_tc12_ensure_open_pr_never_raises(merge093, monkeypatch):
def boom(*a, **k):
raise httpx.HTTPError("kaboom")
monkeypatch.setattr(httpx, "get", boom)
status, detail = merge_gate.ensure_open_pr("orchestrator", ORCH093_BRANCH)
assert status == "failed" and isinstance(detail, str)

View File

@@ -131,3 +131,92 @@ def test_tc12_kill_switch_disables_under_gate(monkeypatch):
monkeypatch.setattr(merge_gate.settings, "merge_verify_enabled", False)
assert merge_gate.merge_verify_applies("orchestrator") is False
assert merge_gate.merge_verify_applies("enduro-trails") is False
# ===========================================================================
# ORCH-093 / TC-14..16: _handle_merge_verify integration (deploy->done under-gate).
# already-in-main skips merge_pr; transient-retry success -> done; exhausted -> HOLD.
# ===========================================================================
import os # noqa: E402
import tempfile # noqa: E402
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
os.environ.setdefault("ORCH_DB_PATH", os.path.join(tempfile.gettempdir(), "test_orch093.db"))
from unittest.mock import MagicMock # noqa: E402
from src import stage_engine, image_freshness # noqa: E402
from src.stage_engine import AdvanceResult, _handle_merge_verify # noqa: E402
_O93_REPO = "orchestrator"
_O93_WI = "ORCH-093"
_O93_BRANCH = "feature/ORCH-093-x"
@pytest.fixture
def _o93_wire(monkeypatch):
monkeypatch.setattr(stage_engine.merge_gate, "merge_verify_applies", lambda r: True)
monkeypatch.setattr(stage_engine.settings, "merge_verify_autocreate_pr_enabled", True)
monkeypatch.setattr(stage_engine.settings, "regression_guard_enabled", False)
monkeypatch.setattr(image_freshness, "validated_revision", lambda r, b: "deadbeef")
for name in ("set_issue_blocked", "plane_add_comment", "send_telegram", "link_for"):
monkeypatch.setattr(stage_engine, name, MagicMock())
monkeypatch.setattr(
stage_engine.self_deploy, "record_merged_to_main", MagicMock(return_value=True)
)
# --- TC-14: ensure_open_pr -> already-in-main -> skip merge_pr; SHA-in-main -> done (AC-4) ---
def test_tc14_already_in_main_skips_merge_pr_then_done(_o93_wire, monkeypatch):
monkeypatch.setattr(
stage_engine.merge_gate, "ensure_open_pr", lambda r, b: ("already-in-main", "x")
)
merge = MagicMock()
monkeypatch.setattr(stage_engine.merge_gate, "merge_pr", merge)
monkeypatch.setattr(stage_engine.merge_gate, "verify_merged_to_main", lambda r, b, s: True)
res = AdvanceResult()
intervened = _handle_merge_verify(1, _O93_REPO, _O93_WI, _O93_BRANCH, res)
assert intervened is False # advance to done
assert res.alerted is False
assert not merge.called # merge_pr SKIPPED (nothing to merge)
assert not stage_engine.set_issue_blocked.called
# --- TC-15: merge_pr exhausted (False) + SHA not in main -> HOLD + alert (ORCH-071/081) (AC-3) ---
def test_tc15_merge_failed_and_not_in_main_holds(_o93_wire, monkeypatch):
monkeypatch.setattr(
stage_engine.merge_gate, "ensure_open_pr", lambda r, b: ("existed", "9")
)
monkeypatch.setattr(
stage_engine.merge_gate, "merge_pr",
lambda r, b: (False, "merge failed after 3 attempts: HTTP 405"),
)
monkeypatch.setattr(stage_engine.merge_gate, "verify_merged_to_main", lambda r, b, s: False)
res = AdvanceResult()
intervened = _handle_merge_verify(1, _O93_REPO, _O93_WI, _O93_BRANCH, res)
assert intervened is True # HOLD, NOT done
assert res.advanced is False
assert res.note == "merge-not-verified-hold"
assert stage_engine.set_issue_blocked.called
# --- TC-16: happy path — transient retry success in merge_pr -> SHA-in-main -> done (AC-1) ---
def test_tc16_transient_retry_success_then_done(_o93_wire, monkeypatch):
monkeypatch.setattr(
stage_engine.merge_gate, "ensure_open_pr", lambda r, b: ("existed", "9")
)
# merge_pr already rode out the 405x2->200 transient internally -> (True, ...).
monkeypatch.setattr(stage_engine.merge_gate, "merge_pr", lambda r, b: (True, "merged PR #9"))
monkeypatch.setattr(stage_engine.merge_gate, "verify_merged_to_main", lambda r, b, s: True)
res = AdvanceResult()
intervened = _handle_merge_verify(1, _O93_REPO, _O93_WI, _O93_BRANCH, res)
assert intervened is False # done, no false HOLD
assert res.alerted is False
assert not stage_engine.set_issue_blocked.called

View File

@@ -32,6 +32,11 @@ def _settings(monkeypatch):
monkeypatch.setattr(merge_gate.settings, "gitea_owner", "owner")
monkeypatch.setattr(merge_gate.settings, "gitea_token", "tok")
monkeypatch.setattr(merge_gate.settings, "gitea_url", "http://gitea.test")
# ORCH-093: these tests target the HTTP create/race logic of ensure_open_pr.
# The new already-in-main guard (_branch_fully_in_main) runs real git; pin it
# to "commits beyond main" (False) so the create path is exercised as intended.
# The guard itself has dedicated coverage (test_merge_gate.py TC-09/10/11).
monkeypatch.setattr(merge_gate, "_branch_fully_in_main", lambda r, b: False)
def _install_httpx(monkeypatch, get_resp, post_resp=None, record=None):