fix(merge_gate): retry transient Gitea merge errors + already-in-main guard
merge_pr now wraps ONLY the mutating POST /pulls/{n}/merge in a bounded
exponential-backoff retry-loop on TRANSIENT outcomes (405 "try again later",
408, any 5xx, network/timeout, and 409|422 while the PR is still mergeable);
TERMINAL outcomes (403/404/real conflict via mergeable==False) -> fast honest
False, so the ORCH-071/081 not-merged HOLD backstop is unchanged. Fixes the
ORCH-063 false HOLD + manual re-merge on Gitea's post-push mergeability hiccup.
ensure_open_pr gains an "already fully in main" guard (_branch_fully_in_main,
git merge-base --is-ancestor HEAD origin/main) BEFORE creating a PR -> new
"already-in-main" outcome avoids the garbage empty PR on a re-driven finalizer;
_handle_merge_verify skips merge_pr on that outcome and lets the authoritative
SHA-in-main check confirm -> done (not a HOLD). A git error in the guard fails
OPEN, i.e. it falls through to the create path.
New ORCH_MERGE_RETRY_* settings (kill-switch merge_retry_enabled -> one-shot,
max_attempts=3, backoff base=2/max=5). INV-4 (merge only via Gitea PR-merge API,
never push/force-push main), never-raise, STAGE_TRANSITIONS/QG_CHECKS/DB schema
unchanged. Docs (README merge-verify section, CLAUDE.md, CHANGELOG, .env.example)
updated in the same PR. Tests: test_merge_gate.py TC-01..12, test_config.py
TC-13, test_merge_verify.py TC-14..16; full suite green (1389).
Refs: ORCH-093
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -549,6 +549,31 @@ class Settings(BaseSettings):
|
||||
merge_pr_timeout_s: int = 60
|
||||
merge_verify_timeout_s: int = 60
|
||||
|
||||
# ORCH-093: deterministic merge-actor retry of TRANSIENT Gitea merge errors.
|
||||
# The incident ORCH-063 had a green self-deploy + an open, mergeable PR, yet
|
||||
# POST /pulls/{n}/merge returned HTTP 405 ("Please try again later") because
|
||||
# Gitea was still recomputing `mergeable` right after the push — the one-shot
|
||||
# merge_pr returned False, the ORCH-071/081 backstop HELD the task on `deploy`,
|
||||
# and a human had to re-merge by hand. merge_pr now wraps ONLY the mutating
|
||||
# POST in a bounded exponential-backoff retry-loop on TRANSIENT outcomes
|
||||
# (405/408/5xx/network-timeout, and 409|422 while the PR is still mergeable);
|
||||
# TERMINAL outcomes (403/404/real conflict) -> fast honest False (the HOLD
|
||||
# protection is unchanged). Mirrors the ci_poll_* idiom of check_ci_green.
|
||||
# merge_retry_enabled -> kill-switch; False -> exactly one POST
|
||||
# (byte-for-byte the prior one-shot behaviour,
|
||||
# env ORCH_MERGE_RETRY_ENABLED).
|
||||
# merge_retry_max_attempts -> max POST attempts on a transient outcome
|
||||
# (env ORCH_MERGE_RETRY_MAX_ATTEMPTS).
|
||||
# merge_retry_backoff_base_s -> exponential backoff base seconds
|
||||
# (env ORCH_MERGE_RETRY_BACKOFF_BASE_S).
|
||||
# merge_retry_backoff_max_s -> per-sleep backoff ceiling seconds; total sleep
|
||||
# is bounded by (N-1) * max so the monitor-thread
|
||||
# is never wedged (env ORCH_MERGE_RETRY_BACKOFF_MAX_S).
|
||||
merge_retry_enabled: bool = True
|
||||
merge_retry_max_attempts: int = 3
|
||||
merge_retry_backoff_base_s: int = 2
|
||||
merge_retry_backoff_max_s: int = 5
|
||||
|
||||
# ORCH-026: intra-repo merge serialisation (Level A) + declarative task
|
||||
# dependencies (Level B). Level A reuses the ORCH-043/065 merge-lease window
|
||||
# (no new mechanism) — the merge-lease already serialises "merge -> main-updated"
|
||||
|
||||
@@ -602,6 +602,51 @@ def merge_verify_applies(repo: str) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def _branch_fully_in_main(repo: str, branch: str) -> bool | None:
    """Tell whether ``branch`` carries no commits beyond ``origin/main`` (ORCH-093 D3).

    ``ensure_open_pr`` consults this before creating a PR so that a branch which
    is already fully merged does not produce an empty PR.

    Inside the worktree returned by ``ensure_worktree(repo, branch)`` two git
    commands run in order:

    1. ``git fetch origin main`` - its exit status is not inspected; only an
       exception raised while running it (timeout, OS error) aborts the probe.
    2. ``git merge-base --is-ancestor HEAD origin/main`` - its exit status is
       the verdict.

    Returns:
        ``True``  - exit status 0: HEAD is an ancestor of ``origin/main``.
        ``False`` - exit status 1: HEAD has commits that ``origin/main`` lacks.
        ``None``  - worktree error, git/OS error, timeout, or any other exit
        status. The caller treats ``None`` as "unknown" and proceeds with the
        normal create path (fail-open).

    Never raises: every failure is logged as a warning and mapped to ``None``.
    """
    try:
        worktree = ensure_worktree(repo, branch)
    except Exception as exc:  # noqa: BLE001 - never-raise contract -> fail-OPEN
        logger.warning("_branch_fully_in_main: worktree error for %s/%s: %s", repo, branch, exc)
        return None

    git_in_worktree = ["git", "-C", worktree]
    try:
        # Refresh origin/main first so the ancestry probe sees the current main.
        subprocess.run(
            git_in_worktree + ["fetch", "origin", "main"],
            capture_output=True,
            timeout=_FETCH_TIMEOUT,
        )
        probe = subprocess.run(
            git_in_worktree + ["merge-base", "--is-ancestor", "HEAD", "origin/main"],
            capture_output=True,
            timeout=_SHORT_TIMEOUT,
        )
    except (subprocess.SubprocessError, OSError) as exc:
        logger.warning("_branch_fully_in_main: git error for %s/%s: %s", repo, branch, exc)
        return None

    rc = probe.returncode
    if rc in (0, 1):
        # 0 -> ancestor (fully in main), 1 -> not an ancestor (commits beyond main).
        return rc == 0

    # Any other status is neither "yes" nor "no": report "unknown".
    logger.warning(
        "_branch_fully_in_main: ambiguous merge-base rc=%s for %s/%s (fail-open)",
        rc, repo, branch,
    )
    return None
|
||||
|
||||
|
||||
def ensure_open_pr(repo: str, branch: str) -> tuple[str, str]:
|
||||
"""Guarantee an open **code-PR** (``head==branch`` AND ``base=="main"``) exists.
|
||||
|
||||
@@ -625,6 +670,12 @@ def ensure_open_pr(repo: str, branch: str) -> tuple[str, str]:
|
||||
``("existed", …)``; no duplicate is created (AC-2 / FR-5).
|
||||
4. Any other HTTP/parse/network error -> ``("failed", "<reason>")``.
|
||||
|
||||
ORCH-093 (D3) adds a guard BETWEEN steps 1 and 2: if the branch is already fully
|
||||
in ``main`` (no commits beyond ``origin/main``) there is nothing to PR -> the new
|
||||
outcome ``("already-in-main", "<reason>")`` is returned WITHOUT a ``POST`` (avoids
|
||||
an empty garbage PR on a re-driven finalizer). A git error of the guard fails OPEN
|
||||
(degrade to the create path) so an infra hiccup never becomes a false no-op.
|
||||
|
||||
Reuses ``settings.merge_pr_timeout_s`` (same class of Gitea calls as ``merge_pr``).
|
||||
Never-raise (AC-7): any unexpected error -> ``("failed", str(e))``; the exception is
|
||||
NEVER propagated into ``_handle_merge_verify`` / ``advance_stage``.
|
||||
@@ -657,6 +708,21 @@ def ensure_open_pr(repo: str, branch: str) -> tuple[str, str]:
|
||||
logger.info("ensure_open_pr: %s/%s already has open code-PR #%s", repo, branch, existing)
|
||||
return "existed", str(existing)
|
||||
|
||||
# Step 1b (ORCH-093 D3): guard "branch already fully in main". If the branch
|
||||
# has no commits beyond origin/main there is nothing to PR — creating one
|
||||
# would yield an empty garbage PR (the ORCH-063 symptom on a re-driven
|
||||
# finalizer after a manual merge). Return the new "already-in-main" outcome
|
||||
# so _handle_merge_verify skips merge_pr and lets the authoritative
|
||||
# SHA-in-main check confirm -> done. fail-OPEN on git error / ambiguous
|
||||
# (None): degrade to the create path below, NEVER block — an infra hiccup
|
||||
# must not become a false no-op merge (SHA-in-main downstream stays the proof).
|
||||
if _branch_fully_in_main(repo, branch) is True:
|
||||
logger.info(
|
||||
"ensure_open_pr: %s/%s already fully in main -> already-in-main (no PR created)",
|
||||
repo, branch,
|
||||
)
|
||||
return "already-in-main", "branch already in main (no commits beyond origin/main)"
|
||||
|
||||
# Step 2: create the code-PR onto main.
|
||||
parts = branch.split("/")
|
||||
title = parts[-1] if parts else branch
|
||||
@@ -697,6 +763,89 @@ def ensure_open_pr(repo: str, branch: str) -> tuple[str, str]:
|
||||
return "failed", f"ensure_open_pr error: {e}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ORCH-093: transient-error retry of the merge POST + classification helpers.
|
||||
# ---------------------------------------------------------------------------
|
||||
def _merge_backoff(attempt: int) -> float:
|
||||
"""Exponential backoff (s) with a ceiling for the merge-POST retry (ORCH-093 D1).
|
||||
|
||||
``backoff(i) = min(base * 2**(i-1), max)`` — the transient-breaker idiom of the
|
||||
Claude agents, bounded so the total sleep ``(N-1) * max`` can never wedge the
|
||||
monitor-thread running merge-verify (NFR-4). Defaults base=2, max=5 -> the
|
||||
sequence is 2, 4, 5, 5, … seconds.
|
||||
"""
|
||||
base = settings.merge_retry_backoff_base_s
|
||||
cap = settings.merge_retry_backoff_max_s
|
||||
try:
|
||||
return float(min(base * (2 ** (max(attempt, 1) - 1)), cap))
|
||||
except Exception: # noqa: BLE001 - never-raise; degrade to the ceiling
|
||||
return float(cap)
|
||||
|
||||
|
||||
def _pr_mergeable(repo: str, index) -> bool | None:
    """Fetch the ``mergeable`` flag of PR ``index`` via ``GET /pulls/{index}`` (ORCH-093 D2).

    ``_classify_merge_response`` calls this to disambiguate a ``409``/``422``
    answer to the merge POST.

    Args:
        repo: Repository name under ``settings.gitea_owner``.
        index: PR number, interpolated into the request URL as-is.

    Returns:
        The ``mergeable`` value when the GET answers ``200`` and the field is a
        real ``bool``; otherwise ``None`` (non-200 status, field missing or not
        a boolean, or any error while performing or parsing the request).

    Never raises: any exception is logged as a warning and mapped to ``None``.
    """
    try:
        # Imported lazily inside the try so an import failure also yields None.
        import httpx

        owner = settings.gitea_owner
        auth = {"Authorization": f"token {settings.gitea_token}"}
        url = f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}/pulls/{index}"
        reply = httpx.get(url, headers=auth, timeout=settings.merge_pr_timeout_s)

        if reply.status_code == 200:
            flag = (reply.json() or {}).get("mergeable")
            # Only a genuine boolean counts; null / strings / numbers are "unknown".
            if isinstance(flag, bool):
                return flag
        return None
    except Exception as exc:  # noqa: BLE001 - never-raise contract
        logger.warning("_pr_mergeable check failed for %s PR #%s: %s", repo, index, exc)
        return None
|
||||
|
||||
|
||||
def _classify_merge_response(repo: str, branch: str, index, status_code: int) -> str:
|
||||
"""Classify a non-2xx ``POST /pulls/{index}/merge`` outcome (ORCH-093 D2).
|
||||
|
||||
Returns ``"transient"`` (retry within budget) or ``"terminal"`` (fast honest
|
||||
``False``; the ORCH-071/081 HOLD backstop takes over). Decision tree:
|
||||
|
||||
* ``405`` ("try again later"), ``408``, any ``5xx`` -> **transient**.
|
||||
* ``403`` (no rights), ``404`` (PR gone) -> **terminal**.
|
||||
* ``409`` / ``422`` (ambiguous) -> ``GET /pulls/{index}`` -> ``mergeable``:
|
||||
- ``False`` -> **terminal** (real conflict, fast HOLD).
|
||||
- ``True`` / ``None`` / GET failed -> **transient** (default-policy
|
||||
fail-OPEN-in-retry: Gitea has not recomputed yet — the ORCH-063 case;
|
||||
the retry budget is finite, so a real conflict still HOLDs after it).
|
||||
* any other unexpected code -> **terminal** (do not loop on unknowns).
|
||||
|
||||
Never-raise: any error -> ``"transient"`` (conservative, within the bounded
|
||||
retry budget).
|
||||
"""
|
||||
try:
|
||||
if status_code in (405, 408) or 500 <= status_code <= 599:
|
||||
return "transient"
|
||||
if status_code in (403, 404):
|
||||
return "terminal"
|
||||
if status_code in (409, 422):
|
||||
mergeable = _pr_mergeable(repo, index)
|
||||
if mergeable is False:
|
||||
return "terminal"
|
||||
# True OR None/unavailable -> transient (default-policy, D2).
|
||||
return "transient"
|
||||
return "terminal"
|
||||
except Exception as e: # noqa: BLE001 - never-raise contract
|
||||
logger.warning(
|
||||
"_classify_merge_response error for %s/%s PR #%s: %s (transient)",
|
||||
repo, branch, index, e,
|
||||
)
|
||||
return "transient"
|
||||
|
||||
|
||||
def merge_pr(repo: str, branch: str) -> tuple[bool, str]:
|
||||
"""Deterministically merge the open PR for ``branch`` via the Gitea PR-merge API.
|
||||
|
||||
@@ -712,8 +861,16 @@ def merge_pr(repo: str, branch: str) -> tuple[bool, str]:
|
||||
(FR-3) adds the ``base == main`` filter so the actor merges exactly the
|
||||
feature code-PR and never an auto docs-PR / a PR onto a foreign base. No
|
||||
such open PR -> ``(False, "no open PR")``.
|
||||
3. ``POST /repos/{owner}/{repo}/pulls/{index}/merge`` (Do: ``merge``) ->
|
||||
200/201 -> ``(True, "merged PR #<n>")``; otherwise ``(False, "<reason>")``.
|
||||
3. ``POST /repos/{owner}/{repo}/pulls/{index}/merge`` (Do: ``merge``) in a
|
||||
bounded retry-loop (ORCH-093 D1): ``200/201`` -> ``(True, "merged PR #<n>")``;
|
||||
a TRANSIENT outcome (405/408/5xx/network/timeout, or 409|422 while still
|
||||
mergeable) is retried with exponential backoff up to
|
||||
``merge_retry_max_attempts``; a TERMINAL outcome (403/404/real conflict) ->
|
||||
immediate ``(False, "merge failed: HTTP <code>")``; exhausting the budget on
|
||||
a transient -> ``(False, "merge failed after <N> attempts: HTTP <code>")``.
|
||||
The kill-switch ``merge_retry_enabled=False`` forces exactly one POST
|
||||
(the prior one-shot behaviour). Only the mutating POST is retried — the
|
||||
idempotent steps above are not.
|
||||
|
||||
Never-raise (INV-1/AC-9 / TC-09): any HTTP/parse error -> ``(False, reason)``.
|
||||
"""
|
||||
@@ -744,21 +901,59 @@ def merge_pr(repo: str, branch: str) -> tuple[bool, str]:
|
||||
if index is None:
|
||||
return False, "no open PR"
|
||||
|
||||
m = httpx.post(
|
||||
f"{base}/pulls/{index}/merge",
|
||||
json={"Do": "merge"},
|
||||
headers=headers,
|
||||
timeout=timeout,
|
||||
)
|
||||
if m.status_code in (200, 201):
|
||||
logger.info("merge_pr: merged PR #%s for %s/%s", index, repo, branch)
|
||||
return True, f"merged PR #{index}"
|
||||
detail = (m.text or "").strip()[:200]
|
||||
logger.warning(
|
||||
"merge_pr: merge failed for %s/%s PR #%s: HTTP %s %s",
|
||||
repo, branch, index, m.status_code, detail,
|
||||
)
|
||||
return False, f"merge failed: HTTP {m.status_code}"
|
||||
# ORCH-093 D1: retry ONLY the mutating POST on transient outcomes. The
|
||||
# kill-switch collapses the budget to one attempt = the prior one-shot path
|
||||
# (no branching of the loop body, ADR D1).
|
||||
n_eff = settings.merge_retry_max_attempts if settings.merge_retry_enabled else 1
|
||||
if n_eff < 1:
|
||||
n_eff = 1
|
||||
for attempt in range(1, n_eff + 1):
|
||||
try:
|
||||
m = httpx.post(
|
||||
f"{base}/pulls/{index}/merge",
|
||||
json={"Do": "merge"},
|
||||
headers=headers,
|
||||
timeout=timeout,
|
||||
)
|
||||
except (httpx.HTTPError, OSError) as e:
|
||||
# Network/timeout -> transient within the bounded budget (never-raise).
|
||||
logger.warning(
|
||||
"merge_pr: attempt %s/%s network error for %s/%s PR #%s: %s (transient)",
|
||||
attempt, n_eff, repo, branch, index, e,
|
||||
)
|
||||
if attempt < n_eff:
|
||||
time.sleep(_merge_backoff(attempt))
|
||||
continue
|
||||
return False, f"merge failed after {n_eff} attempts: network error"
|
||||
|
||||
if m.status_code in (200, 201):
|
||||
logger.info(
|
||||
"merge_pr: merged PR #%s for %s/%s (attempt %s/%s)",
|
||||
index, repo, branch, attempt, n_eff,
|
||||
)
|
||||
return True, f"merged PR #{index}"
|
||||
|
||||
detail = (m.text or "").strip()[:200]
|
||||
cls = _classify_merge_response(repo, branch, index, m.status_code)
|
||||
if cls == "terminal":
|
||||
logger.warning(
|
||||
"merge_pr: merge failed for %s/%s PR #%s: HTTP %s %s (terminal)",
|
||||
repo, branch, index, m.status_code, detail,
|
||||
)
|
||||
return False, f"merge failed: HTTP {m.status_code}"
|
||||
|
||||
# Transient: log attempt i/N (check_ci_green idiom) and retry if budget left.
|
||||
logger.warning(
|
||||
"merge_pr: attempt %s/%s transient HTTP %s for %s/%s PR #%s %s",
|
||||
attempt, n_eff, m.status_code, repo, branch, index, detail,
|
||||
)
|
||||
if attempt < n_eff:
|
||||
time.sleep(_merge_backoff(attempt))
|
||||
continue
|
||||
return False, f"merge failed after {n_eff} attempts: HTTP {m.status_code}"
|
||||
|
||||
# Unreachable (loop always returns), defensive only.
|
||||
return False, f"merge failed after {n_eff} attempts"
|
||||
except Exception as e: # noqa: BLE001 - never-raise contract
|
||||
logger.warning("merge_pr unexpected error for %s/%s: %s", repo, branch, e)
|
||||
return False, f"merge error: {e}"
|
||||
@@ -841,6 +1036,7 @@ MAIN_REGRESSION_MARKERS: list[tuple[str, str, str]] = [
|
||||
("ORCH-071", "verify_merged_to_main", "src/merge_gate.py"),
|
||||
("ORCH-073", "check_main_regression", "src/merge_gate.py"),
|
||||
("ORCH-082", "ensure_open_pr", "src/merge_gate.py"),
|
||||
("ORCH-093", "_classify_merge_response", "src/merge_gate.py"),
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -1483,6 +1483,7 @@ def _handle_merge_verify(task_id, repo, work_item_id, branch, result: AdvanceRes
|
||||
# `created`/`existed` -> proceed unchanged; `failed` -> honest HOLD with a
|
||||
# distinguishable text (NOT the not-merged HOLD). ORCH-073's SHA-in-main proof
|
||||
# below is untouched and stays authoritative. Kill-switch off -> 1:1 prior path.
|
||||
skip_merge = False
|
||||
if settings.merge_verify_autocreate_pr_enabled:
|
||||
pr_status, pr_detail = merge_gate.ensure_open_pr(repo, branch)
|
||||
logger.info(
|
||||
@@ -1492,10 +1493,25 @@ def _handle_merge_verify(task_id, repo, work_item_id, branch, result: AdvanceRes
|
||||
return _hold_pr_create_failed(
|
||||
task_id, repo, work_item_id, branch, pr_detail, result
|
||||
)
|
||||
if pr_status == "already-in-main":
|
||||
# ORCH-093 (D4): the branch is already fully in `main` -> nothing to
|
||||
# merge and no PR was created. Skip the deterministic merge_pr; the
|
||||
# authoritative SHA-in-main check below confirms the merge -> done.
|
||||
# This is NOT a HOLD (the goal is already achieved); if for some
|
||||
# reason the SHA is not in main the prior not-merged HOLD still fires
|
||||
# (fail-closed, safe).
|
||||
logger.info(
|
||||
f"Task {task_id}: merge-verify already-in-main -> skip merge_pr "
|
||||
"(SHA-in-main authoritative)"
|
||||
)
|
||||
skip_merge = True
|
||||
# "created" | "existed" -> proceed normally to merge_pr.
|
||||
|
||||
# Deterministic merge-actor (no-op if the PR is already merged, INV-5/AC-9).
|
||||
merged_ok, merge_msg = merge_gate.merge_pr(repo, branch)
|
||||
if skip_merge:
|
||||
merged_ok, merge_msg = True, "already-in-main (skipped merge_pr)"
|
||||
else:
|
||||
merged_ok, merge_msg = merge_gate.merge_pr(repo, branch)
|
||||
logger.info(
|
||||
f"Task {task_id}: merge-verify merge_pr -> ok={merged_ok} ({merge_msg})"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user