merge_pr now wraps ONLY the mutating POST /pulls/{n}/merge in a bounded
exponential-backoff retry-loop on TRANSIENT outcomes (405 "try again later",
408, any 5xx, network/timeout, and 409|422 while the PR is still mergeable);
TERMINAL outcomes (403/404/real conflict via mergeable==False) -> fast honest
False, so the ORCH-071/081 not-merged HOLD backstop is unchanged. Fixes the
ORCH-063 false HOLD + manual re-merge on Gitea's post-push mergeability hiccup.
ensure_open_pr gains an "already fully in main" guard (_branch_fully_in_main,
git merge-base --is-ancestor HEAD origin/main) BEFORE creating a PR -> new
"already-in-main" outcome avoids the garbage empty PR on a re-driven finalizer;
_handle_merge_verify skips merge_pr on that outcome and lets the authoritative
SHA-in-main check confirm -> done (not a HOLD). git error of the guard fails
OPEN to the create path.
New ORCH_MERGE_RETRY_* settings (kill-switch merge_retry_enabled -> one-shot,
max_attempts=3, backoff base=2/max=5). INV-4 (merge only via Gitea PR-merge API,
never push/force-push main), never-raise, STAGE_TRANSITIONS/QG_CHECKS/DB schema
unchanged. Docs (README merge-verify section, CLAUDE.md, CHANGELOG, .env.example)
updated in the same PR. Tests: test_merge_gate.py TC-01..12, test_config.py
TC-13, test_merge_verify.py TC-14..16; full suite green (1389).
Refs: ORCH-093
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1124 lines
51 KiB
Python
1124 lines
51 KiB
Python
"""Merge-gate core (ORCH-043): catch a branch up to the CURRENT origin/main,
|
||
re-test it, and serialise merges with a file lease.
|
||
|
||
Background
|
||
----------
|
||
The pipeline validates a branch against the ``main`` it was BRANCHED from, not the
|
||
``main`` at the moment of merge. Between "branch validated" and "branch merged" a
|
||
parallel task may have advanced ``main`` -> a *semantic* merge conflict: git merges
|
||
with no textual conflict, yet the combined ``main`` is broken. For the self-hosting
|
||
``orchestrator`` repo that means a red ``main`` of the tool serving every project.
|
||
|
||
This module provides the deterministic (no-LLM) primitives the quality-gate
|
||
``check_branch_mergeable`` (src/qg/checks.py) composes on the
|
||
``deploy-staging -> deploy`` edge, BEFORE the deployer merges the PR:
|
||
|
||
* ``branch_is_behind_main`` -> is the branch missing the latest origin/main?
|
||
* ``auto_rebase_onto_main`` -> rebase onto origin/main + push --force-with-lease
|
||
(ONLY the task branch; NEVER main).
|
||
* ``retest_branch`` -> run the project test-suite in the caught-up worktree.
|
||
* file lease (``acquire_merge_lease`` / ``release_merge_lease``) -> serialise the
|
||
"catch-up + re-test + merge" of ONE repo, held from the gate to the actual merge.
|
||
|
||
Invariants (self-hosting safety, ТЗ §10):
|
||
* NEVER push or force-push ``main`` — the only force op is ``--force-with-lease``
|
||
on the task branch.
|
||
* All git ops run in the per-branch worktree (ensure_worktree), never the shared clone.
|
||
* Every public function honours a strict **never-raise** contract: any git/OS error
|
||
-> ``(False, "<reason>")`` (or a safe bool), never a propagated exception.
|
||
"""
|
||
|
||
import json
|
||
import logging
|
||
import os
|
||
import subprocess
|
||
import time
|
||
|
||
from .config import settings
|
||
from .git_worktree import ensure_worktree, get_worktree_path
|
||
|
||
logger = logging.getLogger("orchestrator.merge_gate")
|
||
|
||
# git sub-command timeouts (seconds). Generous but bounded so a hung git never
# wedges the monitor-thread that runs the gate.
_FETCH_TIMEOUT = 60  # `git fetch origin main`
_REBASE_TIMEOUT = 120  # `git rebase origin/main`
_PUSH_TIMEOUT = 60  # `git push --force-with-lease origin <branch>`
_SHORT_TIMEOUT = 30  # quick git queries (merge-base, diff, rebase --abort) and the Gitea GET in pr_already_merged
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# behind / ancestor detection
|
||
# ---------------------------------------------------------------------------
|
||
def branch_is_behind_main(repo: str, branch: str) -> bool:
    """Return True iff ``branch`` does NOT already contain the latest origin/main.

    A branch is "behind" when ``origin/main`` is **not** an ancestor of the branch
    HEAD (``git merge-base --is-ancestor origin/main HEAD`` returns non-zero). All
    work happens in the per-branch worktree (ORCH-2 / S-4 isolation).

    Args:
        repo: repository name, forwarded to ``ensure_worktree``.
        branch: task branch whose worktree HEAD is inspected.

    Returns:
        ``False`` only when the fetch succeeded AND merge-base exited 0 (origin/main
        is an ancestor of HEAD). ``True`` in every other case.

    Never-raise (AC-9 / TC-03): any git/OS failure or an ambiguous result is treated
    as "cannot prove the branch is up-to-date" -> return True (force a rebase attempt
    rather than merge blindly). It returns a bool, never raises.
    """
    try:
        wt = ensure_worktree(repo, branch)
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("branch_is_behind_main: worktree error for %s/%s: %s", repo, branch, e)
        return True

    try:
        fetched = subprocess.run(
            ["git", "-C", wt, "fetch", "origin", "main"],
            capture_output=True, timeout=_FETCH_TIMEOUT,
        )
        if fetched.returncode != 0:
            # A failed fetch leaves origin/main where it was, so an "is ancestor"
            # answer would describe an OLD main. That is exactly the "cannot prove
            # up-to-date" case -> report behind instead of trusting a stale ref.
            logger.warning(
                "branch_is_behind_main: fetch failed rc=%s for %s/%s (treating as behind)",
                fetched.returncode, repo, branch,
            )
            return True
        r = subprocess.run(
            ["git", "-C", wt, "merge-base", "--is-ancestor", "origin/main", "HEAD"],
            capture_output=True, timeout=_SHORT_TIMEOUT,
        )
    except (subprocess.SubprocessError, OSError) as e:
        logger.warning("branch_is_behind_main: git error for %s/%s: %s", repo, branch, e)
        return True

    if r.returncode == 0:
        # origin/main IS an ancestor of HEAD -> branch already up-to-date.
        return False
    if r.returncode == 1:
        # origin/main is NOT an ancestor -> branch is behind.
        return True
    # Any other code (e.g. bad ref) -> ambiguous; do not merge blindly.
    logger.warning(
        "branch_is_behind_main: ambiguous merge-base rc=%s for %s/%s (treating as behind)",
        r.returncode, repo, branch,
    )
    return True
|
||
|
||
|
||
def _conflicted_files(wt: str) -> str:
    """Best-effort, comma-separated list of unmerged (conflicting) files in ``wt``.

    Asks git for the paths with the "unmerged" diff filter. Returns the literal
    ``"unknown"`` when git produces no output or cannot be run at all.
    """
    command = ["git", "-C", wt, "diff", "--name-only", "--diff-filter=U"]
    try:
        proc = subprocess.run(
            command, capture_output=True, text=True, timeout=_SHORT_TIMEOUT
        )
    except (subprocess.SubprocessError, OSError):
        return "unknown"
    # One path per output line -> a single "a, b, c" string for the reason text.
    listing = ", ".join(proc.stdout.strip().split("\n"))
    return listing if listing else "unknown"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# auto-rebase onto origin/main
|
||
# ---------------------------------------------------------------------------
|
||
def auto_rebase_onto_main(repo: str, branch: str) -> tuple[bool, str]:
    """Catch ``branch`` up to ``origin/main`` via rebase, then push it.

    Steps (all in the per-branch worktree):
      1. ``git fetch origin main``. A non-zero exit aborts with
         ``(False, "fetch origin main failed: <detail>")`` — rebasing onto a stale
         ``origin/main`` would be reported as a successful catch-up while the
         branch is still behind.
      2. ``git rebase origin/main``:
         - non-zero exit -> ``git rebase --abort`` (leave worktree clean) ->
           ``(False, "rebase conflict: <files>")`` (AC-3).
      3. clean rebase -> ``git push --force-with-lease origin <branch>`` — ONLY the
         task branch, NEVER ``main`` (AC-7) -> ``(True, "rebased onto origin/main")``.
         A failed push -> ``(False, "push --force-with-lease failed: <detail>")``.

    Args:
        repo: repository name, forwarded to ``ensure_worktree``.
        branch: task branch to rebase and push.

    Returns:
        ``(ok, reason)``; ``ok`` is True only after a successful push.

    Never-raise (AC-9): any git/OS error -> ``(False, "<reason>")``.
    """
    try:
        wt = ensure_worktree(repo, branch)
    except Exception as e:  # noqa: BLE001 - never-raise contract
        return False, f"rebase setup error: {e}"

    try:
        fetched = subprocess.run(
            ["git", "-C", wt, "fetch", "origin", "main"],
            capture_output=True, timeout=_FETCH_TIMEOUT,
        )
        if fetched.returncode != 0:
            # Output is captured as bytes here; decode leniently so odd bytes in
            # git's message can never turn into an exception.
            raw = fetched.stderr or fetched.stdout or b""
            detail = raw.decode("utf-8", "replace") if isinstance(raw, bytes) else str(raw)
            detail = detail.strip()[:200]
            logger.warning("auto_rebase: fetch failed on %s/%s: %s", repo, branch, detail)
            return False, f"fetch origin main failed: {detail}"

        r = subprocess.run(
            ["git", "-C", wt, "rebase", "origin/main"],
            capture_output=True, text=True, timeout=_REBASE_TIMEOUT,
        )
        if r.returncode != 0:
            # Collect the conflicting paths BEFORE aborting (abort clears them).
            files = _conflicted_files(wt)
            subprocess.run(
                ["git", "-C", wt, "rebase", "--abort"],
                capture_output=True, timeout=_SHORT_TIMEOUT,
            )
            logger.warning("auto_rebase: conflict on %s/%s: %s", repo, branch, files)
            return False, f"rebase conflict: {files}"

        # Clean rebase -> push ONLY the task branch with a lease (never main).
        p = subprocess.run(
            ["git", "-C", wt, "push", "--force-with-lease", "origin", branch],
            capture_output=True, text=True, timeout=_PUSH_TIMEOUT,
        )
        if p.returncode != 0:
            detail = (p.stderr or p.stdout or "").strip()[:200]
            logger.warning("auto_rebase: push failed on %s/%s: %s", repo, branch, detail)
            return False, f"push --force-with-lease failed: {detail}"

        logger.info("auto_rebase: %s/%s rebased onto origin/main and pushed", repo, branch)
        return True, "rebased onto origin/main"
    except subprocess.TimeoutExpired:
        # Leave no half-finished rebase behind.
        try:
            subprocess.run(
                ["git", "-C", wt, "rebase", "--abort"],
                capture_output=True, timeout=_SHORT_TIMEOUT,
            )
        except (subprocess.SubprocessError, OSError):
            pass
        return False, "rebase timeout"
    except (subprocess.SubprocessError, OSError) as e:
        return False, f"rebase error: {e}"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# re-test in the caught-up worktree
|
||
# ---------------------------------------------------------------------------
|
||
def retest_branch(repo: str, branch: str) -> tuple[bool, str]:
    """Run the project test-suite in the (already caught-up) branch worktree.

    Command: ``python -m pytest <merge_retest_target> -q`` (default target
    ``tests/``), matching the orchestrator CI / check_tests_local pattern. Bounded
    by ``settings.merge_retest_timeout_s``.

    Args:
        repo: repository name.
        branch: task branch whose worktree is tested.

    Returns:
      * ``(True, "re-test green")`` — pytest rc == 0
      * ``(False, "re-test timeout after <T>s")`` — exceeded the timeout (AC-6)
      * ``(False, "re-test failed: ...<tail>")`` — non-zero rc, with the last 500
        characters of stdout+stderr
      * ``(False, "re-test setup error: <e>")`` — worktree/settings lookup failed
      * ``(False, "re-test error: <e>")`` — pytest could not be started

    Never-raise (AC-9): any setup/OS error -> ``(False, "<reason>")``.
    """
    try:
        # The whole setup phase is guarded so a failing path lookup or settings
        # read cannot escape the never-raise contract.
        wt = get_worktree_path(repo, branch)
        if not os.path.isdir(wt):
            # Caller usually rebased first (worktree exists); ensure as a fallback.
            wt = ensure_worktree(repo, branch)
        target = settings.merge_retest_target or "tests/"
        timeout = settings.merge_retest_timeout_s
    except Exception as e:  # noqa: BLE001 - never-raise contract
        return False, f"re-test setup error: {e}"

    try:
        r = subprocess.run(
            ["python", "-m", "pytest", target, "-q"],
            cwd=wt, capture_output=True, text=True, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        logger.warning("retest_branch: timeout (%ss) on %s/%s", timeout, repo, branch)
        return False, f"re-test timeout after {timeout}s"
    except (subprocess.SubprocessError, OSError) as e:
        return False, f"re-test error: {e}"

    if r.returncode == 0:
        return True, "re-test green"
    tail = ((r.stdout or "") + (r.stderr or ""))[-500:]
    logger.warning("retest_branch: red on %s/%s", repo, branch)
    return False, f"re-test failed: ...{tail}"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# merge-lease (serialise catch-up + re-test + merge per repo)
|
||
# ---------------------------------------------------------------------------
|
||
def _lease_path(repo: str) -> str:
    """Return the lease-file path for ``repo`` inside ``settings.repos_dir``.

    The lease is a plain JSON file named ``.merge-lease-<repo>.json`` (no DB
    schema change, spec §4).
    """
    root = settings.repos_dir
    lease_name = ".merge-lease-{}.json".format(repo)
    return os.path.join(root, lease_name)
|
||
|
||
|
||
def _read_lease(path: str) -> dict | None:
|
||
"""Read+parse the lease file; None if missing or corrupt (never-raise)."""
|
||
try:
|
||
with open(path, "r", encoding="utf-8") as f:
|
||
return json.loads(f.read())
|
||
except FileNotFoundError:
|
||
return None
|
||
except (OSError, ValueError) as e:
|
||
logger.warning("merge-lease read error at %s: %s", path, e)
|
||
return None
|
||
|
||
|
||
def _write_lease(path: str, holder: dict) -> None:
    """Create the lease file exclusively and store ``holder`` in it as JSON.

    The file is opened with ``O_CREAT | O_EXCL``, so creation fails with
    ``FileExistsError`` when a lease file is already present — that error is the
    "lease is held" signal for the caller. The descriptor is always closed.
    """
    flags = os.O_WRONLY | os.O_EXCL | os.O_CREAT
    descriptor = os.open(path, flags, 0o644)
    # Serialisation happens after the exclusive create, inside the managed block,
    # so the descriptor is closed on every exit path.
    with os.fdopen(descriptor, "wb") as sink:
        sink.write(json.dumps(holder).encode("utf-8"))
|
||
|
||
|
||
def acquire_merge_lease(
    repo: str, branch: str, work_item_id: str | None = None, task_id: int | None = None
) -> tuple[bool, str]:
    """Try to acquire the per-repo merge lease. **Non-blocking** (anti-deadlock).

    Holder identity is the task ``branch`` (stable, one branch per task). Outcomes:
      * no lease file -> acquire, write metadata -> ``(True, "lease acquired")``
      * lease held by self -> idempotent re-acquire (restart/retry) ->
        ``(True, "lease already held")``
      * lease unreadable, not a JSON object, or carrying an unparsable
        ``acquired_at`` -> corrupt -> reclaim -> ``(True, "lease reclaimed (corrupt)")``
      * lease held by other, age < merge_lock_timeout_s -> ``(False, "merge-lock busy")``
      * lease held by other, age >= merge_lock_timeout_s -> stale -> reclaim with a
        ``logger.warning`` (the holder process died without releasing) ->
        ``(True, "lease reclaimed (stale)")``

    A reclaim is confirmed by reading the lease back: ``_force_write_lease`` is
    best-effort and swallows write errors, so success is reported only when the
    file on disk names ``branch`` as the holder. Otherwise the caller gets
    ``(False, "merge-lock busy")`` and defers.

    Args:
        repo: repository whose merge is being serialised.
        branch: task branch acquiring the lease (the holder identity).
        work_item_id: optional metadata stored in the lease.
        task_id: optional metadata stored in the lease.

    Never-raise: any unexpected error -> ``(False, "merge-lock busy")`` so the caller
    DEFERS and retries rather than burning a developer retry on an infra hiccup.
    """
    try:
        path = _lease_path(repo)
        holder = {
            "branch": branch,
            "work_item_id": work_item_id,
            "task_id": task_id,
            "acquired_at": time.time(),
            "pid": os.getpid(),
        }
        try:
            _write_lease(path, holder)
        except FileExistsError:
            pass  # a lease file exists -> inspect it below
        else:
            logger.info("merge-lease acquired for %s by %s", repo, branch)
            return True, "lease acquired"

        existing = _read_lease(path)
        corrupt = not isinstance(existing, dict)
        age = 0.0
        if not corrupt:
            if existing.get("branch") == branch:
                return True, "lease already held"
            try:
                age = time.time() - float(existing.get("acquired_at") or 0)
            except (TypeError, ValueError):
                # An unparsable timestamp can never age out; without this branch
                # such a lease would answer "busy" forever. Treat it as corrupt.
                corrupt = True

        if not corrupt and age < settings.merge_lock_timeout_s:
            logger.info(
                "merge-lease for %s busy (held by %s, age %.0fs); %s defers",
                repo, existing.get("branch"), age, branch,
            )
            return False, "merge-lock busy"

        # Corrupt or stale -> take the lease over, then verify the takeover.
        _force_write_lease(path, holder)
        confirmed = _read_lease(path)
        if not isinstance(confirmed, dict) or confirmed.get("branch") != branch:
            logger.warning(
                "merge-lease reclaim for %s by %s did not take effect; deferring",
                repo, branch,
            )
            return False, "merge-lock busy"

        if corrupt:
            logger.warning("merge-lease for %s was corrupt; reclaimed by %s", repo, branch)
            return True, "lease reclaimed (corrupt)"
        logger.warning(
            "merge-lease for %s was stale (age %.0fs >= %ss, holder=%s); reclaimed by %s",
            repo, age, settings.merge_lock_timeout_s, existing.get("branch"), branch,
        )
        return True, "lease reclaimed (stale)"
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("acquire_merge_lease unexpected error for %s/%s: %s", repo, branch, e)
        return False, "merge-lock busy"
|
||
|
||
|
||
def _force_write_lease(path: str, holder: dict) -> None:
    """Overwrite the lease at ``path`` with ``holder`` (stale/corrupt reclaim).

    Best-effort: an ``OSError`` is logged and swallowed, never raised.

    The new content is written to a sibling temp file and moved into place with
    ``os.replace``. Truncating the live file and then writing would let a
    concurrent reader observe an empty or partial lease, which readers treat as
    corrupt. With the replace, a reader sees the old or the new lease, never a
    half-written one.
    """
    payload = json.dumps(holder)
    # pid + nanosecond clock keeps the temp name unique across processes and
    # across threads of one process.
    tmp_path = f"{path}.{os.getpid()}.{time.time_ns()}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(payload)
        os.replace(tmp_path, path)
    except OSError as e:
        logger.warning("merge-lease force-write error at %s: %s", path, e)
        try:
            os.remove(tmp_path)  # do not leave the temp file behind
        except OSError:
            pass
|
||
|
||
|
||
def release_merge_lease(repo: str, branch: str | None = None) -> None:
    """Release the per-repo merge lease. **Idempotent** and **holder-aware**.

    If ``branch`` is given, the lease is removed ONLY when the current holder's
    branch matches (so a delayed release from an already-merged task can never
    delete a lease a DIFFERENT task acquired afterwards). A lease that cannot be
    read as a JSON object has no identifiable holder and is removed. With
    ``branch=None`` the release is unconditional (best-effort backstop).

    Args:
        repo: repository whose lease is released.
        branch: expected holder, or ``None`` to release regardless of holder.

    Never raises: a missing file is a silent no-op; every other error (path
    lookup, malformed lease, remove failure) is logged and swallowed.
    """
    try:
        path = _lease_path(repo)
        if branch is not None:
            existing = _read_lease(path)
            # isinstance guard: a non-dict payload must not blow up on .get().
            if isinstance(existing, dict) and existing.get("branch") != branch:
                logger.info(
                    "merge-lease release skipped for %s: holder=%s != %s",
                    repo, existing.get("branch"), branch,
                )
                return
        os.remove(path)
        logger.info("merge-lease released for %s (%s)", repo, branch or "force")
    except FileNotFoundError:
        return
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("merge-lease release error for %s: %s", repo, e)
|
||
|
||
|
||
def current_lease_holder(repo: str) -> str | None:
    """ORCH-090: return the branch recorded in ``repo``'s merge-lease, if any.

    Read-only helper used by ``cancel.in_critical_window`` to decide whether a STOP
    must be DEFERRED (the task is mid-merge). Yields ``None`` when there is no
    readable lease or when anything goes wrong; it never raises (the caller treats
    an error as fail-CLOSED itself).
    """
    try:
        lease = _read_lease(_lease_path(repo))
        if not lease:
            return None
        return lease.get("branch")
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("current_lease_holder error for %s: %s", repo, e)
        return None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# ORCH-065: proactive stale/dead merge-lease reclaim (Problem B)
|
||
# ---------------------------------------------------------------------------
|
||
def pid_alive(pid) -> bool:
    """Return True iff process ``pid`` is alive (``os.kill(pid, 0)`` liveness probe).

    Semantics (ADR-001 Р-2, never-raise):
      * ``ProcessLookupError`` -> the process is gone -> ``False`` (reclaimable).
      * ``PermissionError`` -> the pid exists but is owned by another user ->
        ``True`` (alive; conservatively do NOT reclaim).
      * missing / invalid pid -> ``True`` (conservative: a lease that predates the
        pid field, or a malformed pid, is NOT reclaimed on the liveness signal —
        the TTL backstop still catches it). "Invalid" includes non-numeric values,
        zero/negative numbers (``os.kill`` would address a process GROUP, not a
        process) and values too large to be a pid.

    Args:
        pid: value read from the lease file; any type is tolerated.

    Never raises; any unexpected OS/type/overflow error -> conservative ``True``.
    """
    if not pid:
        return True
    try:
        target = int(pid)
        if target <= 0:
            # Not a single-process id -> invalid -> conservative "alive".
            return True
        os.kill(target, 0)  # signal 0: existence/permission check only
    except ProcessLookupError:
        return False
    except PermissionError:
        return True
    except (OSError, ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")) or a pid beyond the platform's range.
        return True
    return True
|
||
|
||
|
||
def _lease_reclaim_applies(repo: str) -> bool:
    """Tell whether proactive lease-reclaim is REAL for ``repo``.

    Delegates to ``qg.checks._merge_gate_applies`` so reclaim and the merge-gate
    share one predicate (ADR-001 Р-2 / FR-2.4). The import is done lazily, inside
    the function, to avoid an import cycle. Never raises: any error (including a
    failed import) is logged and answered with ``False`` (no-op, the safe default).
    """
    try:
        from .qg.checks import _merge_gate_applies as gate_applies
        verdict = gate_applies(repo)
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("lease-reclaim applicability check failed for %s: %s", repo, e)
        return False
    return verdict
|
||
|
||
|
||
def reclaim_stale_lease(repo: str) -> bool:
    """Proactively reclaim a dead/stale merge-lease for ``repo`` (ADR-001 Р-2).

    Unlike the lazy TTL reclaim inside ``acquire_merge_lease`` (which only fires
    when ANOTHER task tries to acquire), this releases the lease as soon as the
    holder is provably gone — without waiting for a foreign acquire:

      * holder pid is dead (``pid_alive`` is False) -> reclaim, OR
      * lease age >= ``merge_lock_timeout_s`` (TTL) -> reclaim (AC-7). A missing
        or unparsable ``acquired_at`` counts as expired, so a lease with a broken
        timestamp cannot stay unreclaimable forever.

    A LIVE holder within its TTL is never touched (AC-8 — protects a legitimate
    in-flight merge). Reclaim is holder-aware (``release_merge_lease(repo,
    branch=holder)``) so it can never delete a lease a different task acquired in
    the meantime. Conditional (FR-2.4): real only where ``_lease_reclaim_applies``
    says so; other repos -> no-op. Kill-switch ``lease_reclaim_enabled``.

    Returns:
        True iff the lease was actually reclaimed. ``release_merge_lease`` reports
        nothing, so the lease is read back: if it is still identical to the one
        inspected, the removal did not happen -> False and no alert is sent.

    Never raises (AC-9): any read/remove error is logged and swallowed so a single
    bad lease never kills the reaper thread. Does NOT run any git operation — only
    the lease file is removed.
    """
    try:
        if not settings.lease_reclaim_enabled:
            return False
        if not _lease_reclaim_applies(repo):
            return False
        path = _lease_path(repo)
        existing = _read_lease(path)
        if not isinstance(existing, dict):
            return False  # no lease, or unreadable/malformed (nothing to identify)
        holder = existing.get("branch")
        pid = existing.get("pid")
        try:
            age = time.time() - float(existing.get("acquired_at") or 0)
        except (TypeError, ValueError):
            age = float("inf")  # broken timestamp -> treat like a missing one
        dead = not pid_alive(pid)
        expired = age >= settings.merge_lock_timeout_s
        if not (dead or expired):
            return False  # live holder within TTL -> protect legitimate merge
        why = f"dead pid={pid}" if dead else f"stale age={age:.0f}s>=TTL"
        release_merge_lease(repo, branch=holder)
        if _read_lease(path) == existing:
            # Same lease still on disk -> the release did not remove it.
            logger.warning(
                "merge-lease for %s could not be reclaimed (%s, holder=%s)",
                repo, why, holder,
            )
            return False
        logger.warning(
            "merge-lease for %s reclaimed proactively (%s, holder=%s)",
            repo, why, holder,
        )
        try:
            from .notifications import send_telegram
            send_telegram(
                f"\U0001f527 merge-lease для {repo} освобождён проактивно "
                f"({why}, holder={holder})"
            )
        except Exception as e:  # noqa: BLE001 - telegram best-effort, never fatal
            logger.warning("lease-reclaim telegram failed for %s: %s", repo, e)
        return True
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("reclaim_stale_lease unexpected error for %s: %s", repo, e)
        return False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# ORCH-065: idempotent merge finalization guard (Problem C)
|
||
# ---------------------------------------------------------------------------
|
||
def pr_already_merged(repo: str, branch: str) -> bool:
    """Return True iff the **code-PR of ``branch``** is ALREADY merged (idempotency-guard).

    ORCH-073 ADR-001 Р-2 (FR-2): this is an **idempotency-guard for ``merge_pr``**, NOT
    a source of truth for ``done`` (the only proof of merge is SHA-in-main, FR-1 /
    ``verify_merged_to_main``). It lets a re-driven / reaped ``merge_pr`` be idempotent:
    the code-PR is already merged -> no-op, never a duplicate merge.

    Root-cause fix (G4 audit): the previous implementation returned True for ANY
    ``merged == True`` PR returned by ``GET /pulls?state=all&head=<branch>``. Gitea's
    ``head`` query-param filters unreliably for a bare branch name, so auto docs-PRs
    leaked into the result and were counted as "merged" — the ORCH-067/069
    phantom-merge. An EXPLICIT in-loop filter is applied instead of trusting the
    query-param; a PR counts only when:

      * ``pr.merged is True`` AND
      * ``pr.head.ref == branch`` (the code of exactly this feature-branch) AND
      * ``pr.base.ref == "main"`` (target is main, not a docs/other base).

    Entries are checked independently: a list item that is not an object, or whose
    ``head``/``base`` is null or not an object, is skipped instead of aborting the
    scan, so one malformed entry cannot hide a matching merged PR further down.

    Args:
        repo: repository name (owner comes from ``settings.gitea_owner``).
        branch: feature branch whose code-PR is looked up.

    Never raises (AC-9): any HTTP/parse error -> ``False`` (conservative: "not
    known-merged" lets the normal gate re-evaluate rather than silently skipping a
    real merge).
    """
    try:
        import httpx
        owner = settings.gitea_owner
        headers = {"Authorization": f"token {settings.gitea_token}"}
        # NOTE(review): only the first page of results is inspected — confirm the
        # Gitea page size is large enough for repos with many PRs.
        resp = httpx.get(
            f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}/pulls",
            params={"state": "all", "head": branch},
            headers=headers, timeout=_SHORT_TIMEOUT,
        )
        if resp.status_code != 200:
            return False
        payload = resp.json()
        if not isinstance(payload, list):
            return False  # error object / null instead of a PR list
        for pr in payload:
            if not isinstance(pr, dict) or pr.get("merged") is not True:
                continue
            # `or {}` also covers an explicit JSON null, which .get(key, {}) does not.
            head = pr.get("head") or {}
            base = pr.get("base") or {}
            if not (isinstance(head, dict) and isinstance(base, dict)):
                continue
            if head.get("ref") == branch and base.get("ref") == "main":
                return True
        return False
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("pr_already_merged check failed for %s/%s: %s", repo, branch, e)
        return False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# ORCH-071: deterministic merge-actor + post-deploy merge verification.
|
||
#
|
||
# For the self-hosting repo the `deploy` stage runs the deterministic self-deploy
|
||
# path (Phase A/B/C) and the LLM `deployer` agent — historically the ONLY actor
|
||
# that merged the feature PR into `main` — never runs. These two helpers close the
|
||
# "phantom merge" gap (LESSONS_2026-06-08): a deterministic actor merges the PR via
|
||
# the Gitea PR-merge API (NEVER a push/force-push to main, INV-4) and a verifier
|
||
# confirms `main` actually received the commit before the pipeline reaches `done`.
|
||
# Both wire into the `deploy -> done` under-gate (stage_engine._handle_merge_verify).
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Lightweight in-process observability counters (D8). Reset only on process start;
# surfaced read-only via `merge_verify_status()` in GET /queue. Never the source of
# truth for any decision — purely informational.
_MERGE_VERIFY_COUNTERS: dict = {
    "merge_verified_total": 0,  # incremented by note_merge_verified()
    "not_merged_alerts_total": 0,  # incremented by note_not_merged_alert()
    "main_regressed_alerts_total": 0,  # ORCH-073 Р-4: regression-guard HOLD+alert count.
    "last_alert_wi": None,  # work_item_id given to the most recent note_*_alert() call
}
|
||
|
||
|
||
def note_merge_verified() -> None:
    """Add one to the ``merge_verified_total`` counter (observability only).

    Any failure while touching the counter dict is swallowed; this never raises.
    """
    try:
        seen = _MERGE_VERIFY_COUNTERS["merge_verified_total"]
        _MERGE_VERIFY_COUNTERS["merge_verified_total"] = seen + 1
    except Exception:  # noqa: BLE001 - observability must never break a decision
        return
|
||
|
||
|
||
def note_not_merged_alert(work_item_id: str | None) -> None:
|
||
"""Bump the 'deploy succeeded but not merged' counter. Never raises."""
|
||
try:
|
||
_MERGE_VERIFY_COUNTERS["not_merged_alerts_total"] += 1
|
||
_MERGE_VERIFY_COUNTERS["last_alert_wi"] = work_item_id
|
||
except Exception: # noqa: BLE001 - observability must never break a decision
|
||
pass
|
||
|
||
|
||
def note_main_regressed_alert(work_item_id: str | None) -> None:
|
||
"""Bump the 'main regressed (marker missing)' counter (ORCH-073 Р-4). Never raises."""
|
||
try:
|
||
_MERGE_VERIFY_COUNTERS["main_regressed_alerts_total"] += 1
|
||
_MERGE_VERIFY_COUNTERS["last_alert_wi"] = work_item_id
|
||
except Exception: # noqa: BLE001 - observability must never break a decision
|
||
pass
|
||
|
||
|
||
def merge_verify_status() -> dict:
    """Build the merge-verify snapshot shown in GET /queue.

    Combines the two relevant settings (``merge_verify_enabled`` as a bool,
    ``merge_verify_repos`` or ``""``) with the current values of the in-process
    counters. Never raises: on any error the fallback ``{"enabled": False}`` is
    returned after logging a warning.
    """
    counter_keys = (
        "merge_verified_total",
        "not_merged_alerts_total",
        "main_regressed_alerts_total",
        "last_alert_wi",
    )
    try:
        snapshot = {
            "enabled": bool(settings.merge_verify_enabled),
            "repos": settings.merge_verify_repos or "",
        }
        for key in counter_keys:
            snapshot[key] = _MERGE_VERIFY_COUNTERS[key]
        return snapshot
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("merge_verify_status error: %s", e)
        return {"enabled": False}
|
||
|
||
|
||
def merge_verify_applies(repo: str) -> bool:
    """Decide whether the ORCH-071 merge-verify under-gate is REAL for ``repo``.

    Rules (FR-5 / AC-10):
      * ``merge_verify_enabled`` falsy -> always False (global kill-switch).
      * ``merge_verify_repos`` (CSV) non-empty -> True only for listed repos;
        comparison ignores case and surrounding whitespace, empty entries are
        dropped.
      * empty CSV -> whatever ``qg.checks.is_self_hosting_repo(repo)`` answers
        (imported lazily); other repos keep the LLM-``deployer`` merge path
        unchanged (AC-4b).

    Never raises (any error -> False = no-op, the safe default).
    """
    try:
        if not settings.merge_verify_enabled:
            return False
        csv = (settings.merge_verify_repos or "").strip()
        if not csv:
            # Lazy import keeps this a leaf-ish module (qg.checks imports merge_gate lazily).
            from .qg.checks import is_self_hosting_repo
            return is_self_hosting_repo(repo)
        listed = set()
        for entry in csv.split(","):
            name = entry.strip()
            if name:
                listed.add(name.lower())
        wanted = (repo or "").strip().lower()
        return wanted in listed
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("merge_verify_applies error for %s: %s", repo, e)
        return False
|
||
|
||
|
||
def _branch_fully_in_main(repo: str, branch: str) -> bool | None:
    """Tell whether ``branch`` has NO commits beyond ``origin/main`` (ORCH-093 D3).

    ``ensure_open_pr`` uses this to avoid opening an empty PR for a branch that is
    already fully merged into ``main``. In the per-branch worktree it runs
    ``git fetch origin main`` and then
    ``git merge-base --is-ancestor HEAD origin/main``.

    Returns:
      * ``True``  — exit code 0: HEAD is an ancestor of origin/main.
      * ``False`` — exit code 1: the branch has commits beyond main.
      * ``None``  — worktree/git/OS error or any other exit code; the caller
        fail-OPENs (degrades to the create path) so an infra hiccup never becomes
        a false no-op merge.

    Never-raise: any error -> ``None``.
    """
    try:
        wt = ensure_worktree(repo, branch)
    except Exception as e:  # noqa: BLE001 - never-raise contract -> fail-OPEN
        logger.warning("_branch_fully_in_main: worktree error for %s/%s: %s", repo, branch, e)
        return None

    fetch_cmd = ["git", "-C", wt, "fetch", "origin", "main"]
    ancestor_cmd = ["git", "-C", wt, "merge-base", "--is-ancestor", "HEAD", "origin/main"]
    try:
        subprocess.run(fetch_cmd, capture_output=True, timeout=_FETCH_TIMEOUT)
        probe = subprocess.run(ancestor_cmd, capture_output=True, timeout=_SHORT_TIMEOUT)
    except (subprocess.SubprocessError, OSError) as e:
        logger.warning("_branch_fully_in_main: git error for %s/%s: %s", repo, branch, e)
        return None

    code = probe.returncode
    if code == 0 or code == 1:
        # 0 -> fully in main, 1 -> has commits beyond main.
        return code == 0
    logger.warning(
        "_branch_fully_in_main: ambiguous merge-base rc=%s for %s/%s (fail-open)",
        code, repo, branch,
    )
    return None
|
||
|
||
|
||
def ensure_open_pr(repo: str, branch: str) -> tuple[str, str]:
|
||
"""Guarantee an open **code-PR** (``head==branch`` AND ``base=="main"``) exists.
|
||
|
||
ORCH-082 (ADR-001 Р-1 / FR-1): the idempotent leaf-actor that closes the missing
|
||
invariant "by merge-verify time the branch has an open code-PR". The pipeline used
|
||
to create a PR ONLY on the developer path with a fresh worktree commit
|
||
(``launcher._ensure_pr``), so a branch could reach the ``deploy -> done`` merge-verify
|
||
under-gate with no open code-PR -> ``merge_pr`` returned ``"no open PR"`` -> a FALSE
|
||
HOLD (the ORCH-074 incident). This actor creates/finds the code-PR ДО the
|
||
deterministic ``merge_pr``; ORCH-073's SHA-in-main proof stays authoritative.
|
||
|
||
Algorithm (FR-1):
|
||
1. ``GET …/pulls?state=open`` -> a PR with **``head.ref==branch`` AND
|
||
``base.ref=="main"``**. The filter is **identical** to ``merge_pr``/ORCH-073
|
||
FR-3 so both actors agree on exactly the same PR — an auto docs-PR
|
||
(``base != main``) is NOT a code-PR (AC-6). Found -> ``("existed", "<number>")``.
|
||
2. Otherwise ``POST …/pulls`` (``head=branch``, ``base=main``, auto title/body) ->
|
||
``201`` -> ``("created", "<number>")``.
|
||
3. Idempotency on a race: a ``POST`` that fails because the PR already exists
|
||
(Gitea ``409``/``422``) -> a repeat ``GET`` (step 1) confirms the existing PR ->
|
||
``("existed", …)``; no duplicate is created (AC-2 / FR-5).
|
||
4. Any other HTTP/parse/network error -> ``("failed", "<reason>")``.
|
||
|
||
ORCH-093 (D3) adds a guard BETWEEN steps 1 and 2: if the branch is already fully
|
||
in ``main`` (no commits beyond ``origin/main``) there is nothing to PR -> the new
|
||
outcome ``("already-in-main", "<reason>")`` is returned WITHOUT a ``POST`` (avoids
|
||
an empty garbage PR on a re-driven finalizer). A git error of the guard fails OPEN
|
||
(degrade to the create path) so an infra hiccup never becomes a false no-op.
|
||
|
||
Reuses ``settings.merge_pr_timeout_s`` (same class of Gitea calls as ``merge_pr``).
|
||
Never-raise (AC-7): any unexpected error -> ``("failed", str(e))``; the exception is
|
||
NEVER propagated into ``_handle_merge_verify`` / ``advance_stage``.
|
||
"""
|
||
try:
|
||
import httpx
|
||
owner = settings.gitea_owner
|
||
headers = {"Authorization": f"token {settings.gitea_token}"}
|
||
base = f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}"
|
||
timeout = settings.merge_pr_timeout_s
|
||
|
||
def _find_open_code_pr() -> int | None:
|
||
"""GET open PRs; return the code-PR number (head==branch AND base==main)."""
|
||
resp = httpx.get(
|
||
f"{base}/pulls", params={"state": "open"}, headers=headers, timeout=timeout
|
||
)
|
||
if resp.status_code != 200:
|
||
return None
|
||
for pr in resp.json() or []:
|
||
if (
|
||
pr.get("head", {}).get("ref") == branch
|
||
and pr.get("base", {}).get("ref") == "main"
|
||
):
|
||
return pr.get("number")
|
||
return None
|
||
|
||
# Step 1: an open code-PR already exists -> existed (no duplicate POST).
|
||
existing = _find_open_code_pr()
|
||
if existing is not None:
|
||
logger.info("ensure_open_pr: %s/%s already has open code-PR #%s", repo, branch, existing)
|
||
return "existed", str(existing)
|
||
|
||
# Step 1b (ORCH-093 D3): guard "branch already fully in main". If the branch
|
||
# has no commits beyond origin/main there is nothing to PR — creating one
|
||
# would yield an empty garbage PR (the ORCH-063 symptom on a re-driven
|
||
# finalizer after a manual merge). Return the new "already-in-main" outcome
|
||
# so _handle_merge_verify skips merge_pr and lets the authoritative
|
||
# SHA-in-main check confirm -> done. fail-OPEN on git error / ambiguous
|
||
# (None): degrade to the create path below, NEVER block — an infra hiccup
|
||
# must not become a false no-op merge (SHA-in-main downstream stays the proof).
|
||
if _branch_fully_in_main(repo, branch) is True:
|
||
logger.info(
|
||
"ensure_open_pr: %s/%s already fully in main -> already-in-main (no PR created)",
|
||
repo, branch,
|
||
)
|
||
return "already-in-main", "branch already in main (no commits beyond origin/main)"
|
||
|
||
# Step 2: create the code-PR onto main.
|
||
parts = branch.split("/")
|
||
title = parts[-1] if parts else branch
|
||
m = httpx.post(
|
||
f"{base}/pulls",
|
||
json={
|
||
"title": f"feat: {title}",
|
||
"head": branch,
|
||
"base": "main",
|
||
"body": f"Auto-created by orchestrator merge-verify for {branch}",
|
||
},
|
||
headers=headers,
|
||
timeout=timeout,
|
||
)
|
||
if m.status_code in (200, 201):
|
||
number = (m.json() or {}).get("number")
|
||
logger.info("ensure_open_pr: created PR #%s for %s/%s", number, repo, branch)
|
||
return "created", str(number)
|
||
|
||
# Step 3: race / already-exists (409 conflict, 422 unprocessable) -> re-GET.
|
||
if m.status_code in (409, 422):
|
||
again = _find_open_code_pr()
|
||
if again is not None:
|
||
logger.info(
|
||
"ensure_open_pr: %s/%s PR already existed on retry (#%s, HTTP %s)",
|
||
repo, branch, again, m.status_code,
|
||
)
|
||
return "existed", str(again)
|
||
|
||
detail = (m.text or "").strip()[:200]
|
||
logger.warning(
|
||
"ensure_open_pr: create failed for %s/%s: HTTP %s %s",
|
||
repo, branch, m.status_code, detail,
|
||
)
|
||
return "failed", f"create PR failed: HTTP {m.status_code}"
|
||
except Exception as e: # noqa: BLE001 - never-raise contract (AC-7)
|
||
logger.warning("ensure_open_pr unexpected error for %s/%s: %s", repo, branch, e)
|
||
return "failed", f"ensure_open_pr error: {e}"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# ORCH-093: transient-error retry of the merge POST + classification helpers.
|
||
# ---------------------------------------------------------------------------
|
||
def _merge_backoff(attempt: int) -> float:
    """Return the delay (s) to sleep after failed merge-POST ``attempt`` (ORCH-093 D1).

    ``backoff(i) = min(base * 2**(i-1), max)`` — the transient-breaker idiom of the
    Claude agents, bounded so the total sleep ``(N-1) * max`` can never wedge the
    monitor-thread running merge-verify (NFR-4). Defaults base=2, max=5 -> the
    sequence is 2, 4, 5, 5, … seconds.

    Args:
        attempt: 1-based number of the attempt that just failed; anything below 1
            is treated as 1.

    Returns:
        The delay in seconds, never negative: ``time.sleep`` rejects a negative
        value with ``ValueError``, which would abort the caller's retry loop.

    Never-raise: both settings are read inside the guarded region, so a missing or
    non-numeric value degrades to the ceiling when that is usable, else to ``0.0``.
    """
    cap = 0.0  # fallback until a usable ceiling has been read
    try:
        cap = max(float(settings.merge_retry_backoff_max_s), 0.0)
        base = float(settings.merge_retry_backoff_base_s)
        delay = min(base * (2 ** (max(attempt, 1) - 1)), cap)
        # Clamp: a negative base must not turn into a negative sleep.
        return max(float(delay), 0.0)
    except Exception:  # noqa: BLE001 - never-raise; degrade to the ceiling
        return cap
|
||
|
||
|
||
def _pr_mergeable(repo: str, index) -> bool | None:
    """Fetch the ``mergeable`` flag of PR ``index`` via ``GET /pulls/{index}`` (ORCH-093 D2).

    Exists solely to disambiguate a ``409``/``422`` answer to the merge POST: right
    after a push Gitea may still be recomputing mergeability (the ORCH-063 root
    cause).

    Args:
        repo: repository name under ``settings.gitea_owner``.
        index: PR number to inspect.

    Returns:
        The boolean ``mergeable`` value, or ``None`` when the GET is not ``200``,
        the field is absent / not a boolean, or any error occurs (never-raise). The
        caller treats ``None`` as the default-policy transient (D2).
    """
    try:
        import httpx

        owner = settings.gitea_owner
        auth = {"Authorization": f"token {settings.gitea_token}"}
        url = f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}/pulls/{index}"
        reply = httpx.get(url, headers=auth, timeout=settings.merge_pr_timeout_s)
        if reply.status_code == 200:
            flag = (reply.json() or {}).get("mergeable")
            # Only a genuine bool counts; anything else is "unknown".
            if isinstance(flag, bool):
                return flag
        return None
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("_pr_mergeable check failed for %s PR #%s: %s", repo, index, e)
        return None
|
||
|
||
|
||
def _classify_merge_response(repo: str, branch: str, index, status_code: int) -> str:
    """Sort a non-2xx ``POST /pulls/{index}/merge`` answer into retry / give-up (ORCH-093 D2).

    Args:
        repo: repository name (used for the mergeability lookup and logging).
        branch: branch name (logging only).
        index: PR number the POST targeted.
        status_code: HTTP status returned by the merge POST.

    Returns:
        ``"transient"`` (retry within budget) or ``"terminal"`` (fast honest
        ``False``; the ORCH-071/081 HOLD backstop takes over):

        * ``405`` ("try again later"), ``408``, any ``5xx`` -> transient.
        * ``403`` (no rights), ``404`` (PR gone) -> terminal.
        * ``409`` / ``422`` (ambiguous) -> ask ``_pr_mergeable``:
          ``False`` -> terminal (real conflict, fast HOLD); ``True`` / ``None`` ->
          transient (default-policy fail-OPEN-in-retry: Gitea has not recomputed
          yet — the ORCH-063 case; the budget is finite, so a real conflict still
          HOLDs once it is spent).
        * anything else -> terminal (do not loop on unknowns).

    Never-raise: any error -> ``"transient"`` (conservative, within the bounded
    retry budget).
    """
    try:
        retryable = status_code in (405, 408) or 500 <= status_code <= 599
        if retryable:
            return "transient"
        if status_code not in (409, 422):
            # 403 / 404 and every unrecognised code end the attempt immediately.
            return "terminal"
        # Ambiguous 409 / 422: only an explicit mergeable == False is a real conflict.
        if _pr_mergeable(repo, index) is False:
            return "terminal"
        return "transient"
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning(
            "_classify_merge_response error for %s/%s PR #%s: %s (transient)",
            repo, branch, index, e,
        )
        return "transient"
|
||
|
||
|
||
def merge_pr(repo: str, branch: str) -> tuple[bool, str]:
    """Deterministically merge the open PR for ``branch`` via the Gitea PR-merge API.

    The self-hosting deterministic merge-actor (FR-1 / D3). NEVER pushes or
    force-pushes ``main`` (INV-4/AC-8) — the ONLY mutation is the Gitea
    ``POST /pulls/{index}/merge`` call, exactly what the LLM ``deployer`` used to do
    on non-self repos.

    Algorithm:
      1. ``pr_already_merged`` -> True -> no-op ``(True, "already-merged")`` (INV-5/AC-9).
      2. ``GET /repos/{owner}/{repo}/pulls?state=open`` -> the open PR whose head ref
         == ``branch`` AND base ref == ``main`` -> its index. ORCH-073 ADR-001 Р-3
         (FR-3) adds the ``base == main`` filter so the actor merges exactly the
         feature code-PR and never an auto docs-PR / a PR onto a foreign base. An
         entry whose ``head`` or ``base`` is null/absent is skipped instead of
         aborting the search. No such open PR -> ``(False, "no open PR")``.
      3. ``POST /repos/{owner}/{repo}/pulls/{index}/merge`` (Do: ``merge``) in a
         bounded retry-loop (ORCH-093 D1): ``200/201`` -> ``(True, "merged PR #<n>")``;
         a TRANSIENT outcome (405/408/5xx/network/timeout, or 409|422 while still
         mergeable) is retried with exponential backoff up to
         ``merge_retry_max_attempts``; a TERMINAL outcome (403/404/real conflict) ->
         immediate ``(False, "merge failed: HTTP <code>")``; exhausting the budget on
         a transient -> ``(False, "merge failed after <N> attempts: HTTP <code>")``.
         The kill-switch ``merge_retry_enabled=False`` forces exactly one POST
         (the prior one-shot behaviour). Only the mutating POST is retried — the
         idempotent steps above are not.

    Args:
        repo: repository name under ``settings.gitea_owner``.
        branch: head branch whose code-PR onto ``main`` should be merged.

    Returns:
        ``(merged, reason)`` — ``merged`` is True only for "already-merged" or a
        ``200/201`` merge response.

    Never-raise (INV-1/AC-9 / TC-09): any HTTP/parse error -> ``(False, reason)``.
    """
    try:
        if pr_already_merged(repo, branch):
            logger.info("merge_pr: %s/%s already merged -> no-op", repo, branch)
            return True, "already-merged"

        import httpx

        owner = settings.gitea_owner
        headers = {"Authorization": f"token {settings.gitea_token}"}
        base = f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}"
        timeout = settings.merge_pr_timeout_s

        # NOTE(review): a single list request is made; presumably fine while the
        # number of open PRs fits one Gitea page — confirm against the API paging.
        resp = httpx.get(
            f"{base}/pulls", params={"state": "open"}, headers=headers, timeout=timeout
        )
        if resp.status_code != 200:
            return False, f"list PRs failed: HTTP {resp.status_code}"

        index = None
        for pr in resp.json() or []:
            # ``dict.get(key, {})`` only covers a MISSING key; an explicit null
            # would raise AttributeError and abort the whole search, so fall back
            # with ``or`` and simply skip such an entry.
            head_ref = (pr.get("head") or {}).get("ref")
            base_ref = (pr.get("base") or {}).get("ref")
            if head_ref == branch and base_ref == "main":
                index = pr.get("number")
                break
        if index is None:
            return False, "no open PR"

        # ORCH-093 D1: retry ONLY the mutating POST on transient outcomes. The
        # kill-switch collapses the budget to one attempt = the prior one-shot path
        # (no branching of the loop body, ADR D1).
        n_eff = settings.merge_retry_max_attempts if settings.merge_retry_enabled else 1
        if n_eff < 1:
            n_eff = 1

        for attempt in range(1, n_eff + 1):
            try:
                m = httpx.post(
                    f"{base}/pulls/{index}/merge",
                    json={"Do": "merge"},
                    headers=headers,
                    timeout=timeout,
                )
            except (httpx.HTTPError, OSError) as e:
                # Network/timeout -> transient within the bounded budget (never-raise).
                logger.warning(
                    "merge_pr: attempt %s/%s network error for %s/%s PR #%s: %s (transient)",
                    attempt, n_eff, repo, branch, index, e,
                )
                if attempt < n_eff:
                    time.sleep(_merge_backoff(attempt))
                    continue
                return False, f"merge failed after {n_eff} attempts: network error"

            if m.status_code in (200, 201):
                logger.info(
                    "merge_pr: merged PR #%s for %s/%s (attempt %s/%s)",
                    index, repo, branch, attempt, n_eff,
                )
                return True, f"merged PR #{index}"

            # Response body excerpt, capped so the log line stays short.
            detail = (m.text or "").strip()[:200]
            cls = _classify_merge_response(repo, branch, index, m.status_code)
            if cls == "terminal":
                logger.warning(
                    "merge_pr: merge failed for %s/%s PR #%s: HTTP %s %s (terminal)",
                    repo, branch, index, m.status_code, detail,
                )
                return False, f"merge failed: HTTP {m.status_code}"

            # Transient: log attempt i/N (check_ci_green idiom) and retry if budget left.
            logger.warning(
                "merge_pr: attempt %s/%s transient HTTP %s for %s/%s PR #%s %s",
                attempt, n_eff, m.status_code, repo, branch, index, detail,
            )
            if attempt < n_eff:
                time.sleep(_merge_backoff(attempt))
                continue
            return False, f"merge failed after {n_eff} attempts: HTTP {m.status_code}"

        # Unreachable (loop always returns), defensive only.
        return False, f"merge failed after {n_eff} attempts"
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("merge_pr unexpected error for %s/%s: %s", repo, branch, e)
        return False, f"merge error: {e}"
|
||
|
||
|
||
def verify_merged_to_main(repo: str, branch: str, sha: str) -> bool:
    """Return True iff the deployed commit is confirmed merged into ``origin/main``.

    Post-deploy verification — ORCH-073 ADR-001 Р-1 (FR-1): the merge is confirmed by
    the SINGLE, authoritative fact "the deployed commit IS an ancestor of the current
    ``origin/main``":

      * after ``git fetch origin main`` (in the per-branch worktree),
        ``git merge-base --is-ancestor <sha> origin/main`` returns ``rc == 0``.

    A non-zero exit of the fetch is logged (it used to be ignored silently); the
    ancestry check then runs against whatever ``origin/main`` the worktree already
    has. The verdict is unchanged: only ``rc == 0`` of ``merge-base`` yields True.

    The former OR-branch ``pr_already_merged(repo, branch)`` was REMOVED: a merged
    ``PR.merged == true`` is no longer sufficient to confirm a merge. That branch was
    the ORCH-067/069 phantom-merge root cause — an auto docs-PR (staging/deploy logs)
    counted as "merged" via the unreliable Gitea ``head`` query, turning merge-verify
    falsely GREEN while the code-PR was never merged. ``pr_already_merged`` now serves
    ONLY as an idempotency-guard inside ``merge_pr`` (Р-2/Р-3), never as proof of merge.

    Args:
        repo: repository name.
        branch: branch whose per-branch worktree is used to run git.
        sha: the validated commit (``image_freshness.validated_revision`` =
            worktree ``git rev-parse HEAD``). An empty ``sha`` is inconclusive ->
            ``False`` (fail-closed: alert + HOLD), since the SHA-in-main check
            cannot run without it.

    Returns:
        True only when ``sha`` is an ancestor of ``origin/main``; False otherwise.

    Never-raise (INV-1/AC-7 / TC-04): any git/HTTP error -> ``False`` (= "not
    confirmed" -> fail-closed for ``done``: alert + HOLD). The exception is NEVER
    propagated into ``advance_stage``.
    """
    try:
        if not sha:
            logger.warning(
                "verify_merged_to_main: empty SHA for %s/%s -> cannot confirm SHA-in-main",
                repo, branch,
            )
            return False
        try:
            wt = ensure_worktree(repo, branch)
        except Exception as e:  # noqa: BLE001 - never-raise contract
            logger.warning(
                "verify_merged_to_main: worktree error for %s/%s: %s", repo, branch, e
            )
            return False

        fetched = subprocess.run(
            ["git", "-C", wt, "fetch", "origin", "main"],
            capture_output=True, timeout=settings.merge_verify_timeout_s,
        )
        # Surface a failed fetch: the check below would otherwise run against a
        # stale origin/main without leaving any trace in the logs.
        fetch_rc = getattr(fetched, "returncode", 0)
        if fetch_rc != 0:
            logger.warning(
                "verify_merged_to_main: git fetch origin main rc=%s for %s/%s "
                "-> checking against possibly stale origin/main",
                fetch_rc, repo, branch,
            )

        r = subprocess.run(
            ["git", "-C", wt, "merge-base", "--is-ancestor", sha, "origin/main"],
            capture_output=True, timeout=settings.merge_verify_timeout_s,
        )
        return r.returncode == 0
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning(
            "verify_merged_to_main unexpected error for %s/%s: %s", repo, branch, e
        )
        return False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# ORCH-073 (ADR-001 Р-4): main-integrity regression guard.
|
||
#
|
||
# A secondary, deterministic (no-LLM) guard that runs in `_handle_merge_verify`
|
||
# AFTER the SHA-in-main check (verify_merged_to_main, FR-1) confirms the deployed
|
||
# commit, and BEFORE the task is stamped `done`. It checks that a DECLARATIVE set
|
||
# of markers for recently-merged tasks is still present in `origin/main` — i.e. a
|
||
# CHANGELOG-rebase / phantom-merge did not silently roll back a neighbouring task's
|
||
# code (the ORCH-067/069 failure mode, which SHA-in-main alone would not catch when
|
||
# the deployed SHA itself IS in main but a sibling's code is gone).
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Append-only table of ``(task, marker_substring, path)`` rows (ADR-001 Р-4).
# Every future task that lands significant code SHOULD add its own row, so the
# guard also shields it from a later phantom-merge / rebase rollback. The table
# lives in code (not DB / Plane — a non-goal) so that it is versioned together
# with the fix it protects.
MAIN_REGRESSION_MARKERS: list[tuple[str, str, str]] = [
    ("ORCH-067", "plane_issue_link", "src/notifications.py"),
    ("ORCH-069", "qg0_title_max", "src/config.py"),
    ("ORCH-071", "verify_merged_to_main", "src/merge_gate.py"),
    ("ORCH-073", "check_main_regression", "src/merge_gate.py"),
    ("ORCH-082", "ensure_open_pr", "src/merge_gate.py"),
    ("ORCH-093", "_classify_merge_response", "src/merge_gate.py"),
]
|
||
|
||
|
||
def check_main_regression(repo: str, branch: str) -> tuple[bool, str]:
    """Verify the declarative marker set is still present in ``origin/main``.

    ORCH-073 ADR-001 Р-4 (FR-5). For each ``(task, marker, path)`` in
    ``MAIN_REGRESSION_MARKERS`` run ``git grep -c <marker> origin/main -- <path>`` in
    the per-branch worktree (after ``git fetch origin main``). A DETERMINISTIC count
    of ``0`` for any marker means a neighbouring task's code was rolled back ->
    regression.

    A non-zero exit of the fetch is logged (it used to be ignored silently); the
    grep still runs against the ``origin/main`` the worktree already has, so the
    verdict logic is unchanged.

    Args:
        repo: repository name.
        branch: branch whose per-branch worktree is used to run git.

    Returns:
        ``(ok, reason)``:

        * ``(True, "markers intact (<n>)")`` — every marker present -> proceed.
        * ``(False, "main regressed: <task> ...")`` — a marker is deterministically
          absent (count==0) -> caller HOLDs the task (NOT done) + alerts.

    **Fail-OPEN on infra error** (intentional trade-off, ADR-001 Р-4): any git/OS
    error on the grep itself -> ``(True, "guard inconclusive: <reason>")`` so a flaky
    git never produces a false HOLD. "Regressed" is asserted ONLY on a deterministic
    ``count == 0``, never on "could not determine". The PRIMARY fail-closed gate is
    SHA-in-main (FR-1); this marker-grep is a secondary, best-effort guard.

    Never raises (INV-1): any unexpected error -> ``(True, "guard error: ...")``.
    """
    try:
        try:
            wt = ensure_worktree(repo, branch)
        except Exception as e:  # noqa: BLE001 - never-raise contract -> fail-open
            logger.warning(
                "check_main_regression: worktree error for %s/%s: %s (fail-open)",
                repo, branch, e,
            )
            return True, f"guard inconclusive: worktree error: {e}"

        try:
            fetched = subprocess.run(
                ["git", "-C", wt, "fetch", "origin", "main"],
                capture_output=True, timeout=settings.merge_verify_timeout_s,
            )
        except (subprocess.SubprocessError, OSError) as e:
            logger.warning(
                "check_main_regression: fetch error for %s/%s: %s (fail-open)",
                repo, branch, e,
            )
            return True, f"guard inconclusive: fetch error: {e}"

        # Surface a failed fetch so a grep against a stale origin/main is at least
        # visible in the logs; the verdict below is deliberately left untouched.
        fetch_rc = getattr(fetched, "returncode", 0)
        if fetch_rc != 0:
            logger.warning(
                "check_main_regression: git fetch origin main rc=%s for %s/%s "
                "-> grepping possibly stale origin/main",
                fetch_rc, repo, branch,
            )

        for task, marker, path in MAIN_REGRESSION_MARKERS:
            try:
                r = subprocess.run(
                    ["git", "-C", wt, "grep", "-c", marker, "origin/main", "--", path],
                    capture_output=True, text=True, timeout=_SHORT_TIMEOUT,
                )
            except (subprocess.SubprocessError, OSError) as e:
                # Infra error on this marker -> fail-open (do NOT assert regression).
                logger.warning(
                    "check_main_regression: grep error for %s (%s @ %s): %s (fail-open)",
                    task, marker, path, e,
                )
                return True, f"guard inconclusive: grep error for {task}: {e}"

            # git grep exit codes: 0 = match(es) found, 1 = no match, >1 = real error.
            if r.returncode == 0:
                continue
            if r.returncode == 1:
                # Deterministic absence -> regression of a neighbouring task's code.
                logger.warning(
                    "check_main_regression: marker MISSING in origin/main for %s "
                    "(%s @ %s) -> main regressed", task, marker, path,
                )
                return False, f"main regressed: {task} code missing ({marker} @ {path})"

            # rc > 1 -> git itself failed -> inconclusive -> fail-open.
            logger.warning(
                "check_main_regression: ambiguous git grep rc=%s for %s (%s @ %s) "
                "(fail-open)", r.returncode, task, marker, path,
            )
            return True, f"guard inconclusive: git grep rc={r.returncode} for {task}"

        return True, f"markers intact ({len(MAIN_REGRESSION_MARKERS)})"
    except Exception as e:  # noqa: BLE001 - never-raise contract -> fail-open
        logger.warning(
            "check_main_regression unexpected error for %s/%s: %s (fail-open)",
            repo, branch, e,
        )
        return True, f"guard error: {e}"
|