feat(merge-gate): auto-rebase onto current main + re-test + serialise merges
Deterministic (no-LLM) sub-gate on the deploy-staging -> deploy edge that catches a feature branch up to the CURRENT origin/main, re-tests the combined tree, and serialises merges with a per-repo file lease — so two green parallel branches can no longer break main (self-hosting safety for the orchestrator repo). - src/merge_gate.py: branch_is_behind_main, auto_rebase_onto_main (push --force-with-lease ONLY the task branch, NEVER main), retest_branch, and a file merge-lease (atomic O_CREAT|O_EXCL, holder-aware release, stale reclaim). Strict never-raise contract; all git ops in the per-branch worktree. - src/qg/checks.py: check_branch_mergeable composes the primitives under the lease; registered in QG_CHECKS. Conditional rollout (merge_gate_enabled / merge_gate_repos, default self-hosting only). - src/stage_engine.py: sub-gate hook on deploy-staging (not a new stage). PASS -> advance; "merge-lock busy" -> DEFER (re-queue with available_at, anti-deadlock at max_concurrency=1, capped); conflict/red re-test -> rollback to development + developer retry (capped by MAX_DEVELOPER_RETRIES). Lease released on deploy->done / rollback / PR-merged webhook. - src/db.py: enqueue_job(available_at_delay_s=...) for the defer (no schema change). - src/webhooks/gitea.py: holder-aware lease release on PR-merged. - src/config.py + .env.example: ORCH_MERGE_* settings. Docs: README + adr-0006 (architect) already cover the design; CHANGELOG updated. Tests: test_merge_gate.py, test_qg_merge_gate.py, test_merge_gate_race.py, test_stage_engine.py::TestMergeGate, test_config.py, QG-registry snapshot. Full suite: 535 passed. Refs: ORCH-043 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -130,6 +130,28 @@ class Settings(BaseSettings):
|
||||
ci_poll_max_attempts: int = 12
|
||||
ci_poll_interval_s: int = 10
|
||||
|
||||
# ORCH-043: merge-gate (auto-rebase + re-test + merge-lock) on the
|
||||
# deploy-staging -> deploy edge. A deterministic sub-gate (no LLM) that
|
||||
# catches the up-to-date branch up to the CURRENT origin/main, re-tests it,
|
||||
# and serialises merges so two green branches can't break main.
|
||||
# merge_gate_enabled -> global kill-switch; False -> no-op pass for the
|
||||
# whole gate (staged rollout, env ORCH_MERGE_GATE_ENABLED).
|
||||
# merge_gate_repos -> CSV of repos where the gate is REAL; empty means
|
||||
# only the self-hosting repo (orchestrator). Other
|
||||
# repos -> conditional no-op (mirrors ORCH-35 staging).
|
||||
# merge_retest_timeout_s -> wall-clock budget for the post-rebase re-test.
|
||||
# merge_retest_target -> pytest target for the re-test (portability across repos).
|
||||
# merge_lock_timeout_s -> max lease age; an older lease is reclaimed (crash backstop).
|
||||
# merge_defer_delay_s -> delay before re-running the gate when the lock is busy.
|
||||
# merge_defer_max_attempts -> defer retries before escalation (avoids livelock).
|
||||
merge_gate_enabled: bool = True
|
||||
merge_gate_repos: str = ""
|
||||
merge_retest_timeout_s: int = 600
|
||||
merge_retest_target: str = "tests/"
|
||||
merge_lock_timeout_s: int = 300
|
||||
merge_defer_delay_s: int = 60
|
||||
merge_defer_max_attempts: int = 5
|
||||
|
||||
# Telegram notifications
|
||||
telegram_bot_token: str = ""
|
||||
telegram_chat_id: str = ""
|
||||
|
||||
25
src/db.py
25
src/db.py
@@ -324,19 +324,34 @@ def enqueue_job(
|
||||
task_content: str | None = None,
|
||||
task_id: int | None = None,
|
||||
max_attempts: int = 2,
|
||||
available_at_delay_s: int | None = None,
|
||||
) -> int:
|
||||
"""Enqueue a new job (status='queued'). Returns the new job id.
|
||||
|
||||
This is what webhook handlers call instead of launching an agent in-process:
|
||||
it is a fast DB INSERT that returns immediately. The background worker
|
||||
(queue_worker) picks the job up later.
|
||||
|
||||
ORCH-043 (merge-gate defer): when ``available_at_delay_s`` is given the job's
|
||||
``available_at`` is set to ``now + delay`` so claim_next_job won't pick it up
|
||||
until the delay elapses (re-uses the existing ORCH-1 backoff gate). Used to
|
||||
re-queue the staging-deployer after a "merge-lock busy" defer without burning a
|
||||
worker slot in a blocking wait.
|
||||
"""
|
||||
conn = get_db()
|
||||
cursor = conn.execute(
|
||||
"INSERT INTO jobs (agent, repo, task_id, task_content, max_attempts) "
|
||||
"VALUES (?, ?, ?, ?, ?)",
|
||||
(agent, repo, task_id, task_content, max_attempts),
|
||||
)
|
||||
if available_at_delay_s is not None:
|
||||
cursor = conn.execute(
|
||||
"INSERT INTO jobs (agent, repo, task_id, task_content, max_attempts, available_at) "
|
||||
"VALUES (?, ?, ?, ?, ?, datetime('now', ?))",
|
||||
(agent, repo, task_id, task_content, max_attempts,
|
||||
f"+{int(available_at_delay_s)} seconds"),
|
||||
)
|
||||
else:
|
||||
cursor = conn.execute(
|
||||
"INSERT INTO jobs (agent, repo, task_id, task_content, max_attempts) "
|
||||
"VALUES (?, ?, ?, ?, ?)",
|
||||
(agent, repo, task_id, task_content, max_attempts),
|
||||
)
|
||||
job_id = cursor.lastrowid
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
340
src/merge_gate.py
Normal file
340
src/merge_gate.py
Normal file
@@ -0,0 +1,340 @@
|
||||
"""Merge-gate core (ORCH-043): catch a branch up to the CURRENT origin/main,
|
||||
re-test it, and serialise merges with a file lease.
|
||||
|
||||
Background
|
||||
----------
|
||||
The pipeline validates a branch against the ``main`` it was BRANCHED from, not the
|
||||
``main`` at the moment of merge. Between "branch validated" and "branch merged" a
|
||||
parallel task may have advanced ``main`` -> a *semantic* merge conflict: git merges
|
||||
with no textual conflict, yet the combined ``main`` is broken. For the self-hosting
|
||||
``orchestrator`` repo that means a red ``main`` of the tool serving every project.
|
||||
|
||||
This module provides the deterministic (no-LLM) primitives the quality-gate
|
||||
``check_branch_mergeable`` (src/qg/checks.py) composes on the
|
||||
``deploy-staging -> deploy`` edge, BEFORE the deployer merges the PR:
|
||||
|
||||
* ``branch_is_behind_main`` -> is the branch missing the latest origin/main?
|
||||
* ``auto_rebase_onto_main`` -> rebase onto origin/main + push --force-with-lease
|
||||
(ONLY the task branch; NEVER main).
|
||||
* ``retest_branch`` -> run the project test-suite in the caught-up worktree.
|
||||
* file lease (``acquire_merge_lease`` / ``release_merge_lease``) -> serialise the
|
||||
"catch-up + re-test + merge" of ONE repo, held from the gate to the actual merge.
|
||||
|
||||
Invariants (self-hosting safety, ТЗ §10):
|
||||
* NEVER push or force-push ``main`` — the only force op is ``--force-with-lease``
|
||||
on the task branch.
|
||||
* All git ops run in the per-branch worktree (ensure_worktree), never the shared clone.
|
||||
* Every public function honours a strict **never-raise** contract: any git/OS error
|
||||
-> ``(False, "<reason>")`` (or a safe bool), never a propagated exception.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
from .config import settings
|
||||
from .git_worktree import ensure_worktree, get_worktree_path
|
||||
|
||||
logger = logging.getLogger("orchestrator.merge_gate")
|
||||
|
||||
# git sub-command timeouts (seconds). Generous but bounded so a hung git never
|
||||
# wedges the monitor-thread that runs the gate.
|
||||
_FETCH_TIMEOUT = 60
|
||||
_REBASE_TIMEOUT = 120
|
||||
_PUSH_TIMEOUT = 60
|
||||
_SHORT_TIMEOUT = 30
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# behind / ancestor detection
|
||||
# ---------------------------------------------------------------------------
|
||||
def branch_is_behind_main(repo: str, branch: str) -> bool:
|
||||
"""Return True iff ``branch`` does NOT already contain the latest origin/main.
|
||||
|
||||
A branch is "behind" when ``origin/main`` is **not** an ancestor of the branch
|
||||
HEAD (``git merge-base --is-ancestor origin/main HEAD`` returns non-zero). All
|
||||
work happens in the per-branch worktree (ORCH-2 / S-4 isolation).
|
||||
|
||||
Never-raise (AC-9 / TC-03): any git/OS failure or an ambiguous result is treated
|
||||
as "cannot prove the branch is up-to-date" -> return True (force a rebase attempt
|
||||
rather than merge blindly). It returns a bool, never raises.
|
||||
"""
|
||||
try:
|
||||
wt = ensure_worktree(repo, branch)
|
||||
except Exception as e: # noqa: BLE001 - never-raise contract
|
||||
logger.warning("branch_is_behind_main: worktree error for %s/%s: %s", repo, branch, e)
|
||||
return True
|
||||
|
||||
try:
|
||||
subprocess.run(
|
||||
["git", "-C", wt, "fetch", "origin", "main"],
|
||||
capture_output=True, timeout=_FETCH_TIMEOUT,
|
||||
)
|
||||
r = subprocess.run(
|
||||
["git", "-C", wt, "merge-base", "--is-ancestor", "origin/main", "HEAD"],
|
||||
capture_output=True, timeout=_SHORT_TIMEOUT,
|
||||
)
|
||||
except (subprocess.SubprocessError, OSError) as e:
|
||||
logger.warning("branch_is_behind_main: git error for %s/%s: %s", repo, branch, e)
|
||||
return True
|
||||
|
||||
if r.returncode == 0:
|
||||
# origin/main IS an ancestor of HEAD -> branch already up-to-date.
|
||||
return False
|
||||
if r.returncode == 1:
|
||||
# origin/main is NOT an ancestor -> branch is behind.
|
||||
return True
|
||||
# Any other code (e.g. bad ref) -> ambiguous; do not merge blindly.
|
||||
logger.warning(
|
||||
"branch_is_behind_main: ambiguous merge-base rc=%s for %s/%s (treating as behind)",
|
||||
r.returncode, repo, branch,
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
def _conflicted_files(wt: str) -> str:
|
||||
"""Best-effort list of unmerged (conflicting) files in the worktree."""
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["git", "-C", wt, "diff", "--name-only", "--diff-filter=U"],
|
||||
capture_output=True, text=True, timeout=_SHORT_TIMEOUT,
|
||||
)
|
||||
files = r.stdout.strip().replace("\n", ", ")
|
||||
return files or "unknown"
|
||||
except (subprocess.SubprocessError, OSError):
|
||||
return "unknown"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# auto-rebase onto origin/main
|
||||
# ---------------------------------------------------------------------------
|
||||
def auto_rebase_onto_main(repo: str, branch: str) -> tuple[bool, str]:
|
||||
"""Catch ``branch`` up to ``origin/main`` via rebase, then push it.
|
||||
|
||||
Steps (all in the per-branch worktree):
|
||||
1. ``git fetch origin main``.
|
||||
2. ``git rebase origin/main``:
|
||||
- textual conflict (non-zero) -> ``git rebase --abort`` (leave worktree
|
||||
clean) -> ``(False, "rebase conflict: <files>")`` (AC-3).
|
||||
3. clean rebase -> ``git push --force-with-lease origin <branch>`` — ONLY the
|
||||
task branch, NEVER ``main`` (AC-7) -> ``(True, "rebased onto origin/main")``.
|
||||
|
||||
Never-raise (AC-9): any git/OS error -> ``(False, "<reason>")``.
|
||||
"""
|
||||
try:
|
||||
wt = ensure_worktree(repo, branch)
|
||||
except Exception as e: # noqa: BLE001 - never-raise contract
|
||||
return False, f"rebase setup error: {e}"
|
||||
|
||||
try:
|
||||
subprocess.run(
|
||||
["git", "-C", wt, "fetch", "origin", "main"],
|
||||
capture_output=True, timeout=_FETCH_TIMEOUT,
|
||||
)
|
||||
r = subprocess.run(
|
||||
["git", "-C", wt, "rebase", "origin/main"],
|
||||
capture_output=True, text=True, timeout=_REBASE_TIMEOUT,
|
||||
)
|
||||
if r.returncode != 0:
|
||||
files = _conflicted_files(wt)
|
||||
subprocess.run(
|
||||
["git", "-C", wt, "rebase", "--abort"],
|
||||
capture_output=True, timeout=_SHORT_TIMEOUT,
|
||||
)
|
||||
logger.warning("auto_rebase: conflict on %s/%s: %s", repo, branch, files)
|
||||
return False, f"rebase conflict: {files}"
|
||||
|
||||
# Clean rebase -> push ONLY the task branch with a lease (never main).
|
||||
p = subprocess.run(
|
||||
["git", "-C", wt, "push", "--force-with-lease", "origin", branch],
|
||||
capture_output=True, text=True, timeout=_PUSH_TIMEOUT,
|
||||
)
|
||||
if p.returncode != 0:
|
||||
detail = (p.stderr or p.stdout or "").strip()[:200]
|
||||
logger.warning("auto_rebase: push failed on %s/%s: %s", repo, branch, detail)
|
||||
return False, f"push --force-with-lease failed: {detail}"
|
||||
|
||||
logger.info("auto_rebase: %s/%s rebased onto origin/main and pushed", repo, branch)
|
||||
return True, "rebased onto origin/main"
|
||||
except subprocess.TimeoutExpired:
|
||||
# Leave no half-finished rebase behind.
|
||||
try:
|
||||
subprocess.run(
|
||||
["git", "-C", wt, "rebase", "--abort"],
|
||||
capture_output=True, timeout=_SHORT_TIMEOUT,
|
||||
)
|
||||
except (subprocess.SubprocessError, OSError):
|
||||
pass
|
||||
return False, "rebase timeout"
|
||||
except (subprocess.SubprocessError, OSError) as e:
|
||||
return False, f"rebase error: {e}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# re-test in the caught-up worktree
|
||||
# ---------------------------------------------------------------------------
|
||||
def retest_branch(repo: str, branch: str) -> tuple[bool, str]:
|
||||
"""Run the project test-suite in the (already caught-up) branch worktree.
|
||||
|
||||
Command: ``python -m pytest <merge_retest_target>`` (default ``tests/``),
|
||||
matching the orchestrator CI / check_tests_local pattern. Bounded by
|
||||
``settings.merge_retest_timeout_s``.
|
||||
|
||||
Returns:
|
||||
* ``(True, "re-test green")`` — pytest rc == 0
|
||||
* ``(False, "re-test timeout after <T>s")`` — exceeded the timeout (AC-6)
|
||||
* ``(False, "re-test failed: ...<tail>")`` — non-zero rc, with output tail
|
||||
Never-raise (AC-9): any setup/OS error -> ``(False, "<reason>")``.
|
||||
"""
|
||||
wt = get_worktree_path(repo, branch)
|
||||
if not os.path.isdir(wt):
|
||||
# Caller usually rebased first (worktree exists); ensure as a fallback.
|
||||
try:
|
||||
wt = ensure_worktree(repo, branch)
|
||||
except Exception as e: # noqa: BLE001 - never-raise contract
|
||||
return False, f"re-test setup error: {e}"
|
||||
|
||||
target = settings.merge_retest_target or "tests/"
|
||||
timeout = settings.merge_retest_timeout_s
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["python", "-m", "pytest", target, "-q"],
|
||||
cwd=wt, capture_output=True, text=True, timeout=timeout,
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.warning("retest_branch: timeout (%ss) on %s/%s", timeout, repo, branch)
|
||||
return False, f"re-test timeout after {timeout}s"
|
||||
except (subprocess.SubprocessError, OSError) as e:
|
||||
return False, f"re-test error: {e}"
|
||||
|
||||
if r.returncode == 0:
|
||||
return True, "re-test green"
|
||||
tail = ((r.stdout or "") + (r.stderr or ""))[-500:]
|
||||
logger.warning("retest_branch: red on %s/%s", repo, branch)
|
||||
return False, f"re-test failed: ...{tail}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# merge-lease (serialise catch-up + re-test + merge per repo)
|
||||
# ---------------------------------------------------------------------------
|
||||
def _lease_path(repo: str) -> str:
|
||||
"""Filesystem path of the per-repo merge lease (no schema change, ТЗ §4)."""
|
||||
return os.path.join(settings.repos_dir, f".merge-lease-{repo}.json")
|
||||
|
||||
|
||||
def _read_lease(path: str) -> dict | None:
|
||||
"""Read+parse the lease file; None if missing or corrupt (never-raise)."""
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
return json.loads(f.read())
|
||||
except FileNotFoundError:
|
||||
return None
|
||||
except (OSError, ValueError) as e:
|
||||
logger.warning("merge-lease read error at %s: %s", path, e)
|
||||
return None
|
||||
|
||||
|
||||
def _write_lease(path: str, holder: dict) -> None:
|
||||
"""Atomically (O_CREAT|O_EXCL) write the lease; raises FileExistsError if held."""
|
||||
fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
|
||||
try:
|
||||
os.write(fd, json.dumps(holder).encode("utf-8"))
|
||||
finally:
|
||||
os.close(fd)
|
||||
|
||||
|
||||
def acquire_merge_lease(
|
||||
repo: str, branch: str, work_item_id: str | None = None, task_id: int | None = None
|
||||
) -> tuple[bool, str]:
|
||||
"""Try to acquire the per-repo merge lease. **Non-blocking** (anti-deadlock).
|
||||
|
||||
Holder identity is the task ``branch`` (stable, one branch per task). Outcomes:
|
||||
* no lease file -> acquire, write metadata -> ``(True, "lease acquired")``
|
||||
* lease held by self -> idempotent re-acquire (restart/retry) -> ``(True, "lease already held")``
|
||||
* lease held by other, age < merge_lock_timeout_s -> ``(False, "merge-lock busy")``
|
||||
* lease held by other, age >= merge_lock_timeout_s -> stale -> reclaim with a
|
||||
``logger.warning`` (the holder process died without releasing) -> ``(True, ...)``
|
||||
|
||||
Never-raise: any unexpected error -> ``(False, "merge-lock busy")`` so the caller
|
||||
DEFERS and retries rather than burning a developer retry on an infra hiccup.
|
||||
"""
|
||||
path = _lease_path(repo)
|
||||
holder = {
|
||||
"branch": branch,
|
||||
"work_item_id": work_item_id,
|
||||
"task_id": task_id,
|
||||
"acquired_at": time.time(),
|
||||
"pid": os.getpid(),
|
||||
}
|
||||
try:
|
||||
try:
|
||||
_write_lease(path, holder)
|
||||
logger.info("merge-lease acquired for %s by %s", repo, branch)
|
||||
return True, "lease acquired"
|
||||
except FileExistsError:
|
||||
pass
|
||||
|
||||
existing = _read_lease(path)
|
||||
if existing is None:
|
||||
# Corrupt/empty lease file — reclaim it.
|
||||
_force_write_lease(path, holder)
|
||||
logger.warning("merge-lease for %s was corrupt; reclaimed by %s", repo, branch)
|
||||
return True, "lease reclaimed (corrupt)"
|
||||
|
||||
if existing.get("branch") == branch:
|
||||
return True, "lease already held"
|
||||
|
||||
age = time.time() - float(existing.get("acquired_at") or 0)
|
||||
if age >= settings.merge_lock_timeout_s:
|
||||
_force_write_lease(path, holder)
|
||||
logger.warning(
|
||||
"merge-lease for %s was stale (age %.0fs >= %ss, holder=%s); reclaimed by %s",
|
||||
repo, age, settings.merge_lock_timeout_s, existing.get("branch"), branch,
|
||||
)
|
||||
return True, "lease reclaimed (stale)"
|
||||
|
||||
logger.info(
|
||||
"merge-lease for %s busy (held by %s, age %.0fs); %s defers",
|
||||
repo, existing.get("branch"), age, branch,
|
||||
)
|
||||
return False, "merge-lock busy"
|
||||
except Exception as e: # noqa: BLE001 - never-raise contract
|
||||
logger.warning("acquire_merge_lease unexpected error for %s/%s: %s", repo, branch, e)
|
||||
return False, "merge-lock busy"
|
||||
|
||||
|
||||
def _force_write_lease(path: str, holder: dict) -> None:
|
||||
"""Overwrite the lease (used for stale/corrupt reclaim). Best-effort."""
|
||||
try:
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
f.write(json.dumps(holder))
|
||||
except OSError as e:
|
||||
logger.warning("merge-lease force-write error at %s: %s", path, e)
|
||||
|
||||
|
||||
def release_merge_lease(repo: str, branch: str | None = None) -> None:
|
||||
"""Release the per-repo merge lease. **Idempotent** and **holder-aware**.
|
||||
|
||||
If ``branch`` is given, the lease is removed ONLY when the current holder's
|
||||
branch matches (so a delayed release from an already-merged task can never
|
||||
delete a lease a DIFFERENT task acquired afterwards). With ``branch=None`` the
|
||||
release is unconditional (best-effort backstop). Never raises.
|
||||
"""
|
||||
path = _lease_path(repo)
|
||||
try:
|
||||
if branch is not None:
|
||||
existing = _read_lease(path)
|
||||
if existing is not None and existing.get("branch") != branch:
|
||||
logger.info(
|
||||
"merge-lease release skipped for %s: holder=%s != %s",
|
||||
repo, existing.get("branch"), branch,
|
||||
)
|
||||
return
|
||||
os.remove(path)
|
||||
logger.info("merge-lease released for %s (%s)", repo, branch or "force")
|
||||
except FileNotFoundError:
|
||||
return
|
||||
except OSError as e:
|
||||
logger.warning("merge-lease release error for %s: %s", repo, e)
|
||||
@@ -621,6 +621,87 @@ def check_staging_status(repo: str, work_item_id: str, branch: str | None = None
|
||||
return False, "Staging log not found (15-staging-log.md)"
|
||||
|
||||
|
||||
def _merge_gate_applies(repo: str) -> bool:
|
||||
"""Whether the merge-gate is REAL for this repo (ORCH-043, conditional rollout).
|
||||
|
||||
Mirrors the ORCH-35 conditional staging-gate. ``merge_gate_repos`` is a CSV of
|
||||
repos where the gate is enforced; when empty the gate is real ONLY for the
|
||||
self-hosting repo (``orchestrator``). Other repos -> conditional no-op.
|
||||
"""
|
||||
raw = (settings.merge_gate_repos or "").strip()
|
||||
if raw:
|
||||
allowed = {r.strip().lower() for r in raw.split(",") if r.strip()}
|
||||
return (repo or "").strip().lower() in allowed
|
||||
return is_self_hosting_repo(repo)
|
||||
|
||||
|
||||
def check_branch_mergeable(repo: str, work_item_id: str, branch: str) -> tuple[bool, str]:
|
||||
"""ORCH-043 merge-gate: validate the branch against the CURRENT origin/main
|
||||
immediately before the deployer merges its PR (deploy-staging -> deploy edge).
|
||||
|
||||
Deterministic, no LLM. Algorithm (ADR-001 §4):
|
||||
1. Conditionality: merge_gate_enabled=False -> (True, "merge-gate disabled");
|
||||
repo where the gate is not real -> (True, "merge-gate N/A for <repo>").
|
||||
2. Acquire the per-repo merge lease (NON-blocking). Busy -> (False, "merge-lock
|
||||
busy") — a SIGNAL for the engine to DEFER (not a code fault, no rollback).
|
||||
3. Double-check "behind origin/main" UNDER the lease (main may have moved while
|
||||
we waited). Not behind -> (True, "branch up-to-date with main"); lease HELD.
|
||||
4. Behind -> auto_rebase_onto_main:
|
||||
- conflict -> release lease -> (False, "rebase conflict: ...")
|
||||
- clean -> retest_branch:
|
||||
green -> (True, "rebased onto main, re-test green"); lease HELD
|
||||
red/timeout -> release lease -> (False, "re-test ... after rebase")
|
||||
5. On SUCCESS the lease is HELD until the actual merge (released on PR-merged
|
||||
webhook / deploy->done / rollback). On any FAILURE the lease is released.
|
||||
|
||||
Never-raise (AC-9): any internal error -> (False, "<reason>") with the lease
|
||||
released; an exception never escapes into advance_stage.
|
||||
"""
|
||||
# Imported lazily so qg.checks stays importable without the merge_gate deps in
|
||||
# minimal/test contexts and to avoid an import cycle surprise.
|
||||
from .. import merge_gate
|
||||
|
||||
try:
|
||||
if not settings.merge_gate_enabled:
|
||||
return True, "merge-gate disabled"
|
||||
if not _merge_gate_applies(repo):
|
||||
return True, f"merge-gate N/A for {repo}"
|
||||
|
||||
acquired, reason = merge_gate.acquire_merge_lease(repo, branch, work_item_id)
|
||||
if not acquired:
|
||||
# "merge-lock busy" -> caller defers; lease NOT held by us, nothing to release.
|
||||
return False, reason
|
||||
|
||||
try:
|
||||
# Double-check under the lease: another task may have just merged.
|
||||
if not merge_gate.branch_is_behind_main(repo, branch):
|
||||
logger.info("check_branch_mergeable: %s up-to-date with main", branch)
|
||||
return True, "branch up-to-date with main"
|
||||
|
||||
ok, rb_reason = merge_gate.auto_rebase_onto_main(repo, branch)
|
||||
if not ok:
|
||||
merge_gate.release_merge_lease(repo, branch)
|
||||
return False, rb_reason # "rebase conflict: ..."
|
||||
|
||||
ok_t, t_reason = merge_gate.retest_branch(repo, branch)
|
||||
if ok_t:
|
||||
logger.info("check_branch_mergeable: %s rebased + re-test green", branch)
|
||||
return True, "rebased onto main, re-test green"
|
||||
|
||||
merge_gate.release_merge_lease(repo, branch)
|
||||
if "timeout" in t_reason:
|
||||
return False, t_reason # "re-test timeout after <T>s" (AC-6)
|
||||
tail = t_reason.removeprefix("re-test failed: ")
|
||||
return False, f"re-test failed after rebase: {tail}"
|
||||
except Exception as e: # noqa: BLE001 - never-raise; always release on error
|
||||
merge_gate.release_merge_lease(repo, branch)
|
||||
logger.error("check_branch_mergeable inner error for %s/%s: %s", repo, branch, e)
|
||||
return False, f"merge-gate error: {e}"
|
||||
except Exception as e: # noqa: BLE001 - outer never-raise guard
|
||||
logger.error("check_branch_mergeable error for %s/%s: %s", repo, branch, e)
|
||||
return False, f"merge-gate error: {e}"
|
||||
|
||||
|
||||
# Registry for dynamic lookup by name
|
||||
QG_CHECKS = {
|
||||
"check_analysis_approved": check_analysis_approved,
|
||||
@@ -633,4 +714,5 @@ QG_CHECKS = {
|
||||
"check_tests_local": check_tests_local,
|
||||
"check_deploy_status": check_deploy_status,
|
||||
"check_staging_status": check_staging_status,
|
||||
"check_branch_mergeable": check_branch_mergeable,
|
||||
}
|
||||
|
||||
@@ -34,6 +34,7 @@ from .stages import get_next_stage, get_qg_for_stage, get_agent_for_stage
|
||||
from .git_worktree import get_worktree_path
|
||||
from .review_parse import extract_review_findings, extract_test_failures
|
||||
from .qg.checks import QG_CHECKS
|
||||
from . import merge_gate
|
||||
from .notifications import (
|
||||
notify_stage_change,
|
||||
notify_qg_failure,
|
||||
@@ -239,6 +240,18 @@ def advance_stage(
|
||||
result.note = f"qg '{qg_name}' not in registry"
|
||||
return result
|
||||
|
||||
# --- ORCH-043 merge-gate sub-gate (deploy-staging -> deploy edge) -----
|
||||
# AFTER check_staging_status passed and BEFORE we advance to `deploy` /
|
||||
# launch the deployer that merges the PR. Not a STAGE_TRANSITIONS entry —
|
||||
# it is an edge sub-gate triggered by the same "staging-deployer finished"
|
||||
# event. If it intervenes (defer on busy-lock, or rollback on conflict /
|
||||
# red re-test) it owns the outcome and we return without advancing.
|
||||
if current_stage == "deploy-staging":
|
||||
if _handle_merge_gate(
|
||||
task_id, current_stage, repo, work_item_id, branch, agent, result
|
||||
):
|
||||
return result
|
||||
|
||||
# --- Advance ---------------------------------------------------------
|
||||
update_task_stage(task_id, next_stage)
|
||||
# Telegram live tracker: the analysis->architecture advance is the human
|
||||
@@ -274,6 +287,15 @@ def advance_stage(
|
||||
except Exception as e:
|
||||
logger.error(f"Task {task_id}: failed to set Plane Done: {e}")
|
||||
|
||||
# ORCH-043: the merge has landed (deploy->done). Release the merge lease as
|
||||
# a backstop in case the PR-merged webhook was lost (holder-aware no-op if a
|
||||
# different task already owns it). Never raises.
|
||||
if next_stage == "done":
|
||||
try:
|
||||
merge_gate.release_merge_lease(repo, branch)
|
||||
except Exception as e: # noqa: BLE001 - defensive
|
||||
logger.warning(f"Task {task_id}: merge-lease release on done failed: {e}")
|
||||
|
||||
# --- Launch the next agent (ORCH-4 fix: current_stage, not next) -----
|
||||
next_agent = get_agent_for_stage(current_stage)
|
||||
if next_agent:
|
||||
@@ -565,6 +587,12 @@ def _handle_qg_failure_rollbacks(
|
||||
notify_stage_change(task_id, current_stage, "development")
|
||||
plane_notify_stage(work_item_id, current_stage, "development")
|
||||
result.rolled_back_to = "development"
|
||||
# ORCH-043: deploy failed -> no merge will complete; release the lease so the
|
||||
# next task isn't blocked until the lease ages out (holder-aware no-op).
|
||||
try:
|
||||
merge_gate.release_merge_lease(repo, branch)
|
||||
except Exception as e: # noqa: BLE001 - defensive
|
||||
logger.warning(f"Task {task_id}: merge-lease release on deploy-fail failed: {e}")
|
||||
set_issue_blocked(work_item_id)
|
||||
notify_qg_failure(task_id, "deploy", "check_deploy_status", reason)
|
||||
plane_add_comment(
|
||||
@@ -582,3 +610,155 @@ def _handle_qg_failure_rollbacks(
|
||||
f"Task {task_id}: deployer verdict FAILED, rolled back deploy -> "
|
||||
f"development ({reason})"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ORCH-043: merge-gate sub-gate on the deploy-staging -> deploy edge
|
||||
# ---------------------------------------------------------------------------
|
||||
def _merge_defer_count(task_id: int) -> int:
|
||||
"""How many times this task has already been deferred by the merge-gate.
|
||||
|
||||
Counted from the persisted jobs queue (restart-safe) by the defer marker in
|
||||
task_content, so a service restart never resets the defer budget.
|
||||
"""
|
||||
conn = get_db()
|
||||
n = conn.execute(
|
||||
"SELECT COUNT(*) FROM jobs WHERE task_id=? AND task_content LIKE '%merge-gate defer%'",
|
||||
(task_id,),
|
||||
).fetchone()[0]
|
||||
conn.close()
|
||||
return n
|
||||
|
||||
|
||||
def _handle_merge_gate(
|
||||
task_id, current_stage, repo, work_item_id, branch, agent, result: AdvanceResult
|
||||
) -> bool:
|
||||
"""Run check_branch_mergeable on the deploy-staging -> deploy edge.
|
||||
|
||||
Returns True if the gate INTERVENED (the caller must return without advancing):
|
||||
* "merge-lock busy" -> DEFER (re-queue the staging-deployer with a
|
||||
delay; the task stays on deploy-staging). Code
|
||||
is fine, so NO rollback and no developer retry.
|
||||
* conflict / red re-test -> ROLLBACK to development (+ developer retry,
|
||||
capped by MAX_DEVELOPER_RETRIES).
|
||||
Returns False when the gate PASSED (branch up-to-date, or rebased + re-test green)
|
||||
so advance_stage proceeds to `deploy` and launches the deployer that merges. On a
|
||||
PASS the merge lease is HELD until the actual merge (released on PR-merged webhook
|
||||
/ deploy->done / rollback).
|
||||
"""
|
||||
passed, reason = _run_qg("check_branch_mergeable", repo, work_item_id, branch)
|
||||
if passed:
|
||||
logger.info(f"Task {task_id}: merge-gate passed ({reason})")
|
||||
return False
|
||||
|
||||
result.qg_name = "check_branch_mergeable"
|
||||
result.qg_passed = False
|
||||
result.qg_reason = reason
|
||||
|
||||
if reason == "merge-lock busy":
|
||||
_handle_merge_gate_defer(
|
||||
task_id, current_stage, repo, work_item_id, branch, result
|
||||
)
|
||||
return True
|
||||
|
||||
_handle_merge_gate_rollback(
|
||||
task_id, current_stage, repo, work_item_id, branch, reason, result
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
def _handle_merge_gate_defer(
|
||||
task_id, current_stage, repo, work_item_id, branch, result: AdvanceResult
|
||||
):
|
||||
"""merge-lock busy -> DEFER: re-queue the staging-deployer after a delay.
|
||||
|
||||
Non-blocking: the worker slot is freed (anti-deadlock at max_concurrency=1) so
|
||||
the lease HOLDER can finish merging. The task remains on deploy-staging; a later
|
||||
staging-deployer run re-evaluates the gate. Bounded by merge_defer_max_attempts.
|
||||
"""
|
||||
defers = _merge_defer_count(task_id)
|
||||
if defers < settings.merge_defer_max_attempts:
|
||||
task_desc = (
|
||||
f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
|
||||
f"Stage: deploy-staging\nNote: merge-gate defer "
|
||||
f"(attempt {defers + 1}/{settings.merge_defer_max_attempts}) — "
|
||||
f"merge-lock busy, retrying after {settings.merge_defer_delay_s}s."
|
||||
)
|
||||
new_job = enqueue_job(
|
||||
"deployer", repo, task_desc, task_id=task_id,
|
||||
available_at_delay_s=settings.merge_defer_delay_s,
|
||||
)
|
||||
result.enqueued_agent = "deployer"
|
||||
result.enqueued_job_id = new_job
|
||||
result.note = "merge-gate-deferred"
|
||||
logger.info(
|
||||
f"Task {task_id}: merge-lock busy, deferred deployer "
|
||||
f"(job_id={new_job}, attempt {defers + 1}/{settings.merge_defer_max_attempts})"
|
||||
)
|
||||
else:
|
||||
set_issue_blocked(work_item_id)
|
||||
send_telegram(
|
||||
f"\U0001f6a8 {work_item_id}: merge-gate defer limit "
|
||||
f"({settings.merge_defer_max_attempts}) reached (merge-lock busy). "
|
||||
f"Manual intervention needed."
|
||||
)
|
||||
result.alerted = True
|
||||
result.note = "merge-gate-defer-exhausted"
|
||||
logger.error(
|
||||
f"Task {task_id}: merge-gate defer attempts exhausted "
|
||||
f"({settings.merge_defer_max_attempts})"
|
||||
)
|
||||
|
||||
|
||||
def _handle_merge_gate_rollback(
|
||||
task_id, current_stage, repo, work_item_id, branch, reason, result: AdvanceResult
|
||||
):
|
||||
"""Rebase conflict / red re-test -> ROLLBACK to development + developer retry.
|
||||
|
||||
Mirrors the staging/deploy rollback pattern but is capped by
|
||||
MAX_DEVELOPER_RETRIES (AC-11 / TC-22: no infinite bounce). The merge lease was
|
||||
already released by check_branch_mergeable on failure; a defensive holder-aware
|
||||
release here is a harmless no-op.
|
||||
"""
|
||||
update_task_stage(task_id, "development")
|
||||
notify_stage_change(task_id, current_stage, "development")
|
||||
plane_notify_stage(work_item_id, current_stage, "development")
|
||||
result.rolled_back_to = "development"
|
||||
set_issue_in_progress(work_item_id)
|
||||
try:
|
||||
merge_gate.release_merge_lease(repo, branch)
|
||||
except Exception as e: # noqa: BLE001 - defensive
|
||||
logger.warning(f"Task {task_id}: merge-lease release on rollback failed: {e}")
|
||||
notify_qg_failure(task_id, current_stage, "check_branch_mergeable", reason)
|
||||
plane_add_comment(
|
||||
work_item_id,
|
||||
f"❌ Merge-gate FAILED ({reason}). Rolled back to development. "
|
||||
f"Developer нужен для фикса.",
|
||||
author="deployer",
|
||||
)
|
||||
retry_count = _developer_retry_count(task_id)
|
||||
if retry_count < MAX_DEVELOPER_RETRIES:
|
||||
task_desc = (
|
||||
f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
|
||||
f"Stage: development\nNote: Merge-gate failed "
|
||||
f"(attempt {retry_count + 1}/{MAX_DEVELOPER_RETRIES}). "
|
||||
f"Причина: {reason}."
|
||||
)
|
||||
new_job = enqueue_job("developer", repo, task_desc, task_id=task_id)
|
||||
result.enqueued_agent = "developer"
|
||||
result.enqueued_job_id = new_job
|
||||
logger.info(
|
||||
f"Task {task_id}: merge-gate FAILED, enqueued developer (job_id={new_job})"
|
||||
)
|
||||
else:
|
||||
set_issue_blocked(work_item_id)
|
||||
send_telegram(
|
||||
f"\U0001f6a8 {work_item_id}: Merge-gate still failing after "
|
||||
f"{MAX_DEVELOPER_RETRIES} developer retries ({reason}). "
|
||||
f"Manual intervention needed."
|
||||
)
|
||||
result.alerted = True
|
||||
logger.error(
|
||||
f"Task {task_id}: merge-gate FAILED, rolled back deploy-staging -> "
|
||||
f"development ({reason})"
|
||||
)
|
||||
|
||||
@@ -334,6 +334,15 @@ async def handle_pr(payload: dict):
|
||||
logger.error(f"Task {task_id}: max retries reached, needs manual intervention")
|
||||
|
||||
elif action == "closed" and pr.get("merged", False):
|
||||
# ORCH-043: the branch's PR just merged into main -> release the per-repo
|
||||
# merge lease this task held from the merge-gate (holder-aware by branch, so
|
||||
# it can't clobber a lease another task acquired afterwards). Never raises.
|
||||
try:
|
||||
from ..merge_gate import release_merge_lease
|
||||
release_merge_lease(repo_name, head_branch)
|
||||
except Exception as e: # noqa: BLE001 - defensive, never block the webhook
|
||||
logger.warning(f"Task {task_id}: merge-lease release on PR-merge failed: {e}")
|
||||
|
||||
# BUG 8 (second door): at the deploy stage `done` is gated by the
|
||||
# deployer's verdict (check_deploy_status via advance_stage), NOT by the
|
||||
# fact that the PR was merged. The deployer merges the PR at the START of
|
||||
|
||||
Reference in New Issue
Block a user