Files
orchestrator/src/serial_gate.py
claude-bot ee4773f5b0 feat(serial-gate): per-repo serial gate + deferred branch cut + rollback-freeze (ORCH-088)
Этап 1 (serial e2e) пакетного автономного режима. Новая задача репо не входит
в analysis (analyst-job не выбирается, ветка не режется), пока в репо есть более
ранняя незавершённая задача (FIFO, t2.id < jobs.task_id) ИЛИ репо заморожен.

- src/serial_gate.py — новый leaf (never-raise): build_claim_clause (fail-OPEN),
  is_repo_frozen (fail-CLOSED), set/clear_repo_freeze, serial_gate_applies, snapshot.
- src/db.py — идемпотентная миграция repo_freeze + serial_gate-фрагмент в claim_next_job.
- src/webhooks/plane.py + src/agents/launcher.py — отложенный срез ветки: start_pipeline
  не создаёт Gitea-ветку/docs для применимого репо; релокация в _materialize_deferred_branch
  на момент claim analyst-job (база = свежий origin/main с кодом предшественника, AC-6).
- src/stage_engine.py — post-deploy DEGRADED → durable per-repo freeze + Telegram-алерт.
- src/main.py — блок serial_gate в GET /queue + POST /serial-gate/unfreeze.
- src/config.py — serial_gate_enabled / serial_gate_repos / serial_gate_freeze_enabled.

FIFO-уточнение реализации (FR-2): ADR-001 D1 фиксировал t2.id != jobs.task_id; при !=
пакет одновременно созданных свежих задач взаимно блокировался бы (дедлок). t2.id <
jobs.task_id допускает самую раннюю задачу и сериализует остальные, сохраняя AC-1/R-7.

STAGE_TRANSITIONS / QG_CHECKS / check_* — без изменений. Аддитивно, под kill-switch,
never-raise, restart-safe; при выключенном флаге — нулевая регрессия (enduro не затронут).

Тесты: TC-01..TC-22 (test_serial_gate*.py + test_queue_endpoint.py); полный прогон 1114 зелёных.
Docs: README (serial gate / /queue / API / БД), CLAUDE.md, CHANGELOG.md, .env.example.

Refs: ORCH-088
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 11:24:48 +03:00

405 lines
16 KiB
Python

"""ORCH-088 (Этап 1, serial e2e): per-repo serial gate + durable rollback-freeze.
Leaf module — pure, unit-testable logic over the existing ``tasks`` / ``jobs``
tables and the additive ``repo_freeze`` table (see src/db.py /
08-data-requirements.md). Mirrors the leaf pattern of ``src/task_deps.py`` /
``src/post_deploy.py``: imports only ``db`` + ``config`` (and lazily
``projects`` for the snapshot), never ``stage_engine`` / ``launcher``.
What it enforces (ADR-001):
* A NEW task's analyst-job does NOT enter analysis (no branch cut, no analyst
agent) while the same repo has an EARLIER unfinished task (lower ``tasks.id``
with ``tasks.stage != 'done'`` — FIFO, see ``build_claim_clause``) OR the repo
is frozen. The gate is a SQL fragment spliced into ``db.claim_next_job``
(offline hot path) — ``build_claim_clause``.
* After a post-deploy ``DEGRADED`` verdict the repo is frozen
(``set_repo_freeze``); the gate stays CLOSED until an operator clears it
(``clear_repo_freeze``). The degraded task is already ``stage='done'`` (BR-7)
so freeze is a SEPARATE durable signal, not derived from a stage.
never-raise contract (self-hosting safety): every public function degrades
conservatively and NEVER propagates into the worker / webhook / stage engine.
Two deliberately different failure directions (ADR-001 D10, NFR-1):
* hot-claim clause build -> fail-OPEN ("" fragment): a transient DB/build error
must not wedge the queue of ALL projects (AC-8).
* freeze decision (``is_repo_frozen``) -> fail-CLOSED (``True``): when we cannot
confirm the ABSENCE of a freeze we keep the gate closed for prod safety (AC-9).
"""
from __future__ import annotations
import logging
import re
from . import db
from .config import settings
logger = logging.getLogger("orchestrator.serial_gate")
# Every repo token embedded into the claim SQL ``IN (...)`` list must match this
# pattern — a guard against a broken/injected ORCH_SERIAL_GATE_REPOS CSV (R-6).
# The CSV is an operator config (not user input), but the guard is mandatory: a
# token that fails the match is dropped by ``_scope_repos``, which logs a
# warning for each dropped token.
_REPO_TOKEN = re.compile(r"^[A-Za-z0-9._-]+$")
# ---------------------------------------------------------------------------
# Conditionality (mirrors post_deploy_applies / _merge_gate_applies)
# ---------------------------------------------------------------------------
def _scope_repos() -> set[str]:
    """Return the sanitised in-scope repo tokens parsed from ``serial_gate_repos``.

    The setting is a comma-separated list. Each piece is whitespace-stripped;
    blank pieces are skipped, pieces failing ``_REPO_TOKEN`` are dropped with a
    warning, and the rest are collected into a set.

    An empty result means "apply to ALL repos" (D5) to the callers in this
    module. Never raises: a failure while reading the setting yields an empty
    set.

    NOTE(review): a non-empty CSV whose tokens are ALL invalid also yields an
    empty set, which callers cannot tell apart from "no filter configured".
    """
    try:
        csv_value = (settings.serial_gate_repos or "").strip()
    except Exception:  # noqa: BLE001
        return set()

    accepted: set[str] = set()
    # Splitting an empty string yields one blank piece, which is skipped below,
    # so a blank CSV naturally produces an empty set.
    for piece in csv_value.split(","):
        candidate = piece.strip()
        if not candidate:
            continue
        if not _REPO_TOKEN.match(candidate):
            logger.warning("serial_gate: dropping invalid repo token %r from CSV", candidate)
            continue
        accepted.add(candidate)
    return accepted
def serial_gate_applies(repo: str) -> bool:
    """Tell whether the serial gate is in effect for ``repo`` (D5 / AC-7).

    Decision order:

    * ``serial_gate_enabled`` falsy or missing -> ``False`` (kill-switch).
    * ``serial_gate_repos`` yields a non-empty scope -> ``True`` only when the
      whitespace-stripped ``repo`` is one of the scoped tokens.
    * empty scope -> ``True`` for every repo.

    Never raises: any error is logged as a warning and reported as ``False``
    (gate inert, same as the kill-switch being off).
    """
    try:
        if not getattr(settings, "serial_gate_enabled", False):
            return False
        allowed = _scope_repos()
        # An empty scope means "no repo filter"; otherwise require membership.
        return not allowed or (repo or "").strip() in allowed
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("serial_gate_applies error for %s: %s", repo, e)
        return False
def _freeze_layer_enabled() -> bool:
    """Report whether the FR-5 freeze layer is switched on (independent tumbler, D7).

    Reads ``settings.serial_gate_freeze_enabled`` (missing attribute counts as
    off) and coerces it to a bool. Never raises: any error yields ``False``.
    """
    try:
        flag = getattr(settings, "serial_gate_freeze_enabled", False)
        return True if flag else False
    except Exception:  # noqa: BLE001
        return False
# ---------------------------------------------------------------------------
# Read helpers (active task + freeze) — only the local DB
# ---------------------------------------------------------------------------
def repo_has_active_task(repo: str, exclude_task_id: int | None = None) -> bool:
    """Return True iff ``repo`` has a task whose ``stage`` is not ``'done'``.

    Args:
        repo: repository name matched against ``tasks.repo``.
        exclude_task_id: optional ``tasks.id`` to leave out of the check, so the
            task being evaluated does not count ITSELF as the blocker (R-7).

    Returns:
        ``True`` when at least one matching row exists; ``False`` otherwise or
        on any error (never raises; the error is logged as a warning).

    NOTE(review): this excludes by ``id != ?`` whereas the SQL gate built by
    ``build_claim_clause`` compares ``t2.id < jobs.task_id``, so the two are
    not equivalent for a batch of unfinished tasks — confirm callers expect the
    "any other task" semantics.
    """
    # Pick the statement and its parameters up front; the DB round-trip below
    # is then identical for both variants.
    if exclude_task_id is None:
        sql = "SELECT 1 FROM tasks WHERE repo=? AND stage != 'done' LIMIT 1"
        params: tuple = (repo,)
    else:
        sql = "SELECT 1 FROM tasks WHERE repo=? AND id != ? AND stage != 'done' LIMIT 1"
        params = (repo, exclude_task_id)

    try:
        conn = db.get_db()
        try:
            hit = conn.execute(sql, params).fetchone()
            return hit is not None
        finally:
            conn.close()
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("repo_has_active_task error for %s: %s", repo, e)
        return False
def _active_freeze_row(repo: str) -> dict | None:
    """Fetch the newest active freeze row for ``repo`` as a dict, or ``None``.

    "Active" means ``cleared_at IS NULL``; "newest" means highest ``id``. The
    dict carries ``repo``, ``frozen_at``, ``reason`` and ``work_item_id``.

    Raises:
        Whatever the DB layer raises. This private helper deliberately does NOT
        swallow errors so that ``is_repo_frozen`` can fail CLOSED on them.
    """
    query = (
        "SELECT repo, frozen_at, reason, work_item_id FROM repo_freeze "
        "WHERE repo=? AND cleared_at IS NULL ORDER BY id DESC LIMIT 1"
    )
    conn = db.get_db()
    try:
        found = conn.execute(query, (repo,)).fetchone()
        if not found:
            return None
        return dict(found)
    finally:
        # Always release the connection, including when the query raised.
        conn.close()
def is_repo_frozen(repo: str) -> bool:
    """Return True iff ``repo`` currently has an active freeze (FR-5).

    * Freeze layer disabled -> always ``False`` (no DB access).
    * Freeze layer enabled -> ``True`` when an active freeze row exists.
    * fail-CLOSED (AC-9): if the lookup raises, the absence of a freeze cannot
      be confirmed, so a warning is logged and ``True`` is returned.
    """
    if not _freeze_layer_enabled():
        return False
    try:
        active = _active_freeze_row(repo)
    except Exception as e:  # noqa: BLE001 - fail-CLOSED on doubt (AC-9)
        logger.warning("is_repo_frozen error for %s -> fail-CLOSED (frozen): %s", repo, e)
        return True
    return active is not None
# ---------------------------------------------------------------------------
# Freeze mutators (FR-5)
# ---------------------------------------------------------------------------
def set_repo_freeze(repo: str, reason: str = "", work_item_id: str | None = None) -> bool:
    """Record a durable freeze for ``repo`` by inserting a ``repo_freeze`` row.

    Append-only: calling it again while the repo is already frozen adds another
    (history) row.

    Args:
        repo: repository to freeze; an empty value is rejected.
        reason: free-text reason; an empty string is stored as NULL.
        work_item_id: optional work item associated with the freeze.

    Returns:
        ``True`` iff a row was inserted. ``False`` when the freeze layer is
        disabled, ``repo`` is empty, or any error occurred (never raises; the
        error is logged).
    """
    if not _freeze_layer_enabled():
        logger.info("set_repo_freeze: freeze layer disabled, skipping for %s", repo)
        return False
    if not repo:
        return False

    insert_sql = "INSERT INTO repo_freeze (repo, reason, work_item_id) VALUES (?, ?, ?)"
    try:
        conn = db.get_db()
        try:
            conn.execute(insert_sql, (repo, reason or None, work_item_id))
            conn.commit()
        finally:
            conn.close()
        logger.warning(
            "serial_gate: repo %s FROZEN (reason=%r, work_item=%s) — next task will "
            "NOT start until manual unfreeze", repo, reason, work_item_id,
        )
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.error("set_repo_freeze error for %s: %s", repo, e)
        return False
    return True
def clear_repo_freeze(repo: str) -> int:
    """Clear every active freeze row of ``repo`` (operator unfreeze, D4).

    Stamps ``cleared_at`` with the DB's current time on all rows of ``repo``
    where ``cleared_at IS NULL``. Calling it again right away affects no rows.
    Unlike ``set_repo_freeze`` this does not consult the freeze-layer flag.

    Returns:
        The number of rows cleared; ``0`` when ``repo`` is empty or on any
        error (never raises; the error is logged).
    """
    if not repo:
        return 0

    update_sql = (
        "UPDATE repo_freeze SET cleared_at=datetime('now') "
        "WHERE repo=? AND cleared_at IS NULL"
    )
    try:
        conn = db.get_db()
        try:
            cursor = conn.execute(update_sql, (repo,))
            conn.commit()
            cleared = cursor.rowcount or 0
        finally:
            conn.close()
        # Only announce an unfreeze that actually changed something.
        if cleared:
            logger.warning("serial_gate: repo %s UNFROZEN (%d row(s) cleared)", repo, cleared)
        return cleared
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.error("clear_repo_freeze error for %s: %s", repo, e)
        return 0
# ---------------------------------------------------------------------------
# Hot-claim SQL fragment (fail-OPEN) — ADR-001 D1
# ---------------------------------------------------------------------------
def build_claim_clause() -> str:
    """Build the ``AND NOT (...)`` SQL fragment spliced into ``claim_next_job``.

    The fragment excludes a job when ALL of the following hold:

    * ``jobs.agent = 'analyst'`` — only analyst-jobs are gated, so jobs of an
      already-active task pass freely (else that task could never advance);
    * the repo is in scope — emitted as ``AND jobs.repo IN ('a', 'b')`` with
      sorted, regex-sanitised tokens (R-6), or omitted entirely when the scope
      is empty (all repos);
    * the repo has an EARLIER unfinished task (``t2.id < jobs.task_id`` and
      ``t2.stage != 'done'``) OR, when the freeze layer is enabled, an active
      ``repo_freeze`` row.

    Ordering term (FIFO, ADR-001 D1 / FR-2): ``<`` rather than the ADR's
    pseudo-SQL ``!=``. With ``!=`` a batch of fresh tasks would each be
    "another unfinished task" for the others and block one another; ``<`` lets
    the oldest unfinished task through and holds the rest behind it, and a job
    is never blocked by its own task row (R-7).

    Returns:
        The fragment, ending in a space so the spliced SQL stays valid, or
        ``""`` when the kill-switch is off or on any build error (fail-OPEN,
        AC-8 — the claim then behaves as before ORCH-088).
    """
    try:
        if not getattr(settings, "serial_gate_enabled", False):
            return ""

        pieces = ["AND NOT ( jobs.agent = 'analyst' "]

        tokens = _scope_repos()
        if tokens:
            # Every token already passed the _REPO_TOKEN regex -> safe to embed.
            quoted = ", ".join(f"'{t}'" for t in sorted(tokens))
            pieces.append(f"AND jobs.repo IN ({quoted}) ")

        # Blocking condition: an earlier unfinished task, optionally OR-ed with
        # an active freeze row.
        pieces.append("AND ( ")
        pieces.append(
            "EXISTS (SELECT 1 FROM tasks t2 "
            "WHERE t2.repo = jobs.repo AND t2.id < jobs.task_id "
            "AND t2.stage != 'done') "
        )
        if _freeze_layer_enabled():
            pieces.append(
                "OR EXISTS (SELECT 1 FROM repo_freeze f "
                "WHERE f.repo = jobs.repo AND f.cleared_at IS NULL) "
            )
        pieces.append(") ")  # closes "AND ( ..."
        pieces.append(") ")  # closes "AND NOT ( ..."
        return "".join(pieces)
    except Exception as e:  # noqa: BLE001 - fail-OPEN: never wedge the queue
        logger.warning("build_claim_clause error -> fail-OPEN (no gate): %s", e)
        return ""
# ---------------------------------------------------------------------------
# Observability snapshot for GET /queue (D9 / AC-10)
# ---------------------------------------------------------------------------
def _known_repos() -> list[str]:
    """Return the sorted repo names worth showing in the gate snapshot.

    The result is the union of three best-effort sources:

    * ``projects.PROJECTS`` entries with a truthy ``repo`` attribute (the
      module is imported lazily here);
    * repos with an active (``cleared_at IS NULL``) ``repo_freeze`` row;
    * repos with a queued analyst-job.

    The two DB lookups surface repos that are missing from the static registry
    (defensive — never hide a frozen repo). Each source is isolated: a failure
    in one is logged as a warning and does not prevent the others from
    contributing, so a broken ``repo_freeze`` query cannot hide repos with
    queued analyst-jobs (and vice versa).
    """
    repos: set[str] = set()

    try:
        from . import projects
        for p in projects.PROJECTS:
            if getattr(p, "repo", None):
                repos.add(p.repo)
    except Exception as e:  # noqa: BLE001 - best-effort source
        logger.warning("serial_gate: project registry unavailable for snapshot: %s", e)

    live_state_queries = (
        ("active freeze",
         "SELECT DISTINCT repo FROM repo_freeze WHERE cleared_at IS NULL"),
        ("queued analyst-job",
         "SELECT DISTINCT repo FROM jobs WHERE status='queued' AND agent='analyst'"),
    )
    try:
        conn = db.get_db()
        try:
            for label, sql in live_state_queries:
                # Per-query guard: a failed statement leaves the connection
                # usable, so the remaining lookup still runs.
                try:
                    for (name,) in conn.execute(sql).fetchall():
                        if name:
                            repos.add(name)
                except Exception as e:  # noqa: BLE001 - best-effort source
                    logger.warning(
                        "serial_gate: %s repo lookup failed for snapshot: %s", label, e
                    )
        finally:
            conn.close()
    except Exception as e:  # noqa: BLE001 - DB open/close failure
        logger.warning("serial_gate: DB unavailable for snapshot repo lookup: %s", e)

    return sorted(repos)
def _per_repo_snapshot(repo: str) -> dict:
    """Collect the gate state of one repo for the /queue snapshot.

    Returns a dict with:

    * ``active_task`` — ``{"work_item_id", "stage"}`` of the lowest-id task of
      the repo that is not ``'done'``, or ``None``;
    * ``waiting`` — queued analyst-jobs of the repo ordered by job id, each as
      ``{"job_id", "work_item_id", "stage"}`` (task fields come from a LEFT
      JOIN, so they may be ``None``);
    * ``frozen`` — result of ``is_repo_frozen`` (which fails CLOSED);
    * ``frozen_reason`` / ``frozen_at`` — taken from the active freeze row when
      frozen and the row can be read, else ``None``.

    DB errors while reading tasks/jobs are logged and leave whatever was
    gathered so far; errors while reading freeze details are ignored.
    """
    oldest_open = None
    queued: list[dict] = []
    try:
        conn = db.get_db()
        try:
            head = conn.execute(
                "SELECT work_item_id, stage FROM tasks "
                "WHERE repo=? AND stage != 'done' ORDER BY id LIMIT 1",
                (repo,),
            ).fetchone()
            if head:
                oldest_open = {"work_item_id": head["work_item_id"], "stage": head["stage"]}

            job_rows = conn.execute(
                "SELECT j.id AS job_id, t.work_item_id AS work_item_id, t.stage AS stage "
                "FROM jobs j LEFT JOIN tasks t ON t.id = j.task_id "
                "WHERE j.repo=? AND j.status='queued' AND j.agent='analyst' "
                "ORDER BY j.id",
                (repo,),
            ).fetchall()
            queued.extend(
                {
                    "job_id": j["job_id"],
                    "work_item_id": j["work_item_id"],
                    "stage": j["stage"],
                }
                for j in job_rows
            )
        finally:
            conn.close()
    except Exception as e:  # noqa: BLE001
        logger.warning("serial_gate per-repo snapshot error for %s: %s", repo, e)

    frozen = is_repo_frozen(repo)
    reason = None
    since = None
    if frozen:
        # Details are optional: ``frozen`` may be True purely because the
        # lookup failed closed, in which case there is no row to describe.
        try:
            detail = _active_freeze_row(repo)
        except Exception:  # noqa: BLE001
            detail = None
        if detail:
            reason = detail.get("reason")
            since = detail.get("frozen_at")

    return {
        "active_task": oldest_open,
        "waiting": queued,
        "frozen": frozen,
        "frozen_reason": reason,
        "frozen_at": since,
    }
def snapshot() -> dict:
    """Produce the read-only serial-gate summary for GET /queue (D9 / AC-10).

    Returns a dict with:

    * ``enabled`` — bool of ``settings.serial_gate_enabled`` (``False`` if it
      cannot be read);
    * ``freeze_enabled`` — state of the freeze layer;
    * ``repos`` — the raw ``serial_gate_repos`` setting (``""`` if unset or
      unreadable);
    * ``per_repo`` — ``{repo: per-repo snapshot}`` for every known repo.

    Never raises: if gathering the per-repo data fails, a warning is logged and
    a minimal dict is returned with ``freeze_enabled=False`` and an empty
    ``per_repo``.
    """
    try:
        gate_on = bool(getattr(settings, "serial_gate_enabled", False))
    except Exception:  # noqa: BLE001
        gate_on = False
    try:
        csv_cfg = getattr(settings, "serial_gate_repos", "") or ""
    except Exception:  # noqa: BLE001
        csv_cfg = ""

    # Start from the minimal (fallback) shape and upgrade it on success.
    summary = {
        "enabled": gate_on,
        "freeze_enabled": False,
        "repos": csv_cfg,
        "per_repo": {},
    }
    try:
        per_repo = {}
        for name in _known_repos():
            per_repo[name] = _per_repo_snapshot(name)
        freeze_on = _freeze_layer_enabled()
    except Exception as e:  # noqa: BLE001 - never-raise -> minimal dict
        logger.warning("serial_gate snapshot error: %s", e)
        return summary

    summary["freeze_enabled"] = freeze_on
    summary["per_repo"] = per_repo
    return summary