Этап 1 (serial e2e) пакетного автономного режима. Новая задача репо не входит в analysis (analyst-job не выбирается, ветка не режется), пока в репо есть более ранняя незавершённая задача (FIFO, t2.id < jobs.task_id) ИЛИ репо заморожен. - src/serial_gate.py — новый leaf (never-raise): build_claim_clause (fail-OPEN), is_repo_frozen (fail-CLOSED), set/clear_repo_freeze, serial_gate_applies, snapshot. - src/db.py — идемпотентная миграция repo_freeze + serial_gate-фрагмент в claim_next_job. - src/webhooks/plane.py + src/agents/launcher.py — отложенный срез ветки: start_pipeline не создаёт Gitea-ветку/docs для применимого репо; релокация в _materialize_deferred_branch на момент claim analyst-job (база = свежий origin/main с кодом предшественника, AC-6). - src/stage_engine.py — post-deploy DEGRADED → durable per-repo freeze + Telegram-алерт. - src/main.py — блок serial_gate в GET /queue + POST /serial-gate/unfreeze. - src/config.py — serial_gate_enabled / serial_gate_repos / serial_gate_freeze_enabled. FIFO-уточнение реализации (FR-2): ADR-001 D1 фиксировал t2.id != jobs.task_id; при != пакет одновременно созданных свежих задач взаимно блокировался бы (дедлок). t2.id < jobs.task_id допускает самую раннюю задачу и сериализует остальные, сохраняя AC-1/R-7. STAGE_TRANSITIONS / QG_CHECKS / check_* — без изменений. Аддитивно, под kill-switch, never-raise, restart-safe; при выключенном флаге — нулевая регрессия (enduro не затронут). Тесты: TC-01..TC-22 (test_serial_gate*.py + test_queue_endpoint.py); полный прогон 1114 зелёных. Docs: README (serial gate / /queue / API / БД), CLAUDE.md, CHANGELOG.md, .env.example. Refs: ORCH-088 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
405 lines
16 KiB
Python
405 lines
16 KiB
Python
"""ORCH-088 (Этап 1, serial e2e): per-repo serial gate + durable rollback-freeze.
|
|
|
|
Leaf module — pure, unit-testable logic over the existing ``tasks`` / ``jobs``
|
|
tables and the additive ``repo_freeze`` table (see src/db.py /
|
|
08-data-requirements.md). Mirrors the leaf pattern of ``src/task_deps.py`` /
|
|
``src/post_deploy.py``: imports only ``db`` + ``config`` (and lazily
|
|
``projects`` for the snapshot), never ``stage_engine`` / ``launcher``.
|
|
|
|
What it enforces (ADR-001):
|
|
* A NEW task's analyst-job does NOT enter analysis (no branch cut, no analyst
  agent) while the same repo has an EARLIER unfinished task (lower
  ``tasks.id`` with ``tasks.stage != 'done'``) OR the repo is frozen. The gate
  is a SQL fragment spliced into ``db.claim_next_job`` (offline hot path) —
  ``build_claim_clause``.
|
|
* After a post-deploy ``DEGRADED`` verdict the repo is frozen
|
|
(``set_repo_freeze``); the gate stays CLOSED until an operator clears it
|
|
(``clear_repo_freeze``). The degraded task is already ``stage='done'`` (BR-7)
|
|
so freeze is a SEPARATE durable signal, not derived from a stage.
|
|
|
|
never-raise contract (self-hosting safety): every public function degrades
|
|
conservatively and NEVER propagates into the worker / webhook / stage engine.
|
|
Two deliberately different failure directions (ADR-001 D10, NFR-1):
|
|
* hot-claim clause build -> fail-OPEN ("" fragment): a transient DB/build error
|
|
must not wedge the queue of ALL projects (AC-8).
|
|
* freeze decision (``is_repo_frozen``) -> fail-CLOSED (``True``): when we cannot
|
|
confirm the ABSENCE of a freeze we keep the gate closed for prod safety (AC-9).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
|
|
from . import db
|
|
from .config import settings
|
|
|
|
logger = logging.getLogger("orchestrator.serial_gate")
|
|
|
|
# Repo tokens embedded into the claim SQL ``IN (...)`` list must match this — a
|
|
# guard against a broken/injected ORCH_SERIAL_GATE_REPOS CSV (R-6). The CSV is an
|
|
# operator config (not user input), but the guard is mandatory; an invalid token
# is dropped from the scope and reported with a warning log.
|
|
_REPO_TOKEN = re.compile(r"^[A-Za-z0-9._-]+$")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Conditionality (mirrors post_deploy_applies / _merge_gate_applies)
|
|
# ---------------------------------------------------------------------------
|
|
def _scope_repos() -> set[str]:
    """Parse ``settings.serial_gate_repos`` (CSV) into a set of valid repo tokens.

    Returns:
        The tokens that match ``_REPO_TOKEN``. An empty set is returned for a
        blank/empty CSV (callers treat that as "apply to ALL repos", D5) and
        also when the setting cannot be read.

    Tokens failing the regex are dropped and reported with a warning log.
    Never raises on a config read error.
    """
    try:
        csv_value = (settings.serial_gate_repos or "").strip()
    except Exception:  # noqa: BLE001 - unreadable config degrades to "no scope"
        return set()

    accepted: set[str] = set()
    # A blank CSV splits into [""], which the emptiness guard below skips, so
    # the result is the empty set without a dedicated early return.
    for piece in csv_value.split(","):
        candidate = piece.strip()
        if not candidate:
            continue
        if _REPO_TOKEN.match(candidate) is None:
            logger.warning("serial_gate: dropping invalid repo token %r from CSV", candidate)
            continue
        accepted.add(candidate)
    return accepted
|
|
|
|
|
|
def serial_gate_applies(repo: str) -> bool:
    """Tell whether the serial gate is in force for ``repo`` (D5 / AC-7).

    Decision order:

    1. ``settings.serial_gate_enabled`` falsy or missing -> ``False``
       (kill-switch).
    2. ``_scope_repos()`` non-empty -> ``True`` only when the stripped ``repo``
       is one of the listed tokens.
    3. Empty scope -> ``True`` for every repo.

    Args:
        repo: Repo name to test; ``None``/empty is treated as ``""``.

    Returns:
        ``True`` when the gate applies. Any exception is logged as a warning
        and reported as ``False`` (gate inert, same as the kill-switch being
        off), so this function never raises.
    """
    try:
        enabled = getattr(settings, "serial_gate_enabled", False)
        if not enabled:
            return False
        listed = _scope_repos()
        # Empty scope short-circuits to True without touching ``repo``.
        return not listed or (repo or "").strip() in listed
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("serial_gate_applies error for %s: %s", repo, e)
        return False
|
|
|
|
|
|
def _freeze_layer_enabled() -> bool:
    """Report whether the FR-5 freeze layer is switched on (independent flag, D7).

    Reads ``settings.serial_gate_freeze_enabled`` (missing attribute counts as
    off) and coerces it to ``bool``. Any error while reading or coercing the
    flag yields ``False``.
    """
    try:
        flag = getattr(settings, "serial_gate_freeze_enabled", False)
        active = bool(flag)
    except Exception:  # noqa: BLE001
        active = False
    return active
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Read helpers (active task + freeze) — only the local DB
|
|
# ---------------------------------------------------------------------------
|
|
def repo_has_active_task(repo: str, exclude_task_id: int | None = None) -> bool:
    """Check whether ``repo`` has at least one task whose stage is not ``'done'``.

    Args:
        repo: Repo name matched against ``tasks.repo``.
        exclude_task_id: Optional ``tasks.id`` to leave out of the check, so the
            task being evaluated does not count itself as the blocker (R-7).

    Returns:
        ``True`` when a matching row exists. Any error (opening the DB, running
        the query, closing the connection) is logged as a warning and reported
        as ``False``; the function never raises.

    NOTE(review): this helper excludes only the given task (``id != ?``),
    whereas the claim fragment produced by ``build_claim_clause`` counts only
    EARLIER tasks (``t2.id < jobs.task_id``). For a batch of fresh tasks the two
    can disagree — confirm callers expect that.
    """
    if exclude_task_id is None:
        query = "SELECT 1 FROM tasks WHERE repo=? AND stage != 'done' LIMIT 1"
        params: tuple = (repo,)
    else:
        query = "SELECT 1 FROM tasks WHERE repo=? AND id != ? AND stage != 'done' LIMIT 1"
        params = (repo, exclude_task_id)

    try:
        conn = db.get_db()
        try:
            return conn.execute(query, params).fetchone() is not None
        finally:
            conn.close()
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("repo_has_active_task error for %s: %s", repo, e)
        return False
|
|
|
|
|
|
def _active_freeze_row(repo: str) -> dict | None:
    """Fetch the newest active freeze row for ``repo``.

    "Active" means ``cleared_at IS NULL``; "newest" is the highest ``id``.

    Returns:
        A dict with the ``repo``, ``frozen_at``, ``reason`` and ``work_item_id``
        columns, or ``None`` when no active row exists.

    Raises:
        Whatever the DB layer raises. This private helper deliberately does not
        catch anything, so that ``is_repo_frozen`` can turn a failure into a
        fail-CLOSED answer. The connection is closed on every path.
    """
    conn = db.get_db()
    try:
        cursor = conn.execute(
            "SELECT repo, frozen_at, reason, work_item_id FROM repo_freeze "
            "WHERE repo=? AND cleared_at IS NULL ORDER BY id DESC LIMIT 1",
            (repo,),
        )
        hit = cursor.fetchone()
        if not hit:
            return None
        return dict(hit)
    finally:
        conn.close()
|
|
|
|
|
|
def is_repo_frozen(repo: str) -> bool:
    """Report whether ``repo`` currently has an active freeze (FR-5).

    * Freeze layer disabled -> always ``False``.
    * Freeze layer enabled -> ``True`` iff ``_active_freeze_row`` finds a row.
    * Lookup failure -> ``True`` (fail-CLOSED, AC-9): when the absence of a
      freeze cannot be confirmed, the gate stays closed. The error is logged
      as a warning.
    """
    if _freeze_layer_enabled():
        try:
            frozen = _active_freeze_row(repo) is not None
        except Exception as e:  # noqa: BLE001 - fail-CLOSED on doubt (AC-9)
            logger.warning("is_repo_frozen error for %s -> fail-CLOSED (frozen): %s", repo, e)
            frozen = True
        return frozen
    return False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Freeze mutators (FR-5)
|
|
# ---------------------------------------------------------------------------
|
|
def set_repo_freeze(repo: str, reason: str = "", work_item_id: str | None = None) -> bool:
    """Append a freeze row for ``repo`` to ``repo_freeze``.

    Args:
        repo: Repo to freeze; an empty value is rejected.
        reason: Free-text reason; an empty string is stored as NULL.
        work_item_id: Optional work item associated with the freeze.

    Returns:
        ``True`` iff a row was inserted and committed. ``False`` when the freeze
        layer is disabled (logged at info level), when ``repo`` is empty, or
        when the write fails (logged at error level). Never raises.

    The write is a plain INSERT with no existence check, so calling this again
    while the repo is already frozen adds another history row.
    """
    if not _freeze_layer_enabled():
        logger.info("set_repo_freeze: freeze layer disabled, skipping for %s", repo)
        return False
    if not repo:
        return False

    row_values = (repo, reason or None, work_item_id)
    try:
        conn = db.get_db()
        try:
            conn.execute(
                "INSERT INTO repo_freeze (repo, reason, work_item_id) VALUES (?, ?, ?)",
                row_values,
            )
            conn.commit()
        finally:
            conn.close()
        logger.warning(
            "serial_gate: repo %s FROZEN (reason=%r, work_item=%s) — next task will "
            "NOT start until manual unfreeze", repo, reason, work_item_id,
        )
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.error("set_repo_freeze error for %s: %s", repo, e)
        return False
    return True
|
|
|
|
|
|
def clear_repo_freeze(repo: str) -> int:
    """Mark every active freeze row of ``repo`` as cleared (operator unfreeze, D4).

    Stamps ``cleared_at`` with SQLite's ``datetime('now')`` on all rows where
    ``cleared_at IS NULL``.

    Args:
        repo: Repo to unfreeze; an empty value is a no-op.

    Returns:
        Number of rows cleared. This is 0 when nothing was active (so a repeat
        call is harmless), when ``repo`` is empty, or when the update fails
        (logged at error level). Never raises.
    """
    if not repo:
        return 0

    try:
        conn = db.get_db()
        try:
            cursor = conn.execute(
                "UPDATE repo_freeze SET cleared_at=datetime('now') "
                "WHERE repo=? AND cleared_at IS NULL",
                (repo,),
            )
            conn.commit()
            cleared = cursor.rowcount or 0
        finally:
            conn.close()
        # Only announce an unfreeze that actually changed something.
        if cleared:
            logger.warning("serial_gate: repo %s UNFROZEN (%d row(s) cleared)", repo, cleared)
        return cleared
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.error("clear_repo_freeze error for %s: %s", repo, e)
        return 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Hot-claim SQL fragment (fail-OPEN) — ADR-001 D1
|
|
# ---------------------------------------------------------------------------
|
|
def build_claim_clause() -> str:
    """Produce the ``AND NOT (...)`` SQL fragment spliced into ``claim_next_job``.

    The fragment rejects a job row when ALL of the following hold:

    * ``jobs.agent = 'analyst'`` — only analyst-jobs are gated, so jobs of an
      already-running task pass freely;
    * the repo is in scope — with a non-empty ``_scope_repos()`` an
      ``AND jobs.repo IN ('a', 'b')`` term is added (tokens sorted, already
      validated by the ``_REPO_TOKEN`` regex, R-6); with an empty scope there
      is no repo filter;
    * a blocker exists — either an EARLIER unfinished task in the same repo
      (``t2.id < jobs.task_id`` and ``t2.stage != 'done'``) or, when the freeze
      layer is enabled, an active ``repo_freeze`` row for the repo.

    Why ``<`` and not ``!=`` (ADR-001 D1 / FR-2): with ``!=`` a batch of fresh
    tasks would each count as "another unfinished task" for the others and
    block one another forever. ``<`` admits the oldest unfinished task and
    queues the rest behind it ("strictly one at a time, FIFO"), and a task can
    never be blocked by its own row (R-7).

    Returns:
        The fragment, ending in a space so the surrounding SQL stays valid, or
        ``""`` when ``settings.serial_gate_enabled`` is falsy/missing or when
        building fails (fail-OPEN, AC-8: the claim behaves as if no gate
        existed). Never raises.
    """
    try:
        if not getattr(settings, "serial_gate_enabled", False):
            return ""

        conditions = ["jobs.agent = 'analyst' "]

        tokens = _scope_repos()
        if tokens:
            # Every token already passed the _REPO_TOKEN regex -> safe to embed.
            quoted = ", ".join("'" + name + "'" for name in sorted(tokens))
            conditions.append("AND jobs.repo IN (" + quoted + ") ")

        blockers = (
            "EXISTS (SELECT 1 FROM tasks t2 "
            "WHERE t2.repo = jobs.repo AND t2.id < jobs.task_id "
            "AND t2.stage != 'done') "
        )
        if _freeze_layer_enabled():
            blockers += (
                "OR EXISTS (SELECT 1 FROM repo_freeze f "
                "WHERE f.repo = jobs.repo AND f.cleared_at IS NULL) "
            )
        conditions.append("AND ( " + blockers + ") ")

        return "AND NOT ( " + "".join(conditions) + ") "
    except Exception as e:  # noqa: BLE001 - fail-OPEN: never wedge the queue
        logger.warning("build_claim_clause error -> fail-OPEN (no gate): %s", e)
        return ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Observability snapshot for GET /queue (D9 / AC-10)
|
|
# ---------------------------------------------------------------------------
|
|
def _known_repos() -> list[str]:
    """Return the sorted repo names the /queue snapshot should report on.

    Three sources are merged into one set:

    * ``projects.PROJECTS`` — every entry with a truthy ``repo`` attribute
      (the module is imported lazily inside the function);
    * repos with an active freeze row (``repo_freeze.cleared_at IS NULL``);
    * repos with a queued analyst-job (``jobs.status='queued'`` and
      ``jobs.agent='analyst'``).

    The last two surface repos that are missing from the static registry, so a
    frozen repo is not hidden from the snapshot.

    Every source is best-effort and isolated from the others: a failure of the
    registry import or of one DB query is logged and skipped, and the remaining
    sources still contribute. Never raises on lookup errors.
    """
    repos: set[str] = set()

    try:
        from . import projects

        repos.update(p.repo for p in projects.PROJECTS if getattr(p, "repo", None))
    except Exception as e:  # noqa: BLE001 - registry is optional for the snapshot
        logger.debug("serial_gate: project registry unavailable for snapshot: %s", e)

    live_state_queries = (
        "SELECT DISTINCT repo FROM repo_freeze WHERE cleared_at IS NULL",
        "SELECT DISTINCT repo FROM jobs WHERE status='queued' AND agent='analyst'",
    )
    try:
        conn = db.get_db()
        try:
            for query in live_state_queries:
                # Each query fails on its own: a broken repo_freeze lookup must
                # not also drop the repos that have queued analyst-jobs.
                try:
                    rows = conn.execute(query).fetchall()
                except Exception as e:  # noqa: BLE001 - best-effort per query
                    logger.warning(
                        "serial_gate: known-repos query failed (%s): %s", query, e
                    )
                    continue
                repos.update(row[0] for row in rows if row[0])
        finally:
            conn.close()
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("serial_gate: known-repos DB lookup failed: %s", e)

    return sorted(repos)
|
|
|
|
|
|
def _per_repo_snapshot(repo: str) -> dict:
    """Collect the gate state of one repo for the /queue snapshot.

    Returns:
        A dict with these keys, in this order:

        * ``active_task`` — ``{"work_item_id", "stage"}`` of the lowest-id task
          whose stage is not ``'done'``, or ``None``;
        * ``waiting`` — queued analyst-jobs of the repo ordered by job id, each
          as ``{"job_id", "work_item_id", "stage"}`` (task columns come from a
          LEFT JOIN, so they may be NULL);
        * ``frozen`` — result of ``is_repo_frozen(repo)``;
        * ``frozen_reason`` / ``frozen_at`` — taken from the active freeze row
          when the repo is frozen and the row can be read, else ``None``.

    A failure while reading tasks/jobs is logged as a warning and leaves
    whatever was gathered so far. A failure while reading the freeze details is
    ignored.
    """
    active_task = None
    waiting: list[dict] = []

    try:
        conn = db.get_db()
        try:
            oldest_open = conn.execute(
                "SELECT work_item_id, stage FROM tasks "
                "WHERE repo=? AND stage != 'done' ORDER BY id LIMIT 1",
                (repo,),
            ).fetchone()
            if oldest_open:
                active_task = {
                    "work_item_id": oldest_open["work_item_id"],
                    "stage": oldest_open["stage"],
                }

            queued = conn.execute(
                "SELECT j.id AS job_id, t.work_item_id AS work_item_id, t.stage AS stage "
                "FROM jobs j LEFT JOIN tasks t ON t.id = j.task_id "
                "WHERE j.repo=? AND j.status='queued' AND j.agent='analyst' "
                "ORDER BY j.id",
                (repo,),
            ).fetchall()
            waiting.extend(
                {
                    "job_id": entry["job_id"],
                    "work_item_id": entry["work_item_id"],
                    "stage": entry["stage"],
                }
                for entry in queued
            )
        finally:
            conn.close()
    except Exception as e:  # noqa: BLE001
        logger.warning("serial_gate per-repo snapshot error for %s: %s", repo, e)

    frozen = is_repo_frozen(repo)
    frozen_reason = None
    frozen_at = None
    if frozen:
        # ``frozen`` may be a fail-CLOSED answer, in which case the detail
        # lookup can fail again; the details then simply stay None.
        try:
            details = _active_freeze_row(repo) or {}
            frozen_reason = details.get("reason")
            frozen_at = details.get("frozen_at")
        except Exception:  # noqa: BLE001
            pass

    return {
        "active_task": active_task,
        "waiting": waiting,
        "frozen": frozen,
        "frozen_reason": frozen_reason,
        "frozen_at": frozen_at,
    }
|
|
|
|
|
|
def snapshot() -> dict:
    """Build the read-only serial-gate block for GET /queue (D9 / AC-10).

    Returns:
        A dict with these keys, in this order:

        * ``enabled`` — ``bool(settings.serial_gate_enabled)``, ``False`` if it
          cannot be read;
        * ``freeze_enabled`` — result of ``_freeze_layer_enabled()``;
        * ``repos`` — the raw ``settings.serial_gate_repos`` value, ``""`` when
          unset or unreadable;
        * ``per_repo`` — ``{repo: _per_repo_snapshot(repo)}`` for every repo
          returned by ``_known_repos()``.

    Never raises: if gathering the per-repo data or the freeze flag fails, a
    warning is logged and a minimal dict is returned with
    ``freeze_enabled=False`` and an empty ``per_repo``.
    """
    try:
        enabled = bool(getattr(settings, "serial_gate_enabled", False))
    except Exception:  # noqa: BLE001
        enabled = False

    try:
        repos_cfg = getattr(settings, "serial_gate_repos", "") or ""
    except Exception:  # noqa: BLE001
        repos_cfg = ""

    # Start from the minimal answer; it is returned untouched on failure.
    summary = {
        "enabled": enabled,
        "freeze_enabled": False,
        "repos": repos_cfg,
        "per_repo": {},
    }
    try:
        per_repo = {}
        for name in _known_repos():
            per_repo[name] = _per_repo_snapshot(name)
        freeze_on = _freeze_layer_enabled()
    except Exception as e:  # noqa: BLE001 - never-raise -> minimal dict
        logger.warning("serial_gate snapshot error: %s", e)
        return summary

    summary["freeze_enabled"] = freeze_on
    summary["per_repo"] = per_repo
    return summary
|