"""ORCH-088 (Этап 1, serial e2e): per-repo serial gate + durable rollback-freeze. Leaf module — pure, unit-testable logic over the existing ``tasks`` / ``jobs`` tables and the additive ``repo_freeze`` table (see src/db.py / 08-data-requirements.md). Mirrors the leaf pattern of ``src/task_deps.py`` / ``src/post_deploy.py``: imports only ``db`` + ``config`` (and lazily ``projects`` for the snapshot), never ``stage_engine`` / ``launcher``. What it enforces (ADR-001): * A NEW task's analyst-job does NOT enter analysis (no branch cut, no analyst agent) while the same repo has ANOTHER unfinished task (``tasks.stage != 'done'``) OR the repo is frozen. The gate is a SQL fragment spliced into ``db.claim_next_job`` (offline hot path) — ``build_claim_clause``. * After a post-deploy ``DEGRADED`` verdict the repo is frozen (``set_repo_freeze``); the gate stays CLOSED until an operator clears it (``clear_repo_freeze``). The degraded task is already ``stage='done'`` (BR-7) so freeze is a SEPARATE durable signal, not derived from a stage. never-raise contract (self-hosting safety): every public function degrades conservatively and NEVER propagates into the worker / webhook / stage engine. Two deliberately different failure directions (ADR-001 D10, NFR-1): * hot-claim clause build -> fail-OPEN ("" fragment): a transient DB/build error must not wedge the queue of ALL projects (AC-8). * freeze decision (``is_repo_frozen``) -> fail-CLOSED (``True``): when we cannot confirm the ABSENCE of a freeze we keep the gate closed for prod safety (AC-9). """ from __future__ import annotations import logging import re from . import db from .config import settings logger = logging.getLogger("orchestrator.serial_gate") # Repo tokens embedded into the claim SQL ``IN (...)`` list must match this — a # guard against a broken/injected ORCH_SERIAL_GATE_REPOS CSV (R-6). The CSV is an # operator config (not user input), but the guard is mandatory; an invalid token # is silently dropped. _REPO_TOKEN = re.compile(r"^[A-Za-z0-9._-]+$") # --------------------------------------------------------------------------- # Conditionality (mirrors post_deploy_applies / _merge_gate_applies) # --------------------------------------------------------------------------- def _scope_repos() -> set[str]: """Sanitised set of in-scope repo tokens from ``serial_gate_repos`` (CSV). Empty/blank CSV -> empty set, meaning "apply to ALL repos" (D5). Invalid tokens (regex miss) are dropped. Never raises. """ try: raw = (settings.serial_gate_repos or "").strip() except Exception: # noqa: BLE001 return set() if not raw: return set() out: set[str] = set() for tok in raw.split(","): t = tok.strip() if t and _REPO_TOKEN.match(t): out.add(t) elif t: logger.warning("serial_gate: dropping invalid repo token %r from CSV", t) return out def serial_gate_applies(repo: str) -> bool: """Whether the serial gate is REAL for this repo (D5 / AC-7). * ``serial_gate_enabled=False`` -> always False (kill-switch; claim and start_pipeline are 1:1 as before ORCH-088). * ``serial_gate_repos`` (CSV) non-empty -> real only for listed repos. * empty CSV -> real for ALL repos (serial e2e + anti-stale-base help every repo, unlike the self-hosting-only ORCH-35/43/58 gates). Never raises -> False on error (degrade to "gate inert", the safe-for-flow default that matches the kill-switch off behaviour). """ try: if not getattr(settings, "serial_gate_enabled", False): return False scope = _scope_repos() if scope: return (repo or "").strip() in scope return True except Exception as e: # noqa: BLE001 - never-raise logger.warning("serial_gate_applies error for %s: %s", repo, e) return False def _freeze_layer_enabled() -> bool: """Whether the FR-5 freeze layer is active (independent tumbler, D7).""" try: return bool(getattr(settings, "serial_gate_freeze_enabled", False)) except Exception: # noqa: BLE001 return False # --------------------------------------------------------------------------- # Read helpers (active task + freeze) — only the local DB # --------------------------------------------------------------------------- def repo_has_active_task(repo: str, exclude_task_id: int | None = None) -> bool: """True iff repo has a task with ``stage != 'done'`` (excluding one task). ``exclude_task_id`` is the task being evaluated (a new/rework task must not count ITSELF as the active task that blocks it — R-7). Observability/Python mirror of the SQL gate; never raises -> False on error. """ try: conn = db.get_db() try: if exclude_task_id is not None: row = conn.execute( "SELECT 1 FROM tasks WHERE repo=? AND id != ? AND stage != 'done' LIMIT 1", (repo, exclude_task_id), ).fetchone() else: row = conn.execute( "SELECT 1 FROM tasks WHERE repo=? AND stage != 'done' LIMIT 1", (repo,), ).fetchone() return row is not None finally: conn.close() except Exception as e: # noqa: BLE001 - never-raise logger.warning("repo_has_active_task error for %s: %s", repo, e) return False def _active_freeze_row(repo: str) -> dict | None: """Most-recent active (``cleared_at IS NULL``) freeze row for repo, or None. Raises on a real DB error (the caller decides fail-open vs fail-closed) — this private helper does NOT swallow so ``is_repo_frozen`` can fail CLOSED. """ conn = db.get_db() try: row = conn.execute( "SELECT repo, frozen_at, reason, work_item_id FROM repo_freeze " "WHERE repo=? AND cleared_at IS NULL ORDER BY id DESC LIMIT 1", (repo,), ).fetchone() return dict(row) if row else None finally: conn.close() def is_repo_frozen(repo: str) -> bool: """True iff repo currently has an active freeze (FR-5). fail-CLOSED (AC-9): when the freeze layer is enabled and we CANNOT confirm the absence of a freeze (DB error), return True — keep the gate closed for prod safety. When the freeze layer is disabled the repo is never considered frozen. """ if not _freeze_layer_enabled(): return False try: return _active_freeze_row(repo) is not None except Exception as e: # noqa: BLE001 - fail-CLOSED on doubt (AC-9) logger.warning("is_repo_frozen error for %s -> fail-CLOSED (frozen): %s", repo, e) return True # --------------------------------------------------------------------------- # Freeze mutators (FR-5) # --------------------------------------------------------------------------- def set_repo_freeze(repo: str, reason: str = "", work_item_id: str | None = None) -> bool: """Insert a durable freeze row for repo (no-op when the freeze layer is off). Append-only: a repeated DEGRADED while already frozen simply adds another row (history); ``is_repo_frozen``'s EXISTS is idempotent. Returns True iff a row was inserted. never-raise -> False on error (a freeze write failure must not crash the post-deploy monitor tick). """ if not _freeze_layer_enabled(): logger.info("set_repo_freeze: freeze layer disabled, skipping for %s", repo) return False if not repo: return False try: conn = db.get_db() try: conn.execute( "INSERT INTO repo_freeze (repo, reason, work_item_id) VALUES (?, ?, ?)", (repo, reason or None, work_item_id), ) conn.commit() finally: conn.close() logger.warning( "serial_gate: repo %s FROZEN (reason=%r, work_item=%s) — next task will " "NOT start until manual unfreeze", repo, reason, work_item_id, ) return True except Exception as e: # noqa: BLE001 - never-raise logger.error("set_repo_freeze error for %s: %s", repo, e) return False def clear_repo_freeze(repo: str) -> int: """Clear ALL active freeze rows for repo (operator unfreeze, D4). Sets ``cleared_at=now`` on every row with ``cleared_at IS NULL``. Idempotent (a repeat clears 0 rows). Returns the number of rows cleared. never-raise -> 0 on error. """ if not repo: return 0 try: conn = db.get_db() try: cur = conn.execute( "UPDATE repo_freeze SET cleared_at=datetime('now') " "WHERE repo=? AND cleared_at IS NULL", (repo,), ) conn.commit() n = cur.rowcount or 0 finally: conn.close() if n: logger.warning("serial_gate: repo %s UNFROZEN (%d row(s) cleared)", repo, n) return n except Exception as e: # noqa: BLE001 - never-raise logger.error("clear_repo_freeze error for %s: %s", repo, e) return 0 # --------------------------------------------------------------------------- # Hot-claim SQL fragment (fail-OPEN) — ADR-001 D1 # --------------------------------------------------------------------------- def build_claim_clause() -> str: """Build the ``AND NOT (...)`` fragment spliced into ``claim_next_job``. Blocks an analyst-job whose repo (a) has an EARLIER-queued unfinished task or (b) is frozen. Only ``jobs.agent='analyst'`` is gated — jobs of an already-active task pass freely (else the single active task could never advance). Ordering term — ``t2.id < jobs.task_id`` (FIFO, ADR-001 D1 / FR-2): a task is blocked only by EARLIER tasks (lower ``tasks.id``) that are not yet done. This is the FIFO refinement of the ADR's pseudo-SQL ``t2.id != jobs.task_id``: with ``!=`` a BATCH of fresh tasks all sitting in ``analysis`` would mutually block (each is "another unfinished task" for the others) -> the whole serial queue deadlocks, contradicting FR-2 ("строго по одной, FIFO по jobs.id"). ``<`` admits exactly the oldest unfinished task and serialises the rest behind it, while still never self-blocking a new/rework analyst-job on its OWN row (R-7) and keeping AC-1 (a newer task is held by the older active one) intact. Repo scope: empty CSV -> no repo filter (all repos); non-empty CSV -> ``AND jobs.repo IN ('a','b')`` with sanitised tokens (R-6). fail-OPEN (AC-8): kill-switch off OR any build error -> ``""`` (claim behaves exactly as before ORCH-088). The trailing space keeps the spliced SQL valid. """ try: if not getattr(settings, "serial_gate_enabled", False): return "" scope = _scope_repos() if scope: # All tokens already passed the _REPO_TOKEN regex -> safe to embed. repo_in = ", ".join(f"'{t}'" for t in sorted(scope)) repo_scope = f"AND jobs.repo IN ({repo_in}) " else: repo_scope = "" active_clause = ( "EXISTS (SELECT 1 FROM tasks t2 " "WHERE t2.repo = jobs.repo AND t2.id < jobs.task_id " "AND t2.stage != 'done') " ) if _freeze_layer_enabled(): freeze_clause = ( "OR EXISTS (SELECT 1 FROM repo_freeze f " "WHERE f.repo = jobs.repo AND f.cleared_at IS NULL) " ) else: freeze_clause = "" return ( "AND NOT ( jobs.agent = 'analyst' " f"{repo_scope}" f"AND ( {active_clause}{freeze_clause}) " ") " ) except Exception as e: # noqa: BLE001 - fail-OPEN: never wedge the queue logger.warning("build_claim_clause error -> fail-OPEN (no gate): %s", e) return "" # --------------------------------------------------------------------------- # Observability snapshot for GET /queue (D9 / AC-10) # --------------------------------------------------------------------------- def _known_repos() -> list[str]: """Registered repo names (best-effort) plus any repo with live gate state.""" repos: set[str] = set() try: from . import projects for p in projects.PROJECTS: if getattr(p, "repo", None): repos.add(p.repo) except Exception: # noqa: BLE001 pass # Also surface repos that have an active freeze or a queued analyst-job even if # they are not in the static registry (defensive — never hide a frozen repo). try: conn = db.get_db() try: for (r,) in conn.execute( "SELECT DISTINCT repo FROM repo_freeze WHERE cleared_at IS NULL" ).fetchall(): if r: repos.add(r) for (r,) in conn.execute( "SELECT DISTINCT repo FROM jobs WHERE status='queued' AND agent='analyst'" ).fetchall(): if r: repos.add(r) finally: conn.close() except Exception: # noqa: BLE001 pass return sorted(repos) def _per_repo_snapshot(repo: str) -> dict: """Per-repo gate state for the /queue snapshot (never raises here).""" active_task = None waiting: list[dict] = [] try: conn = db.get_db() try: row = conn.execute( "SELECT work_item_id, stage FROM tasks " "WHERE repo=? AND stage != 'done' ORDER BY id LIMIT 1", (repo,), ).fetchone() if row: active_task = {"work_item_id": row["work_item_id"], "stage": row["stage"]} for j in conn.execute( "SELECT j.id AS job_id, t.work_item_id AS work_item_id, t.stage AS stage " "FROM jobs j LEFT JOIN tasks t ON t.id = j.task_id " "WHERE j.repo=? AND j.status='queued' AND j.agent='analyst' " "ORDER BY j.id", (repo,), ).fetchall(): waiting.append({ "job_id": j["job_id"], "work_item_id": j["work_item_id"], "stage": j["stage"], }) finally: conn.close() except Exception as e: # noqa: BLE001 logger.warning("serial_gate per-repo snapshot error for %s: %s", repo, e) frozen = is_repo_frozen(repo) frozen_reason = None frozen_at = None if frozen: try: fr = _active_freeze_row(repo) if fr: frozen_reason = fr.get("reason") frozen_at = fr.get("frozen_at") except Exception: # noqa: BLE001 pass return { "active_task": active_task, "waiting": waiting, "frozen": frozen, "frozen_reason": frozen_reason, "frozen_at": frozen_at, } def snapshot() -> dict: """Read-only serial-gate summary for GET /queue (D9 / AC-10). Additive block; existing /queue keys are untouched. never-raise: any error -> a minimal dict with the flags and empty per-repo data. """ try: enabled = bool(getattr(settings, "serial_gate_enabled", False)) except Exception: # noqa: BLE001 enabled = False try: repos_cfg = getattr(settings, "serial_gate_repos", "") or "" except Exception: # noqa: BLE001 repos_cfg = "" try: per_repo = {r: _per_repo_snapshot(r) for r in _known_repos()} return { "enabled": enabled, "freeze_enabled": _freeze_layer_enabled(), "repos": repos_cfg, "per_repo": per_repo, } except Exception as e: # noqa: BLE001 - never-raise -> minimal dict logger.warning("serial_gate snapshot error: %s", e) return { "enabled": enabled, "freeze_enabled": False, "repos": repos_cfg, "per_repo": {}, }