# orchestrator/src/serial_gate.py

"""ORCH-088 (Stage 1, serial e2e): per-repo serial gate + durable rollback-freeze.

Leaf module — pure, unit-testable logic over the existing ``tasks`` / ``jobs``
tables and the additive ``repo_freeze`` table (see src/db.py /
08-data-requirements.md). Mirrors the leaf pattern of ``src/task_deps.py`` /
``src/post_deploy.py``: imports only ``db`` + ``config`` (and lazily
``projects`` for the snapshot), never ``stage_engine`` / ``launcher``.

What it enforces (ADR-001):
  * A NEW task's analyst-job does NOT enter analysis (no branch cut, no analyst
    agent) while the same repo has ANOTHER unfinished task (``tasks.stage !=
    'done'``) OR the repo is frozen. The gate is a SQL fragment spliced into
    ``db.claim_next_job`` (offline hot path) — ``build_claim_clause``.
  * After a post-deploy ``DEGRADED`` verdict the repo is frozen
    (``set_repo_freeze``); the gate stays CLOSED until an operator clears it
    (``clear_repo_freeze``). The degraded task is already ``stage='done'`` (BR-7)
    so freeze is a SEPARATE durable signal, not derived from a stage.

never-raise contract (self-hosting safety): every public function degrades
conservatively and NEVER propagates into the worker / webhook / stage engine.
Two deliberately different failure directions (ADR-001 D10, NFR-1):
  * hot-claim clause build -> fail-OPEN ("" fragment): a transient DB/build error
    must not wedge the queue of ALL projects (AC-8).
  * freeze decision (``is_repo_frozen``) -> fail-CLOSED (``True``): when we cannot
    confirm the ABSENCE of a freeze we keep the gate closed for prod safety (AC-9).
"""
from __future__ import annotations

import logging
import re

from . import db
from .config import settings

# Module-level logger for the serial gate.
logger = logging.getLogger("orchestrator.serial_gate")

# Repo tokens embedded into the claim SQL ``IN (...)`` list must match this — a
# guard against a broken/injected ORCH_SERIAL_GATE_REPOS CSV (R-6). The CSV is an
# operator config (not user input), but the guard is mandatory: a token that
# does not match is dropped by ``_scope_repos`` (with a warning in the log)
# before it can reach the SQL text.
# Accepted alphabet: ASCII letters, digits, ``.``, ``_`` and ``-`` (one or more).
_REPO_TOKEN = re.compile(r"^[A-Za-z0-9._-]+$")


# ---------------------------------------------------------------------------
# Conditionality (mirrors post_deploy_applies / _merge_gate_applies)
# ---------------------------------------------------------------------------
def _scope_repos() -> set[str]:
    """Parse ``settings.serial_gate_repos`` (CSV) into a set of valid repo tokens.

    Each comma-separated piece is stripped; blank pieces are skipped and pieces
    that fail ``_REPO_TOKEN`` are dropped with a warning. A blank/unset CSV — or
    an error while reading the setting — yields an empty set, which the callers
    in this module treat as "no repo filter" (apply to ALL repos, D5).
    Never raises.

    NOTE(review): a non-empty CSV whose tokens are ALL invalid also collapses to
    the empty set, i.e. it widens the scope to every repo — confirm that this
    is the intended direction.
    """
    try:
        csv = (settings.serial_gate_repos or "").strip()
    except Exception:  # noqa: BLE001
        csv = ""
    valid: set[str] = set()
    if not csv:
        return valid
    for piece in csv.split(","):
        token = piece.strip()
        if not token:
            continue
        if _REPO_TOKEN.match(token):
            valid.add(token)
        else:
            logger.warning("serial_gate: dropping invalid repo token %r from CSV", token)
    return valid


def serial_gate_applies(repo: str) -> bool:
    """Tell whether the serial gate is in force for ``repo`` (D5 / AC-7).

    Decision order:
      * ``serial_gate_enabled`` falsy/missing -> False (kill-switch).
      * ``serial_gate_repos`` yields a non-empty scope -> True only when the
        stripped ``repo`` is one of the listed tokens.
      * empty scope -> True for every repo.

    Never raises: any error is logged as a warning and reported as False, the
    same "gate inert" answer the kill-switch gives.
    """
    try:
        enabled = getattr(settings, "serial_gate_enabled", False)
        if enabled:
            listed = _scope_repos()
            # An empty scope means "no filter": the gate covers every repo.
            return not listed or (repo or "").strip() in listed
        return False
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("serial_gate_applies error for %s: %s", repo, e)
        return False


def _freeze_layer_enabled() -> bool:
    """Report the ``serial_gate_freeze_enabled`` setting as a bool (FR-5, D7).

    A missing attribute counts as disabled; any error while reading the
    setting is also reported as disabled. Never raises.
    """
    try:
        flag = getattr(settings, "serial_gate_freeze_enabled", False)
        return bool(flag)
    except Exception:  # noqa: BLE001
        return False


# ---------------------------------------------------------------------------
# Read helpers (active task + freeze) — only the local DB
# ---------------------------------------------------------------------------
def repo_has_active_task(repo: str, exclude_task_id: int | None = None) -> bool:
    """Check whether ``repo`` owns at least one task whose stage is not ``'done'``.

    Args:
        repo: repository name matched against ``tasks.repo``.
        exclude_task_id: optional ``tasks.id`` to leave out of the check, so the
            task under evaluation does not count itself as the blocker (R-7).

    Returns:
        True when such a task exists. On any DB error a warning is logged and
        False is returned (never raises).
    """
    if exclude_task_id is None:
        sql = "SELECT 1 FROM tasks WHERE repo=? AND stage != 'done' LIMIT 1"
        params: tuple = (repo,)
    else:
        sql = "SELECT 1 FROM tasks WHERE repo=? AND id != ? AND stage != 'done' LIMIT 1"
        params = (repo, exclude_task_id)
    try:
        conn = db.get_db()
        try:
            hit = conn.execute(sql, params).fetchone()
        finally:
            conn.close()
        return hit is not None
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("repo_has_active_task error for %s: %s", repo, e)
        return False


def _active_freeze_row(repo: str) -> dict | None:
    """Fetch the newest uncleared (``cleared_at IS NULL``) freeze row for ``repo``.

    Returns:
        A dict with ``repo``, ``frozen_at``, ``reason`` and ``work_item_id``, or
        None when the repo has no active freeze.

    Raises:
        Whatever the DB layer raises. This helper deliberately does not swallow
        errors: the caller picks the failure direction (``is_repo_frozen``
        turns an error into fail-CLOSED).

    Assumes ``db.get_db()`` yields rows that ``dict()`` can convert (e.g.
    ``sqlite3.Row``) — TODO confirm against ``src/db.py``.
    """
    sql = (
        "SELECT repo, frozen_at, reason, work_item_id FROM repo_freeze "
        "WHERE repo=? AND cleared_at IS NULL ORDER BY id DESC LIMIT 1"
    )
    conn = db.get_db()
    try:
        found = conn.execute(sql, (repo,)).fetchone()
        if not found:
            return None
        return dict(found)
    finally:
        conn.close()


def is_repo_frozen(repo: str) -> bool:
    """Tell whether ``repo`` currently has an active freeze (FR-5).

    With the freeze layer disabled the answer is always False. With it enabled
    the answer is True when an uncleared freeze row exists, and ALSO True when
    the lookup fails: fail-CLOSED (AC-9) — if the absence of a freeze cannot be
    confirmed, the gate stays closed. The failure is logged as a warning.
    """
    if _freeze_layer_enabled():
        try:
            active = _active_freeze_row(repo)
        except Exception as e:  # noqa: BLE001 - fail-CLOSED on doubt (AC-9)
            logger.warning("is_repo_frozen error for %s -> fail-CLOSED (frozen): %s", repo, e)
            return True
        return active is not None
    return False


# ---------------------------------------------------------------------------
# Freeze mutators (FR-5)
# ---------------------------------------------------------------------------
def set_repo_freeze(repo: str, reason: str = "", work_item_id: str | None = None) -> bool:
    """Record a durable freeze for ``repo`` by inserting a ``repo_freeze`` row.

    Append-only: no check for an existing active freeze is made, so repeated
    calls simply add history rows.

    Args:
        repo: repository to freeze; an empty value is rejected.
        reason: free-text reason; an empty string is stored as NULL.
        work_item_id: optional work item that triggered the freeze.

    Returns:
        True only when a row was inserted and committed. False when the freeze
        layer is disabled, ``repo`` is empty, or the write failed (the error is
        logged; never raises).
    """
    if not _freeze_layer_enabled():
        logger.info("set_repo_freeze: freeze layer disabled, skipping for %s", repo)
        return False
    if not repo:
        return False
    values = (repo, reason or None, work_item_id)
    try:
        conn = db.get_db()
        try:
            conn.execute(
                "INSERT INTO repo_freeze (repo, reason, work_item_id) VALUES (?, ?, ?)",
                values,
            )
            conn.commit()
        finally:
            # Closing without a successful commit discards the pending insert.
            conn.close()
        logger.warning(
            "serial_gate: repo %s FROZEN (reason=%r, work_item=%s) — next task will "
            "NOT start until manual unfreeze", repo, reason, work_item_id,
        )
        return True
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.error("set_repo_freeze error for %s: %s", repo, e)
        return False


def clear_repo_freeze(repo: str) -> int:
    """Lift every active freeze on ``repo`` (operator unfreeze, D4).

    Stamps ``cleared_at`` with the DB's current time on all rows of the repo
    that are still uncleared. Repeating the call clears nothing and returns 0.
    Unlike ``set_repo_freeze`` this does not consult the freeze-layer switch.

    Returns:
        Number of rows cleared; 0 for an empty ``repo`` or on any error (the
        error is logged; never raises).
    """
    if not repo:
        return 0
    sql = (
        "UPDATE repo_freeze SET cleared_at=datetime('now') "
        "WHERE repo=? AND cleared_at IS NULL"
    )
    try:
        conn = db.get_db()
        try:
            cursor = conn.execute(sql, (repo,))
            conn.commit()
            cleared = cursor.rowcount or 0
        finally:
            conn.close()
        if cleared:
            logger.warning("serial_gate: repo %s UNFROZEN (%d row(s) cleared)", repo, cleared)
        return cleared
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.error("clear_repo_freeze error for %s: %s", repo, e)
        return 0


# ---------------------------------------------------------------------------
# Hot-claim SQL fragment (fail-OPEN) — ADR-001 D1
# ---------------------------------------------------------------------------
def build_claim_clause() -> str:
    """Return the ``AND NOT (...)`` SQL fragment spliced into ``claim_next_job``.

    The fragment rejects a job row only when ALL of the following hold:
      * ``jobs.agent = 'analyst'`` — jobs of an already-running task are never
        gated, otherwise the single active task could not advance;
      * the repo is in scope (``AND jobs.repo IN ('a','b')`` when the sanitised
        CSV scope is non-empty; no repo filter when it is empty);
      * the repo has an EARLIER unfinished task, or (only when the freeze layer
        is enabled) the repo has an uncleared ``repo_freeze`` row.

    Ordering term — ``t2.id < jobs.task_id`` (FIFO, ADR-001 D1 / FR-2): a task
    is held back only by tasks with a LOWER ``tasks.id`` that are not yet done.
    This refines the ADR's pseudo-SQL ``t2.id != jobs.task_id``: with ``!=`` a
    batch of fresh tasks would block each other and the serial queue would
    deadlock, contradicting FR-2 ("strictly one at a time, FIFO by jobs.id").
    With ``<`` exactly the oldest unfinished task is admitted, a task never
    blocks on its own row (R-7), and a newer task is still held by the older
    active one (AC-1).

    fail-OPEN (AC-8): kill-switch off or any build error -> ``""`` so the claim
    behaves exactly as before ORCH-088. A non-empty fragment ends with a
    trailing space so the surrounding SQL stays valid after splicing.
    """
    try:
        if not getattr(settings, "serial_gate_enabled", False):
            return ""
        pieces = ["AND NOT ( jobs.agent = 'analyst' "]
        listed = _scope_repos()
        if listed:
            # Every token already matched _REPO_TOKEN, so inline quoting is safe.
            quoted = ", ".join("'" + tok + "'" for tok in sorted(listed))
            pieces.append("AND jobs.repo IN (" + quoted + ") ")
        pieces.append("AND ( ")
        # (a) an earlier task of the same repo is still unfinished.
        pieces.append(
            "EXISTS (SELECT 1 FROM tasks t2 "
            "WHERE t2.repo = jobs.repo AND t2.id < jobs.task_id "
            "AND t2.stage != 'done') "
        )
        if _freeze_layer_enabled():
            # (b) the repo carries an active (uncleared) freeze.
            pieces.append(
                "OR EXISTS (SELECT 1 FROM repo_freeze f "
                "WHERE f.repo = jobs.repo AND f.cleared_at IS NULL) "
            )
        pieces.append(") ")  # closes the "AND ( ... )" alternatives group
        pieces.append(") ")  # closes the outer "NOT ( ... )"
        return "".join(pieces)
    except Exception as e:  # noqa: BLE001 - fail-OPEN: never wedge the queue
        logger.warning("build_claim_clause error -> fail-OPEN (no gate): %s", e)
        return ""


# ---------------------------------------------------------------------------
# Observability snapshot for GET /queue (D9 / AC-10)
# ---------------------------------------------------------------------------
def _known_repos() -> list[str]:
    """Collect the repo names the /queue snapshot should report on.

    Sources, each best-effort:
      * the static registry ``projects.PROJECTS`` (imported lazily);
      * repos with an active (uncleared) freeze row;
      * repos with a queued analyst job.

    The two DB lookups are defensive — a frozen repo must not disappear from
    the snapshot just because it is missing from the registry. Each lookup is
    isolated, so a failure of one (for example a missing table) does not hide
    the repos the other would surface; failures are logged instead of being
    silently dropped.

    Returns:
        Sorted list of distinct, non-empty repo names.
    """
    repos: set[str] = set()
    try:
        from . import projects
        repos.update(p.repo for p in projects.PROJECTS if getattr(p, "repo", None))
    except Exception as e:  # noqa: BLE001 - registry is optional for the snapshot
        logger.debug("serial_gate: project registry unavailable for snapshot: %s", e)
    live_state_queries = (
        "SELECT DISTINCT repo FROM repo_freeze WHERE cleared_at IS NULL",
        "SELECT DISTINCT repo FROM jobs WHERE status='queued' AND agent='analyst'",
    )
    try:
        conn = db.get_db()
        try:
            for sql in live_state_queries:
                try:
                    repos.update(r for (r,) in conn.execute(sql) if r)
                except Exception as e:  # noqa: BLE001 - keep the other lookup alive
                    logger.warning("serial_gate: known-repos query failed (%s): %s", sql, e)
        finally:
            conn.close()
    except Exception as e:  # noqa: BLE001 - best-effort, registry repos still returned
        logger.warning("serial_gate: known-repos DB lookup failed: %s", e)
    return sorted(repos)


def _per_repo_snapshot(repo: str) -> dict:
    """Assemble the gate state of one repo for the /queue snapshot.

    Returns a dict with:
      * ``active_task`` — ``{"work_item_id", "stage"}`` of the unfinished task
        with the lowest ``tasks.id``, or None;
      * ``waiting`` — queued analyst jobs of the repo ordered by ``jobs.id``,
        each as ``{"job_id", "work_item_id", "stage"}`` (task fields come from
        a LEFT JOIN, so they are None when the task row is missing);
      * ``frozen`` — result of ``is_repo_frozen`` (fail-CLOSED on DB error);
      * ``frozen_reason`` / ``frozen_at`` — taken from the active freeze row
        when one can be read, otherwise None.

    DB errors are absorbed: the task/job part logs a warning and keeps whatever
    was gathered so far; the freeze-details lookup fails silently.
    """
    oldest_unfinished = None
    queued: list[dict] = []
    try:
        conn = db.get_db()
        try:
            head = conn.execute(
                "SELECT work_item_id, stage FROM tasks "
                "WHERE repo=? AND stage != 'done' ORDER BY id LIMIT 1",
                (repo,),
            ).fetchone()
            if head:
                oldest_unfinished = {
                    "work_item_id": head["work_item_id"],
                    "stage": head["stage"],
                }
            job_rows = conn.execute(
                "SELECT j.id AS job_id, t.work_item_id AS work_item_id, t.stage AS stage "
                "FROM jobs j LEFT JOIN tasks t ON t.id = j.task_id "
                "WHERE j.repo=? AND j.status='queued' AND j.agent='analyst' "
                "ORDER BY j.id",
                (repo,),
            ).fetchall()
            for job in job_rows:
                entry = {
                    "job_id": job["job_id"],
                    "work_item_id": job["work_item_id"],
                    "stage": job["stage"],
                }
                queued.append(entry)
        finally:
            conn.close()
    except Exception as e:  # noqa: BLE001
        logger.warning("serial_gate per-repo snapshot error for %s: %s", repo, e)
    frozen = is_repo_frozen(repo)
    freeze_row = None
    if frozen:
        # ``frozen`` may be a fail-CLOSED True; then the row is usually unreadable too.
        try:
            freeze_row = _active_freeze_row(repo)
        except Exception:  # noqa: BLE001
            freeze_row = None
    details = freeze_row or {}
    return {
        "active_task": oldest_unfinished,
        "waiting": queued,
        "frozen": frozen,
        "frozen_reason": details.get("reason"),
        "frozen_at": details.get("frozen_at"),
    }


def snapshot() -> dict:
    """Read-only serial-gate summary for GET /queue (D9 / AC-10).

    Returns a dict with:
      * ``enabled`` — the ``serial_gate_enabled`` kill-switch as a bool;
      * ``freeze_enabled`` — the freeze-layer switch as a bool;
      * ``repos`` — the raw ``serial_gate_repos`` setting (``""`` when unset);
      * ``per_repo`` — ``{repo: _per_repo_snapshot(repo)}`` for every known repo.

    never-raise: if gathering the per-repo data fails, a warning is logged and
    ``per_repo`` is returned empty. The flags are read before that step, so the
    degraded answer still reports the real switches — in particular it no
    longer claims the freeze layer is off while it is actually enabled.
    """
    try:
        enabled = bool(getattr(settings, "serial_gate_enabled", False))
    except Exception:  # noqa: BLE001
        enabled = False
    try:
        repos_cfg = getattr(settings, "serial_gate_repos", "") or ""
    except Exception:  # noqa: BLE001
        repos_cfg = ""
    # Reads the setting under its own try/except, so it is safe outside the
    # guarded section below.
    freeze_enabled = _freeze_layer_enabled()
    try:
        per_repo = {r: _per_repo_snapshot(r) for r in _known_repos()}
    except Exception as e:  # noqa: BLE001 - never-raise -> empty per-repo data
        logger.warning("serial_gate snapshot error: %s", e)
        per_repo = {}
    return {
        "enabled": enabled,
        "freeze_enabled": freeze_enabled,
        "repos": repos_cfg,
        "per_repo": per_repo,
    }