# orchestrator/src/deploy_status_guard.py

"""ORCH-094: terminal-window-aware guard for deploy-phase Plane status setters.

Leaf module — pure, never-raise, config-gated logic over the existing ``tasks``
table and the restart-safe post-deploy sentinels. Mirrors the leaf pattern of
``src/serial_gate.py`` / ``src/labels.py`` / ``src/cancel.py``: it imports only
``config`` (and lazily ``db`` / ``post_deploy`` / ``qg.checks``), never
``plane_sync`` / ``stage_engine`` — the setters that need a verdict call
:func:`decide`, they do not live here.

The bug (verified live on ORCH-061, task 47, done since 07.06): a task with DB
``stage='done'`` and no active job flaps in Plane between ``Awaiting Deploy`` and
``Monitoring after Deploy`` instead of holding ``Done``. The three deploy-phase
setters (``set_issue_awaiting_deploy`` / ``set_issue_deploying`` /
``set_issue_monitoring``) are **terminal-blind**: any stale / duplicate / unknown
caller under the bot token re-stamps an intermediate deploy status over the
terminal Done, and the pendulum never settles.

The fix is a single low choke-point on the entry of those three setters. For a
task whose DB stage is terminal the verdict converges to ``Done`` idempotently,
EXCEPT the one legitimate case: the post-deploy ``Monitoring`` status while the
observation window is still active (``post_deploy.window_active`` — ARMED & not
DONE). The deploy ``Awaiting``/``Deploying`` statuses are ALWAYS spurious for a
``done`` task (Phase A/B happen strictly BEFORE ``deploy -> done``).

Key invariant (ADR-001 D2): a deploy-phase status is legitimate iff the task is
non-terminal OR (``done`` AND the post-deploy window is active); otherwise the
verdict is idempotent convergence to ``Done`` (for ``done``) / suppression (for
``cancelled``).

never-raise contract (self-hosting safety): any error / inability to determine
the DB stage degrades to ``ALLOW`` (fail-safe to the prior 1:1 behaviour, NFR-1)
— a local SQLite read is reliable, so in the normal case the stage is read and
the pendulum cannot arise.
"""
from __future__ import annotations

import logging
import re

from .config import settings

# Module-level logger: guard verdicts, dropped config tokens and swallowed errors
# are all reported through it.
logger = logging.getLogger("orchestrator.deploy_status_guard")

# Verdicts returned by decide() (the setter executes them).
ALLOW = "ALLOW"  # PATCH the requested deploy-phase status (normal path).
CONVERGE_DONE = "CONVERGE_DONE"  # set_issue_done instead (idempotent convergence).
SUPPRESS = "SUPPRESS"  # do nothing (do not stamp over a `cancelled` terminal).

# Deploy-phase target tokens (one per guarded setter). decide() compares its
# ``target_status`` argument against these with plain string equality.
AWAITING = "awaiting"
DEPLOYING = "deploying"
MONITORING = "monitoring"

# Terminal DB stages (harmonised with serial_gate / adr-0026). A stage outside
# this tuple is treated by decide() as a live, non-terminal deploy cycle.
_TERMINAL = ("done", "cancelled")

# Repo tokens embedded into config CSV must match this (mirrors serial_gate R-6):
# one or more ASCII letters, digits, dots, underscores or hyphens, nothing else.
_REPO_TOKEN = re.compile(r"^[A-Za-z0-9._-]+$")


# ---------------------------------------------------------------------------
# Conditionality (mirrors post_deploy_applies / _merge_gate_applies)
# ---------------------------------------------------------------------------
def _scope_repos() -> set[str]:
    """Parse ``settings.deploy_status_guard_repos`` into a set of repo tokens.

    The setting is read as a comma-separated string. Each piece is stripped;
    blank pieces are skipped silently, pieces that fail ``_REPO_TOKEN`` are
    dropped with a warning, and everything else is collected.

    An empty result (unset / blank CSV, or nothing valid in it) is how the
    caller, :func:`applies`, recognises the "self-hosting only" default.

    Never raises: if the setting cannot be read or is not string-like, the
    result is an empty set.
    """
    try:
        csv_value = (settings.deploy_status_guard_repos or "").strip()
    except Exception:  # noqa: BLE001
        return set()

    accepted: set[str] = set()
    # Splitting an empty string yields a single blank piece, which the blank
    # check below skips — so no separate "empty CSV" early exit is needed.
    for piece in csv_value.split(","):
        candidate = piece.strip()
        if not candidate:
            continue
        if _REPO_TOKEN.match(candidate):
            accepted.add(candidate)
        else:
            logger.warning("deploy_status_guard: dropping invalid repo token %r", candidate)
    return accepted


def applies(repo: str) -> bool:
    """Tell whether the guard is in force for ``repo`` (D6).

    Resolution order:

      * ``deploy_status_guard_enabled`` falsy or missing -> ``False``. This is
        the kill-switch: the setters stay terminal-blind, 1:1 as before ORCH-094.
      * ``deploy_status_guard_repos`` yields at least one valid token -> ``True``
        only when the stripped ``repo`` is one of those tokens.
      * otherwise (empty scope) -> whatever ``qg.checks.is_self_hosting_repo``
        answers for ``repo``, i.e. the self-hosting-only rollout default, so
        non-self repos are left untouched.

    Never raises: any error is logged at WARNING and reported as ``False``
    (the guard degrades to inert).
    """
    try:
        enabled = getattr(settings, "deploy_status_guard_enabled", False)
        if enabled:
            listed = _scope_repos()
            if not listed:
                # Imported lazily so this module stays a leaf (qg is not
                # pulled in at load time).
                from .qg.checks import is_self_hosting_repo
                return is_self_hosting_repo(repo)
            return (repo or "").strip() in listed
        return False
    except Exception as exc:  # noqa: BLE001 - never-raise
        logger.warning("deploy_status_guard.applies error for %s: %s", repo, exc)
        return False


# ---------------------------------------------------------------------------
# Verdict (the single predicate — ADR-001 D2)
# ---------------------------------------------------------------------------
def decide(work_item_id: str, target_status: str, reason: str | None = None) -> str:
    """Return the verdict a guarded deploy-phase setter must carry out (D2).

    The result is always one of :data:`ALLOW`, :data:`CONVERGE_DONE` or
    :data:`SUPPRESS`, resolved in this order (ADR-001 D2):

      1. ``deploy_status_guard_enabled`` falsy      -> ALLOW (behaviour 1:1).
      2. no task row for ``work_item_id``           -> ALLOW (foreign/unknown issue).
      3. :func:`applies` rejects the task's repo    -> ALLOW (non-self / out-of-scope).
      4. DB stage is not terminal                   -> ALLOW (live deploy cycle, AC-4).
      5. DB stage is ``cancelled``                  -> SUPPRESS (do not stamp over it).
      6. DB stage is ``done``:
           * ``target_status`` is :data:`MONITORING` and
             ``post_deploy.window_active`` is truthy -> ALLOW (legit post-deploy).
           * anything else                           -> CONVERGE_DONE.
      7. any exception along the way                -> ALLOW (fail-safe, NFR-1).

    ``reason`` is not used for the decision; it is only echoed as ``caller`` in
    the log line.

    Observability (FR-4 / D5): every call ends with one structured line carrying
    work_item, caller, target, db_stage, window_active and verdict — INFO for
    ALLOW, WARNING for anything else. The error path additionally logs the
    exception at WARNING before that line.
    """
    stage = None        # reported as db_stage; None until the task row was read
    window_open = None  # reported as window_active; None unless actually probed
    outcome = ALLOW
    try:
        # Steps 1-3: every early exit leaves the default ALLOW untouched.
        if not getattr(settings, "deploy_status_guard_enabled", False):
            return outcome

        from . import db
        record = db.get_task_by_work_item_id(work_item_id)
        if record is None:
            return outcome

        repo_name = record.get("repo")
        if not applies(repo_name):
            return outcome

        stage = (record.get("stage") or "").strip()
        if stage in _TERMINAL:
            if stage == "cancelled":
                outcome = SUPPRESS  # step 5
            else:
                # Step 6 — 'done'. The window is probed only for a monitoring
                # request; the verdict is assigned after the probe so a failing
                # probe leaves the fail-safe ALLOW in place.
                if target_status == MONITORING:
                    from . import post_deploy
                    window_open = post_deploy.window_active(repo_name, work_item_id)
                outcome = ALLOW if window_open else CONVERGE_DONE
        # Step 4 falls through here with the default ALLOW.
        return outcome
    except Exception as exc:  # noqa: BLE001 - never-raise; fail-safe to ALLOW
        logger.warning(
            "deploy_status_guard.decide error for %s (target=%s) -> ALLOW: %s",
            work_item_id, target_status, exc,
        )
        outcome = ALLOW
        return outcome
    finally:
        # FR-4 / D5: one structured line per call. Convergence / suppression is
        # the interesting case, so it goes out at WARNING to make a future flap
        # easy to attribute.
        try:
            template = (
                "deploy_status_guard: work_item=%s caller=%s target=%s db_stage=%s "
                "window_active=%s verdict=%s"
            )
            emit = logger.info if outcome == ALLOW else logger.warning
            emit(template, work_item_id, reason, target_status, stage, window_open, outcome)
        except Exception:  # noqa: BLE001 - logging must never raise
            pass