"""ORCH-094: terminal-window-aware guard for deploy-phase Plane status setters. Leaf module — pure, never-raise, config-gated logic over the existing ``tasks`` table and the restart-safe post-deploy sentinels. Mirrors the leaf pattern of ``src/serial_gate.py`` / ``src/labels.py`` / ``src/cancel.py``: it imports only ``config`` (and lazily ``db`` / ``post_deploy`` / ``qg.checks``), never ``plane_sync`` / ``stage_engine`` — the setters that need a verdict call :func:`decide`, they do not live here. The bug (verified live on ORCH-061, task 47, done since 07.06): a task with DB ``stage='done'`` and no active job flaps in Plane between ``Awaiting Deploy`` and ``Monitoring after Deploy`` instead of holding ``Done``. The three deploy-phase setters (``set_issue_awaiting_deploy`` / ``set_issue_deploying`` / ``set_issue_monitoring``) are **terminal-blind**: any stale / duplicate / unknown caller under the bot token re-stamps an intermediate deploy status over the terminal Done, and the pendulum never settles. The fix is a single low choke-point on the entry of those three setters. For a task whose DB stage is terminal the verdict converges to ``Done`` idempotently, EXCEPT the one legitimate case: the post-deploy ``Monitoring`` status while the observation window is still active (``post_deploy.window_active`` — ARMED & not DONE). The deploy ``Awaiting``/``Deploying`` statuses are ALWAYS spurious for a ``done`` task (Phase A/B happen strictly BEFORE ``deploy -> done``). Key invariant (ADR-001 D2): a deploy-phase status is legitimate iff the task is non-terminal OR (``done`` AND the post-deploy window is active); otherwise the verdict is idempotent convergence to ``Done`` (for ``done``) / suppression (for ``cancelled``). never-raise contract (self-hosting safety): any error / inability to determine the DB stage degrades to ``ALLOW`` (fail-safe to the prior 1:1 behaviour, NFR-1) — a local SQLite read is reliable, so in the normal case the stage is read and the pendulum cannot arise. """ from __future__ import annotations import logging import re from .config import settings logger = logging.getLogger("orchestrator.deploy_status_guard") # Verdicts returned by decide() (the setter executes them). ALLOW = "ALLOW" # PATCH the requested deploy-phase status (normal path). CONVERGE_DONE = "CONVERGE_DONE" # set_issue_done instead (idempotent convergence). SUPPRESS = "SUPPRESS" # do nothing (do not stamp over a `cancelled` terminal). # Deploy-phase target tokens (one per guarded setter). AWAITING = "awaiting" DEPLOYING = "deploying" MONITORING = "monitoring" # Terminal DB stages (harmonised with serial_gate / adr-0026). _TERMINAL = ("done", "cancelled") # Repo tokens embedded into config CSV must match this (mirrors serial_gate R-6). _REPO_TOKEN = re.compile(r"^[A-Za-z0-9._-]+$") # --------------------------------------------------------------------------- # Conditionality (mirrors post_deploy_applies / _merge_gate_applies) # --------------------------------------------------------------------------- def _scope_repos() -> set[str]: """Sanitised set of in-scope repo tokens from ``deploy_status_guard_repos``. Empty/blank CSV -> empty set, meaning "self-hosting only" (resolved by the caller via :func:`applies`). Invalid tokens (regex miss) are dropped. Never raises. """ try: raw = (settings.deploy_status_guard_repos or "").strip() except Exception: # noqa: BLE001 return set() if not raw: return set() out: set[str] = set() for tok in raw.split(","): t = tok.strip() if t and _REPO_TOKEN.match(t): out.add(t) elif t: logger.warning("deploy_status_guard: dropping invalid repo token %r", t) return out def applies(repo: str) -> bool: """Whether the guard is REAL for this repo (D6). * ``deploy_status_guard_enabled=False`` -> always False (kill-switch; the setters are terminal-blind, 1:1 as before ORCH-094). * ``deploy_status_guard_repos`` (CSV) non-empty -> real only for listed repos. * empty CSV -> real ONLY for the self-hosting repo (``orchestrator``), where deploy-phase statuses are set at all. Mirrors the ORCH-35/36/43/58 self-hosting-only rollout -> non-self repos (enduro-trails) are untouched (they never see Awaiting/Deploying/Monitoring; terminal-sync goes straight to Done), i.e. zero regression. Never raises -> False on error (degrade to "guard inert"). """ try: if not getattr(settings, "deploy_status_guard_enabled", False): return False scope = _scope_repos() if scope: return (repo or "").strip() in scope # Lazy import keeps this module a leaf (avoid importing qg at load time). from .qg.checks import is_self_hosting_repo return is_self_hosting_repo(repo) except Exception as e: # noqa: BLE001 - never-raise logger.warning("deploy_status_guard.applies error for %s: %s", repo, e) return False # --------------------------------------------------------------------------- # Verdict (the single predicate — ADR-001 D2) # --------------------------------------------------------------------------- def decide(work_item_id: str, target_status: str, reason: str | None = None) -> str: """Decide what a deploy-phase setter should do for ``work_item_id`` (D2). Returns one of :data:`ALLOW` / :data:`CONVERGE_DONE` / :data:`SUPPRESS`. Steps (ADR-001 D2): 1. kill-switch off -> ALLOW (behaviour 1:1). 2. task not found -> ALLOW (foreign/unknown issue). 3. guard not applicable for the repo -> ALLOW (non-self / out-of-scope). 4. DB stage non-terminal -> ALLOW (live deploy cycle, AC-4). 5. DB stage == 'cancelled' -> SUPPRESS (do not stamp over it). 6. DB stage == 'done': * target == 'monitoring' AND window active -> ALLOW (legit post-deploy). * otherwise -> CONVERGE_DONE. 7. any exception / undeterminable stage -> ALLOW (fail-safe, NFR-1). Always emits exactly one structured observability line (FR-4 / D5): work_item, caller (``reason``), target_status, db_stage, window_active, verdict. """ db_stage = None window = None verdict = ALLOW try: if not getattr(settings, "deploy_status_guard_enabled", False): return ALLOW # step 1 (logged in finally) from . import db task = db.get_task_by_work_item_id(work_item_id) if task is None: return ALLOW # step 2 repo = task.get("repo") if not applies(repo): return ALLOW # step 3 db_stage = (task.get("stage") or "").strip() if db_stage not in _TERMINAL: verdict = ALLOW # step 4 — non-terminal: legit working deploy cycle return verdict if db_stage == "cancelled": verdict = SUPPRESS # step 5 return verdict # step 6 — db_stage == 'done' if target_status == MONITORING: from . import post_deploy window = post_deploy.window_active(repo, work_item_id) if window: verdict = ALLOW return verdict verdict = CONVERGE_DONE return verdict except Exception as e: # noqa: BLE001 - never-raise; fail-safe to ALLOW logger.warning( "deploy_status_guard.decide error for %s (target=%s) -> ALLOW: %s", work_item_id, target_status, e, ) verdict = ALLOW return verdict finally: # FR-4 / D5: one structured line per call. Convergence/suppression is the # interesting case — log it at WARNING so a future flapp is easy to attribute. try: msg = ( "deploy_status_guard: work_item=%s caller=%s target=%s db_stage=%s " "window_active=%s verdict=%s" ) argv = (work_item_id, reason, target_status, db_stage, window, verdict) if verdict == ALLOW: logger.info(msg, *argv) else: logger.warning(msg, *argv) except Exception: # noqa: BLE001 - logging must never raise pass