# orchestrator/src/finalizer_liveness.py

"""ORCH-113 (adr-0043): process-local finalizer-ownership registry.

Leaf module — pure, process-local, never-raise (pattern of ``serial_gate`` /
``coverage_gate``: imports nothing from ``stage_engine`` / ``launcher`` / the DB,
talks to no network). It records "a LIVE monitor thread is currently finalizing
job X" so the job-reaper can tell a long-running-but-alive finalizer apart from a
genuinely dead one.

Why in-memory is authoritative (ADR-001 / adr-0043): the monitor
(``launcher._monitor_agent``) and the reaper (``job_reaper``) are daemon THREADS
of the SAME single uvicorn process (CMD has no ``--workers``), sharing one SQLite
DB. So liveness of the finalizing thread can be observed in-process. A whole-process
death is covered by the startup ``requeue_running_jobs()`` (``running -> queued``),
which ``main.lifespan`` runs BEFORE the reaper starts — so a restart leaves this
registry empty and the requeued jobs are re-driven cleanly (restart-safe, no durable
state needed).

The bug this closes (incident ORCH-111, deployer job 1914): on the
``deploy-staging -> deploy`` edge the monitor stamps ``agent_runs.finished_at``
FIRST, then runs the heavy edge sub-gates (security -> merge-gate re-test ->
coverage -> image-freshness) synchronously in its own thread — MINUTES — and only
THEN ``_finalize_job``. Reaper Tier-2 measures ``finished_age_s`` from
``finished_at`` (= the START of finalization), so once it exceeds
``reaper_finalize_grace_s`` (300s) it treated the live, long-finalizing monitor as
dead and independently re-ran the same heavy advance -> a second re-test went red ->
false rollback ``deploy-staging -> development`` while the original finalizer
concurrently merged the PR. State diverged.

No own TTL: time-bounding is the reaper's Tier-3 backstop (``reaper_max_running_s``),
which deliberately IGNORES this marker so a truly stuck finalizer is still reaped in
bounded time. Every public function is isolated (``try/except`` -> safe default);
``is_active`` defaults to ``False`` on error (conservative: never block the reaping
of a possibly-dead finalizer).

See docs/work-items/ORCH-113/06-adr/ADR-001-reaper-finalizer-liveness-ownership.md
and the cross-cutting docs/architecture/adr/adr-0043-reaper-finalizer-liveness-ownership.md.
"""
from __future__ import annotations

import logging
import threading
import time

# Module logger. In this module it is used only to emit a warning when one of the
# never-raise public functions swallows an unexpected exception.
logger = logging.getLogger("orchestrator.finalizer_liveness")

# Process-local ownership registry: {job_id: {"run_id", "stage", "started_ts"}}.
# ``started_ts`` is the ``time.time()`` value captured when ``mark`` stored the entry.
# Guarded by a Lock because the monitor thread writes (mark/clear) while the reaper
# thread reads (is_active/snapshot). All state resets on process restart, which is
# safe (the startup requeue_running_jobs covers the restart path).
# ``_LOCK`` is a plain, non-reentrant ``threading.Lock``: every public function in
# this module acquires it, so code holding it must not call back into them.
_LOCK = threading.Lock()
_OWNED: dict[int, dict] = {}


def mark(job_id: int | None, run_id: int | None, stage: str | None) -> None:
    """Register that a live monitor thread is finalizing ``job_id``.

    Called by ``launcher._monitor_agent`` right after the ``exit_code`` stamp (the
    earliest moment the reaper can enter Tier-2). ``stage`` is best-effort context
    for the snapshot only — the reaper decides the actual stage from ``tasks`` via
    its own ``_task_meta`` lookup. No-op when ``job_id is None`` (legacy direct
    ``launch()`` jobs are not in ``get_running_jobs`` and are unreapable). Never
    raises.
    """
    if job_id is None:
        return
    try:
        with _LOCK:
            _OWNED[job_id] = {
                "run_id": run_id,
                "stage": stage,
                "started_ts": time.time(),
            }
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("finalizer_liveness.mark failed for job %s: %s", job_id, e)


def clear(job_id: int | None) -> None:
    """Release ownership of ``job_id`` (idempotent).

    Called from the ``finally`` of the monitor's finalization tail, so ANY exception
    in the monitor thread still releases ownership -> a genuinely dead finalizer is
    reaped (FR-4). Never raises.
    """
    if job_id is None:
        return
    try:
        with _LOCK:
            _OWNED.pop(job_id, None)
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("finalizer_liveness.clear failed for job %s: %s", job_id, e)


def is_active(job_id: int | None) -> bool:
    """True iff a live monitor currently owns the finalization of ``job_id``.

    Consulted by the reaper Tier-2 branch. Defaults to ``False`` on any error or
    when ``job_id is None`` (conservative: never block the reaping of a possibly
    dead finalizer). Never raises.
    """
    if job_id is None:
        return False
    try:
        with _LOCK:
            return job_id in _OWNED
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("finalizer_liveness.is_active failed for job %s: %s", job_id, e)
        return False


def snapshot() -> dict:
    """Summarise current ownership for ``GET /queue`` observability.

    Returns ``{"active": <count>, "jobs": [job_id, ...]}`` with the job ids in
    ascending order. The per-job details (``run_id``, ``stage``, ``started_ts``)
    are not included.

    Never raises; on any failure a warning is logged and an empty summary
    (``{"active": 0, "jobs": []}``) is returned instead.
    """
    try:
        with _LOCK:
            # Copy the ids under the lock; the summary itself is assembled after
            # release from this private list, so the count always matches it.
            job_ids = sorted(_OWNED)
    except Exception as exc:  # noqa: BLE001 - never-raise contract
        logger.warning("finalizer_liveness.snapshot failed: %s", exc)
        return {"active": 0, "jobs": []}
    return {"active": len(job_ids), "jobs": job_ids}