fix(staging): host-side ssh execution + env classification for staging-runner (ORCH-123)

The ORCH-115 deterministic staging-runner ran `docker exec` FROM INSIDE the prod `orchestrator` container, which ships only `openssh-client git curl` — no `docker` CLI (Dockerfile:11). `Popen(["docker", ...])` hit FileNotFoundError -> a PERMANENT environment defect that was mis-routed as a code-fail rollback `deploy-staging -> development` (burning developer-retries). Incident ORCH-116: every self-hosting task reaching deploy-staging was doomed to a false rollback. Fix (adr-0049, additive, flag-gated, never-raise, self-hosting scope; the gate / artifact contract / STAGE_TRANSITIONS / DB schema are byte-for-byte unchanged): - D1: build_staging_command() wraps the SAME `docker exec ... staging_check.py ... --mode stub` in `ssh <user@host> '<...>'` so it runs HOST-SIDE over the existing trusted ssh channel (mirror self_deploy / image_freshness). New flag staging_runner_exec_host_side (default True). No docker CLI/SDK added to the image, docker.sock not used in-container (D2 security). - D3: three-way classify_staging_outcome (suite-ran / permanent-env / transient-infra), disambiguating the exit=1 collision by scanning stderr. - D4: invariant "infra != code-fail" — permanent-env / exhausted transient-infra end in an infra-HOLD (no rollback, no developer-retry), NOT a false FAILED rollback (supersedes ORCH-115 D5). A really-executed failing suite still rolls back (anti-over-tolerance). R-2 verified: a held deploy-staging task is not rolled back by the reconciler. - D5: prod-like preflight() of the host-side channel at startup (main.lifespan, best-effort, never blocks). - D8: snapshot adds permanent_env / exec_host_side / preflight. Docs (golden source, same PR): INFRA.md execution-boundary section, architecture/README.md, CLAUDE.md, CHANGELOG.md, .env.example. Tests: tests/test_orch123_staging_runner_exec.py (TC-01 mandatory regression red->green; TC-02..TC-14 + R-2). ORCH-115 anti-drift green (3 tests updated for the D1/D4/D8 supersession). Full suite: 2131 passed. 
Refs: ORCH-123 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 08:42:36 +03:00
parent e1872e3d94
commit cc41dd849c
9 changed files with 917 additions and 52 deletions
--- a/src/main.py
+++ b/src/main.py
@@ -79,6 +79,21 @@ async def lifespan(app: FastAPI):
    except Exception as e:
        log.warning(f"Transition-lease recovery skipped: {e}")

+    # ORCH-123 (adr-0049 / D5 / FR-4): prod-like preflight of the host-side staging
+    # execution channel. The deploy-staging staging-runner (ORCH-115) runs the suite
+    # HOST-SIDE over ssh because the prod container ships no docker CLI (Dockerfile:11);
+    # probe the channel at startup so a broken environment (no docker on host / staging
+    # down / ssh unreachable / no ssh target) surfaces HERE — not after the fact as a
+    # false rollback of a real task (incident ORCH-116). Purely observational: it never
+    # blocks startup or gates the pipeline, and never raises (runs after requeue + lease-recovery).
+    try:
+        from . import staging_runner
+        ok, reason = staging_runner.preflight()
+        if not ok:
+            log.warning(f"Staging-runner preflight: {reason}")
+    except Exception as e:
+        log.warning(f"Staging-runner preflight skipped: {e}")
+
    # ORCH-065: proactive startup reclaim of dead/stale merge-leases, next to the
    # queue-recovery above. A lease held by the previous (now dead) process pid is
    # released at once instead of waiting for the TTL / a foreign acquire so the
--- a/src/staging_runner.py
+++ b/src/staging_runner.py
@@ -178,26 +178,74 @@ def should_intercept(job: dict) -> bool:
 # ---------------------------------------------------------------------------
 # Suite execution (D3 / FR-2 / NFR-3 / AC-8 / AC-9)
 # ---------------------------------------------------------------------------
-def build_staging_command() -> list[str]:
-    """Build the canonical staging-suite argv (same command the LLM-deployer ran).
+def _ssh_target() -> str:
+    """ssh ``user@host`` for host-side execution, or ``""`` when no host is
+    configured (mirror ``self_deploy``/``image_freshness._ssh_target``). On the prod
+    host ``ORCH_DEPLOY_SSH_HOST=127.0.0.1`` is set by compose; the config default is
+    empty so tests / non-self contexts fall back to the in-container command."""
+    host = (settings.deploy_ssh_host or "").strip()
+    if not host:
+        return ""
+    user = (settings.deploy_ssh_user or "").strip()
+    return f"{user}@{host}" if user else host

-    ``docker exec <STAGING_SERVICE> python3 <repos_dir>/<self-repo>/scripts/staging_check.py
-    --base-url http://localhost:<staging_port> --mode stub``. Host-specifics come from
-    config (ORCH-101, no host hardcodes). Self-hosting safety (BR-7 / AC-8 / TC-12):
-    NO restart of 8500, NO ``docker compose up orchestrator`` / ``--build``, NO
-    force-push, NO ``.env`` edit — the runner only reads/executes the staging suite
-    (8501) and writes a log.
+
+def _channel_ssh_configured() -> bool:
+    """Whether the execution channel is viable w.r.t. ssh config (fed to
+    ``classify_staging_outcome``). In-container mode (host-side disabled) is always
+    'configured' — its viability is decided purely by the result signals; host-side
+    mode needs a non-empty ssh target (R-6: an empty target -> ``permanent-env``).
+    never-raise -> assume configured (rely on the result signals)."""
+    try:
+        if not bool(getattr(settings, "staging_runner_exec_host_side", True)):
+            return True
+        return bool(_ssh_target())
+    except Exception:  # noqa: BLE001 - never-raise
+        return True
+
+
+def build_staging_command() -> list[str]:
+    """Build the staging-suite argv (same suite command the LLM-deployer ran).
+
+    The INNER command is unchanged: ``docker exec <STAGING_SERVICE> python3
+    <repos_dir>/<self-repo>/scripts/staging_check.py --base-url
+    http://localhost:<staging_port> --mode stub``. Host-specifics come from config
+    (ORCH-101, no host hardcodes).
+
+    ORCH-123 (D1, adr-0049): ``docker`` lives on the HOST — the prod container ships
+    only ``openssh-client git curl`` (``Dockerfile:11``), so a ``docker exec`` spawned
+    FROM INSIDE the prod container hit ``FileNotFoundError`` (incident ORCH-116). When
+    ``staging_runner_exec_host_side`` (default True) is set AND an ssh target is
+    configured, the inner command is wrapped in ``ssh <user@host> '<docker exec ...>'``
+    so it runs host-side over the existing trusted ssh channel (mirror
+    ``self_deploy.build_deploy_command`` / ``image_freshness.image_revision``). With
+    the flag off OR no ssh target configured (tests / non-self contexts) it falls back
+    to the prior in-container ``docker exec`` (valid only where a docker CLI is baked
+    into the image).
+
+    Self-hosting safety (BR-7 / AC-8 / TC-08): the argv carries ONLY ``docker exec
+    <staging-service> python3 staging_check.py ... --mode stub`` — NO restart of 8500,
+    NO ``docker compose up orchestrator`` / ``--build``, NO force-push, NO ``.env``
+    edit. The runner only reads/executes the staging suite (8501) and writes a log.
    """
    from .qg.checks import SELF_HOSTING_REPO
    repos_dir = (settings.repos_dir or "/repos").rstrip("/")
    script = f"{repos_dir}/{SELF_HOSTING_REPO}/scripts/staging_check.py"
    base_url = f"http://localhost:{int(settings.staging_port)}"
-    return [
+    inner_argv = [
        "docker", "exec", STAGING_SERVICE,
        "python3", script,
        "--base-url", base_url,
        "--mode", "stub",
    ]
+    # ORCH-123 (D1): host-side ssh-wrap when enabled AND a target is configured.
+    if bool(getattr(settings, "staging_runner_exec_host_side", True)):
+        target = _ssh_target()
+        if target:
+            remote = " ".join(shlex.quote(a) for a in inner_argv)
+            return ["ssh", "-o", "StrictHostKeyChecking=no", target, remote]
+    # Fallback: prior in-container docker exec (no ssh target / flag off).
+    return inner_argv


 def _resolve_timeout() -> int:
@@ -255,6 +303,71 @@ def map_exit_code_to_status(exit_code) -> str:
    return _map(exit_code)


+# ---------------------------------------------------------------------------
+# Three-way outcome classification (D3 / FR-2 / FR-3 / AC-3, AC-4, AC-6)
+# ---------------------------------------------------------------------------
+def classify_staging_outcome(result, ssh_configured: bool) -> str:
+    """Classify a staging-suite ProcResult into one of three classes (D3). Pure,
+    never-raise (mirror of ``merge_gate.classify_retest_failure``, ORCH-110 D2):
+
+      * ``"suite-ran"``       — a recognised executor exit-code (any int except the
+                                ssh transport code 255), NO timeout AND no environment
+                                marker in stderr: the suite DEMONSTRABLY executed ->
+                                trust the code (``0->SUCCESS``, ``!=0->FAILED``). Anti-
+                                over-tolerance (BR-3): a real suite fail is NEVER
+                                reclassified as infra/env.
+      * ``"permanent-env"``   — a deterministic PERMANENT environment defect: an
+                                stderr env-marker (no docker / no such container /
+                                daemon unreachable), a shell "command not found /
+                                cannot execute" code (126/127), no host-side ssh
+                                target configured, or a bare local spawn-error
+                                (``returncode is None`` and not a timeout). Retrying is
+                                pointless (FR-3) -> immediate distinguishable
+                                infra-HOLD (no rollback, no developer-retry).
+      * ``"transient-infra"`` — a timeout (even if the killed process left an exit-
+                                code) OR an ssh transport/connection failure (255) OR
+                                any unknown signal: a retry is meaningful -> bounded DEFER.
+
+    The exit=1 collision (``docker exec`` "No such container"=1 vs a real suite fail=1)
+    is disambiguated by SCANNING stderr for env-markers, NEVER by the bare exit-code.
+    Fail-safe on doubt -> ``"transient-infra"`` (DEFER), never a silent ``"suite-ran"``
+    (a mis-routed environment->code-fail is exactly the defect this fixes; an over-
+    tolerated code-fail is guarded by trusting recognised suite exit-codes first).
+    """
+    try:
+        rc = getattr(result, "returncode", None)
+        timed_out = bool(getattr(result, "timed_out", False))
+        stderr = (getattr(result, "stderr", "") or "").lower()
+
+        # 1. env-marker in stderr -> permanent, REGARDLESS of the exit-code (this is
+        #    what disambiguates a `docker exec` "No such container" exit=1 from a real
+        #    suite fail=1 — R-3).
+        if any(m in stderr for m in _ENV_MARKERS):
+            return "permanent-env"
+        # 2. shell "command not found" (127) / "cannot execute" (126) -> permanent.
+        if rc in (126, 127):
+            return "permanent-env"
+        # 3. a recognised executor exit-code (any int except the ssh transport code
+        #    255) WITHOUT an env-marker and WITHOUT a timeout -> the suite executed;
+        #    trust it (BR-3). Checked BEFORE the channel guards so a real fail is never masked.
+        if isinstance(rc, int) and rc != 255 and not timed_out:
+            return "suite-ran"
+        # --- below this line the suite did NOT produce a trustworthy exit-code ---
+        # 4. timeout (whatever rc the killed process left) / ssh 255 -> transient: retry.
+        if timed_out or rc == 255:
+            return "transient-infra"
+        # 5. host-side ssh target missing (R-6) OR a bare local spawn-error (proc_group
+        #    degraded an OSError to rc None without a timeout) -> permanent env defect.
+        if not ssh_configured:
+            return "permanent-env"
+        if rc is None:
+            return "permanent-env"
+        # 6. unknown signal -> fail-safe DEFER (never a silent suite-ran).
+        return "transient-infra"
+    except Exception:  # noqa: BLE001 - never-raise; unknown -> transient (no false rollback)
+        return "transient-infra"
+
+
 # ---------------------------------------------------------------------------
 # Artifact 15-staging-log.md (D6 / FR-4 / AC-2 / AC-8) — mirror write_deploy_log
 # ---------------------------------------------------------------------------
@@ -439,17 +552,21 @@ def _infra_retry_count(task_id) -> int:
        return 0


-def _handle_tool_error(
+def _handle_transient_infra(
    task_id, repo: str, work_item_id: str, branch: str, result: proc_group.ProcResult
 ) -> None:
-    """Suite did NOT execute (tool-error) -> bounded DEFER, then fail-closed (D5).
+    """Transient infra (timeout / ssh transport) — the suite did not execute but a
+    retry is meaningful (D4). Bounded DEFER: re-queue a fresh ``deployer`` job (which
+    re-enters this runner) with a delay + a restart-safe marker, instead of an
+    immediate rollback that would burn a developer-retry.

-    Anti ORCH-110: an infra fault is NOT a code fault, so we re-queue a fresh
-    ``deployer`` job (which re-enters this runner) with a delay instead of an
-    immediate FAILED-rollback that would burn a developer-retry. On budget exhaustion
-    -> write ``staging_status: FAILED`` + advance (the existing rollback) + an
-    INFRA-specific alert (explicitly "not a code defect"). Never a silent advance /
-    false green; never wedges the queue. never-raise."""
+    On budget exhaustion -> **infra-HOLD + alert** (D4, SUPERSEDES ORCH-115 D5): NOT
+    the prior fail-closed ``write_staging_log("FAILED") + advance``, which falsely
+    rolled an unresolved infra hiccup back to ``development`` as a code-fail (BR-2,
+    anti-pattern ORCH-110). The task is HELD on ``deploy-staging`` (a red/missing gate
+    keeps the reconciler's ``advance_if_gate_passed`` from rolling it back — R-2); the
+    operator re-drives it after fixing the stand. Never a silent advance / false green;
+    never wedges the queue. never-raise."""
    retries = _infra_retry_count(task_id)
    try:
        max_retries = int(settings.staging_runner_infra_max_retries)
@@ -462,7 +579,7 @@ def _handle_tool_error(

    if retries < max_retries:
        _bump("deferred")
-        reason = "timeout" if result.timed_out else "suite did not execute (tool-error)"
+        reason = "timeout" if result.timed_out else "ssh transport/connection error"
        task_desc = (
            f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
            f"Stage: deploy-staging\nNote: {_INFRA_RETRY_MARKER} "
@@ -474,40 +591,77 @@ def _handle_tool_error(
                "deployer", repo, task_desc, task_id=task_id, available_at_delay_s=delay,
            )
            logger.warning(
-                "Task %s (%s): staging suite did not execute (%s) -> infra-DEFER "
-                "(job_id=%s, attempt %d/%d)",
+                "Task %s (%s): staging suite did not execute (%s) -> transient-infra "
+                "DEFER (job_id=%s, attempt %d/%d)",
                task_id, work_item_id, reason, new_job, retries + 1, max_retries,
            )
        except Exception as e:  # noqa: BLE001 - never-raise
            logger.error("staging_runner: infra-DEFER enqueue failed for %s: %s", task_id, e)
        return

-    # Budget exhausted -> fail-closed FAILED (terminal, never a false green).
-    _bump("failed")
+    # Budget exhausted -> infra-HOLD (D4): NO write_staging_log("FAILED"), NO advance,
+    # NO developer-retry. The task is held on deploy-staging for the operator.
+    _bump("permanent_env")
    logger.error(
-        "Task %s (%s): staging tool-error DEFER budget exhausted (%d) -> fail-closed FAILED",
+        "Task %s (%s): staging transient-infra DEFER budget exhausted (%d) -> "
+        "infra-HOLD on deploy-staging (NOT a code defect, no rollback)",
        task_id, work_item_id, max_retries,
    )
-    write_staging_log(repo, work_item_id, branch, result.returncode, "FAILED",
-                      result.stdout, tool_error=True)
    _alert_infra_exhausted(work_item_id, max_retries)
-    _advance(task_id, repo, work_item_id, branch)
+
+
+def _handle_permanent_env(
+    task_id, repo: str, work_item_id: str, branch: str, result: proc_group.ProcResult
+) -> None:
+    """Permanent environment defect (D4): an immediate, distinguishable infra-HOLD.
+
+    The DEFER cycle is SKIPPED — retrying a permanent defect (no docker / no ssh
+    target / container down) is pointless (FR-3). We do NOT write a FAILED staging-log,
+    do NOT advance, do NOT burn a developer-retry — the task is HELD on
+    ``deploy-staging``: a red/missing ``check_staging_status`` keeps the reconciler's
+    ``advance_if_gate_passed`` from rolling it back to ``development`` (R-2), and the
+    rollback-to-development path (``advance_stage(finished_agent="deployer")``) is never
+    taken. A structured error log + an operator alert ("infra/environment, NOT a code
+    defect") make the hold visible; the operator re-drives after fixing the
+    environment. never-raise."""
+    detail = (getattr(result, "stderr", "") or "").strip()[:300]
+    logger.error(
+        "Task %s (%s): PERMANENT staging environment defect -> infra-HOLD on "
+        "deploy-staging (NOT a code defect, no rollback, no developer-retry): %s",
+        task_id, work_item_id, detail or "<no stderr>",
+    )
+    _alert_permanent_env(work_item_id, detail)


 def _alert_infra_exhausted(work_item_id: str, max_retries: int) -> None:
    """Best-effort Telegram alert that the staging suite never executed (infra, NOT a
-    code defect) after the retry budget. never-raise."""
+    code defect) after the retry budget -> infra-HOLD (no rollback). never-raise."""
    try:
        from .notifications import send_telegram, link_for
        send_telegram(
-            f"\U0001f6a8 {link_for(work_item_id)}: staging suite не запустилась "
-            f"(инфра, НЕ дефект кода) после {max_retries} попыток — fail-closed FAILED, "
-            f"откат на development. Нужно проверить staging-стенд."
+            f"\U0001f6a8 {link_for(work_item_id)}: staging-сюита не запустилась "
+            f"(инфра, НЕ дефект кода) после {max_retries} попыток — задача удержана на "
+            f"deploy-staging (без отката на development). Нужно проверить staging-стенд."
        )
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("staging_runner: infra-exhausted alert failed for %s: %s", work_item_id, e)


+def _alert_permanent_env(work_item_id: str, detail: str) -> None:
+    """Best-effort Telegram alert that the staging suite could not execute due to a
+    permanent environment defect (NOT a code defect) -> infra-HOLD. never-raise."""
+    try:
+        from .notifications import send_telegram, link_for
+        tail = f": {detail}" if detail else ""
+        send_telegram(
+            f"\U0001f6a8 {link_for(work_item_id)}: staging-сюита не смогла исполниться — "
+            f"постоянный дефект окружения (инфра, НЕ дефект кода){tail}. Задача удержана "
+            f"на deploy-staging до починки окружения (без отката на development)."
+        )
+    except Exception as e:  # noqa: BLE001 - never-raise
+        logger.warning("staging_runner: permanent-env alert failed for %s: %s", work_item_id, e)
+
+
 # ---------------------------------------------------------------------------
 # Entry point (D2) — owns the full deterministic flow, mirror run_deploy_finalizer
 # ---------------------------------------------------------------------------
@@ -517,10 +671,12 @@ def run_staging_gate(job: dict) -> None:
    Flow (mirror of ``stage_engine.run_deploy_finalizer``):
      1. resolve ``work_item_id`` / ``branch`` by ``task_id``;
      2. execute the staging suite (D3) -> ProcResult;
-      3. suite EXECUTED -> map exit-code -> ``staging_status:``, write
-         ``15-staging-log.md``, initiate the existing gate via ``advance_stage`` (D7);
-      4. suite did NOT execute (tool-error) -> bounded DEFER / fail-closed (D5);
-      5. observability counters + one structured verdict log (D10).
+      3. classify the outcome three ways (D3): ``suite-ran`` -> map exit-code ->
+         ``staging_status:``, write ``15-staging-log.md``, initiate the existing gate
+         via ``advance_stage`` (D7);
+      4. ``permanent-env`` -> immediate infra-HOLD (no rollback, no developer-retry);
+         ``transient-infra`` -> bounded DEFER, then infra-HOLD on exhaustion (D4);
+      5. observability counters + one structured verdict log (D8).
    Never raises into the caller (the launcher marks the job done/failed)."""
    started = time.time()
    _bump("runs")
@@ -552,9 +708,10 @@ def run_staging_gate(job: dict) -> None:
    try:
        result = run_staging_suite()
        duration_s = round(time.time() - started, 1)
-        suite_ran = (result.returncode is not None) and (not result.timed_out)
+        # ORCH-123 (D3): three-way classification (env != transient infra != code-fail).
+        outcome = classify_staging_outcome(result, _channel_ssh_configured())

-        if suite_ran:
+        if outcome == "suite-ran":
            # 3. trust the exit-code (ORCH-061 already inside staging_check.py).
            status = map_exit_code_to_status(result.returncode)
            _bump("success" if status == "SUCCESS" else "failed")
@@ -568,14 +725,27 @@ def run_staging_gate(job: dict) -> None:
            _advance(task_id, repo, work_item_id, branch)
            return

-        # 4. tool-error (suite did not execute) -> DEFER / fail-closed (D5).
+        # The suite did NOT execute. Count it as a tool-error, then route by class
+        # (D4): permanent-env -> immediate infra-HOLD; transient-infra -> bounded DEFER.
+        # NEITHER rolls back to development or burns a developer-retry (BR-2 invariant).
        _bump("tool_error")
+        if outcome == "permanent-env":
+            _bump("permanent_env")
+            logger.warning(
+                "staging_runner verdict: work_item=%s repo=%s exit_code=%s "
+                "duration_s=%s outcome=permanent-env (ssh_configured=%s)",
+                work_item_id, repo, result.returncode, duration_s, _channel_ssh_configured(),
+            )
+            _handle_permanent_env(task_id, repo, work_item_id, branch, result)
+            return
+
+        # transient-infra (timeout / ssh transport / unknown) -> bounded DEFER.
        logger.warning(
-            "staging_runner verdict: work_item=%s repo=%s exit_code=%s status=%s "
-            "duration_s=%s outcome=tool-error (timed_out=%s)",
-            work_item_id, repo, result.returncode, "TOOL-ERROR", duration_s, result.timed_out,
+            "staging_runner verdict: work_item=%s repo=%s exit_code=%s "
+            "duration_s=%s outcome=transient-infra (timed_out=%s)",
+            work_item_id, repo, result.returncode, duration_s, result.timed_out,
        )
-        _handle_tool_error(task_id, repo, work_item_id, branch, result)
+        _handle_transient_infra(task_id, repo, work_item_id, branch, result)
    except Exception as e:  # noqa: BLE001 - never-raise into the worker (AC-7)
        logger.error(
            "staging_runner.run_staging_gate: unexpected error for task %s (%s): %s",
@@ -584,24 +754,107 @@ def run_staging_gate(job: dict) -> None:


 # ---------------------------------------------------------------------------
-# Observability (D10 / FR-7 / AC-10)
+# Prod-like preflight of the host-side execution channel (D5 / FR-4 / AC-5)
+# ---------------------------------------------------------------------------
+def _alert_preflight(reason: str) -> None:
+    """Best-effort Telegram alert that the host-side staging channel is unworkable at
+    startup (so a real task never silently false-routes later). never-raise."""
+    try:
+        from .notifications import send_telegram
+        send_telegram(
+            f"⚠️ staging-runner preflight: {reason}. Хост-сторона исполнения "
+            f"staging-сюиты неработоспособна — задачи на deploy-staging будут удержаны "
+            f"(инфра, НЕ дефект кода). Проверьте ssh/docker/staging-стенд."
+        )
+    except Exception as e:  # noqa: BLE001 - never-raise
+        logger.warning("staging_runner: preflight alert failed: %s", e)
+
+
+def preflight() -> tuple[bool, str]:
+    """Prod-like preflight of the host-side staging execution channel (D5 / AC-5).
+
+    Probes the channel with a short bounded ssh probe (``command -v docker`` +
+    ``docker inspect -f '{{.State.Running}}' <staging-service>``) so a broken
+    environment (no docker on the host / staging container down / ssh unreachable / no
+    ssh target configured) surfaces at SERVICE START — NOT after the fact as a false
+    rollback of a real task. Records ``_PREFLIGHT_STATE`` + alerts on failure.
+
+    Purely observational (FR-4): it NEVER gates/blocks the pipeline, touches stages /
+    QG, or raises — called best-effort from ``main.lifespan``. Self-hosting scope:
+    ``applies()`` first (a disabled runner / out-of-scope repo -> n/a). Returns
+    ``(ok, reason)`` (``ok=True`` for n/a / in-container mode / a healthy channel).
+    """
+    try:
+        from .qg.checks import SELF_HOSTING_REPO
+        if not applies(SELF_HOSTING_REPO):
+            _PREFLIGHT_STATE.update(ok=None, reason="n/a (runner disabled / out of scope)")
+            return True, "n/a"
+        # In-container mode (host-side disabled): nothing to probe host-side.
+        if not bool(getattr(settings, "staging_runner_exec_host_side", True)):
+            _PREFLIGHT_STATE.update(ok=True, reason="in-container mode (host-side disabled)")
+            return True, "in-container mode"
+        target = _ssh_target()
+        if not target:
+            reason = "no ssh target configured (deploy_ssh_host empty) — host-side staging unworkable"
+            _PREFLIGHT_STATE.update(ok=False, reason=reason)
+            _alert_preflight(reason)
+            return False, reason
+        probe = (
+            "command -v docker >/dev/null 2>&1 && "
+            f"docker inspect -f '{{{{.State.Running}}}}' {shlex.quote(STAGING_SERVICE)}"
+        )
+        cmd = ["ssh", "-o", "StrictHostKeyChecking=no", target, probe]
+        try:
+            r = subprocess.run(cmd, capture_output=True, text=True, timeout=_PREFLIGHT_TIMEOUT_S)
+        except subprocess.TimeoutExpired:
+            reason = "preflight ssh probe timed out"
+            _PREFLIGHT_STATE.update(ok=False, reason=reason)
+            _alert_preflight(reason)
+            return False, reason
+        except (subprocess.SubprocessError, OSError) as e:
+            reason = f"preflight ssh probe error: {e}"
+            _PREFLIGHT_STATE.update(ok=False, reason=reason)
+            _alert_preflight(reason)
+            return False, reason
+        out = (r.stdout or "").strip().lower()
+        if r.returncode == 0 and out == "true":
+            _PREFLIGHT_STATE.update(ok=True, reason="host-side channel ok (docker present, staging running)")
+            return True, "ok"
+        reason = f"host-side staging channel not ready (rc={r.returncode}, running={out!r})"
+        _PREFLIGHT_STATE.update(ok=False, reason=reason)
+        _alert_preflight(reason)
+        return False, reason
+    except Exception as e:  # noqa: BLE001 - never-raise; preflight must never block start
+        logger.warning("staging_runner.preflight error: %s", e)
+        _PREFLIGHT_STATE.update(ok=None, reason=f"preflight error: {e}")
+        return True, "preflight skipped (error)"
+
+
+# ---------------------------------------------------------------------------
+# Observability (D8 / FR-7 / AC-10)
 # ---------------------------------------------------------------------------
 def snapshot() -> dict:
    """Read-only staging-runner summary for ``GET /queue`` (FR-7 / AC-10).

-    Additive block; existing ``/queue`` keys are untouched. never-raise: any error ->
-    a minimal dict with the kill-switch state."""
+    Additive block; existing ``/queue`` keys are untouched. ORCH-123 adds
+    ``exec_host_side`` (the strategy flag), ``permanent_env`` (the infra-HOLD counter,
+    distinct from ``failed``=code-fail and ``deferred``=transient-infra) and the last
+    ``preflight`` verdict. never-raise: any error -> a minimal dict with the
+    kill-switch state."""
    try:
        return {
            "enabled": bool(settings.staging_runner_enabled),
            "repos": getattr(settings, "staging_runner_repos", "") or "",
            "timeout_s": getattr(settings, "staging_runner_timeout_s", _DEFAULT_TIMEOUT_S),
            "infra_max_retries": getattr(settings, "staging_runner_infra_max_retries", 2),
+            "exec_host_side": bool(getattr(settings, "staging_runner_exec_host_side", True)),
            "runs": _STAGING_RUNNER_COUNTERS["runs"],
            "success": _STAGING_RUNNER_COUNTERS["success"],
            "failed": _STAGING_RUNNER_COUNTERS["failed"],
            "tool_error": _STAGING_RUNNER_COUNTERS["tool_error"],
            "deferred": _STAGING_RUNNER_COUNTERS["deferred"],
+            "permanent_env": _STAGING_RUNNER_COUNTERS["permanent_env"],
+            "preflight": dict(_PREFLIGHT_STATE),
        }
    except Exception as e:  # noqa: BLE001 - never-raise -> minimal dict
        logger.warning("staging_runner.snapshot error: %s", e)