"""Staging-image provenance for the BUILD-ONCE retag to prod (ORCH-058).

ORCH-36 made the ``deploy`` stage promote the staging image to prod by a plain
``docker tag`` (BUILD-ONCE, no rebuild), assuming "the staging image is fresh and
built from the validated code". That guarantee never existed: nothing in the
pipeline rebuilt the staging image from the validated commit, so a STALE image
could be silently promoted — the most dangerous bootstrap bug of
LESSONS_ORCH-036 (§4): a green deploy that quietly rolled prod back to
2-day-old code.

This module provides the deterministic (no-LLM) primitives that enforce the
``INV-FRESH`` invariant (ADR-001), as **two complementary layers** wired only for
self-hosting:

* **A — liveness:** :func:`check_staging_image_fresh` is a QG sub-check on the
  ``deploy-staging -> deploy`` edge (composed by ``stage_engine`` AFTER the
  merge-gate, BEFORE Phase A). It rebuilds ``orchestrator-orchestrator-staging``
  from the VALIDATED commit (worktree HEAD after the merge-gate rebase),
  recreates the 8501 container, and runs ``staging_check.py --mode stub`` against
  that fresh 8501 (ADR-001 step 3), so we validate exactly the ONE artefact later
  retagged to prod (AC-4). FAIL -> rollback to ``development`` (mirrors the
  merge-gate).
* **B — safety:** :func:`expected_revision` feeds the validated SHA to
  ``self_deploy.build_deploy_command`` as ``EXPECTED_REVISION``; the host hook
  fail-closes (``exit 1``) before ``docker tag`` if the SOURCE_IMAGE revision
  label does not match. :func:`provenance_verdict` is the PURE verdict logic that
  mirrors the hook's comparison (unit-tested in isolation).

Both layers share ONE anchor — :func:`validated_revision` — so the build stamp (A)
and the expected revision (B) can never diverge.

This module is a **leaf**: it imports only ``config`` / ``git_worktree`` and
lazily ``qg.checks.is_self_hosting_repo``; it never imports ``stage_engine`` /
``self_deploy``.

Every public helper honours a strict **never-raise** contract and is
**fail-closed** on any doubt (missing label, empty SHA, docker/ssh/inspect
error) -> treated as a mismatch, never promoted "on faith".
"""

import logging
import os
import shlex
import subprocess

from .config import settings

logger = logging.getLogger("orchestrator.image_freshness")

# The OCI-standard label key carrying the build commit (Dockerfile stamps it).
REVISION_LABEL = "org.opencontainers.image.revision"

# Bounded timeouts so a hung git/docker/ssh never wedges the monitor-thread.
_GIT_TIMEOUT = 30
_INSPECT_TIMEOUT = 30
# The remote rebuild (docker build + compose recreate + health + staging_check) is
# the slow path; keep it generous but bounded (mirrors the merge-gate re-test order).
_REBUILD_TIMEOUT = 1200

# Explicit STAGING target for the --build-staging rebuild (Strategy A). These mirror
# the hook's staging-safe defaults but are passed EXPLICITLY so a future change to the
# hook defaults can never silently retarget the self-rebuild at prod (8500) — the whole
# path builds/recreates STAGING ONLY (AC-9, review P2). Never the prod 8500 target.
_STAGING_SERVICE = "orchestrator-staging"
_STAGING_PORT = 8501
_STAGING_COMPOSE_PROFILE = "staging"


# ---------------------------------------------------------------------------
# Conditionality (mirrors self_deploy_applies / _merge_gate_applies)
# ---------------------------------------------------------------------------
def image_freshness_applies(repo: str) -> bool:
    """Whether the staging-image provenance feature (A + B) is REAL for this repo.

    Mirrors the ORCH-35 / ORCH-43 / ORCH-36 conditional rollout:

    * ``image_freshness_enabled=False`` -> always False (single kill-switch for
      the WHOLE feature; legacy ORCH-36 BUILD-ONCE behaviour for everyone).
    * ``image_freshness_repos`` (CSV) non-empty -> real only for listed repos
      (compared case-insensitively, surrounding whitespace ignored).
    * empty CSV -> real ONLY for the self-hosting repo (``orchestrator``).

    Never raises: any internal error is logged and reported as ``False``.
    """
    try:
        if not settings.image_freshness_enabled:
            return False
        raw = (settings.image_freshness_repos or "").strip()
        if raw:
            allowed = {r.strip().lower() for r in raw.split(",") if r.strip()}
            return (repo or "").strip().lower() in allowed
        # Lazy import keeps this module a leaf (avoids importing qg at module load).
        from .qg.checks import is_self_hosting_repo

        return is_self_hosting_repo(repo)
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("image_freshness_applies error for %s: %s", repo, e)
        return False


# ---------------------------------------------------------------------------
# The validated-commit anchor (single source for both A and B)
# ---------------------------------------------------------------------------
def validated_revision(repo: str, branch: str) -> str:
    """Return the SHA of the VALIDATED commit.

    That is ``git rev-parse HEAD`` in the task worktree AFTER the merge-gate
    (post auto-rebase + push --force-with-lease): exactly the tree the
    merge-gate re-tested green and that merges into ``main``. It is the SINGLE
    anchor that feeds both the staging rebuild stamp (A) and the expected
    revision passed to the hook (B), so the two layers cannot disagree about
    "what commit prod must run".

    Fail-closed / never-raise (AC-3 / AC-8): a missing worktree or any
    import/git/OS error returns ``""`` (an empty SHA, which downstream treats as
    a provenance mismatch), never a propagated exception.
    """
    try:
        # Lazy import (leaf module); kept inside the guard so that even an import
        # failure degrades to "" instead of escaping the never-raise contract.
        from .git_worktree import get_worktree_path

        wt = get_worktree_path(repo, branch)
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("validated_revision: worktree error for %s/%s: %s", repo, branch, e)
        return ""
    # `not wt` first: os.path.isdir(None) raises TypeError rather than returning False.
    if not wt or not os.path.isdir(wt):
        logger.warning("validated_revision: no worktree at %s for %s/%s", wt, repo, branch)
        return ""
    try:
        r = subprocess.run(
            ["git", "-C", wt, "rev-parse", "HEAD"],
            capture_output=True,
            text=True,
            timeout=_GIT_TIMEOUT,
        )
    except (subprocess.SubprocessError, OSError) as e:
        logger.warning("validated_revision: git error for %s/%s: %s", repo, branch, e)
        return ""
    if r.returncode != 0:
        logger.warning(
            "validated_revision: rev-parse rc=%s for %s/%s", r.returncode, repo, branch
        )
        return ""
    return (r.stdout or "").strip()


def expected_revision(repo: str, branch: str) -> str:
    """The revision the hook must require (Strategy B), or ``""`` when inactive.

    Returns :func:`validated_revision` ONLY when :func:`image_freshness_applies`
    (so non-self / disabled callers get ``""`` -> the hook keeps its backward-
    compatible "no provenance check" behaviour, no EXPECTED_REVISION env). The
    config invariant (ADR-001) is that B is active iff A is active — both gated
    by the SAME flag — so there is never a "B without A" deadlock.

    Never raises.
    """
    try:
        if not image_freshness_applies(repo):
            return ""
        return validated_revision(repo, branch)
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("expected_revision error for %s/%s: %s", repo, branch, e)
        return ""


# ---------------------------------------------------------------------------
# Pure provenance verdict (mirrors the hook's bash comparison — Strategy B)
# ---------------------------------------------------------------------------
def provenance_verdict(expected_sha: str, image_sha: str) -> tuple[bool, str]:
    """Pure, deterministic provenance verdict (no I/O).

    The Python mirror of the hook's fail-closed comparison (Strategy B),
    unit-testable in isolation. Both inputs are stripped before comparison and
    ``None`` is treated as empty.

    Contract (AC-1 / AC-2 / AC-3, fail-closed):

    * both non-empty AND equal -> ``(True, "provenance match: SHA12")`` where
      ``SHA12`` is the first 12 characters of the revision.
    * expected empty / image empty -> ``(False, "...")`` — fail-closed: a missing
      expected SHA or an unlabelled image is NEVER treated as fresh.
    * both non-empty but different -> ``(False, "provenance mismatch ...")``.
    """
    exp = (expected_sha or "").strip()
    img = (image_sha or "").strip()
    if not exp:
        return False, "provenance fail-closed: empty expected revision"
    if not img:
        return False, "provenance fail-closed: image has no revision label"
    if exp == img:
        return True, f"provenance match: {exp[:12]}"
    return False, f"provenance mismatch: image {img[:12]} != expected {exp[:12]}"


def image_revision(image: str, ssh_target: str | None = None) -> str:
    """Read an image's ``org.opencontainers.image.revision`` label.

    Uses ``docker image inspect``. Returns ``""`` on any error or when the label
    is absent (fail-closed -> downstream treats it as a mismatch).

    ``docker`` lives on the HOST (the container ships only ``openssh-client
    git``), so when ``ssh_target`` is given the inspect runs over ssh; otherwise
    it runs locally (covers host-side callers and tests).

    Never raises (AC-8): building the command is guarded too, so a non-string
    ``image`` degrades to ``""`` instead of raising from ``shlex.quote``.
    """
    try:
        fmt = '{{ index .Config.Labels "%s" }}' % REVISION_LABEL
        if ssh_target:
            # The remote side is a shell: quote the template and the image name.
            remote = (
                "docker image inspect --format " + shlex.quote(fmt) + " " + shlex.quote(image)
            )
            cmd = ["ssh", "-o", "StrictHostKeyChecking=no", ssh_target, remote]
        else:
            cmd = ["docker", "image", "inspect", "--format", fmt, image]
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=_INSPECT_TIMEOUT)
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("image_revision: inspect error for %s: %s", image, e)
        return ""
    if r.returncode != 0:
        logger.warning("image_revision: inspect rc=%s for %s", r.returncode, image)
        return ""
    out = (r.stdout or "").strip()
    # A missing label key renders either as an empty string or as the Go-template
    # placeholder "<no value>"; neither is a revision.
    if out in ("", "<no value>"):
        return ""
    return out


# ---------------------------------------------------------------------------
# Staging rebuild from the validated commit (Strategy A) — host-side via the hook
# ---------------------------------------------------------------------------
def _ssh_target() -> str | None:
    """ssh ``user@host`` for the host rebuild, or None when no host is configured
    (tests / non-self contexts that mock this away)."""
    host = (settings.deploy_ssh_host or "").strip()
    if not host:
        return None
    user = (settings.deploy_ssh_user or "").strip()
    return f"{user}@{host}" if user else host


def _host_worktree_path(repo: str, branch: str) -> str:
    """The task worktree path AS SEEN FROM THE HOST (docker build context).

    The container path uses ``settings.worktrees_dir`` (under ``repos_dir``); the
    host sees the same files under ``host_repos_dir``. Derive the host path by
    swapping the mount prefix (mirrors ``self_deploy.host_state_dir``).

    The prefix is matched on a path-component boundary, so a sibling directory
    that merely shares a textual prefix (``/repos-old`` vs ``/repos``) is NOT
    rewritten. A path outside ``repos_dir`` is returned unchanged.

    May raise (e.g. worktree lookup failure); callers own the never-raise guard.
    """
    from .git_worktree import get_worktree_path

    container_wt = get_worktree_path(repo, branch)
    repos_dir = settings.repos_dir.rstrip("/")
    host_repos_dir = settings.host_repos_dir.rstrip("/")
    if container_wt == repos_dir or container_wt.startswith(repos_dir + "/"):
        return host_repos_dir + container_wt[len(repos_dir):]
    return container_wt


def rebuild_staging_image(repo: str, branch: str, sha: str) -> tuple[bool, str]:
    """Rebuild the staging image from the VALIDATED commit and recreate 8501.

    Strategy A: invokes the host hook in ``--build-staging`` mode over ssh. The
    hook (``orchestrator-deploy-hook.sh --build-staging``) runs, on the host:
    ``docker build --build-arg GIT_SHA=SHA -t IMAGE`` -> ``docker compose
    --profile staging up -d --no-build orchestrator-staging`` -> health-check
    8501 -> ``staging_check.py --mode stub`` against the FRESH 8501 (ADR-001
    step 3, AC-4: validate exactly the artefact later retagged to prod). Same
    exit-code contract (0 = ok).

    This touches staging ONLY (8501), NEVER prod (8500) (AC-9): all
    build/recreate/validate targets are the staging service — passed EXPLICITLY
    below, not left to hook defaults (review P2).

    Synchronous ssh is fine here (unlike Phase B): recreating staging does not
    kill the prod worker running this code. Bounded by ``_REBUILD_TIMEOUT``.

    Returns ``(True, msg)`` on a healthy rebuild, else ``(False, reason)``.
    Never raises (AC-8): command construction (settings lookups, worktree path,
    quoting) is guarded as well as the ssh call itself. An empty ``sha`` is
    refused (fail-closed) so an image is never stamped with an empty revision.
    """
    try:
        target = _ssh_target()
        if not target:
            return False, "no ssh host configured for staging rebuild"
        if not (sha or "").strip():
            return False, "empty revision for staging rebuild (fail-closed)"
        host_ctx = _host_worktree_path(repo, branch)
        # Pass the STAGING target explicitly (service/port/profile/container), so the
        # rebuild + recreate + staging_check can never drift onto the prod 8500 service
        # even if the hook's defaults change (AC-9, review P2). STAGING_CONTAINER is the
        # container staging_check is docker-exec'd inside (step 3b).
        # TARGET_IMAGE is the image prod is later retagged FROM, i.e. the staging image.
        env_assignments = (
            f"GIT_SHA={shlex.quote(sha)} "
            f"BUILD_CONTEXT={shlex.quote(host_ctx)} "
            f"TARGET_IMAGE={shlex.quote(settings.deploy_prod_source_image)} "
            f"TARGET_SERVICE={shlex.quote(_STAGING_SERVICE)} "
            f"TARGET_PORT={shlex.quote(str(_STAGING_PORT))} "
            f"COMPOSE_PROFILE={shlex.quote(_STAGING_COMPOSE_PROFILE)} "
            f"STAGING_CONTAINER={shlex.quote(_STAGING_SERVICE)}"
        )
        inner = (
            f"cd {shlex.quote(settings.deploy_host_repo_path)} && "
            f"{env_assignments} "
            f"bash {shlex.quote(settings.deploy_hook_script)} --build-staging"
        )
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("rebuild_staging_image: setup error for %s/%s: %s", repo, branch, e)
        return False, f"staging rebuild setup error: {e}"

    cmd = ["ssh", "-o", "StrictHostKeyChecking=no", target, inner]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=_REBUILD_TIMEOUT)
    except subprocess.TimeoutExpired:
        return False, f"staging rebuild timeout after {_REBUILD_TIMEOUT}s"
    except (subprocess.SubprocessError, OSError) as e:
        return False, f"staging rebuild ssh error: {e}"
    if r.returncode != 0:
        # Keep only the tail of the combined output: that is where the failure is.
        detail = ((r.stderr or "") + (r.stdout or "")).strip()[-200:]
        return False, f"staging rebuild failed (rc={r.returncode}): {detail}"
    logger.info("rebuild_staging_image: %s/%s rebuilt from %s and healthy", repo, branch, sha[:12])
    return True, f"staging rebuilt from {sha[:12]} and healthy"


# ---------------------------------------------------------------------------
# QG sub-check: check_staging_image_fresh (Strategy A liveness, AC-4/AC-6)
# ---------------------------------------------------------------------------
def check_staging_image_fresh(repo: str, work_item_id: str, branch: str) -> tuple[bool, str]:
    """ORCH-058 freshness sub-gate on the ``deploy-staging -> deploy`` edge.

    Deterministic, no LLM. Mirrors ``check_branch_mergeable`` (ORCH-043):

    1. Conditionality: ``image_freshness_enabled=False`` ->
       ``(True, "image-freshness disabled")``; a repo the feature is not real
       for -> ``(True, "image-freshness N/A for REPO")``.
    2. Anchor: ``sha = validated_revision(repo, branch)``. Empty -> fail-closed
       ``(False, ...)`` (AC-3): we never rebuild/promote without a known commit.
    3. Rebuild the staging image from that commit, recreate 8501, and run
       ``staging_check.py --mode stub`` against the fresh 8501 (host hook).
       PASS -> ``(True, ...)``: the artefact we just validated (build + e2e) is
       the exact one that will be retagged to prod (AC-4, loop closed).
       FAIL -> ``(False, ...)`` -> the engine rolls back to ``development``
       (AC-2).

    ``work_item_id`` is accepted for signature parity with the other QG checks
    and is not used here.

    Never-raise (AC-8): any internal error -> ``(False, "image-freshness error:
    REASON")``; an exception never escapes into ``advance_stage``. Returns
    ``(True, ...)`` for non-self repos so the deploy edge is unchanged for them
    (AC-5).
    """
    try:
        if not settings.image_freshness_enabled:
            return True, "image-freshness disabled"
        if not image_freshness_applies(repo):
            return True, f"image-freshness N/A for {repo}"
        sha = validated_revision(repo, branch)
        if not sha:
            # Fail-closed: without the validated commit we cannot prove freshness.
            return False, "cannot resolve validated revision (fail-closed)"
        ok, reason = rebuild_staging_image(repo, branch, sha)
        if not ok:
            return False, f"staging rebuild failed: {reason}"
        return True, f"staging image fresh ({sha[:12]})"
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.error("check_staging_image_fresh error for %s/%s: %s", repo, branch, e)
        return False, f"image-freshness error: {e}"