From 83397570fe718d7380957bc7864b2312d667f597 Mon Sep 17 00:00:00 2001 From: claude-bot Date: Sun, 7 Jun 2026 07:46:19 +0000 Subject: [PATCH] developer(ET): auto-commit from developer run_id=264 --- docs/architecture/README.md | 2 +- docs/operations/DEPLOY_HOOK.md | 14 ++ docs/operations/INFRA.md | 3 + docs/operations/STAGING.md | 21 ++ src/config.py | 24 +++ src/image_freshness.py | 312 +++++++++++++++++++++++++++ src/qg/checks.py | 15 ++ src/self_deploy.py | 13 ++ src/stage_engine.py | 88 ++++++++ tests/test_config.py | 27 +++ tests/test_deploy_approve.py | 3 +- tests/test_deploy_build_once.py | 53 ++++- tests/test_deploy_hook_mapping.py | 6 + tests/test_deploy_hook_provenance.py | 58 +++++ tests/test_image_freshness.py | 171 +++++++++++++++ tests/test_qg_registry_snapshot.py | 1 + tests/test_stage_engine.py | 111 +++++++++- 17 files changed, 916 insertions(+), 6 deletions(-) create mode 100644 src/image_freshness.py create mode 100644 tests/test_deploy_hook_provenance.py create mode 100644 tests/test_image_freshness.py diff --git a/docs/architecture/README.md b/docs/architecture/README.md index 657481f..da47217 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/README.md @@ -81,7 +81,7 @@ sentinel-файлы (`/.deploy-state-//`), без мигр Подробнее: [adr-0007](adr/adr-0007-executable-self-deploy.md), детально — `docs/work-items/ORCH-036/06-adr/ADR-001-executable-self-deploy.md`. -### Свежесть артефакта BUILD-ONCE: провенанс staging-образа (ORCH-058 — design) +### Свежесть артефакта BUILD-ONCE: провенанс staging-образа (ORCH-058 — реализовано) BUILD-ONCE retag (ORCH-36) промоутит `SOURCE_IMAGE=orchestrator-orchestrator-staging` в прод **без rebuild**, полагаясь на «staging-образ свеж и провалидирован». 
Этой гарантии нет: конвейер нигде не пересобирает staging-образ из провалидированного коммита → retag мог тихо diff --git a/docs/operations/DEPLOY_HOOK.md b/docs/operations/DEPLOY_HOOK.md index 0f81102..215de1b 100644 --- a/docs/operations/DEPLOY_HOOK.md +++ b/docs/operations/DEPLOY_HOOK.md @@ -9,6 +9,7 @@ 1. **Захват текущего образа** — до рестарта записывает ID образа работающего контейнера в `$PREV_IMAGE_FILE` (best-effort, не падает если сервис не запущен). 2. **git pull** — обновляет код репозитория. 2b. **Build-once retag** (ORCH-036, BR-6) — если задан `$SOURCE_IMAGE`, хук ретегает его на `$TARGET_IMAGE` (`docker tag $SOURCE_IMAGE $TARGET_IMAGE`) и поднимает контейнер на этом образе через `up -d --no-build`. Это деплой РОВНО того образа, что прошёл staging, **без `docker build`**. Если `$SOURCE_IMAGE` не задан (дефолт) — шаг пропускается (обратная совместимость). + - **Fail-closed провенанс-guard** (ORCH-058, Strategy B) — ПЕРЕД `docker tag`, если задан `$EXPECTED_REVISION`, хук сверяет OCI-лейбл `org.opencontainers.image.revision` у `$SOURCE_IMAGE` с `$EXPECTED_REVISION`. Несовпадение / пустой лейбл (`<no value>`) / ошибка inspect → лог + `exit 1` (FAILED → авто-rollback), **прод не трогается**. Не задан `$EXPECTED_REVISION` (дефолт) → проверка пропускается (обратная совместимость для не-self репозиториев). 3. **Рестарт контейнера** — `docker compose --profile $COMPOSE_PROFILE up -d --no-build $TARGET_SERVICE`. 4. **Health-цикл** — 10 попыток × 6с = до 60с. Критерий: HTTP 200 + тело содержит `"status":"ok"`. - **Успех** → `exit 0`, лог "Deploy SUCCESS". @@ -17,6 +18,16 @@ - Если восстановился → `exit 1` (деплой провалился, откат успешен). - Если и откат не помог → `exit 2` (критично). +### Режим `--build-staging` (ORCH-058, Strategy A) + +Пересобирает **staging-образ** из провалидированного коммита и пересоздаёт 8501, чтобы артефакт, который мы валидируем, был РОВНО тем, что позже build-once ретегается в прод (инвариант `INV-FRESH`).
Собирает/пересоздаёт **только staging (8501)** — никогда прод (8500). + +1. `docker build --build-arg GIT_SHA=$GIT_SHA -t $TARGET_IMAGE $BUILD_CONTEXT` — пересборка из host-worktree валидированного коммита; `GIT_SHA` штампуется в OCI-лейбл `org.opencontainers.image.revision`. +2. `docker compose [--profile $COMPOSE_PROFILE] up -d --no-build $TARGET_SERVICE` — пересоздание staging на свежем образе. +3. Health-цикл 10×6с. Здоров → `exit 0`; провал сборки/health → `exit 1`. + +Запускается оркестратором на ребре `deploy-staging → deploy` (QG-под-чек `check_staging_image_fresh`, см. `INFRA.md`). Тот же контракт кодов выхода (0 = здоров). + ### Режим `--rollback` Вручную откатывает сервис на предыдущий образ из `$PREV_IMAGE_FILE`. @@ -31,6 +42,9 @@ | `COMPOSE_PROFILE`| `staging` | Docker compose profile (пусто = без профиля) | | `PREV_IMAGE_FILE`| `$REPO/.deploy-prev-image-staging`| Файл для сохранения предыдущего образа | | `SOURCE_IMAGE` | _(unset)_ | Build-once (ORCH-036): провалидированный образ для retag на `$TARGET_IMAGE` перед рестартом (без rebuild). Не задан → шаг пропущен. | +| `EXPECTED_REVISION` | _(unset)_ | Build-once (ORCH-058, Strategy B): ожидаемый git-SHA `$SOURCE_IMAGE` (лейбл `org.opencontainers.image.revision`). Задан → fail-closed guard перед `docker tag`. Не задан → проверка пропущена. | +| `GIT_SHA` | _(unset)_ | `--build-staging` (ORCH-058, Strategy A): коммит, штампуемый в OCI-лейбл `revision` при пересборке staging-образа. | +| `BUILD_CONTEXT` | `$REPO` | `--build-staging`: docker build context (host-worktree валидированного коммита). | | `LOG` | `/var/log/orchestrator/deploy-hook.log` | Лог-файл (fallback: `$REPO/deploy-hook.log`) | > ⚠️ **Дефолт — всегда STAGING**. Прод активируется только явным переопределением env. 
diff --git a/docs/operations/INFRA.md b/docs/operations/INFRA.md index 90ab13b..3eb70aa 100644 --- a/docs/operations/INFRA.md +++ b/docs/operations/INFRA.md @@ -83,6 +83,8 @@ ADR `docs/work-items/ORCH-040/06-adr/ADR-001-run-agents-as-host-uid.md` и гл | `ORCH_DEPLOY_HOOK_SCRIPT` / `_HOST_REPO_PATH` | путь к хук-скрипту (отн. репо) и чекаут orchestrator на хосте | | `ORCH_DEPLOY_PROD_SOURCE_IMAGE` | staging-образ для build-once retag на прод-тег (без rebuild) | | `ORCH_DEPLOY_PROD_TARGET_SERVICE` / `_TARGET_PORT` / `_TARGET_IMAGE` / `_COMPOSE_PROFILE` / `_PREV_IMAGE_FILE` | прод-цель хука + снапшот для авто-rollback | +| `ORCH_IMAGE_FRESHNESS_ENABLED` | ORCH-058 единый kill-switch провенанса staging-образа (A+B как целое); дефолт `true`, false → legacy build-once без проверки свежести | +| `ORCH_IMAGE_FRESHNESS_REPOS` | CSV репозиториев с реальным гейтом свежести; пусто → только self-hosting `orchestrator` | | `ORCH_RECONCILE_ENABLED` | kill-switch sweeper потерянных webhook (ORCH-053); дефолт `true`. **При инциденте/раскатке** — `false` глушит весь фоновый reconciler | | `ORCH_RECONCILE_PLANE_ENABLED` | отдельный флаг F-2 (опрос Plane API); `false` гасит только plane-ветку, F-1 продолжает работать; дефолт `true` | | `ORCH_RECONCILE_INTERVAL_S` | период фонового прохода reconciler, сек; дефолт `120` | @@ -131,6 +133,7 @@ ADR `docs/work-items/ORCH-040/06-adr/ADR-001-run-agents-as-host-uid.md` и гл **Страховки:** - Стадия `deploy-staging` (порт 8501) — обязательный гейт перед прод-деплоем орка. Прод-деплой недостижим, пока staging-гейт не зелёный (см. `STAGING.md`, ORCH-35). Гейт условный: реален только для self-hosting (repo=orchestrator), для остальных проектов — no-op. 
+- **Свежесть staging-образа (ORCH-058):** на ребре `deploy-staging → deploy` (ПОСЛЕ merge-gate, ДО Phase A) QG-под-чек `check_staging_image_fresh` пересобирает staging-образ из валидированного коммита и пересоздаёт 8501 (Strategy A), а хук перед build-once retag fail-closed сверяет OCI-лейбл `revision` с `EXPECTED_REVISION` (Strategy B). Гарантирует: в прод промоутится РОВНО провалидированный артефакт (инцидент LESSONS_ORCH-036 п.4 — тихий промоут устаревшего образа). Сборки/recreate — ТОЛЬКО staging (8501); FAIL → откат на `development`. Условный: реален только для self-hosting. **Правила для агентов при задачах ORCH:** 1. НЕ перезапускать / не ронять прод-контейнер `orchestrator` в рамках задачи. diff --git a/docs/operations/STAGING.md b/docs/operations/STAGING.md index f750b20..471ae9a 100644 --- a/docs/operations/STAGING.md +++ b/docs/operations/STAGING.md @@ -75,6 +75,27 @@ completely invisible to commands that do not pass `--profile staging`. docker logs -f orchestrator-staging ``` +## Staging-образ как источник прод-артефакта (ORCH-058) + +Прод-деплой орка — **build-once**: хук ретегает провалидированный staging-образ +(`orchestrator-orchestrator-staging`) на прод-тег **без rebuild** (ORCH-036). Чтобы +в прод не попал устаревший образ (инцидент LESSONS_ORCH-036 п.4), ORCH-058 гарантирует +свежесть staging-образа **двумя слоями** (только self-hosting): + +- **A — пересборка staging (liveness):** на ребре `deploy-staging → deploy` (ПОСЛЕ + merge-gate, ДО Phase A) QG-под-чек `check_staging_image_fresh` через хук + `--build-staging` пересобирает staging-образ из worktree валидированного коммита + (`--build-arg GIT_SHA=`, OCI-лейбл `org.opencontainers.image.revision`) и + пересоздаёт 8501. Так валидируем РОВНО тот артефакт, что промоутится в прод. + FAIL → откат на `development`. Сборки/recreate — **только staging (8501)**. 
+- **B — fail-closed guard (safety):** прод-хук перед `docker tag` сверяет лейбл + `revision` у `SOURCE_IMAGE` с `EXPECTED_REVISION` (пробрасывает оркестратор); + несовпадение / пустой лейбл / ошибка inspect → `exit 1`, прод не трогается. + +Kill-switch `ORCH_IMAGE_FRESHNESS_ENABLED` включает A+B **как целое**; область — +`ORCH_IMAGE_FRESHNESS_REPOS` (пусто → только `orchestrator`). Детали — `DEPLOY_HOOK.md`, +`docs/work-items/ORCH-058/06-adr/ADR-001-staging-image-provenance.md`. + ## Roadmap | Task | Description | diff --git a/src/config.py b/src/config.py index df8bcc1..dd30d4a 100644 --- a/src/config.py +++ b/src/config.py @@ -195,6 +195,30 @@ class Settings(BaseSettings): deploy_prod_target_image: str = "orchestrator-orchestrator" deploy_prod_compose_profile: str = "" deploy_prod_prev_image_file: str = ".deploy-prev-image-prod" + + # ORCH-058: staging-image provenance before the BUILD-ONCE retag to prod. + # Closes the INV-FRESH gap (ADR-001): the BUILD-ONCE retag (ORCH-36) promotes + # the staging image to prod WITHOUT a rebuild, assuming the staging image is + # fresh — a guarantee the pipeline never had (a stale image could be silently + # promoted, LESSONS_ORCH-036 §4). Two complementary layers, self-hosting only: + # A (liveness): the QG sub-check check_staging_image_fresh rebuilds the + # staging image from the VALIDATED commit (worktree HEAD after merge-gate) + # and recreates 8501 on the deploy-staging -> deploy edge, so we validate + # and promote ONE artefact. + # B (safety): build_deploy_command passes EXPECTED_REVISION and the hook + # fail-closes (exit 1) if SOURCE_IMAGE's revision label != EXPECTED_REVISION + # before `docker tag`, making a silent stale promote structurally impossible. + # + # image_freshness_enabled -> SINGLE kill-switch for the WHOLE feature (A + B + # together; never "B without A" = a deadlock). False + # -> legacy ORCH-36 behaviour (BUILD-ONCE, no guard, + # no EXPECTED_REVISION). Env ORCH_IMAGE_FRESHNESS_ENABLED. 
+ # image_freshness_repos -> CSV of repos where the feature is REAL; empty -> + # only the self-hosting repo (orchestrator). Mirrors + # self_deploy_repos / merge_gate_repos. + image_freshness_enabled: bool = True + image_freshness_repos: str = "" + # ORCH-053: stuck-task reconciler (sweeper for lost webhooks). A background # daemon thread reconciles the "source of truth (gate / Plane) != task stage" # drift left behind by a dropped webhook (502 on rebuild, no Plane/Gitea diff --git a/src/image_freshness.py b/src/image_freshness.py new file mode 100644 index 0000000..574d54b --- /dev/null +++ b/src/image_freshness.py @@ -0,0 +1,312 @@ +"""Staging-image provenance for the BUILD-ONCE retag to prod (ORCH-058). + +ORCH-36 made the ``deploy`` stage promote the staging image to prod by a plain +``docker tag`` (BUILD-ONCE, no rebuild), assuming "the staging image is fresh and +built from the validated code". That guarantee never existed: nothing in the +pipeline rebuilt the staging image from the validated commit, so a STALE image +could be silently promoted — the most dangerous bootstrap bug of LESSONS_ORCH-036 +(§4): a green deploy that quietly rolled prod back to 2-day-old code. + +This module provides the deterministic (no-LLM) primitives that enforce the +``INV-FRESH`` invariant (ADR-001), as **two complementary layers** wired only for +self-hosting: + + * **A — liveness:** :func:`check_staging_image_fresh` is a QG sub-check on the + ``deploy-staging -> deploy`` edge (composed by ``stage_engine`` AFTER the + merge-gate, BEFORE Phase A). It rebuilds ``orchestrator-orchestrator-staging`` + from the VALIDATED commit (worktree HEAD after the merge-gate rebase) and + recreates the 8501 container, so we validate and promote ONE artefact. FAIL -> + rollback to ``development`` (mirrors the merge-gate). 
+ * **B — safety:** :func:`expected_revision` feeds the validated SHA to + ``self_deploy.build_deploy_command`` as ``EXPECTED_REVISION``; the host hook + fail-closes (``exit 1``) before ``docker tag`` if the SOURCE_IMAGE revision + label does not match. :func:`provenance_verdict` is the PURE verdict logic + that mirrors the hook's comparison (unit-tested in isolation). + +Both layers share ONE anchor — :func:`validated_revision` — so the build stamp (A) +and the expected revision (B) can never diverge. + +This module is a **leaf**: it imports only ``config`` / ``git_worktree`` and lazily +``qg.checks.is_self_hosting_repo``; it never imports ``stage_engine`` / +``self_deploy``. Every public helper honours a strict **never-raise** contract and +is **fail-closed** on any doubt (missing label, empty SHA, docker/ssh/inspect +error) -> treated as a mismatch, never promoted "on faith". +""" + +import logging +import os +import shlex +import subprocess + +from .config import settings + +logger = logging.getLogger("orchestrator.image_freshness") + +# The OCI-standard label key carrying the build commit (Dockerfile stamps it). +REVISION_LABEL = "org.opencontainers.image.revision" + +# Bounded timeouts so a hung git/docker/ssh never wedges the monitor-thread. +_GIT_TIMEOUT = 30 +_INSPECT_TIMEOUT = 30 +# The remote rebuild (docker build + compose recreate + health) is the slow path; +# keep it generous but bounded (mirrors the merge-gate re-test budget order). +_REBUILD_TIMEOUT = 1200 + + +# --------------------------------------------------------------------------- +# Conditionality (mirrors self_deploy_applies / _merge_gate_applies) +# --------------------------------------------------------------------------- +def image_freshness_applies(repo: str) -> bool: + """Whether the staging-image provenance feature (A + B) is REAL for this repo. 
+ + Mirrors the ORCH-35 / ORCH-43 / ORCH-36 conditional rollout: + * ``image_freshness_enabled=False`` -> always False (single kill-switch for + the WHOLE feature; legacy ORCH-36 BUILD-ONCE behaviour for everyone). + * ``image_freshness_repos`` (CSV) non-empty -> real only for listed repos. + * empty CSV -> real ONLY for the self-hosting repo (``orchestrator``). + Never raises. + """ + try: + if not settings.image_freshness_enabled: + return False + raw = (settings.image_freshness_repos or "").strip() + if raw: + allowed = {r.strip().lower() for r in raw.split(",") if r.strip()} + return (repo or "").strip().lower() in allowed + # Lazy import keeps this module a leaf (avoids importing qg at module load). + from .qg.checks import is_self_hosting_repo + return is_self_hosting_repo(repo) + except Exception as e: # noqa: BLE001 - never-raise contract + logger.warning("image_freshness_applies error for %s: %s", repo, e) + return False + + +# --------------------------------------------------------------------------- +# The validated-commit anchor (single source for both A and B) +# --------------------------------------------------------------------------- +def validated_revision(repo: str, branch: str) -> str: + """Return the SHA of the VALIDATED commit = ``git rev-parse HEAD`` in the task + worktree AFTER the merge-gate (post auto-rebase + push --force-with-lease). + + This is exactly the tree the merge-gate re-tested green and that merges into + ``main``. It is the SINGLE anchor that feeds both the staging rebuild stamp (A) + and the expected revision passed to the hook (B), so the two layers cannot + disagree about "what commit prod must run". + + Fail-closed / never-raise (AC-3 / AC-8): a missing worktree or any git/OS error + returns ``""`` (an empty SHA, which downstream treats as a provenance mismatch), + never a propagated exception. 
+ """ + from .git_worktree import get_worktree_path + + try: + wt = get_worktree_path(repo, branch) + except Exception as e: # noqa: BLE001 - never-raise contract + logger.warning("validated_revision: worktree error for %s/%s: %s", repo, branch, e) + return "" + if not os.path.isdir(wt): + logger.warning("validated_revision: no worktree at %s for %s/%s", wt, repo, branch) + return "" + try: + r = subprocess.run( + ["git", "-C", wt, "rev-parse", "HEAD"], + capture_output=True, text=True, timeout=_GIT_TIMEOUT, + ) + except (subprocess.SubprocessError, OSError) as e: + logger.warning("validated_revision: git error for %s/%s: %s", repo, branch, e) + return "" + if r.returncode != 0: + logger.warning( + "validated_revision: rev-parse rc=%s for %s/%s", r.returncode, repo, branch + ) + return "" + return (r.stdout or "").strip() + + +def expected_revision(repo: str, branch: str) -> str: + """The revision the hook must require (Strategy B), or ``""`` when the feature + is inactive for this repo. + + Returns :func:`validated_revision` ONLY when :func:`image_freshness_applies` + (so non-self / disabled callers get ``""`` -> the hook keeps its backward- + compatible "no provenance check" behaviour, no EXPECTED_REVISION env). The + config invariant (ADR-001) is that B is active iff A is active — both gated by + the SAME flag — so there is never a "B without A" deadlock. Never raises. 
+ """ + try: + if not image_freshness_applies(repo): + return "" + return validated_revision(repo, branch) + except Exception as e: # noqa: BLE001 - never-raise contract + logger.warning("expected_revision error for %s/%s: %s", repo, branch, e) + return "" + + +# --------------------------------------------------------------------------- +# Pure provenance verdict (mirrors the hook's bash comparison — Strategy B) +# --------------------------------------------------------------------------- +def provenance_verdict(expected_sha: str, image_sha: str) -> tuple[bool, str]: + """Pure, deterministic provenance verdict (no I/O) — the Python mirror of the + hook's fail-closed comparison (Strategy B), unit-testable in isolation. + + Contract (AC-1 / AC-2 / AC-3, fail-closed): + * both non-empty AND equal -> ``(True, "provenance match: ")``. + * expected empty / image empty -> ``(False, "...")`` — fail-closed: a + missing expected SHA or an unlabelled image is NEVER treated as fresh. + * both non-empty but different -> ``(False, "provenance mismatch ...")``. + """ + exp = (expected_sha or "").strip() + img = (image_sha or "").strip() + if not exp: + return False, "provenance fail-closed: empty expected revision" + if not img: + return False, "provenance fail-closed: image has no revision label" + if exp == img: + return True, f"provenance match: {exp[:12]}" + return False, f"provenance mismatch: image {img[:12]} != expected {exp[:12]}" + + +def image_revision(image: str, ssh_target: str | None = None) -> str: + """Read an image's ``org.opencontainers.image.revision`` label via + ``docker image inspect``. Returns ``""`` on any error or when the label is + absent (fail-closed -> downstream treats it as a mismatch). + + ``docker`` lives on the HOST (the container ships only ``openssh-client git``), + so when ``ssh_target`` is given the inspect runs over ssh; otherwise it runs + locally (covers host-side callers and tests). Never raises (AC-8). 
+ """ + fmt = '{{ index .Config.Labels "%s" }}' % REVISION_LABEL + local_cmd = ["docker", "image", "inspect", "--format", fmt, image] + if ssh_target: + remote = "docker image inspect --format " + shlex.quote(fmt) + " " + shlex.quote(image) + cmd = ["ssh", "-o", "StrictHostKeyChecking=no", ssh_target, remote] + else: + cmd = local_cmd + try: + r = subprocess.run(cmd, capture_output=True, text=True, timeout=_INSPECT_TIMEOUT) + except (subprocess.SubprocessError, OSError) as e: + logger.warning("image_revision: inspect error for %s: %s", image, e) + return "" + if r.returncode != 0: + logger.warning("image_revision: inspect rc=%s for %s", r.returncode, image) + return "" + out = (r.stdout or "").strip() + # `docker inspect` prints "" for a missing label key. + if out in ("", ""): + return "" + return out + + +# --------------------------------------------------------------------------- +# Staging rebuild from the validated commit (Strategy A) — host-side via the hook +# --------------------------------------------------------------------------- +def _ssh_target() -> str | None: + """ssh ``user@host`` for the host rebuild, or None when no host is configured + (tests / non-self contexts that mock this away).""" + host = (settings.deploy_ssh_host or "").strip() + if not host: + return None + user = (settings.deploy_ssh_user or "").strip() + return f"{user}@{host}" if user else host + + +def _host_worktree_path(repo: str, branch: str) -> str: + """The task worktree path AS SEEN FROM THE HOST (docker build context). + + The container path uses ``settings.worktrees_dir`` (under ``repos_dir``); the + host sees the same files under ``host_repos_dir``. Derive the host path by + swapping the mount prefix (mirrors ``self_deploy.host_state_dir``). 
+ """ + from .git_worktree import get_worktree_path + + container_wt = get_worktree_path(repo, branch) + repos_dir = settings.repos_dir.rstrip("/") + host_repos_dir = settings.host_repos_dir.rstrip("/") + if container_wt.startswith(repos_dir): + return host_repos_dir + container_wt[len(repos_dir):] + return container_wt + + +def rebuild_staging_image(repo: str, branch: str, sha: str) -> tuple[bool, str]: + """Rebuild the staging image from the VALIDATED commit and recreate 8501 + (Strategy A) by invoking the host hook in ``--build-staging`` mode over ssh. + + The hook (``orchestrator-deploy-hook.sh --build-staging``) runs, on the host: + ``docker build --build-arg GIT_SHA= -t `` + -> ``docker compose --profile staging up -d --no-build orchestrator-staging`` + -> health-check 8501. Same exit-code contract (0 = ok). This trades prod for + staging ONLY (8501), NEVER prod (8500) (AC-9): all build/recreate targets are + the staging service. + + Synchronous ssh is fine here (unlike Phase B): recreating staging does not kill + the prod worker running this code. Bounded by ``_REBUILD_TIMEOUT``. + + Returns ``(True, msg)`` on a healthy rebuild, else ``(False, reason)``. + Never raises (AC-8). 
+ """ + target = _ssh_target() + if not target: + return False, "no ssh host configured for staging rebuild" + host_ctx = _host_worktree_path(repo, branch) + env_assignments = ( + f"GIT_SHA={shlex.quote(sha)} " + f"BUILD_CONTEXT={shlex.quote(host_ctx)} " + f"TARGET_IMAGE={shlex.quote(settings.deploy_prod_source_image)}" + ) + inner = ( + f"cd {shlex.quote(settings.deploy_host_repo_path)} && " + f"{env_assignments} " + f"bash {shlex.quote(settings.deploy_hook_script)} --build-staging" + ) + cmd = ["ssh", "-o", "StrictHostKeyChecking=no", target, inner] + try: + r = subprocess.run(cmd, capture_output=True, text=True, timeout=_REBUILD_TIMEOUT) + except subprocess.TimeoutExpired: + return False, f"staging rebuild timeout after {_REBUILD_TIMEOUT}s" + except (subprocess.SubprocessError, OSError) as e: + return False, f"staging rebuild ssh error: {e}" + if r.returncode != 0: + detail = ((r.stderr or "") + (r.stdout or "")).strip()[-200:] + return False, f"staging rebuild failed (rc={r.returncode}): {detail}" + logger.info("rebuild_staging_image: %s/%s rebuilt from %s and healthy", repo, branch, sha[:12]) + return True, f"staging rebuilt from {sha[:12]} and healthy" + + +# --------------------------------------------------------------------------- +# QG sub-check: check_staging_image_fresh (Strategy A liveness, AC-4/AC-6) +# --------------------------------------------------------------------------- +def check_staging_image_fresh(repo: str, work_item_id: str, branch: str) -> tuple[bool, str]: + """ORCH-058 freshness sub-gate on the ``deploy-staging -> deploy`` edge. + + Deterministic, no LLM. Mirrors ``check_branch_mergeable`` (ORCH-043): + 1. Conditionality: ``image_freshness_enabled=False`` -> ``(True, "...disabled")``; + a repo the feature is not real for -> ``(True, "image-freshness N/A for ")``. + 2. Anchor: ``sha = validated_revision(repo, branch)``. Empty -> fail-closed + ``(False, ...)`` (AC-3): we never rebuild/promote without a known commit. + 3. 
Rebuild the staging image from that commit + recreate 8501 (host hook). + Healthy -> ``(True, ...)``: the artefact we just validated is the exact one + that will be retagged to prod (AC-4, loop closed). FAIL -> ``(False, ...)`` + -> the engine rolls back to ``development`` (AC-2). + + Never-raise (AC-8): any internal error -> ``(False, "")``; an exception + never escapes into ``advance_stage``. Returns ``(True, "N/A")`` for non-self + repos so the deploy edge is unchanged for them (AC-5). + """ + try: + if not settings.image_freshness_enabled: + return True, "image-freshness disabled" + if not image_freshness_applies(repo): + return True, f"image-freshness N/A for {repo}" + + sha = validated_revision(repo, branch) + if not sha: + # Fail-closed: without the validated commit we cannot prove freshness. + return False, "cannot resolve validated revision (fail-closed)" + + ok, reason = rebuild_staging_image(repo, branch, sha) + if not ok: + return False, f"staging rebuild failed: {reason}" + return True, f"staging image fresh ({sha[:12]})" + except Exception as e: # noqa: BLE001 - never-raise contract + logger.error("check_staging_image_fresh error for %s/%s: %s", repo, branch, e) + return False, f"image-freshness error: {e}" diff --git a/src/qg/checks.py b/src/qg/checks.py index 78f5c81..ead2b95 100644 --- a/src/qg/checks.py +++ b/src/qg/checks.py @@ -702,6 +702,20 @@ def check_branch_mergeable(repo: str, work_item_id: str, branch: str) -> tuple[b return False, f"merge-gate error: {e}" +def _check_staging_image_fresh(repo: str, work_item_id: str, branch: str) -> tuple[bool, str]: + """ORCH-058 freshness sub-gate (Strategy A) on the deploy-staging -> deploy edge. + + Thin registry wrapper that delegates to ``image_freshness.check_staging_image_fresh`` + (rebuild the staging image from the validated commit + recreate 8501). 
The real + logic lives in ``src/image_freshness.py`` (leaf module, never-raise, fail-closed); + importing it lazily here avoids an import cycle (image_freshness imports + is_self_hosting_repo from this module). For non-self repos it returns + ``(True, "N/A")`` so the deploy edge is unchanged for them (AC-5). + """ + from ..image_freshness import check_staging_image_fresh + return check_staging_image_fresh(repo, work_item_id, branch) + + # Registry for dynamic lookup by name QG_CHECKS = { "check_analysis_approved": check_analysis_approved, @@ -715,4 +729,5 @@ QG_CHECKS = { "check_deploy_status": check_deploy_status, "check_staging_status": check_staging_status, "check_branch_mergeable": check_branch_mergeable, + "check_staging_image_fresh": _check_staging_image_fresh, } diff --git a/src/self_deploy.py b/src/self_deploy.py index 989679a..17a14a7 100644 --- a/src/self_deploy.py +++ b/src/self_deploy.py @@ -230,7 +230,17 @@ def build_deploy_command(repo: str, work_item_id: str | None, branch: str) -> li Build-once (BR-6): ``SOURCE_IMAGE=`` makes the hook retag the staging-validated image to the prod tag instead of rebuilding (no ``docker build``). The exit-code contract of the hook is untouched. + + Provenance guard (ORCH-058, Strategy B): when the image-freshness feature is + active for this repo, the VALIDATED commit SHA is passed as + ``EXPECTED_REVISION=`` so the hook fail-closes (``exit 1``) before + ``docker tag`` if SOURCE_IMAGE's revision label does not match — a stale image + can never be silently promoted. When inactive (non-self / kill-switch off) + ``expected_revision`` returns ``""`` and the env is omitted, keeping the hook's + backward-compatible "no provenance check" behaviour (AC-5 / AC-7). """ + from . 
import image_freshness + host_dir = host_state_dir(repo, work_item_id) result_sentinel = os.path.join(host_dir, RESULT) hook_log = os.path.join(host_dir, "hook.log") @@ -243,6 +253,9 @@ def build_deploy_command(repo: str, work_item_id: str | None, branch: str) -> li f"COMPOSE_PROFILE={shlex.quote(settings.deploy_prod_compose_profile)} " f"PREV_IMAGE_FILE={shlex.quote(settings.deploy_prod_prev_image_file)}" ) + expected_rev = image_freshness.expected_revision(repo, branch) + if expected_rev: + env_assignments += f" EXPECTED_REVISION={shlex.quote(expected_rev)}" inner = ( f"cd {shlex.quote(settings.deploy_host_repo_path)} && " f"{env_assignments} " diff --git a/src/stage_engine.py b/src/stage_engine.py index af1a3e4..c9bf7b2 100644 --- a/src/stage_engine.py +++ b/src/stage_engine.py @@ -271,6 +271,17 @@ def advance_stage( ): return result + # --- ORCH-058 freshness sub-gate (deploy-staging -> deploy edge) --- + # AFTER the merge-gate finalised the validated HEAD and BEFORE Phase A. + # Rebuilds the staging image from that validated commit + recreates 8501 + # so the artefact we validate is the exact one promoted to prod (AC-4). + # FAIL -> rollback to development (mirrors the merge-gate). Like the + # merge-gate it owns the outcome on intervention. 
+ if _handle_image_freshness( + task_id, current_stage, repo, work_item_id, branch, agent, result + ): + return result + # --- ORCH-036 Phase A: request approve before the prod deploy --------- # On the deploy-staging -> deploy edge, AFTER a green check_staging_status # and the merge-gate, the self-hosting repo does NOT auto-launch a prod @@ -878,6 +889,83 @@ def _handle_merge_gate_rollback( ) +# --------------------------------------------------------------------------- +# ORCH-058: staging-image freshness sub-gate on the deploy-staging -> deploy edge +# --------------------------------------------------------------------------- +def _handle_image_freshness( + task_id, current_stage, repo, work_item_id, branch, agent, result: AdvanceResult +) -> bool: + """Run check_staging_image_fresh on the deploy-staging -> deploy edge (ORCH-058). + + Runs AFTER the merge-gate (validated HEAD finalised) and BEFORE Phase A. The + sub-check rebuilds the staging image from the validated commit + recreates 8501; + a green result means the artefact we validate is the exact one that will be + BUILD-ONCE retagged to prod (AC-4). + + Returns True if the gate INTERVENED (the caller must return without advancing): + * FAIL (stale / rebuild error / fail-closed) -> ROLLBACK to development + (+ developer retry, capped by MAX_DEVELOPER_RETRIES) and RELEASE the merge + lease (the merge-gate held it on its PASS). Mirrors the merge-gate rollback. + Returns False when the gate PASSED (fresh, or N/A for a non-self repo) so + advance_stage proceeds to Phase A. On a PASS the merge lease stays HELD until + the actual merge (released on PR-merged webhook / deploy->done / rollback). 
+ """ + passed, reason = _run_qg("check_staging_image_fresh", repo, work_item_id, branch) + if passed: + logger.info(f"Task {task_id}: image-freshness passed ({reason})") + return False + + result.qg_name = "check_staging_image_fresh" + result.qg_passed = False + result.qg_reason = reason + + update_task_stage(task_id, "development") + notify_stage_change(task_id, current_stage, "development") + plane_notify_stage(work_item_id, current_stage, "development") + result.rolled_back_to = "development" + set_issue_in_progress(work_item_id) + # The merge-gate held the lease on its PASS; freshness failed before the merge, + # so release it (holder-aware no-op if a different task already owns it). + try: + merge_gate.release_merge_lease(repo, branch) + except Exception as e: # noqa: BLE001 - defensive + logger.warning(f"Task {task_id}: merge-lease release on image-freshness fail failed: {e}") + notify_qg_failure(task_id, current_stage, "check_staging_image_fresh", reason) + plane_add_comment( + work_item_id, + f"❌ Staging-образ не свеж ({reason}). Откат на development. " + f"Developer нужен для фикса.", + author="deployer", + ) + retry_count = _developer_retry_count(task_id) + if retry_count < MAX_DEVELOPER_RETRIES: + task_desc = ( + f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n" + f"Stage: development\nNote: Staging image freshness failed " + f"(attempt {retry_count + 1}/{MAX_DEVELOPER_RETRIES}). " + f"Причина: {reason}." + ) + new_job = enqueue_job("developer", repo, task_desc, task_id=task_id) + result.enqueued_agent = "developer" + result.enqueued_job_id = new_job + logger.info( + f"Task {task_id}: image-freshness FAILED, enqueued developer (job_id={new_job})" + ) + else: + set_issue_blocked(work_item_id) + send_telegram( + f"\U0001f6a8 {work_item_id}: Staging image freshness still failing after " + f"{MAX_DEVELOPER_RETRIES} developer retries ({reason}). " + f"Manual intervention needed." 
+ ) + result.alerted = True + logger.error( + f"Task {task_id}: image-freshness FAILED, rolled back deploy-staging -> " + f"development ({reason})" + ) + return True + + # --------------------------------------------------------------------------- # ORCH-036: executable self-deploy (Phase A/B/C) # --------------------------------------------------------------------------- diff --git a/tests/test_config.py b/tests/test_config.py index 012de48..b751be4 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -115,3 +115,30 @@ def test_reconcile_settings_env_override(monkeypatch): assert s.reconcile_grace_default_s == 900 assert s.reconcile_grace_overrides_json == '{"development": 300}' assert s.reconcile_notify_unblock is False + + +# --------------------------------------------------------------------------- +# ORCH-058 / TC-13: image-freshness settings defaults + env override. +# --------------------------------------------------------------------------- +_FRESH_ENV = ( + "ORCH_IMAGE_FRESHNESS_ENABLED", + "ORCH_IMAGE_FRESHNESS_REPOS", +) + + +def test_image_freshness_settings_defaults(monkeypatch): + """TC-13 / AC-9: kill-switch ON by default, empty CSV (self-hosting only).""" + for name in _FRESH_ENV: + monkeypatch.delenv(name, raising=False) + s = Settings() + assert s.image_freshness_enabled is True + assert s.image_freshness_repos == "" + + +def test_image_freshness_settings_env_override(monkeypatch): + """TC-13 / AC-9: each field is read from its ORCH_* env var.""" + monkeypatch.setenv("ORCH_IMAGE_FRESHNESS_ENABLED", "false") + monkeypatch.setenv("ORCH_IMAGE_FRESHNESS_REPOS", "orchestrator,enduro-trails") + s = Settings() + assert s.image_freshness_enabled is False + assert s.image_freshness_repos == "orchestrator,enduro-trails" diff --git a/tests/test_deploy_approve.py b/tests/test_deploy_approve.py index ee91ebd..146a8e4 100644 --- a/tests/test_deploy_approve.py +++ b/tests/test_deploy_approve.py @@ -101,7 +101,8 @@ def 
test_tc05_no_approve_does_not_call_prod_hook(monkeypatch): stage_engine, "QG_CHECKS", {**stage_engine.QG_CHECKS, "check_staging_status": _pass, - "check_branch_mergeable": _pass}, + "check_branch_mergeable": _pass, + "check_staging_image_fresh": _pass}, ) # Spy: the deploy launcher must never run on the staging->deploy edge. initiate = MagicMock() diff --git a/tests/test_deploy_build_once.py b/tests/test_deploy_build_once.py index 1d797a0..a9de36c 100644 --- a/tests/test_deploy_build_once.py +++ b/tests/test_deploy_build_once.py @@ -39,9 +39,56 @@ def test_tc14_hook_retag_branch_present(): assert 'SOURCE_IMAGE="${SOURCE_IMAGE:-}"' in text # Build-once retag branch present; the hook never runs `docker build`. assert 'docker tag "$SOURCE_IMAGE" "$TARGET_IMAGE"' in text - # No EXECUTABLE `docker build` line (comments mentioning it are fine). + # No EXECUTABLE `docker build` line on the PROD path (comments are fine). + # ORCH-058: the only build allowed is the staging-freshness rebuild, + # which is explicitly tagged with `--build-arg GIT_SHA` (Strategy A). + # Executable lines only: drop comments and `log "..."` strings that merely + # mention "docker build" in human-readable diagnostics. 
exec_lines = [ ln.strip() for ln in text.splitlines() - if ln.strip() and not ln.strip().startswith("#") + if ln.strip() + and not ln.strip().startswith("#") + and not ln.strip().startswith("log ") ] - assert not any("docker build" in ln for ln in exec_lines) + for ln in exec_lines: + if "docker build" in ln: + assert "--build-arg GIT_SHA" in ln, ( + f"unexpected docker build on prod retag path: {ln}" + ) + + +# --------------------------------------------------------------------------- +# ORCH-058 TC-06: build_deploy_command threads EXPECTED_REVISION (Strategy B) +# --------------------------------------------------------------------------- +def test_tc06_deploy_command_passes_expected_revision(monkeypatch): + """When image-freshness is active, the prod hook receives EXPECTED_REVISION.""" + from src import image_freshness + monkeypatch.setattr(self_deploy.settings, "deploy_ssh_user", "slin") + monkeypatch.setattr(self_deploy.settings, "deploy_ssh_host", "mva154") + monkeypatch.setattr( + self_deploy.settings, "deploy_prod_source_image", "orchestrator-orchestrator-staging" + ) + monkeypatch.setattr( + image_freshness, "expected_revision", lambda repo, branch: "abc123def456" + ) + + cmd = self_deploy.build_deploy_command("orchestrator", "ORCH-058", "feature/ORCH-058-x") + remote = cmd[-1] + + assert "EXPECTED_REVISION=abc123def456" in remote + + +def test_tc06_no_expected_revision_when_inactive(monkeypatch): + """When image-freshness resolves to no SHA, EXPECTED_REVISION is omitted.""" + from src import image_freshness + monkeypatch.setattr(self_deploy.settings, "deploy_ssh_user", "slin") + monkeypatch.setattr(self_deploy.settings, "deploy_ssh_host", "mva154") + monkeypatch.setattr( + self_deploy.settings, "deploy_prod_source_image", "orchestrator-orchestrator-staging" + ) + monkeypatch.setattr(image_freshness, "expected_revision", lambda repo, branch: "") + + cmd = self_deploy.build_deploy_command("orchestrator", "ORCH-058", "feature/ORCH-058-x") + remote = cmd[-1] 
+ + assert "EXPECTED_REVISION=" not in remote diff --git a/tests/test_deploy_hook_mapping.py b/tests/test_deploy_hook_mapping.py index e40d806..122a626 100644 --- a/tests/test_deploy_hook_mapping.py +++ b/tests/test_deploy_hook_mapping.py @@ -27,6 +27,12 @@ def test_tc03_exit2_rollback_also_failed_maps_to_failed(): assert map_exit_code_to_status(2) == "FAILED" +def test_tc09_provenance_fail_closed_exit1_maps_to_failed(): + """ORCH-058 TC-09: the Strategy-B hook fail-close uses `exit 1`; that must map + to FAILED so the existing БАГ-8 rollback path triggers (prod never left stale).""" + assert map_exit_code_to_status(1) == "FAILED" + + def test_other_exit_codes_map_to_failed(): for code in (3, 127, 255, -1): assert map_exit_code_to_status(code) == "FAILED" diff --git a/tests/test_deploy_hook_provenance.py b/tests/test_deploy_hook_provenance.py new file mode 100644 index 0000000..02c048b --- /dev/null +++ b/tests/test_deploy_hook_provenance.py @@ -0,0 +1,58 @@ +"""ORCH-058 TC-07/08: static guarantees of the Strategy-B provenance plumbing. + +These assert the *shape* of the deploy artefacts that can't be unit-tested by +running them (they shell out to docker/ssh on the host): + + * TC-07 — the deploy hook fail-closes BEFORE `docker tag` when the staging + image's git-revision label != EXPECTED_REVISION (exit 1), and the + new `--build-staging` rebuild mode stamps GIT_SHA into the image. + * TC-08 — the Dockerfile declares `ARG GIT_SHA` and stamps it into the + `org.opencontainers.image.revision` OCI label (the anchor B reads). 
+""" + +import pathlib + +_ROOT = pathlib.Path(__file__).resolve().parents[1] +_HOOK = _ROOT / "scripts" / "orchestrator-deploy-hook.sh" +_DOCKERFILE = _ROOT / "Dockerfile" + + +# --------------------------------------------------------------------------- +# TC-07: hook fail-closed provenance guard + --build-staging rebuild mode +# --------------------------------------------------------------------------- +def test_tc07_hook_has_fail_closed_provenance_guard(): + text = _HOOK.read_text(encoding="utf-8") + # The label key the hook inspects must be the OCI revision label. + assert 'REVISION_LABEL="org.opencontainers.image.revision"' in text + # EXPECTED_REVISION is read (default unset -> backward compatible). + assert 'EXPECTED_REVISION="${EXPECTED_REVISION:-}"' in text + # The guard must inspect the source image's label and normalise . + assert "docker image inspect --format" in text + assert '""' in text + # Fail-closed: empty OR mismatch -> abort with exit 1. + assert '-z "$IMG_REV" || "$IMG_REV" != "$EXPECTED_REVISION"' in text + + +def test_tc07_provenance_guard_precedes_docker_tag(): + """The fail-closed `exit 1` must sit BEFORE the `docker tag` retag line.""" + text = _HOOK.read_text(encoding="utf-8") + guard = text.index("$EXPECTED_REVISION") + retag = text.index('docker tag "$SOURCE_IMAGE" "$TARGET_IMAGE"') + assert guard < retag, "provenance guard must run before the prod retag" + + +def test_tc07_build_staging_mode_stamps_git_sha(): + text = _HOOK.read_text(encoding="utf-8") + # The new Strategy-A rebuild mode exists and is keyed on --build-staging. + assert '"${1:-}" == "--build-staging"' in text + # It rebuilds the staging image stamping the validated commit as a build-arg. 
+ assert 'docker build --build-arg GIT_SHA="$GIT_SHA"' in text + + +# --------------------------------------------------------------------------- +# TC-08: Dockerfile stamps the OCI revision label from a build-arg +# --------------------------------------------------------------------------- +def test_tc08_dockerfile_stamps_revision_label(): + text = _DOCKERFILE.read_text(encoding="utf-8") + assert "ARG GIT_SHA" in text + assert "LABEL org.opencontainers.image.revision=$GIT_SHA" in text diff --git a/tests/test_image_freshness.py b/tests/test_image_freshness.py new file mode 100644 index 0000000..6fef54c --- /dev/null +++ b/tests/test_image_freshness.py @@ -0,0 +1,171 @@ +"""ORCH-058 TC-01..05: staging-image provenance helpers (src/image_freshness.py). + +Covers the INV-FRESH building blocks in isolation: + * TC-01/02/03 — the PURE provenance verdict (match / mismatch / fail-closed). + * TC-04 — never-raise: docker/ssh/git errors -> safe verdict, no exception. + * TC-05 — conditionality: non-self repo = no-op (N/A); self repo = real. 
+""" + +import os +import subprocess + +os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token") +os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") + +from src import image_freshness as imf # noqa: E402 + + +# --------------------------------------------------------------------------- +# TC-01: matching revisions -> fresh (PASS) +# --------------------------------------------------------------------------- +def test_tc01_matching_revisions_are_fresh(): + ok, reason = imf.provenance_verdict("abc123def456", "abc123def456") + assert ok is True + assert "match" in reason.lower() + + +# --------------------------------------------------------------------------- +# TC-02: differing revisions -> NOT fresh (input for fail-fast) +# --------------------------------------------------------------------------- +def test_tc02_differing_revisions_are_not_fresh(): + ok, reason = imf.provenance_verdict("aaaaaaaaaaaa", "bbbbbbbbbbbb") + assert ok is False + assert "mismatch" in reason.lower() + + +# --------------------------------------------------------------------------- +# TC-03: fail-closed — empty label OR empty expected -> never "fresh by default" +# --------------------------------------------------------------------------- +def test_tc03_empty_image_label_fails_closed(): + ok, reason = imf.provenance_verdict("abc123", "") + assert ok is False + assert "fail-closed" in reason.lower() + + +def test_tc03_empty_expected_revision_fails_closed(): + ok, reason = imf.provenance_verdict("", "abc123") + assert ok is False + assert "fail-closed" in reason.lower() + + +def test_tc03_both_empty_fails_closed(): + ok, _ = imf.provenance_verdict("", "") + assert ok is False + + +# --------------------------------------------------------------------------- +# TC-04: never-raise on docker/ssh/inspect/git errors -> safe verdict +# --------------------------------------------------------------------------- +def test_tc04_image_revision_inspect_error_returns_empty(monkeypatch): + def 
_boom(*a, **k): + raise OSError("docker not found") + monkeypatch.setattr(imf.subprocess, "run", _boom) + # Never raises; fail-closed empty -> downstream provenance mismatch. + assert imf.image_revision("orchestrator-orchestrator-staging") == "" + + +def test_tc04_image_revision_nonzero_rc_returns_empty(monkeypatch): + monkeypatch.setattr( + imf.subprocess, "run", + lambda *a, **k: subprocess.CompletedProcess(a, 1, stdout="", stderr="no such image"), + ) + assert imf.image_revision("missing-image") == "" + + +def test_tc04_image_revision_no_value_label_returns_empty(monkeypatch): + # `docker inspect` prints "" when the label key is absent. + monkeypatch.setattr( + imf.subprocess, "run", + lambda *a, **k: subprocess.CompletedProcess(a, 0, stdout="\n", stderr=""), + ) + assert imf.image_revision("unlabelled-image") == "" + + +def test_tc04_validated_revision_missing_worktree_returns_empty(monkeypatch, tmp_path): + # No worktree on disk -> fail-closed empty SHA, never raises. + monkeypatch.setattr(imf.settings, "worktrees_dir", str(tmp_path / "nope")) + monkeypatch.setattr(imf.settings, "repos_dir", str(tmp_path / "nope")) + assert imf.validated_revision("orchestrator", "feature/ORCH-058-x") == "" + + +def test_tc04_check_staging_image_fresh_never_raises(monkeypatch): + # Self repo + enabled, but rebuild blows up -> caught -> safe (False) verdict. 
+ monkeypatch.setattr(imf.settings, "image_freshness_enabled", True) + monkeypatch.setattr(imf.settings, "image_freshness_repos", "") + monkeypatch.setattr(imf, "validated_revision", lambda r, b: "deadbeef") + + def _boom(*a, **k): + raise RuntimeError("ssh exploded") + monkeypatch.setattr(imf, "rebuild_staging_image", _boom) + ok, reason = imf.check_staging_image_fresh("orchestrator", "ORCH-058", "feature/ORCH-058-x") + assert ok is False + assert "error" in reason.lower() + + +# --------------------------------------------------------------------------- +# TC-05: conditionality (self-hosting only) +# --------------------------------------------------------------------------- +def test_tc05_applies_only_to_self_hosting_by_default(monkeypatch): + monkeypatch.setattr(imf.settings, "image_freshness_enabled", True) + monkeypatch.setattr(imf.settings, "image_freshness_repos", "") + assert imf.image_freshness_applies("orchestrator") is True + assert imf.image_freshness_applies("enduro-trails") is False + + +def test_tc05_applies_respects_repos_csv(monkeypatch): + monkeypatch.setattr(imf.settings, "image_freshness_enabled", True) + monkeypatch.setattr(imf.settings, "image_freshness_repos", "enduro-trails") + assert imf.image_freshness_applies("enduro-trails") is True + # CSV is authoritative: orchestrator not listed -> not real. 
+ assert imf.image_freshness_applies("orchestrator") is False + + +def test_tc05_kill_switch_disables_for_everyone(monkeypatch): + monkeypatch.setattr(imf.settings, "image_freshness_enabled", False) + monkeypatch.setattr(imf.settings, "image_freshness_repos", "") + assert imf.image_freshness_applies("orchestrator") is False + + +def test_tc05_check_is_noop_for_non_self_repo(monkeypatch): + monkeypatch.setattr(imf.settings, "image_freshness_enabled", True) + monkeypatch.setattr(imf.settings, "image_freshness_repos", "") + ok, reason = imf.check_staging_image_fresh("enduro-trails", "ET-001", "feature/ET-001-x") + assert ok is True + assert "N/A" in reason + + +def test_tc05_check_disabled_is_pass(monkeypatch): + monkeypatch.setattr(imf.settings, "image_freshness_enabled", False) + ok, reason = imf.check_staging_image_fresh("orchestrator", "ORCH-058", "feature/ORCH-058-x") + assert ok is True + assert "disabled" in reason.lower() + + +def test_tc05_check_real_for_self_repo_rebuilds(monkeypatch): + # Self repo + enabled: validated commit resolved + rebuild OK -> fresh PASS. 
+ monkeypatch.setattr(imf.settings, "image_freshness_enabled", True) + monkeypatch.setattr(imf.settings, "image_freshness_repos", "") + monkeypatch.setattr(imf, "validated_revision", lambda r, b: "abc123def456") + monkeypatch.setattr(imf, "rebuild_staging_image", lambda r, b, s: (True, "healthy")) + ok, reason = imf.check_staging_image_fresh("orchestrator", "ORCH-058", "feature/ORCH-058-x") + assert ok is True + assert "abc123def456"[:12] in reason + + +def test_tc05_check_fail_closed_when_no_validated_revision(monkeypatch): + monkeypatch.setattr(imf.settings, "image_freshness_enabled", True) + monkeypatch.setattr(imf.settings, "image_freshness_repos", "") + monkeypatch.setattr(imf, "validated_revision", lambda r, b: "") + ok, reason = imf.check_staging_image_fresh("orchestrator", "ORCH-058", "feature/ORCH-058-x") + assert ok is False + assert "fail-closed" in reason.lower() + + +def test_tc05_check_fails_when_rebuild_fails(monkeypatch): + monkeypatch.setattr(imf.settings, "image_freshness_enabled", True) + monkeypatch.setattr(imf.settings, "image_freshness_repos", "") + monkeypatch.setattr(imf, "validated_revision", lambda r, b: "abc123def456") + monkeypatch.setattr(imf, "rebuild_staging_image", lambda r, b, s: (False, "build error")) + ok, reason = imf.check_staging_image_fresh("orchestrator", "ORCH-058", "feature/ORCH-058-x") + assert ok is False + assert "rebuild failed" in reason.lower() diff --git a/tests/test_qg_registry_snapshot.py b/tests/test_qg_registry_snapshot.py index 71ee2d0..5270bbc 100644 --- a/tests/test_qg_registry_snapshot.py +++ b/tests/test_qg_registry_snapshot.py @@ -29,6 +29,7 @@ _EXPECTED_QGS = { "check_deploy_status", "check_staging_status", "check_branch_mergeable", # ORCH-043 merge-gate (deploy-staging -> deploy edge) + "check_staging_image_fresh", # ORCH-058 image-freshness sub-gate (same edge) } diff --git a/tests/test_stage_engine.py b/tests/test_stage_engine.py index 89229b5..f229141 100644 --- a/tests/test_stage_engine.py +++ 
b/tests/test_stage_engine.py @@ -832,7 +832,8 @@ class TestMergeGate: stage_engine, "QG_CHECKS", {**stage_engine.QG_CHECKS, "check_staging_status": _pass, - "check_branch_mergeable": _pass}, + "check_branch_mergeable": _pass, + "check_staging_image_fresh": _pass}, ) task_id = _make_task("deploy-staging", repo="orchestrator", wi="ORCH-043", branch="feature/ORCH-043-x") @@ -992,6 +993,114 @@ class TestMergeGate: assert _stage(task_id) == "deploy" +class TestImageFreshnessGate: + """ORCH-058 TC-10/11: the image-freshness sub-gate on the deploy-staging -> + deploy edge. It runs AFTER staging-status + merge-gate, BEFORE Phase A.""" + + def _jobs_full(self): + conn = get_db() + rows = conn.execute( + "SELECT agent, task_content FROM jobs ORDER BY id" + ).fetchall() + conn.close() + return [dict(r) for r in rows] + + def test_tc10_stale_image_fails_fast_and_rolls_back(self, monkeypatch): + """TC-10 / AC-1/AC-4: staging status + merge-gate green but the staging + image is STALE -> fail-fast: rollback to development, developer re-queued, + prod NEVER reached (no advance to deploy).""" + monkeypatch.setattr(stage_engine.settings, "deploy_require_manual_approve", False) + monkeypatch.setattr( + stage_engine, "QG_CHECKS", + {**stage_engine.QG_CHECKS, + "check_staging_status": _pass, + "check_branch_mergeable": _pass, + "check_staging_image_fresh": _fail( + "staging rebuild failed: health FAILED")}, + ) + task_id = _make_task("deploy-staging", repo="orchestrator", wi="ORCH-058", + branch="feature/ORCH-058-x") + res = advance_stage( + task_id, "deploy-staging", "orchestrator", "ORCH-058", + "feature/ORCH-058-x", finished_agent="deployer", + ) + assert res.advanced is False + assert res.rolled_back_to == "development" + assert _stage(task_id) == "development" # never reached deploy + jobs = self._jobs_full() + assert len(jobs) == 1 + assert jobs[0]["agent"] == "developer" + # The rollback task_desc carries the freshness reason for the developer. 
+ assert "staging rebuild failed" in jobs[0]["task_content"] + + def test_tc10_stale_rollback_respects_max_retries(self, monkeypatch): + """AC-1: image-freshness rollback is capped by MAX_DEVELOPER_RETRIES — + 4th attempt -> block + alert, no new developer job.""" + monkeypatch.setattr(stage_engine.settings, "deploy_require_manual_approve", False) + monkeypatch.setattr( + stage_engine, "QG_CHECKS", + {**stage_engine.QG_CHECKS, + "check_staging_status": _pass, + "check_branch_mergeable": _pass, + "check_staging_image_fresh": _fail("provenance mismatch")}, + ) + task_id = _make_task("deploy-staging", repo="orchestrator", wi="ORCH-058", + branch="feature/ORCH-058-x") + _add_developer_runs(task_id, 3) # already at the cap + res = advance_stage( + task_id, "deploy-staging", "orchestrator", "ORCH-058", + "feature/ORCH-058-x", finished_agent="deployer", + ) + assert res.rolled_back_to == "development" + assert stage_engine.set_issue_blocked.called + assert stage_engine.send_telegram.called + assert _jobs() == [] # no developer job past the cap + + def test_tc11_fresh_image_advances_to_deploy(self, monkeypatch): + """TC-11 / AC-1/AC-4: all three sub-checks green -> advance to deploy, + deployer enqueued, NO rollback (happy path).""" + monkeypatch.setattr(stage_engine.settings, "deploy_require_manual_approve", False) + monkeypatch.setattr( + stage_engine, "QG_CHECKS", + {**stage_engine.QG_CHECKS, + "check_staging_status": _pass, + "check_branch_mergeable": _pass, + "check_staging_image_fresh": _pass}, + ) + task_id = _make_task("deploy-staging", repo="orchestrator", wi="ORCH-058", + branch="feature/ORCH-058-x") + res = advance_stage( + task_id, "deploy-staging", "orchestrator", "ORCH-058", + "feature/ORCH-058-x", finished_agent="deployer", + ) + assert res.advanced is True + assert res.to_stage == "deploy" + assert _stage(task_id) == "deploy" + assert res.rolled_back_to is None + jobs = _jobs() + assert len(jobs) == 1 + assert jobs[0]["agent"] == "deployer" + + def 
test_tc11_non_self_repo_skips_freshness_gate(self, monkeypatch): + """Regression: for a non-self repo the REAL freshness gate is a no-op + (N/A), so deploy-staging -> deploy advances exactly as before ORCH-058.""" + monkeypatch.setattr(stage_engine.settings, "deploy_require_manual_approve", False) + monkeypatch.setattr( + stage_engine, "QG_CHECKS", + {**stage_engine.QG_CHECKS, + "check_staging_status": _pass, + "check_branch_mergeable": _pass}, + ) # check_staging_image_fresh left REAL -> N/A for enduro-trails + task_id = _make_task("deploy-staging", repo="enduro-trails", wi="ET-099", + branch="feature/ET-099-x") + res = advance_stage( + task_id, "deploy-staging", "enduro-trails", "ET-099", + "feature/ET-099-x", finished_agent="deployer", + ) + assert res.advanced is True + assert _stage(task_id) == "deploy" + + class TestDelegation: def test_launcher_calls_engine(self): from src.agents.launcher import AgentLauncher