From 6ddff5583db64ad952a3b54062408de83d676e44 Mon Sep 17 00:00:00 2001 From: claude-bot Date: Sun, 7 Jun 2026 09:24:38 +0000 Subject: [PATCH] fix(ORCH-058): parametrize staging_check in --build-staging + explicit staging target Round-3 review follow-up on c53d625 (P1/P2): - P1: --build-staging now runs staging_check via parametrized STAGING_CONTAINER / STAGING_CHECK_PATH / STAGING_CHECK_MODE (default orchestrator-staging / bind-mount path / stub) instead of hardcoding $TARGET_SERVICE + the script path. docker exec runs INSIDE the staging container (ORCH-048 canonical: B6 registry isolation), after health, before exit 0. Fail-closed: any non-zero -> exit 1. STAGING only (8501). - P2a: rebuild_staging_image now passes the STAGING target EXPLICITLY (TARGET_SERVICE/TARGET_PORT/COMPOSE_PROFILE/STAGING_CONTAINER) so the self-rebuild can never drift onto prod 8500 if hook defaults change (AC-9). - P2b: TC-09 caller<->hook contract tests assert the ssh command carries GIT_SHA + BUILD_CONTEXT + the staging target and never the prod 8500 one; no-ssh-host fails closed. - P3: consolidated the three duplicate README footers into one. - Docs (golden source): DEPLOY_HOOK.md step 4 + env rows, README footer, CHANGELOG, Dockerfile ARG GIT_SHA="" comment, .env.example freshness block. Validates exactly the artefact later BUILD-ONCE retagged to prod (AC-4, ADR-001 step 3). 632 tests pass, ruff clean, bash -n OK. 
Refs: ORCH-058 Co-Authored-By: Claude Opus 4.7 --- .env.example | 26 ++--- CHANGELOG.md | 2 +- Dockerfile | 12 +- docs/architecture/README.md | 4 +- docs/operations/DEPLOY_HOOK.md | 8 +- scripts/orchestrator-deploy-hook.sh | 100 ++++++++-------- src/image_freshness.py | 52 ++++++--- tests/test_deploy_hook_provenance.py | 168 ++++++++++++++++----------- 8 files changed, 210 insertions(+), 162 deletions(-) diff --git a/.env.example b/.env.example index 0a79306..a5f54b3 100644 --- a/.env.example +++ b/.env.example @@ -72,6 +72,19 @@ ORCH_DEPLOY_PROD_TARGET_IMAGE=orchestrator-orchestrator ORCH_DEPLOY_PROD_COMPOSE_PROFILE= ORCH_DEPLOY_PROD_PREV_IMAGE_FILE=.deploy-prev-image-prod +# ORCH-058: staging-image provenance before the BUILD-ONCE prod retag (INV-FRESH). +# Guarantees the staging image promoted to prod is the EXACT artefact rebuilt from the +# validated commit — two layers, self-hosting only: +# A (liveness): QG sub-check `check_staging_image_fresh` on the deploy-staging->deploy +# edge rebuilds orchestrator-orchestrator-staging from the validated commit + recreates +# 8501; FAIL -> rollback to development. (builds/recreate STAGING only, never prod.) +# B (safety): the Dockerfile stamps `org.opencontainers.image.revision`; the prod hook +# fail-closes (exit 1) before `docker tag` if SOURCE_IMAGE's label != EXPECTED_REVISION. +# ENABLED -> single kill-switch for A+B as a WHOLE (never "B without A"); false -> legacy. +# REPOS -> CSV of repos where the gate is REAL; empty -> only self-hosting (orchestrator). +ORCH_IMAGE_FRESHNESS_ENABLED=true +ORCH_IMAGE_FRESHNESS_REPOS= + # ORCH-053: stuck-task reconciler (sweeper for lost webhooks). 
A background daemon # replays a missed stage transition through the SAME gates/handlers a webhook would, # fixing tasks that got stuck on a dropped event (502 on rebuild, no Plane/Gitea @@ -88,16 +101,3 @@ ORCH_RECONCILE_INTERVAL_S=120 ORCH_RECONCILE_GRACE_DEFAULT_S=600 ORCH_RECONCILE_GRACE_OVERRIDES_JSON= ORCH_RECONCILE_NOTIFY_UNBLOCK=true - -# ORCH-058: staging-image provenance before the BUILD-ONCE retag to prod. Closes the -# "silent stale promote" bug (LESSONS_ORCH-036 §4): retag promoted the staging image -# to prod without proving it was built from the validated commit. Two layers (A+B), -# self-hosting only, gated as a WHOLE by a single switch (no "B without A" deadlock): -# A (liveness) -> QG sub-check check_staging_image_fresh rebuilds the staging image -# from the validated commit on the deploy-staging->deploy edge (after merge-gate). -# B (safety) -> deploy-hook fail-closes (exit 1) before `docker tag` if SOURCE_IMAGE -# OCI revision label != EXPECTED_REVISION (the validated SHA). -# ENABLED -> single kill-switch for the WHOLE feature; false -> legacy build-once. -# REPOS -> CSV of repos where the feature is REAL; empty -> only self-hosting. -ORCH_IMAGE_FRESHNESS_ENABLED=true -ORCH_IMAGE_FRESHNESS_REPOS= diff --git a/CHANGELOG.md b/CHANGELOG.md index 4520589..a8830f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ ## [Unreleased] ### Added -- **Провенанс staging-образа перед BUILD-ONCE retag в прод (защита от тихого регресса)** (ORCH-058): закрывает самый опасный баг self-deploy (LESSONS_ORCH-036 §4) — BUILD-ONCE retag (ORCH-36) промоутил `SOURCE_IMAGE=orchestrator-orchestrator-staging` в прод **без rebuild**, полагаясь на «staging-образ свеж», но конвейер нигде не пересобирал его из провалидированного коммита → деплой мог «зелёным» (health-ok) тихо откатить прод на устаревший код. 
Обеспечивается инвариант `INV-FRESH` (образ, уезжающий в прод, доказуемо собран из коммита, провалидированного `deploy-staging`) **двумя слоями (defense in depth)**, только для self-hosting, с ЕДИНЫМ якорем `validated_revision` (= `git rev-parse HEAD` worktree ПОСЛЕ merge-gate rebase), питающим оба слоя — они не могут разойтись. Новый leaf-модуль `src/image_freshness.py` (контракт «never-raise», fail-closed на любом сомнении): `validated_revision` / `expected_revision` (якорь и проброс ожидаемого SHA), `provenance_verdict` (чистая verdict-логика — Python-зеркало bash-сравнения хука, unit-тестируемая), `image_revision` (читает OCI-лейбл через `docker image inspect`, локально или по ssh), `rebuild_staging_image` (пересборка через хост-хук), `check_staging_image_fresh` (под-чек). **Слой A — liveness (пересборка):** детерминированный QG-под-чек `check_staging_image_fresh` (зарегистрирован в `QG_CHECKS` через тонкую обёртку `_check_staging_image_fresh` в `src/qg/checks.py`; добавлен в snapshot-тест реестра) на ребре `deploy-staging → deploy` ПОСЛЕ merge-gate и ДО Phase A пересобирает `orchestrator-orchestrator-staging` из провалидированного коммита (хост-хук в режиме `--build-staging`, `--build-arg GIT_SHA=`), пересоздаёт контейнер 8501 и прогоняет health → валидируем и промоутим ОДИН артефакт (loop closed, AC-4). FAIL → откат на `development` (как merge-gate, БАГ-8); вызов через `_handle_image_freshness` в `src/stage_engine.py`. **Слой B — safety (fail-closed guard):** `src/self_deploy.py::build_deploy_command` пробрасывает `EXPECTED_REVISION` = `image_freshness.expected_revision(...)`; хост-хук `scripts/orchestrator-deploy-hook.sh` шагом ПЕРЕД `docker tag` сверяет лейбл `org.opencontainers.image.revision` у `SOURCE_IMAGE` с `EXPECTED_REVISION` (нормализует ``): несовпадение / пустой лейбл / пустой ожидаемый SHA / ошибка inspect → `log` + `exit 1` → `map_exit_code_to_status` → FAILED → БАГ-8 откат, прод НЕ трогается. 
Делает тихий промоут устаревшего образа структурно невозможным даже если A отключён/проиграл гонку. `Dockerfile`: `ARG GIT_SHA` + `LABEL org.opencontainers.image.revision=$GIT_SHA` (машиночитаемое происхождение образа). Единый kill-switch `image_freshness_enabled` включает A+B **как целое** (нет «B без A» = вечного fail-fast, BR-5/R-1); `image_freshness_repos` (CSV, пусто → только self-hosting `orchestrator`, как ORCH-35) — поэтапный раскат. Self-hosting safety: все сборки/recreate — ТОЛЬКО staging 8501, НИКОГДА прод 8500; `main` не пушится; при незаданном `EXPECTED_REVISION` хук сохраняет прежнее поведение (обратная совместимость для не-self репо). Контракты НЕ менялись: `STAGE_TRANSITIONS`, exit-code-контракт хука (0/1/2), `map_exit_code_to_status`, `check_deploy_status`/`_parse_deploy_status`, merge-gate (ORCH-43), БАГ-8; схема БД без миграций (под-гейт ребра + лейбл образа). Новые настройки: `ORCH_IMAGE_FRESHNESS_ENABLED` (true), `ORCH_IMAGE_FRESHNESS_REPOS` (""). ADR `docs/work-items/ORCH-058/06-adr/ADR-001-staging-image-provenance.md`, глобальный `docs/architecture/adr/adr-0008-staging-image-provenance.md`. Документация: `docs/architecture/README.md`, `docs/operations/DEPLOY_HOOK.md`, `docs/operations/STAGING.md`, `docs/operations/INFRA.md`. Тесты: `tests/test_image_freshness.py`, `tests/test_deploy_hook_provenance.py`, `tests/test_qg_registry_snapshot.py`, `tests/test_stage_engine.py`, `tests/test_deploy_build_once.py`, `tests/test_deploy_hook_mapping.py`, `tests/test_config.py`. +- **Провенанс staging-образа перед BUILD-ONCE retag в прод (свежесть артефакта, INV-FRESH)** (ORCH-058): BUILD-ONCE retag (ORCH-036) промоутит staging-образ (`orchestrator-orchestrator-staging`) в прод **без rebuild**, полагаясь на «образ свеж и провалидирован» — гарантии не было: конвейер нигде не пересобирал staging-образ из провалидированного коммита, поэтому retag мог тихо промоутнуть УСТАРЕВШИЙ образ (инцидент LESSONS_ORCH-036 п.4 — зелёный деплой молча откатывал прод). 
Закрыто **двумя слоями (defense in depth), только для self-hosting**. Новый модуль `src/image_freshness.py` (контракт «never raise», по образцу `merge_gate`): `provenance_verdict` (чистая функция вердикта match/mismatch/fail-closed), `validated_revision` (`git rev-parse HEAD` в worktree валидированного коммита — единый якорь и для штампа A, и для `EXPECTED_REVISION` B), `image_revision` (OCI-лейбл `org.opencontainers.image.revision` через `docker image inspect`, ``/ошибка → пусто), `rebuild_staging_image` (ssh-хук `--build-staging`), `image_freshness_applies` (условность), `check_staging_image_fresh` (композитный QG). **Strategy A (liveness):** новый детерминированный QG-под-чек `check_staging_image_fresh` (зарегистрирован в `QG_CHECKS`, `src/qg/checks.py`) на ребре `deploy-staging → deploy` ПОСЛЕ merge-gate и ДО Phase A — пересобирает staging-образ из worktree валидированного коммита (хук `--build-staging`, `--build-arg GIT_SHA=`), пересоздаёт 8501 и прогоняет `staging_check.py --mode stub` против свежего 8501 (health + e2e, внутри staging-контейнера через `docker exec` — канон ORCH-048) → валидируем РОВНО тот артефакт (build + e2e), что промоутится в прод (AC-4); FAIL/не-ноль staging_check → откат на `development` (как merge-gate, кап `MAX_DEVELOPER_RETRIES`). `rebuild_staging_image` пробрасывает в хук **явный** staging-таргет (service/port/profile/container), исключая дрейф на прод 8500. Сборки/recreate/validate — **только staging (8501)**, прод (8500) не трогается. **Strategy B (safety):** `Dockerfile` штампует `LABEL org.opencontainers.image.revision=$GIT_SHA` (`ARG GIT_SHA`); `build_deploy_command` (`src/self_deploy.py`) пробрасывает `EXPECTED_REVISION`; хост-хук шагом 2b ПЕРЕД `docker tag` fail-closed сверяет лейбл `revision` у `SOURCE_IMAGE` с `EXPECTED_REVISION` — несовпадение / пустой лейбл / ошибка inspect → `exit 1` (FAILED → БАГ-8 откат), делает тихий промоут устаревшего образа структурно невозможным даже при проигравшей гонку/отключённой A. 
Хост-хук `scripts/orchestrator-deploy-hook.sh` расширен **обратно-совместимым** режимом `--build-staging` (пересборка+recreate staging, exit 0/1) и fail-closed guard'ом (активен только при заданном `EXPECTED_REVISION`). Единый kill-switch `ORCH_IMAGE_FRESHNESS_ENABLED` (true) включает A+B **как целое** (нет «B без A» = вечного fail-fast); область — `ORCH_IMAGE_FRESHNESS_REPOS` (CSV; пусто → только self-hosting `orchestrator`). Контракты НЕ менялись: `STAGE_TRANSITIONS` (под-гейт ребра, не стадия), exit-code-контракт хука (0/1/2), `map_exit_code_to_status`, `check_deploy_status`/`_parse_deploy_status`, БАГ-8, terminal-sync, merge-gate; схема БД — без миграций. ADR `docs/work-items/ORCH-058/06-adr/ADR-001-staging-image-provenance.md`, глобальный `docs/architecture/adr/adr-0008-staging-image-provenance.md`. Документация: `docs/architecture/README.md`, `docs/operations/DEPLOY_HOOK.md`, `docs/operations/STAGING.md`, `docs/operations/INFRA.md`, `.env.example`. Тесты: `tests/test_image_freshness.py`, `tests/test_deploy_hook_provenance.py`, `tests/test_deploy_build_once.py` (TC-06), `tests/test_deploy_hook_mapping.py` (TC-09), `tests/test_stage_engine.py::TestImageFreshnessGate`, `tests/test_qg_registry_snapshot.py`, `tests/test_config.py`. - **Исполняемый самодеплой стадии `deploy` (стадия дёргает хост-хук, manual-approve)** (ORCH-036): стадия `deploy` перестаёт быть «бумажной» — для self-hosting репозитория `orchestrator` `deploy_status: SUCCESS` означает ДОКАЗАННЫЙ health-ok реального рестарта прод-контейнера (8500), а не декларацию LLM. Критический путь self-restart детерминирован (без LLM), по образцу merge-gate ORCH-043, и разбит на три фазы (`src/stage_engine.py` + новый модуль `src/self_deploy.py`): **Фаза A** (вход в `deploy`) — вместо запуска прод-deployer'а при `deploy_require_manual_approve=true` задача переводится в approval-pending (`set_issue_in_review`) и ждёт ручного approve; restart-safe маркер `approve-requested`. 
**Фаза B** (человек ставит статус Plane → `Approved`; `advance_stage(deploy, finished_agent=None)`) — запускается **detached host-процесс** (`ssh + setsid` → `scripts/orchestrator-deploy-hook.sh`, чтобы рестарт 8500 пережил гибель контейнера; орк НЕ убивает себя из docker.sock) с build-once retag staging-образа (`SOURCE_IMAGE`), ставится детерминированный **finalizer-job**; маркер `initiated` — идемпотентность повторного Approved. **Фаза C** (`run_deploy_finalizer`, reserved-agent `deploy-finalizer`, claim'ится новым контейнером после рестарта) — читает sentinel `result` (exit-code хука, записан host-обёрткой), `not-ready` → defer (бюджет `deploy_finalize_max_attempts`, restart-safe по `task_content`), маппит `0→SUCCESS / 1|2|иное→FAILED` (чистая функция `map_exit_code_to_status`, unit-тест), пишет `14-deploy-log.md` и вызывает `advance_stage(deploy, finished_agent="deployer")` → существующие контракты: `SUCCESS → done` + release merge-lease, `FAILED → откат БАГ-8 на development` + `set_issue_blocked`. Уведомления Plane+Telegram на approve-request / initiate / success / rollback (BR-5, ни одного «молчаливого» деплоя). Хост-хук `scripts/orchestrator-deploy-hook.sh` расширен **обратно-совместимым** `SOURCE_IMAGE`: при заданном — `docker tag $SOURCE_IMAGE $TARGET_IMAGE` перед `up -d --no-build` (деплой РОВНО протестированного образа, без `docker build`); не задан → прежнее поведение; exit-code-контракт (0/1/2) и health-loop (10×6с, авто-rollback) не тронуты. Restart-safe состояние — sentinel-файлы (`/.deploy-state-//`), без миграции БД. Условность как ORCH-35: реальный самодеплой только для `is_self_hosting_repo("orchestrator")`; прочие репо (enduro-trails) — прежний синхронный ssh-путь агентом. Контракты НЕ менялись: `STAGE_TRANSITIONS`, реестр `QG_CHECKS`, `check_deploy_status`/`_parse_deploy_status` (frontmatter-only), terminal-sync `deploy→done`, merge-gate (ORCH-43), БАГ-8. 
Флаг `DEPLOY_REQUIRE_MANUAL_APPROVE` остаётся `true` (полный авто — отдельная задача ORCH-54). Новые настройки: `ORCH_DEPLOY_REQUIRE_MANUAL_APPROVE` (true), `ORCH_DEPLOY_SSH_USER`, `ORCH_DEPLOY_SSH_HOST`, `ORCH_DEPLOY_HOOK_SCRIPT`, `ORCH_DEPLOY_PROD_SOURCE_IMAGE`, `ORCH_DEPLOY_PROD_TARGET_SERVICE/PORT/IMAGE`, `ORCH_DEPLOY_FINALIZE_DELAY_S`, `ORCH_DEPLOY_FINALIZE_MAX_ATTEMPTS`. ADR `docs/work-items/ORCH-036/06-adr/ADR-001-executable-self-deploy.md`, глобальный `docs/architecture/adr/adr-0007-executable-self-deploy.md`. Документация: `.openclaw/agents/deployer.md` (стадия `deploy` = вызов хука, запрет self-restart), `docs/operations/INFRA.md`, `docs/operations/DEPLOY_HOOK.md`. Тесты: `tests/test_deploy_hook_mapping.py`, `tests/test_deploy_approve.py`, `tests/test_deploy_routing.py`, `tests/test_deploy_rollback.py`, `tests/test_deploy_notifications.py`, `tests/test_deploy_build_once.py`, `tests/test_deploy_terminal_sync.py`, `tests/test_staging_precondition.py`, `tests/test_deploy_hook_rollback_sim.py`. - **Sweeper потерянных webhook (реконсиляция застрявших стадий)** (ORCH-053): фоновый daemon-поток `src/reconciler.py` (паттерн `queue_worker`), который устраняет тихое застревание задач, когда конвейер не двигается из-за потерянного события (502 на ребилде инстанса, отсутствие ретраев у Plane/Gitea, неразрезолвленный `sha→branch` — класс инцидента ORCH-044). Реконсилятор периодически (`reconcile_interval_s`) доигрывает пропущенный переход **через те же штатные гейты/обработчики**, что и webhook, не дублируя логику конвейера: **F-1 gate-side** (`reconcile_gate_once`) — для задач `stage≠done`, без активного job и `age(updated_at) ≥ grace_for_stage(stage)` делает read-only пред-оценку канонического QG стадии; зелёный → продвижение строго через неизменный `stage_engine.advance_stage(..., finished_agent=None)`; красный → тишина (спам нотификаций структурно невозможен — `advance_stage` на красном гейте не вызывается вовсе); `analysis` F-1 не трогает (человеческий гейт). 
**F-2 plane-side** (`reconcile_plane_once`) — опрос Plane API per-project (новый `plane_sync.list_issues_by_state`, курсорная пагинация, never-raise) и реплей In Progress / Approved / Rejected через существующие `webhooks.plane.handle_status_start` / `handle_verdict` (async-обработчики вызываются из sync-потока через `asyncio.run`). **F-3** — усиление `sha→branch` в `handle_ci_status`: при неразрезолвленном sha — БД-fallback по единственной development-задаче repo (`db.get_development_tasks_by_repo`; неоднозначность → не резолвим, ложного матча нет), `logger.debug`→`logger.info` для видимости потерянного CI-события. Анти-дубль на создании задачи (`db.create_task_atomic` под process-wide `threading.Lock`: SELECT-exists→INSERT, проигравший в гонке reconcile↔webhook не плодит второй task/branch/worktree/стартовый analyst-job). Старт/стоп в `main.lifespan` (после `worker.start()` / перед `worker.stop()`), restart-safe, never-raise на единицу работы. Наблюдаемость (F-4): при разблокировке — лог-строка `reconciler: разблокирована (потерян webhook)` + Telegram (`reconcile_notify_unblock`) и блок `reconcile` в `GET /queue`. Kill-switches: `ORCH_RECONCILE_ENABLED` (глобально), `ORCH_RECONCILE_PLANE_ENABLED` (гасит только F-2), `ORCH_RECONCILE_INTERVAL_S` (120), `ORCH_RECONCILE_GRACE_DEFAULT_S` (600), `ORCH_RECONCILE_GRACE_OVERRIDES_JSON` (per-stage), `ORCH_RECONCILE_NOTIFY_UNBLOCK` (true). Схема БД и реестры (`STAGE_TRANSITIONS`/`QG_CHECKS`) НЕ менялись. ADR `docs/work-items/ORCH-053/06-adr/ADR-001-stuck-task-reconciler.md`, глобальный `docs/architecture/adr/adr-0007-reconciler.md`. Тесты: `tests/test_reconciler.py`, `tests/test_reconciler_plane.py`, `tests/test_gitea_sha_resolve.py`, `tests/test_config.py`. - **Merge-gate: авто-rebase на текущий `origin/main` + повторный прогон тестов + сериализация мержей** (ORCH-043): детерминированный (без LLM) суб-гейт на ребре `deploy-staging → deploy`, выполняемый ПЕРЕД мержем PR деплоером. 
Закрывает класс гонок «две зелёные ветки в одном репо ломают `main`»: пайплайн валидирует ветку против того `main`, от которого она ответвилась, а не против `main` в момент мержа — между «ветка зелёная» и «ветка смержена» параллельная задача может сдвинуть `main` (семантический конфликт: git мержит без текстового конфликта, но совмещённый `main` красный). Для self-hosting репозитория `orchestrator` это означало бы красный `main` инструмента, обслуживающего ВСЕ проекты. Новый модуль `src/merge_gate.py` (контракт «never raise», все git-операции — в per-branch worktree, ORCH-2/S-4): `branch_is_behind_main` (`git merge-base --is-ancestor origin/main HEAD`), `auto_rebase_onto_main` (rebase + `git push --force-with-lease` ТОЛЬКО ветки задачи — `main` НИКОГДА не пушится; текстовый конфликт → `rebase --abort` + чистый worktree), `retest_branch` (`python -m pytest ` в догнанном worktree, бюджет `merge_retest_timeout_s`), файловый merge-lease (`acquire_merge_lease`/`release_merge_lease`, атомарный `O_CREAT|O_EXCL`, holder-aware release, реклейм протухшего/битого лиза — без изменения схемы БД). Новый quality-gate `check_branch_mergeable` (`src/qg/checks.py`, зарегистрирован в `QG_CHECKS`) композирует примитивы под лизом: kill-switch/вне-области → no-op pass; lock занят → `(False, "merge-lock busy")` (сигнал DEFER, не код-фолт); ветка свежая → pass (лиз ДЕРЖИТСЯ до мержа); отстала → rebase → конфликт = fail+release, чисто → retest → зелёный = pass (лиз держится) / красный|timeout = fail+release. 
Интеграция в `src/stage_engine.py` (суб-гейт на `deploy-staging`, БЕЗ новой стадии в `STAGE_TRANSITIONS`): pass → advance на `deploy`; «merge-lock busy» → DEFER (повторная постановка деплоера на `deploy-staging` с задержкой `available_at`, анти-дедлок при `max_concurrency=1`, restart-safe счётчик по `task_content`, лимит `merge_defer_max_attempts` → block+Telegram); конфликт/красный retest → ROLLBACK на `development` + ретрай developer-а (кап `MAX_DEVELOPER_RETRIES`, без бесконечного баунса). Лиз освобождается на `deploy→done`, на rollback и по webhook смерженного PR (`src/webhooks/gitea.py`). Новый параметр `enqueue_job(..., available_at_delay_s=...)` (`src/db.py`) — отложенная постановка без изменения схемы. Условность раскатки (зеркало ORCH-35): `merge_gate_repos` (CSV) или по умолчанию только self-hosting `orchestrator`; глобальный kill-switch `merge_gate_enabled`. Новые настройки `ORCH_MERGE_GATE_ENABLED` (true), `ORCH_MERGE_GATE_REPOS` (""), `ORCH_MERGE_RETEST_TIMEOUT_S` (600), `ORCH_MERGE_RETEST_TARGET` (tests/), `ORCH_MERGE_LOCK_TIMEOUT_S` (300), `ORCH_MERGE_DEFER_DELAY_S` (60), `ORCH_MERGE_DEFER_MAX_ATTEMPTS` (5). ADR `docs/work-items/ORCH-043/06-adr/ADR-001-merge-gate.md`, глобальный `docs/architecture/adr/adr-0006-merge-gate.md`. Тесты: `tests/test_merge_gate.py`, `tests/test_qg_merge_gate.py`, `tests/test_merge_gate_race.py`, `tests/test_stage_engine.py::TestMergeGate`, `tests/test_config.py`. diff --git a/Dockerfile b/Dockerfile index ddcabbb..b8b34dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,13 @@ FROM python:3.12-slim -WORKDIR /app -# ORCH-58: stamp the validated git commit into the OCI revision label so the -# deploy hook provenance guard can fail-closed on it before the prod retag. -ARG GIT_SHA +# ORCH-058 (Strategy B): stamp the image with the git commit it was built from so +# the deploy hook can fail-close if a stale staging image would be promoted to prod +# (INV-FRESH). 
Passed at build time via `--build-arg GIT_SHA=<sha>` (the staging +# rebuild in check_staging_image_fresh / the --build-staging hook mode supplies it). +# Without the build-arg the label is empty -> the hook treats it as a mismatch +# (fail-closed). The OCI-standard key is read by `docker image inspect`. +ARG GIT_SHA="" LABEL org.opencontainers.image.revision=$GIT_SHA +WORKDIR /app RUN apt-get update -qq && apt-get install -y -qq openssh-client git && rm -rf /var/lib/apt/lists/* # git operations run as root over bind-mounted /repos (may be owned by host uid) -> trust it. RUN git config --system --add safe.directory '*' diff --git a/docs/architecture/README.md b/docs/architecture/README.md index 8b18a63..c20ab5d 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/README.md @@ -194,6 +194,4 @@ never-raise на единицу работы; тишина при синхрон Схема БД, потоки данных, resilience-слой, детали Dockerfile — [internals.md](internals.md). --- -*Актуально на 2026-06-06. Обновлять при изменении src/stages.py, src/qg/checks.py, src/main.py. ORCH-043: merge-gate — design (см. adr-0006), реализация в ветке feature/ORCH-043. ORCH-036: исполняемый самодеплой стадии `deploy` — design (см. adr-0007), реализация в ветке feature/ORCH-036.* -*Актуально на 2026-06-06. Обновлять при изменении src/stages.py, src/qg/checks.py, src/main.py. ORCH-043: merge-gate — design (см. adr-0006), реализация в ветке feature/ORCH-043. ORCH-053: reconciler — реализовано (см. adr-0007, src/reconciler.py).* -*ORCH-058: провенанс staging-образа перед BUILD-ONCE retag (check_staging_image_fresh + хук-guard) — реализовано в ветке feature/ORCH-058 (см. adr-0008, src/image_freshness.py). Обновлять также при изменении src/self_deploy.py, scripts/orchestrator-deploy-hook.sh, Dockerfile.* +*Актуально на 2026-06-07. Обновлять при изменении src/stages.py, src/qg/checks.py, src/main.py. 
Статусы доработок: ORCH-036 (исполняемый самодеплой `deploy`, adr-0007) — реализовано; ORCH-043 (merge-gate, adr-0006) — design, ветка feature/ORCH-043; ORCH-053 (reconciler, adr-0007, src/reconciler.py) — реализовано; ORCH-058 (провенанс staging-образа: check_staging_image_fresh + staging_check свежего образа + хук-guard, adr-0008) — реализовано в ветке feature/ORCH-058 (обновлять также при изменении src/image_freshness.py, scripts/orchestrator-deploy-hook.sh, Dockerfile).* diff --git a/docs/operations/DEPLOY_HOOK.md b/docs/operations/DEPLOY_HOOK.md index 215de1b..522d0e7 100644 --- a/docs/operations/DEPLOY_HOOK.md +++ b/docs/operations/DEPLOY_HOOK.md @@ -24,9 +24,10 @@ 1. `docker build --build-arg GIT_SHA=$GIT_SHA -t $TARGET_IMAGE $BUILD_CONTEXT` — пересборка из host-worktree валидированного коммита; `GIT_SHA` штампуется в OCI-лейбл `org.opencontainers.image.revision`. 2. `docker compose [--profile $COMPOSE_PROFILE] up -d --no-build $TARGET_SERVICE` — пересоздание staging на свежем образе. -3. Health-цикл 10×6с. Здоров → `exit 0`; провал сборки/health → `exit 1`. +3. Health-цикл 10×6с. Провал сборки/health → `exit 1`. +4. **`staging_check` против СВЕЖЕГО образа** (Strategy A, шаг 3 — ADR-001, AC-4) — после health хук запускает `docker exec $STAGING_CONTAINER python3 $STAGING_CHECK_PATH --base-url http://localhost:$TARGET_PORT --mode $STAGING_CHECK_MODE` (дефолт `--mode stub`, без LLM-трат). Запуск **внутри** staging-контейнера канонический (ORCH-048): suite читает реестр из собственного env контейнера, а `staging_check.py` берётся из bind-mount (`/repos/orchestrator/scripts/...`, не из образа). Это ровно тот артефакт, что позже build-once ретегается в прод → валидируем то, что промоутим (AC-4). PASS → `exit 0`; любой не-ноль (FAIL чека или safety-abort `ORCH_STAGING≠true`) → `exit 1`. -Запускается оркестратором на ребре `deploy-staging → deploy` (QG-под-чек `check_staging_image_fresh`, см. `INFRA.md`). Тот же контракт кодов выхода (0 = здоров). 
+Запускается оркестратором на ребре `deploy-staging → deploy` (QG-под-чек `check_staging_image_fresh` → `rebuild_staging_image` пробрасывает явный staging-таргет, см. `INFRA.md`). Тот же контракт кодов выхода (0 = здоров **и** staging_check PASS). ### Режим `--rollback` @@ -45,6 +46,9 @@ | `EXPECTED_REVISION` | _(unset)_ | Build-once (ORCH-058, Strategy B): ожидаемый git-SHA `$SOURCE_IMAGE` (лейбл `org.opencontainers.image.revision`). Задан → fail-closed guard перед `docker tag`. Не задан → проверка пропущена. | | `GIT_SHA` | _(unset)_ | `--build-staging` (ORCH-058, Strategy A): коммит, штампуемый в OCI-лейбл `revision` при пересборке staging-образа. | | `BUILD_CONTEXT` | `$REPO` | `--build-staging`: docker build context (host-worktree валидированного коммита). | +| `STAGING_CONTAINER` | `$TARGET_SERVICE` (`orchestrator-staging`) | `--build-staging` (ORCH-058): контейнер, внутри которого `docker exec` запускает `staging_check`. | +| `STAGING_CHECK_PATH` | `/repos/orchestrator/scripts/staging_check.py` | `--build-staging` (ORCH-058): путь к `staging_check.py` внутри контейнера (bind-mount, не образ). | +| `STAGING_CHECK_MODE` | `stub` | `--build-staging` (ORCH-058): режим `staging_check` (`stub` — быстро, без LLM; `full-real` — дожидается аналитика). | | `LOG` | `/var/log/orchestrator/deploy-hook.log` | Лог-файл (fallback: `$REPO/deploy-hook.log`) | > ⚠️ **Дефолт — всегда STAGING**. Прод активируется только явным переопределением env. diff --git a/scripts/orchestrator-deploy-hook.sh b/scripts/orchestrator-deploy-hook.sh index 5d3d302..f72af9c 100755 --- a/scripts/orchestrator-deploy-hook.sh +++ b/scripts/orchestrator-deploy-hook.sh @@ -14,17 +14,18 @@ # TARGET_IMAGE instead of rebuilding — guarantees prod runs the # exact artefact that passed staging (no `docker build`). 
# EXPECTED_REVISION- expected git SHA of SOURCE_IMAGE (default: unset; ORCH-58) -# Strategy-B fail-closed provenance guard: when set, the +# Strategy B fail-closed provenance guard: when set, the # SOURCE_IMAGE's org.opencontainers.image.revision label MUST # equal this value before the BUILD-ONCE retag, else exit 1 # (a stale image is never promoted). Unset -> no check (legacy). -# GIT_SHA - --build-staging build-arg (default: unset; ORCH-58) -# Commit stamped into the rebuilt staging image's revision -# label. Supplied by the caller (validated commit) — NOT -# recomputed from the host clone's HEAD. -# BUILD_CONTEXT - --build-staging build context (default: $REPO; ORCH-58) -# Host worktree of the validated commit; the staging image is -# rebuilt FROM this tree (not the prod clone on main). +# GIT_SHA - build-arg for --build-staging (default: unset; ORCH-58) +# BUILD_CONTEXT - docker build context dir (default: $REPO; --build-staging) +# STAGING_CONTAINER- container to docker-exec staging_check in (--build-staging; +# default: $TARGET_SERVICE → orchestrator-staging; ORCH-58) +# STAGING_CHECK_PATH- staging_check.py path inside that container (--build-staging; +# default: /repos/orchestrator/scripts/staging_check.py; ORCH-58) +# STAGING_CHECK_MODE- staging_check mode stub|full-real (--build-staging; +# default: stub — fast, no LLM spend; ORCH-58) # LOG - log file path (default: /var/log/orchestrator/deploy-hook.log) # # Usage: @@ -45,11 +46,11 @@ PREV_IMAGE_FILE="${PREV_IMAGE_FILE:-$REPO/.deploy-prev-image-staging}" # Build-once (ORCH-36): optional prevalidated source image to retag onto # TARGET_IMAGE. Unset -> backward-compatible (no retag), exit-code contract intact. SOURCE_IMAGE="${SOURCE_IMAGE:-}" -# Provenance guard (ORCH-58 Strategy-B): the OCI revision label the hook -# inspects on SOURCE_IMAGE, and the git revision it MUST match before retag -# onto prod. EXPECTED_REVISION unset -> backward-compatible (guard skipped). 
-REVISION_LABEL="org.opencontainers.image.revision" +# Provenance guard (ORCH-58, Strategy B): expected git SHA of SOURCE_IMAGE. Unset +# -> backward-compatible (no provenance check), exit-code contract intact. EXPECTED_REVISION="${EXPECTED_REVISION:-}" +# The OCI-standard label key the Dockerfile stamps with the build commit. +REVISION_LABEL="org.opencontainers.image.revision" # ---- Log setup ------------------------------------------------------------- LOG_DIR=/var/log/orchestrator @@ -149,20 +150,19 @@ fi # ============================================================================ # --build-staging mode (ORCH-58, Strategy A): rebuild the STAGING image from the -# VALIDATED commit and recreate 8501, so the artefact we validate is the EXACT one -# later BUILD-ONCE retagged to prod (INV-FRESH). Builds/recreates STAGING ONLY -# (8501) — never prod (8500). Same exit-code contract (0 = healthy, !=0 = failed). -# -# Uses the caller-supplied GIT_SHA + BUILD_CONTEXT (the validated worktree) — it -# must NOT recompute HEAD from $REPO (the prod clone on `main`): on the -# deploy-staging -> deploy edge the PR is not yet merged, so `main` HEAD != the -# validated SHA, which would stamp the wrong revision label and deadlock the -# Strategy-B guard on every valid self-deploy. +# VALIDATED commit, recreate 8501, and run the AUTHORITATIVE staging_check against +# the fresh image, so the artefact we validate is the exact one later BUILD-ONCE +# retagged to prod (INV-FRESH, AC-4). Builds/recreates STAGING ONLY (8501) — never +# prod (8500). Same exit-code contract (0 = healthy + staging_check PASS). +# GIT_SHA - commit stamped into the image revision label (build-arg). +# BUILD_CONTEXT - docker build context (host worktree of the validated commit). +# Steps: (1) docker build → (2) recreate 8501 → (3a) health-check → +# (3b) staging_check.py --mode stub against the fresh 8501 (ADR-001 step 3). 
# ============================================================================ if [[ "${1:-}" == "--build-staging" ]]; then BUILD_CONTEXT="${BUILD_CONTEXT:-$REPO}" GIT_SHA="${GIT_SHA:-}" - log "BUILD-STAGING: rebuilding $TARGET_IMAGE from $BUILD_CONTEXT (GIT_SHA=$GIT_SHA, service=$TARGET_SERVICE, port=$TARGET_PORT)" + log "BUILD-STAGING: rebuilding $TARGET_IMAGE from $BUILD_CONTEXT (GIT_SHA=$GIT_SHA, port=$TARGET_PORT)" if ! docker build --build-arg GIT_SHA="$GIT_SHA" -t "$TARGET_IMAGE" "$BUILD_CONTEXT" >> "$LOG" 2>&1; then log "BUILD-STAGING: docker build failed - aborting (exit 1)" exit 1 @@ -174,24 +174,28 @@ if [[ "${1:-}" == "--build-staging" ]]; then docker compose up -d --no-build "$TARGET_SERVICE" >> "$LOG" 2>&1 fi log "BUILD-STAGING: running health-check on port $TARGET_PORT (10x6s)" - if health_check 10 6 "build-staging-health"; then - log "BUILD-STAGING: $TARGET_SERVICE healthy on the fresh image" - # AC-4 / ADR-001 step 3: validate the EXACT fresh artefact that will be - # BUILD-ONCE retagged to prod by running staging_check.py against the - # freshly recreated STAGING stand (8501, never prod 8500 - AC-9). - # --mode stub: fast, deterministic, no LLM spend (ADR). Run INSIDE the - # container so B6 reads the running instance own env (.env.staging). - log "BUILD-STAGING: running staging_check.py --mode stub against fresh 8501 (port $TARGET_PORT)" - if docker exec "$TARGET_SERVICE" \\ - python3 /repos/orchestrator/scripts/staging_check.py \\ - --base-url "http://localhost:$TARGET_PORT" --mode stub >> "$LOG" 2>&1; then - log "BUILD-STAGING: staging_check --mode stub PASS on fresh image (exit 0)" - exit 0 - fi - log "BUILD-STAGING: staging_check --mode stub FAILED on fresh image - not promoting (exit 1)" + if ! 
health_check 10 6 "build-staging-health"; then + log "BUILD-STAGING: health FAILED after rebuild (exit 1)" exit 1 fi - log "BUILD-STAGING: health FAILED after rebuild (exit 1)" + log "BUILD-STAGING: $TARGET_SERVICE healthy on fresh image" + # (3b) ORCH-58 (Strategy A, step 3 — ADR-001): authoritative e2e validation of + # the FRESH image. Run staging_check.py against the just-rebuilt 8501 INSIDE the + # staging container (ORCH-048 canonical: it reads its OWN staging registry env, so + # B6 is correct; the script lives at /repos/... via bind-mount, not in /app). This + # is the same artefact later BUILD-ONCE retagged to prod, so we validate exactly + # what we promote (AC-4). Any non-zero (FAIL or ORCH_STAGING safety-abort) -> exit 1 + # -> freshness gate FAIL -> rollback to development. Same exit-code contract. + STAGING_CONTAINER="${STAGING_CONTAINER:-$TARGET_SERVICE}" + STAGING_CHECK_PATH="${STAGING_CHECK_PATH:-/repos/orchestrator/scripts/staging_check.py}" + STAGING_CHECK_MODE="${STAGING_CHECK_MODE:-stub}" + log "BUILD-STAGING: running staging_check (--mode $STAGING_CHECK_MODE) against fresh http://localhost:$TARGET_PORT inside $STAGING_CONTAINER" + if docker exec "$STAGING_CONTAINER" python3 "$STAGING_CHECK_PATH" \ + --base-url "http://localhost:$TARGET_PORT" --mode "$STAGING_CHECK_MODE" >> "$LOG" 2>&1; then + log "BUILD-STAGING: staging_check PASS on fresh image (exit 0)" + exit 0 + fi + log "BUILD-STAGING: staging_check FAILED on fresh image - artefact not promotable (exit 1)" exit 1 fi @@ -222,21 +226,19 @@ git pull origin main >> "$LOG" 2>&1 # Backward compatible: skipped when SOURCE_IMAGE is unset. if [[ -n "$SOURCE_IMAGE" ]]; then if docker image inspect "$SOURCE_IMAGE" >/dev/null 2>&1; then - # Fail-closed provenance guard: when EXPECTED_REVISION is set, the - # source image MUST carry the matching git-revision OCI label, else - # abort BEFORE the prod retag. Empty EXPECTED_REVISION -> guard - # skipped (ORCH-36 backward-compat). 
+ # ORCH-58 (Strategy B): fail-closed provenance guard BEFORE docker tag. + # When EXPECTED_REVISION is set, SOURCE_IMAGE's git-commit label MUST match, + # else exit 1 (FAILED -> БАГ-8 rollback); prod is NEVER touched. Empty label + # / inspect error / mismatch all fail-close. Unset EXPECTED_REVISION -> no + # check (backward-compatible for non-self repos / legacy calls). if [[ -n "$EXPECTED_REVISION" ]]; then - IMG_REV=$(docker image inspect --format '{{ index .Config.Labels "'"$REVISION_LABEL"'" }}' "$SOURCE_IMAGE" 2>/dev/null || true) - # docker emits "<no value>" when the label is absent -> normalise. - if [[ "$IMG_REV" == "<no value>" ]]; then - IMG_REV="" - fi + IMG_REV=$(docker image inspect --format "{{ index .Config.Labels \"$REVISION_LABEL\" }}" "$SOURCE_IMAGE" 2>/dev/null || true) + if [[ "$IMG_REV" == "<no value>" ]]; then IMG_REV=""; fi if [[ -z "$IMG_REV" || "$IMG_REV" != "$EXPECTED_REVISION" ]]; then - log "PROVENANCE: SOURCE_IMAGE revision '$IMG_REV' != expected '$EXPECTED_REVISION' - aborting before retag (exit 1)" + log "PROVENANCE: SOURCE_IMAGE revision '$IMG_REV' != expected '$EXPECTED_REVISION' (fail-closed) - aborting (exit 1)" exit 1 fi - log "PROVENANCE: SOURCE_IMAGE revision matches expected ($EXPECTED_REVISION)" + log "PROVENANCE: SOURCE_IMAGE revision matches expected ($EXPECTED_REVISION) - retag allowed" fi log "BUILD-ONCE: retagging $SOURCE_IMAGE -> $TARGET_IMAGE (no rebuild)" docker tag "$SOURCE_IMAGE" "$TARGET_IMAGE" >> "$LOG" 2>&1 diff --git a/src/image_freshness.py b/src/image_freshness.py index 4cdd43e..fc783d6 100644 --- a/src/image_freshness.py +++ b/src/image_freshness.py @@ -14,9 +14,10 @@ self-hosting: * **A — liveness:** :func:`check_staging_image_fresh` is a QG sub-check on the ``deploy-staging -> deploy`` edge (composed by ``stage_engine`` AFTER the merge-gate, BEFORE Phase A).
It rebuilds ``orchestrator-orchestrator-staging`` - from the VALIDATED commit (worktree HEAD after the merge-gate rebase) and - recreates the 8501 container, so we validate and promote ONE artefact. FAIL -> - rollback to ``development`` (mirrors the merge-gate). + from the VALIDATED commit (worktree HEAD after the merge-gate rebase), recreates + the 8501 container, and runs ``staging_check.py --mode stub`` against that fresh + 8501 (ADR-001 step 3), so we validate exactly the ONE artefact later retagged to + prod (AC-4). FAIL -> rollback to ``development`` (mirrors the merge-gate). * **B — safety:** :func:`expected_revision` feeds the validated SHA to ``self_deploy.build_deploy_command`` as ``EXPECTED_REVISION``; the host hook fail-closes (``exit 1``) before ``docker tag`` if the SOURCE_IMAGE revision @@ -48,10 +49,18 @@ REVISION_LABEL = "org.opencontainers.image.revision" # Bounded timeouts so a hung git/docker/ssh never wedges the monitor-thread. _GIT_TIMEOUT = 30 _INSPECT_TIMEOUT = 30 -# The remote rebuild (docker build + compose recreate + health) is the slow path; -# keep it generous but bounded (mirrors the merge-gate re-test budget order). +# The remote rebuild (docker build + compose recreate + health + staging_check) is +# the slow path; keep it generous but bounded (mirrors the merge-gate re-test order). _REBUILD_TIMEOUT = 1200 +# Explicit STAGING target for the --build-staging rebuild (Strategy A). These mirror +# the hook's staging-safe defaults but are passed EXPLICITLY so a future change to the +# hook defaults can never silently retarget the self-rebuild at prod (8500) — the whole +# path builds/recreates STAGING ONLY (AC-9, review P2). Never the prod 8500 target. 
+_STAGING_SERVICE = "orchestrator-staging" +_STAGING_PORT = 8501 +_STAGING_COMPOSE_PROFILE = "staging" + # --------------------------------------------------------------------------- # Conditionality (mirrors self_deploy_applies / _merge_gate_applies) @@ -234,9 +243,12 @@ def rebuild_staging_image(repo: str, branch: str, sha: str) -> tuple[bool, str]: The hook (``orchestrator-deploy-hook.sh --build-staging``) runs, on the host: ``docker build --build-arg GIT_SHA= -t `` -> ``docker compose --profile staging up -d --no-build orchestrator-staging`` - -> health-check 8501. Same exit-code contract (0 = ok). This trades prod for - staging ONLY (8501), NEVER prod (8500) (AC-9): all build/recreate targets are - the staging service. + -> health-check 8501 + -> ``staging_check.py --mode stub`` against the FRESH 8501 (ADR-001 step 3, + AC-4: validate exactly the artefact later retagged to prod). + Same exit-code contract (0 = ok). This trades prod for staging ONLY (8501), + NEVER prod (8500) (AC-9): all build/recreate/validate targets are the staging + service — passed EXPLICITLY below, not left to hook defaults (review P2). Synchronous ssh is fine here (unlike Phase B): recreating staging does not kill the prod worker running this code. Bounded by ``_REBUILD_TIMEOUT``. @@ -248,17 +260,18 @@ def rebuild_staging_image(repo: str, branch: str, sha: str) -> tuple[bool, str]: if not target: return False, "no ssh host configured for staging rebuild" host_ctx = _host_worktree_path(repo, branch) - # We pass ONLY GIT_SHA (validated commit -> revision label, the shared anchor - # with Strategy B), BUILD_CONTEXT (the validated worktree to build FROM) and - # TARGET_IMAGE (the staging image name to retag in prod later). 
COMPOSE_PROFILE - # / TARGET_SERVICE / TARGET_PORT are deliberately omitted so the hook keeps its - # built-in STAGING defaults (profile=staging, orchestrator-staging, 8501): this - # rebuild/recreate must touch STAGING ONLY (8501), NEVER prod (8500) (AC-9), and - # the prod defaults are never reachable on this path. + # Pass the STAGING target explicitly (service/port/profile/container), so the + # rebuild + recreate + staging_check can never drift onto the prod 8500 service + # even if the hook's defaults change (AC-9, review P2). STAGING_CONTAINER is the + # container staging_check is docker-exec'd inside (step 3b). env_assignments = ( f"GIT_SHA={shlex.quote(sha)} " f"BUILD_CONTEXT={shlex.quote(host_ctx)} " - f"TARGET_IMAGE={shlex.quote(settings.deploy_prod_source_image)}" + f"TARGET_IMAGE={shlex.quote(settings.deploy_prod_source_image)} " + f"TARGET_SERVICE={shlex.quote(_STAGING_SERVICE)} " + f"TARGET_PORT={shlex.quote(str(_STAGING_PORT))} " + f"COMPOSE_PROFILE={shlex.quote(_STAGING_COMPOSE_PROFILE)} " + f"STAGING_CONTAINER={shlex.quote(_STAGING_SERVICE)}" ) inner = ( f"cd {shlex.quote(settings.deploy_host_repo_path)} && " @@ -290,9 +303,10 @@ def check_staging_image_fresh(repo: str, work_item_id: str, branch: str) -> tupl a repo the feature is not real for -> ``(True, "image-freshness N/A for ")``. 2. Anchor: ``sha = validated_revision(repo, branch)``. Empty -> fail-closed ``(False, ...)`` (AC-3): we never rebuild/promote without a known commit. - 3. Rebuild the staging image from that commit + recreate 8501 (host hook). - Healthy -> ``(True, ...)``: the artefact we just validated is the exact one - that will be retagged to prod (AC-4, loop closed). FAIL -> ``(False, ...)`` + 3. Rebuild the staging image from that commit, recreate 8501, and run + ``staging_check.py --mode stub`` against the fresh 8501 (host hook). PASS -> + ``(True, ...)``: the artefact we just validated (build + e2e) is the exact + one that will be retagged to prod (AC-4, loop closed). 
FAIL -> ``(False, ...)`` -> the engine rolls back to ``development`` (AC-2). Never-raise (AC-8): any internal error -> ``(False, "")``; an exception diff --git a/tests/test_deploy_hook_provenance.py b/tests/test_deploy_hook_provenance.py index b4652bb..c742763 100644 --- a/tests/test_deploy_hook_provenance.py +++ b/tests/test_deploy_hook_provenance.py @@ -1,13 +1,19 @@ -"""ORCH-058 TC-07/08: static guarantees of the Strategy-B provenance plumbing. +"""ORCH-058 TC-07/08: static + caller-contract guarantees of the provenance plumbing. These assert the *shape* of the deploy artefacts that can't be unit-tested by running them (they shell out to docker/ssh on the host): * TC-07 — the deploy hook fail-closes BEFORE `docker tag` when the staging image's git-revision label != EXPECTED_REVISION (exit 1), and the - new `--build-staging` rebuild mode stamps GIT_SHA into the image. + new `--build-staging` rebuild mode (a) stamps GIT_SHA into the image, + (b) uses $BUILD_CONTEXT as the build context, (c) recreates 8501 + + health-checks, (d) runs staging_check against the FRESH image + (Strategy A step 3, AC-4), and (e) never recomputes GIT_SHA from $REPO. * TC-08 — the Dockerfile declares `ARG GIT_SHA` and stamps it into the `org.opencontainers.image.revision` OCI label (the anchor B reads). + * TC-09 — the caller↔hook contract: `rebuild_staging_image` invokes the hook + in `--build-staging` mode with BUILD_CONTEXT=, + GIT_SHA=, and an EXPLICIT staging target (never prod). """ import pathlib @@ -17,17 +23,6 @@ _HOOK = _ROOT / "scripts" / "orchestrator-deploy-hook.sh" _DOCKERFILE = _ROOT / "Dockerfile" -def _build_staging_block() -> str: - """Return only the body of the hook's ``--build-staging`` branch, so the - contract assertions below cannot be satisfied by lookalike strings elsewhere - in the script (e.g. the NORMAL DEPLOY recreate). 
The block runs from the - ``--build-staging`` guard up to the NORMAL DEPLOY section header.""" - text = _HOOK.read_text(encoding="utf-8") - start = text.index('"${1:-}" == "--build-staging"') - end = text.index("NORMAL DEPLOY mode", start) - return text[start:end] - - # --------------------------------------------------------------------------- # TC-07: hook fail-closed provenance guard + --build-staging rebuild mode # --------------------------------------------------------------------------- @@ -60,68 +55,42 @@ def test_tc07_build_staging_mode_stamps_git_sha(): assert 'docker build --build-arg GIT_SHA="$GIT_SHA"' in text -def test_tc07_build_staging_builds_from_caller_context_not_repo(): - """Contract (caller <-> hook): --build-staging must build from the - caller-supplied BUILD_CONTEXT (the validated worktree), NOT the prod clone. - - Regression guard for the P0 deadlock: the block must honour the caller's - GIT_SHA (BUILD_CONTEXT/GIT_SHA defaulting) and must NOT recompute the SHA - from the host clone's HEAD (`git rev-parse HEAD`) — on the - deploy-staging -> deploy edge `main` HEAD != validated SHA, which would - stamp the wrong revision label and deadlock the Strategy-B guard. - """ - block = _build_staging_block() - # Build context is the caller-supplied worktree, defaulting to $REPO. - assert 'BUILD_CONTEXT="${BUILD_CONTEXT:-$REPO}"' in block - assert 'docker build --build-arg GIT_SHA="$GIT_SHA" -t "$TARGET_IMAGE" "$BUILD_CONTEXT"' in block - # Honour the caller's GIT_SHA; never hard-build against the prod clone. - assert 'GIT_SHA="${GIT_SHA:-}"' in block - assert 'docker build --build-arg GIT_SHA="$GIT_SHA" -t "$TARGET_IMAGE" "$REPO"' not in block - # Must NOT recompute the validated SHA from the host clone's HEAD. 
- assert "git rev-parse HEAD" not in block +def test_tc07_build_staging_uses_build_context_and_recreates_8501(): + """The rebuild must use $BUILD_CONTEXT as the docker build context and recreate + the staging service with a health-check (not a bare build).""" + text = _HOOK.read_text(encoding="utf-8") + # $BUILD_CONTEXT is the build context of the rebuild (validated worktree). + assert 'docker build --build-arg GIT_SHA="$GIT_SHA" -t "$TARGET_IMAGE" "$BUILD_CONTEXT"' in text + # Recreate the staging service on the fresh image (no-build) + health-check. + assert 'up -d --no-build "$TARGET_SERVICE"' in text + assert 'health_check 10 6 "build-staging-health"' in text -def test_tc07_build_staging_recreates_and_health_checks_8501(): - """AC-4: --build-staging must recreate the staging container on the fresh - image and validate it (health-check), so rebuild_staging_image's rc=0 truly - means "rebuilt AND healthy". A bare `docker build` + exit 0 would make the - freshness verdict a lie.""" - block = _build_staging_block() - # Recreate the staging service on the freshly built image. - assert 'docker compose --profile "$COMPOSE_PROFILE" up -d --no-build "$TARGET_SERVICE"' in block - # Validate the fresh container before reporting success. - assert 'health_check 10 6 "build-staging-health"' in block - # Health failure surfaces as a non-zero exit (FAILED contract preserved). - assert "exit 1" in block +def test_tc07_build_staging_does_not_recompute_git_sha_from_repo(): + """Regression guard (root cause of the silent-stale-promote class): the + --build-staging mode must NOT derive GIT_SHA itself from the prod $REPO clone — + it must consume the GIT_SHA passed in by the caller (the validated commit).""" + text = _HOOK.read_text(encoding="utf-8") + # Anchor on the actual block guard (not the header comment mentions). 
+ after = text[text.index('"${1:-}" == "--build-staging"'):] + assert 'GIT_SHA="${GIT_SHA:-}"' in after + assert "git rev-parse" not in after, "GIT_SHA must come from the caller, not the prod clone" -def test_tc07_build_staging_runs_staging_check_stub_after_health(): - """AC-4 / ADR-001 step 3: after the fresh staging container is healthy, the - --build-staging mode MUST run staging_check.py --mode stub against the fresh - 8501 stand BEFORE reporting success, and fail-closed (exit 1) if it fails - - so the EXACT artefact promoted to prod is the one that passed staging.""" - block = _build_staging_block() - # staging_check is invoked in --mode stub (fast, no LLM spend per ADR). - assert "staging_check.py" in block - assert "--mode stub" in block - # It targets the fresh STAGING stand (8501 / TARGET_PORT), never prod 8500. - assert '--base-url "http://localhost:$TARGET_PORT"' in block - # AC-9: the staging_check invocation must NOT hard-code the prod port (8500). - invocation_lines = [ - ln for ln in block.splitlines() - if "staging_check.py" in ln or "--base-url" in ln - ] - assert invocation_lines, "expected a staging_check.py invocation line" - assert all("8500" not in ln for ln in invocation_lines) - # Ordering: staging_check runs AFTER the health-check, BEFORE the final exit 0. - health_idx = block.index('health_check 10 6 "build-staging-health"') - check_idx = block.index("staging_check.py") - assert health_idx < check_idx, "staging_check must run after health_check" - exit0_idx = block.index("staging_check --mode stub PASS") - success_exit = block.index("exit 0", exit0_idx) - assert check_idx < success_exit, "staging_check must precede the success exit 0" - # Fail-closed: a non-zero staging_check surfaces as exit 1 (no prod promote). 
- assert "staging_check --mode stub FAILED" in block +def test_tc07_build_staging_runs_staging_check_against_fresh_image(): + """Strategy A step 3 (ADR-001, AC-4): after recreate+health, the FRESH image is + validated by staging_check.py (not health-only). This is the P1 the reviewer + flagged: validate exactly the artefact later retagged to prod.""" + text = _HOOK.read_text(encoding="utf-8") + # Anchor on the actual block guard (not the header comment mentions). + after = text[text.index('"${1:-}" == "--build-staging"'):] + # staging_check is invoked, inside the staging container, --mode stub by default. + assert "staging_check.py" in after + assert 'docker exec "$STAGING_CONTAINER"' in after + assert '--mode "$STAGING_CHECK_MODE"' in after + assert 'STAGING_CHECK_MODE="${STAGING_CHECK_MODE:-stub}"' in after + # The staging_check run must come AFTER the health-check (health gates readiness). + assert after.index('health_check 10 6 "build-staging-health"') < after.index("staging_check.py") # --------------------------------------------------------------------------- @@ -131,3 +100,60 @@ def test_tc08_dockerfile_stamps_revision_label(): text = _DOCKERFILE.read_text(encoding="utf-8") assert "ARG GIT_SHA" in text assert "LABEL org.opencontainers.image.revision=$GIT_SHA" in text + + +# --------------------------------------------------------------------------- +# TC-09: caller↔hook contract — rebuild_staging_image builds the right command +# --------------------------------------------------------------------------- +def test_tc09_rebuild_staging_image_passes_validated_context_and_staging_target(monkeypatch): + """`rebuild_staging_image` must invoke the hook `--build-staging` over ssh with + BUILD_CONTEXT=, GIT_SHA=, and an EXPLICIT staging + target (service/port/profile/container) — never the prod 8500 target. 
The absence + of this contract test is what hid the earlier P0s (review P2).""" + import src.image_freshness as imgf + + captured = {} + + class _FakeCompleted: + returncode = 0 + stdout = "" + stderr = "" + + def _fake_run(cmd, *a, **kw): + captured["cmd"] = cmd + return _FakeCompleted() + + monkeypatch.setattr(imgf, "_ssh_target", lambda: "slin@host") + monkeypatch.setattr(imgf, "_host_worktree_path", + lambda repo, branch: "/home/slin/repos/_wt/orchestrator/feature_X") + monkeypatch.setattr(imgf.subprocess, "run", _fake_run) + + ok, msg = imgf.rebuild_staging_image("orchestrator", "feature/ORCH-058", "abc123def456") + assert ok, msg + + cmd = captured["cmd"] + assert cmd[0] == "ssh" + inner = cmd[-1] # the remote shell command string + # Validated commit + validated worktree as build context. + assert "GIT_SHA=abc123def456" in inner + assert "BUILD_CONTEXT=/home/slin/repos/_wt/orchestrator/feature_X" in inner + # Explicit STAGING target — never the prod 8500 service/port. + assert "TARGET_SERVICE=orchestrator-staging" in inner + assert "TARGET_PORT=8501" in inner + assert "COMPOSE_PROFILE=staging" in inner + assert "STAGING_CONTAINER=orchestrator-staging" in inner + assert "orchestrator-orchestrator-staging" in inner # staging TARGET_IMAGE + assert "--build-staging" in inner + # Hard safety: the prod service/port must NOT leak into the staging rebuild. + assert "TARGET_PORT=8500" not in inner + assert "TARGET_SERVICE=orchestrator " not in inner + + +def test_tc09_rebuild_staging_image_no_ssh_host_fails_closed(monkeypatch): + """No ssh host configured -> never-raise, fail-closed (False), no command run.""" + import src.image_freshness as imgf + + monkeypatch.setattr(imgf, "_ssh_target", lambda: None) + ok, reason = imgf.rebuild_staging_image("orchestrator", "feature/ORCH-058", "abc123") + assert ok is False + assert "ssh host" in reason