From 8273c1fc9d7db1207da76fdd75d4060b5a6d42eb Mon Sep 17 00:00:00 2001 From: claude-bot Date: Sun, 7 Jun 2026 14:16:12 +0000 Subject: [PATCH] feat(post-deploy): post-deploy prod monitoring + degradation reaction (ORCH-021) Extend pipeline responsibility past deploy->done: after the terminal transition for an applicable repo, arm a ~15min observation window that probes prod and reacts to a degradation the restart-time health-check missed ("green deploy, red prod"). - src/post_deploy.py: new leaf module (config + lazy qg/db only). Sentinel-file restart-safe state (.post-deploy-state-//), no DB migration. probe_signals/classify/decide_action/run_rollback, all never-raise. - Reserved-agent job `post-deploy-monitor` (no-LLM, Variant B, calque of deploy-finalizer): self-requeues each tick via enqueue_job. - Deterministic classify: DEGRADED iff >= fail_threshold consecutive health failures OR window 5xx ratio > 5xx_threshold; fail-safe HEALTHY. - Self-hosting invariant (BR-5/AC-8): a tick NEVER restarts the prod orchestrator container -> orchestrator is ALWAYS ALERT_ONLY. - Conditionality (ORCH-35/36/43/58): kill-switch + CSV repos, empty -> self-hosting only. - QG_CHECKS / STAGE_TRANSITIONS / schema unchanged (AC-12). - Docs: CHANGELOG, CLAUDE artefact list (16-post-deploy-log.md), architecture README, .env.example (ORCH_POST_DEPLOY_*). 
Refs: ORCH-021 Co-Authored-By: Claude Opus 4.7 --- .env.example | 24 + CHANGELOG.md | 1 + CLAUDE.md | 2 +- docs/architecture/README.md | 4 +- src/agents/launcher.py | 26 ++ src/config.py | 31 ++ src/main.py | 2 + src/post_deploy.py | 614 ++++++++++++++++++++++++++ src/stage_engine.py | 148 +++++++ tests/test_deploy_terminal_sync.py | 4 + tests/test_post_deploy.py | 210 +++++++++ tests/test_post_deploy_integration.py | 259 +++++++++++ 12 files changed, 1322 insertions(+), 3 deletions(-) create mode 100644 src/post_deploy.py create mode 100644 tests/test_post_deploy.py create mode 100644 tests/test_post_deploy_integration.py diff --git a/.env.example b/.env.example index eb9fbfa..9a74109 100644 --- a/.env.example +++ b/.env.example @@ -116,3 +116,27 @@ ORCH_RECONCILE_GRACE_DEFAULT_S=600 ORCH_RECONCILE_GRACE_OVERRIDES_JSON= ORCH_RECONCILE_NOTIFY_UNBLOCK=true ORCH_RECONCILE_SKIP_BLOCKED_ENABLED=true + +# ORCH-021: post-deploy production monitoring + degradation reaction. After the +# terminal deploy->done transition for an applicable repo, a reserved-agent job +# `post-deploy-monitor` (no LLM, modelled on deploy-finalizer) probes prod over a +# window and reacts to a degradation the restart-time health-check missed (class +# "green deploy, red prod", precedent ET-8). State is in sentinel files +# (.post-deploy-state-//), no DB migration. +# MONITOR_ENABLED -> global kill-switch; false -> pipeline is 1:1 as before ORCH-021. +# REPOS -> CSV of repos where monitoring is REAL; empty -> only self-hosting. +# WINDOW_S -> observation window length (~15 min). +# INTERVAL_S -> seconds between probe ticks. +# FAIL_THRESHOLD -> N CONSECUTIVE health failures -> DEGRADED. +# 5XX_THRESHOLD -> window 5xx ratio above this -> DEGRADED. +# AUTO_ROLLBACK -> allow auto-rollback; acts ONLY for non-self repos. Self-hosting +# is ALWAYS ALERT_ONLY (a tick NEVER restarts the prod container). +# BASE_URL -> base URL of the observed prod instance. 
+ORCH_POST_DEPLOY_MONITOR_ENABLED=true +ORCH_POST_DEPLOY_REPOS= +ORCH_POST_DEPLOY_WINDOW_S=900 +ORCH_POST_DEPLOY_INTERVAL_S=30 +ORCH_POST_DEPLOY_FAIL_THRESHOLD=3 +ORCH_POST_DEPLOY_5XX_THRESHOLD=0.5 +ORCH_POST_DEPLOY_AUTO_ROLLBACK=false +ORCH_POST_DEPLOY_BASE_URL=http://localhost:8500 diff --git a/CHANGELOG.md b/CHANGELOG.md index 655c084..054341f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ## [Unreleased] ### Added +- **Post-deploy наблюдение прода + реакция на деградацию** (ORCH-021): конвейер больше не «забывает про прод» после `deploy → done` — раньше «успех» означал прохождение health-check лишь в момент рестарта (~60с-окно хука), и класс инцидентов «зелёный деплой, красный прод» (прецедент ET-8: деградация проявляется через минуты под трафиком, `/health` отвечает `200 ok`, но фича сломана) не ловился. ORCH-021 продлевает ответственность **ЗА** `done`: для применимого репозитория после терминального перехода армится наблюдение окна `post_deploy_window_s` (~15 мин) с интервалом `post_deploy_interval_s`; деградация фиксируется детерминированными порогами, при подтверждении — реакция. 
Новый leaf-модуль `src/post_deploy.py` (контракт «never raise», по образцу `self_deploy.py`/`staging_verdict.py`; импортирует только config + lazy `qg.checks.is_self_hosting_repo`): `post_deploy_applies` (условность раската), `probe_signals` (один опрос `/health` 200+`{"status":"ok"}` + доля 5xx на `/status`,`/queue`; сеть/таймаут → консервативный провал, не исключение), `classify` (чистая, главный предмет юнит-тестов: `DEGRADED` ⇔ `≥ post_deploy_fail_threshold` ПОСЛЕДОВАТЕЛЬНЫХ провалов health ИЛИ доля 5xx окна `> post_deploy_5xx_threshold`; иначе `HEALTHY` — одиночный глюк не откатывает), `decide_action` (self-hosting → ВСЕГДА `ALERT_ONLY`; не-self + `post_deploy_auto_rollback=true` → `ROLLBACK`; иначе `ALERT_ONLY`), `map_rollback_exit_code` (`0→ROLLBACK_OK`, иначе `ROLLBACK_FAILED`), sentinel-state хелперы (`armed`/`series`/`done` под `/.post-deploy-state-//`, restart-safe счётчики), `build_rollback_command`/`run_rollback` (ssh-хук `--rollback` с прод-env, синхронно — только для не-self), `build/write_post_deploy_log` (артефакт `16-post-deploy-log.md`), `arm_monitor` (идемпотентный арм + первый отложенный job), `status` (снимок для `/queue`). **Механизм наблюдения — reserved-agent job `post-deploy-monitor`** (детерминированный, no-LLM, калька `deploy-finalizer`, НЕ стадия и НЕ daemon): арм в `stage_engine.advance_stage` в блоке `next_stage == "done"` ПОСЛЕ terminal-sync/release-lease (`post_deploy.arm_monitor`, sentinel `armed` = идемпотентность при двойном webhook/reconciler/finalizer); один тик = один job — перехват в `agents/launcher.launch_job` ДО `_spawn` → `stage_engine.run_post_deploy_monitor` (один опрос → append в `series` → `classify` → перепостановка с задержкой `available_at_delay_s` ИЛИ реакция+артефакт+`mark_done`); бюджет тиков `window_s/interval_s` (анти-livelock). 
**Self-hosting safety (BR-5):** для `orchestrator` тик НИКОГДА не откатывает/рестартит прод-контейнер — реакция всегда `ALERT_ONLY` (громкий Telegram + Plane-коммент с запросом ручного approve); авто-rollback хуком `--rollback` — только для не-self репо при `post_deploy_auto_rollback=true` (целевой контейнер ≠ orchestrator). Наблюдаемость — блок `post_deploy` в `GET /queue` (enabled/window/interval/активные наблюдения). Артефакт `16-post-deploy-log.md` (YAML-frontmatter `post_deploy_status`/`action_taken`/`window_s`/`checks_total`/`checks_failed`) — машиночитаемо для петли уроков ORCH-8; best-effort. Новые настройки: `ORCH_POST_DEPLOY_MONITOR_ENABLED` (true, kill-switch), `ORCH_POST_DEPLOY_REPOS` (CSV; пусто → только self-hosting), `ORCH_POST_DEPLOY_WINDOW_S` (900), `ORCH_POST_DEPLOY_INTERVAL_S` (30), `ORCH_POST_DEPLOY_FAIL_THRESHOLD` (3), `ORCH_POST_DEPLOY_5XX_THRESHOLD` (0.5), `ORCH_POST_DEPLOY_AUTO_ROLLBACK` (false), `ORCH_POST_DEPLOY_BASE_URL` (http://localhost:8500); параметры отката переиспользуют `deploy_prod_*`. Инварианты НЕ менялись: `STAGE_TRANSITIONS`, реестр `QG_CHECKS`, `check_deploy_status`/`_parse_deploy_status`, terminal-sync `deploy→done`, merge-gate, exit-код-контракт хука (0/1/2), схема БД (без миграций; состояние — sentinel-файлы). Условность как ORCH-35/36/43/58. ADR `docs/work-items/ORCH-021/06-adr/ADR-001-post-deploy-monitor.md`, глобальный `docs/architecture/adr/adr-0010-post-deploy-monitor.md`. Тесты: `tests/test_post_deploy.py`, `tests/test_post_deploy_integration.py`. 
- **Провенанс staging-образа перед BUILD-ONCE retag в прод (свежесть артефакта, INV-FRESH)** (ORCH-058): BUILD-ONCE retag (ORCH-036) промоутит staging-образ (`orchestrator-orchestrator-staging`) в прод **без rebuild**, полагаясь на «образ свеж и провалидирован» — гарантии не было: конвейер нигде не пересобирал staging-образ из провалидированного коммита, поэтому retag мог тихо промоутнуть УСТАРЕВШИЙ образ (инцидент LESSONS_ORCH-036 п.4 — зелёный деплой молча откатывал прод). Закрыто **двумя слоями (defense in depth), только для self-hosting**. Новый модуль `src/image_freshness.py` (контракт «never raise», по образцу `merge_gate`): `provenance_verdict` (чистая функция вердикта match/mismatch/fail-closed), `validated_revision` (`git rev-parse HEAD` в worktree валидированного коммита — единый якорь и для штампа A, и для `EXPECTED_REVISION` B), `image_revision` (OCI-лейбл `org.opencontainers.image.revision` через `docker image inspect`, ``/ошибка → пусто), `rebuild_staging_image` (ssh-хук `--build-staging`), `image_freshness_applies` (условность), `check_staging_image_fresh` (композитный QG). **Strategy A (liveness):** новый детерминированный QG-под-чек `check_staging_image_fresh` (зарегистрирован в `QG_CHECKS`, `src/qg/checks.py`) на ребре `deploy-staging → deploy` ПОСЛЕ merge-gate и ДО Phase A — пересобирает staging-образ из worktree валидированного коммита (хук `--build-staging`, `--build-arg GIT_SHA=`), пересоздаёт 8501 и прогоняет `staging_check.py --mode stub` против свежего 8501 (health + e2e, внутри staging-контейнера через `docker exec` — канон ORCH-048) → валидируем РОВНО тот артефакт (build + e2e), что промоутится в прод (AC-4); FAIL/не-ноль staging_check → откат на `development` (как merge-gate, кап `MAX_DEVELOPER_RETRIES`). `rebuild_staging_image` пробрасывает в хук **явный** staging-таргет (service/port/profile/container), исключая дрейф на прод 8500. Сборки/recreate/validate — **только staging (8501)**, прод (8500) не трогается. 
**Strategy B (safety):** `Dockerfile` штампует `LABEL org.opencontainers.image.revision=$GIT_SHA` (`ARG GIT_SHA`); `build_deploy_command` (`src/self_deploy.py`) пробрасывает `EXPECTED_REVISION`; хост-хук шагом 2b ПЕРЕД `docker tag` fail-closed сверяет лейбл `revision` у `SOURCE_IMAGE` с `EXPECTED_REVISION` — несовпадение / пустой лейбл / ошибка inspect → `exit 1` (FAILED → БАГ-8 откат), делает тихий промоут устаревшего образа структурно невозможным даже при проигравшей гонку/отключённой A. Хост-хук `scripts/orchestrator-deploy-hook.sh` расширен **обратно-совместимым** режимом `--build-staging` (пересборка+recreate staging, exit 0/1) и fail-closed guard'ом (активен только при заданном `EXPECTED_REVISION`). Единый kill-switch `ORCH_IMAGE_FRESHNESS_ENABLED` (true) включает A+B **как целое** (нет «B без A» = вечного fail-fast); область — `ORCH_IMAGE_FRESHNESS_REPOS` (CSV; пусто → только self-hosting `orchestrator`). Контракты НЕ менялись: `STAGE_TRANSITIONS` (под-гейт ребра, не стадия), exit-code-контракт хука (0/1/2), `map_exit_code_to_status`, `check_deploy_status`/`_parse_deploy_status`, БАГ-8, terminal-sync, merge-gate; схема БД — без миграций. ADR `docs/work-items/ORCH-058/06-adr/ADR-001-staging-image-provenance.md`, глобальный `docs/architecture/adr/adr-0008-staging-image-provenance.md`. Документация: `docs/architecture/README.md`, `docs/operations/DEPLOY_HOOK.md`, `docs/operations/STAGING.md`, `docs/operations/INFRA.md`, `.env.example`. Тесты: `tests/test_image_freshness.py`, `tests/test_deploy_hook_provenance.py`, `tests/test_deploy_build_once.py` (TC-06), `tests/test_deploy_hook_mapping.py` (TC-09), `tests/test_stage_engine.py::TestImageFreshnessGate`, `tests/test_qg_registry_snapshot.py`, `tests/test_config.py`. 
- **Исполняемый самодеплой стадии `deploy` (стадия дёргает хост-хук, manual-approve)** (ORCH-036): стадия `deploy` перестаёт быть «бумажной» — для self-hosting репозитория `orchestrator` `deploy_status: SUCCESS` означает ДОКАЗАННЫЙ health-ok реального рестарта прод-контейнера (8500), а не декларацию LLM. Критический путь self-restart детерминирован (без LLM), по образцу merge-gate ORCH-043, и разбит на три фазы (`src/stage_engine.py` + новый модуль `src/self_deploy.py`): **Фаза A** (вход в `deploy`) — вместо запуска прод-deployer'а при `deploy_require_manual_approve=true` задача переводится в approval-pending (`set_issue_in_review`) и ждёт ручного approve; restart-safe маркер `approve-requested`. **Фаза B** (человек ставит статус Plane → `Approved`; `advance_stage(deploy, finished_agent=None)`) — запускается **detached host-процесс** (`ssh + setsid` → `scripts/orchestrator-deploy-hook.sh`, чтобы рестарт 8500 пережил гибель контейнера; орк НЕ убивает себя из docker.sock) с build-once retag staging-образа (`SOURCE_IMAGE`), ставится детерминированный **finalizer-job**; маркер `initiated` — идемпотентность повторного Approved. **Фаза C** (`run_deploy_finalizer`, reserved-agent `deploy-finalizer`, claim'ится новым контейнером после рестарта) — читает sentinel `result` (exit-code хука, записан host-обёрткой), `not-ready` → defer (бюджет `deploy_finalize_max_attempts`, restart-safe по `task_content`), маппит `0→SUCCESS / 1|2|иное→FAILED` (чистая функция `map_exit_code_to_status`, unit-тест), пишет `14-deploy-log.md` и вызывает `advance_stage(deploy, finished_agent="deployer")` → существующие контракты: `SUCCESS → done` + release merge-lease, `FAILED → откат БАГ-8 на development` + `set_issue_blocked`. Уведомления Plane+Telegram на approve-request / initiate / success / rollback (BR-5, ни одного «молчаливого» деплоя). 
Хост-хук `scripts/orchestrator-deploy-hook.sh` расширен **обратно-совместимым** `SOURCE_IMAGE`: при заданном — `docker tag $SOURCE_IMAGE $TARGET_IMAGE` перед `up -d --no-build` (деплой РОВНО протестированного образа, без `docker build`); не задан → прежнее поведение; exit-code-контракт (0/1/2) и health-loop (10×6с, авто-rollback) не тронуты. Restart-safe состояние — sentinel-файлы (`/.deploy-state-//`), без миграции БД. Условность как ORCH-35: реальный самодеплой только для `is_self_hosting_repo("orchestrator")`; прочие репо (enduro-trails) — прежний синхронный ssh-путь агентом. Контракты НЕ менялись: `STAGE_TRANSITIONS`, реестр `QG_CHECKS`, `check_deploy_status`/`_parse_deploy_status` (frontmatter-only), terminal-sync `deploy→done`, merge-gate (ORCH-43), БАГ-8. Флаг `DEPLOY_REQUIRE_MANUAL_APPROVE` остаётся `true` (полный авто — отдельная задача ORCH-54). Новые настройки: `ORCH_DEPLOY_REQUIRE_MANUAL_APPROVE` (true), `ORCH_DEPLOY_SSH_USER`, `ORCH_DEPLOY_SSH_HOST`, `ORCH_DEPLOY_HOOK_SCRIPT`, `ORCH_DEPLOY_PROD_SOURCE_IMAGE`, `ORCH_DEPLOY_PROD_TARGET_SERVICE/PORT/IMAGE`, `ORCH_DEPLOY_FINALIZE_DELAY_S`, `ORCH_DEPLOY_FINALIZE_MAX_ATTEMPTS`. ADR `docs/work-items/ORCH-036/06-adr/ADR-001-executable-self-deploy.md`, глобальный `docs/architecture/adr/adr-0007-executable-self-deploy.md`. Документация: `.openclaw/agents/deployer.md` (стадия `deploy` = вызов хука, запрет self-restart), `docs/operations/INFRA.md`, `docs/operations/DEPLOY_HOOK.md`. Тесты: `tests/test_deploy_hook_mapping.py`, `tests/test_deploy_approve.py`, `tests/test_deploy_routing.py`, `tests/test_deploy_rollback.py`, `tests/test_deploy_notifications.py`, `tests/test_deploy_build_once.py`, `tests/test_deploy_terminal_sync.py`, `tests/test_staging_precondition.py`, `tests/test_deploy_hook_rollback_sim.py`. 
- **Sweeper потерянных webhook (реконсиляция застрявших стадий)** (ORCH-053): фоновый daemon-поток `src/reconciler.py` (паттерн `queue_worker`), который устраняет тихое застревание задач, когда конвейер не двигается из-за потерянного события (502 на ребилде инстанса, отсутствие ретраев у Plane/Gitea, неразрезолвленный `sha→branch` — класс инцидента ORCH-044). Реконсилятор периодически (`reconcile_interval_s`) доигрывает пропущенный переход **через те же штатные гейты/обработчики**, что и webhook, не дублируя логику конвейера: **F-1 gate-side** (`reconcile_gate_once`) — для задач `stage≠done`, без активного job и `age(updated_at) ≥ grace_for_stage(stage)` делает read-only пред-оценку канонического QG стадии; зелёный → продвижение строго через неизменный `stage_engine.advance_stage(..., finished_agent=None)`; красный → тишина (спам нотификаций структурно невозможен — `advance_stage` на красном гейте не вызывается вовсе); `analysis` F-1 не трогает (человеческий гейт). **F-2 plane-side** (`reconcile_plane_once`) — опрос Plane API per-project (новый `plane_sync.list_issues_by_state`, курсорная пагинация, never-raise) и реплей In Progress / Approved / Rejected через существующие `webhooks.plane.handle_status_start` / `handle_verdict` (async-обработчики вызываются из sync-потока через `asyncio.run`). **F-3** — усиление `sha→branch` в `handle_ci_status`: при неразрезолвленном sha — БД-fallback по единственной development-задаче repo (`db.get_development_tasks_by_repo`; неоднозначность → не резолвим, ложного матча нет), `logger.debug`→`logger.info` для видимости потерянного CI-события. Анти-дубль на создании задачи (`db.create_task_atomic` под process-wide `threading.Lock`: SELECT-exists→INSERT, проигравший в гонке reconcile↔webhook не плодит второй task/branch/worktree/стартовый analyst-job). Старт/стоп в `main.lifespan` (после `worker.start()` / перед `worker.stop()`), restart-safe, never-raise на единицу работы. 
Наблюдаемость (F-4): при разблокировке — лог-строка `reconciler: разблокирована (потерян webhook)` + Telegram (`reconcile_notify_unblock`) и блок `reconcile` в `GET /queue`. Kill-switches: `ORCH_RECONCILE_ENABLED` (глобально), `ORCH_RECONCILE_PLANE_ENABLED` (гасит только F-2), `ORCH_RECONCILE_INTERVAL_S` (120), `ORCH_RECONCILE_GRACE_DEFAULT_S` (600), `ORCH_RECONCILE_GRACE_OVERRIDES_JSON` (per-stage), `ORCH_RECONCILE_NOTIFY_UNBLOCK` (true). Схема БД и реестры (`STAGE_TRANSITIONS`/`QG_CHECKS`) НЕ менялись. ADR `docs/work-items/ORCH-053/06-adr/ADR-001-stuck-task-reconciler.md`, глобальный `docs/architecture/adr/adr-0007-reconciler.md`. Тесты: `tests/test_reconciler.py`, `tests/test_reconciler_plane.py`, `tests/test_gitea_sha_resolve.py`, `tests/test_config.py`. diff --git a/CLAUDE.md b/CLAUDE.md index 1a9f279..63cf19e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -47,7 +47,7 @@ created → analysis → architecture → development → review → testing → - Машинные вердикты Quality Gate — строго YAML-frontmatter (`verdict:`, `deploy_status:`, `staging_status:`), никогда проза ## Артефакты задачи (`docs/work-items//`) -`00-business-request.md`, `01-brd.md`, `02-trz.md`, `03-acceptance-criteria.md`, `04-test-plan.yaml`, `06-adr/ADR-NNN-slug.md`, `07-infra-requirements.md`, `08-data-requirements.md`, `10-tech-risks.md`, `12-review.md`, `13-test-report.md`, `14-deploy-log.md`, `15-staging-log.md`. +`00-business-request.md`, `01-brd.md`, `02-trz.md`, `03-acceptance-criteria.md`, `04-test-plan.yaml`, `06-adr/ADR-NNN-slug.md`, `07-infra-requirements.md`, `08-data-requirements.md`, `10-tech-risks.md`, `12-review.md`, `13-test-report.md`, `14-deploy-log.md`, `15-staging-log.md`, `16-post-deploy-log.md` (post-deploy наблюдение, ORCH-021). ## Правила для агентов 1. Перед любым действием прочесть этот файл и `docs/architecture/README.md`. 
diff --git a/docs/architecture/README.md b/docs/architecture/README.md index 11a6e47..4ae2094 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/README.md @@ -91,7 +91,7 @@ sentinel-файлы (`/.deploy-state-//`), без мигр Подробнее: [adr-0007](adr/adr-0007-executable-self-deploy.md), детально — `docs/work-items/ORCH-036/06-adr/ADR-001-executable-self-deploy.md`. -### Post-deploy наблюдение прода + реакция на деградацию (ORCH-021 — design) +### Post-deploy наблюдение прода + реакция на деградацию (ORCH-021 — реализовано) Конвейер заканчивался на `deploy → done` и **забывал про прод**: «успех» = health-check в момент рестарта (~60с). Класс «зелёный деплой, красный прод» (прецедент ET-8 — деградация через минуты под трафиком, health `200 ok`, фича сломана). ORCH-021 продлевает @@ -247,4 +247,4 @@ never-raise на единицу работы; тишина при синхрон Схема БД, потоки данных, resilience-слой, детали Dockerfile — [internals.md](internals.md). --- -*Актуально на 2026-06-07. Обновлять при изменении src/stages.py, src/qg/checks.py, src/main.py. 
Статусы доработок: ORCH-036 (исполняемый самодеплой `deploy`, adr-0007) — реализовано; ORCH-043 (merge-gate, adr-0006) — design, ветка feature/ORCH-043; ORCH-053 (reconciler, adr-0007, src/reconciler.py) — реализовано; ORCH-060 (F-1 skip escalated/Blocked/Needs-Input, `docs/work-items/ORCH-060/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-060 (Guard 1 `developer_retry_count>=MAX_DEVELOPER_RETRIES` + Guard 2 `plane_sync.fetch_issue_state` Blocked/Needs-Input, флаг `ORCH_RECONCILE_SKIP_BLOCKED_ENABLED`); ORCH-058 (провенанс staging-образа: check_staging_image_fresh + staging_check свежего образа + хук-guard, adr-0008) — реализовано в ветке feature/ORCH-058 (обновлять также при изменении src/image_freshness.py, scripts/orchestrator-deploy-hook.sh, Dockerfile); ORCH-061 (толерантность staging-вердикта к инфра-FAIL C9a/C9b, adr-0009, `docs/work-items/ORCH-061/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-061 (обновлять также при изменении src/staging_verdict.py, scripts/staging_check.py, флаг staging_infra_tolerance_enabled); ORCH-021 (post-deploy наблюдение прода + реакция на деградацию, adr-0010, `docs/work-items/ORCH-021/06-adr/ADR-001`) — design, ветка feature/ORCH-021-post-deploy-rollback (`arch:major-change`; при реализации обновлять также при изменении src/post_deploy.py, src/stage_engine.py арм/run_post_deploy_monitor, src/agents/launcher.py перехват, флаги post_deploy_*; артефакт 16-post-deploy-log.md).* +*Актуально на 2026-06-07. Обновлять при изменении src/stages.py, src/qg/checks.py, src/main.py. 
Статусы доработок: ORCH-036 (исполняемый самодеплой `deploy`, adr-0007) — реализовано; ORCH-043 (merge-gate, adr-0006) — design, ветка feature/ORCH-043; ORCH-053 (reconciler, adr-0007, src/reconciler.py) — реализовано; ORCH-060 (F-1 skip escalated/Blocked/Needs-Input, `docs/work-items/ORCH-060/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-060 (Guard 1 `developer_retry_count>=MAX_DEVELOPER_RETRIES` + Guard 2 `plane_sync.fetch_issue_state` Blocked/Needs-Input, флаг `ORCH_RECONCILE_SKIP_BLOCKED_ENABLED`); ORCH-058 (провенанс staging-образа: check_staging_image_fresh + staging_check свежего образа + хук-guard, adr-0008) — реализовано в ветке feature/ORCH-058 (обновлять также при изменении src/image_freshness.py, scripts/orchestrator-deploy-hook.sh, Dockerfile); ORCH-061 (толерантность staging-вердикта к инфра-FAIL C9a/C9b, adr-0009, `docs/work-items/ORCH-061/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-061 (обновлять также при изменении src/staging_verdict.py, scripts/staging_check.py, флаг staging_infra_tolerance_enabled); ORCH-021 (post-deploy наблюдение прода + реакция на деградацию, adr-0010, `docs/work-items/ORCH-021/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-021-post-deploy-rollback (reserved-agent job `post-deploy-monitor`: арм в src/stage_engine.py блок `next_stage == "done"`, тик `run_post_deploy_monitor` + перехват в src/agents/launcher.py ДО _spawn; чистая логика src/post_deploy.py never-raise; флаги `post_deploy_*` в src/config.py; блок `post_deploy` в `/queue`; артефакт 16-post-deploy-log.md; self-hosting всегда ALERT_ONLY — тик не рестартит прод; обновлять также при изменении src/post_deploy.py / арм-блока / launcher-перехвата).* diff --git a/src/agents/launcher.py b/src/agents/launcher.py index 31454ef..ec957d8 100644 --- a/src/agents/launcher.py +++ b/src/agents/launcher.py @@ -249,6 +249,11 @@ class AgentLauncher: """ if job.get("agent") == "deploy-finalizer": return self._run_deploy_finalizer_job(job) + # ORCH-021: the 
reserved-agent `post-deploy-monitor` is also a + # DETERMINISTIC (no-LLM) tick — intercept it BEFORE _spawn and run one + # observation tick synchronously. Returns None (no agent_run row). + if job.get("agent") == "post-deploy-monitor": + return self._run_post_deploy_monitor_job(job) return self._spawn( job["agent"], job["repo"], @@ -278,6 +283,27 @@ class AgentLauncher: pass return None + def _run_post_deploy_monitor_job(self, job: dict): + """ORCH-021: run one deterministic post-deploy monitor tick for a job. + + Not an LLM spawn — there is no subprocess/monitor, so we mark the jobs row + done/failed here. The tick never-raises, but we guard anyway so a monitor + fault can never wedge the worker / starve other projects (AC-16). + """ + from ..db import mark_job + from .. import stage_engine + try: + stage_engine.run_post_deploy_monitor(job) + mark_job(job["id"], "done") + logger.info(f"post-deploy-monitor job {job['id']} done") + except Exception as e: + logger.error(f"post-deploy-monitor job {job['id']} failed: {e}") + try: + mark_job(job["id"], "failed", error=f"post-deploy-monitor error: {e}") + except Exception: + pass + return None + def _spawn(self, agent: str, repo: str, task_content: str = None, task_id: int = None, job_id: int = None) -> int: """Shared spawn implementation for launch() and launch_job(). diff --git a/src/config.py b/src/config.py index 1a0612b..1161959 100644 --- a/src/config.py +++ b/src/config.py @@ -265,6 +265,37 @@ class Settings(BaseSettings): reconcile_notify_unblock: bool = True reconcile_skip_blocked_enabled: bool = True + # ORCH-021: post-deploy production monitoring + degradation reaction. After + # the terminal deploy->done transition for an applicable repo, a reserved-agent + # `post-deploy-monitor` job (no LLM, modelled on deploy-finalizer) probes prod + # over a window and reacts to a degradation the restart-time health-check + # missed (class "green deploy, red prod", precedent ET-8). 
State is in sentinel + # files (.post-deploy-state-<repo>/<work_item>/), no DB migration. See + # docs/architecture/adr/adr-0010-post-deploy-monitor.md. + # post_deploy_monitor_enabled -> global kill-switch (BR-8); False -> the + # pipeline is 1:1 as before ORCH-021 (no arm). + # post_deploy_repos -> CSV of repos where monitoring is REAL; empty + # -> only the self-hosting repo (orchestrator). + # Mirrors self_deploy_repos / merge_gate_repos. + # post_deploy_window_s -> observation window length (~15 min, BR-1). + # post_deploy_interval_s -> seconds between probe ticks. + # post_deploy_fail_threshold -> N CONSECUTIVE health failures -> DEGRADED. + # post_deploy_5xx_threshold -> window 5xx ratio above this -> DEGRADED. + # post_deploy_auto_rollback -> globally allow auto-rollback; True acts ONLY + # for non-self repos. For self-hosting the + # reaction is ALWAYS ALERT_ONLY (BR-5) — a tick + # NEVER restarts the prod orchestrator container. + # post_deploy_base_url -> base URL of the observed prod instance. + # Rollback target params reuse the existing deploy_prod_* settings (no dupes). + post_deploy_monitor_enabled: bool = True + post_deploy_repos: str = "" + post_deploy_window_s: int = 900 + post_deploy_interval_s: int = 30 + post_deploy_fail_threshold: int = 3 + post_deploy_5xx_threshold: float = 0.5 + post_deploy_auto_rollback: bool = False + post_deploy_base_url: str = "http://localhost:8500" + # Telegram notifications telegram_bot_token: str = "" telegram_chat_id: str = "" diff --git a/src/main.py b/src/main.py index 0d9314d..c21e5b2 100644 --- a/src/main.py +++ b/src/main.py @@ -123,11 +123,13 @@ async def queue(): from .db import job_status_counts, recent_jobs from .queue_worker import worker from .reconciler import reconciler + from . 
import post_deploy return { "counts": job_status_counts(), "max_concurrency": worker.max_concurrency, "poll_interval": worker.poll_interval, "resilience": worker.status(), "reconcile": reconciler.status(), + "post_deploy": post_deploy.status(), "recent": recent_jobs(10), } diff --git a/src/post_deploy.py b/src/post_deploy.py new file mode 100644 index 0000000..75afe42 --- /dev/null +++ b/src/post_deploy.py @@ -0,0 +1,614 @@ +"""Post-deploy production monitoring + degradation reaction (ORCH-021). + +The pipeline used to end at ``deploy -> done`` and then **forget about prod**: +"success" meant the health-check passed at restart (~60s window in +``scripts/orchestrator-deploy-hook.sh``). The class of incidents "green deploy, +red prod" (precedent ET-8 — degradation appears minutes later under real +traffic; ``/health`` answers ``200 ok`` while the feature is broken) was never +caught. ORCH-021 extends responsibility **PAST** ``done``: after the terminal +transition for an applicable repo we arm an observation window +(``post_deploy_window_s`` ~15 min, interval ``post_deploy_interval_s``); +degradation is detected by deterministic thresholds and, when confirmed, +triggers a reaction. + +The observation mechanism (ADR-001 §1, Variant B) is a **reserved-agent job** +``post-deploy-monitor`` — a deterministic, no-LLM job modelled exactly on +``deploy-finalizer``. One "tick" == one job: it does ONE probe, appends to a +persisted ``series`` file, classifies, and either re-queues itself with a delay +(``available_at_delay_s``) or finishes (DEGRADED -> reaction; or window expired +-> HEALTHY). Between ticks no job runs (it is scheduled in the future), so the +single worker stays free for other projects — exactly like the finalizer defer. 
+ +This module is a **leaf** (mirrors ``self_deploy.py`` / ``staging_verdict.py``): +it imports only config (and lazily ``qg.checks.is_self_hosting_repo``), never +``stage_engine`` / ``launcher`` — the orchestration that needs those lives in +``stage_engine.run_post_deploy_monitor``. Every public helper honours a +**never-raise** contract so a monitoring hiccup can never crash the worker / +lifespan / the pipeline of other projects (AC-16). + +Restart-safe state lives in sentinel files under +``{repos_dir}/.post-deploy-state-{repo}/{work_item_id}/`` (mirrors the +deploy-state pattern, no DB migration — ТЗ §2.7): + * ``armed`` — monitoring armed for this work item (idempotency-guard, AC-15); + * ``series`` — JSON list of probe results (restart-safe streak/5xx counters); + * ``done`` — monitoring finished (anti-dupe, AC-15). + +Self-hosting safety (BR-5 / AC-8): a monitor tick NEVER auto-rolls-back or +restarts the prod ``orchestrator`` container — for ``orchestrator`` the reaction +is ALWAYS ``ALERT_ONLY`` (loud Telegram + Plane, manual approve). +""" + +from __future__ import annotations + +import glob +import json +import logging +import os +import shlex +import subprocess +import urllib.error +import urllib.request +from dataclasses import dataclass + +from .config import settings + +logger = logging.getLogger("orchestrator.post_deploy") + +# Sentinel marker filenames (see module docstring). +ARMED = "armed" +SERIES = "series" +DONE = "done" + +# Verdicts (classify). +HEALTHY = "HEALTHY" +DEGRADED = "DEGRADED" + +# Reaction decisions (decide_action). +NONE = "NONE" +ROLLBACK = "ROLLBACK" +ALERT_ONLY = "ALERT_ONLY" + +# action_taken values written to the artefact frontmatter. +ROLLBACK_OK = "ROLLBACK_OK" +ROLLBACK_FAILED = "ROLLBACK_FAILED" + +# The 5xx-monitored endpoints (besides /health, whose 200+ok is its own signal).
+_FIVEXX_ENDPOINTS = ("/status", "/queue") + +_PROBE_TIMEOUT = 5 +_SSH_TIMEOUT = 60 +_GIT_TIMEOUT = 60 + + +# --------------------------------------------------------------------------- +# Conditionality (mirrors self_deploy_applies / _merge_gate_applies) +# --------------------------------------------------------------------------- +def post_deploy_applies(repo: str) -> bool: + """Whether post-deploy monitoring is REAL for this repo (AC-2 / AC-10). + + Mirrors the ORCH-35/36/43/58 conditional rollout: + * ``post_deploy_monitor_enabled=False`` -> always False (global + kill-switch); the pipeline is 1:1 as before ORCH-021 (AC-10). + * ``post_deploy_repos`` (CSV) non-empty -> real only for listed repos. + * empty CSV -> real ONLY for the self-hosting repo (``orchestrator``). + Never raises. + """ + try: + if not settings.post_deploy_monitor_enabled: + return False + raw = (settings.post_deploy_repos or "").strip() + if raw: + allowed = {r.strip().lower() for r in raw.split(",") if r.strip()} + return (repo or "").strip().lower() in allowed + # Lazy import keeps this module a leaf (avoid importing qg at load time). + from .qg.checks import is_self_hosting_repo + return is_self_hosting_repo(repo) + except Exception as e: # noqa: BLE001 - never-raise contract + logger.warning("post_deploy_applies error for %s: %s", repo, e) + return False + + +# --------------------------------------------------------------------------- +# Signal probe (one tick) +# --------------------------------------------------------------------------- +@dataclass +class ProbeResult: + """Outcome of ONE probe tick (JSON-serialisable via ``as_dict``). + + ``health_ok`` — ``/health`` answered HTTP 200 with ``{"status": "ok"}``. + ``total`` — number of 5xx-monitored endpoints probed (``/status``, + ``/queue``) — the denominator of the window 5xx ratio. + ``fivexx`` — how many of those returned 5xx (or were unreachable, which + is conservatively counted as a server failure). 
+ ``detail`` — human-readable note (logs / artefact body). + """ + + health_ok: bool + total: int + fivexx: int + detail: str = "" + + def as_dict(self) -> dict: + return { + "health_ok": bool(self.health_ok), + "total": int(self.total), + "fivexx": int(self.fivexx), + "detail": str(self.detail), + } + + +def _http_status(url: str) -> tuple[int, str]: + """GET ``url`` -> (http_code, body). Network/timeout -> (0, ""). + + Never raises. ``urllib`` raises ``HTTPError`` for >=400 responses; we treat + that as a real status code (so a 5xx is observed, not swallowed). + """ + try: + with urllib.request.urlopen(url, timeout=_PROBE_TIMEOUT) as resp: # noqa: S310 + body = resp.read(4096).decode("utf-8", "replace") + return int(getattr(resp, "status", resp.getcode())), body + except urllib.error.HTTPError as e: + try: + body = e.read(4096).decode("utf-8", "replace") + except Exception: + body = "" + return int(e.code), body + except Exception as e: # noqa: BLE001 - URLError / socket timeout / anything + logger.warning("post_deploy probe error for %s: %s", url, e) + return 0, "" + + +def probe_signals(base_url: str) -> ProbeResult: + """Probe ``/health`` + the key endpoints of the prod instance ONCE (AC-16). + + ``/health`` is healthy iff HTTP 200 AND the body parses to + ``{"status": "ok"}``. ``/status`` and ``/queue`` contribute to the window + 5xx ratio: an HTTP 5xx OR an unreachable endpoint (network error / timeout, + code 0) is counted as a failure (conservative — a down server is bad). A + network failure yields a conservative "failed" probe, NEVER an exception + (TC-14). 
+ """ + base = (base_url or "").rstrip("/") + # --- /health: the primary liveness signal --- + code, body = _http_status(base + "/health") + health_ok = False + if code == 200: + try: + health_ok = json.loads(body).get("status") == "ok" + except Exception: + health_ok = False + # --- /status, /queue: 5xx ratio over the window --- + total = 0 + fivexx = 0 + for ep in _FIVEXX_ENDPOINTS: + total += 1 + ep_code, _ = _http_status(base + ep) + if ep_code == 0 or 500 <= ep_code <= 599: + fivexx += 1 + detail = f"health={code}({'ok' if health_ok else 'bad'}) 5xx={fivexx}/{total}" + return ProbeResult(health_ok=health_ok, total=total, fivexx=fivexx, detail=detail) + + +# --------------------------------------------------------------------------- +# Classification (pure, no I/O — the MAIN unit-test subject, like +# compute_staging_verdict in ORCH-061) +# --------------------------------------------------------------------------- +def classify(series, fail_threshold: int, fivexx_threshold: float) -> str: + """Fold a probe series into ``HEALTHY`` | ``DEGRADED`` (deterministic, pure). + + ``series`` — iterable of probe dicts (``{"health_ok", "total", "fivexx"}``), + as persisted by :func:`append_probe`. + + Decision (BR-3 / AC-3..AC-6): + * ``>= fail_threshold`` CONSECUTIVE health failures -> ``DEGRADED`` (AC-4); + * window 5xx ratio ``sum(fivexx)/sum(total)`` strictly ``> fivexx_threshold`` + -> ``DEGRADED`` even if ``/health`` answers 200 (AC-5); + * otherwise ``HEALTHY`` — a single glitch below the threshold that recovers + does NOT trip (AC-3 / AC-6, no false rollback). + + Never raises: on malformed input it returns ``HEALTHY`` (fail-SAFE — a false + ``DEGRADED`` would trigger an unwanted rollback, the worse outcome). + """ + try: + # Non-list input is malformed -> fail-safe HEALTHY (never a false rollback). + if not isinstance(series, (list, tuple)): + return HEALTHY + # Longest run of consecutive health failures. 
+ streak = 0 + best = 0 + total = 0 + fivexx = 0 + for row in series: + # A non-dict row is malformed: skip it (do NOT count it as a failure, + # which could fabricate a DEGRADED streak from garbage). + if not isinstance(row, dict): + continue + ok = bool(row.get("health_ok")) + total += int(row.get("total") or 0) + fivexx += int(row.get("fivexx") or 0) + if ok: + streak = 0 + else: + streak += 1 + if streak > best: + best = streak + if best >= int(fail_threshold): + return DEGRADED + if total > 0 and (fivexx / total) > float(fivexx_threshold): + return DEGRADED + return HEALTHY + except Exception as e: # noqa: BLE001 - never-raise; fail-safe to HEALTHY + logger.warning("post_deploy classify error: %s", e) + return HEALTHY + + +def decide_action(repo: str, verdict: str) -> str: + """Decide the reaction for ``(repo, verdict)`` (pure, BR-5 / AC-7 / AC-8). + + * ``HEALTHY`` -> ``NONE`` (no reaction, any repo); + * ``DEGRADED`` + self-hosting -> ``ALERT_ONLY`` (ALWAYS — the tick + NEVER auto-rolls-back / restarts the prod orchestrator container, AC-8); + * ``DEGRADED`` + non-self + ``post_deploy_auto_rollback=True`` -> ``ROLLBACK``; + * ``DEGRADED`` + non-self + auto_rollback False (default) -> ``ALERT_ONLY``. + + Never raises: on doubt returns ``ALERT_ONLY`` (never an unexpected rollback). + """ + try: + if verdict != DEGRADED: + return NONE + from .qg.checks import is_self_hosting_repo + if is_self_hosting_repo(repo): + return ALERT_ONLY # BR-5: self-hosting is NEVER auto-rolled-back + if settings.post_deploy_auto_rollback: + return ROLLBACK + return ALERT_ONLY + except Exception as e: # noqa: BLE001 - never-raise; safe default + logger.warning("post_deploy decide_action error for %s: %s", repo, e) + return ALERT_ONLY + + +def map_rollback_exit_code(exit_code) -> str: + """Map a ``--rollback`` hook exit-code to an ``action_taken`` (pure, AC-9). 
+ + Hook exit-code contract (unchanged, 0/1/2): + * ``0`` -> ``ROLLBACK_OK`` (rollback proven healthy); + * ``1`` (no prev image), ``2`` (rollback also failed), anything else, or a + non-int/None -> ``ROLLBACK_FAILED`` (fail-closed -> loud escalation). + """ + try: + code = int(exit_code) + except (TypeError, ValueError): + return ROLLBACK_FAILED + return ROLLBACK_OK if code == 0 else ROLLBACK_FAILED + + +# --------------------------------------------------------------------------- +# Sentinel state (restart-safe, no DB migration — ТЗ §2.7) +# --------------------------------------------------------------------------- +def _state_dir(base: str, repo: str, work_item_id: str | None) -> str: + return os.path.join(base, f".post-deploy-state-{repo}", (work_item_id or "_")) + + +def state_dir(repo: str, work_item_id: str | None) -> str: + """State dir as seen from the container (``settings.repos_dir`` mount).""" + return _state_dir(settings.repos_dir, repo, work_item_id) + + +def host_state_dir(repo: str, work_item_id: str | None) -> str: + """State dir as seen from the HOST (``settings.host_repos_dir``). + + Same physical directory as :func:`state_dir` via the shared mount; the host + path is what we embed in an ssh command if a host-side helper needs it. + """ + return _state_dir(settings.host_repos_dir, repo, work_item_id) + + +def marker_path(repo: str, work_item_id: str | None, name: str) -> str: + return os.path.join(state_dir(repo, work_item_id), name) + + +def has_marker(repo: str, work_item_id: str | None, name: str) -> bool: + """True iff the named sentinel exists. Never raises.""" + try: + return os.path.isfile(marker_path(repo, work_item_id, name)) + except Exception as e: # noqa: BLE001 - never-raise + logger.warning("has_marker error for %s/%s/%s: %s", repo, work_item_id, name, e) + return False + + +def write_marker(repo: str, work_item_id: str | None, name: str, content: str = "") -> bool: + """Create/overwrite a sentinel (best-effort). 
Returns True on success.""" + try: + d = state_dir(repo, work_item_id) + os.makedirs(d, exist_ok=True) + with open(os.path.join(d, name), "w", encoding="utf-8") as f: + f.write(str(content)) + return True + except OSError as e: + logger.warning("write_marker error for %s/%s/%s: %s", repo, work_item_id, name, e) + return False + + +def mark_done(repo: str, work_item_id: str | None) -> bool: + """Mark monitoring finished for this work item (anti-dupe, AC-15).""" + return write_marker(repo, work_item_id, DONE, "done") + + +def read_series(repo: str, work_item_id: str | None) -> list: + """Read the persisted probe series (JSON list). Missing/corrupt -> ``[]``. + + Never raises — restart-safe streak/5xx counters survive a container restart. + """ + p = marker_path(repo, work_item_id, SERIES) + try: + with open(p, "r", encoding="utf-8") as f: + data = json.load(f) + return data if isinstance(data, list) else [] + except FileNotFoundError: + return [] + except Exception as e: # noqa: BLE001 - never-raise; corrupt -> empty + logger.warning("read_series error for %s/%s: %s", repo, work_item_id, e) + return [] + + +def append_probe(repo: str, work_item_id: str | None, probe: ProbeResult) -> list: + """Append a probe to the persisted series and return the new list. + + Best-effort (a write error logs and returns the in-memory list so the tick + still classifies). Never raises. 
+ """ + series = read_series(repo, work_item_id) + try: + series.append(probe.as_dict() if isinstance(probe, ProbeResult) else dict(probe)) + except Exception as e: # noqa: BLE001 + logger.warning("append_probe coerce error for %s/%s: %s", repo, work_item_id, e) + return series + try: + d = state_dir(repo, work_item_id) + os.makedirs(d, exist_ok=True) + with open(os.path.join(d, SERIES), "w", encoding="utf-8") as f: + json.dump(series, f) + except OSError as e: + logger.warning("append_probe write error for %s/%s: %s", repo, work_item_id, e) + return series + + +def arm_monitor(repo: str, work_item_id: str | None, branch: str, task_id: int) -> bool: + """Arm post-deploy monitoring after ``deploy -> done`` (AC-1 / AC-15). + + Idempotent: if the ``armed`` sentinel already exists this is a no-op (a double + webhook / reconciler F-1 / finalizer Phase C can drive ``done`` more than once, + AC-15). Otherwise creates the state dir, writes ``armed`` + an empty ``series``, + and enqueues the FIRST ``post-deploy-monitor`` job with a delay of one interval + (so the prod has settled before the first probe). Returns True iff it armed a + NEW monitor. Never raises — the caller (terminal block of ``advance_stage``) + must never be crashed by a monitoring hiccup. + """ + try: + if has_marker(repo, work_item_id, ARMED): + logger.info("arm_monitor: already armed for %s/%s (no-op)", repo, work_item_id) + return False + write_marker(repo, work_item_id, ARMED, "armed") + # Initialise an empty series so read_series is well-defined from tick 1. + try: + d = state_dir(repo, work_item_id) + os.makedirs(d, exist_ok=True) + with open(os.path.join(d, SERIES), "w", encoding="utf-8") as f: + json.dump([], f) + except OSError as e: + logger.warning("arm_monitor: series init error for %s/%s: %s", repo, work_item_id, e) + # Lazy import keeps this module a leaf (db is a low-level dependency). 
+ from .db import enqueue_job + task_desc = ( + f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n" + f"Stage: post-deploy\nNote: post-deploy monitor tick 1 " + f"(window {settings.post_deploy_window_s}s, interval " + f"{settings.post_deploy_interval_s}s)." + ) + job_id = enqueue_job( + "post-deploy-monitor", repo, task_desc, task_id=task_id, + available_at_delay_s=settings.post_deploy_interval_s, + ) + logger.info( + "arm_monitor: armed post-deploy monitor for %s/%s (job_id=%s)", + repo, work_item_id, job_id, + ) + return True + except Exception as e: # noqa: BLE001 - never-raise contract + logger.error("arm_monitor error for %s/%s: %s", repo, work_item_id, e) + return False + + +def max_ticks() -> int: + """Bounded tick budget for the window (anti-livelock, like + ``deploy_finalize_max_attempts``): ``window_s // interval_s`` (>= 1).""" + try: + interval = max(1, int(settings.post_deploy_interval_s)) + return max(1, int(settings.post_deploy_window_s) // interval) + except Exception: # noqa: BLE001 - never-raise + return 1 + + +# --------------------------------------------------------------------------- +# Rollback command (non-self repos only; reuses deploy_prod_* env — ТЗ §2.4) +# --------------------------------------------------------------------------- +def build_rollback_command(repo: str) -> list[str]: + """Build the ssh argv that runs the deploy hook in ``--rollback`` mode. + + Mirrors ``self_deploy.build_deploy_command`` (same prod-env, INFRA P-2 ssh + target) but the action is ``--rollback`` and the call is SYNCHRONOUS (the + target container is NOT the orchestrator, so it is safe to wait for the hook + exit-code directly — no detached setsid wrapper, no ``result`` sentinel). + Reuses the existing ``deploy_prod_*`` settings; no new duplicate config. 
+ """ + env_assignments = ( + f"TARGET_SERVICE={shlex.quote(settings.deploy_prod_target_service)} " + f"TARGET_PORT={int(settings.deploy_prod_target_port)} " + f"TARGET_IMAGE={shlex.quote(settings.deploy_prod_target_image)} " + f"COMPOSE_PROFILE={shlex.quote(settings.deploy_prod_compose_profile)} " + f"PREV_IMAGE_FILE={shlex.quote(settings.deploy_prod_prev_image_file)}" + ) + inner = ( + f"cd {shlex.quote(settings.deploy_host_repo_path)} && " + f"{env_assignments} " + f"bash {shlex.quote(settings.deploy_hook_script)} --rollback" + ) + user = (settings.deploy_ssh_user or "").strip() + host = (settings.deploy_ssh_host or "").strip() + target = f"{user}@{host}" if user else host + return ["ssh", "-o", "StrictHostKeyChecking=no", target, inner] + + +def run_rollback(repo: str) -> tuple[int, str]: + """Run the ``--rollback`` hook synchronously. Returns ``(exit_code, detail)``. + + Never raises: an ssh launch error / timeout maps to a non-zero exit-code so + the caller records ``ROLLBACK_FAILED`` and escalates (AC-9). NEVER used for + the self-hosting repo (``decide_action`` returns ``ALERT_ONLY`` there) — the + structural guard against a tick restarting the prod orchestrator (AC-8). 
+ """ + cmd = build_rollback_command(repo) + try: + r = subprocess.run(cmd, capture_output=True, text=True, timeout=_SSH_TIMEOUT) + except subprocess.TimeoutExpired: + return 2, "rollback ssh timeout" + except (subprocess.SubprocessError, OSError) as e: + return 2, f"rollback ssh error: {e}" + detail = ((r.stderr or "") + (r.stdout or "")).strip()[:200] + return int(r.returncode), detail + + +# --------------------------------------------------------------------------- +# Artefact 16-post-deploy-log.md (machine-readable frontmatter — ТЗ §2.5) +# --------------------------------------------------------------------------- +def build_post_deploy_log( + work_item_id: str, + status: str, + action_taken: str, + window_s: int, + checks_total: int, + checks_failed: int, + body_extra: str = "", +) -> str: + """Render a 16-post-deploy-log.md body. Only the YAML-frontmatter is machine + read (canon of gates; the loop-of-lessons ORCH-8 consumes it, BR-10). The + body is informational. Parseable by ``yaml.safe_load`` (AC-13). + """ + return ( + "---\n" + f"post_deploy_status: {status}\n" + f"action_taken: {action_taken}\n" + f"work_item: {work_item_id}\n" + f"window_s: {int(window_s)}\n" + f"checks_total: {int(checks_total)}\n" + f"checks_failed: {int(checks_failed)}\n" + "---\n\n" + "# Post-deploy log — ORCH-021 post-deploy monitor\n\n" + f"Наблюдение прода завершено: `post_deploy_status: {status}`, " + f"`action_taken: {action_taken}`.\n\n" + f"Окно наблюдения: {int(window_s)}s; опросов всего: {int(checks_total)}, " + f"из них с провалом: {int(checks_failed)}.\n" + f"{body_extra}" + ) + + +def write_post_deploy_log( + repo: str, + work_item_id: str, + branch: str, + status: str, + action_taken: str, + window_s: int, + checks_total: int, + checks_failed: int, + body_extra: str = "", +) -> bool: + """Write 16-post-deploy-log.md into the task worktree and best-effort + commit+push it. Returns True iff the file was written. 
Never raises — the + artefact is best-effort, its absence rolls nothing back (AC-13 / TC-15). + """ + from .git_worktree import get_worktree_path + + rel = f"docs/work-items/{work_item_id}/16-post-deploy-log.md" + try: + wt = get_worktree_path(repo, branch) + except Exception as e: # noqa: BLE001 - never-raise + logger.error("write_post_deploy_log: worktree error for %s/%s: %s", repo, branch, e) + return False + + path = os.path.join(wt, rel) + content = build_post_deploy_log( + work_item_id, status, action_taken, window_s, checks_total, checks_failed, body_extra + ) + try: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + f.write(content) + except OSError as e: + logger.error("write_post_deploy_log: write error at %s: %s", path, e) + return False + + git_env = { + **os.environ, + "HOME": "/home/slin", + "GIT_AUTHOR_NAME": "post-deploy-monitor", + "GIT_AUTHOR_EMAIL": "post-deploy-monitor@mva154.local", + "GIT_COMMITTER_NAME": "post-deploy-monitor", + "GIT_COMMITTER_EMAIL": "post-deploy-monitor@mva154.local", + } + try: + subprocess.run(["git", "-C", wt, "add", rel], + capture_output=True, timeout=_GIT_TIMEOUT, env=git_env) + commit = subprocess.run( + ["git", "-C", wt, "commit", "-m", + f"docs(ORCH-021): post-deploy {status}/{action_taken} for {work_item_id}"], + capture_output=True, text=True, timeout=_GIT_TIMEOUT, env=git_env, + ) + if commit.returncode == 0: + subprocess.run(["git", "-C", wt, "push", "origin", branch], + capture_output=True, timeout=_GIT_TIMEOUT, env=git_env) + except (subprocess.SubprocessError, OSError) as e: + logger.warning("write_post_deploy_log: git commit/push best-effort failed: %s", e) + return True + + +# --------------------------------------------------------------------------- +# Observability snapshot for GET /queue (BR-9 / AC-14) +# --------------------------------------------------------------------------- +def status() -> dict: + """Post-deploy snapshot for /queue observability. 
Never raises. + + ``active`` — work items with an ``armed`` sentinel but no ``done`` yet (a + monitoring window in flight). ``last_outcome`` — best-effort last finished + window read from the most-recent ``done`` state dir's series length. + """ + snap = { + "enabled": False, + "window_s": None, + "interval_s": None, + "repos": "", + "active": [], + "active_count": 0, + } + try: + snap["enabled"] = bool(settings.post_deploy_monitor_enabled) + snap["window_s"] = int(settings.post_deploy_window_s) + snap["interval_s"] = int(settings.post_deploy_interval_s) + snap["repos"] = settings.post_deploy_repos or "" + pattern = os.path.join(settings.repos_dir, ".post-deploy-state-*", "*") + active: list[str] = [] + for d in glob.glob(pattern): + try: + if not os.path.isdir(d): + continue + if os.path.isfile(os.path.join(d, ARMED)) and not os.path.isfile( + os.path.join(d, DONE) + ): + active.append(os.path.basename(d)) + except Exception: # noqa: BLE001 - skip one dir + continue + snap["active"] = sorted(active) + snap["active_count"] = len(active) + except Exception as e: # noqa: BLE001 - never-raise + logger.warning("post_deploy status snapshot error: %s", e) + return snap diff --git a/src/stage_engine.py b/src/stage_engine.py index 9cc3b1a..df84ca5 100644 --- a/src/stage_engine.py +++ b/src/stage_engine.py @@ -37,6 +37,7 @@ from .review_parse import extract_review_findings, extract_test_failures from .qg.checks import QG_CHECKS from . import merge_gate from . import self_deploy +from . import post_deploy from .notifications import ( notify_stage_change, notify_qg_failure, @@ -352,6 +353,17 @@ def advance_stage( except Exception as e: # noqa: BLE001 - defensive logger.warning(f"Task {task_id}: merge-lease release on done failed: {e}") + # ORCH-021: arm post-deploy monitoring PAST `done`. Responsibility extends + # beyond the restart-time health-check to catch the "green deploy, red prod" + # class (ET-8). 
Idempotent (sentinel `armed`) + conditional (applies()), so a + # double webhook / reconciler / finalizer re-driving `done` never doubles it + # and non-applicable repos are untouched. never-raise (arm_monitor + guard). + if next_stage == "done" and post_deploy.post_deploy_applies(repo): + try: + post_deploy.arm_monitor(repo, work_item_id, branch, task_id) + except Exception as e: # noqa: BLE001 - monitoring must never crash done + logger.warning(f"Task {task_id}: post-deploy arm failed: {e}") + # --- Launch the next agent (ORCH-4 fix: current_stage, not next) ----- next_agent = get_agent_for_stage(current_stage) if next_agent: @@ -1176,3 +1188,139 @@ def run_deploy_finalizer(job: dict): branch=branch, finished_agent="deployer", ) + + +def run_post_deploy_monitor(job: dict): + """ORCH-021 — one post-deploy monitor tick (reserved-agent, no LLM). + + A deterministic tick modelled on ``run_deploy_finalizer``: it does ONE probe + of the prod instance, appends to the persisted ``series`` (restart-safe + streak/5xx counters), classifies, and then either RE-QUEUES itself with a + delay (window not over and still HEALTHY) or FINISHES the window (DEGRADED -> + reaction; window expired -> HEALTHY). Observation happens entirely AFTER the + terminal ``done`` — it never touches ``STAGE_TRANSITIONS`` / ``QG_CHECKS`` and + never restarts the prod orchestrator container itself (AC-8 / AC-12). + + never-raise into the caller (the launcher marks the job done/failed); each + branch is individually defensive. 
+ """ + task_id = job.get("task_id") + repo = job.get("repo") + try: + conn = get_db() + row = conn.execute( + "SELECT work_item_id, branch FROM tasks WHERE id=?", (task_id,) + ).fetchone() + conn.close() + except Exception as e: # noqa: BLE001 - never-raise + logger.error(f"post-deploy-monitor: db error for task_id={task_id}: {e}") + return + if not row: + logger.error(f"post-deploy-monitor: no task row for task_id={task_id}") + return + work_item_id, branch = row[0], row[1] + + # AC-15: a finished window is a no-op (defends against a duplicate job). + if post_deploy.has_marker(repo, work_item_id, post_deploy.DONE): + logger.info(f"post-deploy-monitor: {work_item_id} already done (no-op)") + return + + # One probe -> append -> classify (restart-safe via the persisted series). + probe = post_deploy.probe_signals(settings.post_deploy_base_url) + series = post_deploy.append_probe(repo, work_item_id, probe) + verdict = post_deploy.classify( + series, + settings.post_deploy_fail_threshold, + settings.post_deploy_5xx_threshold, + ) + ticks = len(series) + budget = post_deploy.max_ticks() + logger.info( + f"post-deploy-monitor: {work_item_id} tick {ticks}/{budget} " + f"probe=[{probe.detail}] verdict={verdict}" + ) + + # HEALTHY and window not exhausted -> defer the next tick (worker stays free). + if verdict == post_deploy.HEALTHY and ticks < budget: + task_desc = ( + f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n" + f"Stage: post-deploy\nNote: post-deploy monitor tick {ticks + 1} " + f"(healthy so far; re-poll after {settings.post_deploy_interval_s}s)." + ) + enqueue_job( + "post-deploy-monitor", repo, task_desc, task_id=task_id, + available_at_delay_s=settings.post_deploy_interval_s, + ) + return + + checks_total = ticks + checks_failed = sum(1 for r in series if not r.get("health_ok")) + + # HEALTHY and window exhausted -> clean finish (BR-6 / AC-17). 
+ if verdict == post_deploy.HEALTHY: + post_deploy.write_post_deploy_log( + repo, work_item_id, branch, post_deploy.HEALTHY, post_deploy.NONE, + settings.post_deploy_window_s, checks_total, checks_failed, + ) + post_deploy.mark_done(repo, work_item_id) + _notify_post_deploy( + work_item_id, + f"✅ {work_item_id}: пост-деплой окно завершено чисто " + f"(HEALTHY, {checks_total} опросов).", + ) + return + + # DEGRADED -> decide + execute the reaction (§5), write artefact, finish. + action = post_deploy.decide_action(repo, verdict) + action_taken = post_deploy.ALERT_ONLY + if action == post_deploy.ROLLBACK: + # Non-self repo + auto policy: run the --rollback hook synchronously (the + # target is NOT the orchestrator, so its restart is safe for the pipeline). + exit_code, detail = post_deploy.run_rollback(repo) + action_taken = post_deploy.map_rollback_exit_code(exit_code) + if action_taken == post_deploy.ROLLBACK_OK: + _notify_post_deploy( + work_item_id, + f"⚠️ {work_item_id}: пост-деплой DEGRADED -> авто-rollback выполнен " + f"(exit {exit_code}).", + ) + else: + # AC-9: a failed rollback escalates loudly for manual intervention. + _notify_post_deploy( + work_item_id, + f"🚨 {work_item_id}: пост-деплой DEGRADED -> авто-rollback ПРОВАЛИЛСЯ " + f"(exit {exit_code}: {detail}). Нужно ручное вмешательство.", + ) + else: + # ALERT_ONLY: self-hosting ALWAYS lands here — the tick NEVER auto-rolls-back + # or restarts the prod orchestrator container (BR-5 / AC-8). Loud alert + + # manual-approve request (mirrors deploy Phase A CTA). + action_taken = post_deploy.ALERT_ONLY + _notify_post_deploy( + work_item_id, + f"🚨 {work_item_id}: пост-деплой DEGRADED ({checks_failed}/{checks_total} " + f"провалов). 
Требуется ручной approve отката — авто-rollback для " + f"self-hosting запрещён (BR-5).", + ) + + post_deploy.write_post_deploy_log( + repo, work_item_id, branch, post_deploy.DEGRADED, action_taken, + settings.post_deploy_window_s, checks_total, checks_failed, + ) + post_deploy.mark_done(repo, work_item_id) + + +def _notify_post_deploy(work_item_id: str, message: str) -> None: + """Best-effort Telegram + Plane notification for a post-deploy event (AC-17). + + Never raises — a notification failure must not wedge the monitor tick. + """ + try: + send_telegram(message) + except Exception as e: # noqa: BLE001 - never break the tick + logger.warning(f"post-deploy notify telegram failed for {work_item_id}: {e}") + if work_item_id: + try: + plane_add_comment(work_item_id, message, author="deployer") + except Exception as e: # noqa: BLE001 - never break the tick + logger.warning(f"post-deploy notify plane failed for {work_item_id}: {e}") diff --git a/tests/test_deploy_terminal_sync.py b/tests/test_deploy_terminal_sync.py index 5aae57e..d7b9b5e 100644 --- a/tests/test_deploy_terminal_sync.py +++ b/tests/test_deploy_terminal_sync.py @@ -90,6 +90,10 @@ def test_tc17_success_deploy_syncs_terminal_done(monkeypatch): # Spy the merge-lease release to confirm the terminal-sync still frees it. release = MagicMock() monkeypatch.setattr(stage_engine.merge_gate, "release_merge_lease", release) + # ORCH-021 arms an orthogonal post-deploy-monitor reserved job at deploy->done + # for the self-hosting repo; disable it here so this test stays focused on the + # ORCH-036 terminal-sync contract (no PIPELINE agent launched leaving deploy). 
+ monkeypatch.setattr(stage_engine.post_deploy.settings, "post_deploy_monitor_enabled", False) task_id = _make_task("deploy") stage_engine.run_deploy_finalizer( diff --git a/tests/test_post_deploy.py b/tests/test_post_deploy.py new file mode 100644 index 0000000..dabea89 --- /dev/null +++ b/tests/test_post_deploy.py @@ -0,0 +1,210 @@ +"""ORCH-021 unit tests — post-deploy monitor pure logic (TC-01..TC-15). + +The deterministic, network-free core (classification + reaction decision + +exit-code mapping + artefact frontmatter + never-raise) of ``src/post_deploy.py``. +Network probes and the rollback hook are exercised via mocks; the classifier is +the main subject (mirrors compute_staging_verdict in ORCH-061). +""" + +import os +import tempfile + +import pytest +import yaml + +# Isolate the settings singleton onto a tmp repos_dir BEFORE importing the module. +os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") +os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token") + +from src import post_deploy # noqa: E402 + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- +def _probe(health_ok=True, total=2, fivexx=0): + return {"health_ok": health_ok, "total": total, "fivexx": fivexx} + + +@pytest.fixture(autouse=True) +def _tmp_state(monkeypatch, tmp_path): + monkeypatch.setattr(post_deploy.settings, "repos_dir", str(tmp_path)) + monkeypatch.setattr(post_deploy.settings, "host_repos_dir", str(tmp_path)) + yield + + +# --------------------------------------------------------------------------- +# TC-01..TC-05 — classification (the core) +# --------------------------------------------------------------------------- +def test_tc01_healthy_no_failures(): + series = [_probe() for _ in range(5)] + assert post_deploy.classify(series, fail_threshold=3, fivexx_threshold=0.5) == "HEALTHY" + + +def test_tc02_degraded_consecutive_health_failures(): + # Exactly 
fail_threshold consecutive failures -> DEGRADED (>= contract). + series = [_probe(health_ok=False) for _ in range(3)] + assert post_deploy.classify(series, fail_threshold=3, fivexx_threshold=0.5) == "DEGRADED" + + +def test_tc03_degraded_by_5xx_ratio_even_when_health_200(): + # /health stays 200 (health_ok True) but the 5xx ratio is above threshold. + series = [_probe(health_ok=True, total=2, fivexx=2) for _ in range(3)] + assert post_deploy.classify(series, fail_threshold=10, fivexx_threshold=0.5) == "DEGRADED" + + +def test_tc04_no_false_trip_single_glitch_then_recovery(): + # One isolated failure (1 < threshold) surrounded by healthy probes -> HEALTHY. + series = [_probe(), _probe(health_ok=False), _probe(), _probe()] + assert post_deploy.classify(series, fail_threshold=3, fivexx_threshold=0.5) == "HEALTHY" + + +def test_tc05_thresholds_change_verdict_on_same_data(): + # Same data, different threshold flips the verdict (AC-11): two consecutive fails. + series = [_probe(health_ok=False), _probe(health_ok=False)] + assert post_deploy.classify(series, fail_threshold=3, fivexx_threshold=0.5) == "HEALTHY" + assert post_deploy.classify(series, fail_threshold=2, fivexx_threshold=0.5) == "DEGRADED" + + +def test_classify_uses_settings_thresholds(monkeypatch): + # The tick reads thresholds from Settings (env ORCH_*) — verify the wiring point. 
+ monkeypatch.setattr(post_deploy.settings, "post_deploy_fail_threshold", 2) + series = [_probe(health_ok=False), _probe(health_ok=False)] + assert post_deploy.classify( + series, + post_deploy.settings.post_deploy_fail_threshold, + post_deploy.settings.post_deploy_5xx_threshold, + ) == "DEGRADED" + + +# --------------------------------------------------------------------------- +# TC-06..TC-08 — reaction decision (self-hosting safety) +# --------------------------------------------------------------------------- +def test_tc06_nonself_auto_rollback_degraded_rolls_back(monkeypatch): + monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True) + assert post_deploy.decide_action("enduro-trails", "DEGRADED") == "ROLLBACK" + + +def test_tc07_self_hosting_degraded_never_rolls_back(monkeypatch): + # orchestrator (self-hosting) is ALWAYS ALERT_ONLY, even with auto_rollback on. + monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True) + assert post_deploy.decide_action("orchestrator", "DEGRADED") == "ALERT_ONLY" + + +def test_tc08_healthy_means_none_for_any_repo(): + assert post_deploy.decide_action("orchestrator", "HEALTHY") == "NONE" + assert post_deploy.decide_action("enduro-trails", "HEALTHY") == "NONE" + + +def test_nonself_default_policy_alert_only(monkeypatch): + monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", False) + assert post_deploy.decide_action("enduro-trails", "DEGRADED") == "ALERT_ONLY" + + +# --------------------------------------------------------------------------- +# TC-09..TC-10 — conditionality / kill-switch +# --------------------------------------------------------------------------- +def test_tc09_applies_empty_repos_only_self_hosting(monkeypatch): + monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True) + monkeypatch.setattr(post_deploy.settings, "post_deploy_repos", "") + assert post_deploy.post_deploy_applies("orchestrator") is True + assert 
post_deploy.post_deploy_applies("enduro-trails") is False + + +def test_tc09_applies_explicit_repos_csv(monkeypatch): + monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True) + monkeypatch.setattr(post_deploy.settings, "post_deploy_repos", "enduro-trails") + assert post_deploy.post_deploy_applies("enduro-trails") is True + assert post_deploy.post_deploy_applies("orchestrator") is False + + +def test_tc10_kill_switch_disables_for_everyone(monkeypatch): + monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", False) + assert post_deploy.post_deploy_applies("orchestrator") is False + assert post_deploy.post_deploy_applies("enduro-trails") is False + + +# --------------------------------------------------------------------------- +# TC-11..TC-12 — rollback exit-code mapping +# --------------------------------------------------------------------------- +def test_tc11_rollback_exit0_is_ok(): + assert post_deploy.map_rollback_exit_code(0) == "ROLLBACK_OK" + + +def test_tc12_rollback_exit_nonzero_is_failed(): + assert post_deploy.map_rollback_exit_code(1) == "ROLLBACK_FAILED" + assert post_deploy.map_rollback_exit_code(2) == "ROLLBACK_FAILED" + assert post_deploy.map_rollback_exit_code(None) == "ROLLBACK_FAILED" + assert post_deploy.map_rollback_exit_code("garbage") == "ROLLBACK_FAILED" + + +# --------------------------------------------------------------------------- +# TC-13 — artefact frontmatter +# --------------------------------------------------------------------------- +def test_tc13_log_frontmatter_parses(): + body = post_deploy.build_post_deploy_log( + "ORCH-021", "DEGRADED", "ALERT_ONLY", 900, 12, 4 + ) + assert body.startswith("---\n") + fm = body.split("---", 2)[1] + data = yaml.safe_load(fm) + assert data["post_deploy_status"] == "DEGRADED" + assert data["action_taken"] == "ALERT_ONLY" + assert data["work_item"] == "ORCH-021" + assert data["window_s"] == 900 + assert data["checks_total"] == 12 + assert 
data["checks_failed"] == 4 + + +# --------------------------------------------------------------------------- +# TC-14..TC-15 — never-raise +# --------------------------------------------------------------------------- +def test_tc14_probe_network_error_is_conservative_not_raise(monkeypatch): + # urlopen raises on every call -> health bad + monitored endpoints counted as + # 5xx, but NO exception propagates (the helper swallows and reports code 0). + def boom(*a, **k): + raise OSError("network down") + + monkeypatch.setattr(post_deploy.urllib.request, "urlopen", boom) + res = post_deploy.probe_signals("http://localhost:8500") + assert res.health_ok is False + assert res.total == 2 + assert res.fivexx == 2 # unreachable endpoints counted as failures + + +def test_tc14_classify_junk_input_swallowed(): + # If classify gets junk it must not raise (fail-safe to HEALTHY). + assert post_deploy.classify("not-a-list", 3, 0.5) == "HEALTHY" + assert post_deploy.classify([{"bad": "row"}], 3, 0.5) == "HEALTHY" + assert post_deploy.classify(None, 3, 0.5) == "HEALTHY" + + +def test_tc15_write_log_no_worktree_returns_false(monkeypatch): + # get_worktree_path raises -> write returns False, no exception (best-effort). 
+ def boom(repo, branch): + raise FileNotFoundError("no worktree") + + monkeypatch.setattr("src.git_worktree.get_worktree_path", boom) + ok = post_deploy.write_post_deploy_log( + "nope-repo", "ORCH-021", "feature/x", "HEALTHY", "NONE", 900, 3, 0 + ) + assert ok is False + + +# --------------------------------------------------------------------------- +# Sentinel state restart-safe counters +# --------------------------------------------------------------------------- +def test_series_append_and_read_roundtrip(): + post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed") + post_deploy.append_probe("orchestrator", "ORCH-021", post_deploy.ProbeResult(False, 2, 1, "x")) + post_deploy.append_probe("orchestrator", "ORCH-021", post_deploy.ProbeResult(True, 2, 0, "y")) + series = post_deploy.read_series("orchestrator", "ORCH-021") + assert len(series) == 2 + assert series[0]["health_ok"] is False + assert series[1]["health_ok"] is True + + +def test_mark_done_idempotency_marker(): + assert post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.DONE) is False + post_deploy.mark_done("orchestrator", "ORCH-021") + assert post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.DONE) is True diff --git a/tests/test_post_deploy_integration.py b/tests/test_post_deploy_integration.py new file mode 100644 index 0000000..7e1e8f6 --- /dev/null +++ b/tests/test_post_deploy_integration.py @@ -0,0 +1,259 @@ +"""ORCH-021 integration tests — arming + tick orchestration (TC-16..TC-20). + +Exercises the wiring in ``stage_engine`` (arm on deploy->done, +``run_post_deploy_monitor`` tick + reaction) and the ``/queue`` observability +block, with the network probe and the rollback hook mocked. Mirrors the +test_deploy_terminal_sync.py harness. 
+""" + +import os +import tempfile + +import pytest + +_test_db = os.path.join(tempfile.gettempdir(), "test_orch_post_deploy.db") +os.environ["ORCH_DB_PATH"] = _test_db +os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir() +os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") +os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token") + +from unittest.mock import MagicMock # noqa: E402 + +import src.db as _db # noqa: E402 +from src.db import init_db, get_db # noqa: E402 +from src import stage_engine # noqa: E402 +from src import post_deploy # noqa: E402 + + +@pytest.fixture(autouse=True) +def fresh_db(monkeypatch, tmp_path): + monkeypatch.setattr(_db.settings, "db_path", _test_db) + if os.path.exists(_test_db): + os.unlink(_test_db) + init_db() + # State sentinels live under the tmp repos_dir (container view). + monkeypatch.setattr(post_deploy.settings, "repos_dir", str(tmp_path)) + monkeypatch.setattr(post_deploy.settings, "host_repos_dir", str(tmp_path)) + monkeypatch.setattr(stage_engine.settings, "repos_dir", str(tmp_path)) + # The artefact write is best-effort; stub it so no worktree is needed. 
+ monkeypatch.setattr(post_deploy, "write_post_deploy_log", MagicMock(return_value=True)) + yield + + +@pytest.fixture(autouse=True) +def silence_side_effects(monkeypatch): + for name in ( + "notify_stage_change", "notify_qg_failure", "notify_approve_requested", + "send_telegram", "plane_notify_stage", "plane_notify_qg", "plane_add_comment", + "set_issue_in_review", "set_issue_needs_input", "set_issue_in_progress", + "set_issue_blocked", "set_issue_done", + ): + monkeypatch.setattr(stage_engine, name, MagicMock()) + + +def _make_task(stage, repo="orchestrator", branch="feature/ORCH-021-x", wi="ORCH-021"): + conn = get_db() + cur = conn.execute( + "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) " + "VALUES (?, ?, ?, ?, ?)", + (f"plane-{wi}", wi, repo, branch, stage), + ) + task_id = cur.lastrowid + conn.commit() + conn.close() + return task_id + + +def _jobs(agent=None): + conn = get_db() + if agent: + rows = conn.execute( + "SELECT agent FROM jobs WHERE agent=? ORDER BY id", (agent,) + ).fetchall() + else: + rows = conn.execute("SELECT agent FROM jobs ORDER BY id").fetchall() + conn.close() + return [r[0] for r in rows] + + +def _pass(*a, **k): + return (True, "ok") + + +def _drive_deploy_to_done(monkeypatch, task_id, repo="orchestrator", + branch="feature/ORCH-021-x", wi="ORCH-021"): + """Advance a deploy-stage task to done through the real terminal block.""" + monkeypatch.setattr( + stage_engine, "QG_CHECKS", + {**stage_engine.QG_CHECKS, "check_deploy_status": _pass}, + ) + monkeypatch.setattr(stage_engine.merge_gate, "release_merge_lease", MagicMock()) + return stage_engine.advance_stage( + task_id=task_id, current_stage="deploy", repo=repo, + work_item_id=wi, branch=branch, finished_agent="deployer", + ) + + +# --------------------------------------------------------------------------- +# TC-16 — arm on deploy->done (applicable repo only) +# --------------------------------------------------------------------------- +def 
test_tc16_arm_for_self_hosting(monkeypatch): + monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True) + monkeypatch.setattr(post_deploy.settings, "post_deploy_repos", "") + task_id = _make_task("deploy") + _drive_deploy_to_done(monkeypatch, task_id) + + assert post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.ARMED) + assert "post-deploy-monitor" in _jobs("post-deploy-monitor") + + +def test_tc16_no_arm_for_nonself(monkeypatch): + monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True) + monkeypatch.setattr(post_deploy.settings, "post_deploy_repos", "") + task_id = _make_task("deploy", repo="enduro-trails", branch="feature/ET-9", wi="ET-9") + _drive_deploy_to_done(monkeypatch, task_id, repo="enduro-trails", + branch="feature/ET-9", wi="ET-9") + + assert not post_deploy.has_marker("enduro-trails", "ET-9", post_deploy.ARMED) + assert _jobs("post-deploy-monitor") == [] + + +def test_tc16_no_arm_when_kill_switch_off(monkeypatch): + monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", False) + task_id = _make_task("deploy") + _drive_deploy_to_done(monkeypatch, task_id) + assert not post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.ARMED) + assert _jobs("post-deploy-monitor") == [] + + +# --------------------------------------------------------------------------- +# TC-17 — idempotent arm (double webhook) +# --------------------------------------------------------------------------- +def test_tc17_double_arm_is_noop(monkeypatch): + monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True) + armed1 = post_deploy.arm_monitor("orchestrator", "ORCH-021", "feature/ORCH-021-x", 1) + armed2 = post_deploy.arm_monitor("orchestrator", "ORCH-021", "feature/ORCH-021-x", 1) + assert armed1 is True + assert armed2 is False + # Exactly ONE monitor job enqueued despite two arm calls. 
+ assert _jobs("post-deploy-monitor") == ["post-deploy-monitor"] + + +# --------------------------------------------------------------------------- +# TC-18 — DEGRADED -> non-self auto-rollback (hook mocked) +# --------------------------------------------------------------------------- +def test_tc18_degraded_nonself_rolls_back(monkeypatch): + monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True) + monkeypatch.setattr(post_deploy.settings, "post_deploy_repos", "enduro-trails") + monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True) + monkeypatch.setattr(post_deploy.settings, "post_deploy_fail_threshold", 1) + monkeypatch.setattr(post_deploy.settings, "post_deploy_window_s", 30) + monkeypatch.setattr(post_deploy.settings, "post_deploy_interval_s", 30) # budget=1 tick + # Probe reports unhealthy. + monkeypatch.setattr( + post_deploy, "probe_signals", + lambda url: post_deploy.ProbeResult(False, 2, 2, "down"), + ) + rollback = MagicMock(return_value=(0, "ok")) + monkeypatch.setattr(post_deploy, "run_rollback", rollback) + notify = MagicMock() + monkeypatch.setattr(stage_engine, "_notify_post_deploy", notify) + logspy = MagicMock(return_value=True) + monkeypatch.setattr(post_deploy, "write_post_deploy_log", logspy) + + task_id = _make_task("done", repo="enduro-trails", branch="feature/ET-9", wi="ET-9") + post_deploy.write_marker("enduro-trails", "ET-9", post_deploy.ARMED, "armed") + stage_engine.run_post_deploy_monitor( + {"task_id": task_id, "repo": "enduro-trails", "id": 1, "agent": "post-deploy-monitor"} + ) + + rollback.assert_called_once_with("enduro-trails") + assert post_deploy.has_marker("enduro-trails", "ET-9", post_deploy.DONE) + # Artefact written with ROLLBACK_OK; a notification was sent. 
+ args = logspy.call_args[0] + assert "DEGRADED" in args + assert "ROLLBACK_OK" in args + assert notify.called + + +# --------------------------------------------------------------------------- +# TC-19 — self-hosting DEGRADED never rolls back, alerts instead +# --------------------------------------------------------------------------- +def test_tc19_degraded_self_hosting_alert_only(monkeypatch): + monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True) + monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True) + monkeypatch.setattr(post_deploy.settings, "post_deploy_fail_threshold", 1) + monkeypatch.setattr(post_deploy.settings, "post_deploy_window_s", 30) + monkeypatch.setattr(post_deploy.settings, "post_deploy_interval_s", 30) + monkeypatch.setattr( + post_deploy, "probe_signals", + lambda url: post_deploy.ProbeResult(False, 2, 2, "down"), + ) + # Rollback hook MUST NOT be called for self-hosting (AC-8 structural invariant). + rollback = MagicMock(return_value=(0, "ok")) + monkeypatch.setattr(post_deploy, "run_rollback", rollback) + notify = MagicMock() + monkeypatch.setattr(stage_engine, "_notify_post_deploy", notify) + logspy = MagicMock(return_value=True) + monkeypatch.setattr(post_deploy, "write_post_deploy_log", logspy) + + task_id = _make_task("done") + post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed") + stage_engine.run_post_deploy_monitor( + {"task_id": task_id, "repo": "orchestrator", "id": 1, "agent": "post-deploy-monitor"} + ) + + rollback.assert_not_called() + assert post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.DONE) + args = logspy.call_args[0] + assert "DEGRADED" in args + assert "ALERT_ONLY" in args + assert notify.called + + +def test_healthy_tick_requeues_without_finishing(monkeypatch): + # HEALTHY and window not exhausted -> re-queue, do NOT mark done. 
+ monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True) + monkeypatch.setattr(post_deploy.settings, "post_deploy_window_s", 90) + monkeypatch.setattr(post_deploy.settings, "post_deploy_interval_s", 30) # budget=3 + monkeypatch.setattr( + post_deploy, "probe_signals", + lambda url: post_deploy.ProbeResult(True, 2, 0, "ok"), + ) + task_id = _make_task("done") + post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed") + stage_engine.run_post_deploy_monitor( + {"task_id": task_id, "repo": "orchestrator", "id": 1, "agent": "post-deploy-monitor"} + ) + assert not post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.DONE) + # A follow-up tick job was enqueued. + assert _jobs("post-deploy-monitor") == ["post-deploy-monitor"] + + +def test_finished_window_tick_is_noop(monkeypatch): + # AC-15: a tick after the window is done is a no-op (no new job, no re-probe). + probe = MagicMock() + monkeypatch.setattr(post_deploy, "probe_signals", probe) + task_id = _make_task("done") + post_deploy.mark_done("orchestrator", "ORCH-021") + stage_engine.run_post_deploy_monitor( + {"task_id": task_id, "repo": "orchestrator", "id": 9, "agent": "post-deploy-monitor"} + ) + probe.assert_not_called() + + +# --------------------------------------------------------------------------- +# TC-20 — /queue observability block +# --------------------------------------------------------------------------- +def test_tc20_queue_block_present(monkeypatch): + monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True) + post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed") + snap = post_deploy.status() + assert snap["enabled"] is True + assert snap["window_s"] == post_deploy.settings.post_deploy_window_s + assert "ORCH-021" in snap["active"] + assert snap["active_count"] >= 1 + # A finished window drops out of "active". 
+ post_deploy.mark_done("orchestrator", "ORCH-021") + snap2 = post_deploy.status() + assert "ORCH-021" not in snap2["active"]