From 38081c16304f64f83c35479ff4e81959c5b4213f Mon Sep 17 00:00:00 2001 From: claude-bot Date: Mon, 15 Jun 2026 21:16:28 +0300 Subject: [PATCH] fix(plane): sandbox-only fail-closed guard for Plane writes from test process (ORCH-117) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Close the root class of incident ORCH-114: a pytest/worktree process performed a REAL write (PATCH issues state= + comment) against the PRODUCTION Plane project, because test/staging processes inherit the live Plane token (PLANE_HEADERS/PROJECT_ID are captured at import — a post-hoc env/token swap is a no-op) and nothing forced them to write only to the sandbox. Symmetric to the existing _no_telegram autouse floor. - New pure never-raise leaf src/plane_write_guard.py (decide/audit_block/ audit_allow), wired into the 3 plane_sync write primitives (update_issue_state / add_comment / _set_issue_state_direct) via _guard_allows_write, AT CALL TIME, before any network step. Active ONLY in a test process (pytest in sys.modules / PYTEST_CURRENT_TEST); live + staging runtimes (uvicorn) are a strict no-op. - In a test process: default-deny. A write is allowed iff opt-in (plane_test_write_enabled) AND target project in the sandbox allowlist (plane_test_sandbox_projects, default = the one SANDBOX id). Prod is blocked even with opt-in (allowlist sandbox-only); unresolved project -> block (fail-closed). - Independent second layer: tests/conftest.py::_plane_sandbox_only autouse floor. Intentionally NO prod-block kill-switch (anti back-door, NFR-6). - Audit: block -> loud ERROR; sandbox-allow -> INFO. - Bypass fixtures for the 3 (+1) pre-existing tests that assert on the mocked write primitive's httpx call (header/URL/state logic). The guard is not a Quality Gate: STAGE_TRANSITIONS / QG_CHECKS / check_* / machine-verdict / DB schema untouched. - Tests: tests/test_orch117_plane_write_isolation.py (TC-01 mandatory ORCH-114 regression + TC-02..TC-14). 
Docs: CLAUDE.md, architecture/README.md, operations/INFRA.md, .env.example, CHANGELOG.md. Refs: ORCH-117 Co-Authored-By: Claude Opus 4.8 --- .env.example | 13 + CHANGELOG.md | 1 + CLAUDE.md | 45 +++ docs/architecture/README.md | 2 +- docs/operations/INFRA.md | 14 + src/config.py | 19 ++ src/plane_sync.py | 28 ++ src/plane_write_guard.py | 193 +++++++++++++ tests/conftest.py | 28 ++ tests/test_orch117_plane_write_isolation.py | 287 ++++++++++++++++++++ tests/test_plane_author.py | 16 ++ tests/test_plane_status_model.py | 13 + tests/test_plane_sync_labels.py | 8 + tests/test_stage_visibility.py | 13 + 14 files changed, 679 insertions(+), 1 deletion(-) create mode 100644 src/plane_write_guard.py create mode 100644 tests/test_orch117_plane_write_isolation.py diff --git a/.env.example b/.env.example index 701771a..4bd4d96 100644 --- a/.env.example +++ b/.env.example @@ -24,6 +24,19 @@ ORCH_PLANE_BOT_REVIEWER= ORCH_PLANE_BOT_TESTER= ORCH_PLANE_BOT_DEPLOYER= ORCH_PLANE_BOT_STREAM= +# ORCH-117: sandbox-only fail-closed guard for Plane WRITES from a test/worktree +# process (regression of ORCH-114, where pytest mutated a live prod board issue). +# In the live runtime (uvicorn, no pytest) the guard is a no-op; in a test process +# it BLOCKS every Plane write unless BOTH the opt-in is true AND the target project +# is in the sandbox allowlist. Defaults are SAFE (default-deny): leave both as-is. +# ORCH_PLANE_TEST_WRITE_ENABLED -> opt-in for REAL Plane writes from a test process. +# false (default) = no test may write to Plane. NOT a kill-switch for the prod +# block: even true, only the sandbox allowlist below is writable (a prod write +# from pytest stays impossible). +# ORCH_PLANE_TEST_SANDBOX_PROJECTS -> CSV allowlist of sandbox project ids the +# opt-in may write to. Default = the single SANDBOX project; empty = none. 
+ORCH_PLANE_TEST_WRITE_ENABLED=false +ORCH_PLANE_TEST_SANDBOX_PROJECTS=8c5a3025-4f9d-4190-b79f-fa06276bb27e # Telegram live-tracker / alerts (empty -> notifications are logged, not sent). ORCH_TELEGRAM_BOT_TOKEN= ORCH_TELEGRAM_CHAT_ID= diff --git a/CHANGELOG.md b/CHANGELOG.md index 848e112..3460e28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ Формат: [Keep a Changelog](https://keepachangelog.com/). Записи — на смысловой PR/задачу. ## [Unreleased] +- **Sandbox-only fail-closed изоляция записи в Plane из тест-процесса** (ORCH-117, `fix`, bug→escalate full-cycle): закрыт корневой класс инцидента **ORCH-114** — тест/worktree-процесс выполнил РЕАЛЬНУЮ запись (`PATCH …/issues/… state=` + комментарий «Stage: deploy → done») против **боевого** Plane-проекта, т.к. тест/staging-процессы наследуют живой боевой Plane-токен (`PLANE_HEADERS`/`PROJECT_ID` захвачены литералами **на импорте** — подмена env/токена постфактум бесполезна, NFR-4), и **ничто** не принуждало их писать только в sandbox. Симметрия прецеденту `tests/conftest.py::_no_telegram` (autouse-глушилка Telegram «pytest на проде слал реальные сообщения») — для Plane-**записи** такой защиты не было. Аддитивно, never-raise в боевом пути; `STAGE_TRANSITIONS`/реестр `QG_CHECKS`/семантика и имена `check_*`/machine-verdict-ключи/схема БД — **байт-в-байт не тронуты** (это изоляция клиента Plane, **не** Quality Gate и **не** стадия). Новый чистый leaf `src/plane_write_guard.py` (`decide(project_id, op, work_item) -> (ALLOW|BLOCK, reason)`, по образцу `deploy_status_guard`/`serial_gate`) врезан в **3 примитива записи** `plane_sync` (`update_issue_state`/`add_comment`/`_set_issue_state_direct`) **на момент вызова** — сразу после локального `_resolve_project_id` и **до** любого сетевого шага (ни GET, ни PATCH/POST). 
Гард активен **только в тест-процессе** (детект `"pytest" in sys.modules` / `PYTEST_CURRENT_TEST`); боевой и staging рантаймы (`uvicorn src.main:app`, без pytest в процессе) — строгий **no-op** (NFR-2/NFR-3). В тест-процессе запись разрешена **только** при одновременном (а) opt-in `plane_test_write_enabled=True` **и** (б) целевом проекте ∈ sandbox-allowlist `plane_test_sandbox_projects` (дефолт = единственный SANDBOX `8c5a3025-…`); иначе — default-deny; нерезолвимый проект → блок (fail-closed, NFR-1); боевой проект запрещён **даже при opt-in** (allowlist sandbox-only). Второй независимый sandbox-bound слой — autouse-floor `tests/conftest.py::_plane_sandbox_only` (opt-in OFF для всего сьюта, по образцу `_no_telegram`/`_disable_*`); sandbox-e2e ре-энейблит opt-in в своей фикстуре поверх floor. **Умышленно БЕЗ kill-switch прод-блока** (NFR-6/FR-7/anti-drift): выключателя, переоткрывающего прод-запись из pytest, нет — единственный реверс — sandbox-bound opt-in. Аудит: блок → громкий структурный ERROR (`project_id`/`work_item`/`op`/`reason` — делает инцидент класса ORCH-114 очевидным), разрешённая sandbox-запись → INFO. Новые ключи `ORCH_PLANE_TEST_WRITE_ENABLED` (дефолт `false`) / `ORCH_PLANE_TEST_SANDBOX_PROJECTS` (дефолт = SANDBOX id) с безопасными дефолтами; `scripts/staging_check.py` Block C (E2E в SANDBOX) — отдельный процесс с собственными httpx-вызовами, гардом не затронут. Покрытие — `tests/test_orch117_plane_write_isolation.py` (TC-01 — обязательный регресс ORCH-114: красный до врезки, зелёный после; TC-02…TC-14). ADR: `docs/work-items/ORCH-117/06-adr/ADR-001-sandbox-only-plane-write-guard.md`, сквозной `docs/architecture/adr/adr-0046-sandbox-only-plane-write-guard.md`. - **Ownership-lease для side-effectful переходов стадий + умное восстановление при старте** (ORCH-114, `fix`, bug→escalate full-cycle): закрыт **корневой класс** инцидент-цепочки ORCH-110/111/112/113 — у side-effectful переходов стадий не было единого владения. 
`advance_stage` ре-ентерабельна и пишет стадию «голым» `UPDATE … WHERE id=?` (без compare-and-swap), а ≥5 акторов (монитор / Plane-webhook / reconciler F-1 / job-reaper / deploy-finalizer) входят в один переход независимо → конкурентный или после-рестартовый повторный вход **дважды** применял необратимые эффекты (merge_pr / coverage-ratchet / image-rebuild / инициация прод-деплоя) и давал **противоречие rollback↔done** (инцидент ORCH-111, job 1914 / PR #130). Два комплементарных слоя, оба аддитивные, под единым kill-switch, never-raise: **(1) durable transition-lease** (новая таблица `transition_lease`) — владение на ВХОДЕ в side-effectful регион (второй актор, увидев живого владельца, не стартует тяжёлые под-гейты вовсе — предотвращение, не починка постфактум); **(2) expected-stage CAS** (`update_task_stage_cas`) — на ЗАПИСИ стадии (проигравший гонку — аборт без побочных эффектов), что закрывает и **6 путей записи стадии в обход `advance_stage`** (gitea×5 + plane rollback). Liveness владельца = `owner_pid` + `owner_boot_id` (НЕ heartbeat: блокирующий 900s merge re-test не может бить heartbeat — довод самого ORCH-113), что делает рестарт-recovery бесплатным (новый процесс → новый boot-id → все прежние lease мгновенно устаревшие → реклеймятся). Lease без собственного TTL: его потолок возраста = Tier-3 backstop `reaper_max_running_s` (5400) → сквозной бюджет ORCH-065/109/110/113 не тронут. `STAGE_TRANSITIONS` / реестр `QG_CHECKS` / семантика и имена `check_*` / machine-verdict-ключи / **схемы существующих таблиц** — байт-в-байт (одна аддитивная таблица, без epoch-колонки на `tasks`). Скоуп self-hosting (`transition_lease_repos=""` → только `orchestrator`; enduro не затронут); kill-switch `ORCH_TRANSITION_LEASE_ENABLED=false` → CAS вырождается в прежний безусловный `update_task_stage`, lease инертен → поведение байт-в-байт до ORCH-114. 
ADR: `docs/work-items/ORCH-114/06-adr/ADR-001-transition-ownership-lease-and-stage-cas.md`, сквозной `docs/architecture/adr/adr-0045-transition-ownership-lease-and-stage-cas.md`. - **Leaf `src/transition_lease.py` (новый, чистый never-raise):** по образцу `serial_gate`/`coverage_gate`/`finalizer_liveness` (импортирует только `db`+`config`, лениво `merge_gate.pid_alive`/`qg.checks`/`notifications`; НЕ импортирует `stage_engine`/`launcher`) — `applies(repo)` / `acquire(task_id, owner, run_id, stage)` (атомарный rowcount-guard `INSERT … ON CONFLICT DO NOTHING` после очистки stale-строки) / `is_held_by_live_owner(task_id)` (fail-closed → defer на сомнении) / `release(task_id, force=False)` (holder-aware по boot) / `reclaim_if_stale` / `recover_on_startup` / `commit_stage_cas(task_id, expected, new, repo)` (flag-off → unconditional `update_task_stage`; flag-on → CAS) / `snapshot()`. - **Интеграция:** `advance_stage` захватывает lease на входе в side-effectful ребро (`deploy-staging`/`deploy`), пишет стадию через CAS, освобождает lease в `try/finally` (на любом исходе, включая исключение/откат); **rollback-записи side-effectful под-гейтов** (`_handle_merge_gate_rollback`/`_handle_security_gate`/`_handle_coverage_gate`/`_handle_image_freshness`) пишут `development` через тот же CAS (общий хелпер `_rollback_stage_cas`, ADR-001 D4: защита rollback↔done — под держимым lease это единственный владелец, проигранный CAS → аборт без side-effects, не слепой перетир `done`); job-reaper `_finalizer_owns` обобщён с процесс-локального ORCH-113 (Tier-2/`deploy-staging`) на **durable cross-path** lease (defer при живом владельце; Tier-3 backstop игнорирует маркер → bounded reclaim; реап force-освобождает lease); reconciler F-1 и Plane-webhook (`_try_advance_stage`) делают **defer** при активном lease; `main.lifespan` зовёт `recover_on_startup()` после `requeue_running_jobs`. 
Наблюдаемость — read-only блок `transition_lease` в `GET /queue` + Telegram-алерт на форсированный/устаревший реклейм + опциональный `POST /transition-lease/release?work_item=`. Покрытие — `tests/test_orch114_transition_ownership.py` (TC-01 обязательный регресс класса ORCH-111: красный до фикса, зелёный после; TC-02…TC-14 + регресс CAS на in-region rollback). Флаги (`config.py`, дефолт = боевое): `transition_lease_enabled` (env `ORCH_TRANSITION_LEASE_ENABLED`), `transition_lease_repos` (env `ORCH_TRANSITION_LEASE_REPOS`). diff --git a/CLAUDE.md b/CLAUDE.md index b24be1c..f23bbc4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -366,6 +366,51 @@ lease **не консультирует** (fail-open, очередь репо н `docs/work-items/ORCH-114/06-adr/ADR-001-transition-ownership-lease-and-stage-cas.md`, сквозной `docs/architecture/adr/adr-0045-transition-ownership-lease-and-stage-cas.md`. +## Sandbox-only fail-closed изоляция записи в Plane (ORCH-117) +Закрыт корневой класс инцидента **ORCH-114**: тест/worktree-процесс выполнил РЕАЛЬНУЮ запись +(`PATCH …/issues/… state=` + комментарий «Stage: deploy → done») против **боевого** +Plane-проекта, т.к. тест/staging-процессы наследуют живой боевой Plane-токен +(`PLANE_HEADERS`/`PROJECT_ID` захвачены литералами **на импорте** `plane_sync` — подмена env/токена +постфактум бесполезна, NFR-4) и **ничто** не принуждало их писать только в sandbox. Прямой +прецедент — `tests/conftest.py::_no_telegram` (autouse-глушилка «pytest на проде слал реальные +Telegram-сообщения»); симметричной защиты для Plane-**записи** не было. **Инвариант:** запись в +боевой Plane-проект из любого pytest/worktree-процесса **физически невозможна** независимо от +токена. Аддитивно, never-raise в боевом пути; `STAGE_TRANSITIONS`/реестр `QG_CHECKS`/семантика и +имена `check_*`/machine-verdict-ключи/схема БД — **байт-в-байт не тронуты** (это изоляция клиента +Plane, **не** Quality Gate и **не** стадия). 
+- **Чокпоинт (D1):** новый чистый leaf `src/plane_write_guard.py` (`decide(project_id, op, + work_item) -> (ALLOW|BLOCK, reason)`, never-raise, по образцу `deploy_status_guard`/`serial_gate`/ + `cancel`) врезан в **3 примитива записи** `plane_sync` (`update_issue_state` / `add_comment` / + `_set_issue_state_direct`) **на момент вызова** — сразу после локального `_resolve_project_id` и + **до** любого сетевого шага (ни GET, ни PATCH/POST). Все `set_issue_*`/`notify_*` сводятся к этим + трём примитивам → один гард ловит любой путь, включая будущие. +- **Детект тест-процесса (D2):** `"pytest" in sys.modules` ∨ `PYTEST_CURRENT_TEST` (на момент + вызова). Боевой и staging рантаймы — `uvicorn src.main:app`, pytest в процесс **не** импортируют → + гард там строгий **no-op** (NFR-2/NFR-3); worktree `python -m pytest` (инцидентный путь) + гарантированно имеет pytest в `sys.modules` → ловится. +- **Решение (D3):** default-deny. Запись из тест-процесса разрешена ⇔ одновременно (а) opt-in + `plane_test_write_enabled=True` **и** (б) целевой проект ∈ sandbox-allowlist + `plane_test_sandbox_projects` (дефолт = единственный SANDBOX `8c5a3025-4f9d-4190-b79f-fa06276bb27e`). + Нерезолвимый/пустой проект → блок (fail-closed, NFR-1). Боевой проект запрещён **даже при opt-in** + (allowlist sandbox-only). Внутренняя ошибка `decide` в тест-контексте → fail-CLOSED (`guard-error`). +- **Второй слой (D5):** независимый autouse-floor `tests/conftest.py::_plane_sandbox_only` форсит + opt-in OFF для **всего** сьюта (по образцу `_no_telegram`/`_disable_*`); sandbox-e2e ре-энейблит + opt-in в своей фикстуре поверх floor. Два sandbox-bound слоя → нет одиночной точки, чьё выключение + переоткрывает прод. +- **Умышленно БЕЗ kill-switch прод-блока (D4, NFR-6/FR-7, anti-drift):** выключателя, + переоткрывающего прод-запись из pytest, **нет** — единственный реверс — sandbox-bound opt-in. Не + добавлять «общий kill-switch гарда» (реинтродуцирует дефект ORCH-114; reviewer ловит как ≥P1). 
+- **Аудит (D7):** блок → громкий структурный ERROR (`project_id`/`work_item`/`op`/`reason`: + `prod-project-in-test`/`opt-in-disabled`/`ambiguous-target`/`guard-error`); разрешённая + sandbox-запись → INFO. **Флаги** (`config.py`, дефолты безопасные): `plane_test_write_enabled` + (env `ORCH_PLANE_TEST_WRITE_ENABLED`, дефолт `False`), `plane_test_sandbox_projects` (env + `ORCH_PLANE_TEST_SANDBOX_PROJECTS`, CSV). `scripts/staging_check.py` Block C (E2E в SANDBOX) — + отдельный процесс с собственными httpx-вызовами, гардом не затронут. Покрытие — + `tests/test_orch117_plane_write_isolation.py` (TC-01 — обязательный регресс ORCH-114: красный до + врезки, зелёный после; TC-02…TC-14). Детали — + `docs/work-items/ORCH-117/06-adr/ADR-001-sandbox-only-plane-write-guard.md`, сквозной + `docs/architecture/adr/adr-0046-sandbox-only-plane-write-guard.md`. + ## Машинный журнал уроков (ORCH-098) Шаг 1 («Фундамент», F2) эпика саморазвития: формализует свободнотекстовые «уроки» из `memory/` в **машинную структурированную таблицу отклонений конвейера** `lessons`, фундамент для будущих diff --git a/docs/architecture/README.md b/docs/architecture/README.md index 71dee53..647c840 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/README.md @@ -19,7 +19,7 @@ - **Notifications / Live-tracker** (`src/notifications.py`, ORCH-042/ORCH-067) — ОДНА live-карточка на задачу (`update_task_tracker`), обновляется на каждом переходе. Режим `ORCH_TRACKER_MODE` (дефолт `bump` с ORCH-067: delete+silent send+repoint внизу чата; `edit` — правка на месте). Карточка несёт строку Plane-статуса `📍 …` (оффлайн-ядро `plane_status_label` + best-effort live-overlay `_live_plane_branch_override`, kill-switch `ORCH_TRACKER_LIVE_STATUS`) и кликабельный номер задачи (`plane_issue_link`/`link_for` → ссылка в Plane, fail-safe на сырой номер). 
**ORCH-080:** оба низкоуровневых примитива (`send_telegram`/`edit_telegram`) шлют payload с `disable_web_page_preview: True` — Telegram больше не разворачивает баннер link-preview Plane под карточкой/уведомлениями; `parse_mode: HTML` сохранён (ссылка остаётся кликабельной), безусловно без kill-switch. Все алерты, упоминающие `work_item_id`, делают номер кликабельным. **ORCH-087:** bump ведёт авторитетный леджер всех созданных карточек (`tracker_messages`, `deleted_at IS NULL` = жива) и на каждом обновлении зачищает ВСЕ незакрытые mid (а не только скаляр `tracker_message_id`) → класс «замёрзшая сирота» устранён; строка стадии несёт фактический эффорт рядом с моделью (`· {model} · {effort}`, колонка `agent_runs.effort`, стамп в `launcher._spawn`); done-строка времени переписана на три подписанных метрики `⏱️ Агенты · твоё{~cap} · общее с ожиданием` (кап `ORCH_TRACKER_BRD_REVIEW_CAP_S`); deploy-цикл дополнен overlay-ключом `confirm_deploy`. **ORCH-091 (индикация-only):** три корректности рендера — (1) `_STAGE_STATUS_LABEL` покрывает ВСЕ ключи `STAGE_TRANSITIONS` (добавлены `deploy-staging`→«Deploying (staging)», `cancelled`→«Cancelled»; полнота гарантируется тестом по `stages.STAGE_TRANSITIONS`, не статичным списком — NFR-3), runtime-фолбэк для неизвестной стадии стал нейтральным (капитализированное имя) вместо «To Analyse»; (2) при откате конвейера `✅`-строки стадий ПОЗЖЕ текущей позиции (позиция — из порядка `STAGE_TRANSITIONS`, с нормализацией `deploy-staging→deploy` только в гейте подавления; `is_active_stage` не тронут) больше не рисуются; (3) строка стадии суммирует ВСЕ `agent_runs` агента (Σ cost/токены/время теми же формулами, что блок тоталов) → строгая сходимость с `SUM(agent_runs)`. Только `src/notifications.py` + тесты; `STAGE_TRANSITIONS`/`QG_CHECKS`/схема БД/транспорт — не тронуты. Контракт всего компонента — never raises; карточка всегда silent. 
**ORCH-095 (HTML-безопасность данных):** текст карточки шлётся с `parse_mode=HTML`; каждый **data**-слот (длительности `_fmt_minutes`/`_capped_review_str`, статус-лейбл, модель/эффорт, токены/стоимость) экранируется `html.escape` ровно один раз на границе рендера, **markup**-слоты (`num_html`/`link_for`/`_done_link`/`esc_title`) — нет (двойное экранирование запрещено). Устранён класс «неэкранированные данные в HTML» (литерал `<1м` от `_fmt_minutes` → Telegram `400 can't parse entities` → застывшая карточка, инцидент ORCH-093); `_fmt_minutes` по-прежнему даёт `<1м` (escape рендерит визуально идентично). Застрявшая карточка в окне авто-восстанавливается следующим рендером; `edit_telegram`/`update_task_tracker`/леджер сирот не тронуты. Детали — [internals.md](internals.md) §7, [ADR-087](../work-items/ORCH-087/06-adr/ADR-001-tracker-orphan-cleanup.md), [ORCH-091 ADR-001](../work-items/ORCH-091/06-adr/ADR-001-tracker-status-rollback-metrics.md) и [ORCH-095 ADR-001](../work-items/ORCH-095/06-adr/ADR-001-html-safe-card-data-render.md). - **Project Registry** (`src/projects.py`, ORCH-6) — Plane project id → repo + prefix; фильтрация вебхуков по проекту. - **Plane Sync** (`src/plane_sync.py`) — синхронизация статусов/комментариев в Plane. Резолв статусов проекта `get_project_states` (ORCH-10) кэширует `{logical_key→uuid}` per-project; **ORCH-068** добавляет в кэш-запись `{uuid→group}` (для терминал-исключения F-2) и **TTL** `ORCH_PLANE_STATES_TTL_S` (дефолт 300с; `0` → прежний lifetime-кэш) — устаревший набор статусов самозалечивается без рестарта процесса через существующий `reload_project_states()` (баг кэша после появления нового Plane-статуса). Форма возврата `get_project_states` неизменна (обратная совместимость). 
-- **Plane write guard** (`src/plane_write_guard.py`, ORCH-117 — design, [adr-0046](adr/adr-0046-sandbox-only-plane-write-guard.md)) — чистый **never-raise** leaf (паттерн `serial_gate`/`deploy_status_guard`), закрывает дефект изоляции ORCH-114: тестовый/worktree-процесс (`python -m pytest` из worktree) с живым боевым токеном выполнил **реальную** запись в **боевой** Plane-проект («ложный Done»). Гард `decide(project_id, op, work_item_id) -> (ok, reason)` врезается в три примитива записи `plane_sync` (`update_issue_state`/`add_comment`/`_set_issue_state_direct`) сразу после `_resolve_project_id` и **до** любого сетевого шага. Активен **только в тест-процессе** (детект `"pytest" in sys.modules`/`PYTEST_CURRENT_TEST`, call-time → иммунитет к захвату `PLANE_HEADERS`/`PROJECT_ID` на импорте); боевой и staging рантаймы (`uvicorn src.main:app`, pytest в процесс не импортирован) — **no-op, byte-for-byte**. В тест-процессе **default-deny**: запись разрешена ⇔ (а) opt-in `plane_test_write_enabled` **и** (б) `project_id ∈ plane_test_sandbox_projects` (дефолт = единственный SANDBOX `8c5a3025-…`); боевой проект запрещён **даже при opt-in** (allowlist sandbox-only). Второй независимый sandbox-bound слой — autouse-фикстура `tests/conftest.py::_plane_sandbox_only` (паттерн `_no_telegram`), форсящая безопасные дефолты во всех тестах; sandbox-e2e ре-энейблит opt-in после неё. **Намеренно без prod-блок kill-switch** (NFR-6: выключатель = чёрный ход; реверс — sandbox-bound opt-in; прецедент `_no_telegram`); **НЕ `*_repos`-scope** (защищает запись в любой боевой проект общего workspace, как observer-leaf `lessons`). Аудит: блок → структурный WARNING (`project_id`/`work_item`/`op`/`reason`), sandbox-allow → INFO. `STAGE_TRANSITIONS`/`QG_CHECKS`/`check_*`/machine-verdict/схема БД — **не тронуты**. Детали — `docs/work-items/ORCH-117/06-adr/ADR-001-sandbox-only-plane-write-guard.md`. 
+- **Plane write guard** (`src/plane_write_guard.py`, ORCH-117 — реализовано, [adr-0046](adr/adr-0046-sandbox-only-plane-write-guard.md)) — чистый **never-raise** leaf (паттерн `serial_gate`/`deploy_status_guard`), закрывает дефект изоляции ORCH-114: тестовый/worktree-процесс (`python -m pytest` из worktree) с живым боевым токеном выполнил **реальную** запись в **боевой** Plane-проект («ложный Done»). Гард `decide(project_id, op, work_item_id) -> (ok, reason)` врезается в три примитива записи `plane_sync` (`update_issue_state`/`add_comment`/`_set_issue_state_direct`) через тонкий хелпер `_guard_allows_write` сразу после `_resolve_project_id` и **до** любого сетевого шага. Активен **только в тест-процессе** (детект `"pytest" in sys.modules`/`PYTEST_CURRENT_TEST`, call-time → иммунитет к захвату `PLANE_HEADERS`/`PROJECT_ID` на импорте); боевой и staging рантаймы (`uvicorn src.main:app`, pytest в процесс не импортирован) — **no-op, byte-for-byte** (live-path возвращает ALLOW ДО любого try-блока → гард не может уронить боевую запись). В тест-процессе **default-deny**: запись разрешена ⇔ (а) opt-in `plane_test_write_enabled` **и** (б) `project_id ∈ plane_test_sandbox_projects` (дефолт = единственный SANDBOX `8c5a3025-…`); боевой проект запрещён **даже при opt-in** (allowlist sandbox-only); нерезолвимый/пустой проект → блок (fail-closed); внутренняя ошибка `decide` в тест-контексте → fail-CLOSED. Второй независимый sandbox-bound слой — autouse-фикстура `tests/conftest.py::_plane_sandbox_only` (паттерн `_no_telegram`), форсящая безопасные дефолты во всех тестах; sandbox-e2e ре-энейблит opt-in после неё. **Намеренно без prod-блок kill-switch** (NFR-6: выключатель = чёрный ход; реверс — sandbox-bound opt-in; прецедент `_no_telegram`); **НЕ `*_repos`-scope** (защищает запись в любой боевой проект общего workspace, как observer-leaf `lessons`). Аудит: блок → громкий структурный ERROR (`project_id`/`work_item`/`op`/`reason`), sandbox-allow → INFO. 
`STAGE_TRANSITIONS`/`QG_CHECKS`/`check_*`/machine-verdict/схема БД — **не тронуты**. Покрытие — `tests/test_orch117_plane_write_isolation.py` (TC-01 обязательный регресс ORCH-114). Детали — `docs/work-items/ORCH-117/06-adr/ADR-001-sandbox-only-plane-write-guard.md`. - **FS ownership detect** (`src/fs_normalize.py`, ORCH-057 — [adr-0031](adr/adr-0031-legacy-ownership-normalization.md)) — чистый **never-raise** leaf (паттерн `serial_gate`/`preflight`), закрывает пробел ORCH-040: при миграции на `user: "1000:1000"` legacy `root:root` файлы в `/repos` ломали создание worktree под uid 1000 (`ensure_worktree` → сырой `fatal: … Permission denied`, агент не стартовал). Три слоя: (1) **D1** — `src/git_worktree.py::ensure_worktree` классифицирует класс «нет прав» (`Permission denied`/`could not create leading directories`/`insufficient permission`/`EACCES`/`EPERM`) и поднимает actionable `RuntimeError` с причиной + лечащей командой (не-прав-ошибки сохраняют прежний контракт — меняется только формулировка, не факт сбоя); (2) **D2** — `scan_ownership(roots, target_uid=os.getuid())` обходит `/repos/_wt`, `/.git/{objects,worktrees}`, `data/runs` с ранним выходом при первом `st_uid != target_uid` + TTL-кэш; (3) **D3** — best-effort вызов на старте `main.lifespan` → WARNING + Telegram при mismatch (claim **НЕ** блокируется — внятный ранний отказ даёт D1 в точке launch, знающей repo; preflight-блок отвергнут как repo-слепой → регресс enduro). Опц. `normalize()` chown'ит только при `CAP_CHOWN` (под uid 1000 — no-op; init-контейнер/root-entrypoint отвергнуты — реинтродукция root-контекста + self-deploy compose). Фактическая нормализация = **операторская процедура** под root на хосте (`INFRA.md` «Миграция uid»). Условность `applies(repo)` first: `fs_normalize_enabled` (kill-switch) + `fs_normalize_repos` (CSV, пусто → self-hosting only). Наблюдаемость — блок `fs_ownership` в `GET /queue`; опц. `POST /fs-normalize/check`. 
`STAGE_TRANSITIONS`/`QG_CHECKS`/`check_*`/machine-verdict/схема БД — не тронуты. Детали — `docs/work-items/ORCH-057/06-adr/ADR-001-legacy-ownership-normalization.md`. - **Metrics endpoint** (`src/metrics.py` + `GET /metrics`, ORCH-099 — [adr-0030](adr/adr-0030-metrics-endpoint.md)) — лёгкий **read-only** leaf-сборщик (`build_metrics() -> dict`, never-raise по разделам, паттерн `serial_gate.snapshot()`) + тонкий эндпоинт (стиль `GET /queue`). Отдаёт JSON-«сырьё» о самом орке (стадии задач / очередь jobs / agent-liveness / стоимость-токены) как **стабильный машинный контракт для sidecar F1b** (`watchdog/`, отдельная задача — наблюдатель отделён от наблюдаемого). Только чтение существующих `tasks`/`jobs`/`agent_runs` + in-memory-снапшотов (`worker.breaker`); два read-only helper'а в `db.py` (`get_running_agents`/`agent_cost_totals`). Логику мониторинга (пороги/алерты/история/Telegram) НЕ несёт — это F1b. Контракт ниже (§ «Сырьё-эндпоинт `/metrics`»). Kill-switch `metrics_endpoint_enabled` (дефолт `True`). `STAGE_TRANSITIONS`/`QG_CHECKS`/схема БД — не тронуты. - **Lessons journal** (`src/lessons.py` + таблица `lessons`, ORCH-098 — реализовано, [adr-0034](adr/adr-0034-lessons-journal.md)) — машинный журнал уроков (структурированная база отклонений конвейера); шаг 1 эпика саморазвития (домен 0 «Фундамент», F2; топливо петли самообучения 8A), фундамент для будущих ретроспективщика (E2)/приоритизатора RICE (E3)/Стрим. Чистый **observer-leaf** (never-raise, паттерн `serial_gate`/`coverage_gate`/`metrics`): `record()`/`get()`/`update()`/`snapshot()`. 
**Аддитивная идемпотентная таблица `lessons`** (`CREATE TABLE IF NOT EXISTS` в `init_db()`, restart-safe) с полями контекста (`work_item_id`/`task_id`/`stage`/`agent`/`repo`), анализа (`root_cause`/`suggestion`), статуса (`status`/`related_task`) и **атрибуции — сразу и нуллабельно** (`attribution`/`target_repo`/`target_domain`, требование Славы 10.06 / NFR-6, заполняется позже ретроспективщиком/человеком) + `source`/`detail`; без `enum`-констрейнтов (слаги forward-compatible). **Автозапись 4 типов** (`source="auto"`, best-effort, дедуп в окне; `transient_retry` — только на исчерпании бюджета ретраев) тонкими врезками: `gate_failure` (`stage_engine._handle_qg_failure_rollbacks`), `merge_hold` (`merge_gate._handle_merge_verify` HOLD), `transient_retry` (merge-retry/launcher transient budget-exhaustion), `deploy_degraded` (post-deploy `DEGRADED → set_repo_freeze`, урок слоя-3 «деплой OK / прод сломан», ET-8). Эндпоинты `GET /lessons` (read-only, фильтры), `POST /lessons` (ручная запись), `POST /lessons/{id}` (update/доклассификация), + read-only ключ `lessons` в `GET /queue`. **Расхождение с гейт-шаблоном:** журнал observer-only → **НЕ скоупится по репо** (kill-switch `lessons_enabled` only, без `lessons_repos`); репо-разрез — на выборке (`repo`-колонка/фильтр), enduro не затронут (общая БД, аддитивная таблица). `STAGE_TRANSITIONS`/`QG_CHECKS`/`check_*`/machine-verdict/схемы существующих таблиц — байт-в-байт не тронуты (журнал не участвует в решении гейта). Kill-switch `lessons_enabled` (env `ORCH_LESSONS_ENABLED`, дефолт `True`). Детали — `docs/work-items/ORCH-098/06-adr/ADR-001-lessons-journal.md`. diff --git a/docs/operations/INFRA.md b/docs/operations/INFRA.md index 1324421..c0efd19 100644 --- a/docs/operations/INFRA.md +++ b/docs/operations/INFRA.md @@ -141,6 +141,8 @@ watchdog'а: **watchdog сигналит, pruner убирает**. 
| `ORCH_PLANE_API_URL` / `_TOKEN` / `_WORKSPACE_SLUG` | доступ к Plane API | | `ORCH_PLANE_WEB_URL` | внешний (браузерный) web-URL Plane для кликабельных ссылок на issue в уведомлениях (ORCH-017); пусто → фолбэк на `ORCH_PLANE_API_URL`, loopback-фолбэк → ссылка опускается | | `ORCH_PLANE_WEBHOOK_SECRET` | HMAC-проверка вебхуков Plane | +| `ORCH_PLANE_TEST_WRITE_ENABLED` | ORCH-117: opt-in реальной записи в Plane из **тест-процесса** (дефолт `false` = default-deny). НЕ kill-switch прод-блока: даже `true` пишет только в sandbox-allowlist (прод-запись из pytest невозможна). В боевом/staging рантайме гард — no-op | +| `ORCH_PLANE_TEST_SANDBOX_PROJECTS` | ORCH-117: CSV-allowlist sandbox-проектов, куда opt-in разрешает запись из тестов (дефолт = единственный SANDBOX `8c5a3025-…`; пусто → ни один проект из тестов не пишется) | | `ORCH_GITEA_URL` / `_TOKEN` / `_WEBHOOK_SECRET` | доступ к Gitea + HMAC | | `ORCH_CLAUDE_BIN` | путь к claude CLI | | `ORCH_REPOS_DIR` / `ORCH_HOST_REPOS_DIR` | каталог репозиториев (в контейнере / на хосте) | @@ -224,6 +226,18 @@ watchdog'а: **watchdog сигналит, pruner убирает**. **Что изолировано (безопасно):** - Staging (8501) — отдельная БД (`./data/staging`), отдельный реестр (`ORCH_PROJECTS_JSON` = только sandbox). Прод-проекты не видит. - Репозитории разделены, изоляция веток через git worktree (ORCH-2). +- **Запись в Plane из тест-процесса — sandbox-only fail-closed (ORCH-117).** Тест/worktree-процесс + наследует живой боевой Plane-токен (`PLANE_HEADERS`/`PROJECT_ID` захвачены на импорте `plane_sync`); + раньше **ничто** не мешало pytest смутировать боевую доску (инцидент ORCH-114 — «ложный Done»). 
+ Теперь leaf `src/plane_write_guard.py` врезан в 3 примитива записи `plane_sync` + (`update_issue_state`/`add_comment`/`_set_issue_state_direct`) и **в тест-процессе** (детект + `pytest`-в-процессе) блокирует запись по умолчанию; разрешена только при opt-in + `ORCH_PLANE_TEST_WRITE_ENABLED=true` **И** целевом проекте ∈ `ORCH_PLANE_TEST_SANDBOX_PROJECTS` + (sandbox-only — боевой проект запрещён даже при opt-in). Боевой и staging рантаймы + (`uvicorn src.main:app`, без pytest в процессе) — гард **no-op**, запись как прежде. Прод-блок + **без kill-switch** (выключателя-чёрного-хода нет); второй слой — autouse-floor + `tests/conftest.py::_plane_sandbox_only` (по образцу `_no_telegram`). Детали — `CLAUDE.md` + «Sandbox-only fail-closed изоляция записи в Plane (ORCH-117)», adr-0046. **Страховки:** - Стадия `deploy-staging` (порт 8501) — обязательный гейт перед прод-деплоем орка. Прод-деплой недостижим, пока staging-гейт не зелёный (см. `STAGING.md`, ORCH-35). Гейт условный: реален только для self-hosting (repo=orchestrator), для остальных проектов — no-op. diff --git a/src/config.py b/src/config.py index a21b7c8..888ef66 100644 --- a/src/config.py +++ b/src/config.py @@ -29,6 +29,25 @@ class Settings(BaseSettings): plane_bot_deployer: str = "" plane_bot_stream: str = "" + # ORCH-117 (ADR-001 D6): sandbox-only fail-closed guard for Plane WRITE + # primitives from a test/worktree process (regression of incident ORCH-114, + # where a pytest run mutated a live prod board issue). The guard (leaf + # src/plane_write_guard.py) is a no-op in the live runtime (no pytest in the + # uvicorn process); in a test process it blocks every Plane write UNLESS both + # the opt-in flag is ON and the target project is in the sandbox allowlist. + # plane_test_write_enabled -> opt-in for REAL Plane writes from a test process + # (env ORCH_PLANE_TEST_WRITE_ENABLED). Default False + # = safe (default-deny). 
NOT a kill-switch for the + # prod-block: even ON, only sandbox projects are + # writable (allowlist below); a prod write from + # pytest stays physically impossible (NFR-6/FR-7). + # plane_test_sandbox_projects -> CSV allowlist of sandbox project ids the opt-in + # may write to (env ORCH_PLANE_TEST_SANDBOX_PROJECTS). + # Default = the single SANDBOX project. Empty -> no + # project is writable from a test process at all. + plane_test_write_enabled: bool = False + plane_test_sandbox_projects: str = "8c5a3025-4f9d-4190-b79f-fa06276bb27e" + # Gitea gitea_url: str = "http://localhost:3000" gitea_public_url: str = "" # external URL for clickable links in comments; falls back to gitea_url diff --git a/src/plane_sync.py b/src/plane_sync.py index e501bd5..c7c90b6 100644 --- a/src/plane_sync.py +++ b/src/plane_sync.py @@ -4,6 +4,7 @@ import logging import time import httpx from .config import settings +from . import plane_write_guard logger = logging.getLogger("orchestrator.plane_sync") @@ -843,9 +844,30 @@ def find_issue_id(work_item_id: str, project_id: str = None) -> str | None: return None +def _guard_allows_write(work_item_id: str, project_id: str, op: str) -> bool: + """ORCH-117: fail-closed gate in front of every Plane WRITE (state/comment). + + Returns True if the write may proceed. In the live orchestrator/staging runtime + this is always True (the guard is a no-op — no pytest in the process). In a + test/worktree process a non-sandbox / non-opt-in write is BLOCKED here (audited + loudly) and this returns False, so the calling primitive returns BEFORE any + network step (no GET, no PATCH/POST). See src/plane_write_guard.py / ORCH-114. 
+ """ + ok, reason = plane_write_guard.decide(project_id, op, work_item_id) + if not ok: + plane_write_guard.audit_block(project_id, op, work_item_id, reason) + return False + if reason == plane_write_guard.R_SANDBOX_OPT_IN: + plane_write_guard.audit_allow(project_id, op, work_item_id, reason) + return True + + def update_issue_state(work_item_id: str, stage: str, project_id: str = None): """Update Plane issue state based on orchestrator stage.""" project_id = _resolve_project_id(work_item_id, project_id) + # ORCH-117: fail-closed guard — block prod Plane writes from a test process. + if not _guard_allows_write(work_item_id, project_id, plane_write_guard.OP_STATE): + return # ORCH-10: resolve state UUID for this specific project (not global dict). state_id = stage_to_state(stage, project_id) if not state_id: @@ -874,6 +896,9 @@ def add_comment(work_item_id: str, text: str, project_id: str = None, author: st ``_headers_for``). GET/PATCH calls elsewhere keep using PLANE_HEADERS. """ project_id = _resolve_project_id(work_item_id, project_id) + # ORCH-117: fail-closed guard — block prod Plane comment-writes from a test process. + if not _guard_allows_write(work_item_id, project_id, plane_write_guard.OP_COMMENT): + return issue_id = find_issue_id(work_item_id, project_id) if not issue_id: logger.warning(f"Issue not found in Plane for {work_item_id}, skipping comment") @@ -1038,6 +1063,9 @@ def set_issue_stage_state(work_item_id: str, stage: str, project_id: str = None) def _set_issue_state_direct(work_item_id: str, state_id: str, project_id: str = None): """Set issue state directly by state_id.""" project_id = _resolve_project_id(work_item_id, project_id) + # ORCH-117: fail-closed guard — block prod Plane writes from a test process. 
+ if not _guard_allows_write(work_item_id, project_id, plane_write_guard.OP_STATE): + return issue_id = find_issue_id(work_item_id, project_id) if not issue_id: logger.warning(f"Issue not found in Plane for {work_item_id}") diff --git a/src/plane_write_guard.py b/src/plane_write_guard.py new file mode 100644 index 0000000..0ff4201 --- /dev/null +++ b/src/plane_write_guard.py @@ -0,0 +1,193 @@ +"""ORCH-117: fail-closed guard for Plane WRITE primitives from a test/worktree process. + +Leaf module — pure, never-raise in the live path, config-gated. Mirrors the leaf +pattern of ``src/deploy_status_guard.py`` / ``src/serial_gate.py`` / ``src/cancel.py``: +it imports only ``config`` (and stdlib ``os``/``sys``), never ``plane_sync`` / +``stage_engine`` — the three write primitives that need a verdict call +:func:`decide`, the guard does not live there. + +The incident (established fact, ORCH-114). A test/worktree process performed a REAL +write to Plane against the **production** project: ``PATCH …/issues/… state=`` +plus a "Stage: deploy → done" comment, i.e. ``notify_stage_change("ORCH-114", +"deploy","done")`` run from pytest mutated a live board issue ("false Done"). The +root: test/staging processes inherit the live Plane token (``PLANE_HEADERS`` / +``PROJECT_ID`` are captured as literals at module import, so a post-hoc env/token +swap is a no-op, NFR-4), and *nothing* forced them to write only to the sandbox. + +The precedent. ``tests/conftest.py::_no_telegram`` is an autouse fixture muting +``send_telegram`` in ALL tests, exactly because "pytest on prod sent REAL Telegram +messages to Slava". The symmetric protection for Plane WRITES did not exist — this +is that protection. + +The fix (ADR-001 D1/D3): a low choke-point on the entry of the three write +primitives, evaluated **at call time** (not at import). 
The guard is active **only +in a test process** (``pytest``-in-process detection) — for the live orchestrator +runtime and the staging runtime (both ``uvicorn src.main:app``, no pytest in the +process) it is a strict no-op (byte-for-byte, NFR-2/NFR-3). In a test process a +write is allowed **iff** simultaneously (a) the dedicated opt-in flag is ON **and** +(b) the target project ∈ the sandbox-allowlist; otherwise BLOCK (default-deny). A +non-resolvable target → BLOCK (fail-closed, NFR-1). An allowed sandbox write is +audited at INFO; a block is audited LOUDLY at ERROR so an ORCH-114-class incident is +obvious, not silent (FR-5 / D7). + +Deliberately NO kill-switch for the prod-block (ADR-001 D4 / FR-7 / NFR-6): a guard +that makes a prod write from pytest *physically impossible* must not ship with a +config that re-opens it (that would be the very back-door NFR-6 forbids). The only +reversible regulator is the sandbox-bound opt-in (``plane_test_write_enabled`` + +``plane_test_sandbox_projects``); "disable the guard" ≠ "allow prod from pytest" — +that transition does not exist by design. The independent conftest floor +(``_plane_sandbox_only``, ADR-001 D5) is the second sandbox-bound layer. + +This is bugfix-isolation, NOT a Quality Gate and NOT a stage: ``STAGE_TRANSITIONS`` / +``QG_CHECKS`` / ``check_*`` / machine-verdict keys / the DB schema are byte-for-byte +untouched. +""" +from __future__ import annotations + +import logging +import os +import sys + +from .config import settings + +logger = logging.getLogger("orchestrator.plane_write_guard") + +# Verdicts returned by decide() (the calling primitive executes them). +ALLOW = True +BLOCK = False + +# Reasons (stable slugs — asserted by tests / read in audit lines). +R_LIVE_RUNTIME = "live-runtime" # not a test process -> no-op (prod/staging). +R_AMBIGUOUS = "ambiguous-target" # project_id empty/unresolved -> "don't know => don't write". 
+R_OPT_IN_DISABLED = "opt-in-disabled" # test process, opt-in OFF -> default-deny. +R_PROD_IN_TEST = "prod-project-in-test" # test process, project NOT in sandbox allowlist. +R_SANDBOX_OPT_IN = "sandbox-opt-in" # test process, opt-in ON + sandbox project -> ALLOW. +R_GUARD_ERROR = "guard-error" # internal error inside the test-path -> fail-closed BLOCK. + +# Operation tokens (one per call site) — used only for the audit line. +OP_STATE = "state" # update_issue_state / _set_issue_state_direct (httpx.patch) +OP_COMMENT = "comment" # add_comment (httpx.post) + + +def _in_test_process() -> bool: + """True iff this Python process is a pytest/worktree test process (ADR-001 D2). + + ``"pytest" in sys.modules`` is true for the whole pytest run (collection + + execution) in THIS process, which is exactly the worktree ``python -m pytest`` + process from the incident. The live orchestrator and the staging runtime start + via ``uvicorn src.main:app`` and never import pytest into their process, so the + detection never fires there (NFR-2/NFR-3, AC-5/AC-6). ``PYTEST_CURRENT_TEST`` is + a secondary confirming signal pytest sets for the duration of a test body. Both + are read at call time (NFR-4). Never raises: on any error we treat the process + as NOT-a-test (-> live ALLOW), so the guard can never accidentally wedge a + legitimate prod write. + """ + try: + if "pytest" in sys.modules: + return True + if os.environ.get("PYTEST_CURRENT_TEST"): + return True + except Exception: # noqa: BLE001 - never-raise; ambiguity -> "not a test" (live ALLOW). + return False + return False + + +def _sandbox_allowlist() -> set[str]: + """Sanitised set of sandbox project ids from ``plane_test_sandbox_projects``. + + Empty/blank CSV -> empty set (then EVERY project blocks in a test process, + fail-closed). Never raises. + """ + try: + raw = (settings.plane_test_sandbox_projects or "").strip() + except Exception: # noqa: BLE001 - never-raise. 
+ return set() + if not raw: + return set() + return {tok.strip() for tok in raw.split(",") if tok.strip()} + + +def decide(project_id: str | None, op: str, work_item_id: str | None = None) -> tuple[bool, str]: + """Decide whether a Plane WRITE primitive may hit the network (ADR-001 D3). + + Returns ``(ok, reason)``. Steps: + + 1. ``not _in_test_process()`` -> ALLOW (``live-runtime``: prod/staging no-op). + 2. ``project_id`` empty/None/unresolved -> BLOCK (``ambiguous-target``, NFR-1). + 3. opt-in flag OFF -> BLOCK (``opt-in-disabled``, default-deny). + 4. ``project_id`` ∉ sandbox allowlist -> BLOCK (``prod-project-in-test``, AC-3). + 5. otherwise -> ALLOW (``sandbox-opt-in``, audit INFO). + + never-raise: the live path returns at step 1 BEFORE the try-block, so a prod + write can never be wedged by a guard bug. Once we know we are in a test process, + any internal error fails CLOSED to BLOCK (``guard-error``) — the defect surfaces + loudly rather than re-opening prod (AC-9 / FR-7). + """ + # Step 1 — outside any test process the guard is a strict no-op. Evaluated FIRST + # and OUTSIDE the try-block so a live prod/staging write is never affected. + if not _in_test_process(): + return ALLOW, R_LIVE_RUNTIME + + # From here on we are in a test process: default-deny, fail-closed on any error. + try: + pid = (project_id or "").strip() + if not pid: + return BLOCK, R_AMBIGUOUS # step 2 + if not bool(getattr(settings, "plane_test_write_enabled", False)): + return BLOCK, R_OPT_IN_DISABLED # step 3 + if pid not in _sandbox_allowlist(): + return BLOCK, R_PROD_IN_TEST # step 4 — sandbox-only, even with opt-in. + return ALLOW, R_SANDBOX_OPT_IN # step 5 + except Exception as e: # noqa: BLE001 - never-raise; in-test -> fail CLOSED. 
+ logger.error( + "plane_write_guard.decide error in test-process -> BLOCK (fail-closed): %s", e + ) + return BLOCK, R_GUARD_ERROR + + +def audit_block(project_id: str | None, op: str, work_item_id: str | None, reason: str) -> None: + """Loud structured audit of a BLOCKED write (FR-5 / D7). + + Logged at ERROR so an ORCH-114-class incident (a pytest mutating a non-sandbox + project) is obvious in the run log, not silent. Never raises. + """ + try: + logger.error( + "plane_write_guard BLOCKED Plane %s write from a test process: " + "project_id=%s work_item=%s reason=%s " + "(ORCH-117 fail-closed; this would have mutated a non-sandbox Plane " + "project from pytest — cf. the ORCH-114 incident)", + op, project_id, work_item_id, reason, + ) + except Exception: # noqa: BLE001 - logging must never raise. + pass + + +def audit_allow(project_id: str | None, op: str, work_item_id: str | None, + reason: str = R_SANDBOX_OPT_IN) -> None: + """Audit (INFO) an ALLOWED real sandbox write from a test process (FR-5 / D7). + + Only the ``sandbox-opt-in`` case is audited here — the ``live-runtime`` ALLOW + (prod/staging) is the normal hot path and is intentionally NOT logged to avoid + spamming the production log. Never raises. + """ + try: + logger.info( + "plane_write_guard ALLOWED sandbox Plane %s write from a test process: " + "project_id=%s work_item=%s reason=%s", + op, project_id, work_item_id, reason, + ) + except Exception: # noqa: BLE001 - logging must never raise. + pass + + +def snapshot() -> dict: + """Read-only view of the guard state (optional observability, D7). Never raises.""" + try: + return { + "in_test_process": _in_test_process(), + "opt_in_enabled": bool(getattr(settings, "plane_test_write_enabled", False)), + "sandbox_allowlist": sorted(_sandbox_allowlist()), + } + except Exception as e: # noqa: BLE001 - never-raise. 
+ return {"error": str(e)} diff --git a/tests/conftest.py b/tests/conftest.py index 7f55a30..66f6a11 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -135,6 +135,34 @@ def _disable_merge_verify(monkeypatch): yield +@pytest.fixture(autouse=True) +def _plane_sandbox_only(monkeypatch): + """ORCH-117: fail-closed FLOOR — no test may write to a non-sandbox Plane project. + + The independent second layer of the sandbox-only Plane-write guard (ADR-001 D5), + by the same model as ``_no_telegram``: it forces the safe defaults for EVERY + test, OVERRIDING any live variable inherited from the container environment. + + With the opt-in OFF, ``src/plane_write_guard.decide`` blocks ALL Plane writes + from the test process (both sandbox and prod) -> default-deny (AC-4). Even if the + runtime leaf ever erroneously returned ALLOW, this floor keeps a prod write from + a plain ``pytest tests/`` impossible. Sandbox-e2e tests that need a REAL write to + SANDBOX re-enable the opt-in in their OWN fixture AFTER this autouse (exactly as + ``test_orch114_*`` / ``test_merge_verify`` re-enable their flags); the allowlist + already contains the SANDBOX id, so the write to SANDBOX passes while a prod write + still blocks (allowlist sandbox-only, AC-3). + """ + from src import config as _cfg + monkeypatch.setattr(_cfg.settings, "plane_test_write_enabled", False, raising=False) + monkeypatch.setattr( + _cfg.settings, + "plane_test_sandbox_projects", + "8c5a3025-4f9d-4190-b79f-fa06276bb27e", + raising=False, + ) + yield + + @pytest.fixture(autouse=True) def _disable_transition_lease(monkeypatch): """ORCH-114: disable the transition-ownership lease + expected-stage CAS by diff --git a/tests/test_orch117_plane_write_isolation.py b/tests/test_orch117_plane_write_isolation.py new file mode 100644 index 0000000..c7ffa24 --- /dev/null +++ b/tests/test_orch117_plane_write_isolation.py @@ -0,0 +1,287 @@ +"""ORCH-117 (adr-0046): sandbox-only fail-closed isolation of Plane WRITES. 
+ +Regression of the ORCH-114 incident: a pytest/worktree process performed a REAL +``PATCH …/issues/… state=`` + comment against the PRODUCTION Plane project, +because test/staging processes inherit the live Plane token and nothing forced them +to write only to the sandbox. This suite pins the fix (``src/plane_write_guard.py`` +wired into the three ``plane_sync`` write primitives + the conftest floor). + +Covers TC-01…TC-14 (see docs/work-items/ORCH-117/04-test-plan.yaml). httpx is mocked +throughout — there are NO real network calls (a prod write is the very thing the fix +forbids). The autouse conftest fixture ``_plane_sandbox_only`` sets the safe floor +(opt-in OFF, sandbox allowlist = the one SANDBOX id) for the whole suite; ALLOW-path +tests re-enable the opt-in in their own monkeypatch AFTER it (the documented pattern). + +TC-01 is the MANDATORY incident regression: it is RED before the fix (without the +guard hook the call reaches ``httpx.patch``/``httpx.post``) and GREEN after. +""" + +import logging +import os + +# Match the env-default convention of the other plane suites so config loads cleanly. +os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token") +os.environ.setdefault("ORCH_PLANE_WORKSPACE_SLUG", "test-ws") +os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") + +from unittest.mock import MagicMock, patch # noqa: E402 + +import pytest # noqa: E402 + +from src import config as _cfg # noqa: E402 +from src import plane_sync as PS # noqa: E402 +from src import plane_write_guard as PWG # noqa: E402 + +# Project ids (verified literals — TRZ §3 / ADR-001 / test-plan notes). +PROD = "7a79f0a9-5278-49cd-9007-9a338f238f9c" # a live (non-sandbox) project. +SANDBOX = "8c5a3025-4f9d-4190-b79f-fa06276bb27e" # the one allowed sandbox project.
+ + +# --------------------------------------------------------------------------- # +# Helpers / fixtures +# --------------------------------------------------------------------------- # +def _opt_in(monkeypatch, projects: str = SANDBOX): + """Turn the sandbox-write opt-in ON (it is OFF by default via the conftest floor).""" + monkeypatch.setattr(_cfg.settings, "plane_test_write_enabled", True, raising=False) + monkeypatch.setattr(_cfg.settings, "plane_test_sandbox_projects", projects, raising=False) + + +def _mock_httpx(): + """Patch ``plane_sync.httpx`` so any patch/post/get is RECORDED, never sent.""" + return patch.object(PS, "httpx", MagicMock()) + + +def _resp_ok(): + r = MagicMock() + r.status_code = 200 + r.raise_for_status.return_value = None + return r + + +@pytest.fixture +def _network_stubs(): + """Stub the network helpers so an ALLOWED write would reach httpx (not the DB/API).""" + with patch.object(PS, "find_issue_id", return_value="issue-uuid"), \ + patch.object(PS, "stage_to_state", return_value="state-uuid"): + yield + + +# --------------------------------------------------------------------------- # +# TC-01 — MANDATORY regression of the ORCH-114 incident. +# --------------------------------------------------------------------------- # +def test_tc01_notify_stage_change_prod_makes_zero_writes(monkeypatch): + """A live prod token in PLANE_HEADERS + pytest + the incident call + ``notify_stage_change('ORCH-114','deploy','done')`` against the prod project -> + ZERO real httpx.patch/post. RED before the guard hook, GREEN after.""" + # Mirror the incident: a REAL prod token is captured in the module headers. + monkeypatch.setattr(PS, "PLANE_HEADERS", {"X-API-Key": "LIVE-PROD-TOKEN"}, raising=False) + # No opt-in (default floor) — exactly a normal `pytest tests/` run.
+ with _mock_httpx() as mock_httpx, \ + patch.object(PS, "find_issue_id", return_value="issue-uuid"), \ + patch.object(PS, "stage_to_state", return_value="state-uuid"): + PS.notify_stage_change("ORCH-114", "deploy", "done", project_id=PROD) + + mock_httpx.patch.assert_not_called() + mock_httpx.post.assert_not_called() + + +# --------------------------------------------------------------------------- # +# TC-02 / TC-03 / TC-04 — each write primitive blocks a prod target in-test. +# --------------------------------------------------------------------------- # +def test_tc02_update_issue_state_prod_blocked(monkeypatch, caplog, _network_stubs): + """update_issue_state -> prod project -> httpx.patch NOT called; reason prod-project-in-test.""" + _opt_in(monkeypatch) # opt-in ON so the BLOCK reason is the allowlist, not opt-in-off. + with _mock_httpx() as mock_httpx, caplog.at_level(logging.INFO, logger="orchestrator.plane_write_guard"): + PS.update_issue_state("ORCH-1", "done", project_id=PROD) + mock_httpx.patch.assert_not_called() + assert PWG.R_PROD_IN_TEST in caplog.text + + +def test_tc03_add_comment_prod_blocked(monkeypatch, _network_stubs): + """add_comment -> prod project -> httpx.post NOT called.""" + _opt_in(monkeypatch) + with _mock_httpx() as mock_httpx: + PS.add_comment("ORCH-1", "hello", project_id=PROD) + mock_httpx.post.assert_not_called() + + +def test_tc04_set_issue_state_direct_prod_blocked(monkeypatch, _network_stubs): + """_set_issue_state_direct (the primitive every set_issue_* funnels into) -> + prod project -> httpx.patch NOT called.""" + _opt_in(monkeypatch) + with _mock_httpx() as mock_httpx: + PS._set_issue_state_direct("ORCH-1", "state-uuid", project_id=PROD) + mock_httpx.patch.assert_not_called() + + +def test_tc04_set_issue_done_prod_blocked(monkeypatch): + """set_issue_done -> _set_issue_state_direct -> prod -> blocked (covers the + public set_issue_* surface, which all reduce to the guarded primitive).""" + _opt_in(monkeypatch) + with 
_mock_httpx() as mock_httpx, \ + patch.object(PS, "get_project_states", return_value={"done": "done-uuid"}), \ + patch.object(PS, "find_issue_id", return_value="issue-uuid"): + PS.set_issue_done("ORCH-1", project_id=PROD) + mock_httpx.patch.assert_not_called() + + +# --------------------------------------------------------------------------- # +# TC-05 — default-deny: without opt-in, EVERY target (incl. sandbox) is blocked. +# --------------------------------------------------------------------------- # +def test_tc05_default_deny_blocks_sandbox_and_prod(_network_stubs): + """No opt-in (conftest floor) -> sandbox AND prod both blocked.""" + with _mock_httpx() as mock_httpx: + PS.update_issue_state("ORCH-1", "done", project_id=SANDBOX) + PS.update_issue_state("ORCH-1", "done", project_id=PROD) + mock_httpx.patch.assert_not_called() + # Verdict-level: the reason is opt-in-disabled for both. + assert PS.plane_write_guard.decide(SANDBOX, "state")[1] == PWG.R_OPT_IN_DISABLED + assert PS.plane_write_guard.decide(PROD, "state")[1] == PWG.R_OPT_IN_DISABLED + + +# --------------------------------------------------------------------------- # +# TC-06 — sandbox allow: opt-in ON + sandbox project -> real (mocked) write fires. +# --------------------------------------------------------------------------- # +def test_tc06_sandbox_optin_allows_write(monkeypatch, _network_stubs): + """opt-in ON + SANDBOX -> httpx.patch IS called, addressed to the sandbox URL.""" + _opt_in(monkeypatch) + with _mock_httpx() as mock_httpx: + mock_httpx.patch.return_value = _resp_ok() + PS.update_issue_state("ORCH-1", "done", project_id=SANDBOX) + mock_httpx.patch.assert_called_once() + url = mock_httpx.patch.call_args.args[0] + assert SANDBOX in url + assert PROD not in url + + +# --------------------------------------------------------------------------- # +# TC-07 — sandbox-only even with opt-in: a prod target is ALWAYS blocked. 
+# --------------------------------------------------------------------------- # +def test_tc07_optin_still_blocks_prod(monkeypatch): + """opt-in ON does NOT unlock prod — the allowlist is sandbox-only (AC-3).""" + _opt_in(monkeypatch) + ok, reason = PS.plane_write_guard.decide(PROD, "state", "ORCH-1") + assert ok is False + assert reason == PWG.R_PROD_IN_TEST + + +# --------------------------------------------------------------------------- # +# TC-08 — fail-closed on ambiguity: empty/None target -> block. +# --------------------------------------------------------------------------- # +def test_tc08_ambiguous_target_blocked(monkeypatch): + """opt-in ON but project_id empty/None -> block (NFR-1 'don't know => don't write').""" + _opt_in(monkeypatch) + assert PS.plane_write_guard.decide("", "state")[1] == PWG.R_AMBIGUOUS + assert PS.plane_write_guard.decide(None, "comment")[1] == PWG.R_AMBIGUOUS + assert PS.plane_write_guard.decide(" ", "state")[1] == PWG.R_AMBIGUOUS + + +# --------------------------------------------------------------------------- # +# TC-09 — immune to the import-time token capture (AC-7 / NFR-4). +# --------------------------------------------------------------------------- # +def test_tc09_blocks_regardless_of_captured_token(monkeypatch, _network_stubs): + """A REAL token in PLANE_HEADERS (captured at import) does not help: the guard + decides at CALL time on (test-process + target project), not on the token, and + does not rely on os.environ.setdefault / a settings token swap.""" + monkeypatch.setattr(PS, "PLANE_HEADERS", {"X-API-Key": "LIVE-PROD-TOKEN"}, raising=False) + # No opt-in: a plain pytest run with a live token still cannot mutate prod. + with _mock_httpx() as mock_httpx: + PS.update_issue_state("ORCH-1", "done", project_id=PROD) + PS._set_issue_state_direct("ORCH-1", "state-uuid", project_id=PROD) + mock_httpx.patch.assert_not_called() + # The verdict is token-independent. 
+ assert PS.plane_write_guard.decide(PROD, "state")[0] is False + + +# --------------------------------------------------------------------------- # +# TC-10 — zero regression of the LIVE runtime: not-a-test -> guard is a no-op. +# --------------------------------------------------------------------------- # +def test_tc10_live_runtime_is_noop(monkeypatch, _network_stubs): + """Simulate a non-pytest process -> guard ALLOWs (live-runtime) and the prod + write goes out byte-for-byte (same URL/headers/payload as before ORCH-117).""" + monkeypatch.setattr(PWG, "_in_test_process", lambda: False) + monkeypatch.setattr(PS, "PLANE_HEADERS", {"X-API-Key": "LIVE-PROD-TOKEN"}, raising=False) + with _mock_httpx() as mock_httpx: + mock_httpx.patch.return_value = _resp_ok() + PS.update_issue_state("ORCH-1", "done", project_id=PROD) + mock_httpx.patch.assert_called_once() + args, kwargs = mock_httpx.patch.call_args + assert PROD in args[0] + assert kwargs["headers"] == {"X-API-Key": "LIVE-PROD-TOKEN"} + assert kwargs["json"] == {"state": "state-uuid"} + # The verdict itself is ALLOW/live-runtime. + assert PWG.decide(PROD, "state") == (True, PWG.R_LIVE_RUNTIME) + + +# --------------------------------------------------------------------------- # +# TC-11 — staging runtime (not pytest) writes to SANDBOX normally. 
+# --------------------------------------------------------------------------- # +def test_tc11_staging_writes_sandbox(monkeypatch, _network_stubs): + """Staging is a real uvicorn process (not pytest) on the sandbox project -> + the test-process detection does NOT fire, the write to SANDBOX passes.""" + monkeypatch.setattr(PWG, "_in_test_process", lambda: False) + with _mock_httpx() as mock_httpx: + mock_httpx.patch.return_value = _resp_ok() + PS.update_issue_state("ORCH-1", "done", project_id=SANDBOX) + mock_httpx.patch.assert_called_once() + assert SANDBOX in mock_httpx.patch.call_args.args[0] + + +# --------------------------------------------------------------------------- # +# TC-12 — audit/observability of block (loud) and allow (info). +# --------------------------------------------------------------------------- # +def test_tc12_block_audited_loudly(monkeypatch, caplog, _network_stubs): + """A blocked write emits a structured WARNING/ERROR carrying project_id / + work_item / op / reason.""" + _opt_in(monkeypatch) + with caplog.at_level(logging.INFO, logger="orchestrator.plane_write_guard"), _mock_httpx(): + PS.update_issue_state("ORCH-114", "done", project_id=PROD) + blocks = [r for r in caplog.records if r.levelno >= logging.WARNING] + assert blocks, "a block must emit at least one WARNING/ERROR record" + text = caplog.text + assert PROD in text and "ORCH-114" in text + assert PWG.OP_STATE in text and PWG.R_PROD_IN_TEST in text + + +def test_tc12_sandbox_allow_audited_info(monkeypatch, caplog, _network_stubs): + """An allowed sandbox write emits an INFO audit line.""" + _opt_in(monkeypatch) + with caplog.at_level(logging.INFO, logger="orchestrator.plane_write_guard"), \ + _mock_httpx() as mock_httpx: + mock_httpx.patch.return_value = _resp_ok() + PS.update_issue_state("ORCH-1", "done", project_id=SANDBOX) + infos = [r for r in caplog.records if r.levelno == logging.INFO and "ALLOWED" in r.message] + assert infos, "an allowed sandbox write must emit an INFO 
audit line" + assert SANDBOX in caplog.text + + +# --------------------------------------------------------------------------- # +# TC-13 — the autouse conftest floor protects the whole suite by default. +# --------------------------------------------------------------------------- # +def test_tc13_conftest_floor_default_deny(): + """Without any per-test opt-in, the floor leaves the opt-in OFF and the sandbox + allowlist pinned to the one SANDBOX id -> a representative write to prod is a + no-op (default-deny is active for every test, not just this file).""" + assert _cfg.settings.plane_test_write_enabled is False + assert _cfg.settings.plane_test_sandbox_projects == SANDBOX + with _mock_httpx() as mock_httpx, \ + patch.object(PS, "find_issue_id", return_value="issue-uuid"), \ + patch.object(PS, "stage_to_state", return_value="state-uuid"): + PS.update_issue_state("ORCH-2", "done", project_id=PROD) + mock_httpx.patch.assert_not_called() + + +# --------------------------------------------------------------------------- # +# TC-14 — no kill-switch back-door (NFR-6 / FR-7 / D4 anti-drift). +# --------------------------------------------------------------------------- # +def test_tc14_no_killswitch_backdoor(monkeypatch): + """There is intentionally NO ``plane_write_guard_enabled`` kill-switch that + re-opens a prod write from pytest. The only reversible regulator is the + sandbox-bound opt-in; even with it ON, prod stays blocked.""" + # Anti-drift: the back-door config key must not exist (a future agent adding it + # would reintroduce the ORCH-114 defect — see ADR-001 D4 / TR-4). + assert not hasattr(_cfg.settings, "plane_write_guard_enabled") + # Opt-in ON is sandbox-bound, never a prod back-door.
+ _opt_in(monkeypatch) + assert PWG.decide(PROD, "state")[0] is False + assert PWG.decide(SANDBOX, "state")[0] is True diff --git a/tests/test_plane_author.py b/tests/test_plane_author.py index 2b672db..d2a7c4d 100644 --- a/tests/test_plane_author.py +++ b/tests/test_plane_author.py @@ -16,11 +16,27 @@ import os os.environ.setdefault("ORCH_PLANE_API_TOKEN", "shared-token") os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") +import pytest # noqa: E402 from unittest.mock import patch, MagicMock # noqa: E402 from src import plane_sync # noqa: E402 +@pytest.fixture(autouse=True) +def _allow_plane_writes(monkeypatch): + """ORCH-117: these tests exercise the write primitives' header/URL routing and + assert on the (mocked) httpx call. The fail-closed sandbox guard (conftest + ``_plane_sandbox_only``) would otherwise block the write in-process (proj is not + a sandbox id + opt-in off). Bypass the guard verdict here so the network-shape + assertions still run; the guard ITSELF is covered by + tests/test_orch117_plane_write_isolation.py.""" + monkeypatch.setattr( + plane_sync.plane_write_guard, "decide", + lambda *a, **k: (True, "test-bypass"), raising=False, + ) + yield + + # --------------------------------------------------------------------------- # # _headers_for # --------------------------------------------------------------------------- # diff --git a/tests/test_plane_status_model.py b/tests/test_plane_status_model.py index a330573..acd00d0 100644 --- a/tests/test_plane_status_model.py +++ b/tests/test_plane_status_model.py @@ -15,11 +15,24 @@ import os os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token") os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") +import pytest # noqa: E402 from unittest.mock import patch, MagicMock # noqa: E402 from src import plane_sync as PS # noqa: E402 +@pytest.fixture(autouse=True) +def _allow_plane_writes(monkeypatch): + """ORCH-117: bypass the fail-closed sandbox write-guard so these layer-B + URL/state-resolution 
assertions still reach the (mocked) httpx.patch. The guard + itself is covered by tests/test_orch117_plane_write_isolation.py.""" + monkeypatch.setattr( + PS.plane_write_guard, "decide", + lambda *a, **k: (True, "test-bypass"), raising=False, + ) + yield + + # A per-project state map that DEFINES the new ORCH-066 statuses with distinct # UUIDs, so we can prove the dedicated status (not the base alias) is used. _STATES_WITH_NEW = { diff --git a/tests/test_plane_sync_labels.py b/tests/test_plane_sync_labels.py index 365e8d5..b849b2d 100644 --- a/tests/test_plane_sync_labels.py +++ b/tests/test_plane_sync_labels.py @@ -33,6 +33,14 @@ def fresh_cache(monkeypatch): ps.reload_project_labels() monkeypatch.setattr(ps, "_resolve_project_id", lambda w=None, p=None: "proj-1") monkeypatch.setattr(ps.settings, "auto_label_states_ttl_s", 300, raising=False) + # ORCH-117: the TC-09 set_issue_approved test reaches the guarded write primitive + # with a non-sandbox project ("proj-1"); bypass the fail-closed sandbox guard so + # its (mocked) httpx.patch assertion runs. The guard is covered by + # tests/test_orch117_plane_write_isolation.py. + monkeypatch.setattr( + ps.plane_write_guard, "decide", + lambda *a, **k: (True, "test-bypass"), raising=False, + ) yield ps.reload_project_labels() diff --git a/tests/test_stage_visibility.py b/tests/test_stage_visibility.py index d7be813..35b375f 100644 --- a/tests/test_stage_visibility.py +++ b/tests/test_stage_visibility.py @@ -16,11 +16,24 @@ import os os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token") os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") +import pytest # noqa: E402 from unittest.mock import patch, MagicMock # noqa: E402 from src import plane_sync as PS # noqa: E402 +@pytest.fixture(autouse=True) +def _allow_plane_writes(monkeypatch): + """ORCH-117: bypass the fail-closed sandbox write-guard so these stage-visibility + PATCH assertions still reach the (mocked) httpx.patch. 
The guard itself is covered + by tests/test_orch117_plane_write_isolation.py.""" + monkeypatch.setattr( + PS.plane_write_guard, "decide", + lambda *a, **k: (True, "test-bypass"), raising=False, + ) + yield + + EXPECTED_UUIDS = { "architecture": "3020bbb7-6122-4663-930c-0315ba8dfa3d", "development": "9920609b-f140-4e46-ab95-89acda8412c8",