From 1d72c445871d1d8e73a35211fbc452a1c5ef37f8 Mon Sep 17 00:00:00 2001 From: claude-bot Date: Mon, 8 Jun 2026 05:06:29 +0000 Subject: [PATCH] fix(reconciler): stop F-2 livelock spam on synced terminal tasks + cache TTL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reconciler F-2 spammed Telegram " разблокирована" every ~120s for a fully-synchronized Done task (incident ET-002, 191+ msgs/night) after the ORCH-066 Plane status model merge. Two stacked defects (defense in depth): - D1 (selection): actionable states were told apart by bare UUID, so a Done issue aliased onto the approved UUID entered the approved branch. Now terminal states are excluded by Plane state GROUP (completed/cancelled), a project-independent discriminator robust to UUID aliasing; per-issue check with a logical-key fallback when the group is unavailable. get_project_states caches {uuid -> group} from the same /states/ fetch; new sibling accessor get_project_state_groups. - D2 (notification): _note_unblock fired unconditionally after _dispatch. Now it only fires on a confirmed state change (stage before/after _dispatch; task-appears for the start case) — handlers' contracts untouched. - TR-3: in-memory dedup guard {issue_id -> last unblocked state} as a backstop. - TR-4: _STATES_CACHE lived for the whole process lifetime, so a new Plane status was invisible without a restart. Added TTL ORCH_PLANE_STATES_TTL_S (default 300s; 0 = previous lifetime cache) reusing reload_project_states(); a failed refresh serves the stale-but-correct set, not enduro defaults. STAGE_TRANSITIONS / QG_CHECKS / DB schema / handle_* contracts / F-1 / F-3 unchanged; never-raise preserved; self-hosting tick never restarts prod. Observability: skipped_terminal_total / deduped_total in /queue reconcile block. Tests: tests/test_reconciler_plane.py (TC-01..TC-10), tests/test_plane_states_cache.py (TC-11/TC-12). Refs: ORCH-068 Co-Authored-By: Claude Opus 4.7 --- .env.example | 9 + CHANGELOG.md | 1 + docs/architecture/README.md | 6 +- src/config.py | 12 ++ src/plane_sync.py | 81 +++++++- src/reconciler.py | 97 ++++++++- tests/test_plane_states_cache.py | 180 ++++++++++++++++ tests/test_reconciler_plane.py | 339 +++++++++++++++++++++++++++++++ 8 files changed, 709 insertions(+), 16 deletions(-) create mode 100644 tests/test_plane_states_cache.py diff --git a/.env.example b/.env.example index a7ef50c..c14636e 100644 --- a/.env.example +++ b/.env.example @@ -117,6 +117,15 @@ ORCH_RECONCILE_GRACE_OVERRIDES_JSON= ORCH_RECONCILE_NOTIFY_UNBLOCK=true ORCH_RECONCILE_SKIP_BLOCKED_ENABLED=true +# ORCH-068: TTL (seconds) for the per-project Plane states cache (plane_sync +# _STATES_CACHE). Historically the cache lived for the whole process lifetime, +# so a status added to Plane after start was invisible until a restart +# ("stale set -> no pipeline action"). With a TTL the entry self-heals by +# re-fetching /states/ once it expires (reuses reload_project_states()). +# >0 -> re-fetch after this many seconds (default 300 = 5 min); +# 0 -> disable TTL -> strictly the previous lifetime cache (back-compat). +ORCH_PLANE_STATES_TTL_S=300 + # ORCH-065: job-reaper + proactive merge-lease reclaim. A background daemon thread # (src/job_reaper.py, started LAST in main.lifespan after requeue_running_jobs) reaps # zombie 'running' jobs whose monitor/process died before writing the terminal status diff --git a/CHANGELOG.md b/CHANGELOG.md index 09dfba8..fc67008 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ - Цепочка стадий: `... testing → deploy-staging → deploy → done` (была без `deploy-staging`). ### Fixed +- **Reconciler (F-2) больше не зацикливается на спаме «разблокирована» по синхронизированной done-задаче** (ORCH-068): после мерджа новой статусной модели Plane (ORCH-066) sweeper потерянных webhook (ORCH-053) каждые ~120с слал в Telegram `reconciler: ET-002 done разблокирована (потерян webhook)` для полностью синхронизированной задачи (БД `stage=done`, Plane `state=Done`) — livelock без advance/jobs/токенов, но 191+ сообщений за ночь (alert-fatigue, подрыв доверия к нотификациям). Два независимых складывающихся дефекта (defense in depth, ADR-001): **D1 (выборка)** — F-2 различал actionable-статусы по голому UUID, а после ORCH-066 терминальный `Done` перестал однозначно отличаться от `approved` по UUID (UUID-алиасинг) и done-issue попадала в ветку `approved`; терминалы нигде не исключались. Решение: исключение терминалов по **группе состояния Plane** (`state.group ∈ {completed, cancelled}`) — проектно-независимый, устойчивый к переименованиям дискриминатор; проверка per-issue (а не сужением `wanted`-набора, т.к. при алиасинге терминал физически совпадает с actionable-UUID); fallback по логическим ключам `done`/`cancelled`, когда группа недоступна. `get_project_states` расширен записью `{uuid → group}` из ТОГО ЖЕ `/states/`-запроса (без новой сетевой стоимости) + sibling-аксессор `get_project_state_groups`. **D2 (нотификация)** — `_note_unblock` вызывался безусловно сразу после `_dispatch`, не проверяя, изменил ли обработчик реально состояние; `handle_verdict(approved)` для уже-`done` задачи — no-op, но нотификация всё равно уходила (нарушение собственного docstring и инварианта silence-when-in-sync). Решение: сравнение стадии задачи **до/после** `_dispatch` на стороне reconciler (контракты `handle_*` НЕ тронуты) — `_note_unblock` только при подтверждённом state change; для in_progress-старта подтверждение = задача появилась. Плюс **TR-3** — in-memory дедуп-guard `{issue_id → last_unblocked_state}` (страховка против любого будущего no-op-пути). Вторичный баг кэша (**TR-4**): `_STATES_CACHE` жил весь lifetime процесса → новый Plane-статус был невидим без рестарта («stale set → no pipeline action»); добавлен TTL `ORCH_PLANE_STATES_TTL_S` (дефолт 300с; `0` → прежний lifetime-кэш) — запись самозалечивается перезапросом `/states/` (примитив сброса — существующий `reload_project_states()`); при сбое перезапроса отдаётся stale-but-correct набор, а не enduro-дефолты. Форма возврата `get_project_states` неизменна (AC-13). `STAGE_TRANSITIONS`, `QG_CHECKS`, схема БД, контракты `handle_status_start`/`handle_verdict`, F-1/F-3 — не тронуты; never-raise per-issue/-project сохранён; self-hosting — тик не рестартит прод. Наблюдаемость: счётчики `skipped_terminal_total`/`deduped_total` в блоке `reconcile` снимка `GET /queue`. ADR `docs/work-items/ORCH-068/06-adr/ADR-001-reconciler-terminal-exclusion-and-cache-ttl.md`. Тесты: `tests/test_reconciler_plane.py` (TC-01…TC-10), `tests/test_plane_states_cache.py` (TC-11/TC-12). - **Staging-rebuild больше не падает на `COPY data/` (worktree-контекст)** (ORCH-021): `check_staging_image_fresh` (ORCH-058, Strategy A) пересобирает staging-образ с **worktree задачи** в качестве docker build context (`docker build … "$BUILD_CONTEXT"`). Свежий git-worktree содержит только трекаемые файлы, а `Dockerfile` делал `COPY data/ ./data/` — но `data/` (директория SQLite) **gitignored** и в worktree-контексте отсутствует → `docker build` падал с `exit 1` («BUILD-STAGING: docker build failed - aborting»), задачу заворачивало с `deploy-staging` на `development` (петля, выжигание developer-ретраев, инцидент текущего прогона ORCH-021). При этом COPY был мёртвым грузом: `data/` всегда приходит рантайм-volume'ом (`./data:/app/data` / `./data/staging:/app/data` в `docker-compose.yml`), который затеняет всё, что было запечено в образ. Заменено на `RUN mkdir -p /app/data` (директория-mountpoint существует и без bind-mount, без зависимости от build-контекста). Контракты `STAGE_TRANSITIONS`/`QG_CHECKS`, штамп `LABEL org.opencontainers.image.revision=$GIT_SHA` (ORCH-058 Strategy B), exit-код-контракт хука — не тронуты. Регресс-гард: `tests/test_deploy_hook_provenance.py::test_tc08b_dockerfile_does_not_copy_gitignored_data_dir` (запрещает `COPY` любого gitignored-пути). - **`deploy-staging` больше не зацикливается на infra-only FAIL песочницы (C9a/C9b)** (ORCH-061): self-hosting `orchestrator` крутился в петле `deploy-staging → development` — `scripts/staging_check.py` давал `exit 1` при ЛЮБОМ упавшем чеке, поэтому две чисто инфраструктурные проверки **C9a** (ветка не появилась в `orchestrator-sandbox`) и **C9b** (job аналитика не встал в очередь staging) — вызванные тем, что SANDBOX-бот-аккаунты не состоят в sandbox-проекте Plane (шаги 6+ конвейера в песочнице недостижимы, это НЕ регресс конвейера) — приводили к `staging_status: FAILED` → откат → цикл (выжигание developer-ретраев, токенов, паразитная нагрузка общего инстанса). Решение (Direction «б», ADR-001): чеки классифицируются на `REAL` (все проверки конвейера A*/B*/C7/C8 — fail-closed) и `SANDBOX_INFRA` (строго allowlist `{C9a, C9b}` — waivable). Новый leaf-модуль `src/staging_verdict.py` (stdlib-only, контракт «never raise», по образцу `merge_gate`/`image_freshness`): `classify_check(label)` (allowlist по ведущему токену, всё неизвестное/малформенное → `REAL` fail-closed) и `compute_staging_verdict(items, infra_tolerant) -> StagingVerdict`: любой REAL-FAIL → `FAILED`/exit 1 (страховка при ЛЮБОМ значении флага); упали ТОЛЬКО C9a/C9b и толерантность включена → `SUCCESS`/exit 0 + упавшие метки в `waived` (наблюдаемость); только C9a/C9b и толерантность выключена → `FAILED`/exit 1 (legacy-строгий); любая внутренняя ошибка вердикта → `FAILED`/exit 1 (никогда не ложный green). `scripts/staging_check.py`: `Results` авто-классифицирует каждый чек (публичная 3-tuple форма `_items` сохранена — регрессия-гард ORCH-048 b6), `categorized_items()` отдаёт категорию, `summary()` печатает разбивку REAL/SANDBOX_INFRA; `main()` сворачивает прогон через `_verdict(...)`, печатает строки `INFRA-WAIVED:`/`VERDICT:` и делает `sys.exit(verdict.exit_code)`; новый флаг `--strict` форсит строгий режим для одного запуска. Глобальный kill-switch `ORCH_STAGING_INFRA_TOLERANCE_ENABLED` (`Settings.staging_infra_tolerance_enabled`, default `true`; `false` → строгий 1:1 до ORCH-061), живёт в `.env.staging`; `--strict` имеет приоритет над env. Наблюдаемость на стороне конвейера: `src/agents/launcher.py` получил `action_stage_no_changes_note(stage, repo)` — на action-стадиях (`deploy-staging`/`deploy`) self-hosting-репо «нет изменений для коммита» логируется как ожидаемое, а не трактуется как недопоставка. Контракты НЕ менялись: `STAGE_TRANSITIONS`, реестр `QG_CHECKS`, frontmatter `staging_status: SUCCESS|FAILED` / `deploy_status:` (толерантность применяется в скрипте ДО записи артефакта деплоером), exit-code-контракт хука (0/1/2), `check_staging_status`/`_parse_staging_status`; схема БД — без миграций. ADR `docs/work-items/ORCH-061/06-adr/ADR-001-staging-infra-tolerance.md`. Документация: `docs/architecture/README.md`, `docs/operations/STAGING_CHECK.md`, `.openclaw/agents/deployer.md`. Тесты: `tests/test_staging_check_b6.py`, `tests/test_qg_checks.py`, `tests/test_config.py`, `tests/test_launcher.py`, `tests/test_qg.py`, `tests/test_stage_engine.py::TestStagingInfraTolerance`. - **Reconciler (F-1) больше не разблокирует escalated / Blocked / Needs-Input задачи** (ORCH-060): sweeper потерянных webhook (ORCH-053) не отличал «застряла из-за потерянного события» от «исчерпала лимит developer-ретраев и ждёт человека» — если CI зелёный, а reviewer слал REQUEST_CHANGES до `MAX_DEVELOPER_RETRIES`, каждый тик F-1 видел зелёный `check_ci_green` и доигрывал `development → review` → reviewer снова REQUEST_CHANGES → откат (стадия не меняется, escalated в `gitea.py` лишь шлёт `notify_error`) → следующий тик снова разблокировал. Бесконечная петля (инцидент ET-013: 10 разблокировок за ночь, лишние запуски агентов/токены, спам в Telegram, паразитная нагрузка общего self-hosting-инстанса). В `Reconciler._reconcile_gate_task` (`src/reconciler.py`) ПОСЛЕ существующих гардов (`analysis` carve-out, нет гейта, активный job, grace) и ДО пред-оценки гейта добавлены два пред-гарда с ранним `return` (молчаливый skip — без `advance`, без инкремента `unblocked_total`, без нотификаций): **Guard 1 (escalated, детерминированный, без сети, проверяется первым)** — `developer_retry_count(task_id) >= MAX_DEVELOPER_RETRIES`; приватный `stage_engine._developer_retry_count` повышен до публичного `developer_retry_count` (единый источник истины по подсчёту ретраев `agent_runs`, приватное имя сохранено как алиас), граница берётся из `stage_engine.MAX_DEVELOPER_RETRIES` (не хардкод `3`). **Guard 2 (явный человеческий Plane-статус, Вариант A — без миграции БД)** — новый never-raise хелпер `plane_sync.fetch_issue_state(issue_id, project_id) -> str|None` (тот же endpoint/headers, что `fetch_issue_sequence_id`) + `Reconciler._is_blocked_or_needs_input(task)`: резолв проекта (`projects.get_project_by_repo`) → `get_project_states(pid)` → сверка текущего state issue с `blocked`/`needs_input`; любая ошибка/`None`/нерезолвленный проект → консервативный skip (`True`: не-разблокировать безопаснее). F-2 по существу не менялся: Blocked/Needs Input не входят в опрашиваемый набор `{in_progress, approved, rejected}` → не доигрываются (зафиксировано регресс-тестом). Новый под-флаг `ORCH_RECONCILE_SKIP_BLOCKED_ENABLED` (true) гасит ТОЛЬКО сетевой Guard 2 (escape hatch при Plane-outage); Guard 1 всегда активен. Схема БД, `STAGE_TRANSITIONS`, `QG_CHECKS`, never-raise на единицу работы, `analysis` carve-out и kill-switch'и (`reconcile_enabled`/`reconcile_plane_enabled`) не менялись. ADR `docs/work-items/ORCH-060/06-adr/ADR-001-reconciler-skip-escalated.md`. Тесты: `tests/test_reconciler.py` (TC-01…TC-11 + регресс ORCH-053). diff --git a/docs/architecture/README.md b/docs/architecture/README.md index 02baba8..af2aa3d 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/README.md @@ -186,7 +186,9 @@ helper `validated_revision` питает и штамп A, и `EXPECTED_REVISION` development-задаче repo; неоднозначность → не резолвим). - **F-4 observability:** при разблокировке — лог-строка `reconciler: разблокирована (потерян webhook)` + Telegram (`reconcile_notify_unblock`); снимок - состояния в `GET /queue` (блок `reconcile`). + состояния в `GET /queue` (блок `reconcile`). **ORCH-068** добавляет в снимок + счётчики `skipped_terminal_total` (исключённые терминалы) и `deduped_total` + (подавленные повторные нотификации). Реализация: `src/reconciler.py` (daemon-поток по образцу `queue_worker`), стартует в `main.lifespan` **после** `worker.start()`, останавливается в `finally` **перед** @@ -314,4 +316,4 @@ ORCH-065 вводит фоновый watchdog, чтобы смерть проц Схема БД, потоки данных, resilience-слой, детали Dockerfile — [internals.md](internals.md). --- -*Актуально на 2026-06-07. Обновлять при изменении src/stages.py, src/qg/checks.py, src/main.py. Статусы доработок: ORCH-036 (исполняемый самодеплой `deploy`, adr-0007) — реализовано; ORCH-043 (merge-gate, adr-0006) — design, ветка feature/ORCH-043; ORCH-053 (reconciler, adr-0007, src/reconciler.py) — реализовано; ORCH-060 (F-1 skip escalated/Blocked/Needs-Input, `docs/work-items/ORCH-060/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-060 (Guard 1 `developer_retry_count>=MAX_DEVELOPER_RETRIES` + Guard 2 `plane_sync.fetch_issue_state` Blocked/Needs-Input, флаг `ORCH_RECONCILE_SKIP_BLOCKED_ENABLED`); ORCH-058 (провенанс staging-образа: check_staging_image_fresh + staging_check свежего образа + хук-guard, adr-0008) — реализовано в ветке feature/ORCH-058 (обновлять также при изменении src/image_freshness.py, scripts/orchestrator-deploy-hook.sh, Dockerfile); ORCH-061 (толерантность staging-вердикта к инфра-FAIL C9a/C9b, adr-0009, `docs/work-items/ORCH-061/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-061 (обновлять также при изменении src/staging_verdict.py, scripts/staging_check.py, флаг staging_infra_tolerance_enabled); ORCH-021 (post-deploy наблюдение прода + реакция на деградацию, adr-0010, `docs/work-items/ORCH-021/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-021-post-deploy-rollback (reserved-agent job `post-deploy-monitor`: арм в src/stage_engine.py блок `next_stage == "done"`, тик `run_post_deploy_monitor` + перехват в src/agents/launcher.py ДО _spawn; чистая логика src/post_deploy.py never-raise; флаги `post_deploy_*` в src/config.py; блок `post_deploy` в `/queue`; артефакт 16-post-deploy-log.md; self-hosting всегда ALERT_ONLY — тик не рестартит прод; обновлять также при изменении src/post_deploy.py / арм-блока / launcher-перехвата); ORCH-065 (job-reaper + проактивный реклейм merge-lease + идемпотентная финализация merge, adr-0011, `docs/work-items/ORCH-065/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-065 (новый daemon-поток src/job_reaper.py + старт/стоп в src/main.py lifespan; колонка `jobs.pid` через _ensure_column + проставление в src/agents/launcher.py `_spawn`; функции реклейма lease `pid_alive`/`reclaim_stale_lease` + guard `pr_already_merged` в src/merge_gate.py (консультируется merge-актором — промпт `.openclaw/agents/deployer.md`); флаги `reaper_*`/`lease_reclaim_*` в src/config.py; блок `reaper` в `/queue`; обновлять также при изменении этих мест); ORCH-068 (livelock-fix reconciler F-2: терминал-исключение по группе состояния + `_note_unblock` только при подтверждённом state change + дедуп; TTL `_STATES_CACHE`, `docs/work-items/ORCH-068/06-adr/ADR-001`) — design, ветка feature/ORCH-068 (обновлять также при изменении src/reconciler.py F-2, src/plane_sync.py `get_project_states`/`_STATES_CACHE`, флаг `plane_states_ttl_s` в src/config.py).* +*Актуально на 2026-06-07. Обновлять при изменении src/stages.py, src/qg/checks.py, src/main.py. Статусы доработок: ORCH-036 (исполняемый самодеплой `deploy`, adr-0007) — реализовано; ORCH-043 (merge-gate, adr-0006) — design, ветка feature/ORCH-043; ORCH-053 (reconciler, adr-0007, src/reconciler.py) — реализовано; ORCH-060 (F-1 skip escalated/Blocked/Needs-Input, `docs/work-items/ORCH-060/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-060 (Guard 1 `developer_retry_count>=MAX_DEVELOPER_RETRIES` + Guard 2 `plane_sync.fetch_issue_state` Blocked/Needs-Input, флаг `ORCH_RECONCILE_SKIP_BLOCKED_ENABLED`); ORCH-058 (провенанс staging-образа: check_staging_image_fresh + staging_check свежего образа + хук-guard, adr-0008) — реализовано в ветке feature/ORCH-058 (обновлять также при изменении src/image_freshness.py, scripts/orchestrator-deploy-hook.sh, Dockerfile); ORCH-061 (толерантность staging-вердикта к инфра-FAIL C9a/C9b, adr-0009, `docs/work-items/ORCH-061/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-061 (обновлять также при изменении src/staging_verdict.py, scripts/staging_check.py, флаг staging_infra_tolerance_enabled); ORCH-021 (post-deploy наблюдение прода + реакция на деградацию, adr-0010, `docs/work-items/ORCH-021/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-021-post-deploy-rollback (reserved-agent job `post-deploy-monitor`: арм в src/stage_engine.py блок `next_stage == "done"`, тик `run_post_deploy_monitor` + перехват в src/agents/launcher.py ДО _spawn; чистая логика src/post_deploy.py never-raise; флаги `post_deploy_*` в src/config.py; блок `post_deploy` в `/queue`; артефакт 16-post-deploy-log.md; self-hosting всегда ALERT_ONLY — тик не рестартит прод; обновлять также при изменении src/post_deploy.py / арм-блока / launcher-перехвата); ORCH-065 (job-reaper + проактивный реклейм merge-lease + идемпотентная финализация merge, adr-0011, `docs/work-items/ORCH-065/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-065 (новый daemon-поток src/job_reaper.py + старт/стоп в src/main.py lifespan; колонка `jobs.pid` через _ensure_column + проставление в src/agents/launcher.py `_spawn`; функции реклейма lease `pid_alive`/`reclaim_stale_lease` + guard `pr_already_merged` в src/merge_gate.py (консультируется merge-актором — промпт `.openclaw/agents/deployer.md`); флаги `reaper_*`/`lease_reclaim_*` в src/config.py; блок `reaper` в `/queue`; обновлять также при изменении этих мест); ORCH-068 (livelock-fix reconciler F-2: терминал-исключение по группе состояния + `_note_unblock` только при подтверждённом state change + дедуп; TTL `_STATES_CACHE`, `docs/work-items/ORCH-068/06-adr/ADR-001`) — реализовано в ветке feature/ORCH-068 (D1 терминал-гард по группе `_is_terminal_state` + `get_project_state_groups` в src/plane_sync.py; D2 сравнение стадии до/после `_dispatch` + дедуп-словарь в src/reconciler.py; TTL-запись `_STATES_CACHE` + флаг `plane_states_ttl_s` в src/config.py; счётчики `skipped_terminal_total`/`deduped_total` в `/queue`; обновлять также при изменении src/reconciler.py F-2, src/plane_sync.py `get_project_states`/`get_project_state_groups`/`_STATES_CACHE`).* diff --git a/src/config.py b/src/config.py index a35aafa..20bd8b8 100644 --- a/src/config.py +++ b/src/config.py @@ -265,6 +265,18 @@ class Settings(BaseSettings): reconcile_notify_unblock: bool = True reconcile_skip_blocked_enabled: bool = True + # ORCH-068: TTL for the per-project Plane states cache (_STATES_CACHE in + # plane_sync). Historically the cache lived for the whole process lifetime, + # so a status added to Plane after start was never seen without a restart + # ("stale set -> no pipeline action"). With a TTL the entry self-heals by + # re-fetching /states/ after it expires (invalidation reuses the existing + # reload_project_states() primitive — no duplicated reset logic). + # plane_states_ttl_s (env ORCH_PLANE_STATES_TTL_S): + # >0 -> seconds before a cache entry is re-fetched (default 300 = 5 min); + # 0 -> disable TTL -> strictly the previous lifetime cache (back-compat + # escape hatch). get_project_states return shape is unchanged. + plane_states_ttl_s: int = 300 + # ORCH-021: post-deploy production monitoring + degradation reaction. After # the terminal deploy->done transition for an applicable repo, a reserved-agent # `post-deploy-monitor` job (no LLM, modelled on deploy-finalizer) probes prod diff --git a/src/plane_sync.py b/src/plane_sync.py index f6ed56f..0334669 100644 --- a/src/plane_sync.py +++ b/src/plane_sync.py @@ -1,6 +1,7 @@ """Plane API sync — update issue state and add comments.""" import logging +import time import httpx from .config import settings @@ -130,18 +131,42 @@ _PLANE_NAME_TO_KEY: dict[str, str] = { "Blocked": "blocked", } -# Per-project state cache: {project_id: {logical_key: state_uuid}} -_STATES_CACHE: dict[str, dict[str, str]] = {} +# Per-project state cache (ORCH-10 + ORCH-068). +# +# Each entry is a RECORD, not a bare mapping: +# {"states": {logical_key: state_uuid}, # the ORCH-10 mapping (unchanged shape) +# "groups": {state_uuid: group}, # ORCH-068 D1: {uuid -> Plane state.group} +# "ts": monotonic timestamp} # ORCH-068 TR-4: for TTL self-heal +# get_project_states() still RETURNS the bare {logical_key: state_uuid} mapping +# (backward compatible — AC-13); the richer record is internal. +_STATES_CACHE: dict[str, dict] = {} + + +def _cache_record_fresh(record: dict) -> bool: + """ORCH-068 (TR-4): is a cache record still within its TTL? + + ``plane_states_ttl_s <= 0`` disables the TTL -> a record never expires + (strictly the previous lifetime-cache behaviour, back-compat escape hatch). + """ + ttl = settings.plane_states_ttl_s + if ttl <= 0: + return True + ts = record.get("ts", 0.0) + return (time.monotonic() - ts) <= ttl def get_project_states(project_id: str) -> dict[str, str]: """ORCH-10: resolve {logical_key -> state_uuid} for a specific Plane project. Source of truth: Plane API GET /projects//states/. - Results are cached per project_id for the lifetime of the process. + Results are cached per project_id. ORCH-068 (TR-4): a cached entry is + re-fetched once it is older than ``plane_states_ttl_s`` (default 300s) so a + status added to Plane after start self-heals without a process restart; + ``plane_states_ttl_s = 0`` keeps the previous lifetime cache. + Falls back to _DEFAULT_STATES (enduro-trails values) if: * project_id is empty/None, - * the API call fails (network error, non-2xx), + * the API call fails (network error, non-2xx) AND nothing is cached, * the response contains no recognisable states. The enduro-trails project therefore returns the same UUIDs as before @@ -151,8 +176,9 @@ def get_project_states(project_id: str) -> dict[str, str]: if not project_id: return _DEFAULT_STATES - if project_id in _STATES_CACHE: - return _STATES_CACHE[project_id] + cached = _STATES_CACHE.get(project_id) + if cached is not None and _cache_record_fresh(cached): + return cached["states"] url = f"{PLANE_BASE}/workspaces/{WORKSPACE}/projects/{project_id}/states/" try: @@ -165,12 +191,21 @@ def get_project_states(project_id: str) -> dict[str, str]: raise ValueError(f"unexpected states response shape: {type(items)}") resolved: dict[str, str] = {} + groups: dict[str, str] = {} for item in items: name = item.get("name", "") uid = item.get("id", "") key = _PLANE_NAME_TO_KEY.get(name) if key and uid: resolved[key] = uid + # ORCH-068 D1: capture {uuid -> group} for terminal-state detection + # (a single API fetch — no extra network cost). The group is the + # authoritative, project-independent discriminator of terminal + # (completed/cancelled) vs review/work statuses, robust to UUID + # aliasing after status renames (ORCH-066). + grp = item.get("group", "") + if uid and grp: + groups[uid] = grp if not resolved: raise ValueError("no recognisable states in API response") @@ -180,13 +215,26 @@ def get_project_states(project_id: str) -> dict[str, str]: for k, v in _DEFAULT_STATES.items(): resolved.setdefault(k, v) - _STATES_CACHE[project_id] = resolved + _STATES_CACHE[project_id] = { + "states": resolved, + "groups": groups, + "ts": time.monotonic(), + } logger.debug( - f"get_project_states: cached {len(resolved)} states for project {project_id[:8]}..." + f"get_project_states: cached {len(resolved)} states / " + f"{len(groups)} groups for project {project_id[:8]}..." ) return resolved except Exception as e: + # On a transient API failure keep serving the stale (but project-correct) + # set if we have one — far safer than reverting to enduro defaults. + if cached is not None: + logger.warning( + f"get_project_states: API refresh failed for project " + f"{project_id[:8]}..., serving stale cached set. Error: {e}" + ) + return cached["states"] logger.warning( f"get_project_states: API failed for project {project_id[:8]}..., " f"falling back to _DEFAULT_STATES. Error: {e}" @@ -194,6 +242,23 @@ def get_project_states(project_id: str) -> dict[str, str]: return _DEFAULT_STATES +def get_project_state_groups(project_id: str) -> dict[str, str]: + """ORCH-068 (D1): return {state_uuid -> group} for a Plane project. + + Reads the SAME cache record populated by ``get_project_states`` (no extra + network call). Call ``get_project_states(project_id)`` first to ensure the + record is fresh/populated. Returns ``{}`` when nothing is cached (e.g. the + API was unreachable and the caller fell back to ``_DEFAULT_STATES``); the + reconciler then falls back to logical terminal keys. + """ + record = _STATES_CACHE.get(project_id) + if isinstance(record, dict): + groups = record.get("groups") + if isinstance(groups, dict): + return groups + return {} + + def reload_project_states(project_id: str = None) -> None: """ORCH-10: clear the per-project states cache. diff --git a/src/reconciler.py b/src/reconciler.py index 6d65baa..21f42fd 100644 --- a/src/reconciler.py +++ b/src/reconciler.py @@ -60,7 +60,12 @@ from .stage_engine import ( MAX_DEVELOPER_RETRIES, ) from .stages import get_qg_for_stage -from .plane_sync import fetch_issue_state, get_project_states, list_issues_by_state +from .plane_sync import ( + fetch_issue_state, + get_project_states, + get_project_state_groups, + list_issues_by_state, +) from .webhooks.plane import handle_status_start, handle_verdict from .notifications import send_telegram from . import projects @@ -139,6 +144,13 @@ class Reconciler: self.last_run_ts: float | None = None self.unblocked_total: int = 0 self.last_unblocked: str | None = None + # ORCH-068 observability: terminal-state skips and dedup suppressions. + self.skipped_terminal_total: int = 0 + self.deduped_total: int = 0 + # ORCH-068 (TR-3): in-memory dedup guard {issue_id -> last unblocked + # state uuid}. Best-effort (resets on restart, like unblocked_total); + # suppresses a repeat unblock notification for the same issue+state. + self._unblock_dedup: dict[str, str] = {} # -- F-1: gate-side ---------------------------------------------------- def reconcile_gate_once(self) -> None: @@ -242,6 +254,9 @@ class Reconciler: pid = proj.plane_project_id # Resolve the actionable state uuids per-project (never hardcode). states = get_project_states(pid) + # ORCH-068 D1: {uuid -> group} from the SAME cache record (no extra + # fetch); empty when the API was unreachable -> per-issue fallback by key. + groups = get_project_state_groups(pid) in_progress = states["in_progress"] approved = states["approved"] rejected = states["rejected"] @@ -249,16 +264,36 @@ class Reconciler: for issue in issues: try: self._reconcile_plane_issue( - issue, pid, in_progress, approved, rejected + issue, pid, in_progress, approved, rejected, states, groups ) except Exception as e: # noqa: BLE001 - isolate one issue's failure logger.error( f"reconciler F-2: issue {issue.get('id')} failed: {e}" ) + def _is_terminal_state( + self, state_uuid: str, states: dict, groups: dict + ) -> bool: + """ORCH-068 D1: is ``state_uuid`` a terminal (completed/cancelled) state? + + Primary discriminator is the Plane **state group** (project-independent, + robust to UUID aliasing after status renames): ``group`` in + ``{completed, cancelled}`` -> terminal. When the group is unavailable + (API gave no ``group`` / we fell back to ``_DEFAULT_STATES``), fall back + to the logical terminal keys ``done`` / ``cancelled``. + """ + if not state_uuid: + return False + grp = groups.get(state_uuid) + if grp: + return grp in {"completed", "cancelled"} + # Fallback (group unknown): logical terminal keys for this project. + return state_uuid in {states.get("done"), states.get("cancelled")} + def _reconcile_plane_issue( self, issue: dict, project_id: str, in_progress: str, approved: str, rejected: str, + states: dict, groups: dict, ) -> None: issue_id = str(issue.get("id") or "") if not issue_id: @@ -266,6 +301,15 @@ class Reconciler: state = issue.get("state") new_state = state.get("id") if isinstance(state, dict) else state + # ORCH-068 D1: a terminal issue (Done / Cancelled) is fully in sync by + # definition -> never actionable. Excluded per-issue (not by narrowing + # `wanted`) because UUID aliasing can make a terminal uuid collide with + # an actionable one — only the state GROUP disentangles them. Restores + # the silence-when-in-sync invariant (AC-1/AC-2). + if self._is_terminal_state(new_state, states, groups): + self.skipped_terminal_total += 1 + return + # Grace ("lost, not merely delayed"): use the issue's own updated_at age. # A missing/unparseable timestamp is treated as old enough (the active-job # guard + atomic create-claim still prevent doubling). @@ -290,18 +334,41 @@ class Reconciler: if new_state == in_progress and task is None: # In Progress without a task -> start the pipeline (lost start webhook). + # ORCH-068 D2: confirm a REAL change (the task now exists) before + # announcing — a no-op dispatch stays silent. self._dispatch(handle_status_start, issue_data, project_id) - self._note_unblock(issue_id, "analysis") + if get_task_by_plane_id(issue_id) is not None: + self._note_unblock(issue_id, "analysis", new_state) elif new_state == approved and task is not None: # Approved but the stage never advanced -> replay the verdict. + stage_before = task["stage"] self._dispatch(handle_verdict, issue_data, project_id, approved=True) - self._note_unblock(task.get("work_item_id") or issue_id, task["stage"]) + if self._stage_changed(issue_id, stage_before): + self._note_unblock( + task.get("work_item_id") or issue_id, stage_before, new_state + ) elif new_state == rejected and task is not None: # Rejected but never rolled back -> replay the verdict. + stage_before = task["stage"] self._dispatch(handle_verdict, issue_data, project_id, approved=False) - self._note_unblock(task.get("work_item_id") or issue_id, task["stage"]) + if self._stage_changed(issue_id, stage_before): + self._note_unblock( + task.get("work_item_id") or issue_id, stage_before, new_state + ) # else: everything is in sync -> silence (AC-10). + @staticmethod + def _stage_changed(issue_id: str, stage_before: str) -> bool: + """ORCH-068 D2: did the dispatched handler actually move the stage? + + Re-reads the task after ``_dispatch`` and compares to the captured + ``stage_before``. A no-op replay (the task was already in the target + state) leaves the stage unchanged -> no unblock notification. + """ + after = get_task_by_plane_id(issue_id) + stage_after = after["stage"] if after else stage_before + return stage_after != stage_before + @staticmethod def _dispatch(coro_fn, *args, **kwargs) -> None: """Run an async plane handler from this sync thread. @@ -314,12 +381,27 @@ class Reconciler: asyncio.run(coro_fn(*args, **kwargs)) # -- observability (F-4) ---------------------------------------------- - def _note_unblock(self, work_item_id: str, stage: str) -> None: + def _note_unblock( + self, work_item_id: str, stage: str, state_uuid: str | None = None + ) -> None: """Record + announce that a stuck task was unblocked (AC-12). Fires only on an actual state change (an advance / replayed transition), never per idle tick, so it does not conflict with AC-9 / AC-10. + + ORCH-068 (TR-3): an in-memory dedup guard keyed by ``issue_id -> + state_uuid`` suppresses a repeat notification for the same issue+state + if a future no-op path ever reaches here. ``state_uuid`` is the issue's + Plane state; ``work_item_id`` doubles as the issue id for the + pipeline-start case (which has no work item yet). """ + dedup_key = work_item_id + if state_uuid is not None and self._unblock_dedup.get(dedup_key) == state_uuid: + self.deduped_total += 1 + return + if state_uuid is not None: + self._unblock_dedup[dedup_key] = state_uuid + self.unblocked_total += 1 self.last_unblocked = work_item_id logger.info( @@ -380,6 +462,9 @@ class Reconciler: "last_run_ts": self.last_run_ts, "unblocked_total": self.unblocked_total, "last_unblocked": self.last_unblocked, + # ORCH-068 observability. + "skipped_terminal_total": self.skipped_terminal_total, + "deduped_total": self.deduped_total, } diff --git a/tests/test_plane_states_cache.py b/tests/test_plane_states_cache.py new file mode 100644 index 0000000..0b952f3 --- /dev/null +++ b/tests/test_plane_states_cache.py @@ -0,0 +1,180 @@ +"""ORCH-068 (TR-4): tests for the Plane states cache TTL self-heal. + +The per-project ``_STATES_CACHE`` used to live for the whole process lifetime, +so a status added to Plane after start was never seen without a restart +("stale set -> no pipeline action"). ORCH-068 adds a TTL: an entry is +re-fetched once it is older than ``plane_states_ttl_s`` (default 300s); ``0`` +disables the TTL (strictly the previous lifetime cache). + +All tests are offline: the Plane API (httpx) and the monotonic clock are mocked. +""" + +import os +import tempfile +from unittest.mock import MagicMock, patch + +import pytest + +os.environ.setdefault("ORCH_PLANE_API_URL", "http://plane.local") +os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token") +os.environ.setdefault("ORCH_PLANE_WORKSPACE_SLUG", "test-ws") +os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") + +_test_db = os.path.join(tempfile.gettempdir(), "test_plane_states_cache.db") +os.environ["ORCH_DB_PATH"] = _test_db + +import src.plane_sync as ps # noqa: E402 + +_PROJECT = "proj-ttl" +_ET_PROJECT = "7a79f0a9-5278-49cd-9007-9a338f238f9c" + + +def _resp(data: dict, status: int = 200): + m = MagicMock() + m.status_code = status + m.json.return_value = data + if status >= 400: + from httpx import HTTPStatusError + m.raise_for_status.side_effect = HTTPStatusError( + "error", request=MagicMock(), response=MagicMock() + ) + else: + m.raise_for_status.return_value = None + return m + + +def _states_response(in_progress_uuid: str) -> dict: + """A minimal /states/ payload; In Progress carries the given UUID.""" + return { + "results": [ + {"id": in_progress_uuid, "name": "In Progress", "group": "started"}, + {"id": "uuid-done", "name": "Done", "group": "completed"}, + ] + } + + +@pytest.fixture(autouse=True) +def reset_cache(): + ps.reload_project_states() + yield + ps.reload_project_states() + + +# --------------------------------------------------------------------------- +# TC-11 (AC-12): a stale cache entry self-heals after the TTL — no restart. +# --------------------------------------------------------------------------- +def test_tc11_stale_cache_refreshes_after_ttl(monkeypatch): + monkeypatch.setattr(ps.settings, "plane_states_ttl_s", 300) + clock = {"t": 1000.0} + monkeypatch.setattr(ps.time, "monotonic", lambda: clock["t"]) + + responses = iter([ + _resp(_states_response("uuid-A")), # first fetch: old set + _resp(_states_response("uuid-B")), # second fetch: new status appeared + ]) + mock_get = MagicMock(side_effect=lambda *a, **k: next(responses)) + monkeypatch.setattr(ps.httpx, "get", mock_get) + + # t=1000: first call -> fetch set A. + s1 = ps.get_project_states(_PROJECT) + assert s1["in_progress"] == "uuid-A" + assert mock_get.call_count == 1 + + # t=1100: within TTL -> served from cache, no new fetch. + clock["t"] = 1100.0 + s2 = ps.get_project_states(_PROJECT) + assert s2["in_progress"] == "uuid-A" + assert mock_get.call_count == 1 + + # t=1400: TTL (300s) elapsed -> re-fetch -> fresh set B (self-heal). + clock["t"] = 1400.0 + s3 = ps.get_project_states(_PROJECT) + assert s3["in_progress"] == "uuid-B" + assert mock_get.call_count == 2 + + +def test_tc11_ttl_zero_keeps_lifetime_cache(monkeypatch): + """plane_states_ttl_s=0 -> strictly the previous lifetime cache (back-compat).""" + monkeypatch.setattr(ps.settings, "plane_states_ttl_s", 0) + clock = {"t": 1000.0} + monkeypatch.setattr(ps.time, "monotonic", lambda: clock["t"]) + + responses = iter([ + _resp(_states_response("uuid-A")), + _resp(_states_response("uuid-B")), + ]) + mock_get = MagicMock(side_effect=lambda *a, **k: next(responses)) + monkeypatch.setattr(ps.httpx, "get", mock_get) + + assert ps.get_project_states(_PROJECT)["in_progress"] == "uuid-A" + clock["t"] = 1_000_000.0 # far in the future + # TTL disabled -> still the cached A, never re-fetched. + assert ps.get_project_states(_PROJECT)["in_progress"] == "uuid-A" + assert mock_get.call_count == 1 + + +def test_tc11_groups_exposed_via_accessor(monkeypatch): + """get_project_state_groups returns {uuid -> group} from the same record.""" + monkeypatch.setattr(ps.settings, "plane_states_ttl_s", 300) + monkeypatch.setattr(ps.httpx, "get", lambda *a, **k: _resp(_states_response("uuid-A"))) + + ps.get_project_states(_PROJECT) + groups = ps.get_project_state_groups(_PROJECT) + assert groups["uuid-A"] == "started" + assert groups["uuid-done"] == "completed" + + +def test_tc11_groups_empty_when_uncached(monkeypatch): + """No cache record (e.g. API fell back to defaults) -> groups == {}.""" + assert ps.get_project_state_groups("never-fetched") == {} + + +# --------------------------------------------------------------------------- +# TC-12 (AC-13): default-config compatibility — enduro UUIDs + API-error fallback. +# --------------------------------------------------------------------------- +def test_tc12_enduro_uuids_unchanged(monkeypatch): + """enduro project still resolves its own UUIDs (return shape unchanged).""" + body = { + "results": [ + {"id": "b873d9eb-993c-48cd-97ac-99a9b1623967", + "name": "In Progress", "group": "started"}, + ] + } + monkeypatch.setattr(ps.httpx, "get", lambda *a, **k: _resp(body)) + states = ps.get_project_states(_ET_PROJECT) + assert states["in_progress"] == "b873d9eb-993c-48cd-97ac-99a9b1623967" + # Missing keys are still backfilled from _DEFAULT_STATES (complete mapping). + assert states["done"] == ps._DEFAULT_STATES["done"] + + +def test_tc12_api_error_falls_back_to_defaults(monkeypatch): + """API failure with nothing cached -> _DEFAULT_STATES (fallback preserved).""" + monkeypatch.setattr( + ps.httpx, "get", MagicMock(side_effect=Exception("network error")) + ) + states = ps.get_project_states(_PROJECT) + assert states is ps._DEFAULT_STATES + + +def test_tc12_stale_served_when_refresh_fails(monkeypatch): + """TTL expiry + transient API failure -> serve the stale (project-correct) + set rather than reverting to enduro defaults.""" + monkeypatch.setattr(ps.settings, "plane_states_ttl_s", 300) + clock = {"t": 1000.0} + monkeypatch.setattr(ps.time, "monotonic", lambda: clock["t"]) + + calls = {"n": 0} + + def flaky_get(*a, **k): + calls["n"] += 1 + if calls["n"] == 1: + return _resp(_states_response("uuid-A")) + raise Exception("transient outage") + + monkeypatch.setattr(ps.httpx, "get", flaky_get) + + assert ps.get_project_states(_PROJECT)["in_progress"] == "uuid-A" + clock["t"] = 2000.0 # past TTL -> refresh attempt fails + states = ps.get_project_states(_PROJECT) + assert states["in_progress"] == "uuid-A" # stale-but-correct, not defaults + assert states is not ps._DEFAULT_STATES diff --git a/tests/test_reconciler_plane.py b/tests/test_reconciler_plane.py index e68d498..1087ffd 100644 --- a/tests/test_reconciler_plane.py +++ b/tests/test_reconciler_plane.py @@ -295,3 +295,342 @@ def test_tc17_polls_all_projects_resolves_states_per_project(monkeypatch): # state uuids are resolved per-project (not hardcoded): each call carries them. for _pid, states in issues_calls: assert set(states) == {_IN_PROGRESS, _APPROVED, _REJECTED} + + +# =========================================================================== +# ORCH-068: livelock-fix — terminal exclusion (D1) + confirmed-change unblock +# (D2) + dedup (TR-3). The old code spammed `_note_unblock` every ~120s for a +# fully synchronized Done task (incident: ET-002, 191+ Telegram messages/night). +# =========================================================================== + +_DONE = "uuid-done" +_CANCELLED = "uuid-cancelled" + + +def _patch_states_with_terminals(monkeypatch, *, alias_done_to_approved=False): + """Patch F-2 state resolution to include terminals + their groups. + + ``alias_done_to_approved`` models the regression trigger (ORCH-066): the + project "collapses" Done onto the approved UUID, so a genuinely-Done issue + would enter the ``approved`` branch by UUID. Only the state GROUP + (``completed``) disentangles it — the heart of D1. + """ + done_uuid = _APPROVED if alias_done_to_approved else _DONE + states = { + "in_progress": _IN_PROGRESS, + "approved": _APPROVED, + "rejected": _REJECTED, + "done": done_uuid, + "cancelled": _CANCELLED, + } + groups = { + _IN_PROGRESS: "started", + _APPROVED: "started", + _REJECTED: "started", + done_uuid: "completed", # genuinely-done issue -> completed group + _CANCELLED: "cancelled", + } + monkeypatch.setattr(reconciler_mod, "get_project_states", lambda pid: states) + monkeypatch.setattr( + reconciler_mod, "get_project_state_groups", lambda pid: groups + ) + return states, groups + + +def _spy_telegram(monkeypatch): + sent = [] + monkeypatch.setattr(reconciler_mod, "send_telegram", lambda msg: sent.append(msg)) + return sent + + +def _job_count(): + conn = get_db() + n = conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0] + conn.close() + return n + + +# --------------------------------------------------------------------------- +# TC-01 (AC-1, AC-7): synchronized Done task -> total silence, 0 jobs. +# --------------------------------------------------------------------------- +def test_tc01_synced_done_is_silent(monkeypatch, single_project): + start, verdict = _patch_handlers(monkeypatch) + _patch_states_with_terminals(monkeypatch) + sent = _spy_telegram(monkeypatch) + _make_task("iss-done", stage="done", wi="ET-002") + _patch_issues(monkeypatch, [ + {"id": "iss-done", "state": {"id": _DONE}, "updated_at": _OLD_TS}, + ]) + + recon = Reconciler() + recon.reconcile_plane_once() + + start.assert_not_called() + verdict.assert_not_called() + assert sent == [] + assert recon.unblocked_total == 0 + assert recon.skipped_terminal_total == 1 + assert _job_count() == 0 + + +# --------------------------------------------------------------------------- +# TC-02 (AC-2): Done UUID aliased onto approved -> still excluded by GROUP. +# --------------------------------------------------------------------------- +def test_tc02_terminal_aliased_to_approved_excluded(monkeypatch, single_project): + start, verdict = _patch_handlers(monkeypatch) + _patch_states_with_terminals(monkeypatch, alias_done_to_approved=True) + sent = _spy_telegram(monkeypatch) + # Task is Done; its Plane state UUID equals the approved UUID (aliasing). + _make_task("iss-alias", stage="done", wi="ET-002") + _patch_issues(monkeypatch, [ + {"id": "iss-alias", "state": {"id": _APPROVED}, "updated_at": _OLD_TS}, + ]) + + recon = Reconciler() + recon.reconcile_plane_once() + + # Without the group check this would enter the approved branch and notify. + start.assert_not_called() + verdict.assert_not_called() + assert sent == [] + assert recon.unblocked_total == 0 + assert recon.skipped_terminal_total == 1 + + +# --------------------------------------------------------------------------- +# TC-03 (AC-2): Cancelled terminal is also excluded. +# --------------------------------------------------------------------------- +def test_tc03_cancelled_excluded(monkeypatch, single_project): + start, verdict = _patch_handlers(monkeypatch) + _patch_states_with_terminals(monkeypatch) + sent = _spy_telegram(monkeypatch) + _make_task("iss-cancel", stage="done", wi="ET-003") + _patch_issues(monkeypatch, [ + {"id": "iss-cancel", "state": {"id": _CANCELLED}, "updated_at": _OLD_TS}, + ]) + + recon = Reconciler() + recon.reconcile_plane_once() + + start.assert_not_called() + verdict.assert_not_called() + assert sent == [] + assert recon.unblocked_total == 0 + assert recon.skipped_terminal_total == 1 + + +# --------------------------------------------------------------------------- +# TC-04 (AC-3): no-op dispatch (stage unchanged) -> no notification. +# --------------------------------------------------------------------------- +def test_tc04_noop_dispatch_no_unblock(monkeypatch, single_project): + # handle_verdict is a no-op AsyncMock -> the task stage never moves. + start, verdict = _patch_handlers(monkeypatch) + sent = _spy_telegram(monkeypatch) + _make_task("iss-noop", stage="review") + _patch_issues(monkeypatch, [ + {"id": "iss-noop", "state": {"id": _APPROVED}, "updated_at": _OLD_TS}, + ]) + + recon = Reconciler() + recon.reconcile_plane_once() + + # The handler was replayed (idempotent), but nothing changed -> silence. + assert verdict.call_count == 1 + assert sent == [] + assert recon.unblocked_total == 0 + + +# --------------------------------------------------------------------------- +# TC-05 (AC-4): two consecutive ticks on a synced task -> 0 repeat unblocks; +# plus a direct check of the in-memory dedup guard. +# --------------------------------------------------------------------------- +def test_tc05_dedup_no_repeat_notification(monkeypatch, single_project): + start, verdict = _patch_handlers(monkeypatch) + _patch_states_with_terminals(monkeypatch) + sent = _spy_telegram(monkeypatch) + _make_task("iss-dedup", stage="done", wi="ET-004") + _patch_issues(monkeypatch, [ + {"id": "iss-dedup", "state": {"id": _DONE}, "updated_at": _OLD_TS}, + ]) + + recon = Reconciler() + recon.reconcile_plane_once() + recon.reconcile_plane_once() + + assert sent == [] + assert recon.unblocked_total == 0 + + # Direct dedup-guard exercise: the same issue+state notifies at most once. + recon._note_unblock("ET-004", "review", "state-x") + recon._note_unblock("ET-004", "review", "state-x") + assert recon.unblocked_total == 1 + assert recon.deduped_total == 1 + + +# --------------------------------------------------------------------------- +# TC-06 (AC-5): legit lost Approved webhook -> replayed, advanced, ONE unblock. +# --------------------------------------------------------------------------- +def test_tc06_legit_approved_unblock_once(monkeypatch, single_project): + _patch_states_with_terminals(monkeypatch) # non-terminal approved -> actionable + sent = _spy_telegram(monkeypatch) + _make_task("iss-appr", stage="review", wi="ET-005") + + async def fake_verdict(issue_data, project_id, approved=True): + # Simulate the real handler advancing the stage (review -> testing). + conn = get_db() + conn.execute( + "UPDATE tasks SET stage='testing' WHERE plane_id=?", + (issue_data["id"],), + ) + conn.commit() + conn.close() + + monkeypatch.setattr(reconciler_mod, "handle_verdict", fake_verdict) + monkeypatch.setattr(reconciler_mod, "handle_status_start", AsyncMock()) + _patch_issues(monkeypatch, [ + {"id": "iss-appr", "state": {"id": _APPROVED}, "updated_at": _OLD_TS}, + ]) + + recon = Reconciler() + recon.reconcile_plane_once() + + assert recon.unblocked_total == 1 + assert len(sent) == 1 + assert "ET-005" in sent[0] + + +# --------------------------------------------------------------------------- +# TC-07 (AC-6): lost In Progress start (task appears) and lost Rejected +# rollback (stage moves) each fire exactly one unblock. +# --------------------------------------------------------------------------- +def test_tc07_in_progress_start_and_rejected_each_one_unblock( + monkeypatch, single_project +): + _patch_states_with_terminals(monkeypatch) + sent = _spy_telegram(monkeypatch) + + async def fake_start(issue_data, project_id): + # Simulate the real start handler creating the task. + _make_task(issue_data["id"], stage="analysis", wi="ET-006") + + async def fake_verdict(issue_data, project_id, approved=True): + conn = get_db() + conn.execute( + "UPDATE tasks SET stage='development' WHERE plane_id=?", + (issue_data["id"],), + ) + conn.commit() + conn.close() + + monkeypatch.setattr(reconciler_mod, "handle_status_start", fake_start) + monkeypatch.setattr(reconciler_mod, "handle_verdict", fake_verdict) + + # Rejected task already exists at review; In Progress one has no task yet. + _make_task("iss-rej", stage="review", wi="ET-007") + _patch_issues(monkeypatch, [ + {"id": "iss-start", "state": {"id": _IN_PROGRESS}, "updated_at": _OLD_TS}, + {"id": "iss-rej", "state": {"id": _REJECTED}, "updated_at": _OLD_TS}, + ]) + + recon = Reconciler() + recon.reconcile_plane_once() + + assert recon.unblocked_total == 2 + assert len(sent) == 2 + + +# --------------------------------------------------------------------------- +# TC-08 (AC-8): never-raise — a failing dependency isolates to its unit of work. +# --------------------------------------------------------------------------- +def test_tc08_never_raise_isolation(monkeypatch, single_project): + _patch_states_with_terminals(monkeypatch) + monkeypatch.setattr(reconciler_mod, "send_telegram", lambda msg: None) + + # _dispatch blows up for one issue -> isolated; the tick must not crash. + def boom_dispatch(*a, **k): + raise RuntimeError("handler exploded") + + monkeypatch.setattr(Reconciler, "_dispatch", staticmethod(boom_dispatch)) + _make_task("iss-boom", stage="review", wi="ET-008") + _patch_issues(monkeypatch, [ + {"id": "iss-boom", "state": {"id": _APPROVED}, "updated_at": _OLD_TS}, + ]) + + recon = Reconciler() + recon.reconcile_plane_once() # must NOT raise + assert recon.unblocked_total == 0 + + # list_issues_by_state raising -> per-project isolation, still no crash. + def boom_list(pid, states): + raise RuntimeError("plane down") + + monkeypatch.setattr(reconciler_mod, "list_issues_by_state", boom_list) + recon.reconcile_plane_once() # must NOT raise + + +# --------------------------------------------------------------------------- +# TC-09 (AC-9): kill-switches mute F-2. +# --------------------------------------------------------------------------- +def test_tc09_kill_switches(monkeypatch, single_project): + start, verdict = _patch_handlers(monkeypatch) + _patch_states_with_terminals(monkeypatch) + called = {"list": 0} + + def counting_list(pid, states): + called["list"] += 1 + return [{"id": "iss-x", "state": {"id": _APPROVED}, "updated_at": _OLD_TS}] + + monkeypatch.setattr(reconciler_mod, "list_issues_by_state", counting_list) + + monkeypatch.setattr(reconciler_mod.settings, "reconcile_enabled", False) + Reconciler().reconcile_plane_once() + assert called["list"] == 0 # global switch off -> F-2 never runs + + monkeypatch.setattr(reconciler_mod.settings, "reconcile_enabled", True) + monkeypatch.setattr(reconciler_mod.settings, "reconcile_plane_enabled", False) + Reconciler().reconcile_plane_once() + assert called["list"] == 0 # F-2 switch off -> still no poll + + +# --------------------------------------------------------------------------- +# TC-10 (AC-1, AC-2): end-to-end on BOTH registry projects (enduro AND +# orchestrator): a Done task on each -> 0 notifications / 0 jobs, regardless +# of per-project status aliasing. The headline regression test. +# --------------------------------------------------------------------------- +def test_tc10_done_silent_on_all_projects(monkeypatch): + from src import projects as projects_mod + projects_mod.reload_projects() + assert len({p.plane_project_id for p in projects_mod.PROJECTS}) >= 2 + + start, verdict = _patch_handlers(monkeypatch) + sent = _spy_telegram(monkeypatch) + + states = { + "in_progress": _IN_PROGRESS, + "approved": _APPROVED, + "rejected": _REJECTED, + "done": _DONE, + "cancelled": _CANCELLED, + } + groups = {_DONE: "completed", _CANCELLED: "cancelled"} + monkeypatch.setattr(reconciler_mod, "get_project_states", lambda pid: states) + monkeypatch.setattr( + reconciler_mod, "get_project_state_groups", lambda pid: groups + ) + # Each project returns a Done issue (unique id per project). + monkeypatch.setattr( + reconciler_mod, "list_issues_by_state", + lambda pid, st: [ + {"id": f"done-{pid}", "state": {"id": _DONE}, "updated_at": _OLD_TS} + ], + ) + + recon = Reconciler() + recon.reconcile_plane_once() + + start.assert_not_called() + verdict.assert_not_called() + assert sent == [] + assert recon.unblocked_total == 0 + assert recon.skipped_terminal_total >= 2 # one per project + assert _job_count() == 0