From 0b25fc152775c87c47155ac4156e31bc940ec4a0 Mon Sep 17 00:00:00 2001 From: claude-bot Date: Tue, 9 Jun 2026 22:38:16 +0300 Subject: [PATCH] fix(merge_gate): retry transient Gitea merge errors + already-in-main guard merge_pr now wraps ONLY the mutating POST /pulls/{n}/merge in a bounded exponential-backoff retry-loop on TRANSIENT outcomes (405 "try again later", 408, any 5xx, network/timeout, and 409|422 while the PR is still mergeable); TERMINAL outcomes (403/404/real conflict via mergeable==False) -> fast honest False, so the ORCH-071/081 not-merged HOLD backstop is unchanged. Fixes the ORCH-063 false HOLD + manual re-merge on Gitea's post-push mergeability hiccup. ensure_open_pr gains an "already fully in main" guard (_branch_fully_in_main, git merge-base --is-ancestor HEAD origin/main) BEFORE creating a PR -> new "already-in-main" outcome avoids the garbage empty PR on a re-driven finalizer; _handle_merge_verify skips merge_pr on that outcome and lets the authoritative SHA-in-main check confirm -> done (not a HOLD). git error of the guard fails OPEN to the create path. New ORCH_MERGE_RETRY_* settings (kill-switch merge_retry_enabled -> one-shot, max_attempts=3, backoff base=2/max=5). INV-4 (merge only via Gitea PR-merge API, never push/force-push main), never-raise, STAGE_TRANSITIONS/QG_CHECKS/DB schema unchanged. Docs (README merge-verify section, CLAUDE.md, CHANGELOG, .env.example) updated in the same PR. Tests: test_merge_gate.py TC-01..12, test_config.py TC-13, test_merge_verify.py TC-14..16; full suite green (1389). Refs: ORCH-093 Co-Authored-By: Claude Opus 4.8 --- .env.example | 16 +++ .task-dev.md | 4 +- CHANGELOG.md | 6 + CLAUDE.md | 2 +- docs/architecture/README.md | 38 ++++++ src/config.py | 25 ++++ src/merge_gate.py | 230 +++++++++++++++++++++++++++++--- src/stage_engine.py | 18 ++- tests/test_config.py | 35 +++++ tests/test_merge_gate.py | 204 ++++++++++++++++++++++++++++ tests/test_merge_verify.py | 89 ++++++++++++ tests/test_orch082_ensure_pr.py | 5 + 12 files changed, 651 insertions(+), 21 deletions(-) diff --git a/.env.example b/.env.example index 9119703..dfba7c6 100644 --- a/.env.example +++ b/.env.example @@ -166,6 +166,22 @@ ORCH_MERGE_PR_TIMEOUT_S=60 ORCH_MERGE_VERIFY_TIMEOUT_S=60 ORCH_REGRESSION_GUARD_ENABLED=true ORCH_MERGE_VERIFY_AUTOCREATE_PR_ENABLED=true +# ORCH-093: deterministic merge-actor retry of TRANSIENT Gitea merge errors. merge_pr +# wraps ONLY the mutating POST /pulls/{n}/merge in a bounded exponential-backoff +# retry-loop on transient outcomes (405 "try again later" / 408 / 5xx / network / +# timeout, and 409|422 while the PR is still mergeable); terminal outcomes +# (403/404/real conflict) -> fast honest False (the ORCH-071/081 HOLD backstop is +# unchanged). Fixes the ORCH-063 false HOLD + manual re-merge. The already-in-main +# guard (no commits beyond origin/main -> no garbage PR) is always-on under +# MERGE_VERIFY_AUTOCREATE_PR_ENABLED (no separate flag). +# MERGE_RETRY_ENABLED -> kill-switch; false -> exactly one POST (one-shot, prior behaviour). +# MERGE_RETRY_MAX_ATTEMPTS -> max POST attempts on a transient outcome. +# MERGE_RETRY_BACKOFF_BASE_S -> exponential backoff base seconds (sleep = base*2^(i-1)). +# MERGE_RETRY_BACKOFF_MAX_S -> per-sleep backoff ceiling seconds (bounds total wait). +ORCH_MERGE_RETRY_ENABLED=true +ORCH_MERGE_RETRY_MAX_ATTEMPTS=3 +ORCH_MERGE_RETRY_BACKOFF_BASE_S=2 +ORCH_MERGE_RETRY_BACKOFF_MAX_S=5 # ORCH-036: executable self-deploy of the `deploy` stage. For the self-hosting repo # (orchestrator) the stage REALLY restarts prod (8500) via a detached host hook; # deploy_status: SUCCESS means proven health-ok, not an LLM declaration. Three diff --git a/.task-dev.md b/.task-dev.md index 4922723..32c813d 100644 --- a/.task-dev.md +++ b/.task-dev.md @@ -1,4 +1,4 @@ -Work item: ORCH-091 +Work item: ORCH-093 Repo: orchestrator -Branch: feature/ORCH-091-bug-to-analyse-stage-deploy-st +Branch: feature/ORCH-093-bug-merge-gitea-405-5xx-hold-p Stage: development \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dee4f5..e0721df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ Формат: [Keep a Changelog](https://keepachangelog.com/). Записи — на смысловой PR/задачу. ## [Unreleased] +- **Merge-актор ретраит транзиентные ошибки Gitea (405/5xx) + гард «ветка уже в `main`»** (ORCH-093, `fix`): две точечные доработки детерминированного merge-актора `src/merge_gate.py`, чинящие инцидент **ORCH-063**: self-deploy прошёл, staging OK, PR был `open`+`mergeable`, но `POST /pulls/{n}/merge` вернул `HTTP 405 "Please try again later"` (Gitea пересчитывал `mergeable` сразу после пуша) → one-shot `merge_pr` мгновенно вернул `False` → корректная защита ORCH-071/081 удержала задачу на `deploy` + потребовала ручной домерж; повторный прогон финализатора плодил мусорный пустой PR. **Аддитивно, never-raise, под существующими kill-switch'ами:** `STAGE_TRANSITIONS` / `QG_CHECKS` / схема БД — **не тронуты**; INV-4 (мерж только через Gitea PR-merge API, никогда `push`/`force-push` в `main`) сохранён 1:1. + - **Retry-loop транзиента (FR-1/FR-2, AC-1/AC-2/AC-3, D1/D2):** `merge_pr` оборачивает **только** мутирующий `POST …/merge` в ограниченный retry-loop с экспоненциальным backoff (`min(base*2^(i-1), max)`, дефолты 2/5 с → суммарный сон `(N-1)*max ≤ 10 с`, monitor-поток не подвешивается). Классификатор `_classify_merge_response`: **транзиент** (ретрай) — `405`/`408`/любой `5xx`/`httpx`-таймаут/сетевая ошибка, **и** `409`/`422` когда PR всё ещё mergeable; **терминал** (быстрый честный `False`, защита ORCH-071/081 как прежде) — `403`/`404`/реальный конфликт (`409`/`422` при `mergeable==False`). Неоднозначный `409`/`422` разрешается доп. `GET /pulls/{index}` → `mergeable`; дефолт-политика `mergeable==None`/недоступно → **транзиент** (fail-OPEN-в-ретрай: икота Gitea — наблюдаемый кейс, бюджет конечен, backstop сохранён). Каждая попытка логируется `attempt i/N` (образец `check_ci_green`). + - **Гард already-in-main (FR-3/FR-4, AC-4, D3/D4):** новый leaf `_branch_fully_in_main` (`git merge-base --is-ancestor HEAD origin/main` в per-branch worktree) вызывается в `ensure_open_pr` **между** «открытый code-PR не найден» и `POST …/pulls`: ветка целиком в `main` (нет коммитов `origin/main..HEAD`) → новый исход `"already-in-main"` **без создания PR** (нет мусорного пустого PR на уже влитой ветке). git-ошибка/ambiguous (`None`) → **fail-OPEN** (деградация на create-путь, НЕ ложный no-op). В `stage_engine._handle_merge_verify` исход `already-in-main` **пропускает** `merge_pr` (мержить нечего) и отдаёт авторитетному SHA-in-main (`verify_merged_to_main`) довести до `done`; это НЕ HOLD. SHA-in-main остаётся единственным доказательством мержа (ADR-0014). + - **Конфиг/откат (FR-5, AC-5/AC-7, D5):** новые поля `src/config.py` `merge_retry_enabled` (kill-switch; `False` → ровно один POST = байт-в-байт прежнее one-shot, нулевая регрессия) / `merge_retry_max_attempts` (3) / `merge_retry_backoff_base_s` (2) / `merge_retry_backoff_max_s` (5), env `ORCH_MERGE_RETRY_*`, дескрипторы в `.env.example`. Гард already-in-main — без отдельного флага (накрыт `merge_verify_autocreate_pr_enabled`). Откат: `ORCH_MERGE_RETRY_ENABLED=false` (мгновенный runtime) или revert PR. + - **Трассировка:** перед правкой `merge_pr`/`ensure_open_pr`/`_handle_merge_verify` прочитаны ADR ORCH-071/073/082 — инварианты (SHA-in-main authoritative, never-raise, idempotency-guard `pr_already_merged`, base==main фильтр code-PR) сохранены; в `MAIN_REGRESSION_MARKERS` добавлена строка `("ORCH-093", "_classify_merge_response", "src/merge_gate.py")` (append-only). + - Тесты: `tests/test_merge_gate.py` (TC-01..TC-12: 405×2→200, 5xx→200, network→200, реальный конфликт/403 терминал, ambiguous-mergeable, исчерпание ретраев, kill-switch one-shot, already-in-main без POST, create при коммитах сверх main, fail-OPEN на git-ошибке гарда, never-raise; `httpx` мокается, `time.sleep` → no-op), `tests/test_config.py` (TC-13: дефолты + env-override `ORCH_MERGE_RETRY_*`), `tests/test_merge_verify.py` (TC-14..TC-16: already-in-main пропускает `merge_pr`→done; исчерпание+SHA-not-in-main→HOLD; транзиент-успех→done). Обновлён `tests/test_orch082_ensure_pr.py` (гард запинён на create-путь — у гарда своё покрытие). Полный регресс `tests/ -q` зелёный (1389). ADR: `docs/work-items/ORCH-093/06-adr/ADR-001-merge-transient-retry-and-already-in-main-guard.md`, сквозной `docs/architecture/adr/adr-0027-merge-actor-transient-retry-and-already-in-main.md`. - **Live-карточка трекера: полнота карты статусов, отражение откатов, суммирование метрик стадии по попыткам** (ORCH-091, `fix`): три верифицированных дефекта рендера Telegram-карточки (`src/notifications.py`, ORCH-067/087). **Аддитивно, never-raise, без нового поведения конвейера:** `STAGE_TRANSITIONS` / `QG_CHECKS` / `check_*` / транспорт нотификаций / схема БД — **не тронуты** (затронут ровно один модуль индикативного слоя); kill-switch не требуется (рендер деградирует безопасно, откат = `git revert`). - **Деф.1 — застрявший заголовок «To Analyse» (FR-1/2/3, AC-1/2/3):** `_STAGE_STATUS_LABEL` покрывал 8 из 10 ключей `STAGE_TRANSITIONS` — `deploy-staging` и `cancelled` (ORCH-090) выпадали в дефолт-«To Analyse» (ложный «первый статус» на стадии staging-деплоя). Карта расширена: `deploy-staging → "Deploying (staging)"` (plain-стиль активной стадии, суффикс «(staging)» снимает коллизию с prod-overlay `_LIVE_BRANCH_LABELS['deploying']` и с pause-лейблом `deploy`), `cancelled → "Cancelled"` (offline-база ORCH-090, совпадает с overlay-лейблом → нет конфликта precedence). Runtime-фолбэк `plane_status_label` для **немаппленной** (будущей/неизвестной) стадии заменён с «To Analyse» на **нейтральный** капитализированный лейбл (`_neutral_stage_label`, `"deploy-staging" → "Deploy Staging"`); `created` остаётся явным ключом → честная «To Analyse»; битый/None-вход → безопасный дефолт. Полнота карты гарантируется **программно** тестом, итерирующим `STAGE_TRANSITIONS.keys()` (единый источник истины) — новая стадия без курируемого лейбла даёт красный тест; автогенерация лейблов в самом модуле запрещена (карта остаётся курируемой/человекочитаемой). - **Деф.2 — ложная картина при откате (FR-4, AC-4):** цикл рендера выводил `✅`-строку для каждой стадии с завершённым прогоном её агента **без учёта позиции** относительно текущей — после отката (`deploy-staging → development` ORCH-043, `review → development` REQUEST_CHANGES) карточка показывала абсурд «✅ Внедрение … + 🔄 Разработка». Введён лёгкий read-only хелпер `_pipeline_pos` от **порядка `STAGE_TRANSITIONS`** (не от `_TRACKER_STAGES`, который не содержит `deploy-staging`/`cancelled` и не авторитетен по порядку); гейт подавления: `✅`-строка рисуется только если `current_pos >= _pipeline_pos(stage_key)`. Нормализация `deploy-staging → deploy` применяется **только** к вычислению текущей позиции (схлопнутая строка «Внедрение» несёт `stage_key="deploy"`); `is_active_stage` — **без изменений** (нулевой регресс активного рендера). Подавлённые откатом прогоны по-прежнему входят в тоталы задачи (намеренная семантика отката). diff --git a/CLAUDE.md b/CLAUDE.md index 6368e7d..dfb0be4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -7,7 +7,7 @@ - Backend: FastAPI + uvicorn (Python 3.12) - БД: SQLite (`src/db.py`) - Агенты: Claude CLI (`ORCH_CLAUDE_BIN`), по одному промпту на роль в `.openclaw/agents/`. **ORCH-74:** модель/эффорт агента берутся ТОЛЬКО из config (`resolve_agent_model`/`resolve_agent_effort`, ORCH-41) — frontmatter `model:` удалён как мёртвый, frontmatter описательный; имя модели валидируется форматом `^claude-…$` перед `--model` (never-break). **ORCH-077 (52d, замыкает эпик 52):** тело всех 6 промптов переписано в едином **каноне Anthropic** (5 обязательных XML-секций в нормативном порядке ``→``→``→``→``, запреты в формате «❌ X → ✅ Y», `` у решающих ролей), и каждый промпт **добровольно** эмитит 6-польную frontmatter-схему 52c (`work_item`/`stage`/`author_agent`/`status`/`created_at`/`model_used`) **аддитивно** — рядом с machine-verdict ключом, НЕ меняя его имя/регистр/значения (`verdict:`/`result:`/`staging_status:`/`deploy_status:`/`security_status:` — байт-в-байт). Это **docs/prompts-only** изменение: `src/**`/`STAGE_TRANSITIONS`/`QG_CHECKS`/схема БД не тронуты; `frontmatter_validation_strict` остаётся `False` (enforcement НЕ включён). Промпт `cat`-ается из worktree в момент запуска → новые промпты вступают в силу на следующем worktree от `main` без прод-рестарта. Анти-регресс — структурные тесты `tests/test_agent_prompts_canon.py` + зелёный `test_agent_frontmatter_no_model.py`. **Норматив на будущее:** новые/изменённые агент-промпты следуют этому канону. Детали — `docs/architecture/adr/adr-0021-prompt-canon-anthropic.md`. **ORCH-092 (эпилог эпика 52, docs/prompts-only):** аудит 6 промптов поверх канона — копируемые frontmatter-примеры расхардкожены (`created_at: `/`model_used: ` + врезка «подставь `date +%F`/модель из конфига, не копируй буквально»; литерал `claude-opus-4-8` — только справка в таблице полей); добавлена секция `` developer/reviewer/tester (после ``, порядок 5 секций цел); developer лишён ручного `git rebase origin/main` (свежесть базы — инвариант движка serial-gate ORCH-088 + `auto_rebase_onto_main` под merge-lease; ручной rebase конфликтовал с запретом force-push — ADR-001 D1); tester обогащён worktree-путём + smoke `serial_gate` + покрытием каждого TC; из reviewer удалена мёртвая строка «тот же экземпляр Developer». **Языковое исключение (нормативно, ADR-001 D2):** `deployer.md` сознательно остаётся на **английском** (5 ru + 1 en) как самый safety-critical промпт — НЕ «чинить» язык вслепую; критичные self-hosting-запреты подняты в видную рамку. Verdict-ключи и канон 52d — байт-в-байт; анти-регресс — `tests/test_agent_prompts_canon.py` (ORCH-092 TC-01…TC-08). Детали — `docs/work-items/ORCH-092/06-adr/ADR-001-developer-rebase-and-deployer-language.md`. -- Очередь задач: собственная (SQLite `jobs`, `src/queue_worker.py`, ORCH-1). **ORCH-026:** `claim_next_job` гейтит задачи с незавершёнными зависимостями (`job_deps`, `NOT EXISTS`) без занятия слота `max_concurrency`; декларации/детект циклов — leaf `src/task_deps.py` (kill-switch `ORCH_TASK_DEPS_ENABLED`). Сериализация мержа одного репо — безусловный pre-merge rebase под merge-lease (`ORCH_PREMERGE_REBASE_ALWAYS`). **ORCH-088 (serial gate, Этап 1):** новая задача репо не входит в `analysis` (analyst-job не выбирается, ветка не режется), пока в репо есть **более ранняя** незавершённая задача (`t2.id < jobs.task_id`, FIFO) ИЛИ репо заморожен (`repo_freeze`). Срез ветки **отложен** со `start_pipeline` на момент claim analyst-job (`launcher._materialize_deferred_branch`) — база = свежий `origin/main` с кодом предшественника (анти-stale-base). Post-deploy `DEGRADED` → durable per-repo freeze (`repo_freeze`, `cleared_at IS NULL` = активен) + Telegram; снятие — вручную `POST /serial-gate/unfreeze?repo=…`. Leaf `src/serial_gate.py` (claim — fail-OPEN, freeze — fail-CLOSED); флаги `ORCH_SERIAL_GATE_ENABLED` (kill-switch), `ORCH_SERIAL_GATE_REPOS` (CSV; пусто = все репо), `ORCH_SERIAL_GATE_FREEZE_ENABLED`. Блок `serial_gate` в `GET /queue`. `STAGE_TRANSITIONS`/`QG_CHECKS` не тронуты. +- Очередь задач: собственная (SQLite `jobs`, `src/queue_worker.py`, ORCH-1). **ORCH-026:** `claim_next_job` гейтит задачи с незавершёнными зависимостями (`job_deps`, `NOT EXISTS`) без занятия слота `max_concurrency`; декларации/детект циклов — leaf `src/task_deps.py` (kill-switch `ORCH_TASK_DEPS_ENABLED`). Сериализация мержа одного репо — безусловный pre-merge rebase под merge-lease (`ORCH_PREMERGE_REBASE_ALWAYS`). **ORCH-088 (serial gate, Этап 1):** новая задача репо не входит в `analysis` (analyst-job не выбирается, ветка не режется), пока в репо есть **более ранняя** незавершённая задача (`t2.id < jobs.task_id`, FIFO) ИЛИ репо заморожен (`repo_freeze`). Срез ветки **отложен** со `start_pipeline` на момент claim analyst-job (`launcher._materialize_deferred_branch`) — база = свежий `origin/main` с кодом предшественника (анти-stale-base). Post-deploy `DEGRADED` → durable per-repo freeze (`repo_freeze`, `cleared_at IS NULL` = активен) + Telegram; снятие — вручную `POST /serial-gate/unfreeze?repo=…`. Leaf `src/serial_gate.py` (claim — fail-OPEN, freeze — fail-CLOSED); флаги `ORCH_SERIAL_GATE_ENABLED` (kill-switch), `ORCH_SERIAL_GATE_REPOS` (CSV; пусто = все репо), `ORCH_SERIAL_GATE_FREEZE_ENABLED`. Блок `serial_gate` в `GET /queue`. `STAGE_TRANSITIONS`/`QG_CHECKS` не тронуты. **ORCH-093 (merge-актор устойчив к икоте Gitea):** детерминированный merge-актор под-гейта `deploy → done` (`src/merge_gate.py`) ретраит **транзиентные** ошибки Gitea вместо ложного HOLD (инцидент ORCH-063: `POST …/merge` → `405 "try again later"` сразу после пуша). `merge_pr` оборачивает **только** мутирующий `POST …/merge` в ограниченный retry-loop с экспоненциальным backoff (`min(base*2^(i-1), max)`, потолок суммарного сна `(N-1)*max ≤ 10 с`); классификатор `_classify_merge_response`: транзиент (ретрай) — `405`/`408`/`5xx`/таймаут/сетевая + `409`/`422` при `mergeable==True` (доп. `GET /pulls/{index}`; `mergeable==None` → дефолт-транзиент, fail-OPEN-в-ретрай), терминал (быстрый честный `False`, защита ORCH-071/073 как прежде) — `403`/`404`/реальный конфликт (`mergeable==False`). Kill-switch `merge_retry_enabled=false` → ровно один POST (байт-в-байт прежнее one-shot); флаги `ORCH_MERGE_RETRY_*` (`max_attempts=3`, `backoff_base_s=2`, `backoff_max_s=5`). Гард **already-in-main** в `ensure_open_pr` (leaf `_branch_fully_in_main`, `git merge-base --is-ancestor HEAD origin/main`): ветка целиком в `main` → исход `"already-in-main"` без создания мусорного пустого PR; `_handle_merge_verify` пропускает `merge_pr` и отдаёт авторитетному SHA-в-main довести до `done` (НЕ HOLD); git-ошибка → fail-OPEN на create-путь. Без отдельного флага (накрыт `merge_verify_autocreate_pr_enabled`). INV-4 (мерж только через Gitea PR-merge API, никогда push/force-push в `main`), never-raise, `STAGE_TRANSITIONS`/`QG_CHECKS`/схема БД — сохранены. Детали — `docs/work-items/ORCH-093/06-adr/ADR-001-merge-transient-retry-and-already-in-main-guard.md`, сквозной `docs/architecture/adr/adr-0027-merge-actor-transient-retry-and-already-in-main.md`. - Контейнеризация: Docker + Compose - CI/CD: Gitea Actions (`.gitea/workflows/`) - Деплой: docker compose на mva154 diff --git a/docs/architecture/README.md b/docs/architecture/README.md index 805f683..9d953f8 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/README.md @@ -477,6 +477,44 @@ developer-пути и **только** при свежем worktree-коммит Подробнее: [adr-0016](adr/adr-0016-ensure-open-pr-before-merge-verify.md) (amends 0013/0014); детально — `docs/work-items/ORCH-082/06-adr/ADR-001-ensure-open-pr-before-merge-verify.md`. +#### Ретрай транзиентных merge-ошибок Gitea + гард already-in-main (ORCH-093 — фикс ложного HOLD на 405/5xx) +Инцидент **ORCH-063**: self-deploy прошёл, staging OK, PR был `open`+`mergeable`, конфликтов не было, +но `POST /pulls/{n}/merge` вернул `HTTP 405 {"message":"Please try again later"}` (Gitea пересчитывал +`mergeable` сразу после пуша). One-shot `merge_pr` мгновенно вернул `False` → корректная защита +ORCH-071/073 удержала задачу на `deploy` (HOLD+alert) + потребовала **ручной домерж** (повтор влился с +первого раза); повторный прогон финализатора плодил **мусорный пустой PR** на уже влитой ветке. У +Claude-агентов есть transient-breaker, у CI-гейта — `check_ci_green`, а у детерминированного +merge-актора аналога не было. ORCH-093 закрывает это аддитивно, внутри того же под-гейта, не трогая +машину стадий: +- **Retry-loop в `merge_pr` (ORCH-093 D1/D2):** ретраится **только** мутирующий `POST …/merge` + (идемпотентные шаги до него — без изменений). Классификатор `_classify_merge_response → + transient|terminal`: **транзиент** (ретрай с backoff) — `405`/`408`/любой `5xx`/`httpx`-таймаут/ + сетевая ошибка, **и** `409`/`422` когда PR всё ещё `mergeable` (доп. `GET /pulls/{index}`); + **терминал** (быстрый честный `False`, защита ORCH-071/073 как прежде) — `403`/`404`/реальный + конфликт (`409`/`422` при `mergeable==False`). Дефолт-политика `mergeable==None`/недоступно → + транзиент (fail-OPEN-в-ретрай: икота Gitea наблюдаема, бюджет конечен, backstop сохранён). + Backoff экспоненциальный с потолком `min(base*2^(i-1), max)` (дефолты 2/5 с → суммарный сон + `(N-1)*max ≤ 10 с`, monitor-поток merge-verify не подвешивается). Лог `attempt i/N` (образец + `check_ci_green`). +- **Гард already-in-main в `ensure_open_pr` (ORCH-093 D3):** leaf `_branch_fully_in_main` + (`git merge-base --is-ancestor HEAD origin/main` в per-branch worktree) вызывается **между** «код-PR + не найден» и `POST …/pulls`: ветка целиком в `main` (нет коммитов `origin/main..HEAD`) → новый исход + `("already-in-main", …)` **без создания PR** (нет мусорного пустого PR). git-ошибка/ambiguous + (`None`) → **fail-OPEN** (деградация на create-путь, НЕ ложный no-op). Без отдельного флага — + накрыт `merge_verify_autocreate_pr_enabled`. +- **Врезка в `_handle_merge_verify` (ORCH-093 D4):** `pr_status == "already-in-main"` → лог, + **пропуск** `merge_pr` (мержить нечего), сразу к `verify_merged_to_main` (SHA-в-main подтвердит → + `done`). Это НЕ HOLD; SHA-в-main остаётся авторитетным (если SHA не в `main` — прежний HOLD, + fail-closed). +- **Конфиг/откат:** `merge_retry_enabled` (kill-switch; `False` → ровно один POST = байт-в-байт + прежнее one-shot) / `merge_retry_max_attempts` (3) / `merge_retry_backoff_base_s` (2) / + `merge_retry_backoff_max_s` (5), env `ORCH_MERGE_RETRY_*`. `STAGE_TRANSITIONS`, `QG_CHECKS`, схема + БД, exit-коды хука, merge-gate, image-freshness — без изменений; `main` не push/force-push. + +Подробнее: [adr-0027](adr/adr-0027-merge-actor-transient-retry-and-already-in-main.md) +(amends 0013/0014/0016); детально — +`docs/work-items/ORCH-093/06-adr/ADR-001-merge-transient-retry-and-already-in-main-guard.md`. + #### Ретрай транзиентных merge-ошибок Gitea + гард already-in-main (ORCH-093 — фикс ложного HOLD на 405/5xx) Инцидент ORCH-063 (09.06): self-deploy прошёл, PR `open`+`mergeable=True`, конфликтов нет — но `POST …/merge` вернул `HTTP 405 {"message":"Please try again later"}` (Gitea пересчитывал diff --git a/src/config.py b/src/config.py index bca46f8..f1313ea 100644 --- a/src/config.py +++ b/src/config.py @@ -549,6 +549,31 @@ class Settings(BaseSettings): merge_pr_timeout_s: int = 60 merge_verify_timeout_s: int = 60 + # ORCH-093: deterministic merge-actor retry of TRANSIENT Gitea merge errors. + # The incident ORCH-063 had a green self-deploy + an open, mergeable PR, yet + # POST /pulls/{n}/merge returned HTTP 405 ("Please try again later") because + # Gitea was still recomputing `mergeable` right after the push — the one-shot + # merge_pr returned False, the ORCH-071/081 backstop HELD the task on `deploy`, + # and a human had to re-merge by hand. merge_pr now wraps ONLY the mutating + # POST in a bounded exponential-backoff retry-loop on TRANSIENT outcomes + # (405/408/5xx/network-timeout, and 409|422 while the PR is still mergeable); + # TERMINAL outcomes (403/404/real conflict) -> fast honest False (the HOLD + # protection is unchanged). Mirrors the ci_poll_* idiom of check_ci_green. + # merge_retry_enabled -> kill-switch; False -> exactly one POST + # (byte-for-byte the prior one-shot behaviour, + # env ORCH_MERGE_RETRY_ENABLED). + # merge_retry_max_attempts -> max POST attempts on a transient outcome + # (env ORCH_MERGE_RETRY_MAX_ATTEMPTS). + # merge_retry_backoff_base_s -> exponential backoff base seconds + # (env ORCH_MERGE_RETRY_BACKOFF_BASE_S). + # merge_retry_backoff_max_s -> per-sleep backoff ceiling seconds; total sleep + # is bounded by (N-1) * max so the monitor-thread + # is never wedged (env ORCH_MERGE_RETRY_BACKOFF_MAX_S). + merge_retry_enabled: bool = True + merge_retry_max_attempts: int = 3 + merge_retry_backoff_base_s: int = 2 + merge_retry_backoff_max_s: int = 5 + # ORCH-026: intra-repo merge serialisation (Level A) + declarative task # dependencies (Level B). Level A reuses the ORCH-043/065 merge-lease window # (no new mechanism) — the merge-lease already serialises "merge -> main-updated" diff --git a/src/merge_gate.py b/src/merge_gate.py index 1341016..7629561 100644 --- a/src/merge_gate.py +++ b/src/merge_gate.py @@ -602,6 +602,51 @@ def merge_verify_applies(repo: str) -> bool: return False +def _branch_fully_in_main(repo: str, branch: str) -> bool | None: + """Return True iff ``branch`` has NO commits beyond ``origin/main`` (ORCH-093 D3). + + Used by ``ensure_open_pr`` to avoid creating an empty PR on a branch that is + already fully merged into ``main`` (the ORCH-063 garbage-PR symptom on a + re-driven finalizer after a manual merge). In the per-branch worktree: + ``git fetch origin main`` then ``git merge-base --is-ancestor HEAD origin/main`` + (equivalent to ``git rev-list --count origin/main..HEAD == 0``; same idiom as + ``branch_is_behind_main`` / ``verify_merged_to_main``). + + * ``rc == 0`` -> HEAD is an ancestor of origin/main -> fully in main -> ``True``. + * ``rc == 1`` -> there are commits beyond main -> ``False``. + * git/OS error / ambiguous rc -> ``None`` (caller fail-OPENs: degrade to the + create path; an infra hiccup must NOT become a false no-op merge). + + Never-raise: any error -> ``None``. + """ + try: + wt = ensure_worktree(repo, branch) + except Exception as e: # noqa: BLE001 - never-raise contract -> fail-OPEN + logger.warning("_branch_fully_in_main: worktree error for %s/%s: %s", repo, branch, e) + return None + try: + subprocess.run( + ["git", "-C", wt, "fetch", "origin", "main"], + capture_output=True, timeout=_FETCH_TIMEOUT, + ) + r = subprocess.run( + ["git", "-C", wt, "merge-base", "--is-ancestor", "HEAD", "origin/main"], + capture_output=True, timeout=_SHORT_TIMEOUT, + ) + except (subprocess.SubprocessError, OSError) as e: + logger.warning("_branch_fully_in_main: git error for %s/%s: %s", repo, branch, e) + return None + if r.returncode == 0: + return True + if r.returncode == 1: + return False + logger.warning( + "_branch_fully_in_main: ambiguous merge-base rc=%s for %s/%s (fail-open)", + r.returncode, repo, branch, + ) + return None + + def ensure_open_pr(repo: str, branch: str) -> tuple[str, str]: """Guarantee an open **code-PR** (``head==branch`` AND ``base=="main"``) exists. @@ -625,6 +670,12 @@ def ensure_open_pr(repo: str, branch: str) -> tuple[str, str]: ``("existed", …)``; no duplicate is created (AC-2 / FR-5). 4. Any other HTTP/parse/network error -> ``("failed", "")``. + ORCH-093 (D3) adds a guard BETWEEN steps 1 and 2: if the branch is already fully + in ``main`` (no commits beyond ``origin/main``) there is nothing to PR -> the new + outcome ``("already-in-main", "")`` is returned WITHOUT a ``POST`` (avoids + an empty garbage PR on a re-driven finalizer). A git error of the guard fails OPEN + (degrade to the create path) so an infra hiccup never becomes a false no-op. + Reuses ``settings.merge_pr_timeout_s`` (same class of Gitea calls as ``merge_pr``). Never-raise (AC-7): any unexpected error -> ``("failed", str(e))``; the exception is NEVER propagated into ``_handle_merge_verify`` / ``advance_stage``. @@ -657,6 +708,21 @@ def ensure_open_pr(repo: str, branch: str) -> tuple[str, str]: logger.info("ensure_open_pr: %s/%s already has open code-PR #%s", repo, branch, existing) return "existed", str(existing) + # Step 1b (ORCH-093 D3): guard "branch already fully in main". If the branch + # has no commits beyond origin/main there is nothing to PR — creating one + # would yield an empty garbage PR (the ORCH-063 symptom on a re-driven + # finalizer after a manual merge). Return the new "already-in-main" outcome + # so _handle_merge_verify skips merge_pr and lets the authoritative + # SHA-in-main check confirm -> done. fail-OPEN on git error / ambiguous + # (None): degrade to the create path below, NEVER block — an infra hiccup + # must not become a false no-op merge (SHA-in-main downstream stays the proof). + if _branch_fully_in_main(repo, branch) is True: + logger.info( + "ensure_open_pr: %s/%s already fully in main -> already-in-main (no PR created)", + repo, branch, + ) + return "already-in-main", "branch already in main (no commits beyond origin/main)" + # Step 2: create the code-PR onto main. parts = branch.split("/") title = parts[-1] if parts else branch @@ -697,6 +763,89 @@ def ensure_open_pr(repo: str, branch: str) -> tuple[str, str]: return "failed", f"ensure_open_pr error: {e}" +# --------------------------------------------------------------------------- +# ORCH-093: transient-error retry of the merge POST + classification helpers. +# --------------------------------------------------------------------------- +def _merge_backoff(attempt: int) -> float: + """Exponential backoff (s) with a ceiling for the merge-POST retry (ORCH-093 D1). + + ``backoff(i) = min(base * 2**(i-1), max)`` — the transient-breaker idiom of the + Claude agents, bounded so the total sleep ``(N-1) * max`` can never wedge the + monitor-thread running merge-verify (NFR-4). Defaults base=2, max=5 -> the + sequence is 2, 4, 5, 5, … seconds. + """ + base = settings.merge_retry_backoff_base_s + cap = settings.merge_retry_backoff_max_s + try: + return float(min(base * (2 ** (max(attempt, 1) - 1)), cap)) + except Exception: # noqa: BLE001 - never-raise; degrade to the ceiling + return float(cap) + + +def _pr_mergeable(repo: str, index) -> bool | None: + """Read the ``mergeable`` field of PR ``index`` via ``GET /pulls/{index}`` (ORCH-093 D2). + + Used ONLY to disambiguate a ``409``/``422`` merge POST: Gitea may still be + recomputing mergeability right after a push (the ORCH-063 root cause). Returns + the boolean ``mergeable`` flag, or ``None`` when it is absent / non-boolean / the + GET fails (never-raise) — the caller treats ``None`` as the default-policy + transient (D2). + """ + try: + import httpx + owner = settings.gitea_owner + headers = {"Authorization": f"token {settings.gitea_token}"} + resp = httpx.get( + f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}/pulls/{index}", + headers=headers, timeout=settings.merge_pr_timeout_s, + ) + if resp.status_code != 200: + return None + val = (resp.json() or {}).get("mergeable") + return val if isinstance(val, bool) else None + except Exception as e: # noqa: BLE001 - never-raise contract + logger.warning("_pr_mergeable check failed for %s PR #%s: %s", repo, index, e) + return None + + +def _classify_merge_response(repo: str, branch: str, index, status_code: int) -> str: + """Classify a non-2xx ``POST /pulls/{index}/merge`` outcome (ORCH-093 D2). + + Returns ``"transient"`` (retry within budget) or ``"terminal"`` (fast honest + ``False``; the ORCH-071/081 HOLD backstop takes over). Decision tree: + + * ``405`` ("try again later"), ``408``, any ``5xx`` -> **transient**. + * ``403`` (no rights), ``404`` (PR gone) -> **terminal**. + * ``409`` / ``422`` (ambiguous) -> ``GET /pulls/{index}`` -> ``mergeable``: + - ``False`` -> **terminal** (real conflict, fast HOLD). + - ``True`` / ``None`` / GET failed -> **transient** (default-policy + fail-OPEN-in-retry: Gitea has not recomputed yet — the ORCH-063 case; + the retry budget is finite, so a real conflict still HOLDs after it). + * any other unexpected code -> **terminal** (do not loop on unknowns). + + Never-raise: any error -> ``"transient"`` (conservative, within the bounded + retry budget). + """ + try: + if status_code in (405, 408) or 500 <= status_code <= 599: + return "transient" + if status_code in (403, 404): + return "terminal" + if status_code in (409, 422): + mergeable = _pr_mergeable(repo, index) + if mergeable is False: + return "terminal" + # True OR None/unavailable -> transient (default-policy, D2). + return "transient" + return "terminal" + except Exception as e: # noqa: BLE001 - never-raise contract + logger.warning( + "_classify_merge_response error for %s/%s PR #%s: %s (transient)", + repo, branch, index, e, + ) + return "transient" + + def merge_pr(repo: str, branch: str) -> tuple[bool, str]: """Deterministically merge the open PR for ``branch`` via the Gitea PR-merge API. @@ -712,8 +861,16 @@ def merge_pr(repo: str, branch: str) -> tuple[bool, str]: (FR-3) adds the ``base == main`` filter so the actor merges exactly the feature code-PR and never an auto docs-PR / a PR onto a foreign base. No such open PR -> ``(False, "no open PR")``. - 3. ``POST /repos/{owner}/{repo}/pulls/{index}/merge`` (Do: ``merge``) -> - 200/201 -> ``(True, "merged PR #")``; otherwise ``(False, "")``. + 3. ``POST /repos/{owner}/{repo}/pulls/{index}/merge`` (Do: ``merge``) in a + bounded retry-loop (ORCH-093 D1): ``200/201`` -> ``(True, "merged PR #")``; + a TRANSIENT outcome (405/408/5xx/network/timeout, or 409|422 while still + mergeable) is retried with exponential backoff up to + ``merge_retry_max_attempts``; a TERMINAL outcome (403/404/real conflict) -> + immediate ``(False, "merge failed: HTTP ")``; exhausting the budget on + a transient -> ``(False, "merge failed after attempts: HTTP ")``. + The kill-switch ``merge_retry_enabled=False`` forces exactly one POST + (the prior one-shot behaviour). Only the mutating POST is retried — the + idempotent steps above are not. Never-raise (INV-1/AC-9 / TC-09): any HTTP/parse error -> ``(False, reason)``. """ @@ -744,21 +901,59 @@ def merge_pr(repo: str, branch: str) -> tuple[bool, str]: if index is None: return False, "no open PR" - m = httpx.post( - f"{base}/pulls/{index}/merge", - json={"Do": "merge"}, - headers=headers, - timeout=timeout, - ) - if m.status_code in (200, 201): - logger.info("merge_pr: merged PR #%s for %s/%s", index, repo, branch) - return True, f"merged PR #{index}" - detail = (m.text or "").strip()[:200] - logger.warning( - "merge_pr: merge failed for %s/%s PR #%s: HTTP %s %s", - repo, branch, index, m.status_code, detail, - ) - return False, f"merge failed: HTTP {m.status_code}" + # ORCH-093 D1: retry ONLY the mutating POST on transient outcomes. The + # kill-switch collapses the budget to one attempt = the prior one-shot path + # (no branching of the loop body, ADR D1). + n_eff = settings.merge_retry_max_attempts if settings.merge_retry_enabled else 1 + if n_eff < 1: + n_eff = 1 + for attempt in range(1, n_eff + 1): + try: + m = httpx.post( + f"{base}/pulls/{index}/merge", + json={"Do": "merge"}, + headers=headers, + timeout=timeout, + ) + except (httpx.HTTPError, OSError) as e: + # Network/timeout -> transient within the bounded budget (never-raise). + logger.warning( + "merge_pr: attempt %s/%s network error for %s/%s PR #%s: %s (transient)", + attempt, n_eff, repo, branch, index, e, + ) + if attempt < n_eff: + time.sleep(_merge_backoff(attempt)) + continue + return False, f"merge failed after {n_eff} attempts: network error" + + if m.status_code in (200, 201): + logger.info( + "merge_pr: merged PR #%s for %s/%s (attempt %s/%s)", + index, repo, branch, attempt, n_eff, + ) + return True, f"merged PR #{index}" + + detail = (m.text or "").strip()[:200] + cls = _classify_merge_response(repo, branch, index, m.status_code) + if cls == "terminal": + logger.warning( + "merge_pr: merge failed for %s/%s PR #%s: HTTP %s %s (terminal)", + repo, branch, index, m.status_code, detail, + ) + return False, f"merge failed: HTTP {m.status_code}" + + # Transient: log attempt i/N (check_ci_green idiom) and retry if budget left. + logger.warning( + "merge_pr: attempt %s/%s transient HTTP %s for %s/%s PR #%s %s", + attempt, n_eff, m.status_code, repo, branch, index, detail, + ) + if attempt < n_eff: + time.sleep(_merge_backoff(attempt)) + continue + return False, f"merge failed after {n_eff} attempts: HTTP {m.status_code}" + + # Unreachable (loop always returns), defensive only. + return False, f"merge failed after {n_eff} attempts" except Exception as e: # noqa: BLE001 - never-raise contract logger.warning("merge_pr unexpected error for %s/%s: %s", repo, branch, e) return False, f"merge error: {e}" @@ -841,6 +1036,7 @@ MAIN_REGRESSION_MARKERS: list[tuple[str, str, str]] = [ ("ORCH-071", "verify_merged_to_main", "src/merge_gate.py"), ("ORCH-073", "check_main_regression", "src/merge_gate.py"), ("ORCH-082", "ensure_open_pr", "src/merge_gate.py"), + ("ORCH-093", "_classify_merge_response", "src/merge_gate.py"), ] diff --git a/src/stage_engine.py b/src/stage_engine.py index 267d009..36b169d 100644 --- a/src/stage_engine.py +++ b/src/stage_engine.py @@ -1483,6 +1483,7 @@ def _handle_merge_verify(task_id, repo, work_item_id, branch, result: AdvanceRes # `created`/`existed` -> proceed unchanged; `failed` -> honest HOLD with a # distinguishable text (NOT the not-merged HOLD). ORCH-073's SHA-in-main proof # below is untouched and stays authoritative. Kill-switch off -> 1:1 prior path. + skip_merge = False if settings.merge_verify_autocreate_pr_enabled: pr_status, pr_detail = merge_gate.ensure_open_pr(repo, branch) logger.info( @@ -1492,10 +1493,25 @@ def _handle_merge_verify(task_id, repo, work_item_id, branch, result: AdvanceRes return _hold_pr_create_failed( task_id, repo, work_item_id, branch, pr_detail, result ) + if pr_status == "already-in-main": + # ORCH-093 (D4): the branch is already fully in `main` -> nothing to + # merge and no PR was created. Skip the deterministic merge_pr; the + # authoritative SHA-in-main check below confirms the merge -> done. + # This is NOT a HOLD (the goal is already achieved); if for some + # reason the SHA is not in main the prior not-merged HOLD still fires + # (fail-closed, safe). + logger.info( + f"Task {task_id}: merge-verify already-in-main -> skip merge_pr " + "(SHA-in-main authoritative)" + ) + skip_merge = True # "created" | "existed" -> proceed normally to merge_pr. # Deterministic merge-actor (no-op if the PR is already merged, INV-5/AC-9). - merged_ok, merge_msg = merge_gate.merge_pr(repo, branch) + if skip_merge: + merged_ok, merge_msg = True, "already-in-main (skipped merge_pr)" + else: + merged_ok, merge_msg = merge_gate.merge_pr(repo, branch) logger.info( f"Task {task_id}: merge-verify merge_pr -> ok={merged_ok} ({merge_msg})" ) diff --git a/tests/test_config.py b/tests/test_config.py index 697864b..4e6da8d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -257,3 +257,38 @@ def test_tc19_check_branch_mergeable_signature_intact(): from src.qg.checks import check_branch_mergeable params = list(inspect.signature(check_branch_mergeable).parameters) assert params == ["repo", "work_item_id", "branch"] + + +# --------------------------------------------------------------------------- +# ORCH-093 / TC-13: merge_retry_* settings defaults + env override (AC-5). +# --------------------------------------------------------------------------- +_MERGE_RETRY_ENV = ( + "ORCH_MERGE_RETRY_ENABLED", + "ORCH_MERGE_RETRY_MAX_ATTEMPTS", + "ORCH_MERGE_RETRY_BACKOFF_BASE_S", + "ORCH_MERGE_RETRY_BACKOFF_MAX_S", +) + + +def test_merge_retry_settings_defaults(monkeypatch): + """Documented defaults when no ORCH_MERGE_RETRY_* env is set.""" + for name in _MERGE_RETRY_ENV: + monkeypatch.delenv(name, raising=False) + s = Settings() + assert s.merge_retry_enabled is True + assert s.merge_retry_max_attempts == 3 + assert s.merge_retry_backoff_base_s == 2 + assert s.merge_retry_backoff_max_s == 5 + + +def test_merge_retry_settings_env_override(monkeypatch): + """Each field is read from its ORCH_MERGE_RETRY_* env var.""" + monkeypatch.setenv("ORCH_MERGE_RETRY_ENABLED", "false") + monkeypatch.setenv("ORCH_MERGE_RETRY_MAX_ATTEMPTS", "5") + monkeypatch.setenv("ORCH_MERGE_RETRY_BACKOFF_BASE_S", "1") + monkeypatch.setenv("ORCH_MERGE_RETRY_BACKOFF_MAX_S", "8") + s = Settings() + assert s.merge_retry_enabled is False + assert s.merge_retry_max_attempts == 5 + assert s.merge_retry_backoff_base_s == 1 + assert s.merge_retry_backoff_max_s == 8 diff --git a/tests/test_merge_gate.py b/tests/test_merge_gate.py index 0a12c4c..463b9ad 100644 --- a/tests/test_merge_gate.py +++ b/tests/test_merge_gate.py @@ -389,3 +389,207 @@ def test_tc16_deployer_prompt_consults_guard(): assert "no second merge" in lowered, ( "deployer prompt must document the already-merged no-op (AC-11)" ) + + +# =========================================================================== +# ORCH-093: merge_pr transient-retry + ensure_open_pr already-in-main guard. +# TC-01..TC-12 — httpx mocked; time.sleep no-op so backoff never slows tests. +# =========================================================================== +ORCH093_BRANCH = "feature/ORCH-093-x" + + +class _Resp093: + """Response stand-in with status_code / json() / text (merge_pr reads .text).""" + + def __init__(self, status_code, payload=None, text=""): + self.status_code = status_code + self._payload = payload if payload is not None else [] + self.text = text + + def json(self): + return self._payload + + +@pytest.fixture +def merge093(monkeypatch): + """Wire Gitea settings + retry defaults; no-op backoff; PR not-already-merged.""" + monkeypatch.setattr(merge_gate.settings, "gitea_url", "http://gitea.test") + monkeypatch.setattr(merge_gate.settings, "gitea_owner", "admin") + monkeypatch.setattr(merge_gate.settings, "gitea_token", "tok") + monkeypatch.setattr(merge_gate.settings, "merge_pr_timeout_s", 5) + monkeypatch.setattr(merge_gate.settings, "merge_retry_enabled", True) + monkeypatch.setattr(merge_gate.settings, "merge_retry_max_attempts", 3) + monkeypatch.setattr(merge_gate.settings, "merge_retry_backoff_base_s", 2) + monkeypatch.setattr(merge_gate.settings, "merge_retry_backoff_max_s", 5) + monkeypatch.setattr(merge_gate.time, "sleep", lambda *a, **k: None) + monkeypatch.setattr(merge_gate, "pr_already_merged", lambda r, b: False) + + +def _open_code_pr_get(number=7): + """A list-PRs GET returning exactly one open code-PR (head==branch, base==main).""" + return lambda *a, **k: _Resp093( + 200, [{"head": {"ref": ORCH093_BRANCH}, "base": {"ref": "main"}, "number": number}] + ) + + +class _PostSeq: + """Returns queued responses (or raises queued exceptions) on each POST call.""" + + def __init__(self, items): + self._items = list(items) + self.calls = 0 + + def __call__(self, *a, **k): + self.calls += 1 + item = self._items.pop(0) if self._items else self._items_last + self._items_last = item + if isinstance(item, Exception): + raise item + return item + + +# --- TC-01: 405, 405, 200 -> (True, ...); exactly 3 POST; no false False (AC-1) --- +def test_tc01_merge_retries_405_then_succeeds(merge093, monkeypatch): + monkeypatch.setattr(httpx, "get", _open_code_pr_get(7)) + seq = _PostSeq([_Resp093(405, text="try again later"), + _Resp093(405, text="try again later"), + _Resp093(200)]) + monkeypatch.setattr(httpx, "post", seq) + ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH) + assert ok is True and "PR #7" in msg + assert seq.calls == 3 + + +# --- TC-02: 503 (5xx) then 200 -> retry -> (True, ...) (AC-1) --- +def test_tc02_merge_retries_5xx_then_succeeds(merge093, monkeypatch): + monkeypatch.setattr(httpx, "get", _open_code_pr_get(7)) + seq = _PostSeq([_Resp093(503, text="bad gateway"), _Resp093(200)]) + monkeypatch.setattr(httpx, "post", seq) + ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH) + assert ok is True and seq.calls == 2 + + +# --- TC-03: httpx Timeout in attempt 1, then 200 -> retry; never-raise (AC-1/AC-6) --- +def test_tc03_merge_retries_network_error_then_succeeds(merge093, monkeypatch): + monkeypatch.setattr(httpx, "get", _open_code_pr_get(7)) + seq = _PostSeq([httpx.ConnectTimeout("timed out"), _Resp093(200)]) + monkeypatch.setattr(httpx, "post", seq) + ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH) + assert ok is True and seq.calls == 2 + + +# --- TC-04: real conflict 409 + mergeable=False -> (False, ...), no extra POST (AC-2) --- +def test_tc04_real_conflict_terminal_no_retry(merge093, monkeypatch): + monkeypatch.setattr(httpx, "get", _open_code_pr_get(7)) + monkeypatch.setattr(merge_gate, "_pr_mergeable", lambda r, i: False) + seq = _PostSeq([_Resp093(409, text="conflict")]) + monkeypatch.setattr(httpx, "post", seq) + ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH) + assert ok is False and "HTTP 409" in msg + assert seq.calls == 1 # terminal -> no retry + + +# --- TC-05: ambiguous 409 + mergeable=True -> transient -> retry -> 200 (AC-2) --- +def test_tc05_ambiguous_409_mergeable_true_retries(merge093, monkeypatch): + monkeypatch.setattr(httpx, "get", _open_code_pr_get(7)) + monkeypatch.setattr(merge_gate, "_pr_mergeable", lambda r, i: True) + seq = _PostSeq([_Resp093(409, text="recomputing"), _Resp093(200)]) + monkeypatch.setattr(httpx, "post", seq) + ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH) + assert ok is True and seq.calls == 2 + + +# --- TC-06: 403 (no rights) -> immediate (False, ...) without retry (AC-2) --- +def test_tc06_403_terminal_no_retry(merge093, monkeypatch): + monkeypatch.setattr(httpx, "get", _open_code_pr_get(7)) + seq = _PostSeq([_Resp093(403, text="forbidden")]) + monkeypatch.setattr(httpx, "post", seq) + ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH) + assert ok is False and "HTTP 403" in msg and seq.calls == 1 + + +# --- TC-07: 405 on all N attempts -> (False, "merge failed after N attempts: HTTP 405") (AC-3) --- +def test_tc07_exhausts_retries_clear_reason(merge093, monkeypatch): + monkeypatch.setattr(httpx, "get", _open_code_pr_get(7)) + seq = _PostSeq([_Resp093(405), _Resp093(405), _Resp093(405)]) + monkeypatch.setattr(httpx, "post", seq) + ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH) + assert ok is False + assert "after 3 attempts" in msg and "HTTP 405" in msg + assert seq.calls == 3 + + +# --- TC-08: kill-switch off -> exactly one POST (one-shot) at 405 -> (False, ...) (AC-5/AC-3) --- +def test_tc08_killswitch_off_one_shot(merge093, monkeypatch): + monkeypatch.setattr(merge_gate.settings, "merge_retry_enabled", False) + monkeypatch.setattr(httpx, "get", _open_code_pr_get(7)) + seq = _PostSeq([_Resp093(405), _Resp093(200)]) # 2nd would succeed if retried + monkeypatch.setattr(httpx, "post", seq) + ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH) + assert ok is False and seq.calls == 1 # one-shot: never retried + + +# --- TC-09: ensure_open_pr — no open PR, branch fully in main -> already-in-main, no POST (AC-4) --- +def test_tc09_ensure_already_in_main_no_post(merge093, monkeypatch): + monkeypatch.setattr(httpx, "get", lambda *a, **k: _Resp093(200, [])) # no open PR + monkeypatch.setattr(merge_gate, "_branch_fully_in_main", lambda r, b: True) + monkeypatch.setattr(httpx, "post", lambda *a, **k: (_ for _ in ()).throw( + AssertionError("must NOT POST /pulls for an already-in-main branch"))) + status, detail = merge_gate.ensure_open_pr("orchestrator", ORCH093_BRANCH) + assert status == "already-in-main" + + +# --- TC-10: ensure_open_pr — no open PR, commits beyond main -> creates PR (regress) (AC-4) --- +def test_tc10_ensure_creates_when_commits_beyond_main(merge093, monkeypatch): + monkeypatch.setattr(httpx, "get", lambda *a, **k: _Resp093(200, [])) + monkeypatch.setattr(merge_gate, "_branch_fully_in_main", lambda r, b: False) + post_calls = [] + + def fake_post(url, json=None, headers=None, timeout=None): + post_calls.append(url) + return _Resp093(201, {"number": 12}) + + monkeypatch.setattr(httpx, "post", fake_post) + status, detail = merge_gate.ensure_open_pr("orchestrator", ORCH093_BRANCH) + assert status == "created" and detail == "12" + assert len(post_calls) == 1 + + +# --- TC-11: ensure_open_pr — git error in guard (None) -> fail-OPEN -> create path (AC-6) --- +def test_tc11_ensure_guard_git_error_fail_open(merge093, monkeypatch): + monkeypatch.setattr(httpx, "get", lambda *a, **k: _Resp093(200, [])) + # None == git/OS error / ambiguous -> must NOT block; degrade to create. + monkeypatch.setattr(merge_gate, "_branch_fully_in_main", lambda r, b: None) + monkeypatch.setattr(httpx, "post", lambda *a, **k: _Resp093(201, {"number": 13})) + status, detail = merge_gate.ensure_open_pr("orchestrator", ORCH093_BRANCH) + assert status == "created" # fail-open: did not become a false no-op + + +def test_tc11_branch_fully_in_main_never_raises(monkeypatch): + """_branch_fully_in_main: any git/OS error -> None (never-raise) (AC-6).""" + monkeypatch.setattr(merge_gate, "ensure_worktree", lambda r, b: "/wt") + + def boom(*a, **k): + raise OSError("git exploded") + + monkeypatch.setattr(merge_gate.subprocess, "run", boom) + assert merge_gate._branch_fully_in_main("orchestrator", ORCH093_BRANCH) is None + + +# --- TC-12: merge_pr / ensure_open_pr — uncaught httpx error -> safe tuple (never-raise) (AC-6) --- +def test_tc12_merge_pr_never_raises(merge093, monkeypatch): + def boom(*a, **k): + raise httpx.HTTPError("kaboom") + + monkeypatch.setattr(httpx, "get", boom) + ok, msg = merge_gate.merge_pr("orchestrator", ORCH093_BRANCH) + assert ok is False and isinstance(msg, str) + + +def test_tc12_ensure_open_pr_never_raises(merge093, monkeypatch): + def boom(*a, **k): + raise httpx.HTTPError("kaboom") + + monkeypatch.setattr(httpx, "get", boom) + status, detail = merge_gate.ensure_open_pr("orchestrator", ORCH093_BRANCH) + assert status == "failed" and isinstance(detail, str) diff --git a/tests/test_merge_verify.py b/tests/test_merge_verify.py index 4d29b14..42ba1ee 100644 --- a/tests/test_merge_verify.py +++ b/tests/test_merge_verify.py @@ -131,3 +131,92 @@ def test_tc12_kill_switch_disables_under_gate(monkeypatch): monkeypatch.setattr(merge_gate.settings, "merge_verify_enabled", False) assert merge_gate.merge_verify_applies("orchestrator") is False assert merge_gate.merge_verify_applies("enduro-trails") is False + + +# =========================================================================== +# ORCH-093 / TC-14..16: _handle_merge_verify integration (deploy->done under-gate). +# already-in-main skips merge_pr; transient-retry success -> done; exhausted -> HOLD. +# =========================================================================== +import os # noqa: E402 +import tempfile # noqa: E402 + +os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") +os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token") +os.environ.setdefault("ORCH_DB_PATH", os.path.join(tempfile.gettempdir(), "test_orch093.db")) + +from unittest.mock import MagicMock # noqa: E402 + +from src import stage_engine, image_freshness # noqa: E402 +from src.stage_engine import AdvanceResult, _handle_merge_verify # noqa: E402 + +_O93_REPO = "orchestrator" +_O93_WI = "ORCH-093" +_O93_BRANCH = "feature/ORCH-093-x" + + +@pytest.fixture +def _o93_wire(monkeypatch): + monkeypatch.setattr(stage_engine.merge_gate, "merge_verify_applies", lambda r: True) + monkeypatch.setattr(stage_engine.settings, "merge_verify_autocreate_pr_enabled", True) + monkeypatch.setattr(stage_engine.settings, "regression_guard_enabled", False) + monkeypatch.setattr(image_freshness, "validated_revision", lambda r, b: "deadbeef") + for name in ("set_issue_blocked", "plane_add_comment", "send_telegram", "link_for"): + monkeypatch.setattr(stage_engine, name, MagicMock()) + monkeypatch.setattr( + stage_engine.self_deploy, "record_merged_to_main", MagicMock(return_value=True) + ) + + +# --- TC-14: ensure_open_pr -> already-in-main -> skip merge_pr; SHA-in-main -> done (AC-4) --- +def test_tc14_already_in_main_skips_merge_pr_then_done(_o93_wire, monkeypatch): + monkeypatch.setattr( + stage_engine.merge_gate, "ensure_open_pr", lambda r, b: ("already-in-main", "x") + ) + merge = MagicMock() + monkeypatch.setattr(stage_engine.merge_gate, "merge_pr", merge) + monkeypatch.setattr(stage_engine.merge_gate, "verify_merged_to_main", lambda r, b, s: True) + + res = AdvanceResult() + intervened = _handle_merge_verify(1, _O93_REPO, _O93_WI, _O93_BRANCH, res) + + assert intervened is False # advance to done + assert res.alerted is False + assert not merge.called # merge_pr SKIPPED (nothing to merge) + assert not stage_engine.set_issue_blocked.called + + +# --- TC-15: merge_pr exhausted (False) + SHA not in main -> HOLD + alert (ORCH-071/081) (AC-3) --- +def test_tc15_merge_failed_and_not_in_main_holds(_o93_wire, monkeypatch): + monkeypatch.setattr( + stage_engine.merge_gate, "ensure_open_pr", lambda r, b: ("existed", "9") + ) + monkeypatch.setattr( + stage_engine.merge_gate, "merge_pr", + lambda r, b: (False, "merge failed after 3 attempts: HTTP 405"), + ) + monkeypatch.setattr(stage_engine.merge_gate, "verify_merged_to_main", lambda r, b, s: False) + + res = AdvanceResult() + intervened = _handle_merge_verify(1, _O93_REPO, _O93_WI, _O93_BRANCH, res) + + assert intervened is True # HOLD, NOT done + assert res.advanced is False + assert res.note == "merge-not-verified-hold" + assert stage_engine.set_issue_blocked.called + + +# --- TC-16: happy path — transient retry success in merge_pr -> SHA-in-main -> done (AC-1) --- +def test_tc16_transient_retry_success_then_done(_o93_wire, monkeypatch): + monkeypatch.setattr( + stage_engine.merge_gate, "ensure_open_pr", lambda r, b: ("existed", "9") + ) + # merge_pr already rode out the 405x2->200 transient internally -> (True, ...). + monkeypatch.setattr(stage_engine.merge_gate, "merge_pr", lambda r, b: (True, "merged PR #9")) + monkeypatch.setattr(stage_engine.merge_gate, "verify_merged_to_main", lambda r, b, s: True) + + res = AdvanceResult() + intervened = _handle_merge_verify(1, _O93_REPO, _O93_WI, _O93_BRANCH, res) + + assert intervened is False # done, no false HOLD + assert res.alerted is False + assert not stage_engine.set_issue_blocked.called diff --git a/tests/test_orch082_ensure_pr.py b/tests/test_orch082_ensure_pr.py index a12644e..c42dd09 100644 --- a/tests/test_orch082_ensure_pr.py +++ b/tests/test_orch082_ensure_pr.py @@ -32,6 +32,11 @@ def _settings(monkeypatch): monkeypatch.setattr(merge_gate.settings, "gitea_owner", "owner") monkeypatch.setattr(merge_gate.settings, "gitea_token", "tok") monkeypatch.setattr(merge_gate.settings, "gitea_url", "http://gitea.test") + # ORCH-093: these tests target the HTTP create/race logic of ensure_open_pr. + # The new already-in-main guard (_branch_fully_in_main) runs real git; pin it + # to "commits beyond main" (False) so the create path is exercised as intended. + # The guard itself has dedicated coverage (test_merge_gate.py TC-09/10/11). + monkeypatch.setattr(merge_gate, "_branch_fully_in_main", lambda r, b: False) def _install_httpx(monkeypatch, get_resp, post_resp=None, record=None):