diff --git a/.env.example b/.env.example index 9fb8d2e..94d3a65 100644 --- a/.env.example +++ b/.env.example @@ -123,11 +123,17 @@ ORCH_TASK_DEPS_SOURCE=db # REGRESSION_GUARD_ENABLED -> kill-switch for the ORCH-073 main-integrity regression # guard (false -> SHA-in-main alone gates done); reuses the # merge-verify scope, so non-self repos are a no-op. +# MERGE_VERIFY_AUTOCREATE_PR_ENABLED -> ORCH-082: guarantee an open code-PR +# (head==branch, base==main) via merge_gate.ensure_open_pr +# BEFORE the deterministic merge_pr (fixes the false HOLD +# "no open PR"). false -> exactly pre-ORCH-082 behaviour. +# Reuses the merge-verify scope; non-self repos -> no-op. ORCH_MERGE_VERIFY_ENABLED=true ORCH_MERGE_VERIFY_REPOS= ORCH_MERGE_PR_TIMEOUT_S=60 ORCH_MERGE_VERIFY_TIMEOUT_S=60 ORCH_REGRESSION_GUARD_ENABLED=true +ORCH_MERGE_VERIFY_AUTOCREATE_PR_ENABLED=true # ORCH-036: executable self-deploy of the `deploy` stage. For the self-hosting repo # (orchestrator) the stage REALLY restarts prod (8500) via a detached host hook; # deploy_status: SUCCESS means proven health-ok, not an LLM declaration. Three diff --git a/CHANGELOG.md b/CHANGELOG.md index 377131e..c3fd024 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ Формат: [Keep a Changelog](https://keepachangelog.com/). Записи — на смысловой PR/задачу. ## [Unreleased] +- **Гарантированный идемпотентный код-PR перед merge-verify (фикс ложного HOLD «no open PR»)** (ORCH-082/ORCH-81): закрыт отсутствующий инвариант «к моменту merge-verify у ветки есть открытый код-PR». **Корень (ORCH-074, 08.06):** PR создавался единственной `launcher._ensure_pr` ТОЛЬКО на developer-пути и ТОЛЬКО при свежем worktree-коммите (`exit==0 → git status непуст → commit → push → agent=="developer"`); после ручных восстановлений `main` у ветки ORCH-074 не оказалось открытого код-PR → детерминированный `merge_gate.merge_pr` вернул `("False", "no open PR")` → защита ORCH-073 верно удержала задачу (HOLD, не ложный `done`), но лечила следствие. **Фикс (ADR-001, аддитивно, внутри того же под-гейта merge-verify, машина стадий не тронута):** (1) новый идемпотентный leaf-актор `merge_gate.ensure_open_pr(repo, branch) -> (status, detail)` (never-raise): `GET …/pulls?state=open` с фильтром **`head.ref==branch` И `base.ref=="main"`** (идентичен `merge_pr`/ORCH-073 FR-3 — авто-docs-PR `base!=main` НЕ код-PR) → `("existed", N)`; иначе `POST …/pulls` → `("created", N)`; гонка `409/422` «PR exists» → повторный GET → `existed` (без дублей); любая иная HTTP/parse/сетевая ошибка → `("failed", reason)`. (2) Врезка в `stage_engine._handle_merge_verify` ПОСЛЕ резолва `validated_revision` и ПЕРЕД `merge_pr`: при `merge_verify_autocreate_pr_enabled` → `ensure_open_pr`; `created|existed` → штатно к `merge_pr` → `verify_merged_to_main`; `failed` → честный HOLD через новый helper `_hold_pr_create_failed` (текст «PR создать не удалось», `result.note="pr-create-failed-hold"` — текстуально отличим от not-merged HOLD; задача остаётся на `deploy`, НЕ `done`, БЕЗ отката на development). (3) `launcher._ensure_pr` делегирован в `merge_gate.ensure_open_pr` (единый код создания PR, общий фильтр `head==branch & base==main`); триггер «создавать только на developer-пути со свежим коммитом» НЕ ужесточён — менялась только реализация под капотом. **Защита ORCH-073 неприкосновенна и приоритетна:** подтверждение merge остаётся ТОЛЬКО `verify_merged_to_main` (SHA-в-main) + `check_main_regression`; `ensure_open_pr` устраняет лишь ЛОЖНЫЙ HOLD «no open PR», реально невлитый код → HOLD как прежде. Kill-switch `ORCH_MERGE_VERIFY_AUTOCREATE_PR_ENABLED` (дефолт `true`); область — `merge_verify_applies(repo)` (self-hosting / `merge_verify_repos`), non-self → no-op; `false` → поведение ORCH-074 1:1. Идемпотентность из Gitea (наличие открытого PR), без миграции БД (restart-safe); `main` не push/force-push. Инварианты НЕ менялись: `STAGE_TRANSITIONS`, реестр `QG_CHECKS` (под-гейт — врезка в `advance_stage`, не новый QG), схема БД, `check_deploy_status`/`_parse_deploy_status`, exit-коды хука, merge-gate (ORCH-043), image-freshness (ORCH-058), внешние HTTP-эндпоинты. ADR `docs/work-items/ORCH-082/06-adr/ADR-001-ensure-open-pr-before-merge-verify.md` (+ сквозной `adr-0016`). Документация: `docs/architecture/README.md` (блок ORCH-082 в merge-verify). Тесты: `tests/test_orch082_ensure_pr.py` (TC-01..05: идемпотентный актор, фильтр base==main, гонка 409/422, never-raise), `tests/test_orch082_merge_verify_autocreate.py` (TC-06..12: врезка, регресс ORCH-073, kill-switch, условность, наблюдаемость). - **Устойчивость резолва `--effort` к пустому env + developer → `xhigh`** (ORCH-081/ORCH-52h): фикс конфигурационного бага, из-за которого в проде `resolve_agent_effort()` возвращал `''` для всех 6 агентов и `--effort` не передавался в Claude CLI (каждый агент бежал на встроенном CLI-дефолте вместо заявленного уровня — прямой удар по предсказуемости качества всего конвейера, включая enduro-trails из общего инстанса). **Корень:** pydantic Settings трактует ПРИСУТСТВУЮЩУЮ env-переменную, даже пустую (`ORCH_AGENT_EFFORT_*=` без значения), как явное `''` и перебивает class-default; в проде пусты И per-agent, И `agent_effort_default`, поэтому у цепочки резолва (`_resolve_agent_attr`: project-override → per-agent env → default → `''`) не остаётся непустого «пола» для отката. **Фикс (вариант c, ADR-001):** в `resolve_agent_effort` (`src/agents/launcher.py`) добавлен уровень 4 — непустой **per-role floor** ниже `default`: новый чистый helper `_agent_effort_floor(agent)` возвращает декларированный class-default поля `agent_effort_` через `type(settings).model_fields[...].default` (значение, которое пустой env перебить НЕ может). Floor срабатывает ТОЛЬКО когда уровни 1–3 пусты и применяется ДО валидации, поэтому: (а) при пустом прод-`.env` каждая роль получает СВОЙ канонический уровень (developer=`xhigh`, tester/deployer=`medium`, analyst/architect/reviewer=`high`), а не общий default; (б) явная опечатка (`turbo`/`ultra`) непуста → floor НЕ применяется → значение штатно дропается валидацией `VALID_EFFORTS` в `''` (never-break ORCH-41 не регрессирует, floor не маскирует мусор); (в) непустой явный env/project-override/`default` по-прежнему ПОБЕЖДАЕТ floor (приоритет резолва сохранён 1:1). Unknown-agent (имя вне 6 ролей) деградирует на class-default `agent_effort_default` (`high`) — безопасный непустой пол. **`config.py`:** `agent_effort_developer` `high → xhigh` (канон Opus 4.8: coding/agentic роль) — единственное изменение значений; floor подтягивает его автоматически (единый источник правды, ноль риска дрейфа floor-карты). Инварианты НЕ менялись: приоритеты/сигнатуры резолва ORCH-41, `_resolve_agent_attr` (общий с model-резолвом, не тронут), `resolve_agent_model` (ORCH-074), путь проброса `--effort` в `_spawn`, `VALID_EFFORTS`, API, схема БД (без миграций). ADR `docs/work-items/ORCH-081/06-adr/ADR-001-effort-resolution-floor.md`. Документация: `docs/architecture/README.md` (таблица «модель/эффорт по ролям»: developer `xhigh` + ремарка про floor), `.env.example` (`ORCH_AGENT_EFFORT_DEVELOPER=xhigh` + комментарий split/floor). Тесты: `tests/test_resolve_agent_effort.py` (TC-01..08: канон-дефолты, floor при пустом env per-role, floor-не-маскирует-typo, приоритет, `xhigh∈VALID_EFFORTS`, сборка флага `--effort xhigh`/`--effort medium`). - **Убран мёртвый frontmatter `model:` + валидация имени модели (never-break)** (ORCH-074): закрыты два дефекта данных/валидации каркаса выбора модели агентов (ORCH-41), без изменения механизма резолва, API или схемы БД. **G1 — мёртвый frontmatter:** из YAML-frontmatter всех 6 промптов `.openclaw/agents/*.md` удалена строка `model:` (`claude-sonnet-4-6` у analyst/developer/tester/deployer, `claude-opus-4-7` у architect/reviewer). launcher НЕ читал frontmatter `model:` — это была лживая/мёртвая декларация, противоречащая реально используемой модели (config) и принципу «документация = golden source»; мина: если бы кто-то «починил» launcher читать frontmatter, все агенты молча уехали бы на устаревшие модели. config (`agent_model_*`/`agent_model_default`) остаётся единственным источником правды; frontmatter описательный. **G2 — валидация имени модели:** добавлен чистый helper `is_valid_model(name)` + `_MODEL_NAME_RE` (`^claude-[a-z0-9.-]+$`) рядом с `VALID_EFFORTS` в `src/agents/launcher.py`. Резолвенное имя модели валидируется ПЕРЕД попаданием в `--model`: невалидное (опечатка, `gpt-4`, пустое, неверный префикс) → `logger.warning` + откат на следующий валидный уровень каскада ORCH-41 (project-override → env → default), в пределе → `""` (без флага `--model`, CLI-дефолт). Никогда не возвращается мусор и не бросается исключение (never-break, поведенческая аналогия `resolve_agent_effort`/`VALID_EFFORTS`). Выбран **формат-чек, а не allowlist `VALID_MODELS`**: allowlist воссоздаёт ровно ту мину, что убивается в G1 (статичный список врёт при устаревании — молча дропнул бы корректную будущую `claude-opus-4-9`); формат-чек forward-compatible (новые `claude-*` проходят без правки кода), финальный авторитет о существовании модели — сам Claude CLI. Тот же предикат применён к inline-чтению `--fallback-model` (`agent_fallback_model` читается напрямую в `_spawn`, мимо `resolve_agent_model` — TRZ §4), поэтому опечатка в `ORCH_AGENT_FALLBACK_MODEL` тоже дропается с warning; для текущего пустого значения регрессии нет. **G4 (fallback) НЕ включён** (`agent_fallback_model=""`, AC-5 N/A) — ради детерминизма (все агенты на `claude-opus-4-8`); **G3 (routing) НЕ включён** (AC-4 N/A) — осознанное решение стейкхолдера (Слава 08.06). Реализация `resolve_agent_model` рефакторнута на генератор кандидатов `_agent_model_candidates` (тот же приоритет ORCH-41) + валидация-со-скипом. Инварианты НЕ менялись: приоритеты/сигнатуры резолва ORCH-41, структура CLI-команды `_spawn`, `VALID_EFFORTS`-гард эффорта, `STAGE_TRANSITIONS`, реестр `QG_CHECKS`, схема БД (без миграций); enduro per-project override валидные имена проходят без изменения поведения. ADR `docs/work-items/ORCH-074/06-adr/ADR-001-model-name-validation.md`. Документация: `docs/architecture/README.md` (таблица «модель/эффорт по ролям» + валидация), `CLAUDE.md`, `.env.example` (блок `ORCH_AGENT_MODEL_*`/`ORCH_AGENT_EFFORT_*`/`ORCH_AGENT_FALLBACK_MODEL`). Тесты: `tests/test_agent_frontmatter_no_model.py` (G1: TC-01/02), `tests/test_resolve_agent_model.py` (G2 never-break: TC-03..09, TC-11 + is_valid_model). - **Управление зависимостями задач (B ждёт A) + сериализация мержа одного репо** (ORCH-026): два уровня по ADR-001, оба условны (kill-switch + CSV-область, never-raise), без новой стадии и без изменения `STAGE_TRANSITIONS`/реестра `QG_CHECKS`. **Уровень A — сериализация merge/deploy внутри одного репо:** переиспользует существующий merge-lease ORCH-043/065 (никакого нового механизма); единственная новая логика — **безусловный pre-merge rebase**: в `check_branch_mergeable` (`src/qg/checks.py`) под удержанным лизом при флаге `premerge_rebase_always` (дефолт `True`) `auto_rebase_onto_main` вызывается **всегда** (а не только при `branch_is_behind_main`) — детерминированный структурный анти-фантом на ребре планировщика, дополняющий рубежи ORCH-073. На актуальной ветке это no-op (rebase не сдвигает HEAD, `push --force-with-lease` → «Everything up-to-date», CI не триггерится); kill-switch `premerge_rebase_always=False` → прежнее поведение ORCH-043 1:1. Окно сериализации «merge → main-updated» per-repo (для self `done` ⇔ SHA-in-main, ORCH-073): пока A не в `main`, B того же репо получает `merge-lock busy` → defer (не откат); кросс-репо параллелизм сохранён (лиз — per-repo файл). **Уровень B — декларативные зависимости задач:** аддитивная таблица `job_deps(task_id, depends_on_task_id)` (идемпотентный `CREATE TABLE/INDEX IF NOT EXISTS` в `init_db`, без миграции на живой БД); гейт планировщика в `claim_next_job` (`src/db.py`) — `NOT EXISTS (job_deps d JOIN tasks t … WHERE d.task_id=jobs.task_id AND t.stage!='done')` при `task_deps_enabled`: задача с незавершённой зависимостью **не выбирается** и слот `max_concurrency` не занимает; инертно при пустой `job_deps` → нулевая регрессия, kill-switch `task_deps_enabled=False` → запрос 1:1 как ORCH-1. Новый leaf-модуль `src/task_deps.py` (контракт never-raise): `is_task_ready` (fail-open → ready), DFS-детектор циклов (`detect_cycle`/`find_any_cycle`, итеративный WHITE/GREY/BLACK), `handle_cycle` (`set_issue_blocked` по каждой задаче цикла + один Telegram-alert с цепочкой «A → B → A»), `declare_dependency` (вставка + детект цикла), `ingest_plane_relations` (только для `task_deps_source=plane|hybrid`: резолв Plane `blocked-by` UUID → локальный task → запись в `job_deps`; источник истины горячего цикла остаётся БД, дефолт `db` НЕ ходит в сеть на claim), `snapshot` (read-only сводка). Видимость: строка «⏳ ждёт ORCH-NNN» в Telegram-карточке (`src/notifications.py`, never-raise, инвариант «одна карточка на задачу» сохранён); блок `task_deps` в `GET /queue` (`src/main.py`). Совместимость: `reconciler` F-1 пропускает dep-заблокированные задачи (`is_task_ready`, паттерн ORCH-060) + backstop-детект цикла; `job_reaper` сканирует только `running` → dep-блок остаётся `queued`. Зависимости — только intra-repo (v1). Новые настройки: `ORCH_PREMERGE_REBASE_ALWAYS` (true), `ORCH_TASK_DEPS_ENABLED` (true), `ORCH_TASK_DEPS_SOURCE` (db). Инварианты НЕ менялись: `STAGE_TRANSITIONS`, реестр `QG_CHECKS` (гейт зависимостей — врезка в `claim_next_job`, НЕ зарегистрированный QG), схема `tasks`/`jobs`/`agent_runs`, внешние HTTP-эндпоинты; non-self (enduro) — no-op при пустых `job_deps`/области. ADR `docs/work-items/ORCH-026/06-adr/ADR-001-merge-serialization-and-task-deps.md`, глобальный `docs/architecture/adr/adr-0015-task-deps-and-merge-serialization.md`. Документация: `docs/architecture/README.md`, `CLAUDE.md`, `.env.example`. Тесты: `tests/test_orch026_premerge_rebase.py`, `tests/test_orch026_merge_serialize.py`, `tests/test_orch026_conditionality.py`, `tests/test_orch026_task_deps.py`, `tests/test_orch026_dep_cycles.py`, `tests/test_orch026_dep_visibility.py`, `tests/test_orch026_migration.py`, `tests/test_orch026_queue_observability.py`, `tests/test_orch026_serialize_integration.py`, `tests/test_orch026_deps_integration.py`. diff --git a/src/agents/launcher.py b/src/agents/launcher.py index 6bc29c9..ee27af4 100644 --- a/src/agents/launcher.py +++ b/src/agents/launcher.py @@ -1077,35 +1077,28 @@ class AgentLauncher: return None def _ensure_pr(self, repo: str, branch: str, run_id: int): - import httpx - owner = settings.gitea_owner - headers = {"Authorization": f"token {settings.gitea_token}"} - base_url = f"{settings.gitea_url}/api/v1" - try: - resp = httpx.get( - f"{base_url}/repos/{owner}/{repo}/pulls", - params={"state": "open", "head": branch}, - headers=headers, timeout=10 - ) - resp.raise_for_status() - prs = resp.json() - if prs: - return prs[0]["number"] - parts = branch.split("/") - title = parts[-1] if parts else branch - resp = httpx.post( - f"{base_url}/repos/{owner}/{repo}/pulls", - json={"title": f"feat: {title}", "head": branch, "base": "main", - "body": f"Auto-created by orchestrator after developer run_id={run_id}"}, - headers=headers, timeout=10 - ) - resp.raise_for_status() - pr_number = resp.json()["number"] - logger.info(f"Created PR #{pr_number} for {branch}") - return pr_number - except Exception as e: - logger.error(f"Failed to create PR for {branch}: {e}") - return None + """Ensure an open code-PR exists for ``branch``; return its number or None. + + ORCH-082 (ADR-001 Р-4): delegated to the single idempotent PR-creation actor + ``merge_gate.ensure_open_pr`` so PR creation lives in ONE place and logs the + same created/existed/failed outcomes (G3). The CALL TRIGGER is unchanged — the + caller (`_monitor_agent`) still invokes this ONLY on the developer path with a + fresh worktree commit; only the implementation under the hood is shared. The + actor uses the same ``head==branch AND base==main`` filter as ``merge_pr``, so + the developer-created PR and the one merge-verify merges are guaranteed to be + the same code-PR. Never raises (the actor is never-raise); ``failed`` -> None, + preserving the previous "best-effort, return None on failure" contract. + """ + from .. import merge_gate + status, detail = merge_gate.ensure_open_pr(repo, branch) + logger.info(f"_ensure_pr({branch}, run_id={run_id}) -> {status} ({detail})") + if status in ("created", "existed"): + try: + return int(detail) + except (TypeError, ValueError): + return None + logger.error(f"Failed to ensure PR for {branch}: {detail}") + return None def _write_task_file(self, repo: str, branch: str, task_file: str, content: str): """Write task file directly into the task's worktree. diff --git a/src/config.py b/src/config.py index 2fc010d..1b3b118 100644 --- a/src/config.py +++ b/src/config.py @@ -442,6 +442,22 @@ class Settings(BaseSettings): # merge_verify_repos), so non-self repos are a no-op. regression_guard_enabled: bool = True + # ORCH-082 (ADR-001 Р-5): guarantee an open code-PR BEFORE the deterministic + # merge_pr inside the merge-verify under-gate. The pipeline never guaranteed the + # branch had an open PR (head==branch, base==main) at merge time — PRs are created + # ONLY on the developer path with a fresh worktree commit (launcher._ensure_pr), + # so a branch (e.g. after a manual main restore / a bounce with no new commits) + # could reach merge-verify PR-less -> merge_pr returns "no open PR" -> a FALSE HOLD + # that ORCH-073 fail-closed correctly catches but should never have to. The + # idempotent leaf-actor merge_gate.ensure_open_pr creates/finds the code-PR ДО + # merge_pr; ORCH-073's SHA-in-main proof is untouched and stays authoritative. + # merge_verify_autocreate_pr_enabled -> kill-switch (env + # ORCH_MERGE_VERIFY_AUTOCREATE_PR_ENABLED). False -> exactly the pre-ORCH-082 + # behaviour (no auto-create; "no open PR" -> HOLD as before). Reuses the + # merge_verify_applies scope (self-hosting / merge_verify_repos) — no separate + # *_repos, since auto-create is semantically inseparable from merge-verify. + merge_verify_autocreate_pr_enabled: bool = True + # Telegram notifications telegram_bot_token: str = "" telegram_chat_id: str = "" diff --git a/src/merge_gate.py b/src/merge_gate.py index b8a51fe..2cb78ff 100644 --- a/src/merge_gate.py +++ b/src/merge_gate.py @@ -587,6 +587,101 @@ def merge_verify_applies(repo: str) -> bool: return False +def ensure_open_pr(repo: str, branch: str) -> tuple[str, str]: + """Guarantee an open **code-PR** (``head==branch`` AND ``base=="main"``) exists. + + ORCH-082 (ADR-001 Р-1 / FR-1): the idempotent leaf-actor that closes the missing + invariant "by merge-verify time the branch has an open code-PR". The pipeline used + to create a PR ONLY on the developer path with a fresh worktree commit + (``launcher._ensure_pr``), so a branch could reach the ``deploy -> done`` merge-verify + under-gate with no open code-PR -> ``merge_pr`` returned ``"no open PR"`` -> a FALSE + HOLD (the ORCH-074 incident). This actor creates/finds the code-PR ДО the + deterministic ``merge_pr``; ORCH-073's SHA-in-main proof stays authoritative. + + Algorithm (FR-1): + 1. ``GET …/pulls?state=open`` -> a PR with **``head.ref==branch`` AND + ``base.ref=="main"``**. The filter is **identical** to ``merge_pr``/ORCH-073 + FR-3 so both actors agree on exactly the same PR — an auto docs-PR + (``base != main``) is NOT a code-PR (AC-6). Found -> ``("existed", "")``. + 2. Otherwise ``POST …/pulls`` (``head=branch``, ``base=main``, auto title/body) -> + ``201`` -> ``("created", "")``. + 3. Idempotency on a race: a ``POST`` that fails because the PR already exists + (Gitea ``409``/``422``) -> a repeat ``GET`` (step 1) confirms the existing PR -> + ``("existed", …)``; no duplicate is created (AC-2 / FR-5). + 4. Any other HTTP/parse/network error -> ``("failed", "")``. + + Reuses ``settings.merge_pr_timeout_s`` (same class of Gitea calls as ``merge_pr``). + Never-raise (AC-7): any unexpected error -> ``("failed", str(e))``; the exception is + NEVER propagated into ``_handle_merge_verify`` / ``advance_stage``. + """ + try: + import httpx + owner = settings.gitea_owner + headers = {"Authorization": f"token {settings.gitea_token}"} + base = f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}" + timeout = settings.merge_pr_timeout_s + + def _find_open_code_pr() -> int | None: + """GET open PRs; return the code-PR number (head==branch AND base==main).""" + resp = httpx.get( + f"{base}/pulls", params={"state": "open"}, headers=headers, timeout=timeout + ) + if resp.status_code != 200: + return None + for pr in resp.json() or []: + if ( + pr.get("head", {}).get("ref") == branch + and pr.get("base", {}).get("ref") == "main" + ): + return pr.get("number") + return None + + # Step 1: an open code-PR already exists -> existed (no duplicate POST). + existing = _find_open_code_pr() + if existing is not None: + logger.info("ensure_open_pr: %s/%s already has open code-PR #%s", repo, branch, existing) + return "existed", str(existing) + + # Step 2: create the code-PR onto main. + parts = branch.split("/") + title = parts[-1] if parts else branch + m = httpx.post( + f"{base}/pulls", + json={ + "title": f"feat: {title}", + "head": branch, + "base": "main", + "body": f"Auto-created by orchestrator merge-verify for {branch}", + }, + headers=headers, + timeout=timeout, + ) + if m.status_code in (200, 201): + number = (m.json() or {}).get("number") + logger.info("ensure_open_pr: created PR #%s for %s/%s", number, repo, branch) + return "created", str(number) + + # Step 3: race / already-exists (409 conflict, 422 unprocessable) -> re-GET. + if m.status_code in (409, 422): + again = _find_open_code_pr() + if again is not None: + logger.info( + "ensure_open_pr: %s/%s PR already existed on retry (#%s, HTTP %s)", + repo, branch, again, m.status_code, + ) + return "existed", str(again) + + detail = (m.text or "").strip()[:200] + logger.warning( + "ensure_open_pr: create failed for %s/%s: HTTP %s %s", + repo, branch, m.status_code, detail, + ) + return "failed", f"create PR failed: HTTP {m.status_code}" + except Exception as e: # noqa: BLE001 - never-raise contract (AC-7) + logger.warning("ensure_open_pr unexpected error for %s/%s: %s", repo, branch, e) + return "failed", f"ensure_open_pr error: {e}" + + def merge_pr(repo: str, branch: str) -> tuple[bool, str]: """Deterministically merge the open PR for ``branch`` via the Gitea PR-merge API. @@ -730,6 +825,7 @@ MAIN_REGRESSION_MARKERS: list[tuple[str, str, str]] = [ ("ORCH-069", "qg0_title_max", "src/config.py"), ("ORCH-071", "verify_merged_to_main", "src/merge_gate.py"), ("ORCH-073", "check_main_regression", "src/merge_gate.py"), + ("ORCH-082", "ensure_open_pr", "src/merge_gate.py"), ] diff --git a/src/stage_engine.py b/src/stage_engine.py index c8427c8..e76af93 100644 --- a/src/stage_engine.py +++ b/src/stage_engine.py @@ -1321,6 +1321,52 @@ def _hold_main_regressed( return True +def _hold_pr_create_failed( + task_id, repo, work_item_id, branch, reason: str, result: AdvanceResult +) -> bool: + """HOLD the task because the open code-PR could not be ensured (ORCH-082 Р-3). + + FR-2/FR-4 (AC-5/AC-7): ``ensure_open_pr`` returned ``"failed"`` (Gitea unreachable / + HTTP error) — there is no open code-PR and one could not be created. Symmetric to the + not-merged / regressed HOLD: task stays on ``deploy`` (NOT done), NO rollback to + development, ALERT-only (Telegram + Plane ``set_issue_blocked`` + comment). The HOLD + text MUST be distinguishable from the not-merged HOLD so the operator sees the cause is + "could not CREATE the PR" (infra), not "could not MERGE an existing one". Returns + ``True`` (INTERVENED). Never breaks the HOLD on a notify error; ``failed`` is a + structured outcome, not a propagated exception (INV-1). + """ + merge_gate.note_not_merged_alert(work_item_id) # reuse the counter-notifier. + msg = ( + f"PR создать не удалось: {reason} (repo={repo}, branch={branch}, " + f"wi={work_item_id}). Открытый код-PR отсутствует и не создан — задача " + f"удержана на `deploy` (НЕ done). Нужно проверить доступность Gitea / создать PR." + ) + logger.warning(f"Task {task_id}: {msg}") + if work_item_id: + try: + set_issue_blocked(work_item_id) + except Exception as e: # noqa: BLE001 - never break the HOLD + logger.warning(f"Task {task_id}: set_issue_blocked failed: {e}") + try: + plane_add_comment( + work_item_id, + "\U0001f6a8 PR создать не удалось: " + reason + ". Открытый код-PR " + "отсутствует — задача удержана на `deploy` (НЕ done). Проверьте " + "доступность Gitea / создайте PR вручную и повторите approve.", + author="deployer", + ) + except Exception as e: # noqa: BLE001 - never break the HOLD + logger.warning(f"Task {task_id}: plane pr-create-failed comment failed: {e}") + try: + send_telegram(f"\U0001f6a8 {msg}") + except Exception as e: # noqa: BLE001 - never break the HOLD + logger.warning(f"Task {task_id}: pr-create-failed telegram failed: {e}") + result.alerted = True + result.note = "pr-create-failed-hold" + result.advanced = False + return True + + def _handle_merge_verify(task_id, repo, work_item_id, branch, result: AdvanceResult) -> bool: """ORCH-071 merge-verify under-gate on the `deploy -> done` edge. @@ -1353,6 +1399,24 @@ def _handle_merge_verify(task_id, repo, work_item_id, branch, result: AdvanceRes from . import image_freshness sha = image_freshness.validated_revision(repo, branch) + # ORCH-082 (Р-2 / FR-2): guarantee an open code-PR (head==branch, base==main) + # BEFORE the deterministic merge_pr. The pipeline never guaranteed the branch + # had one at merge time (PRs are created only on the developer path with a fresh + # commit) -> a PR-less branch hit merge_pr "no open PR" -> a FALSE HOLD (ORCH-074). + # `created`/`existed` -> proceed unchanged; `failed` -> honest HOLD with a + # distinguishable text (NOT the not-merged HOLD). ORCH-073's SHA-in-main proof + # below is untouched and stays authoritative. Kill-switch off -> 1:1 prior path. + if settings.merge_verify_autocreate_pr_enabled: + pr_status, pr_detail = merge_gate.ensure_open_pr(repo, branch) + logger.info( + f"Task {task_id}: merge-verify ensure_open_pr -> {pr_status} ({pr_detail})" + ) + if pr_status == "failed": + return _hold_pr_create_failed( + task_id, repo, work_item_id, branch, pr_detail, result + ) + # "created" | "existed" -> proceed normally to merge_pr. + # Deterministic merge-actor (no-op if the PR is already merged, INV-5/AC-9). merged_ok, merge_msg = merge_gate.merge_pr(repo, branch) logger.info( diff --git a/tests/conftest.py b/tests/conftest.py index 26b8255..216fe0f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -98,4 +98,10 @@ def _disable_merge_verify(monkeypatch): # _handle_merge_verify's confirmed branch. Default it OFF too so unrelated # deploy->done tests stay 1:1; the dedicated ORCH-073 tests re-enable it. monkeypatch.setattr(_cfg.settings, "regression_guard_enabled", False, raising=False) + # ORCH-082: the merge-verify ensure_open_pr врезка makes REAL Gitea calls before + # merge_pr. Default it OFF so unrelated deploy->done / merge-verify tests stay 1:1 + # (no network); the dedicated ORCH-082 tests re-enable it via their own monkeypatch. + monkeypatch.setattr( + _cfg.settings, "merge_verify_autocreate_pr_enabled", False, raising=False + ) yield diff --git a/tests/test_orch082_ensure_pr.py b/tests/test_orch082_ensure_pr.py new file mode 100644 index 0000000..a12644e --- /dev/null +++ b/tests/test_orch082_ensure_pr.py @@ -0,0 +1,163 @@ +"""ORCH-082 FR-1 — merge_gate.ensure_open_pr: idempotent open-code-PR actor. + +Covers TC-01..05 / AC-2 / AC-6 / AC-7. The actor guarantees an open code-PR +(``head==branch`` AND ``base=="main"``) exists before the deterministic ``merge_pr``, +without ever creating a duplicate. Gitea HTTP is mocked; the actor honours the strict +never-raise contract (any error -> ``("failed", reason)``). +""" + +import pytest + +from src import merge_gate + +REPO = "orchestrator" +BRANCH = "feature/ORCH-082-x" + + +class _Resp: + """Minimal httpx.Response stand-in (status_code + json/text).""" + + def __init__(self, status_code, payload=None, text=""): + self.status_code = status_code + self._payload = payload if payload is not None else [] + self.text = text + + def json(self): + return self._payload + + +@pytest.fixture(autouse=True) +def _settings(monkeypatch): + monkeypatch.setattr(merge_gate.settings, "merge_pr_timeout_s", 5) + monkeypatch.setattr(merge_gate.settings, "gitea_owner", "owner") + monkeypatch.setattr(merge_gate.settings, "gitea_token", "tok") + monkeypatch.setattr(merge_gate.settings, "gitea_url", "http://gitea.test") + + +def _install_httpx(monkeypatch, get_resp, post_resp=None, record=None): + """Patch merge_gate's lazily-imported httpx with stub get/post callables.""" + import httpx + + def fake_get(url, *a, **k): + if record is not None: + record.append(("GET", url, k.get("params"))) + return get_resp() if callable(get_resp) else get_resp + + def fake_post(url, *a, **k): + if record is not None: + record.append(("POST", url, k.get("json"))) + if post_resp is None: + raise AssertionError("POST must NOT be called") + return post_resp() if callable(post_resp) else post_resp + + monkeypatch.setattr(httpx, "get", fake_get) + monkeypatch.setattr(httpx, "post", fake_post) + + +# --------------------------------------------------------------------------- +# TC-01: no open code-PR -> POST creates one -> ("created", N); base==main filter. +# --------------------------------------------------------------------------- +def test_tc01_creates_pr_when_absent(monkeypatch): + record = [] + _install_httpx( + monkeypatch, + get_resp=_Resp(200, []), # no open PRs at all + post_resp=_Resp(201, {"number": 42}), + record=record, + ) + status, detail = merge_gate.ensure_open_pr(REPO, BRANCH) + assert (status, detail) == ("created", "42") + # POST body targets head=branch, base=main. + post = [r for r in record if r[0] == "POST"][0] + assert post[2]["head"] == BRANCH + assert post[2]["base"] == "main" + + +# --------------------------------------------------------------------------- +# TC-02: an open code-PR (head==branch AND base==main) already exists -> existed, +# POST is never called (no duplicate). +# --------------------------------------------------------------------------- +def test_tc02_existed_no_duplicate(monkeypatch): + payload = [{"number": 7, "head": {"ref": BRANCH}, "base": {"ref": "main"}}] + _install_httpx(monkeypatch, get_resp=_Resp(200, payload), post_resp=None) + status, detail = merge_gate.ensure_open_pr(REPO, BRANCH) + assert (status, detail) == ("existed", "7") # POST stub would raise if called + + +# --------------------------------------------------------------------------- +# TC-03 (AC-6): only a docs-PR (base != main) exists -> NOT a code-PR -> create on main. +# --------------------------------------------------------------------------- +def test_tc03_docs_pr_not_counted_creates_on_main(monkeypatch): + record = [] + # An open PR exists but onto a docs base, and another onto a different head. + docs_payload = [ + {"number": 9, "head": {"ref": BRANCH}, "base": {"ref": "docs/logs"}}, + {"number": 10, "head": {"ref": "other/branch"}, "base": {"ref": "main"}}, + ] + _install_httpx( + monkeypatch, + get_resp=_Resp(200, docs_payload), + post_resp=_Resp(201, {"number": 11}), + record=record, + ) + status, detail = merge_gate.ensure_open_pr(REPO, BRANCH) + assert (status, detail) == ("created", "11") + assert any(r[0] == "POST" for r in record) + + +# --------------------------------------------------------------------------- +# TC-04 (AC-7): Gitea GET/POST raise -> ("failed", reason), never raises. +# --------------------------------------------------------------------------- +def test_tc04_never_raise_on_get_error(monkeypatch): + import httpx + + def boom(*a, **k): + raise httpx.ConnectError("gitea down") + + monkeypatch.setattr(httpx, "get", boom) + monkeypatch.setattr(httpx, "post", boom) + status, detail = merge_gate.ensure_open_pr(REPO, BRANCH) + assert status == "failed" + assert detail # carries a reason + + +def test_tc04_never_raise_on_post_error(monkeypatch): + import httpx + + def boom_post(*a, **k): + raise httpx.ConnectError("post exploded") + + _install_httpx(monkeypatch, get_resp=_Resp(200, []), post_resp=None) + monkeypatch.setattr(httpx, "post", boom_post) + status, detail = merge_gate.ensure_open_pr(REPO, BRANCH) + assert status == "failed" + + +def test_tc04_failed_when_post_non_2xx(monkeypatch): + # A plain non-2xx, non-conflict POST -> failed (not silently swallowed). + _install_httpx( + monkeypatch, get_resp=_Resp(200, []), post_resp=_Resp(500, text="boom") + ) + status, detail = merge_gate.ensure_open_pr(REPO, BRANCH) + assert status == "failed" + assert "500" in detail + + +# --------------------------------------------------------------------------- +# TC-05 (AC-2 / FR-5): race -> POST returns 409/422 "PR exists" -> re-GET confirms +# the existing PR -> ("existed", N), no duplicate. +# --------------------------------------------------------------------------- +@pytest.mark.parametrize("conflict_code", [409, 422]) +def test_tc05_race_post_conflict_confirms_existing(monkeypatch, conflict_code): + # First GET: no PR (so we attempt POST). POST: conflict. Re-GET: PR now present. + gets = iter([ + _Resp(200, []), # first probe: absent + _Resp(200, [{"number": 99, "head": {"ref": BRANCH}, "base": {"ref": "main"}}]), + ]) + _install_httpx( + monkeypatch, + get_resp=lambda: next(gets), + post_resp=_Resp(conflict_code, text="pull request already exists"), + ) + status, detail = merge_gate.ensure_open_pr(REPO, BRANCH) + assert (status, detail) == ("existed", "99") diff --git a/tests/test_orch082_merge_verify_autocreate.py b/tests/test_orch082_merge_verify_autocreate.py new file mode 100644 index 0000000..2b038e3 --- /dev/null +++ b/tests/test_orch082_merge_verify_autocreate.py @@ -0,0 +1,183 @@ +"""ORCH-082 FR-2/FR-3/FR-4 — ensure_open_pr врезка in _handle_merge_verify. + +Covers TC-06..12 / AC-3 / AC-4 / AC-5 / AC-7 / AC-8 / AC-9 / FR-5. Calls the +``deploy -> done`` under-gate handler directly with mocked merge_gate primitives + +side effects (Plane/Telegram). Asserts the return contract: ``False`` == advance to +``done``, ``True`` == HOLD (alert, NOT done). The ORCH-073 SHA-in-main proof stays +authoritative — auto-creating a PR must NEVER mask un-merged code. +""" + +import os +import tempfile + +os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") +os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token") +os.environ.setdefault("ORCH_DB_PATH", os.path.join(tempfile.gettempdir(), "test_orch082.db")) + +import logging # noqa: E402 +from unittest.mock import MagicMock # noqa: E402 + +import pytest # noqa: E402 + +from src import stage_engine, image_freshness # noqa: E402 +from src.stage_engine import AdvanceResult, _handle_merge_verify # noqa: E402 + +REPO = "orchestrator" +WI = "ORCH-082" +BRANCH = "feature/ORCH-082-x" + + +@pytest.fixture(autouse=True) +def _wire(monkeypatch): + # Under-gate in scope; autocreate ON; regression guard OFF (its own tests cover it). + monkeypatch.setattr(stage_engine.merge_gate, "merge_verify_applies", lambda r: True) + monkeypatch.setattr(stage_engine.settings, "merge_verify_autocreate_pr_enabled", True) + monkeypatch.setattr(stage_engine.settings, "regression_guard_enabled", False) + monkeypatch.setattr(image_freshness, "validated_revision", lambda r, b: "deadbeef") + # Silence Plane/Telegram side effects (assert on .called where relevant). + for name in ("set_issue_blocked", "plane_add_comment", "send_telegram", "link_for"): + monkeypatch.setattr(stage_engine, name, MagicMock()) + monkeypatch.setattr( + stage_engine.self_deploy, "record_merged_to_main", MagicMock(return_value=True) + ) + + +# --------------------------------------------------------------------------- +# TC-06 (AC-3): PR absent -> ensure_open_pr creates -> merge_pr -> verify True -> +# deploy->done with NO false HOLD. +# --------------------------------------------------------------------------- +def test_tc06_autocreate_then_merge_then_done(monkeypatch): + ensure = MagicMock(return_value=("created", "5")) + merge = MagicMock(return_value=(True, "merged PR #5")) + monkeypatch.setattr(stage_engine.merge_gate, "ensure_open_pr", ensure) + monkeypatch.setattr(stage_engine.merge_gate, "merge_pr", merge) + monkeypatch.setattr(stage_engine.merge_gate, "verify_merged_to_main", lambda r, b, s: True) + + res = AdvanceResult() + intervened = _handle_merge_verify(1, REPO, WI, BRANCH, res) + + assert intervened is False # advance to done + assert res.alerted is False + ensure.assert_called_once_with(REPO, BRANCH) + assert merge.called + assert not stage_engine.set_issue_blocked.called + + +# --------------------------------------------------------------------------- +# TC-07 (AC-4 / FR-3): PR created/merged but verify_merged_to_main=False (code not +# in main) -> HOLD + set_issue_blocked, NOT done, no rollback. ORCH-073 protection +# is untouched by auto-create. +# --------------------------------------------------------------------------- +def test_tc07_verify_false_still_holds(monkeypatch): + monkeypatch.setattr(stage_engine.merge_gate, "ensure_open_pr", lambda r, b: ("created", "5")) + monkeypatch.setattr(stage_engine.merge_gate, "merge_pr", lambda r, b: (True, "merged PR #5")) + monkeypatch.setattr(stage_engine.merge_gate, "verify_merged_to_main", lambda r, b, s: False) + + res = AdvanceResult() + intervened = _handle_merge_verify(1, REPO, WI, BRANCH, res) + + assert intervened is True # HOLD + assert res.advanced is False + assert res.note == "merge-not-verified-hold" + assert stage_engine.set_issue_blocked.called + + +# --------------------------------------------------------------------------- +# TC-08 (AC-7 / AC-5): ensure_open_pr -> failed -> honest HOLD with distinguishable +# text/note; merge_pr is NOT reached; advance_stage does not raise. +# --------------------------------------------------------------------------- +def test_tc08_ensure_failed_holds_distinct(monkeypatch): + monkeypatch.setattr( + stage_engine.merge_gate, "ensure_open_pr", lambda r, b: ("failed", "gitea down") + ) + merge = MagicMock() + monkeypatch.setattr(stage_engine.merge_gate, "merge_pr", merge) + + res = AdvanceResult() + intervened = _handle_merge_verify(1, REPO, WI, BRANCH, res) + + assert intervened is True # HOLD + assert res.advanced is False + assert res.note == "pr-create-failed-hold" # distinct from "merge-not-verified-hold" + assert not merge.called # merge_pr never reached + assert stage_engine.set_issue_blocked.called + + +# --------------------------------------------------------------------------- +# TC-09 (AC-8): kill-switch OFF -> ensure_open_pr NOT called; "no open PR" -> prior +# HOLD 1:1 (ORCH-074 behaviour reproduced). +# --------------------------------------------------------------------------- +def test_tc09_killswitch_off_no_autocreate(monkeypatch): + monkeypatch.setattr(stage_engine.settings, "merge_verify_autocreate_pr_enabled", False) + ensure = MagicMock() + monkeypatch.setattr(stage_engine.merge_gate, "ensure_open_pr", ensure) + # merge_pr finds no open PR -> verify False -> prior not-merged HOLD. + monkeypatch.setattr(stage_engine.merge_gate, "merge_pr", lambda r, b: (False, "no open PR")) + monkeypatch.setattr(stage_engine.merge_gate, "verify_merged_to_main", lambda r, b, s: False) + + res = AdvanceResult() + intervened = _handle_merge_verify(1, REPO, WI, BRANCH, res) + + assert intervened is True + assert res.note == "merge-not-verified-hold" # exactly the prior HOLD + assert not ensure.called # auto-create skipped entirely + + +# --------------------------------------------------------------------------- +# TC-10 (AC-9): non-self repo (merge_verify_applies=False) -> врезка no-op, neither +# ensure_open_pr nor merge_pr called. +# --------------------------------------------------------------------------- +def test_tc10_non_self_repo_noop(monkeypatch): + monkeypatch.setattr(stage_engine.merge_gate, "merge_verify_applies", lambda r: False) + ensure = MagicMock() + merge = MagicMock() + monkeypatch.setattr(stage_engine.merge_gate, "ensure_open_pr", ensure) + monkeypatch.setattr(stage_engine.merge_gate, "merge_pr", merge) + + res = AdvanceResult() + intervened = _handle_merge_verify(1, "enduro-trails", "ET-1", "feature/x", res) + + assert intervened is False # advance unchanged + assert not ensure.called + assert not merge.called + + +# --------------------------------------------------------------------------- +# TC-11 (AC-2 / FR-5): idempotent re-drive (reaper/reconciler) -> ensure existed, +# merge_pr already-merged -> verify True -> done, no duplicate PR. +# --------------------------------------------------------------------------- +def test_tc11_idempotent_redrive(monkeypatch): + ensure = MagicMock(return_value=("existed", "5")) + monkeypatch.setattr(stage_engine.merge_gate, "ensure_open_pr", ensure) + monkeypatch.setattr(stage_engine.merge_gate, "merge_pr", lambda r, b: (True, "already-merged")) + monkeypatch.setattr(stage_engine.merge_gate, "verify_merged_to_main", lambda r, b, s: True) + + res = AdvanceResult() + intervened = _handle_merge_verify(1, REPO, WI, BRANCH, res) + + assert intervened is False # advance to done + assert ensure.return_value[0] == "existed" + assert not stage_engine.set_issue_blocked.called + + +# --------------------------------------------------------------------------- +# TC-12 (AC-5): logs distinguish created/existed/failed; the create-failed HOLD text +# differs from the not-merged HOLD text. +# --------------------------------------------------------------------------- +def test_tc12_logs_distinguish_outcomes(monkeypatch, caplog): + monkeypatch.setattr(stage_engine.merge_gate, "ensure_open_pr", lambda r, b: ("created", "5")) + monkeypatch.setattr(stage_engine.merge_gate, "merge_pr", lambda r, b: (True, "merged PR #5")) + monkeypatch.setattr(stage_engine.merge_gate, "verify_merged_to_main", lambda r, b, s: True) + + with caplog.at_level(logging.INFO, logger="orchestrator"): + _handle_merge_verify(1, REPO, WI, BRANCH, AdvanceResult()) + assert any("ensure_open_pr -> created" in r.message for r in caplog.records) + + # create-failed note differs from not-merged note (text-distinguishable HOLD). + monkeypatch.setattr( + stage_engine.merge_gate, "ensure_open_pr", lambda r, b: ("failed", "gitea down") + ) + res = AdvanceResult() + _handle_merge_verify(1, REPO, WI, BRANCH, res) + assert res.note == "pr-create-failed-hold" + assert res.note != "merge-not-verified-hold"