Конвейер продвигается только входящими webhook; потерянное событие (502 на ребилде, отсутствие ретраев у Plane/Gitea, неразрезолвленный sha→branch) оставляет задачу молча застрявшей (класс инцидента ORCH-044). Новый фоновый daemon-поток src/reconciler.py (паттерн queue_worker) доигрывает пропущенный переход через те же штатные гейты/обработчики, что и webhook: - F-1 gate-side: для задач stage≠done, без активного job и age(updated_at) ≥ grace_for_stage(stage) — read-only пред-оценка канонического QG; зелёный → stage_engine.advance_stage(..., finished_agent=None); красный → тишина (спам нотификаций структурно невозможен). analysis F-1 не трогает (человеческий гейт). - F-2 plane-side: опрос Plane API per-project (plane_sync.list_issues_by_state, курсорная пагинация, never-raise) → реплей In Progress/Approved/Rejected через существующие handle_status_start/handle_verdict (async из sync-потока, asyncio.run). - F-3: усиление sha→branch в handle_ci_status — БД-fallback по единственной development-задаче repo (неоднозначность → не резолвим), debug→info. - Анти-дубль на создании (db.create_task_atomic под process-wide Lock): гонка reconcile↔webhook не плодит второй task/branch/worktree/analyst-job (AC-4). - F-4 observability: лог-строка разблокировки + Telegram + блок reconcile в /queue. Старт/стоп в main.lifespan (после worker.start() / перед worker.stop()), restart-safe, never-raise на единицу работы. Kill-switches ORCH_RECONCILE_ENABLED / ORCH_RECONCILE_PLANE_ENABLED + grace-настройки. Схема БД и реестры STAGE_TRANSITIONS/QG_CHECKS не менялись. Тесты: test_reconciler.py, test_reconciler_plane.py, test_gitea_sha_resolve.py, test_config.py (33 новых, 563 всего зелёные). Документация обновлена (golden source): architecture/README.md, INFRA.md, README.md, CHANGELOG.md, adr-0007 → accepted. Refs: ORCH-053 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
380 lines
14 KiB
Python
380 lines
14 KiB
Python
"""ORCH-053: tests for the gate-side stuck-task reconciler (F-1) + lifecycle.
|
|
|
|
These cover the F-1 sweeper (``Reconciler.reconcile_gate_once``), the per-stage
|
|
grace / config (``grace_for_stage``), the no-spam guarantee, the analysis carve-
|
|
out (AC-16), never-raise isolation, the kill-switch, the unblock observability
|
|
(AC-12 / F-4) and the restart-safe daemon thread (AC-11).
|
|
|
|
Everything that touches the network (the quality gate, Plane sync, Telegram) is
|
|
mocked at the src.stage_engine / src.reconciler level so the reconciler runs
|
|
against a real isolated sqlite DB (same convention as test_stage_engine.py).
|
|
"""
|
|
|
|
import os
|
|
import tempfile
|
|
|
|
import pytest
|
|
|
|
# Point the orchestrator at an isolated sqlite file. This must run BEFORE any
# src.* import so that settings picks the values up.
_test_db = os.path.join(tempfile.gettempdir(), "test_orchestrator_reconciler.db")
os.environ.update(
    {
        "ORCH_DB_PATH": _test_db,
        "ORCH_REPOS_DIR": tempfile.gettempdir(),
    }
)
# The tokens only need to be present; a value already in the environment wins.
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
|
|
|
|
from unittest.mock import MagicMock # noqa: E402
|
|
|
|
import src.db as _db # noqa: E402
|
|
from src.db import init_db, get_db, enqueue_job # noqa: E402
|
|
from src import stage_engine # noqa: E402
|
|
from src import reconciler as reconciler_mod # noqa: E402
|
|
from src.reconciler import Reconciler, grace_for_stage # noqa: E402
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fixtures
|
|
# ---------------------------------------------------------------------------
|
|
@pytest.fixture(autouse=True)
def fresh_db(monkeypatch):
    """Start every test from an empty, freshly initialised database.

    Points ``settings.db_path`` at the module's test DB file, removes whatever
    file a previous test left at that path, and runs ``init_db()`` before
    handing control to the test.
    """
    monkeypatch.setattr(_db.settings, "db_path", _test_db)
    try:
        os.unlink(_test_db)
    except FileNotFoundError:
        # Nothing to clean up: this is the first test to touch the file.
        pass
    init_db()
    yield
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def silence_side_effects(monkeypatch):
    """Mute the engine's Plane/Telegram/notification hooks.

    Every listed attribute of ``stage_engine`` is replaced with its own
    ``MagicMock`` so the real ``advance_stage`` can run deterministically and
    offline. ``raising=False`` keeps the fixture working even if the module
    does not define one of the names.
    """
    muted_hooks = (
        "notify_stage_change",
        "notify_qg_failure",
        "notify_approve_requested",
        "notify_error",
        "send_telegram",
        "plane_notify_stage",
        "plane_notify_qg",
        "plane_add_comment",
        "set_issue_in_review",
        "set_issue_needs_input",
        "set_issue_in_progress",
        "set_issue_blocked",
        "set_issue_done",
    )
    for hook in muted_hooks:
        # A separate mock per hook lets tests assert on each one individually.
        monkeypatch.setattr(stage_engine, hook, MagicMock(), raising=False)
|
|
|
|
|
|
def _make_task(stage, *, repo="enduro-trails", branch="feature/ET-001-x",
               wi="ET-001", age_s=None):
    """Insert a task row and return its id.

    Args:
        stage: value stored in the ``stage`` column.
        repo: value stored in the ``repo`` column.
        branch: value stored in the ``branch`` column.
        wi: work item id; also used to derive ``plane_id`` as ``plane-<wi>``.
        age_s: when given, ``updated_at`` is backdated by that many seconds
            (truncated to an int) relative to the database's ``now``.

    Returns:
        The ``lastrowid`` of the inserted task.
    """
    conn = get_db()
    try:
        cur = conn.execute(
            "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) "
            "VALUES (?, ?, ?, ?, ?)",
            (f"plane-{wi}", wi, repo, branch, stage),
        )
        task_id = cur.lastrowid
        if age_s is not None:
            conn.execute(
                "UPDATE tasks SET updated_at = datetime('now', ?) WHERE id = ?",
                (f"-{int(age_s)} seconds", task_id),
            )
        conn.commit()
    finally:
        # Release the connection even when a statement fails, so one broken
        # test does not leave a handle open on the shared DB file.
        conn.close()
    return task_id
|
|
|
|
|
|
def _stage_of(task_id):
    """Return the current ``stage`` of the task with the given id."""
    conn = get_db()
    try:
        row = conn.execute(
            "SELECT stage FROM tasks WHERE id = ?", (task_id,)
        ).fetchone()
    finally:
        # Close the connection even if the query itself raises.
        conn.close()
    return row["stage"]
|
|
|
|
|
|
def _jobs_for(task_id, agent=None):
    """Return the jobs of a task as a list of plain dicts.

    Args:
        task_id: id of the task whose jobs are fetched.
        agent: optional agent name; when truthy only that agent's jobs are
            returned, otherwise every job of the task.
    """
    if agent:
        query = "SELECT * FROM jobs WHERE task_id = ? AND agent = ?"
        params = (task_id, agent)
    else:
        query = "SELECT * FROM jobs WHERE task_id = ?"
        params = (task_id,)

    conn = get_db()
    try:
        rows = conn.execute(query, params).fetchall()
    finally:
        # Close the connection even if the query itself raises.
        conn.close()
    return [dict(r) for r in rows]
|
|
|
|
|
|
def _green_ci(monkeypatch, value=(True, "CI green")):
    """Stub the ``check_ci_green`` entry of ``QG_CHECKS`` and return the stub.

    The stub is a ``MagicMock`` that returns ``value`` on every call.
    """
    ci_check = MagicMock(return_value=value)
    monkeypatch.setitem(stage_engine.QG_CHECKS, "check_ci_green", ci_check)
    return ci_check
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-01: happy path — stuck development task is advanced to review
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc01_advances_stuck_development_task(monkeypatch):
    """A long-idle development task with green CI is moved to review."""
    _green_ci(monkeypatch)
    stuck_id = _make_task("development", age_s=3600)  # well past grace

    Reconciler().reconcile_gate_once()

    assert _stage_of(stuck_id) == "review"
    # Exactly one reviewer job must have been queued for the task.
    assert len(_jobs_for(stuck_id, "reviewer")) == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-02: source of truth is the gate — advance goes through advance_stage
|
|
# with finished_agent=None (no own update_task_stage/enqueue_job).
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc02_advances_via_advance_stage_finished_agent_none(monkeypatch):
    """The sweep advances through ``advance_stage`` with ``finished_agent=None``."""
    _green_ci(monkeypatch)
    # advance_if_gate_passed resolves advance_stage as a module global, so the
    # spy has to be installed on the stage_engine module itself.
    advance_spy = MagicMock(wraps=stage_engine.advance_stage)
    monkeypatch.setattr(stage_engine, "advance_stage", advance_spy)

    task_id = _make_task("development", age_s=3600)
    Reconciler().reconcile_gate_once()

    assert advance_spy.call_count == 1
    positional, keyword = advance_spy.call_args
    # finished_agent must be None (the webhook path).
    assert keyword.get("finished_agent", "MISSING") is None
    assert positional[0] == task_id
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-03: task with an active job is skipped — gate not evaluated, no advance.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc03_active_job_skipped(monkeypatch):
    """A task with an active job is skipped: no gate check and no advance."""
    ci_check = _green_ci(monkeypatch)
    advance_spy = MagicMock(wraps=stage_engine.advance_stage)
    monkeypatch.setattr(stage_engine, "advance_stage", advance_spy)

    busy_id = _make_task("development", age_s=3600)
    # A queued job counts as active.
    enqueue_job("reviewer", "enduro-trails", task_id=busy_id)

    Reconciler().reconcile_gate_once()

    assert _stage_of(busy_id) == "development"
    ci_check.assert_not_called()
    advance_spy.assert_not_called()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-04: per-stage grace — fresh task untouched, at-threshold task eligible.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc04_grace_boundary(monkeypatch):
    """With a 600 s default grace, only the task older than grace advances."""
    monkeypatch.setattr(reconciler_mod.settings, "reconcile_grace_default_s", 600)
    _green_ci(monkeypatch)

    # 10 s old: younger than grace, must stay put.
    young_id = _make_task(
        "development", branch="feature/ET-002-fresh", wi="ET-002", age_s=10
    )
    # 3600 s old: past grace, must be advanced.
    old_id = _make_task(
        "development", branch="feature/ET-003-stuck", wi="ET-003", age_s=3600
    )

    Reconciler().reconcile_gate_once()

    assert _stage_of(young_id) == "development"
    assert _stage_of(old_id) == "review"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-05: grace_for_stage reads overrides JSON; bad JSON -> default, no crash.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc05_grace_for_stage_overrides(monkeypatch):
    """Per-stage grace comes from the overrides JSON; other stages use the default."""
    cfg = reconciler_mod.settings
    monkeypatch.setattr(cfg, "reconcile_grace_default_s", 600)
    monkeypatch.setattr(
        cfg, "reconcile_grace_overrides_json", '{"development": 30, "review": 7200}'
    )

    expected_grace = {
        "development": 30,
        "review": 7200,
        "testing": 600,  # missing key -> default
    }
    for stage, seconds in expected_grace.items():
        assert grace_for_stage(stage) == seconds
|
|
|
|
|
|
def test_tc05_grace_for_stage_invalid_json_falls_back(monkeypatch):
    """Malformed overrides JSON yields the default grace without raising."""
    cfg = reconciler_mod.settings
    monkeypatch.setattr(cfg, "reconcile_grace_default_s", 600)
    monkeypatch.setattr(cfg, "reconcile_grace_overrides_json", "{not valid json")

    # Must not raise, must fall back to the default.
    resolved = grace_for_stage("development")
    assert resolved == 600
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-06: no spam — a stable-red gate never advances and never notifies, even
|
|
# across many ticks.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc06_red_gate_no_spam(monkeypatch):
    """Repeated sweeps over a red gate neither advance the task nor notify."""
    _green_ci(monkeypatch, value=(False, "CI red"))
    task_id = _make_task("development", age_s=3600)

    sweeper = Reconciler()
    ticks = 5
    for _tick in range(ticks):
        sweeper.reconcile_gate_once()

    assert _stage_of(task_id) == "development"
    # The QG-failure notification branch inside advance_stage must never fire,
    # because advance_if_gate_passed returns None on a red gate (no advance call).
    stage_engine.notify_qg_failure.assert_not_called()
    stage_engine.plane_notify_qg.assert_not_called()
    assert sweeper.unblocked_total == 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-07: silence when in sync — done / busy / within-grace tasks => no advance.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc07_silence_when_in_sync(monkeypatch):
    """Done, within-grace and busy tasks are all left alone by the sweep.

    Three tasks are created: one already ``done``, one fresh (5 s old) and one
    old but with a queued job. The sweep must not call ``advance_stage``, must
    not count any unblock, and must leave every task's stage as it was.
    """
    _green_ci(monkeypatch)
    spy = MagicMock(wraps=stage_engine.advance_stage)
    monkeypatch.setattr(stage_engine, "advance_stage", spy)

    done = _make_task("done", branch="feature/ET-010-done", wi="ET-010", age_s=3600)
    fresh = _make_task("development", branch="feature/ET-011-fresh",
                       wi="ET-011", age_s=5)
    busy = _make_task("development", branch="feature/ET-012-busy",
                      wi="ET-012", age_s=3600)
    enqueue_job("reviewer", "enduro-trails", task_id=busy)

    rec = Reconciler()
    rec.reconcile_gate_once()

    spy.assert_not_called()
    assert rec.unblocked_total == 0
    # Check each task's stage, not just the fresh one, so a stage change that
    # bypasses advance_stage would still be caught.
    assert _stage_of(done) == "done"
    assert _stage_of(fresh) == "development"
    assert _stage_of(busy) == "development"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-08 (AC-16): F-1 never advances the human analysis gate.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc08_analysis_not_advanced_by_f1(monkeypatch):
    """AC-16: the gate sweep leaves a task in the analysis stage untouched."""
    # Even if the analysis gate would "pass", F-1 must not touch analysis.
    approving_check = MagicMock(return_value=(True, "approved"))
    monkeypatch.setitem(
        stage_engine.QG_CHECKS, "check_analysis_approved", approving_check
    )
    advance_spy = MagicMock(wraps=stage_engine.advance_stage)
    monkeypatch.setattr(stage_engine, "advance_stage", advance_spy)

    analysis_id = _make_task("analysis", age_s=3600)
    Reconciler().reconcile_gate_once()

    assert _stage_of(analysis_id) == "analysis"
    advance_spy.assert_not_called()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-09: never-raise — one task blowing up does not stop the others.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc09_never_raise_isolates_failure(monkeypatch):
    """A task whose gate evaluation raises does not stop the rest of the sweep."""
    attempted = []

    def _explode(task_id, stage, repo, wi, branch):
        # Record the attempt first, then fail as a broken gate would.
        attempted.append(task_id)
        raise RuntimeError("boom")

    monkeypatch.setattr(reconciler_mod, "advance_if_gate_passed", _explode)

    first = _make_task(
        "development", branch="feature/ET-020-a", wi="ET-020", age_s=3600
    )
    second = _make_task(
        "development", branch="feature/ET-021-b", wi="ET-021", age_s=3600
    )

    # Must not raise despite both tasks raising inside advance_if_gate_passed.
    Reconciler().reconcile_gate_once()

    # Both tasks were attempted.
    assert set(attempted) == {first, second}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-10: kill-switches.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc10_kill_switch_disables_gate(monkeypatch):
    """With ``reconcile_enabled`` off, the gate sweep advances nothing."""
    monkeypatch.setattr(reconciler_mod.settings, "reconcile_enabled", False)
    advance_spy = MagicMock(wraps=stage_engine.advance_stage)
    monkeypatch.setattr(stage_engine, "advance_stage", advance_spy)
    # The gate itself would pass, so only the kill-switch can hold the task back.
    _green_ci(monkeypatch)

    stuck_id = _make_task("development", age_s=3600)
    Reconciler().reconcile_gate_once()

    assert _stage_of(stuck_id) == "development"
    advance_spy.assert_not_called()
|
|
|
|
|
|
def test_tc10_plane_switch_mutes_only_f2(monkeypatch):
    """Turning the Plane switch off silences F-2 while F-1 keeps working."""
    cfg = reconciler_mod.settings
    monkeypatch.setattr(cfg, "reconcile_enabled", True)
    monkeypatch.setattr(cfg, "reconcile_plane_enabled", False)

    project_pass = MagicMock()
    monkeypatch.setattr(
        reconciler_mod.Reconciler, "_reconcile_plane_project", project_pass
    )
    # F-2 muted -> reconcile_plane_once is a no-op.
    Reconciler().reconcile_plane_once()
    project_pass.assert_not_called()

    # F-1 still runs.
    _green_ci(monkeypatch)
    stuck_id = _make_task("development", age_s=3600)
    Reconciler().reconcile_gate_once()
    assert _stage_of(stuck_id) == "review"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-20: observability — explicit unblock log line + telegram (AC-12 / F-4).
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc20_unblock_logs_and_notifies(monkeypatch, caplog):
    """An unblock writes the AC-12 log line, updates counters and sends Telegram."""
    _green_ci(monkeypatch)
    monkeypatch.setattr(reconciler_mod.settings, "reconcile_notify_unblock", True)
    telegram = MagicMock()
    monkeypatch.setattr(reconciler_mod, "send_telegram", telegram)

    _make_task("development", wi="ET-042", age_s=3600)

    sweeper = Reconciler()
    with caplog.at_level("INFO", logger="orchestrator.reconciler"):
        sweeper.reconcile_gate_once()

    # Exact AC-12 contract string.
    expected_line = "reconciler: ET-042 development разблокирована (потерян webhook)"
    assert expected_line in caplog.text
    assert sweeper.unblocked_total == 1
    assert sweeper.last_unblocked == "ET-042"
    telegram.assert_called_once()
|
|
|
|
|
|
def test_tc20_no_telegram_when_disabled(monkeypatch):
    """With ``reconcile_notify_unblock`` off, no Telegram message is sent."""
    _green_ci(monkeypatch)
    monkeypatch.setattr(reconciler_mod.settings, "reconcile_notify_unblock", False)
    telegram = MagicMock()
    monkeypatch.setattr(reconciler_mod, "send_telegram", telegram)

    _make_task("development", wi="ET-043", age_s=3600)
    Reconciler().reconcile_gate_once()

    telegram.assert_not_called()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-21: restart-safe daemon thread — start/stop/idempotent start.
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc21_daemon_thread_lifecycle(monkeypatch):
    """The reconciler thread starts, ignores a second start, and stops.

    Checks that ``start()`` leaves a live ``_thread``, that calling ``start()``
    again keeps the very same thread object, and that the thread is no longer
    alive after ``stop()``.
    """
    # Avoid any real work in the loop: disable both branches, big interval.
    # The Plane switch is turned off explicitly so the loop stays idle whether
    # or not ``reconcile_enabled`` alone also gates the Plane branch.
    monkeypatch.setattr(reconciler_mod.settings, "reconcile_enabled", False)
    monkeypatch.setattr(reconciler_mod.settings, "reconcile_plane_enabled", False)
    rec = Reconciler(interval_s=60)

    rec.start()
    try:
        assert rec._thread is not None and rec._thread.is_alive()
        first_thread = rec._thread

        # Idempotent: a second start does not spawn a new thread.
        rec.start()
        assert rec._thread is first_thread
    finally:
        # Always stop the loop, so a failed assertion above does not leave the
        # reconciler thread running while later tests execute.
        rec.stop(timeout=5.0)

    assert not first_thread.is_alive()
|