A DB stage=done task with 0 active jobs flapped in Plane between `Awaiting Deploy` and `Monitoring after Deploy` instead of holding `Done` (verified live on ORCH-061, task 47): the three deploy-phase setters were terminal-blind, so any stale, duplicate, or unknown caller using the bot token re-stamped an intermediate status over the terminal Done, indefinitely.

- New leaf src/deploy_status_guard.py (pure, never-raise, config-gated): decide() -> ALLOW | CONVERGE_DONE | SUPPRESS on entry to set_issue_awaiting_deploy / set_issue_deploying / set_issue_monitoring. A deploy-phase status is legitimate iff the task is non-terminal OR (done AND the post-deploy window is active); otherwise done converges to Done idempotently and cancelled is suppressed (FR-2, D1/D2).
- D3: move post_deploy.arm_monitor ABOVE the terminal-sync block in advance_stage so that window_active is True when the legitimate first Monitoring is set (the task is already DB-done by then); a re-drive after the window closes converges to Done.
- D4: run_post_deploy_monitor no-ops, without a status PATCH or re-queue, when the task became cancelled mid-window (zombie-tick guard, FR-3).
- D5: additive `reason` kwarg on the three setters + one structured log line per verdict (work_item/caller/target/db_stage/window_active/verdict); new read-only db.get_task_by_work_item_id; post_deploy.window_active helper.
- Flags: deploy_status_guard_enabled (kill-switch -> 1:1) / deploy_status_guard_repos (CSV; empty = self-hosting only).

STAGE_TRANSITIONS / QG_CHECKS / check_* / machine-verdict keys / DB schema are untouched (the guard reads the existing tasks.stage).

Tests: TC-01..TC-12 across 5 new test modules + config flags; updated the reason-kwarg assertions in test_deploy_terminal_sync / test_deploy_approve. Full regression suite green (1413).

Docs: CHANGELOG, CLAUDE.md, docs/architecture/README.md (status -> реализовано, i.e. "implemented"), .env.example.

Refs: ORCH-094
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
171 lines
7.4 KiB
Python
171 lines
7.4 KiB
Python
"""ORCH-094 — deterministic post-deploy-monitor termination (FR-3 / AC-3).

Covers (04-test-plan.yaml):

TC-06  after the window finishes (HEALTHY, ticks==budget -> set_issue_done +
       `done` marker) there are NO further status PATCHes for the task (a second
       tick is a no-op: 0 set_issue_* calls).
TC-07  a tick at DB stage=done with a closed window OR a task cancelled mid-window
       -> immediate no-op: no status PATCH and no next-tick enqueue (zombie-tick
       excluded).
TC-08  arm_monitor does not re-arm a task already in done (armed/done marker ->
       no-op), and a deploy->done re-drive after the window closed converges to
       Done instead of resurrecting Monitoring.
"""
|
|
import os
|
|
import tempfile
|
|
|
|
import pytest
|
|
|
|
# Environment for the modules imported below. The database path and repos dir
# are always overwritten; the two tokens are only filled in when the caller's
# environment does not already provide them.
_test_db = os.path.join(tempfile.gettempdir(), "test_post_deploy_termination.db")
os.environ.update(
    {
        "ORCH_DB_PATH": _test_db,
        "ORCH_REPOS_DIR": tempfile.gettempdir(),
    }
)
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
|
|
|
|
from unittest.mock import MagicMock # noqa: E402
|
|
|
|
import src.db as _db # noqa: E402
|
|
from src.db import init_db, get_db # noqa: E402
|
|
from src import stage_engine # noqa: E402
|
|
from src import post_deploy # noqa: E402
|
|
from src import config as cfg # noqa: E402
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def fresh_db(monkeypatch, tmp_path):
    """Reset the test database and post-deploy settings before every test.

    Autouse fixture: points ``_db.settings.db_path`` at ``_test_db``, deletes
    any existing file there and re-runs ``init_db()``. It then redirects the
    post_deploy repo directories to ``tmp_path``, sets window and interval to
    the same value, and replaces ``write_post_deploy_log`` with a mock.
    """
    # The path must be patched before init_db() runs.
    monkeypatch.setattr(_db.settings, "db_path", _test_db)
    if os.path.exists(_test_db):
        os.remove(_test_db)
    init_db()

    # Both repo directory settings go to pytest's per-test temp directory.
    sandbox = str(tmp_path)
    for dir_attr in ("repos_dir", "host_repos_dir"):
        monkeypatch.setattr(post_deploy.settings, dir_attr, sandbox)

    # Small window so the budget is 1 tick (window // interval).
    for module in (stage_engine, post_deploy):
        for timing_attr in ("post_deploy_window_s", "post_deploy_interval_s"):
            monkeypatch.setattr(module.settings, timing_attr, 10)

    # write_post_deploy_log touches a worktree/git; stub it.
    log_stub = MagicMock(return_value=True)
    monkeypatch.setattr(post_deploy, "write_post_deploy_log", log_stub)
    yield
|
|
|
|
|
|
@pytest.fixture
def spy_status(monkeypatch):
    """Replace stage_engine's status setters with mocks and return them.

    Returns:
        A dict mapping each patched setter name to the ``MagicMock`` installed
        on ``stage_engine`` under that name. ``_notify_post_deploy`` is mocked
        as well but is not part of the returned dict.
    """
    patched_names = (
        "set_issue_done",
        "set_issue_monitoring",
        "set_issue_awaiting_deploy",
        "set_issue_deploying",
        "set_issue_blocked",
    )
    spies = {setter_name: MagicMock() for setter_name in patched_names}
    for setter_name, spy in spies.items():
        monkeypatch.setattr(stage_engine, setter_name, spy)
    monkeypatch.setattr(stage_engine, "_notify_post_deploy", MagicMock())
    return spies
|
|
|
|
|
|
def _make_task(stage="done", repo="orchestrator", wi="ORCH-061", branch="feature/ORCH-061-x"):
    """Insert one row into ``tasks`` and return its row id.

    Args:
        stage: Value stored in the ``stage`` column.
        repo: Value stored in the ``repo`` column.
        wi: Work-item id; also used to build ``plane_id`` as ``plane-<wi>``.
        branch: Value stored in the ``branch`` column.

    Returns:
        The ``lastrowid`` of the inserted row.
    """
    conn = get_db()
    try:
        cur = conn.execute(
            "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) "
            "VALUES (?, ?, ?, ?, ?)",
            (f"plane-{wi}", wi, repo, branch, stage),
        )
        tid = cur.lastrowid
        conn.commit()
        return tid
    finally:
        # Close even when the INSERT/commit raises, so a failing test does not
        # leave an open handle on the shared database file.
        conn.close()
|
|
|
|
|
|
def _jobs():
    """Return the ``agent`` column of every row in ``jobs``, ordered by id."""
    conn = get_db()
    try:
        rows = conn.execute("SELECT agent FROM jobs ORDER BY id").fetchall()
    finally:
        # Release the connection even if the query fails.
        conn.close()
    return [row[0] for row in rows]
|
|
|
|
|
|
def _healthy(*_args, **_kwargs):
    """Return a fixed healthy ``ProbeResult``; all arguments are ignored.

    The tests install this in place of ``post_deploy.probe_signals``.
    """
    healthy_fields = {"health_ok": True, "total": 2, "fivexx": 0, "detail": "ok"}
    return post_deploy.ProbeResult(**healthy_fields)
|
|
|
|
|
|
# --- TC-06 ------------------------------------------------------------------
|
|
def test_tc06_clean_finish_then_no_more_patches(spy_status, monkeypatch):
    """TC-06: a finished window sets Done once; a repeated tick changes nothing."""
    repo, work_item = "orchestrator", "ORCH-061"
    done_spy = spy_status["set_issue_done"]
    monitoring_spy = spy_status["set_issue_monitoring"]

    monkeypatch.setattr(post_deploy, "probe_signals", _healthy)
    task_id = _make_task("done")
    post_deploy.write_marker(repo, work_item, post_deploy.ARMED, "armed")
    tick = {"task_id": task_id, "repo": repo, "id": 1, "agent": "post-deploy-monitor"}

    # Tick 1: budget==1, ticks==1 -> HEALTHY window exhausted -> finish.
    stage_engine.run_post_deploy_monitor(tick)
    done_spy.assert_called_once_with(work_item)
    assert post_deploy.has_marker(repo, work_item, post_deploy.DONE)
    # No next tick was enqueued (window exhausted).
    assert _jobs() == []

    # Tick 2 (e.g. duplicate job): DONE marker present -> no-op, ZERO new PATCHes.
    done_spy.reset_mock()
    stage_engine.run_post_deploy_monitor(tick)
    done_spy.assert_not_called()
    monitoring_spy.assert_not_called()
    assert _jobs() == []
|
|
|
|
|
|
# --- TC-07 ------------------------------------------------------------------
|
|
def test_tc07_cancelled_mid_window_is_noop(spy_status, monkeypatch):
    """TC-07: a tick for a cancelled task makes no setter call and queues no job.

    The task is created with stage ``cancelled`` while an ARMED marker exists.
    After one monitor tick, none of the spied setters may have been called, the
    DONE marker must be present, and the jobs table must be empty.
    """
    monkeypatch.setattr(post_deploy, "probe_signals", _healthy)
    tid = _make_task("cancelled")
    post_deploy.write_marker("orchestrator", "ORCH-061", post_deploy.ARMED, "armed")

    job = {"task_id": tid, "repo": "orchestrator", "id": 1, "agent": "post-deploy-monitor"}
    stage_engine.run_post_deploy_monitor(job)

    # Zombie-tick guard: window closed, NO status PATCH, NO next tick.
    # Name the setter in the message so a failure says which one was called.
    for name, setter in spy_status.items():
        assert not setter.called, f"{name} must not be called for a cancelled task"
    assert post_deploy.has_marker("orchestrator", "ORCH-061", post_deploy.DONE)
    assert _jobs() == []
|
|
|
|
|
|
def test_tc07_finished_window_is_noop(spy_status, monkeypatch):
    """TC-07: a tick arriving after the window already finished is a no-op."""
    repo, work_item = "orchestrator", "ORCH-061"
    monkeypatch.setattr(post_deploy, "probe_signals", _healthy)
    task_id = _make_task("done")

    # Window already finished (DONE marker present) -> no active basis to tick.
    post_deploy.write_marker(repo, work_item, post_deploy.ARMED, "armed")
    post_deploy.mark_done(repo, work_item)

    late_tick = {"task_id": task_id, "repo": repo, "id": 1, "agent": "post-deploy-monitor"}
    stage_engine.run_post_deploy_monitor(late_tick)

    for setter_name in ("set_issue_done", "set_issue_monitoring"):
        spy_status[setter_name].assert_not_called()
    assert _jobs() == []
|
|
|
|
|
|
# --- TC-08 ------------------------------------------------------------------
|
|
def test_tc08_arm_monitor_idempotent_no_rearm(monkeypatch):
    """TC-08: arming twice returns True then False and queues only one job."""
    task_id = _make_task("done")
    arm_args = ("orchestrator", "ORCH-061", "feature/ORCH-061-x", task_id)
    one_monitor_job = ["post-deploy-monitor"]

    # First arm: writes ARMED + enqueues tick 1.
    first_result = post_deploy.arm_monitor(*arm_args)
    assert first_result is True
    assert _jobs() == one_monitor_job

    # Second arm (re-drive deploy->done): ARMED present -> no-op, no new job.
    second_result = post_deploy.arm_monitor(*arm_args)
    assert second_result is False
    assert _jobs() == one_monitor_job
|
|
|
|
|
|
def test_tc08_redrive_after_window_closed_converges(spy_status, monkeypatch):
    """TC-08: a Monitoring request after the window closed ends up as Done.

    Calls the real ``plane_sync.set_issue_monitoring`` with its direct state
    writer and ``set_issue_done`` mocked, then checks that the direct writer is
    never called and ``set_issue_done`` is called exactly once.
    """
    repo, work_item = "orchestrator", "ORCH-061"

    # Guard ON, self-hosting.
    monkeypatch.setattr(cfg.settings, "deploy_status_guard_enabled", True, raising=False)
    monkeypatch.setattr(cfg.settings, "deploy_status_guard_repos", "", raising=False)
    _make_task("done")

    # Window armed then closed (a completed post-deploy observation).
    post_deploy.write_marker(repo, work_item, post_deploy.ARMED, "armed")
    post_deploy.mark_done(repo, work_item)

    # A stale re-drive calling the REAL guarded setter must converge to Done,
    # not resurrect Monitoring. Only its collaborators are replaced here.
    from src import plane_sync

    def fake_resolve_project_id(w=None, p=None):
        return "proj-1"

    def fake_project_states(pid):
        return {"monitoring": "S-mon"}

    direct_write = MagicMock()
    converge_done = MagicMock()
    monkeypatch.setattr(plane_sync, "_set_issue_state_direct", direct_write)
    monkeypatch.setattr(plane_sync, "set_issue_done", converge_done)
    monkeypatch.setattr(plane_sync, "_resolve_project_id", fake_resolve_project_id)
    monkeypatch.setattr(plane_sync, "get_project_states", fake_project_states)

    plane_sync.set_issue_monitoring(work_item, reason="advance:deploy->done")

    direct_write.assert_not_called()
    converge_done.assert_called_once_with(work_item)
|