feat(post-deploy): post-deploy prod monitoring + degradation reaction (ORCH-021)

Extend pipeline responsibility past deploy->done: after the terminal
transition for an applicable repo, arm a ~15min observation window that
probes prod and reacts to a degradation the restart-time health-check
missed ("green deploy, red prod").

- src/post_deploy.py: new leaf module (config + lazy qg/db only).
  Sentinel-file restart-safe state (.post-deploy-state-<repo>/<wi>/),
  no DB migration. probe_signals/classify/decide_action/run_rollback,
  all never-raise.
- Reserved-agent job `post-deploy-monitor` (no-LLM, Variant B, modelled on
  deploy-finalizer): self-requeues each tick via enqueue_job.
- Deterministic classify: DEGRADED iff >= fail_threshold consecutive
  health failures OR window 5xx ratio > 5xx_threshold; fail-safe HEALTHY.
- Self-hosting invariant (BR-5/AC-8): a tick NEVER restarts the prod
  orchestrator container -> orchestrator is ALWAYS ALERT_ONLY.
- Conditionality (ORCH-35/36/43/58): gated by a kill-switch plus a CSV list of
  repos; an empty list means the self-hosting repo only.
- QG_CHECKS / STAGE_TRANSITIONS / schema unchanged (AC-12).
- Docs: CHANGELOG, CLAUDE artefact list (16-post-deploy-log.md),
  architecture README, .env.example (ORCH_POST_DEPLOY_*).

Refs: ORCH-021

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-06-07 14:16:12 +00:00
committed by Dev Agent
parent a4ad55c862
commit 8273c1fc9d
12 changed files with 1322 additions and 3 deletions

View File

@@ -90,6 +90,10 @@ def test_tc17_success_deploy_syncs_terminal_done(monkeypatch):
# Spy the merge-lease release to confirm the terminal-sync still frees it.
release = MagicMock()
monkeypatch.setattr(stage_engine.merge_gate, "release_merge_lease", release)
# ORCH-021 arms an orthogonal post-deploy-monitor reserved job at deploy->done
# for the self-hosting repo; disable it here so this test stays focused on the
# ORCH-036 terminal-sync contract (no PIPELINE agent launched leaving deploy).
monkeypatch.setattr(stage_engine.post_deploy.settings, "post_deploy_monitor_enabled", False)
task_id = _make_task("deploy")
stage_engine.run_deploy_finalizer(

210
tests/test_post_deploy.py Normal file
View File

@@ -0,0 +1,210 @@
"""ORCH-021 unit tests — post-deploy monitor pure logic (TC-01..TC-15).
The deterministic, network-free core (classification + reaction decision +
exit-code mapping + artefact frontmatter + never-raise) of ``src/post_deploy.py``.
Network probes and the rollback hook are exercised via mocks; the classifier is
the main subject (mirrors compute_staging_verdict in ORCH-061).
"""
import os
import tempfile
import pytest
import yaml
# Provide placeholder API tokens BEFORE importing the module under test
# (presumably the settings singleton reads them at import time — confirm).
# setdefault keeps any value the environment already supplies. The per-test
# repos_dir isolation itself is done by the _tmp_state fixture below.
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
from src import post_deploy # noqa: E402
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _probe(health_ok=True, total=2, fivexx=0):
return {"health_ok": health_ok, "total": total, "fivexx": fivexx}
@pytest.fixture(autouse=True)
def _tmp_state(monkeypatch, tmp_path):
    """Redirect both repos-dir settings to the per-test temporary directory."""
    sandbox = str(tmp_path)
    for option in ("repos_dir", "host_repos_dir"):
        monkeypatch.setattr(post_deploy.settings, option, sandbox)
    yield
# ---------------------------------------------------------------------------
# TC-01..TC-05 — classification (the core)
# ---------------------------------------------------------------------------
def test_tc01_healthy_no_failures():
    """Five all-green probes classify as HEALTHY."""
    green = []
    for _ in range(5):
        green.append(_probe())
    verdict = post_deploy.classify(green, fail_threshold=3, fivexx_threshold=0.5)
    assert verdict == "HEALTHY"
def test_tc02_degraded_consecutive_health_failures():
    """Exactly ``fail_threshold`` consecutive failed probes already trip DEGRADED."""
    # The contract is ">=", so hitting the threshold exactly must be enough.
    failing = []
    for _ in range(3):
        failing.append(_probe(health_ok=False))
    verdict = post_deploy.classify(failing, fail_threshold=3, fivexx_threshold=0.5)
    assert verdict == "DEGRADED"
def test_tc03_degraded_by_5xx_ratio_even_when_health_200():
    """A 5xx ratio above the threshold is DEGRADED even with health_ok True."""
    # Every probe reports a passing health check, yet 2 of 2 requests are 5xx.
    noisy = []
    for _ in range(3):
        noisy.append(_probe(health_ok=True, total=2, fivexx=2))
    verdict = post_deploy.classify(noisy, fail_threshold=10, fivexx_threshold=0.5)
    assert verdict == "DEGRADED"
def test_tc04_no_false_trip_single_glitch_then_recovery():
    """One isolated failure between healthy probes must not trip the classifier."""
    # A single failure is below the threshold of 3 used here.
    glitch = _probe(health_ok=False)
    timeline = [_probe(), glitch, _probe(), _probe()]
    verdict = post_deploy.classify(timeline, fail_threshold=3, fivexx_threshold=0.5)
    assert verdict == "HEALTHY"
def test_tc05_thresholds_change_verdict_on_same_data():
    """The same two-failure series flips verdict when the threshold changes (AC-11)."""
    two_fails = [_probe(health_ok=False) for _ in range(2)]
    # (fail_threshold, expected verdict) — checked in this order.
    cases = ((3, "HEALTHY"), (2, "DEGRADED"))
    for threshold, expected in cases:
        verdict = post_deploy.classify(
            two_fails, fail_threshold=threshold, fivexx_threshold=0.5
        )
        assert verdict == expected
def test_classify_uses_settings_thresholds(monkeypatch):
    """Thresholds taken from the settings object drive the verdict."""
    # Verifies the wiring point: the tick reads thresholds from Settings (env ORCH_*).
    monkeypatch.setattr(post_deploy.settings, "post_deploy_fail_threshold", 2)
    cfg = post_deploy.settings
    failing_pair = [_probe(health_ok=False) for _ in range(2)]
    verdict = post_deploy.classify(
        failing_pair,
        cfg.post_deploy_fail_threshold,
        cfg.post_deploy_5xx_threshold,
    )
    assert verdict == "DEGRADED"
# ---------------------------------------------------------------------------
# TC-06..TC-08 — reaction decision (self-hosting safety)
# ---------------------------------------------------------------------------
def test_tc06_nonself_auto_rollback_degraded_rolls_back(monkeypatch):
    """With auto-rollback on, a DEGRADED non-self repo gets ROLLBACK."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True)
    action = post_deploy.decide_action("enduro-trails", "DEGRADED")
    assert action == "ROLLBACK"
def test_tc07_self_hosting_degraded_never_rolls_back(monkeypatch):
    """The self-hosting repo stays ALERT_ONLY even when auto-rollback is on."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True)
    action = post_deploy.decide_action("orchestrator", "DEGRADED")
    assert action == "ALERT_ONLY"
def test_tc08_healthy_means_none_for_any_repo():
    """A HEALTHY verdict yields the NONE action for every repo."""
    for repo in ("orchestrator", "enduro-trails"):
        assert post_deploy.decide_action(repo, "HEALTHY") == "NONE"
def test_nonself_default_policy_alert_only(monkeypatch):
    """With auto-rollback off, a DEGRADED non-self repo is only alerted on."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", False)
    action = post_deploy.decide_action("enduro-trails", "DEGRADED")
    assert action == "ALERT_ONLY"
# ---------------------------------------------------------------------------
# TC-09..TC-10 — conditionality / kill-switch
# ---------------------------------------------------------------------------
def test_tc09_applies_empty_repos_only_self_hosting(monkeypatch):
    """An empty repo list restricts the monitor to the self-hosting repo."""
    cfg = post_deploy.settings
    monkeypatch.setattr(cfg, "post_deploy_monitor_enabled", True)
    monkeypatch.setattr(cfg, "post_deploy_repos", "")
    expectations = (("orchestrator", True), ("enduro-trails", False))
    for repo, expected in expectations:
        assert post_deploy.post_deploy_applies(repo) is expected
def test_tc09_applies_explicit_repos_csv(monkeypatch):
    """An explicit repo list enables only the listed repos."""
    cfg = post_deploy.settings
    monkeypatch.setattr(cfg, "post_deploy_monitor_enabled", True)
    monkeypatch.setattr(cfg, "post_deploy_repos", "enduro-trails")
    expectations = (("enduro-trails", True), ("orchestrator", False))
    for repo, expected in expectations:
        assert post_deploy.post_deploy_applies(repo) is expected
def test_tc10_kill_switch_disables_for_everyone(monkeypatch):
    """With the monitor disabled, no repo is applicable."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", False)
    for repo in ("orchestrator", "enduro-trails"):
        assert post_deploy.post_deploy_applies(repo) is False
# ---------------------------------------------------------------------------
# TC-11..TC-12 — rollback exit-code mapping
# ---------------------------------------------------------------------------
def test_tc11_rollback_exit0_is_ok():
    """Exit code 0 maps to ROLLBACK_OK."""
    outcome = post_deploy.map_rollback_exit_code(0)
    assert outcome == "ROLLBACK_OK"
def test_tc12_rollback_exit_nonzero_is_failed():
    """Any exit value other than 0 — including junk — maps to ROLLBACK_FAILED."""
    for exit_value in (1, 2, None, "garbage"):
        assert post_deploy.map_rollback_exit_code(exit_value) == "ROLLBACK_FAILED"
# ---------------------------------------------------------------------------
# TC-13 — artefact frontmatter
# ---------------------------------------------------------------------------
def test_tc13_log_frontmatter_parses():
    """The generated log starts with YAML frontmatter carrying all six fields."""
    rendered = post_deploy.build_post_deploy_log(
        "ORCH-021", "DEGRADED", "ALERT_ONLY", 900, 12, 4
    )
    assert rendered.startswith("---\n")
    # The text between the first two "---" delimiters is the frontmatter.
    frontmatter = rendered.split("---", 2)[1]
    parsed = yaml.safe_load(frontmatter)
    expected_fields = (
        ("post_deploy_status", "DEGRADED"),
        ("action_taken", "ALERT_ONLY"),
        ("work_item", "ORCH-021"),
        ("window_s", 900),
        ("checks_total", 12),
        ("checks_failed", 4),
    )
    for field, value in expected_fields:
        assert parsed[field] == value
# ---------------------------------------------------------------------------
# TC-14..TC-15 — never-raise
# ---------------------------------------------------------------------------
def test_tc14_probe_network_error_is_conservative_not_raise(monkeypatch):
    """A network failure on every request yields a bad probe, not an exception."""

    def _network_down(*args, **kwargs):
        raise OSError("network down")

    monkeypatch.setattr(post_deploy.urllib.request, "urlopen", _network_down)
    outcome = post_deploy.probe_signals("http://localhost:8500")
    # Health is reported bad and both monitored endpoints count as failures.
    assert outcome.health_ok is False
    assert outcome.total == 2
    assert outcome.fivexx == 2
def test_tc14_classify_junk_input_swallowed():
    """Malformed series input never raises and falls back to HEALTHY."""
    junk_inputs = ("not-a-list", [{"bad": "row"}], None)
    for junk in junk_inputs:
        assert post_deploy.classify(junk, 3, 0.5) == "HEALTHY"
def test_tc15_write_log_no_worktree_returns_false(monkeypatch):
    """A missing worktree makes the log write return False instead of raising."""

    def _no_worktree(repo, branch):
        raise FileNotFoundError("no worktree")

    monkeypatch.setattr("src.git_worktree.get_worktree_path", _no_worktree)
    written = post_deploy.write_post_deploy_log(
        "nope-repo", "ORCH-021", "feature/x", "HEALTHY", "NONE", 900, 3, 0
    )
    assert written is False
# ---------------------------------------------------------------------------
# Sentinel state restart-safe counters
# ---------------------------------------------------------------------------
def test_series_append_and_read_roundtrip():
    """Appended probes are read back in insertion order."""
    repo, work_item = "orchestrator", "ORCH-021"
    post_deploy.write_marker(repo, work_item, post_deploy.ARMED, "armed")
    for fields in ((False, 2, 1, "x"), (True, 2, 0, "y")):
        post_deploy.append_probe(repo, work_item, post_deploy.ProbeResult(*fields))
    stored = post_deploy.read_series(repo, work_item)
    assert len(stored) == 2
    oldest, newest = stored[0], stored[1]
    assert oldest["health_ok"] is False
    assert newest["health_ok"] is True
def test_mark_done_idempotency_marker():
    """The DONE marker is absent before ``mark_done`` and present after it."""
    key = ("orchestrator", "ORCH-021")
    before = post_deploy.has_marker(*key, post_deploy.DONE)
    assert before is False
    post_deploy.mark_done(*key)
    after = post_deploy.has_marker(*key, post_deploy.DONE)
    assert after is True

View File

@@ -0,0 +1,259 @@
"""ORCH-021 integration tests — arming + tick orchestration (TC-16..TC-20).
Exercises the wiring in ``stage_engine`` (arm on deploy->done,
``run_post_deploy_monitor`` tick + reaction) and the ``/queue`` observability
block, with the network probe and the rollback hook mocked. Mirrors the
test_deploy_terminal_sync.py harness.
"""
import os
import tempfile
import pytest
# Dedicated database file for this test module, placed in the system temp dir.
_test_db = os.path.join(tempfile.gettempdir(), "test_orch_post_deploy.db")
# The environment is prepared before the src.* imports further down (hence their
# "noqa: E402") — presumably so the settings singleton picks these values up at
# import time; confirm against the settings module.
os.environ["ORCH_DB_PATH"] = _test_db
os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir()
# setdefault: keep any token the environment already provides.
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
from unittest.mock import MagicMock # noqa: E402
import src.db as _db # noqa: E402
from src.db import init_db, get_db # noqa: E402
from src import stage_engine # noqa: E402
from src import post_deploy # noqa: E402
@pytest.fixture(autouse=True)
def fresh_db(monkeypatch, tmp_path):
    """Give every test a freshly initialised DB and tmp-dir-backed state.

    Also stubs ``post_deploy.write_post_deploy_log`` so that no worktree is
    needed for the best-effort artefact write.
    """
    monkeypatch.setattr(_db.settings, "db_path", _test_db)
    # Drop any database left over from a previous test before re-creating it.
    if os.path.exists(_test_db):
        os.unlink(_test_db)
    init_db()

    # State sentinels live under the tmp repos_dir (container view).
    sandbox = str(tmp_path)
    redirected = (
        (post_deploy.settings, "repos_dir"),
        (post_deploy.settings, "host_repos_dir"),
        (stage_engine.settings, "repos_dir"),
    )
    for cfg, option in redirected:
        monkeypatch.setattr(cfg, option, sandbox)

    log_stub = MagicMock(return_value=True)
    monkeypatch.setattr(post_deploy, "write_post_deploy_log", log_stub)
    yield
@pytest.fixture(autouse=True)
def silence_side_effects(monkeypatch):
    """Replace the notification and issue-state hooks on ``stage_engine`` with mocks."""
    notifiers = (
        "notify_stage_change", "notify_qg_failure", "notify_approve_requested",
        "send_telegram", "plane_notify_stage", "plane_notify_qg", "plane_add_comment",
    )
    issue_setters = (
        "set_issue_in_review", "set_issue_needs_input", "set_issue_in_progress",
        "set_issue_blocked", "set_issue_done",
    )
    for hook in notifiers + issue_setters:
        monkeypatch.setattr(stage_engine, hook, MagicMock())
def _make_task(stage, repo="orchestrator", branch="feature/ORCH-021-x", wi="ORCH-021"):
    """Insert one row into ``tasks`` and return its row id.

    Args:
        stage: Pipeline stage to store for the task.
        repo: Repository name stored on the row.
        branch: Branch name stored on the row.
        wi: Work-item id; also used to derive ``plane_id`` as ``plane-<wi>``.

    Returns:
        The ``lastrowid`` of the inserted task.
    """
    conn = get_db()
    try:
        cur = conn.execute(
            "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) "
            "VALUES (?, ?, ?, ?, ?)",
            (f"plane-{wi}", wi, repo, branch, stage),
        )
        task_id = cur.lastrowid
        conn.commit()
    finally:
        # Close even when the insert/commit fails, so a failing test does not
        # leave an open connection on the shared database file.
        conn.close()
    return task_id
def _jobs(agent=None):
    """Return the ``agent`` column of the ``jobs`` rows, ordered by id.

    Args:
        agent: When truthy, only rows with this agent name are returned;
            otherwise all rows are returned.

    Returns:
        A list of agent names, one per matching job row.
    """
    conn = get_db()
    try:
        if agent:
            rows = conn.execute(
                "SELECT agent FROM jobs WHERE agent=? ORDER BY id", (agent,)
            ).fetchall()
        else:
            rows = conn.execute("SELECT agent FROM jobs ORDER BY id").fetchall()
    finally:
        # Close even when the query raises, so the connection is never leaked.
        conn.close()
    return [row[0] for row in rows]
def _pass(*a, **k):
return (True, "ok")
def _drive_deploy_to_done(monkeypatch, task_id, repo="orchestrator",
                          branch="feature/ORCH-021-x", wi="ORCH-021"):
    """Advance a deploy-stage task through the real ``advance_stage`` call.

    The ``check_deploy_status`` check is forced to pass and the merge-lease
    release is mocked out. Returns whatever ``advance_stage`` returns.
    """
    patched_checks = dict(stage_engine.QG_CHECKS)
    patched_checks["check_deploy_status"] = _pass
    monkeypatch.setattr(stage_engine, "QG_CHECKS", patched_checks)

    lease_release = MagicMock()
    monkeypatch.setattr(stage_engine.merge_gate, "release_merge_lease", lease_release)

    return stage_engine.advance_stage(
        task_id=task_id,
        current_stage="deploy",
        repo=repo,
        work_item_id=wi,
        branch=branch,
        finished_agent="deployer",
    )
# ---------------------------------------------------------------------------
# TC-16 — arm on deploy->done (applicable repo only)
# ---------------------------------------------------------------------------
def test_tc16_arm_for_self_hosting(monkeypatch):
    """Leaving deploy for the self-hosting repo arms the monitor and queues a job."""
    cfg = post_deploy.settings
    monkeypatch.setattr(cfg, "post_deploy_monitor_enabled", True)
    monkeypatch.setattr(cfg, "post_deploy_repos", "")
    deploy_task = _make_task("deploy")
    _drive_deploy_to_done(monkeypatch, deploy_task)
    armed = post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.ARMED)
    assert armed
    queued = _jobs("post-deploy-monitor")
    assert "post-deploy-monitor" in queued
def test_tc16_no_arm_for_nonself(monkeypatch):
    """With an empty repo list, a non-self repo is neither armed nor queued."""
    cfg = post_deploy.settings
    monkeypatch.setattr(cfg, "post_deploy_monitor_enabled", True)
    monkeypatch.setattr(cfg, "post_deploy_repos", "")
    target = dict(repo="enduro-trails", branch="feature/ET-9", wi="ET-9")
    deploy_task = _make_task("deploy", **target)
    _drive_deploy_to_done(monkeypatch, deploy_task, **target)
    armed = post_deploy.has_marker("enduro-trails", "ET-9", post_deploy.ARMED)
    assert not armed
    assert _jobs("post-deploy-monitor") == []
def test_tc16_no_arm_when_kill_switch_off(monkeypatch):
    """With the monitor disabled, even the self-hosting repo is not armed."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", False)
    deploy_task = _make_task("deploy")
    _drive_deploy_to_done(monkeypatch, deploy_task)
    armed = post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.ARMED)
    assert not armed
    assert _jobs("post-deploy-monitor") == []
# ---------------------------------------------------------------------------
# TC-17 — idempotent arm (double webhook)
# ---------------------------------------------------------------------------
def test_tc17_double_arm_is_noop(monkeypatch):
    """Arming twice for the same work item enqueues only one monitor job."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
    arm_args = ("orchestrator", "ORCH-021", "feature/ORCH-021-x", 1)
    first = post_deploy.arm_monitor(*arm_args)
    second = post_deploy.arm_monitor(*arm_args)
    assert first is True
    assert second is False
    # Two arm calls, but exactly ONE monitor job in the queue.
    queued = _jobs("post-deploy-monitor")
    assert queued == ["post-deploy-monitor"]
# ---------------------------------------------------------------------------
# TC-18 — DEGRADED -> non-self auto-rollback (hook mocked)
# ---------------------------------------------------------------------------
def test_tc18_degraded_nonself_rolls_back(monkeypatch):
    """A DEGRADED tick for a non-self repo with auto-rollback calls the rollback hook."""
    overrides = (
        ("post_deploy_monitor_enabled", True),
        ("post_deploy_repos", "enduro-trails"),
        ("post_deploy_auto_rollback", True),
        ("post_deploy_fail_threshold", 1),
        ("post_deploy_window_s", 30),
        ("post_deploy_interval_s", 30),  # window == interval -> budget of 1 tick
    )
    for option, value in overrides:
        monkeypatch.setattr(post_deploy.settings, option, value)

    # Every probe reports an unhealthy service.
    def _unhealthy(url):
        return post_deploy.ProbeResult(False, 2, 2, "down")

    monkeypatch.setattr(post_deploy, "probe_signals", _unhealthy)
    rollback_hook = MagicMock(return_value=(0, "ok"))
    monkeypatch.setattr(post_deploy, "run_rollback", rollback_hook)
    notifier = MagicMock()
    monkeypatch.setattr(stage_engine, "_notify_post_deploy", notifier)
    log_writer = MagicMock(return_value=True)
    monkeypatch.setattr(post_deploy, "write_post_deploy_log", log_writer)

    task_id = _make_task("done", repo="enduro-trails", branch="feature/ET-9", wi="ET-9")
    post_deploy.write_marker("enduro-trails", "ET-9", post_deploy.ARMED, "armed")
    tick_job = {
        "task_id": task_id,
        "repo": "enduro-trails",
        "id": 1,
        "agent": "post-deploy-monitor",
    }
    stage_engine.run_post_deploy_monitor(tick_job)

    rollback_hook.assert_called_once_with("enduro-trails")
    assert post_deploy.has_marker("enduro-trails", "ET-9", post_deploy.DONE)
    # The artefact is written with ROLLBACK_OK and a notification goes out.
    logged = log_writer.call_args[0]
    assert "DEGRADED" in logged
    assert "ROLLBACK_OK" in logged
    assert notifier.called
# ---------------------------------------------------------------------------
# TC-19 — self-hosting DEGRADED never rolls back, alerts instead
# ---------------------------------------------------------------------------
def test_tc19_degraded_self_hosting_alert_only(monkeypatch):
    """A DEGRADED tick for the self-hosting repo alerts and never rolls back."""
    overrides = (
        ("post_deploy_monitor_enabled", True),
        ("post_deploy_auto_rollback", True),
        ("post_deploy_fail_threshold", 1),
        ("post_deploy_window_s", 30),
        ("post_deploy_interval_s", 30),
    )
    for option, value in overrides:
        monkeypatch.setattr(post_deploy.settings, option, value)

    def _unhealthy(url):
        return post_deploy.ProbeResult(False, 2, 2, "down")

    monkeypatch.setattr(post_deploy, "probe_signals", _unhealthy)
    # The rollback hook MUST stay untouched for self-hosting (AC-8 structural invariant).
    rollback_hook = MagicMock(return_value=(0, "ok"))
    monkeypatch.setattr(post_deploy, "run_rollback", rollback_hook)
    notifier = MagicMock()
    monkeypatch.setattr(stage_engine, "_notify_post_deploy", notifier)
    log_writer = MagicMock(return_value=True)
    monkeypatch.setattr(post_deploy, "write_post_deploy_log", log_writer)

    task_id = _make_task("done")
    post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed")
    tick_job = {
        "task_id": task_id,
        "repo": "orchestrator",
        "id": 1,
        "agent": "post-deploy-monitor",
    }
    stage_engine.run_post_deploy_monitor(tick_job)

    rollback_hook.assert_not_called()
    assert post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.DONE)
    logged = log_writer.call_args[0]
    assert "DEGRADED" in logged
    assert "ALERT_ONLY" in logged
    assert notifier.called
def test_healthy_tick_requeues_without_finishing(monkeypatch):
    """A HEALTHY tick inside an unexhausted window re-queues and does not finish."""
    overrides = (
        ("post_deploy_monitor_enabled", True),
        ("post_deploy_window_s", 90),
        ("post_deploy_interval_s", 30),  # 90 / 30 -> budget of 3 ticks
    )
    for option, value in overrides:
        monkeypatch.setattr(post_deploy.settings, option, value)

    def _healthy(url):
        return post_deploy.ProbeResult(True, 2, 0, "ok")

    monkeypatch.setattr(post_deploy, "probe_signals", _healthy)

    task_id = _make_task("done")
    post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed")
    tick_job = {
        "task_id": task_id,
        "repo": "orchestrator",
        "id": 1,
        "agent": "post-deploy-monitor",
    }
    stage_engine.run_post_deploy_monitor(tick_job)

    finished = post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.DONE)
    assert not finished
    # The tick scheduled its own follow-up job.
    assert _jobs("post-deploy-monitor") == ["post-deploy-monitor"]
def test_finished_window_tick_is_noop(monkeypatch):
    """AC-15: a tick arriving after the window is done does nothing.

    "Nothing" covers both halves of the contract: the prod probe is not
    invoked again, and no follow-up monitor job is enqueued.
    """
    probe = MagicMock()
    monkeypatch.setattr(post_deploy, "probe_signals", probe)
    task_id = _make_task("done")
    post_deploy.mark_done("orchestrator", "ORCH-021")
    stage_engine.run_post_deploy_monitor(
        {"task_id": task_id, "repo": "orchestrator", "id": 9, "agent": "post-deploy-monitor"}
    )
    probe.assert_not_called()
    # The job dict above is passed in directly, never inserted, so the jobs
    # table must still be empty if the tick really was a no-op.
    assert _jobs("post-deploy-monitor") == []
# ---------------------------------------------------------------------------
# TC-20 — /queue observability block
# ---------------------------------------------------------------------------
def test_tc20_queue_block_present(monkeypatch):
    """``status()`` lists an armed window as active and drops it once done."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
    post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed")

    while_armed = post_deploy.status()
    assert while_armed["enabled"] is True
    assert while_armed["window_s"] == post_deploy.settings.post_deploy_window_s
    assert "ORCH-021" in while_armed["active"]
    assert while_armed["active_count"] >= 1

    # Once the window is marked done it must leave the "active" listing.
    post_deploy.mark_done("orchestrator", "ORCH-021")
    after_done = post_deploy.status()
    assert "ORCH-021" not in after_done["active"]