Extend pipeline responsibility past deploy->done: after the terminal
transition for an applicable repo, arm a ~15min observation window that
probes prod and reacts to a degradation the restart-time health-check
missed ("green deploy, red prod").
- src/post_deploy.py: new leaf module (config + lazy qg/db only).
Sentinel-file restart-safe state (.post-deploy-state-<repo>/<wi>/),
no DB migration. probe_signals/classify/decide_action/run_rollback,
all never-raise.
- Reserved-agent job `post-deploy-monitor` (no-LLM, Variant B, modelled on
deploy-finalizer): self-requeues each tick via enqueue_job.
- Deterministic classify: DEGRADED iff >= fail_threshold consecutive
health failures OR window 5xx ratio > 5xx_threshold; fail-safe HEALTHY.
- Self-hosting invariant (BR-5/AC-8): a tick NEVER restarts the prod
orchestrator container -> orchestrator is ALWAYS ALERT_ONLY.
- Conditionality (ORCH-35/36/43/58): kill-switch + CSV repos, empty ->
self-hosting only.
- QG_CHECKS / STAGE_TRANSITIONS / schema unchanged (AC-12).
- Docs: CHANGELOG, CLAUDE artefact list (16-post-deploy-log.md),
architecture README, .env.example (ORCH_POST_DEPLOY_*).
Refs: ORCH-021
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
211 lines
9.3 KiB
Python
211 lines
9.3 KiB
Python
"""ORCH-021 unit tests — post-deploy monitor pure logic (TC-01..TC-15).
|
|
|
|
The deterministic, network-free core (classification + reaction decision +
|
|
exit-code mapping + artefact frontmatter + never-raise) of ``src/post_deploy.py``.
|
|
Network probes and the rollback hook are exercised via mocks; the classifier is
|
|
the main subject (mirrors compute_staging_verdict in ORCH-061).
|
|
"""
|
|
|
|
import os
|
|
import tempfile
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
# Seed placeholder API tokens BEFORE importing the module under test (presumably
# the settings singleton needs them at import time — confirm). repos_dir itself
# is redirected to a tmp path per test by the autouse ``_tmp_state`` fixture.
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
|
|
|
|
from src import post_deploy # noqa: E402
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
def _probe(health_ok=True, total=2, fivexx=0):
    """Build one probe row as the plain dict passed to ``post_deploy.classify``.

    With no arguments the row is ``health_ok=True, total=2, fivexx=0``; each
    keyword overrides the matching key.
    """
    return {"health_ok": health_ok, "total": total, "fivexx": fivexx}
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def _tmp_state(monkeypatch, tmp_path):
    """Point ``repos_dir`` and ``host_repos_dir`` at the per-test tmp directory.

    Autouse, so it applies to every test in this module; ``monkeypatch``
    reverts both overrides when the test finishes.
    """
    tmp_root = str(tmp_path)
    monkeypatch.setattr(post_deploy.settings, "repos_dir", tmp_root)
    monkeypatch.setattr(post_deploy.settings, "host_repos_dir", tmp_root)
    yield
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-01..TC-05 — classification (the core)
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc01_healthy_no_failures():
    """Five default (all-OK) probes classify as HEALTHY."""
    series = [_probe() for _ in range(5)]
    verdict = post_deploy.classify(series, fail_threshold=3, fivexx_threshold=0.5)
    assert verdict == "HEALTHY"
|
|
|
|
|
|
def test_tc02_degraded_consecutive_health_failures():
    """A run of health failures equal to fail_threshold classifies as DEGRADED."""
    # Exactly fail_threshold consecutive failures -> DEGRADED (>= contract).
    series = [_probe(health_ok=False) for _ in range(3)]
    verdict = post_deploy.classify(series, fail_threshold=3, fivexx_threshold=0.5)
    assert verdict == "DEGRADED"
|
|
|
|
|
|
def test_tc03_degraded_by_5xx_ratio_even_when_health_200():
    """The 5xx ratio alone trips DEGRADED while every health check passes."""
    # /health stays 200 (health_ok True) but the 5xx ratio is above threshold.
    series = [_probe(health_ok=True, total=2, fivexx=2) for _ in range(3)]
    # fail_threshold=10 is out of reach for 3 probes, so only the ratio can trip.
    verdict = post_deploy.classify(series, fail_threshold=10, fivexx_threshold=0.5)
    assert verdict == "DEGRADED"
|
|
|
|
|
|
def test_tc04_no_false_trip_single_glitch_then_recovery():
    """A single failed probe between healthy ones does not trip DEGRADED."""
    # One isolated failure (1 < threshold) surrounded by healthy probes -> HEALTHY.
    series = [_probe(), _probe(health_ok=False), _probe(), _probe()]
    verdict = post_deploy.classify(series, fail_threshold=3, fivexx_threshold=0.5)
    assert verdict == "HEALTHY"
|
|
|
|
|
|
def test_tc05_thresholds_change_verdict_on_same_data():
    """Lowering fail_threshold from 3 to 2 flips the verdict for the same series."""
    # Same data, different threshold flips the verdict (AC-11): two consecutive fails.
    series = [_probe(health_ok=False), _probe(health_ok=False)]
    lenient = post_deploy.classify(series, fail_threshold=3, fivexx_threshold=0.5)
    strict = post_deploy.classify(series, fail_threshold=2, fivexx_threshold=0.5)
    assert lenient == "HEALTHY"
    assert strict == "DEGRADED"
|
|
|
|
|
|
def test_classify_uses_settings_thresholds(monkeypatch):
    """Thresholds taken from ``post_deploy.settings`` drive the verdict."""
    # The tick reads thresholds from Settings (env ORCH_*) — verify the wiring point.
    monkeypatch.setattr(post_deploy.settings, "post_deploy_fail_threshold", 2)
    series = [_probe(health_ok=False), _probe(health_ok=False)]
    verdict = post_deploy.classify(
        series,
        post_deploy.settings.post_deploy_fail_threshold,
        post_deploy.settings.post_deploy_5xx_threshold,
    )
    assert verdict == "DEGRADED"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-06..TC-08 — reaction decision (self-hosting safety)
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc06_nonself_auto_rollback_degraded_rolls_back(monkeypatch):
    """With auto_rollback on, a DEGRADED non-orchestrator repo yields ROLLBACK."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True)
    action = post_deploy.decide_action("enduro-trails", "DEGRADED")
    assert action == "ROLLBACK"
|
|
|
|
|
|
def test_tc07_self_hosting_degraded_never_rolls_back(monkeypatch):
    """A DEGRADED ``orchestrator`` repo yields ALERT_ONLY even with auto_rollback on."""
    # orchestrator (self-hosting) is ALWAYS ALERT_ONLY, even with auto_rollback on.
    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True)
    action = post_deploy.decide_action("orchestrator", "DEGRADED")
    assert action == "ALERT_ONLY"
|
|
|
|
|
|
def test_tc08_healthy_means_none_for_any_repo():
    """A HEALTHY verdict maps to the NONE action for both repos checked."""
    assert post_deploy.decide_action("orchestrator", "HEALTHY") == "NONE"
    assert post_deploy.decide_action("enduro-trails", "HEALTHY") == "NONE"
|
|
|
|
|
|
def test_nonself_default_policy_alert_only(monkeypatch):
    """With auto_rollback off, a DEGRADED non-orchestrator repo yields ALERT_ONLY."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", False)
    action = post_deploy.decide_action("enduro-trails", "DEGRADED")
    assert action == "ALERT_ONLY"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-09..TC-10 — conditionality / kill-switch
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc09_applies_empty_repos_only_self_hosting(monkeypatch):
    """Monitor enabled with an empty repo list applies to ``orchestrator`` only."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
    monkeypatch.setattr(post_deploy.settings, "post_deploy_repos", "")
    assert post_deploy.post_deploy_applies("orchestrator") is True
    assert post_deploy.post_deploy_applies("enduro-trails") is False
|
|
|
|
|
|
def test_tc09_applies_explicit_repos_csv(monkeypatch):
    """An explicit repo list applies to the listed repo and not to ``orchestrator``."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
    monkeypatch.setattr(post_deploy.settings, "post_deploy_repos", "enduro-trails")
    assert post_deploy.post_deploy_applies("enduro-trails") is True
    assert post_deploy.post_deploy_applies("orchestrator") is False
|
|
|
|
|
|
def test_tc10_kill_switch_disables_for_everyone(monkeypatch):
    """With the monitor disabled, ``post_deploy_applies`` is False for both repos."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", False)
    assert post_deploy.post_deploy_applies("orchestrator") is False
    assert post_deploy.post_deploy_applies("enduro-trails") is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-11..TC-12 — rollback exit-code mapping
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc11_rollback_exit0_is_ok():
    """Exit code 0 maps to ROLLBACK_OK."""
    assert post_deploy.map_rollback_exit_code(0) == "ROLLBACK_OK"
|
|
|
|
|
|
def test_tc12_rollback_exit_nonzero_is_failed():
    """Non-zero, ``None`` and non-integer exit codes all map to ROLLBACK_FAILED."""
    assert post_deploy.map_rollback_exit_code(1) == "ROLLBACK_FAILED"
    assert post_deploy.map_rollback_exit_code(2) == "ROLLBACK_FAILED"
    assert post_deploy.map_rollback_exit_code(None) == "ROLLBACK_FAILED"
    assert post_deploy.map_rollback_exit_code("garbage") == "ROLLBACK_FAILED"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-13 — artefact frontmatter
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc13_log_frontmatter_parses():
    """The generated log opens with YAML frontmatter carrying all six inputs."""
    body = post_deploy.build_post_deploy_log(
        "ORCH-021", "DEGRADED", "ALERT_ONLY", 900, 12, 4
    )
    assert body.startswith("---\n")
    # The text between the first two '---' fences is the frontmatter.
    fm = body.split("---", 2)[1]
    data = yaml.safe_load(fm)
    assert data["post_deploy_status"] == "DEGRADED"
    assert data["action_taken"] == "ALERT_ONLY"
    assert data["work_item"] == "ORCH-021"
    assert data["window_s"] == 900
    assert data["checks_total"] == 12
    assert data["checks_failed"] == 4
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# TC-14..TC-15 — never-raise
|
|
# ---------------------------------------------------------------------------
|
|
def test_tc14_probe_network_error_is_conservative_not_raise(monkeypatch):
    """When ``urlopen`` always raises, ``probe_signals`` reports failure, not an error."""
    # urlopen raises on every call -> health bad + monitored endpoints counted as
    # 5xx, but NO exception propagates (the helper swallows and reports code 0).
    def boom(*a, **k):
        raise OSError("network down")

    monkeypatch.setattr(post_deploy.urllib.request, "urlopen", boom)
    res = post_deploy.probe_signals("http://localhost:8500")
    assert res.health_ok is False
    assert res.total == 2
    assert res.fivexx == 2  # unreachable endpoints counted as failures
|
|
|
|
|
|
def test_tc14_classify_junk_input_swallowed():
    """A string, a malformed row and ``None`` all classify as HEALTHY without raising."""
    # If classify gets junk it must not raise (fail-safe to HEALTHY).
    assert post_deploy.classify("not-a-list", 3, 0.5) == "HEALTHY"
    assert post_deploy.classify([{"bad": "row"}], 3, 0.5) == "HEALTHY"
    assert post_deploy.classify(None, 3, 0.5) == "HEALTHY"
|
|
|
|
|
|
def test_tc15_write_log_no_worktree_returns_false(monkeypatch):
    """``write_post_deploy_log`` returns False when the worktree lookup raises."""
    # get_worktree_path raises -> write returns False, no exception (best-effort).
    def boom(repo, branch):
        raise FileNotFoundError("no worktree")

    monkeypatch.setattr("src.git_worktree.get_worktree_path", boom)
    ok = post_deploy.write_post_deploy_log(
        "nope-repo", "ORCH-021", "feature/x", "HEALTHY", "NONE", 900, 3, 0
    )
    assert ok is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Sentinel state restart-safe counters
|
|
# ---------------------------------------------------------------------------
|
|
def test_series_append_and_read_roundtrip():
    """Two appended probes are read back in order with their ``health_ok`` values."""
    repo, work_item = "orchestrator", "ORCH-021"
    post_deploy.write_marker(repo, work_item, post_deploy.ARMED, "armed")
    post_deploy.append_probe(repo, work_item, post_deploy.ProbeResult(False, 2, 1, "x"))
    post_deploy.append_probe(repo, work_item, post_deploy.ProbeResult(True, 2, 0, "y"))
    series = post_deploy.read_series(repo, work_item)
    assert len(series) == 2
    assert series[0]["health_ok"] is False
    assert series[1]["health_ok"] is True
|
|
|
|
|
|
def test_mark_done_idempotency_marker():
    """The DONE marker is absent before ``mark_done`` and present after it."""
    repo, work_item = "orchestrator", "ORCH-021"
    assert post_deploy.has_marker(repo, work_item, post_deploy.DONE) is False
    post_deploy.mark_done(repo, work_item)
    assert post_deploy.has_marker(repo, work_item, post_deploy.DONE) is True
|