feat(post-deploy): post-deploy prod monitoring + degradation reaction (ORCH-021)
Extend pipeline responsibility past deploy->done: after the terminal
transition for an applicable repo, arm a ~15min observation window that
probes prod and reacts to a degradation the restart-time health-check
missed ("green deploy, red prod").
- src/post_deploy.py: new leaf module (config + lazy qg/db only).
Sentinel-file restart-safe state (.post-deploy-state-<repo>/<wi>/),
no DB migration. probe_signals/classify/decide_action/run_rollback,
all never-raise.
- Reserved-agent job `post-deploy-monitor` (no-LLM, Variant B, calque of
deploy-finalizer): self-requeues each tick via enqueue_job.
- Deterministic classify: DEGRADED iff >= fail_threshold consecutive
health failures OR window 5xx ratio > 5xx_threshold; fail-safe HEALTHY.
- Self-hosting invariant (BR-5/AC-8): a tick NEVER restarts the prod
orchestrator container -> orchestrator is ALWAYS ALERT_ONLY.
- Conditionality (ORCH-35/36/43/58): kill-switch + CSV repos, empty ->
self-hosting only.
- QG_CHECKS / STAGE_TRANSITIONS / schema unchanged (AC-12).
- Docs: CHANGELOG, CLAUDE artefact list (16-post-deploy-log.md),
architecture README, .env.example (ORCH_POST_DEPLOY_*).
Refs: ORCH-021
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -90,6 +90,10 @@ def test_tc17_success_deploy_syncs_terminal_done(monkeypatch):
|
||||
# Spy the merge-lease release to confirm the terminal-sync still frees it.
|
||||
release = MagicMock()
|
||||
monkeypatch.setattr(stage_engine.merge_gate, "release_merge_lease", release)
|
||||
# ORCH-021 arms an orthogonal post-deploy-monitor reserved job at deploy->done
|
||||
# for the self-hosting repo; disable it here so this test stays focused on the
|
||||
# ORCH-036 terminal-sync contract (no PIPELINE agent launched leaving deploy).
|
||||
monkeypatch.setattr(stage_engine.post_deploy.settings, "post_deploy_monitor_enabled", False)
|
||||
|
||||
task_id = _make_task("deploy")
|
||||
stage_engine.run_deploy_finalizer(
|
||||
|
||||
210
tests/test_post_deploy.py
Normal file
210
tests/test_post_deploy.py
Normal file
@@ -0,0 +1,210 @@
|
||||
"""ORCH-021 unit tests — post-deploy monitor pure logic (TC-01..TC-15).
|
||||
|
||||
The deterministic, network-free core (classification + reaction decision +
|
||||
exit-code mapping + artefact frontmatter + never-raise) of ``src/post_deploy.py``.
|
||||
Network probes and the rollback hook are exercised via mocks; the classifier is
|
||||
the main subject (mirrors compute_staging_verdict in ORCH-061).
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
# Seed placeholder tokens in the environment BEFORE importing the module; the
# tmp repos_dir isolation itself is applied per-test by the autouse fixture below.
|
||||
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
|
||||
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
|
||||
|
||||
from src import post_deploy # noqa: E402
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
def _probe(health_ok=True, total=2, fivexx=0):
|
||||
return {"health_ok": health_ok, "total": total, "fivexx": fivexx}
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _tmp_state(monkeypatch, tmp_path):
    """Point both repos-dir settings at the per-test tmp_path for every test."""
    sandbox = str(tmp_path)
    for attr in ("repos_dir", "host_repos_dir"):
        monkeypatch.setattr(post_deploy.settings, attr, sandbox)
    yield
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TC-01..TC-05 — classification (the core)
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_tc01_healthy_no_failures():
    """Five default (all-healthy) probes classify as HEALTHY."""
    probes = []
    for _ in range(5):
        probes.append(_probe())
    verdict = post_deploy.classify(probes, fail_threshold=3, fivexx_threshold=0.5)
    assert verdict == "HEALTHY"
|
||||
|
||||
|
||||
def test_tc02_degraded_consecutive_health_failures():
    """Exactly fail_threshold consecutive failures -> DEGRADED (>= contract)."""
    failing = []
    for _ in range(3):
        failing.append(_probe(health_ok=False))
    verdict = post_deploy.classify(failing, fail_threshold=3, fivexx_threshold=0.5)
    assert verdict == "DEGRADED"
|
||||
|
||||
|
||||
def test_tc03_degraded_by_5xx_ratio_even_when_health_200():
    """health_ok stays True, but every monitored request is a 5xx -> DEGRADED."""
    noisy = []
    for _ in range(3):
        noisy.append(_probe(health_ok=True, total=2, fivexx=2))
    # fail_threshold is set high so only the 5xx ratio can trip the verdict.
    verdict = post_deploy.classify(noisy, fail_threshold=10, fivexx_threshold=0.5)
    assert verdict == "DEGRADED"
|
||||
|
||||
|
||||
def test_tc04_no_false_trip_single_glitch_then_recovery():
    """One isolated failure (1 < threshold) among healthy probes stays HEALTHY."""
    probes = [
        _probe(),
        _probe(health_ok=False),  # the single glitch
        _probe(),
        _probe(),
    ]
    verdict = post_deploy.classify(probes, fail_threshold=3, fivexx_threshold=0.5)
    assert verdict == "HEALTHY"
|
||||
|
||||
|
||||
def test_tc05_thresholds_change_verdict_on_same_data():
    """Same two consecutive failures flip verdict with fail_threshold (AC-11)."""
    two_failures = [_probe(health_ok=False), _probe(health_ok=False)]

    lenient = post_deploy.classify(two_failures, fail_threshold=3, fivexx_threshold=0.5)
    assert lenient == "HEALTHY"

    strict = post_deploy.classify(two_failures, fail_threshold=2, fivexx_threshold=0.5)
    assert strict == "DEGRADED"
|
||||
|
||||
|
||||
def test_classify_uses_settings_thresholds(monkeypatch):
    """Thresholds read from the settings object drive classify (wiring point)."""
    cfg = post_deploy.settings
    monkeypatch.setattr(cfg, "post_deploy_fail_threshold", 2)

    two_failures = [_probe(health_ok=False), _probe(health_ok=False)]
    verdict = post_deploy.classify(
        two_failures,
        cfg.post_deploy_fail_threshold,
        cfg.post_deploy_5xx_threshold,
    )
    assert verdict == "DEGRADED"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TC-06..TC-08 — reaction decision (self-hosting safety)
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_tc06_nonself_auto_rollback_degraded_rolls_back(monkeypatch):
    """With auto-rollback on, a DEGRADED non-self repo gets ROLLBACK."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True)
    action = post_deploy.decide_action("enduro-trails", "DEGRADED")
    assert action == "ROLLBACK"
|
||||
|
||||
|
||||
def test_tc07_self_hosting_degraded_never_rolls_back(monkeypatch):
    """orchestrator (self-hosting) is ALWAYS ALERT_ONLY, even with auto_rollback on."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True)
    action = post_deploy.decide_action("orchestrator", "DEGRADED")
    assert action == "ALERT_ONLY"
|
||||
|
||||
|
||||
def test_tc08_healthy_means_none_for_any_repo():
    """A HEALTHY verdict maps to NONE for both the self-hosting and another repo."""
    for repo in ("orchestrator", "enduro-trails"):
        assert post_deploy.decide_action(repo, "HEALTHY") == "NONE"
|
||||
|
||||
|
||||
def test_nonself_default_policy_alert_only(monkeypatch):
    """With auto-rollback off, a DEGRADED non-self repo only alerts."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", False)
    action = post_deploy.decide_action("enduro-trails", "DEGRADED")
    assert action == "ALERT_ONLY"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TC-09..TC-10 — conditionality / kill-switch
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_tc09_applies_empty_repos_only_self_hosting(monkeypatch):
    """Enabled + empty repo list: only the orchestrator repo is covered."""
    cfg = post_deploy.settings
    monkeypatch.setattr(cfg, "post_deploy_monitor_enabled", True)
    monkeypatch.setattr(cfg, "post_deploy_repos", "")

    covers_self = post_deploy.post_deploy_applies("orchestrator")
    assert covers_self is True
    covers_other = post_deploy.post_deploy_applies("enduro-trails")
    assert covers_other is False
|
||||
|
||||
|
||||
def test_tc09_applies_explicit_repos_csv(monkeypatch):
    """Enabled + explicit repo list: only the listed repo is covered."""
    cfg = post_deploy.settings
    monkeypatch.setattr(cfg, "post_deploy_monitor_enabled", True)
    monkeypatch.setattr(cfg, "post_deploy_repos", "enduro-trails")

    covers_listed = post_deploy.post_deploy_applies("enduro-trails")
    assert covers_listed is True
    covers_self = post_deploy.post_deploy_applies("orchestrator")
    assert covers_self is False
|
||||
|
||||
|
||||
def test_tc10_kill_switch_disables_for_everyone(monkeypatch):
    """With the monitor disabled, no repo is covered."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", False)
    for repo in ("orchestrator", "enduro-trails"):
        assert post_deploy.post_deploy_applies(repo) is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TC-11..TC-12 — rollback exit-code mapping
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_tc11_rollback_exit0_is_ok():
    """Exit code 0 maps to ROLLBACK_OK."""
    outcome = post_deploy.map_rollback_exit_code(0)
    assert outcome == "ROLLBACK_OK"
|
||||
|
||||
|
||||
def test_tc12_rollback_exit_nonzero_is_failed():
    """Non-zero, missing and non-numeric exit codes all map to ROLLBACK_FAILED."""
    for exit_code in (1, 2, None, "garbage"):
        assert post_deploy.map_rollback_exit_code(exit_code) == "ROLLBACK_FAILED"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TC-13 — artefact frontmatter
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_tc13_log_frontmatter_parses():
    """The rendered log opens with YAML frontmatter carrying the given fields."""
    rendered = post_deploy.build_post_deploy_log(
        "ORCH-021", "DEGRADED", "ALERT_ONLY", 900, 12, 4
    )
    assert rendered.startswith("---\n")

    # Frontmatter is the text between the first two '---' fences.
    pieces = rendered.split("---", 2)
    parsed = yaml.safe_load(pieces[1])

    expected = (
        ("post_deploy_status", "DEGRADED"),
        ("action_taken", "ALERT_ONLY"),
        ("work_item", "ORCH-021"),
        ("window_s", 900),
        ("checks_total", 12),
        ("checks_failed", 4),
    )
    for key, value in expected:
        assert parsed[key] == value
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TC-14..TC-15 — never-raise
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_tc14_probe_network_error_is_conservative_not_raise(monkeypatch):
    """urlopen failing on every call yields a pessimistic result, not an exception.

    Expected outcome: health is reported bad and both monitored endpoints are
    counted as 5xx.
    """
    def _network_down(*_args, **_kwargs):
        raise OSError("network down")

    monkeypatch.setattr(post_deploy.urllib.request, "urlopen", _network_down)

    result = post_deploy.probe_signals("http://localhost:8500")

    assert result.health_ok is False
    assert result.total == 2
    assert result.fivexx == 2  # unreachable endpoints counted as failures
|
||||
|
||||
|
||||
def test_tc14_classify_junk_input_swallowed():
    """Junk input to classify must not raise; it fail-safes to HEALTHY."""
    junk_inputs = ("not-a-list", [{"bad": "row"}], None)
    for junk in junk_inputs:
        assert post_deploy.classify(junk, 3, 0.5) == "HEALTHY"
|
||||
|
||||
|
||||
def test_tc15_write_log_no_worktree_returns_false(monkeypatch):
    """get_worktree_path raising -> write returns False, no exception (best-effort)."""
    def _missing_worktree(repo, branch):
        raise FileNotFoundError("no worktree")

    monkeypatch.setattr("src.git_worktree.get_worktree_path", _missing_worktree)

    written = post_deploy.write_post_deploy_log(
        "nope-repo", "ORCH-021", "feature/x", "HEALTHY", "NONE", 900, 3, 0
    )
    assert written is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Sentinel state restart-safe counters
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_series_append_and_read_roundtrip():
    """Two appended probes are read back in insertion order."""
    repo, work_item = "orchestrator", "ORCH-021"
    post_deploy.write_marker(repo, work_item, post_deploy.ARMED, "armed")

    for fields in ((False, 2, 1, "x"), (True, 2, 0, "y")):
        post_deploy.append_probe(repo, work_item, post_deploy.ProbeResult(*fields))

    stored = post_deploy.read_series(repo, work_item)
    assert len(stored) == 2
    first, second = stored[0], stored[1]
    assert first["health_ok"] is False
    assert second["health_ok"] is True
|
||||
|
||||
|
||||
def test_mark_done_idempotency_marker():
    """The DONE marker is absent before mark_done and present after it."""
    key = ("orchestrator", "ORCH-021")

    before = post_deploy.has_marker(*key, post_deploy.DONE)
    assert before is False

    post_deploy.mark_done(*key)

    after = post_deploy.has_marker(*key, post_deploy.DONE)
    assert after is True
|
||||
259
tests/test_post_deploy_integration.py
Normal file
259
tests/test_post_deploy_integration.py
Normal file
@@ -0,0 +1,259 @@
|
||||
"""ORCH-021 integration tests — arming + tick orchestration (TC-16..TC-20).
|
||||
|
||||
Exercises the wiring in ``stage_engine`` (arm on deploy->done,
|
||||
``run_post_deploy_monitor`` tick + reaction) and the ``/queue`` observability
|
||||
block, with the network probe and the rollback hook mocked. Mirrors the
|
||||
test_deploy_terminal_sync.py harness.
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
_test_db = os.path.join(tempfile.gettempdir(), "test_orch_post_deploy.db")
|
||||
os.environ["ORCH_DB_PATH"] = _test_db
|
||||
os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir()
|
||||
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
|
||||
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
|
||||
|
||||
from unittest.mock import MagicMock # noqa: E402
|
||||
|
||||
import src.db as _db # noqa: E402
|
||||
from src.db import init_db, get_db # noqa: E402
|
||||
from src import stage_engine # noqa: E402
|
||||
from src import post_deploy # noqa: E402
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def fresh_db(monkeypatch, tmp_path):
    """Give every test an empty database and tmp-rooted repo directories."""
    monkeypatch.setattr(_db.settings, "db_path", _test_db)
    # Drop any database file left behind, then recreate the schema.
    if os.path.exists(_test_db):
        os.unlink(_test_db)
    init_db()

    # State sentinels live under the tmp repos_dir (container view).
    sandbox = str(tmp_path)
    for owner, attr in (
        (post_deploy.settings, "repos_dir"),
        (post_deploy.settings, "host_repos_dir"),
        (stage_engine.settings, "repos_dir"),
    ):
        monkeypatch.setattr(owner, attr, sandbox)

    # The artefact write is best-effort; stub it so no worktree is needed.
    log_stub = MagicMock(return_value=True)
    monkeypatch.setattr(post_deploy, "write_post_deploy_log", log_stub)
    yield
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def silence_side_effects(monkeypatch):
    """Replace stage_engine's notification/issue-state helpers with mocks."""
    notifier_names = (
        "notify_stage_change",
        "notify_qg_failure",
        "notify_approve_requested",
        "send_telegram",
        "plane_notify_stage",
        "plane_notify_qg",
        "plane_add_comment",
    )
    issue_state_names = (
        "set_issue_in_review",
        "set_issue_needs_input",
        "set_issue_in_progress",
        "set_issue_blocked",
        "set_issue_done",
    )
    # Each attribute gets its own MagicMock instance.
    for helper in notifier_names + issue_state_names:
        monkeypatch.setattr(stage_engine, helper, MagicMock())
|
||||
|
||||
|
||||
def _make_task(stage, repo="orchestrator", branch="feature/ORCH-021-x", wi="ORCH-021"):
    """Insert one row into ``tasks`` at the given stage and return its row id.

    The ``plane_id`` column is derived from *wi* as ``plane-<wi>``. The
    connection is closed in ``finally`` so a failing INSERT or commit does not
    leave an open handle on the shared test database file.
    """
    conn = get_db()
    try:
        cur = conn.execute(
            "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) "
            "VALUES (?, ?, ?, ?, ?)",
            (f"plane-{wi}", wi, repo, branch, stage),
        )
        task_id = cur.lastrowid
        conn.commit()
        return task_id
    finally:
        conn.close()
|
||||
|
||||
|
||||
def _jobs(agent=None):
    """Return the ``agent`` column of queued jobs, ordered by job id.

    When *agent* is truthy only rows for that agent are returned; otherwise all
    rows are. The connection is closed in ``finally`` so a failing query does
    not leak it.
    """
    conn = get_db()
    try:
        if agent:
            rows = conn.execute(
                "SELECT agent FROM jobs WHERE agent=? ORDER BY id", (agent,)
            ).fetchall()
        else:
            rows = conn.execute("SELECT agent FROM jobs ORDER BY id").fetchall()
    finally:
        conn.close()
    return [r[0] for r in rows]
|
||||
|
||||
|
||||
def _pass(*a, **k):
|
||||
return (True, "ok")
|
||||
|
||||
|
||||
def _drive_deploy_to_done(monkeypatch, task_id, repo="orchestrator",
                          branch="feature/ORCH-021-x", wi="ORCH-021"):
    """Advance a deploy-stage task to done through the real terminal block.

    The ``check_deploy_status`` check is replaced with an always-passing stub
    and the merge-lease release is mocked; the result of
    ``stage_engine.advance_stage`` is returned.
    """
    patched_checks = dict(stage_engine.QG_CHECKS)
    patched_checks["check_deploy_status"] = _pass
    monkeypatch.setattr(stage_engine, "QG_CHECKS", patched_checks)

    monkeypatch.setattr(stage_engine.merge_gate, "release_merge_lease", MagicMock())

    outcome = stage_engine.advance_stage(
        task_id=task_id,
        current_stage="deploy",
        repo=repo,
        work_item_id=wi,
        branch=branch,
        finished_agent="deployer",
    )
    return outcome
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TC-16 — arm on deploy->done (applicable repo only)
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_tc16_arm_for_self_hosting(monkeypatch):
    """deploy->done on the orchestrator repo arms the monitor and queues a job."""
    cfg = post_deploy.settings
    monkeypatch.setattr(cfg, "post_deploy_monitor_enabled", True)
    monkeypatch.setattr(cfg, "post_deploy_repos", "")

    deploy_task = _make_task("deploy")
    _drive_deploy_to_done(monkeypatch, deploy_task)

    assert post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.ARMED)
    queued = _jobs("post-deploy-monitor")
    assert "post-deploy-monitor" in queued
|
||||
|
||||
|
||||
def test_tc16_no_arm_for_nonself(monkeypatch):
    """With an empty repo list, deploy->done on another repo arms nothing."""
    cfg = post_deploy.settings
    monkeypatch.setattr(cfg, "post_deploy_monitor_enabled", True)
    monkeypatch.setattr(cfg, "post_deploy_repos", "")

    target = dict(repo="enduro-trails", branch="feature/ET-9", wi="ET-9")
    deploy_task = _make_task("deploy", **target)
    _drive_deploy_to_done(monkeypatch, deploy_task, **target)

    armed = post_deploy.has_marker("enduro-trails", "ET-9", post_deploy.ARMED)
    assert not armed
    assert _jobs("post-deploy-monitor") == []
|
||||
|
||||
|
||||
def test_tc16_no_arm_when_kill_switch_off(monkeypatch):
    """With the monitor disabled, deploy->done arms nothing and queues nothing."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", False)

    deploy_task = _make_task("deploy")
    _drive_deploy_to_done(monkeypatch, deploy_task)

    armed = post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.ARMED)
    assert not armed
    assert _jobs("post-deploy-monitor") == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TC-17 — idempotent arm (double webhook)
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_tc17_double_arm_is_noop(monkeypatch):
    """Arming twice with the same arguments arms once and enqueues one job."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)

    arm_args = ("orchestrator", "ORCH-021", "feature/ORCH-021-x", 1)
    first = post_deploy.arm_monitor(*arm_args)
    second = post_deploy.arm_monitor(*arm_args)

    assert first is True
    assert second is False
    # Exactly ONE monitor job enqueued despite two arm calls.
    assert _jobs("post-deploy-monitor") == ["post-deploy-monitor"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TC-18 — DEGRADED -> non-self auto-rollback (hook mocked)
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_tc18_degraded_nonself_rolls_back(monkeypatch):
    """A DEGRADED tick on a non-self repo with auto-rollback calls the hook."""
    overrides = (
        ("post_deploy_monitor_enabled", True),
        ("post_deploy_repos", "enduro-trails"),
        ("post_deploy_auto_rollback", True),
        ("post_deploy_fail_threshold", 1),
        ("post_deploy_window_s", 30),
        ("post_deploy_interval_s", 30),  # budget=1 tick
    )
    for attr, value in overrides:
        monkeypatch.setattr(post_deploy.settings, attr, value)

    # Probe reports unhealthy.
    def _unhealthy(url):
        return post_deploy.ProbeResult(False, 2, 2, "down")

    monkeypatch.setattr(post_deploy, "probe_signals", _unhealthy)

    rollback = MagicMock(return_value=(0, "ok"))
    notify = MagicMock()
    logspy = MagicMock(return_value=True)
    monkeypatch.setattr(post_deploy, "run_rollback", rollback)
    monkeypatch.setattr(stage_engine, "_notify_post_deploy", notify)
    monkeypatch.setattr(post_deploy, "write_post_deploy_log", logspy)

    task_id = _make_task("done", repo="enduro-trails", branch="feature/ET-9", wi="ET-9")
    post_deploy.write_marker("enduro-trails", "ET-9", post_deploy.ARMED, "armed")
    job = {
        "task_id": task_id,
        "repo": "enduro-trails",
        "id": 1,
        "agent": "post-deploy-monitor",
    }
    stage_engine.run_post_deploy_monitor(job)

    rollback.assert_called_once_with("enduro-trails")
    assert post_deploy.has_marker("enduro-trails", "ET-9", post_deploy.DONE)
    # Artefact written with ROLLBACK_OK; a notification was sent.
    logged = logspy.call_args[0]
    assert "DEGRADED" in logged
    assert "ROLLBACK_OK" in logged
    assert notify.called
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TC-19 — self-hosting DEGRADED never rolls back, alerts instead
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_tc19_degraded_self_hosting_alert_only(monkeypatch):
    """A DEGRADED tick on the orchestrator repo alerts and never rolls back."""
    overrides = (
        ("post_deploy_monitor_enabled", True),
        ("post_deploy_auto_rollback", True),
        ("post_deploy_fail_threshold", 1),
        ("post_deploy_window_s", 30),
        ("post_deploy_interval_s", 30),
    )
    for attr, value in overrides:
        monkeypatch.setattr(post_deploy.settings, attr, value)

    def _unhealthy(url):
        return post_deploy.ProbeResult(False, 2, 2, "down")

    monkeypatch.setattr(post_deploy, "probe_signals", _unhealthy)

    # Rollback hook MUST NOT be called for self-hosting (AC-8 structural invariant).
    rollback = MagicMock(return_value=(0, "ok"))
    notify = MagicMock()
    logspy = MagicMock(return_value=True)
    monkeypatch.setattr(post_deploy, "run_rollback", rollback)
    monkeypatch.setattr(stage_engine, "_notify_post_deploy", notify)
    monkeypatch.setattr(post_deploy, "write_post_deploy_log", logspy)

    task_id = _make_task("done")
    post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed")
    job = {
        "task_id": task_id,
        "repo": "orchestrator",
        "id": 1,
        "agent": "post-deploy-monitor",
    }
    stage_engine.run_post_deploy_monitor(job)

    rollback.assert_not_called()
    assert post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.DONE)
    logged = logspy.call_args[0]
    assert "DEGRADED" in logged
    assert "ALERT_ONLY" in logged
    assert notify.called
|
||||
|
||||
|
||||
def test_healthy_tick_requeues_without_finishing(monkeypatch):
    """HEALTHY and window not exhausted -> re-queue, do NOT mark done."""
    overrides = (
        ("post_deploy_monitor_enabled", True),
        ("post_deploy_window_s", 90),
        ("post_deploy_interval_s", 30),  # budget=3
    )
    for attr, value in overrides:
        monkeypatch.setattr(post_deploy.settings, attr, value)

    def _healthy(url):
        return post_deploy.ProbeResult(True, 2, 0, "ok")

    monkeypatch.setattr(post_deploy, "probe_signals", _healthy)

    task_id = _make_task("done")
    post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed")
    job = {
        "task_id": task_id,
        "repo": "orchestrator",
        "id": 1,
        "agent": "post-deploy-monitor",
    }
    stage_engine.run_post_deploy_monitor(job)

    finished = post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.DONE)
    assert not finished
    # A follow-up tick job was enqueued.
    assert _jobs("post-deploy-monitor") == ["post-deploy-monitor"]
|
||||
|
||||
|
||||
def test_finished_window_tick_is_noop(monkeypatch):
    """AC-15: a tick after the window is done is a no-op.

    Both halves of the no-op contract are checked: the probe is never called
    and no follow-up monitor job is enqueued.
    """
    probe = MagicMock()
    monkeypatch.setattr(post_deploy, "probe_signals", probe)

    task_id = _make_task("done")
    post_deploy.mark_done("orchestrator", "ORCH-021")
    stage_engine.run_post_deploy_monitor(
        {"task_id": task_id, "repo": "orchestrator", "id": 9, "agent": "post-deploy-monitor"}
    )

    probe.assert_not_called()
    # The job dict above is passed in directly and never inserted, so any row
    # found here would have been enqueued by the tick itself.
    assert _jobs("post-deploy-monitor") == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TC-20 — /queue observability block
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_tc20_queue_block_present(monkeypatch):
    """status() lists an armed window as active and drops it once done."""
    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
    post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed")

    while_armed = post_deploy.status()
    assert while_armed["enabled"] is True
    assert while_armed["window_s"] == post_deploy.settings.post_deploy_window_s
    assert "ORCH-021" in while_armed["active"]
    assert while_armed["active_count"] >= 1

    # A finished window drops out of "active".
    post_deploy.mark_done("orchestrator", "ORCH-021")
    after_done = post_deploy.status()
    assert "ORCH-021" not in after_done["active"]
|
||||
Reference in New Issue
Block a user