feat(post-deploy): post-deploy prod monitoring + degradation reaction (ORCH-021)

Extend pipeline responsibility past deploy->done: after the terminal transition for an applicable repo, arm a ~15min observation window that probes prod and reacts to a degradation the restart-time health-check missed ("green deploy, red prod").

- src/post_deploy.py: new leaf module (config + lazy qg/db only). Sentinel-file restart-safe state (.post-deploy-state-<repo>/<wi>/), no DB migration. probe_signals/classify/decide_action/run_rollback, all never-raise.
- Reserved-agent job `post-deploy-monitor` (no-LLM, Variant B, calque of deploy-finalizer): self-requeues each tick via enqueue_job.
- Deterministic classify: DEGRADED iff >= fail_threshold consecutive health failures OR window 5xx ratio > 5xx_threshold; fail-safe HEALTHY.
- Self-hosting invariant (BR-5/AC-8): a tick NEVER restarts the prod orchestrator container -> orchestrator is ALWAYS ALERT_ONLY.
- Conditionality (ORCH-35/36/43/58): kill-switch + CSV repos, empty -> self-hosting only.
- QG_CHECKS / STAGE_TRANSITIONS / schema unchanged (AC-12).
- Docs: CHANGELOG, CLAUDE artefact list (16-post-deploy-log.md), architecture README, .env.example (ORCH_POST_DEPLOY_*).

Refs: ORCH-021
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-07 14:16:12 +00:00
parent a4ad55c862
commit 8273c1fc9d
12 changed files with 1322 additions and 3 deletions
--- a/tests/test_deploy_terminal_sync.py
+++ b/tests/test_deploy_terminal_sync.py
@@ -90,6 +90,10 @@ def test_tc17_success_deploy_syncs_terminal_done(monkeypatch):
    # Spy the merge-lease release to confirm the terminal-sync still frees it.
    release = MagicMock()
    monkeypatch.setattr(stage_engine.merge_gate, "release_merge_lease", release)
+    # ORCH-021 arms an orthogonal post-deploy-monitor reserved job at deploy->done
+    # for the self-hosting repo; disable it here so this test stays focused on the
+    # ORCH-036 terminal-sync contract (no PIPELINE agent launched leaving deploy).
+    monkeypatch.setattr(stage_engine.post_deploy.settings, "post_deploy_monitor_enabled", False)

    task_id = _make_task("deploy")
    stage_engine.run_deploy_finalizer(
--- a/tests/test_post_deploy.py
+++ b/tests/test_post_deploy.py
@@ -0,0 +1,210 @@
+"""ORCH-021 unit tests — post-deploy monitor pure logic (TC-01..TC-15).
+
+The deterministic, network-free core (classification + reaction decision +
+exit-code mapping + artefact frontmatter + never-raise) of ``src/post_deploy.py``.
+Network probes and the rollback hook are exercised via mocks; the classifier is
+the main subject (mirrors compute_staging_verdict in ORCH-061).
+"""
+
+import os
+import tempfile
+
+import pytest
+import yaml
+
+# Provide dummy API tokens BEFORE importing the module; repos_dir itself is redirected to a tmp dir by the autouse _tmp_state fixture below.
+os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
+os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
+
+from src import post_deploy  # noqa: E402
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _probe(health_ok=True, total=2, fivexx=0):
+    return {"health_ok": health_ok, "total": total, "fivexx": fivexx}
+
+
+@pytest.fixture(autouse=True)
+def _tmp_state(monkeypatch, tmp_path):
+    monkeypatch.setattr(post_deploy.settings, "repos_dir", str(tmp_path))
+    monkeypatch.setattr(post_deploy.settings, "host_repos_dir", str(tmp_path))
+    yield
+
+
+# ---------------------------------------------------------------------------
+# TC-01..TC-05 — classification (the core)
+# ---------------------------------------------------------------------------
+def test_tc01_healthy_no_failures():
+    series = [_probe() for _ in range(5)]
+    assert post_deploy.classify(series, fail_threshold=3, fivexx_threshold=0.5) == "HEALTHY"
+
+
+def test_tc02_degraded_consecutive_health_failures():
+    # Exactly fail_threshold consecutive failures -> DEGRADED (>= contract).
+    series = [_probe(health_ok=False) for _ in range(3)]
+    assert post_deploy.classify(series, fail_threshold=3, fivexx_threshold=0.5) == "DEGRADED"
+
+
+def test_tc03_degraded_by_5xx_ratio_even_when_health_200():
+    # /health stays 200 (health_ok True) but the 5xx ratio is above threshold.
+    series = [_probe(health_ok=True, total=2, fivexx=2) for _ in range(3)]
+    assert post_deploy.classify(series, fail_threshold=10, fivexx_threshold=0.5) == "DEGRADED"
+
+
+def test_tc04_no_false_trip_single_glitch_then_recovery():
+    # One isolated failure (1 < threshold) surrounded by healthy probes -> HEALTHY.
+    series = [_probe(), _probe(health_ok=False), _probe(), _probe()]
+    assert post_deploy.classify(series, fail_threshold=3, fivexx_threshold=0.5) == "HEALTHY"
+
+
+def test_tc05_thresholds_change_verdict_on_same_data():
+    # Same data, different threshold flips the verdict (AC-11): two consecutive fails.
+    series = [_probe(health_ok=False), _probe(health_ok=False)]
+    assert post_deploy.classify(series, fail_threshold=3, fivexx_threshold=0.5) == "HEALTHY"
+    assert post_deploy.classify(series, fail_threshold=2, fivexx_threshold=0.5) == "DEGRADED"
+
+
+def test_classify_uses_settings_thresholds(monkeypatch):
+    # The tick reads thresholds from Settings (env ORCH_*) — verify the wiring point.
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_fail_threshold", 2)
+    series = [_probe(health_ok=False), _probe(health_ok=False)]
+    assert post_deploy.classify(
+        series,
+        post_deploy.settings.post_deploy_fail_threshold,
+        post_deploy.settings.post_deploy_5xx_threshold,
+    ) == "DEGRADED"
+
+
+# ---------------------------------------------------------------------------
+# TC-06..TC-08 — reaction decision (self-hosting safety)
+# ---------------------------------------------------------------------------
+def test_tc06_nonself_auto_rollback_degraded_rolls_back(monkeypatch):
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True)
+    assert post_deploy.decide_action("enduro-trails", "DEGRADED") == "ROLLBACK"
+
+
+def test_tc07_self_hosting_degraded_never_rolls_back(monkeypatch):
+    # orchestrator (self-hosting) is ALWAYS ALERT_ONLY, even with auto_rollback on.
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True)
+    assert post_deploy.decide_action("orchestrator", "DEGRADED") == "ALERT_ONLY"
+
+
+def test_tc08_healthy_means_none_for_any_repo():
+    assert post_deploy.decide_action("orchestrator", "HEALTHY") == "NONE"
+    assert post_deploy.decide_action("enduro-trails", "HEALTHY") == "NONE"
+
+
+def test_nonself_default_policy_alert_only(monkeypatch):
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", False)
+    assert post_deploy.decide_action("enduro-trails", "DEGRADED") == "ALERT_ONLY"
+
+
+# ---------------------------------------------------------------------------
+# TC-09..TC-10 — conditionality / kill-switch
+# ---------------------------------------------------------------------------
+def test_tc09_applies_empty_repos_only_self_hosting(monkeypatch):
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_repos", "")
+    assert post_deploy.post_deploy_applies("orchestrator") is True
+    assert post_deploy.post_deploy_applies("enduro-trails") is False
+
+
+def test_tc09_applies_explicit_repos_csv(monkeypatch):
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_repos", "enduro-trails")
+    assert post_deploy.post_deploy_applies("enduro-trails") is True
+    assert post_deploy.post_deploy_applies("orchestrator") is False
+
+
+def test_tc10_kill_switch_disables_for_everyone(monkeypatch):
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", False)
+    assert post_deploy.post_deploy_applies("orchestrator") is False
+    assert post_deploy.post_deploy_applies("enduro-trails") is False
+
+
+# ---------------------------------------------------------------------------
+# TC-11..TC-12 — rollback exit-code mapping
+# ---------------------------------------------------------------------------
+def test_tc11_rollback_exit0_is_ok():
+    assert post_deploy.map_rollback_exit_code(0) == "ROLLBACK_OK"
+
+
+def test_tc12_rollback_exit_nonzero_is_failed():
+    assert post_deploy.map_rollback_exit_code(1) == "ROLLBACK_FAILED"
+    assert post_deploy.map_rollback_exit_code(2) == "ROLLBACK_FAILED"
+    assert post_deploy.map_rollback_exit_code(None) == "ROLLBACK_FAILED"
+    assert post_deploy.map_rollback_exit_code("garbage") == "ROLLBACK_FAILED"
+
+
+# ---------------------------------------------------------------------------
+# TC-13 — artefact frontmatter
+# ---------------------------------------------------------------------------
+def test_tc13_log_frontmatter_parses():
+    body = post_deploy.build_post_deploy_log(
+        "ORCH-021", "DEGRADED", "ALERT_ONLY", 900, 12, 4
+    )
+    assert body.startswith("---\n")
+    fm = body.split("---", 2)[1]
+    data = yaml.safe_load(fm)
+    assert data["post_deploy_status"] == "DEGRADED"
+    assert data["action_taken"] == "ALERT_ONLY"
+    assert data["work_item"] == "ORCH-021"
+    assert data["window_s"] == 900
+    assert data["checks_total"] == 12
+    assert data["checks_failed"] == 4
+
+
+# ---------------------------------------------------------------------------
+# TC-14..TC-15 — never-raise
+# ---------------------------------------------------------------------------
+def test_tc14_probe_network_error_is_conservative_not_raise(monkeypatch):
+    # urlopen raises on every call -> health bad + monitored endpoints counted as
+    # 5xx, but NO exception propagates (the helper swallows and reports code 0).
+    def boom(*a, **k):
+        raise OSError("network down")
+
+    monkeypatch.setattr(post_deploy.urllib.request, "urlopen", boom)
+    res = post_deploy.probe_signals("http://localhost:8500")
+    assert res.health_ok is False
+    assert res.total == 2
+    assert res.fivexx == 2  # unreachable endpoints counted as failures
+
+
+def test_tc14_classify_junk_input_swallowed():
+    # If classify gets junk it must not raise (fail-safe to HEALTHY).
+    assert post_deploy.classify("not-a-list", 3, 0.5) == "HEALTHY"
+    assert post_deploy.classify([{"bad": "row"}], 3, 0.5) == "HEALTHY"
+    assert post_deploy.classify(None, 3, 0.5) == "HEALTHY"
+
+
+def test_tc15_write_log_no_worktree_returns_false(monkeypatch):
+    # get_worktree_path raises -> write returns False, no exception (best-effort).
+    def boom(repo, branch):
+        raise FileNotFoundError("no worktree")
+
+    monkeypatch.setattr("src.git_worktree.get_worktree_path", boom)
+    ok = post_deploy.write_post_deploy_log(
+        "nope-repo", "ORCH-021", "feature/x", "HEALTHY", "NONE", 900, 3, 0
+    )
+    assert ok is False
+
+
+# ---------------------------------------------------------------------------
+# Sentinel state restart-safe counters
+# ---------------------------------------------------------------------------
+def test_series_append_and_read_roundtrip():
+    post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed")
+    post_deploy.append_probe("orchestrator", "ORCH-021", post_deploy.ProbeResult(False, 2, 1, "x"))
+    post_deploy.append_probe("orchestrator", "ORCH-021", post_deploy.ProbeResult(True, 2, 0, "y"))
+    series = post_deploy.read_series("orchestrator", "ORCH-021")
+    assert len(series) == 2
+    assert series[0]["health_ok"] is False
+    assert series[1]["health_ok"] is True
+
+
+def test_mark_done_idempotency_marker():
+    assert post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.DONE) is False
+    post_deploy.mark_done("orchestrator", "ORCH-021")
+    assert post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.DONE) is True
--- a/tests/test_post_deploy_integration.py
+++ b/tests/test_post_deploy_integration.py
@@ -0,0 +1,259 @@
+"""ORCH-021 integration tests — arming + tick orchestration (TC-16..TC-20).
+
+Exercises the wiring in ``stage_engine`` (arm on deploy->done,
+``run_post_deploy_monitor`` tick + reaction) and the ``/queue`` observability
+block, with the network probe and the rollback hook mocked. Mirrors the
+test_deploy_terminal_sync.py harness.
+"""
+
+import os
+import tempfile
+
+import pytest
+
+_test_db = os.path.join(tempfile.gettempdir(), "test_orch_post_deploy.db")
+os.environ["ORCH_DB_PATH"] = _test_db
+os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir()
+os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
+os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
+
+from unittest.mock import MagicMock  # noqa: E402
+
+import src.db as _db  # noqa: E402
+from src.db import init_db, get_db  # noqa: E402
+from src import stage_engine  # noqa: E402
+from src import post_deploy  # noqa: E402
+
+
+@pytest.fixture(autouse=True)
+def fresh_db(monkeypatch, tmp_path):
+    monkeypatch.setattr(_db.settings, "db_path", _test_db)
+    if os.path.exists(_test_db):
+        os.unlink(_test_db)
+    init_db()
+    # State sentinels live under the tmp repos_dir (container view).
+    monkeypatch.setattr(post_deploy.settings, "repos_dir", str(tmp_path))
+    monkeypatch.setattr(post_deploy.settings, "host_repos_dir", str(tmp_path))
+    monkeypatch.setattr(stage_engine.settings, "repos_dir", str(tmp_path))
+    # The artefact write is best-effort; stub it so no worktree is needed.
+    monkeypatch.setattr(post_deploy, "write_post_deploy_log", MagicMock(return_value=True))
+    yield
+
+
+@pytest.fixture(autouse=True)
+def silence_side_effects(monkeypatch):
+    for name in (
+        "notify_stage_change", "notify_qg_failure", "notify_approve_requested",
+        "send_telegram", "plane_notify_stage", "plane_notify_qg", "plane_add_comment",
+        "set_issue_in_review", "set_issue_needs_input", "set_issue_in_progress",
+        "set_issue_blocked", "set_issue_done",
+    ):
+        monkeypatch.setattr(stage_engine, name, MagicMock())
+
+
+def _make_task(stage, repo="orchestrator", branch="feature/ORCH-021-x", wi="ORCH-021"):
+    conn = get_db()
+    cur = conn.execute(
+        "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) "
+        "VALUES (?, ?, ?, ?, ?)",
+        (f"plane-{wi}", wi, repo, branch, stage),
+    )
+    task_id = cur.lastrowid
+    conn.commit()
+    conn.close()
+    return task_id
+
+
+def _jobs(agent=None):
+    conn = get_db()
+    if agent:
+        rows = conn.execute(
+            "SELECT agent FROM jobs WHERE agent=? ORDER BY id", (agent,)
+        ).fetchall()
+    else:
+        rows = conn.execute("SELECT agent FROM jobs ORDER BY id").fetchall()
+    conn.close()
+    return [r[0] for r in rows]
+
+
+def _pass(*a, **k):
+    return (True, "ok")
+
+
+def _drive_deploy_to_done(monkeypatch, task_id, repo="orchestrator",
+                          branch="feature/ORCH-021-x", wi="ORCH-021"):
+    """Advance a deploy-stage task to done through the real terminal block."""
+    monkeypatch.setattr(
+        stage_engine, "QG_CHECKS",
+        {**stage_engine.QG_CHECKS, "check_deploy_status": _pass},
+    )
+    monkeypatch.setattr(stage_engine.merge_gate, "release_merge_lease", MagicMock())
+    return stage_engine.advance_stage(
+        task_id=task_id, current_stage="deploy", repo=repo,
+        work_item_id=wi, branch=branch, finished_agent="deployer",
+    )
+
+
+# ---------------------------------------------------------------------------
+# TC-16 — arm on deploy->done (applicable repo only)
+# ---------------------------------------------------------------------------
+def test_tc16_arm_for_self_hosting(monkeypatch):
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_repos", "")
+    task_id = _make_task("deploy")
+    _drive_deploy_to_done(monkeypatch, task_id)
+
+    assert post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.ARMED)
+    assert "post-deploy-monitor" in _jobs("post-deploy-monitor")
+
+
+def test_tc16_no_arm_for_nonself(monkeypatch):
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_repos", "")
+    task_id = _make_task("deploy", repo="enduro-trails", branch="feature/ET-9", wi="ET-9")
+    _drive_deploy_to_done(monkeypatch, task_id, repo="enduro-trails",
+                          branch="feature/ET-9", wi="ET-9")
+
+    assert not post_deploy.has_marker("enduro-trails", "ET-9", post_deploy.ARMED)
+    assert _jobs("post-deploy-monitor") == []
+
+
+def test_tc16_no_arm_when_kill_switch_off(monkeypatch):
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", False)
+    task_id = _make_task("deploy")
+    _drive_deploy_to_done(monkeypatch, task_id)
+    assert not post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.ARMED)
+    assert _jobs("post-deploy-monitor") == []
+
+
+# ---------------------------------------------------------------------------
+# TC-17 — idempotent arm (double webhook)
+# ---------------------------------------------------------------------------
+def test_tc17_double_arm_is_noop(monkeypatch):
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
+    armed1 = post_deploy.arm_monitor("orchestrator", "ORCH-021", "feature/ORCH-021-x", 1)
+    armed2 = post_deploy.arm_monitor("orchestrator", "ORCH-021", "feature/ORCH-021-x", 1)
+    assert armed1 is True
+    assert armed2 is False
+    # Exactly ONE monitor job enqueued despite two arm calls.
+    assert _jobs("post-deploy-monitor") == ["post-deploy-monitor"]
+
+
+# ---------------------------------------------------------------------------
+# TC-18 — DEGRADED -> non-self auto-rollback (hook mocked)
+# ---------------------------------------------------------------------------
+def test_tc18_degraded_nonself_rolls_back(monkeypatch):
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_repos", "enduro-trails")
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_fail_threshold", 1)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_window_s", 30)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_interval_s", 30)  # budget=1 tick
+    # Probe reports unhealthy.
+    monkeypatch.setattr(
+        post_deploy, "probe_signals",
+        lambda url: post_deploy.ProbeResult(False, 2, 2, "down"),
+    )
+    rollback = MagicMock(return_value=(0, "ok"))
+    monkeypatch.setattr(post_deploy, "run_rollback", rollback)
+    notify = MagicMock()
+    monkeypatch.setattr(stage_engine, "_notify_post_deploy", notify)
+    logspy = MagicMock(return_value=True)
+    monkeypatch.setattr(post_deploy, "write_post_deploy_log", logspy)
+
+    task_id = _make_task("done", repo="enduro-trails", branch="feature/ET-9", wi="ET-9")
+    post_deploy.write_marker("enduro-trails", "ET-9", post_deploy.ARMED, "armed")
+    stage_engine.run_post_deploy_monitor(
+        {"task_id": task_id, "repo": "enduro-trails", "id": 1, "agent": "post-deploy-monitor"}
+    )
+
+    rollback.assert_called_once_with("enduro-trails")
+    assert post_deploy.has_marker("enduro-trails", "ET-9", post_deploy.DONE)
+    # Artefact written with ROLLBACK_OK; a notification was sent.
+    args = logspy.call_args[0]
+    assert "DEGRADED" in args
+    assert "ROLLBACK_OK" in args
+    assert notify.called
+
+
+# ---------------------------------------------------------------------------
+# TC-19 — self-hosting DEGRADED never rolls back, alerts instead
+# ---------------------------------------------------------------------------
+def test_tc19_degraded_self_hosting_alert_only(monkeypatch):
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_auto_rollback", True)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_fail_threshold", 1)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_window_s", 30)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_interval_s", 30)
+    monkeypatch.setattr(
+        post_deploy, "probe_signals",
+        lambda url: post_deploy.ProbeResult(False, 2, 2, "down"),
+    )
+    # Rollback hook MUST NOT be called for self-hosting (AC-8 structural invariant).
+    rollback = MagicMock(return_value=(0, "ok"))
+    monkeypatch.setattr(post_deploy, "run_rollback", rollback)
+    notify = MagicMock()
+    monkeypatch.setattr(stage_engine, "_notify_post_deploy", notify)
+    logspy = MagicMock(return_value=True)
+    monkeypatch.setattr(post_deploy, "write_post_deploy_log", logspy)
+
+    task_id = _make_task("done")
+    post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed")
+    stage_engine.run_post_deploy_monitor(
+        {"task_id": task_id, "repo": "orchestrator", "id": 1, "agent": "post-deploy-monitor"}
+    )
+
+    rollback.assert_not_called()
+    assert post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.DONE)
+    args = logspy.call_args[0]
+    assert "DEGRADED" in args
+    assert "ALERT_ONLY" in args
+    assert notify.called
+
+
+def test_healthy_tick_requeues_without_finishing(monkeypatch):
+    # HEALTHY and window not exhausted -> re-queue, do NOT mark done.
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_window_s", 90)
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_interval_s", 30)  # budget=3
+    monkeypatch.setattr(
+        post_deploy, "probe_signals",
+        lambda url: post_deploy.ProbeResult(True, 2, 0, "ok"),
+    )
+    task_id = _make_task("done")
+    post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed")
+    stage_engine.run_post_deploy_monitor(
+        {"task_id": task_id, "repo": "orchestrator", "id": 1, "agent": "post-deploy-monitor"}
+    )
+    assert not post_deploy.has_marker("orchestrator", "ORCH-021", post_deploy.DONE)
+    # A follow-up tick job was enqueued.
+    assert _jobs("post-deploy-monitor") == ["post-deploy-monitor"]
+
+
+def test_finished_window_tick_is_noop(monkeypatch):
+    # AC-15: a tick after the window is done is a no-op; only "no re-probe" is asserted here (the "no new job" half is not checked by this test).
+    probe = MagicMock()
+    monkeypatch.setattr(post_deploy, "probe_signals", probe)
+    task_id = _make_task("done")
+    post_deploy.mark_done("orchestrator", "ORCH-021")
+    stage_engine.run_post_deploy_monitor(
+        {"task_id": task_id, "repo": "orchestrator", "id": 9, "agent": "post-deploy-monitor"}
+    )
+    probe.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# TC-20 — /queue observability block
+# ---------------------------------------------------------------------------
+def test_tc20_queue_block_present(monkeypatch):
+    monkeypatch.setattr(post_deploy.settings, "post_deploy_monitor_enabled", True)
+    post_deploy.write_marker("orchestrator", "ORCH-021", post_deploy.ARMED, "armed")
+    snap = post_deploy.status()
+    assert snap["enabled"] is True
+    assert snap["window_s"] == post_deploy.settings.post_deploy_window_s
+    assert "ORCH-021" in snap["active"]
+    assert snap["active_count"] >= 1
+    # A finished window drops out of "active".
+    post_deploy.mark_done("orchestrator", "ORCH-021")
+    snap2 = post_deploy.status()
+    assert "ORCH-021" not in snap2["active"]