"""ORCH-113 (adr-0043): reaper must not re-run deploy-staging finalization while the original finalizer is alive — finalizer-liveness ownership tests (TC-01..TC-08). Covers the bug from incident ORCH-111 (deployer job 1914): on the ``deploy-staging -> deploy`` edge the live monitor runs the heavy edge sub-gates (security/merge-gate re-test/coverage/image-freshness) in-thread for MINUTES AFTER stamping ``agent_runs.finished_at`` and BEFORE ``_finalize_job``. Reaper Tier-2 measures ``finished_age_s`` from ``finished_at``, so past ``reaper_finalize_grace_s`` it treated the live, long-finalizing monitor as dead and independently re-ran the advance -> a second re-test went red -> false rollback ``deploy-staging -> development`` while the original finalizer concurrently merged the PR. State diverged. The reaper never spawns claude / pytest / docker; we drive the DB directly (a 'running' jobs row + a linked agent_runs exit_code) and the process-local ``finalizer_liveness`` marker, then assert the reaper's terminal flip / deferral. No network, no subprocess — every external is mocked. """ import os import tempfile import pytest # Override env before importing app modules (same convention as test_job_reaper.py). os.environ["ORCH_DB_PATH"] = os.path.join( tempfile.gettempdir(), "test_orch113_reaper.db" ) os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir() os.environ["ORCH_GITEA_TOKEN"] = "test-token" os.environ["ORCH_PLANE_API_TOKEN"] = "test-token" import src.db as db from src.db import init_db, get_db, get_job import src.finalizer_liveness as fl import src.job_reaper as jr from src.job_reaper import JobReaper @pytest.fixture(autouse=True) def fresh_db(tmp_path, monkeypatch): dbfile = tmp_path / "reaper113.db" monkeypatch.setattr(db.settings, "db_path", str(dbfile)) init_db() # Each test starts with a clean ownership registry (process-local module state). with fl._LOCK: fl._OWNED.clear() # Default: kill-switch ON (the fix is active) unless a test flips it. monkeypatch.setattr(db.settings, "reaper_finalizer_liveness_enabled", True) yield with fl._LOCK: fl._OWNED.clear() # --- helpers ---------------------------------------------------------------- def _make_task(repo="orchestrator", branch="feature/orch113", stage="deploy-staging", work_item_id="ORCH-113"): conn = get_db() cur = conn.execute( "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage) " "VALUES (?, ?, ?, ?, ?)", (work_item_id, work_item_id, repo, branch, stage), ) tid = cur.lastrowid conn.commit() conn.close() return tid def _make_running_job(agent="deployer", repo="orchestrator", task_id=None, pid=None, age_s=0, attempts=0, max_attempts=2, run_id=None, exit_code=0, finished_age_s=600): """Insert a job already in 'running' with a linked agent_runs row. ``finished_at`` is back-dated by ``finished_age_s`` so the Tier-2 grace (default 300) is satisfied by default; pass a small value to stay within grace. """ conn = get_db() if run_id is None and exit_code is not None: cur = conn.execute( "INSERT INTO agent_runs (task_id, agent, finished_at, exit_code) " "VALUES (?, ?, datetime('now', ?), ?)", (task_id, agent, f"-{int(finished_age_s)} seconds", exit_code), ) run_id = cur.lastrowid cur = conn.execute( "INSERT INTO jobs (agent, repo, task_id, status, attempts, max_attempts, " "run_id, pid, started_at) " "VALUES (?, ?, ?, 'running', ?, ?, ?, ?, datetime('now', ?))", (agent, repo, task_id, attempts, max_attempts, run_id, pid, f"-{int(age_s)} seconds"), ) job_id = cur.lastrowid conn.commit() conn.close() return job_id def _spy_advance(monkeypatch, side_effect=None): """Patch launcher._try_advance_stage with a call recorder. Returns a mutable ``calls`` list. ``side_effect(run_id, agent, repo, branch)`` runs on each call (e.g. to simulate the false rollback to development). """ import src.agents.launcher as L calls: list = [] def _fake(run_id, agent, repo, branch): calls.append((run_id, agent, repo, branch)) if side_effect is not None: side_effect(run_id, agent, repo, branch) monkeypatch.setattr(L.launcher, "_try_advance_stage", _fake) return calls def _green_gate(monkeypatch): """Force the read-only canonical-QG pre-eval green (staging SUCCESS).""" monkeypatch.setattr(JobReaper, "_gate_is_green", lambda self, stage, job, branch, wid: True) # --- TC-01: live finalizer on deploy-staging is NOT reaped (AC-1/FR-1) ------ def test_tc01_live_finalizer_deploy_staging_not_reaped(monkeypatch): """exit_code=0 and finished_age_s >= grace, but the finalizer is ALIVE (marked) -> reaper does NOT call _try_advance_stage; the row stays running; defer logged.""" _green_gate(monkeypatch) calls = _spy_advance(monkeypatch) tid = _make_task(stage="deploy-staging") jid = _make_running_job(task_id=tid, exit_code=0, finished_age_s=600) # A live monitor owns this finalization. fl.mark(jid, run_id=1, stage="deploy-staging") r = JobReaper() r.reap_once() assert get_job(jid)["status"] == "running" # not reaped assert calls == [] # no second advance assert r.finalizer_defers_total == 1 assert r.reaped_total == 0 # --- TC-02: strict ownership — non-owner runs zero side effects (AC-2/FR-2) -- def test_tc02_non_owner_runs_no_edge_gates(monkeypatch): """While a live finalizer owns the (job, stage), a racing reaper tick performs NO side-effectful advance/merge-gate/re-test (zero side effects).""" _green_gate(monkeypatch) calls = _spy_advance(monkeypatch) tid = _make_task(stage="deploy-staging") jid = _make_running_job(task_id=tid, exit_code=0, finished_age_s=900) fl.mark(jid, run_id=7, stage="deploy-staging") r = JobReaper() # Several ticks while ownership is held — still zero advances, still running. for _ in range(3): r.reap_once() assert calls == [] assert get_job(jid)["status"] == "running" assert r.finalizer_defers_total == 3 # --- TC-03: a genuinely dead finalizer is still reaped (AC-3/FR-4) ---------- def test_tc03_dead_finalizer_still_reaped_tier2(monkeypatch): """No ownership marker (finalizer dead) -> reaper reaps via Tier-2 as before.""" _green_gate(monkeypatch) calls = _spy_advance(monkeypatch) tid = _make_task(stage="deploy-staging") jid = _make_running_job(task_id=tid, exit_code=0, finished_age_s=600) # No fl.mark() -> ownership absent (finalizer dead). r = JobReaper() r.reap_once() assert get_job(jid)["status"] == "done" # reaped to done (gate green) assert len(calls) == 1 # advance driven exactly once assert r.finalizer_defers_total == 0 def test_tc03_tier3_backstop_ignores_marker(monkeypatch): """Even with an active ownership marker, a job past reaper_max_running_s is reaped by the Tier-3 backstop (a stuck finalizer is never immortal).""" monkeypatch.setattr(db.settings, "reaper_max_running_s", 1000) tid = _make_task(stage="deploy-staging") # age beyond the backstop ceiling; exit_code=0 within grace so Tier-2 defers, # but Tier-3 must still fire. jid = _make_running_job(task_id=tid, exit_code=0, finished_age_s=10, age_s=2000, attempts=0, max_attempts=2) fl.mark(jid, run_id=3, stage="deploy-staging") r = JobReaper() r.reap_once() # Backstop reaps to a retry (attempts queued, regardless of the marker. assert get_job(jid)["status"] == "queued" assert r.reaped_total == 1 # --- TC-04: idempotency under race — exactly-once advance (AC-2/AC-4/FR-2) --- def test_tc04_idempotent_no_second_advance_under_race(monkeypatch): """Monitor finalizing (owns the job) + concurrent reaper ticks -> the heavy edge-gate advance is NEVER duplicated by the reaper; no false rollback.""" rolled_back = {"hit": False} def _rollback(run_id, agent, repo, branch): # Simulate the incident: a second advance rolls back to development. conn = get_db() conn.execute("UPDATE tasks SET stage='development' WHERE branch=?", (branch,)) conn.commit() conn.close() rolled_back["hit"] = True _green_gate(monkeypatch) calls = _spy_advance(monkeypatch, side_effect=_rollback) tid = _make_task(stage="deploy-staging") jid = _make_running_job(task_id=tid, exit_code=0, finished_age_s=1200) fl.mark(jid, run_id=9, stage="deploy-staging") r = JobReaper() for _ in range(5): r.reap_once() assert calls == [] # reaper never re-ran the advance assert rolled_back["hit"] is False # The task is NOT rolled back; the live finalizer remains the sole driver. conn = get_db() stage = conn.execute("SELECT stage FROM tasks WHERE id=?", (tid,)).fetchone()[0] conn.close() assert stage == "deploy-staging" # --- TC-05: MANDATORY regression for incident ORCH-111 (AC-4/FR-5) ---------- def test_tc05_orch111_no_false_rollback_no_retry_increment(monkeypatch): """Long (> grace) deploy-staging finalization at staging_status=SUCCESS while the deploy finalizer concurrently reaches success/merge -> reaper does NOT roll back deploy-staging -> development and does NOT increment developer-retry; the task keeps a single consistent state. RED before the fix, GREEN after.""" def _rollback(run_id, agent, repo, branch): # Simulate the incident: a second advance rolls the task back to development # and spawns a fresh developer run (the developer-retry count is derived from # agent_runs — stage_engine.developer_retry_count). conn = get_db() conn.execute("UPDATE tasks SET stage='development' WHERE branch=?", (branch,)) _t = conn.execute("SELECT id FROM tasks WHERE branch=?", (branch,)).fetchone() conn.execute( "INSERT INTO agent_runs (task_id, agent) VALUES (?, 'developer')", (_t[0],), ) conn.commit() conn.close() from src.stage_engine import developer_retry_count _green_gate(monkeypatch) # staging_status SUCCESS calls = _spy_advance(monkeypatch, side_effect=_rollback) tid = _make_task(stage="deploy-staging") jid = _make_running_job(task_id=tid, exit_code=0, finished_age_s=1500) # The original finalizer is still alive (running the heavy edge sub-gates). fl.mark(jid, run_id=11, stage="deploy-staging") r = JobReaper() r.reap_once() # No second advance => no false rollback, no developer-retry increment. assert calls == [] conn = get_db() stage = conn.execute("SELECT stage FROM tasks WHERE id=?", (tid,)).fetchone()[0] conn.close() assert stage == "deploy-staging" # NOT rolled back to development assert developer_retry_count(tid) == 0 # developer-retry NOT incremented assert get_job(jid)["status"] == "running" assert r.finalizer_defers_total == 1 # --- TC-06: compatibility guard — kill-switch / non-deploy-staging (AC-5) ---- def test_tc06_killswitch_off_byte_for_byte_prior(monkeypatch): """Kill-switch OFF -> the marker is ignored; a deploy-staging exit0/past-grace job is reaped exactly as before the fix (advance driven once).""" monkeypatch.setattr(db.settings, "reaper_finalizer_liveness_enabled", False) _green_gate(monkeypatch) calls = _spy_advance(monkeypatch) tid = _make_task(stage="deploy-staging") jid = _make_running_job(task_id=tid, exit_code=0, finished_age_s=600) fl.mark(jid, run_id=5, stage="deploy-staging") # marked, but ignored r = JobReaper() r.reap_once() assert get_job(jid)["status"] == "done" assert len(calls) == 1 assert r.finalizer_defers_total == 0 def test_tc06_non_deploy_staging_stage_not_consulted(monkeypatch): """A non-deploy-staging stage is never consulted -> reaped as before even when an (irrelevant) marker happens to be present.""" _green_gate(monkeypatch) calls = _spy_advance(monkeypatch) tid = _make_task(stage="testing") # deployer also owns 'testing' jid = _make_running_job(task_id=tid, agent="deployer", exit_code=0, finished_age_s=600) fl.mark(jid, run_id=6, stage="testing") r = JobReaper() r.reap_once() assert get_job(jid)["status"] == "done" assert len(calls) == 1 assert r.finalizer_defers_total == 0 def test_tc06_within_grace_unchanged(monkeypatch): """Within the finalization grace the Tier-2 path is unchanged (deferred, not reaped) regardless of the marker — the fix only acts PAST the grace.""" monkeypatch.setattr(db.settings, "reaper_finalize_grace_s", 300) _green_gate(monkeypatch) calls = _spy_advance(monkeypatch) tid = _make_task(stage="deploy-staging") jid = _make_running_job(task_id=tid, exit_code=0, finished_age_s=5) # < grace r = JobReaper() r.reap_once() assert get_job(jid)["status"] == "running" assert calls == [] # Within-grace deferral is the legacy path, not a finalizer-liveness defer. assert r.finalizer_defers_total == 0 # --- TC-07: cross-cutting budget invariant (NFR-6/AC-5) --------------------- def test_tc07_budget_invariant_preserved(): """reaper_max_running_s (5400) > Σ(deploy-staging gate-work) + grace; the fix changed neither the grace nor the ceiling (ORCH-065/109/110 invariant).""" s = jr.settings # The fix did not touch these (zero schema/budget change). assert s.reaper_finalize_grace_s == 300 assert s.reaper_max_running_s == 5400 # Conservative Σ of the heavy deploy-staging gate-work + grace must fit. sigma = s.merge_retest_timeout_s + s.coverage_run_timeout_s assert s.reaper_max_running_s > sigma + s.reaper_finalize_grace_s # --- TC-08: never-raise — a fault in the liveness path degrades safely ------- def test_tc08_liveness_error_never_breaks_tick(monkeypatch): """An exception inside the ownership consultation must not crash the tick; the job is still processed (conservatively reaped, never blocked).""" def _boom(job_id): raise RuntimeError("registry exploded") monkeypatch.setattr(fl, "is_active", _boom) _green_gate(monkeypatch) calls = _spy_advance(monkeypatch) tid = _make_task(stage="deploy-staging") jid = _make_running_job(task_id=tid, exit_code=0, finished_age_s=600) fl.mark(jid, run_id=2, stage="deploy-staging") r = JobReaper() r.reap_once() # must not raise # is_active raised -> _finalizer_owns conservatively returns False -> reaped. assert get_job(jid)["status"] == "done" assert len(calls) == 1 def test_tc08_reap_once_isolates_and_never_raises(monkeypatch): """A fault while resolving one job's metadata is isolated; reap_once never raises and other jobs are still processed.""" def _boom(self, job): raise RuntimeError("meta exploded") monkeypatch.setattr(JobReaper, "_task_meta", _boom) tid = _make_task(stage="deploy-staging") _make_running_job(task_id=tid, exit_code=0, finished_age_s=600) r = JobReaper() r.reap_once() # outer + per-job never-raise -> no exception propagates def test_tc08_finalizer_liveness_leaf_never_raises(): """The leaf degrades safely on bad input / None job_id.""" fl.mark(None, None, None) # no-op, no raise fl.clear(None) # no-op, no raise assert fl.is_active(None) is False fl.mark(1234, 1, "deploy-staging") assert fl.is_active(1234) is True snap = fl.snapshot() assert snap["active"] >= 1 and 1234 in snap["jobs"] fl.clear(1234) assert fl.is_active(1234) is False