fix(reaper): Tier-2 finalization grace + claim-before-act (no dup advance)
Tier-2 reaped a LIVE, still-finalizing monitor: _monitor_agent writes agent_runs.exit_code FIRST, then does git push / PR / Plane comments before _finalize_job, and the agent pid is already dead in that window — so the old "exit_code recorded -> reap now" had no grace and could race a healthy job. Worse, _reap_known_outcome ran the advance (advance_stage -> enqueue_job) BEFORE the atomic claim, so a reaper that lost the race had already enqueued the next stage (dup advance / dup enqueue), violating ADR-001 Р-1. Fix: - Tier-2 grace: reap only once agent_runs.exit_code has been recorded for >= reaper_finalize_grace_s (new setting, default 300s; > max finalization window). A live finalizing monitor is never reaped (FR-1.3/AC-3). New finished_age_s column computed in get_running_jobs. - claim-before-act for exit0: evaluate the canonical QG READ-ONLY (the reconciler pattern) to choose the terminal status, then atomically claim 'done' FIRST; only the claim winner runs the advance. A loser performs no side effects -> no dup advance / dup enqueue. Docs (golden source) updated in the same change: ADR-001, global adr-0011, README, internals, .env.example, CHANGELOG (also fixes the P3 broken adr-0011 link). New tests cover the grace window, lost-claim no-side-effects, and the already-advanced idempotent path. Refs: ORCH-065 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -32,18 +32,22 @@ def fresh_db(tmp_path, monkeypatch):
|
||||
# --- helpers ----------------------------------------------------------------
|
||||
def _make_running_job(agent="developer", repo="orchestrator", task_id=None,
|
||||
pid=None, age_s=0, attempts=0, max_attempts=2,
|
||||
run_id=None, exit_code=None):
|
||||
run_id=None, exit_code=None, finished_age_s=600):
|
||||
"""Insert a job already in 'running' with the given pid/age/attempts.
|
||||
|
||||
started_at is back-dated by ``age_s`` seconds so running_age_s reflects it.
|
||||
When ``exit_code`` is given an agent_runs row is created and linked (Tier-2).
|
||||
When ``exit_code`` is given an agent_runs row is created and linked (Tier-2);
|
||||
its ``finished_at`` is back-dated by ``finished_age_s`` seconds so the
|
||||
Tier-2 finalization grace (``reaper_finalize_grace_s``, default 300) is
|
||||
satisfied by default — pass a small ``finished_age_s`` to exercise the
|
||||
"monitor may still be finalizing" deferral.
|
||||
"""
|
||||
conn = get_db()
|
||||
if run_id is None and exit_code is not None:
|
||||
cur = conn.execute(
|
||||
"INSERT INTO agent_runs (task_id, agent, finished_at, exit_code) "
|
||||
"VALUES (?, ?, datetime('now'), ?)",
|
||||
(task_id, agent, exit_code),
|
||||
"VALUES (?, ?, datetime('now', ?), ?)",
|
||||
(task_id, agent, f"-{int(finished_age_s)} seconds", exit_code),
|
||||
)
|
||||
run_id = cur.lastrowid
|
||||
cur = conn.execute(
|
||||
@@ -153,11 +157,20 @@ def test_tc04_backstop_no_pid(monkeypatch):
|
||||
|
||||
|
||||
# --- TC-05: correct outcome by exit_code (Tier-2) ---------------------------
|
||||
def _gate(monkeypatch, green: bool):
|
||||
"""Force the reaper's READ-ONLY gate pre-evaluation to green/red."""
|
||||
monkeypatch.setattr(
|
||||
JobReaper, "_gate_is_green",
|
||||
lambda self, stage, job, branch, wid: green,
|
||||
)
|
||||
|
||||
|
||||
def test_tc05_exit0_gate_green_done(monkeypatch):
|
||||
# A developer job runs to LEAVE the 'architecture' stage (-> 'development').
|
||||
tid = _make_task(stage="architecture")
|
||||
jid = _make_running_job(agent="developer", task_id=tid, exit_code=0)
|
||||
# gate green -> advance succeeds (stage leaves the developer candidate set).
|
||||
_gate(monkeypatch, green=True)
|
||||
# gate green -> the claim flips 'done' FIRST, then the advance runs.
|
||||
import src.agents.launcher as L
|
||||
monkeypatch.setattr(
|
||||
L.launcher, "_try_advance_stage",
|
||||
@@ -172,13 +185,31 @@ def test_tc05_exit0_gate_red_requeues(monkeypatch):
|
||||
tid = _make_task(stage="architecture")
|
||||
jid = _make_running_job(agent="developer", task_id=tid, exit_code=0,
|
||||
attempts=0, max_attempts=2)
|
||||
# gate red -> _try_advance_stage is a no-op (stage stays 'architecture').
|
||||
_gate(monkeypatch, green=False) # read-only pre-eval says red
|
||||
# The advance path must NEVER run when the gate is red (claim-before-act).
|
||||
import src.agents.launcher as L
|
||||
called = []
|
||||
monkeypatch.setattr(L.launcher, "_try_advance_stage",
|
||||
lambda run_id, agent, repo, branch: None)
|
||||
lambda run_id, agent, repo, branch: called.append(1))
|
||||
r = JobReaper()
|
||||
r.reap_once()
|
||||
assert get_job(jid)["status"] == "queued" # exit0 but gate red -> not 'done'
|
||||
assert not called, "no advance/side-effects on a red gate"
|
||||
|
||||
|
||||
def test_tc05_exit0_already_advanced_done_no_side_effects(monkeypatch):
|
||||
# Stage already past the developer candidate set -> idempotent clean 'done'
|
||||
# with NO advance call (the monitor already advanced before dying).
|
||||
tid = _make_task(stage="development") # developer's candidate is 'architecture'
|
||||
jid = _make_running_job(agent="developer", task_id=tid, exit_code=0)
|
||||
import src.agents.launcher as L
|
||||
called = []
|
||||
monkeypatch.setattr(L.launcher, "_try_advance_stage",
|
||||
lambda run_id, agent, repo, branch: called.append(1))
|
||||
r = JobReaper()
|
||||
r.reap_once()
|
||||
assert get_job(jid)["status"] == "done"
|
||||
assert not called, "already-advanced reap must not re-advance"
|
||||
|
||||
|
||||
def test_tc05_nonzero_exit_requeue_then_failed(monkeypatch):
|
||||
@@ -201,6 +232,78 @@ def test_tc05_nonzero_exit_requeue_then_failed(monkeypatch):
|
||||
assert sent, "failed reap must send a Telegram alert"
|
||||
|
||||
|
||||
# --- TC-05b: Tier-2 finalization grace (live monitor still finalizing) -------
|
||||
def test_tc05_tier2_within_grace_not_reaped(monkeypatch):
|
||||
"""exit_code freshly recorded -> a LIVE monitor may still be finalizing.
|
||||
|
||||
The reaper must NOT reap it within ``reaper_finalize_grace_s`` (FR-1.3/AC-3:
|
||||
a live finalizing monitor — git push / PR / Plane comments — is never reaped,
|
||||
no dup advance / enqueue).
|
||||
"""
|
||||
monkeypatch.setattr(db.settings, "reaper_finalize_grace_s", 300)
|
||||
tid = _make_task(stage="architecture")
|
||||
# exit_code recorded only 5s ago -> still inside the finalization grace.
|
||||
jid = _make_running_job(agent="developer", task_id=tid, exit_code=0,
|
||||
finished_age_s=5)
|
||||
import src.agents.launcher as L
|
||||
called = []
|
||||
monkeypatch.setattr(L.launcher, "_try_advance_stage",
|
||||
lambda run_id, agent, repo, branch: called.append(1))
|
||||
r = JobReaper()
|
||||
r.reap_once()
|
||||
assert get_job(jid)["status"] == "running" # deferred, NOT reaped
|
||||
assert r.reaped_total == 0
|
||||
assert not called, "a live finalizing monitor must not be advanced by the reaper"
|
||||
|
||||
|
||||
def test_tc05_tier2_after_grace_reaped(monkeypatch):
|
||||
"""Once exit_code has been recorded longer than the grace, the monitor is
|
||||
genuinely dead and the Tier-2 reap proceeds."""
|
||||
monkeypatch.setattr(db.settings, "reaper_finalize_grace_s", 300)
|
||||
tid = _make_task(stage="architecture")
|
||||
jid = _make_running_job(agent="developer", task_id=tid, exit_code=0,
|
||||
finished_age_s=600) # well past the grace
|
||||
_gate(monkeypatch, green=True)
|
||||
import src.agents.launcher as L
|
||||
monkeypatch.setattr(
|
||||
L.launcher, "_try_advance_stage",
|
||||
lambda run_id, agent, repo, branch: db.update_task_stage(tid, "development"),
|
||||
)
|
||||
r = JobReaper()
|
||||
r.reap_once()
|
||||
assert get_job(jid)["status"] == "done"
|
||||
|
||||
|
||||
def test_tc05_tier2_lost_claim_no_side_effects(monkeypatch):
|
||||
"""claim-BEFORE-act: when another actor (a late monitor / startup requeue)
|
||||
moves the row out of 'running' AFTER the reaper read it but BEFORE the atomic
|
||||
claim, the reaper's claim loses (rowcount==0) and it performs NO advance side
|
||||
effects (no dup advance / dup enqueue) — ADR-001 Р-1."""
|
||||
monkeypatch.setattr(db.settings, "reaper_finalize_grace_s", 0)
|
||||
tid = _make_task(stage="architecture")
|
||||
jid = _make_running_job(agent="developer", task_id=tid, exit_code=0,
|
||||
finished_age_s=10)
|
||||
import src.agents.launcher as L
|
||||
called = []
|
||||
monkeypatch.setattr(L.launcher, "_try_advance_stage",
|
||||
lambda run_id, agent, repo, branch: called.append(1))
|
||||
|
||||
# The read-only gate pre-eval reports green, but the row is concurrently
|
||||
# claimed by someone else right before the reaper's atomic claim runs.
|
||||
def green_then_steal(self, stage, job, branch, wid):
|
||||
db.requeue_running_jobs() # another actor wins the 'running' row first
|
||||
return True
|
||||
|
||||
monkeypatch.setattr(JobReaper, "_gate_is_green", green_then_steal)
|
||||
r = JobReaper()
|
||||
r.reap_once()
|
||||
# Reaper lost the atomic claim -> no advance, no double work. The row stays
|
||||
# where the winner left it ('queued'), not flipped to 'done' by the reaper.
|
||||
assert not called, "reaper that lost the claim must not advance/enqueue"
|
||||
assert get_job(jid)["status"] == "queued"
|
||||
assert r.reaped_total == 0
|
||||
|
||||
|
||||
# --- TC-06: atomicity — reaper vs requeue_running_jobs (status guard) --------
|
||||
def test_tc06_atomic_no_double_reap(monkeypatch):
|
||||
_dead_pid(monkeypatch)
|
||||
|
||||
Reference in New Issue
Block a user