Closes the "zombie jobs" incident class: job status was set only inside the live launcher process, so a process death left jobs.status='running' forever; at max_concurrency=1 a single zombie blocked the queue for ALL projects (self-hosting risk). Adds a background daemon (src/job_reaper.py) with three-tier liveness (dead-pid streak / known exit_code / max-running backstop) whose only mutating write is an atomic terminal flip guarded by WHERE status='running' (no double-process). For exit0 the canonical QG is the source of truth via gate-driven advance, not "exit0". Also proactively reclaims a stale merge lease (dead pid OR TTL) via file delete only (no git ops), and makes merge finalization idempotent (pr_already_merged guard + up-to-date short-circuit on re-drive). New jobs.pid column via idempotent _ensure_column (no migration); pid stamped in launcher._spawn after Popen. Reaper start/stop in lifespan; "reaper" snapshot in GET /queue. Kill-switches: ORCH_REAPER_ENABLED, ORCH_REAPER_INTERVAL_S, ORCH_REAPER_DEAD_TICKS, ORCH_REAPER_MAX_RUNNING_S, ORCH_LEASE_RECLAIM_ENABLED. Invariants unchanged (AC-13): STAGE_TRANSITIONS, QG_CHECKS registry, check_branch_mergeable signature/behaviour, BUG-8 rollback, hook exit codes. Restart-safe; never-raise per unit of background work. Docs: docs/architecture/README.md, CHANGELOG.md, .env.example. Tests: tests/test_job_reaper.py, tests/test_merge_lease_reclaim.py, tests/test_merge_gate.py (TC-16), tests/test_merge_gate_race.py (TC-17), tests/test_queue.py, tests/test_config.py (TC-19/TC-20). 742 passed. Refs: ORCH-065. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
360 lines
13 KiB
Python
360 lines
13 KiB
Python
"""Tests for ORCH-1 (F-2b) persistent job queue.
|
|
|
|
Covers:
|
|
- enqueue_job -> claim_next_job -> mark_job lifecycle
|
|
- claim_next_job atomicity (no double-dispatch of the same job)
|
|
- retry: fail -> requeue while attempts < max_attempts, then failed
|
|
- requeue_running_jobs (queue-recovery)
|
|
- count_running_jobs / job_status_counts / recent_jobs
|
|
- QueueWorker respects max_concurrency (Popen / launch fully mocked)
|
|
|
|
The real claude/Popen is NEVER spawned: launcher.launch_job is mocked in worker
|
|
tests, and the launcher finalize logic is exercised directly via mark_job.
|
|
"""
|
|
import os
|
|
import tempfile
|
|
|
|
import pytest
|
|
|
|
# Set the test environment before any app module is imported (same convention
# as test_qg.py). All four overrides are applied in a single update call.
_test_db = os.path.join(tempfile.gettempdir(), "test_orchestrator_queue.db")
os.environ.update(
    {
        "ORCH_DB_PATH": _test_db,
        "ORCH_REPOS_DIR": tempfile.gettempdir(),
        "ORCH_GITEA_TOKEN": "test-token",
        "ORCH_PLANE_API_TOKEN": "test-token",
    }
)
|
|
|
|
import src.db as db
|
|
from src.db import (
|
|
init_db,
|
|
enqueue_job,
|
|
claim_next_job,
|
|
mark_job,
|
|
count_running_jobs,
|
|
requeue_running_jobs,
|
|
get_job,
|
|
job_status_counts,
|
|
recent_jobs,
|
|
)
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def fresh_db(tmp_path, monkeypatch):
    """Give each test its own sqlite file and create the schema in it.

    The file lives under pytest's per-test ``tmp_path``; ``db.settings.db_path``
    is redirected to it through ``monkeypatch`` (undone automatically after the
    test) before ``init_db()`` runs.
    """
    sqlite_path = str(tmp_path / "queue.db")
    monkeypatch.setattr(db.settings, "db_path", sqlite_path)
    init_db()
    yield
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# enqueue / claim / mark lifecycle
|
|
# ---------------------------------------------------------------------------
|
|
class TestLifecycle:
    """Single-job walk through enqueue, claim and terminal marking."""

    def test_enqueue_creates_queued_job(self):
        """enqueue_job stores the given fields on a row in the 'queued' state."""
        job_id = enqueue_job("analyst", "enduro-trails", "task body", task_id=7)
        row = get_job(job_id)
        expected_fields = {
            "status": "queued",
            "agent": "analyst",
            "repo": "enduro-trails",
            "task_content": "task body",
            "task_id": 7,
            "attempts": 0,  # nothing has claimed the job yet
            "max_attempts": 2,  # value expected when the caller passes none
        }
        for column, value in expected_fields.items():
            assert row[column] == value

    def test_claim_marks_running_and_increments_attempts(self):
        """Claiming flips the job to 'running' and counts the attempt."""
        job_id = enqueue_job("developer", "repo")
        running = claim_next_job()
        assert running is not None
        assert running["id"] == job_id
        assert (running["status"], running["attempts"]) == ("running", 1)
        assert count_running_jobs() == 1

    def test_claim_empty_queue_returns_none(self):
        """With nothing enqueued there is nothing to claim."""
        claimed = claim_next_job()
        assert claimed is None

    def test_claim_is_fifo(self):
        """Jobs are handed out in the order they were enqueued."""
        enqueued_ids = [enqueue_job("analyst", "r"), enqueue_job("developer", "r")]
        for expected_id in enqueued_ids:
            assert claim_next_job()["id"] == expected_id

    def test_mark_done(self):
        """mark_job(..., 'done') records run_id and finished_at and frees the slot."""
        job_id = enqueue_job("tester", "r")
        claim_next_job()
        mark_job(job_id, "done", run_id=42)
        row = get_job(job_id)
        assert row["status"] == "done"
        assert row["run_id"] == 42
        assert row["finished_at"] is not None
        assert count_running_jobs() == 0

    def test_mark_failed_records_error(self):
        """mark_job(..., 'failed') keeps the error text and stamps finished_at."""
        job_id = enqueue_job("tester", "r")
        claim_next_job()
        mark_job(job_id, "failed", run_id=9, error="boom")
        row = get_job(job_id)
        assert (row["status"], row["error"]) == ("failed", "boom")
        assert row["finished_at"] is not None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# claim atomicity — no double dispatch
|
|
# ---------------------------------------------------------------------------
|
|
class TestClaimAtomicity:
    """claim_next_job must never hand the same job out twice."""

    def test_single_job_claimed_once(self):
        """A second claim finds nothing once the only job is running."""
        job_id = enqueue_job("analyst", "r")
        winner = claim_next_job()
        loser = claim_next_job()
        assert winner["id"] == job_id
        assert loser is None  # already running, not re-dispatched

    def test_concurrent_claims_no_duplicate(self):
        """Eight threads drain twenty jobs; every job id shows up exactly once."""
        import threading

        total_jobs = 20
        for _ in range(total_jobs):
            enqueue_job("developer", "r")

        seen_ids = []
        seen_guard = threading.Lock()

        def drain_queue():
            # Keep claiming until the queue reports empty for this thread.
            job = claim_next_job()
            while job is not None:
                with seen_guard:
                    seen_ids.append(job["id"])
                job = claim_next_job()

        workers = [threading.Thread(target=drain_queue) for _ in range(8)]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        assert len(seen_ids) == total_jobs
        assert len(set(seen_ids)) == total_jobs  # no id claimed twice
        assert count_running_jobs() == total_jobs
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# retry semantics (mirrors launcher._finalize_job logic)
|
|
# ---------------------------------------------------------------------------
|
|
class TestRetry:
    """Retry semantics, mirroring the launcher._finalize_job logic."""

    def test_fail_requeues_while_under_max(self):
        """Marking a job back to 'queued' keeps the error and clears started_at."""
        job_id = enqueue_job("developer", "r", max_attempts=2)
        claimed = claim_next_job()
        assert claimed["attempts"] == 1
        # One attempt used out of two allowed, so the job goes back in the queue.
        mark_job(job_id, "queued", error="exit 1")
        row = get_job(job_id)
        assert row["status"] == "queued"
        assert row["error"] == "exit 1"
        assert row["started_at"] is None  # requeue clears started_at

    def test_fail_fails_when_max_reached(self):
        """After the second claim the attempt budget is spent and the job fails."""
        job_id = enqueue_job("developer", "r", max_attempts=2)
        claim_next_job()  # first attempt, then requeued
        mark_job(job_id, "queued")
        second_claim = claim_next_job()
        assert second_claim["attempts"] == 2
        # Two attempts used out of two allowed, so the job is marked failed.
        mark_job(job_id, "failed", error="exit 1")
        final_row = get_job(job_id)
        assert final_row["status"] == "failed"

    def test_finalize_job_done(self):
        """launcher._finalize_job marks done on exit_code 0 (no Popen needed)."""
        from src.agents.launcher import AgentLauncher

        job_id = enqueue_job("analyst", "r")
        claim_next_job()
        agent_launcher = AgentLauncher()
        agent_launcher._finalize_job(job_id, "analyst", run_id=5, exit_code=0)
        assert get_job(job_id)["status"] == "done"

    def test_finalize_job_requeue_then_fail(self, monkeypatch):
        """A non-zero exit requeues on attempt 1 and fails on attempt 2."""
        from src.agents.launcher import AgentLauncher

        # Silence telegram side-effect.
        monkeypatch.setattr("src.notifications.send_telegram", lambda *a, **k: None)
        agent_launcher = AgentLauncher()
        job_id = enqueue_job("developer", "r", max_attempts=2)

        # Each pass claims the job (bumping attempts) and finalizes it with a
        # failing exit code: 1 < 2 -> requeue, then 2 >= 2 -> failed.
        for run_id, expected_status in ((1, "queued"), (2, "failed")):
            claim_next_job()
            agent_launcher._finalize_job(job_id, "developer", run_id=run_id, exit_code=2)
            assert get_job(job_id)["status"] == expected_status
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# queue-recovery
|
|
# ---------------------------------------------------------------------------
|
|
class TestRequeueRunning:
    """Queue recovery: requeue_running_jobs puts 'running' rows back to 'queued'."""

    def test_requeue_running_jobs(self):
        """Both running jobs are requeued and the returned count matches."""
        job_ids = [enqueue_job("analyst", "r"), enqueue_job("developer", "r")]
        for _ in job_ids:
            claim_next_job()  # each claim moves one job to 'running'
        assert count_running_jobs() == 2

        requeued = requeue_running_jobs()

        assert requeued == 2
        assert count_running_jobs() == 0
        for job_id in job_ids:
            assert get_job(job_id)["status"] == "queued"

    def test_requeue_preserves_attempts(self):
        """Recovery does not reset the attempts counter."""
        job_id = enqueue_job("analyst", "r")
        claim_next_job()  # attempts=1
        requeue_running_jobs()
        row = get_job(job_id)
        assert row["attempts"] == 1  # not reset
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# observability helpers
|
|
# ---------------------------------------------------------------------------
|
|
class TestObservability:
    """Read-only helpers used for queue observability."""

    def test_status_counts(self):
        """job_status_counts reports one running and one queued job, zero terminal.

        Claims are FIFO, so the job enqueued first is the one that becomes
        'running'; the claim result is checked so the per-status counts below
        are tied to a known job.
        """
        first = enqueue_job("analyst", "r")  # oldest job -> claimed below (FIFO)
        enqueue_job("developer", "r")  # enqueued second -> stays queued
        claimed = claim_next_job()
        assert claimed is not None and claimed["id"] == first

        counts = job_status_counts()
        assert counts["running"] == 1
        assert counts["queued"] == 1
        assert counts["done"] == 0
        assert counts["failed"] == 0

    def test_recent_jobs_desc(self):
        """recent_jobs returns rows ordered by id, newest first."""
        ids = [enqueue_job("analyst", "r") for _ in range(3)]
        recent = recent_jobs(10)
        assert [r["id"] for r in recent] == sorted(ids, reverse=True)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# QueueWorker max_concurrency (launch_job fully mocked — no real Popen)
|
|
# ---------------------------------------------------------------------------
|
|
class TestWorkerConcurrency:
    """QueueWorker slot accounting with launch_job replaced by in-process fakes."""

    @pytest.fixture(autouse=True)
    def _ok_preflight(self, monkeypatch):
        # ORCH-1 resilience: the worker gates claims behind preflight, and the
        # test environment has no claude binary. Forcing preflight to report OK
        # leaves only the queue/concurrency logic under test.
        def always_ok(*a, **k):
            return (True, "ok")

        monkeypatch.setattr("src.queue_worker.preflight.check", always_ok)

    def test_worker_respects_max_concurrency(self, monkeypatch):
        """One drain pass launches no more jobs than max_concurrency allows."""
        from src.queue_worker import QueueWorker

        started_ids = []

        def fake_launch_job(job):
            # Stands in for a long-running agent: the job is never marked done,
            # so it stays 'running' and keeps its slot occupied.
            started_ids.append(job["id"])
            return 100 + job["id"]

        monkeypatch.setattr("src.queue_worker.launcher.launch_job", fake_launch_job)

        for _ in range(5):
            enqueue_job("developer", "r")

        worker = QueueWorker(max_concurrency=2, poll_interval=0.01)
        worker._drain_once()

        # Only max_concurrency jobs may be launched / running at once.
        assert len(started_ids) == 2
        assert count_running_jobs() == 2

    def test_worker_drains_as_slots_free(self, monkeypatch):
        """Jobs that finish instantly free the slot, so one pass drains all four."""
        from src.queue_worker import QueueWorker

        def fake_launch_job(job):
            # Complete the job right away so the slot is free for the next claim.
            job_id = job["id"]
            mark_job(job_id, "done", run_id=job_id)
            return job_id

        monkeypatch.setattr("src.queue_worker.launcher.launch_job", fake_launch_job)

        for _ in range(4):
            enqueue_job("analyst", "r")

        worker = QueueWorker(max_concurrency=1, poll_interval=0.01)
        worker._drain_once()

        # With instant completion and concurrency 1, one drain pass empties the queue.
        status_counts = job_status_counts()
        assert status_counts["done"] == 4
        assert count_running_jobs() == 0

    def test_worker_launch_failure_does_not_wedge_slot(self, monkeypatch):
        """A launch_job that raises must not leave the job stuck in 'running'."""
        from src.queue_worker import QueueWorker

        def boom(job):
            raise RuntimeError("repo missing")

        monkeypatch.setattr("src.queue_worker.launcher.launch_job", boom)
        monkeypatch.setattr("src.notifications.send_telegram", lambda *a, **k: None)

        enqueue_job("developer", "r", max_attempts=1)
        worker = QueueWorker(max_concurrency=1, poll_interval=0.01)
        worker._drain_once()

        # attempts=1 >= max_attempts=1 -> failed, not stuck running.
        assert count_running_jobs() == 0
        status_counts = job_status_counts()
        assert status_counts["failed"] == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ORCH-065: job-reaper unblocks the shared queue (TC-09) + /queue block (TC-18)
|
|
# ---------------------------------------------------------------------------
|
|
class TestReaperUnblocksQueue:
    """ORCH-065: the job reaper frees a blocked queue (TC-09) and is visible in /queue (TC-18)."""

    def test_tc09_reap_unblocks_claim_at_concurrency_1(self, monkeypatch):
        """A zombie 'running' row at max_concurrency=1 blocks every claim; once the
        reaper reaps it the next queued job can be claimed (AC-2)."""
        import src.merge_gate as mg
        from src.job_reaper import JobReaper

        # reaper_dead_ticks=1: presumably a single dead-pid observation is enough
        # to reap -- the test relies on one reap_once() call doing the job.
        monkeypatch.setattr(db.settings, "reaper_dead_ticks", 1)
        monkeypatch.setattr(mg, "pid_alive", lambda pid: False)  # zombie pid dead
        monkeypatch.setattr("src.notifications.send_telegram", lambda *a, **k: None)

        # A zombie row stuck 'running' with a dead pid. The connection is closed
        # in a finally block so a failing INSERT/commit cannot leak it.
        conn = db.get_db()
        try:
            cur = conn.execute(
                "INSERT INTO jobs (agent, repo, status, attempts, max_attempts, pid, "
                "started_at) VALUES ('developer','r','running',2,2,999999,datetime('now'))"
            )
            zombie = cur.lastrowid
            conn.commit()
        finally:
            conn.close()

        # A second job waits in the queue behind it.
        nxt = enqueue_job("analyst", "r")

        # At concurrency 1 the slot is fully occupied -> nothing else can run.
        assert count_running_jobs() == 1

        JobReaper().reap_once()  # dead pid, attempts>=max -> failed

        assert get_job(zombie)["status"] == "failed"
        assert count_running_jobs() == 0
        # Queue is unblocked: the next job claims successfully.
        claimed = claim_next_job()
        assert claimed is not None and claimed["id"] == nxt

    def test_tc18_queue_endpoint_has_reaper_block(self):
        """GET /queue exposes the reaper observability block (AC-15).

        Calls the endpoint coroutine directly (no lifespan / no background
        threads / no network) so the test stays hermetic.
        """
        import asyncio
        import src.main as main

        body = asyncio.run(main.queue())
        assert "reaper" in body
        reaper = body["reaper"]

        expected_keys = (
            "enabled",
            "interval",
            "last_run_ts",
            "reaped_total",
            "last_reaped",
            "lease_reclaimed_total",
        )
        # Collect every absent key so one failure reports all of them.
        missing = [key for key in expected_keys if key not in reaper]
        assert not missing, f"reaper block is missing keys: {missing}"
|