# Commit note (ORCH-065):
# Closes the "zombie jobs" incident class: job status was set only inside the
# live launcher process, so a process death left jobs.status='running' forever;
# at max_concurrency=1 one zombie blocked ALL projects' queue (self-hosting
# risk). Adds a background daemon (src/job_reaper.py) with three-tier liveness
# (dead-pid streak / known exit_code / max-running backstop) whose only
# mutating write is an atomic terminal flip guarded by WHERE status='running'
# (no double-process). For exit0 the canonical QG is the source of truth via
# gate-driven advance, not "exit0". Also proactively reclaims stale merge-lease
# (dead pid OR TTL) via file delete only (no git ops), and makes merge
# finalization idempotent (pr_already_merged guard + up-to-date short-circuit
# on re-drive). New jobs.pid column via idempotent _ensure_column (no
# migration); pid stamped in launcher._spawn after Popen. Reaper start/stop in
# lifespan; "reaper" snapshot in GET /queue.
# Kill-switches: ORCH_REAPER_ENABLED, ORCH_REAPER_INTERVAL_S,
# ORCH_REAPER_DEAD_TICKS, ORCH_REAPER_MAX_RUNNING_S, ORCH_LEASE_RECLAIM_ENABLED.
# Invariants unchanged (AC-13): STAGE_TRANSITIONS, QG_CHECKS registry,
# check_branch_mergeable signature/behaviour, BUG-8 rollback, hook exit codes.
# Restart-safe, never-raise per unit of background work.
# Docs: docs/architecture/README.md, CHANGELOG.md, .env.example.
# Tests: tests/test_job_reaper.py, tests/test_merge_lease_reclaim.py,
# tests/test_merge_gate.py (TC-16), tests/test_merge_gate_race.py (TC-17),
# tests/test_queue.py, tests/test_config.py (TC-19/TC-20). 742 passed.
# Refs: ORCH-065
# Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
# File: 139 lines, 5.3 KiB, Python
"""ORCH-065: proactive stale/dead merge-lease reclaim (TC-10..TC-15).

Exercises merge_gate.reclaim_stale_lease / pid_alive directly with lease files
written into a tmp repos_dir. No git ops run (reclaim only removes the lease
file). pid liveness is monkeypatched so 'dead'/'alive' are deterministic.
"""
import json
import os
import tempfile
import time

import pytest

# The environment is populated BEFORE ``src`` is imported below. Presumably the
# project's settings object reads these variables at import time -- confirm
# against src before reordering these statements.
os.environ["ORCH_DB_PATH"] = os.path.join(tempfile.gettempdir(), "test_orch_lease.db")
os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir()
os.environ["ORCH_GITEA_TOKEN"] = "test-token"
os.environ["ORCH_PLANE_API_TOKEN"] = "test-token"

from src import merge_gate


@pytest.fixture
def repos_dir(tmp_path, monkeypatch):
    """Create an isolated ``repos`` directory and point merge_gate settings at it.

    The fixture patches ``merge_gate.settings`` so each test starts from the
    same baseline: proactive reclaim enabled, an empty ``merge_gate_repos``
    list, and a 300 second merge-lock timeout. Individual tests override
    single settings on top of this.

    Returns:
        The ``pathlib.Path`` of the freshly created ``repos`` directory.
    """
    d = tmp_path / "repos"
    d.mkdir()
    monkeypatch.setattr(merge_gate.settings, "repos_dir", str(d))
    monkeypatch.setattr(merge_gate.settings, "lease_reclaim_enabled", True)
    monkeypatch.setattr(merge_gate.settings, "merge_gate_repos", "")  # self-hosting only
    monkeypatch.setattr(merge_gate.settings, "merge_lock_timeout_s", 300)
    return d


def _write_lease(repos_dir, repo, branch="feature/x", pid=1234, age_s=0):
    """Write a merge-lease JSON file for *repo* and return its path.

    Args:
        repos_dir: Directory that will contain the lease file (str or Path).
        repo: Repository name; the file is named ``.merge-lease-<repo>.json``.
        branch: Branch recorded as the lease holder.
        pid: Process id recorded as the lease holder.
        age_s: How many seconds in the past ``acquired_at`` is placed, so a
            test can make the lease look older than the TTL.

    Returns:
        The path of the written lease file, as a string.
    """
    path = os.path.join(str(repos_dir), f".merge-lease-{repo}.json")
    holder = {
        "branch": branch,
        "work_item_id": "ORCH-1",
        "task_id": 1,
        # Back-date the acquisition time to simulate an aged lease.
        "acquired_at": time.time() - age_s,
        "pid": pid,
    }
    with open(path, "w", encoding="utf-8") as f:
        json.dump(holder, f)
    return path


def _no_telegram(monkeypatch):
    """Replace ``src.notifications.send_telegram`` with a no-op for this test.

    The stub accepts any arguments and returns ``None``, so code under test
    that notifies via Telegram sends nothing.
    """
    import src.notifications as notif

    monkeypatch.setattr(notif, "send_telegram", lambda *a, **k: None)


# --- TC-10: reclaim a lease with a DEAD pid, proactively --------------------
def test_tc10_reclaim_dead_pid(repos_dir, monkeypatch):
    """A fresh lease (age 0) whose holder pid is reported dead is reclaimed."""
    _no_telegram(monkeypatch)
    path = _write_lease(repos_dir, "orchestrator", pid=999999, age_s=0)
    # Force the liveness probe to report "dead" regardless of the real pid table.
    monkeypatch.setattr(merge_gate, "pid_alive", lambda pid: False)
    assert merge_gate.reclaim_stale_lease("orchestrator") is True
    assert not os.path.exists(path)  # lease removed


# --- TC-11: reclaim by TTL is preserved -------------------------------------
def test_tc11_reclaim_by_ttl(repos_dir, monkeypatch):
    """A lease older than the TTL is reclaimed even when its pid is alive.

    The fixture sets ``merge_lock_timeout_s`` to 300; the lease here is
    back-dated by 999 seconds.
    """
    _no_telegram(monkeypatch)
    # pid alive, but the lease is older than the TTL -> still reclaimed.
    path = _write_lease(repos_dir, "orchestrator", pid=4321, age_s=999)
    monkeypatch.setattr(merge_gate, "pid_alive", lambda pid: True)
    assert merge_gate.reclaim_stale_lease("orchestrator") is True
    assert not os.path.exists(path)


# --- TC-12: a LIVE lease within TTL is NOT released -------------------------
def test_tc12_live_lease_protected(repos_dir, monkeypatch):
    """A 10-second-old lease held by a live pid is left in place."""
    _no_telegram(monkeypatch)
    path = _write_lease(repos_dir, "orchestrator", pid=4321, age_s=10)
    monkeypatch.setattr(merge_gate, "pid_alive", lambda pid: True)
    assert merge_gate.reclaim_stale_lease("orchestrator") is False
    assert os.path.exists(path)  # untouched


# --- TC-13: conditional — non-self-hosting repos are a no-op ----------------
def test_tc13_non_scope_repo_noop(repos_dir, monkeypatch):
    """A repo outside the merge-gate scope is never reclaimed.

    The lease is both dead-pid and past the TTL, yet must stay on disk because
    the fixture leaves ``merge_gate_repos`` empty (self-hosting only).
    """
    _no_telegram(monkeypatch)
    path = _write_lease(repos_dir, "enduro-trails", pid=999999, age_s=999)
    monkeypatch.setattr(merge_gate, "pid_alive", lambda pid: False)
    assert merge_gate.reclaim_stale_lease("enduro-trails") is False
    assert os.path.exists(path)  # out of scope -> untouched


def test_tc13_merge_gate_repos_csv_scope(repos_dir, monkeypatch):
    """Listing a repo in ``merge_gate_repos`` brings it into reclaim scope."""
    _no_telegram(monkeypatch)
    # Opt the repo in explicitly; the fixture default is an empty list.
    monkeypatch.setattr(merge_gate.settings, "merge_gate_repos", "enduro-trails")
    path = _write_lease(repos_dir, "enduro-trails", pid=999999, age_s=0)
    monkeypatch.setattr(merge_gate, "pid_alive", lambda pid: False)
    assert merge_gate.reclaim_stale_lease("enduro-trails") is True
    assert not os.path.exists(path)


# --- TC-14: never-raise on a read/remove error ------------------------------
def test_tc14_never_raise_on_read_error(repos_dir, monkeypatch):
    """An ``OSError`` while reading the lease yields ``False``, not an exception."""
    _no_telegram(monkeypatch)
    _write_lease(repos_dir, "orchestrator", pid=1, age_s=999)

    def boom(path):
        raise OSError("simulated read failure")

    monkeypatch.setattr(merge_gate, "_read_lease", boom)
    # Must not raise; returns False (could not reclaim).
    assert merge_gate.reclaim_stale_lease("orchestrator") is False


def test_tc14_no_lease_file_is_noop(repos_dir, monkeypatch):
    """With no lease file on disk, reclaim reports ``False``."""
    _no_telegram(monkeypatch)
    assert merge_gate.reclaim_stale_lease("orchestrator") is False


# --- TC-15: kill-switch lease_reclaim_enabled=False -------------------------
def test_tc15_kill_switch(repos_dir, monkeypatch):
    """With ``lease_reclaim_enabled`` off, a dead and expired lease is kept."""
    _no_telegram(monkeypatch)
    monkeypatch.setattr(merge_gate.settings, "lease_reclaim_enabled", False)
    path = _write_lease(repos_dir, "orchestrator", pid=999999, age_s=999)
    monkeypatch.setattr(merge_gate, "pid_alive", lambda pid: False)
    assert merge_gate.reclaim_stale_lease("orchestrator") is False
    assert os.path.exists(path)  # proactive reclaim off -> untouched


# --- pid_alive semantics ----------------------------------------------------
def test_pid_alive_dead_process():
    """``pid_alive`` reports ``False`` for a pid that is not running."""
    # PID 999999999 almost certainly does not exist.
    assert merge_gate.pid_alive(999999999) is False


def test_pid_alive_self():
    """``pid_alive`` reports ``True`` for the test process's own pid."""
    assert merge_gate.pid_alive(os.getpid()) is True


def test_pid_alive_missing_pid_conservative():
    """A missing pid (``None`` or ``0``) is treated as alive.

    This is the conservative answer: a lease with no usable pid is not
    declared dead on liveness grounds.
    """
    assert merge_gate.pid_alive(None) is True
    assert merge_gate.pid_alive(0) is True