orchestrator/tests/test_orch126_queued_stale_run.py

"""ORCH-126: run-ownership hygiene of the `jobs` row — invariant
``status='queued' ⇒ run_id IS NULL AND pid IS NULL AND started_at IS NULL``.

Covers FR-1…FR-5 / AC-1…AC-8 (TC-01..TC-10, see 04-test-plan.yaml). The defect:
no path that returns a job to ``queued`` reset its run-ownership (``run_id`` /
``pid``), so a requeued/restart-recovered job carried a stale (and possibly
OS-reused) pid. The job-reaper (ORCH-065) judges Tier-1 liveness by ``jobs.pid``,
so a reused pid made it guard a phantom ``running`` forever — at ``max_concurrency=1``
this wedged the claim of EVERY project's queue (incident: job 2286 ``queued +
run_id=759/760 + pid=35/42 + started_at=NULL``).

No network / no Claude CLI / no real Popen: jobs are seeded directly in an
isolated temp SQLite DB (the convention of test_orch114_transition_ownership.py).
TC-01 is the MANDATORY regression (red before the fix, green after).
"""
import os
import tempfile

import pytest

os.environ.setdefault("ORCH_REPOS_DIR", tempfile.gettempdir())
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")

import src.db as db
from src.db import (
    init_db,
    get_db,
    get_job,
    claim_next_job,
    mark_job,
    mark_job_transient,
    reap_running_job,
    requeue_running_jobs,
    find_impossible_queued_jobs,
    sanitize_impossible_queued,
)
from src.job_reaper import JobReaper

_REPO = "orchestrator"


@pytest.fixture(autouse=True)
def fresh_db(tmp_path, monkeypatch):
    dbfile = tmp_path / "orch126.db"
    monkeypatch.setattr(db.settings, "db_path", str(dbfile))
    init_db()
    # Keep the scheduler gates out of the way of the claim tests (the bug is
    # independent of serial-gate / dep-gate semantics — BRD §2 out-of-scope).
    monkeypatch.setattr(db.settings, "serial_gate_enabled", False, raising=False)
    monkeypatch.setattr(db.settings, "task_deps_enabled", False, raising=False)
    yield


def _seed_job(
    *,
    status="queued",
    run_id=None,
    pid=None,
    started_at=None,
    finished_at=None,
    agent="developer",
    repo=_REPO,
    attempts=0,
    max_attempts=2,
    transient_attempts=0,
):
    """Insert a job row with arbitrary (possibly invalid) run-ownership columns."""
    conn = get_db()
    cur = conn.execute(
        "INSERT INTO jobs (agent, repo, status, run_id, pid, started_at, "
        "finished_at, attempts, max_attempts, transient_attempts) "
        "VALUES (?,?,?,?,?,?,?,?,?,?)",
        (agent, repo, status, run_id, pid, started_at, finished_at,
         attempts, max_attempts, transient_attempts),
    )
    jid = cur.lastrowid
    conn.commit()
    conn.close()
    return jid


def _row(job_id):
    return get_job(job_id)


# --------------------------------------------------------------------------- #
# TC-01 — MANDATORY regression (red -> green): restart-recovery clears ownership
# --------------------------------------------------------------------------- #
def test_tc01_requeue_running_clears_stale_ownership():
    """A job left 'running' with stale run_id+pid+started_at (the incident state)
    is restored to a CLEAN queued by requeue_running_jobs() — run_id/pid/started_at
    all NULL. Red on the code BEFORE the fix (run_id/pid kept), green after (AC-1/AC-8).
    """
    jid = _seed_job(
        status="running", run_id=759, pid=35, started_at="2026-06-17 10:00:00",
    )
    n = requeue_running_jobs()
    assert n == 1
    r = _row(jid)
    assert r["status"] == "queued"
    assert r["run_id"] is None
    assert r["pid"] is None
    assert r["started_at"] is None


# --------------------------------------------------------------------------- #
# TC-02 — mark_job(..., 'queued') resets run_id and pid
# --------------------------------------------------------------------------- #
def test_tc02_mark_job_queued_resets_ownership():
    jid = _seed_job(
        status="running", run_id=760, pid=42, started_at="2026-06-17 10:00:00",
    )
    # A real caller (launcher._finalize_permanent) requeues WITH the old run_id —
    # the invariant must win and drop it regardless.
    mark_job(jid, "queued", run_id=760, error="exit_code=1")
    r = _row(jid)
    assert r["status"] == "queued"
    assert r["run_id"] is None
    assert r["pid"] is None
    assert r["started_at"] is None
    assert r["finished_at"] is None
    assert r["error"] == "exit_code=1"


# --------------------------------------------------------------------------- #
# TC-03 — mark_job_transient resets ownership but keeps transient bookkeeping
# --------------------------------------------------------------------------- #
def test_tc03_mark_job_transient_resets_ownership_keeps_backoff():
    jid = _seed_job(
        status="running", run_id=761, pid=50, started_at="2026-06-17 10:00:00",
        transient_attempts=1,
    )
    mark_job_transient(jid, 30, error="overloaded")
    r = _row(jid)
    assert r["status"] == "queued"
    assert r["run_id"] is None
    assert r["pid"] is None
    assert r["started_at"] is None
    # Transient bookkeeping preserved.
    assert r["transient_attempts"] == 2
    assert r["available_at"] is not None
    assert r["error"] == "overloaded"


# --------------------------------------------------------------------------- #
# TC-04 — reap_running_job('queued') resets ownership; atomic guard preserved
# --------------------------------------------------------------------------- #
def test_tc04_reap_running_job_queued_resets_and_guard_holds():
    jid = _seed_job(
        status="running", run_id=762, pid=60, started_at="2026-06-17 10:00:00",
    )
    won = reap_running_job(jid, "queued", run_id=762, error="dead pid")
    assert won is True
    r = _row(jid)
    assert r["status"] == "queued"
    assert r["run_id"] is None
    assert r["pid"] is None
    assert r["started_at"] is None
    # Atomic WHERE status='running' guard: a second call on the now-queued row
    # must lose (rowcount 0) — restart-safe / race-safe (TR-4).
    won2 = reap_running_job(jid, "queued", error="again")
    assert won2 is False


# --------------------------------------------------------------------------- #
# TC-05 — claim_next_job leaves no stale pid (defense-in-depth, AC-3)
# --------------------------------------------------------------------------- #
def test_tc05_claim_clears_stale_pid_before_spawn():
    """A queued job carrying a stale pid/run_id (impossible state) is claimed into
    'running' with pid/run_id reset to NULL — BEFORE the launcher stamps the real
    pid in _spawn. Red before the fix (claim left the stale pid), green after."""
    jid = _seed_job(status="queued", run_id=900, pid=999999)
    claimed = claim_next_job()
    assert claimed is not None
    assert claimed["id"] == jid
    assert claimed["status"] == "running"
    assert claimed["pid"] is None
    assert claimed["run_id"] is None
    r = _row(jid)
    assert r["status"] == "running"
    assert r["pid"] is None
    assert r["run_id"] is None
    assert r["started_at"] is not None  # claim still stamps started_at


# --------------------------------------------------------------------------- #
# TC-06 — claim works (no starvation) with serial-gate disabled (AC-2)
# --------------------------------------------------------------------------- #
def test_tc06_claim_starts_with_serial_gate_off(monkeypatch):
    monkeypatch.setattr(db.settings, "serial_gate_enabled", False, raising=False)
    jid = _seed_job(status="queued", agent="analyst")
    claimed = claim_next_job()
    assert claimed is not None and claimed["id"] == jid
    assert claimed["status"] == "running"
    # Queue did not hang: nothing left queued, exactly one running.
    counts = db.job_status_counts()
    assert counts["queued"] == 0
    assert counts["running"] == 1


# --------------------------------------------------------------------------- #
# TC-07 — reaper does not reap a freshly-claimed running job with pid IS NULL (AC-4)
# --------------------------------------------------------------------------- #
def test_tc07_reaper_skips_pid_null_fresh_running(monkeypatch):
    monkeypatch.setattr(db.settings, "lease_reclaim_enabled", False, raising=False)
    monkeypatch.setattr(db.settings, "reaper_enabled", True, raising=False)
    # Fresh running job: pid NULL (claim reset it, _spawn not yet stamped),
    # started_at = now -> small age, no agent_runs row -> exit_code NULL.
    jid = _seed_job(status="running", pid=None, run_id=None,
                    started_at=None)
    conn = get_db()
    conn.execute("UPDATE jobs SET started_at = datetime('now') WHERE id=?", (jid,))
    conn.commit()
    conn.close()
    reaper = JobReaper()
    reaper.reap_once()
    r = _row(jid)
    # Tier-1 skips pid IS NULL; Tier-3 backstop not reached -> still running.
    assert r["status"] == "running"


# --------------------------------------------------------------------------- #
# TC-08 — detect + self-heal the impossible queued state (idempotent, AC-5)
# --------------------------------------------------------------------------- #
def test_tc08_sanitize_impossible_queued_idempotent():
    jid = _seed_job(
        status="queued", run_id=759, pid=35, started_at="2026-06-17 10:00:00",
    )
    # Detection sees the anomaly.
    found = find_impossible_queued_jobs()
    assert any(f["id"] == jid for f in found)
    # First sanitize heals it and reports it.
    healed = sanitize_impossible_queued()
    assert len(healed) == 1 and healed[0]["id"] == jid
    r = _row(jid)
    assert r["status"] == "queued"
    assert r["run_id"] is None and r["pid"] is None and r["started_at"] is None
    # Idempotent: nothing left to heal on a clean DB.
    assert find_impossible_queued_jobs() == []
    assert sanitize_impossible_queued() == []


def test_tc08b_reaper_sanitize_counter_and_status(monkeypatch):
    """The reaper's sanitize pass bumps an observability counter exposed in /queue
    and never raises; gated by the kill-switch."""
    monkeypatch.setattr(db.settings, "lease_reclaim_enabled", False, raising=False)
    monkeypatch.setattr(
        db.settings, "impossible_queued_sanitize_enabled", True, raising=False
    )
    _seed_job(status="queued", run_id=10, pid=11, started_at="2026-06-17 10:00:00")
    reaper = JobReaper()
    n = reaper.sanitize_impossible_queued_once()
    assert n == 1
    assert reaper.impossible_queued_total == 1
    st = reaper.status()
    assert st["impossible_queued_total"] == 1
    assert st["last_impossible_queued"]["count"] == 1
    # Kill-switch off -> no-op even with an anomaly present.
    _seed_job(status="queued", run_id=12, pid=13, started_at="2026-06-17 10:00:00")
    monkeypatch.setattr(
        db.settings, "impossible_queued_sanitize_enabled", False, raising=False
    )
    assert reaper.sanitize_impossible_queued_once() == 0


# --------------------------------------------------------------------------- #
# TC-09 — _spawn window: a launch failure before the pid stamp requeues cleanly
# --------------------------------------------------------------------------- #
def test_tc09_spawn_failure_requeues_clean_and_reclaimable(monkeypatch):
    """Simulate the queue worker's launch-failure handler: claim flips queued->running
    (pid reset to NULL by D2), launch raises before stamping pid, the handler requeues
    via mark_job('queued') (D1) -> the row is clean and immediately re-claimable (AC-6).
    """
    jid = _seed_job(status="queued", run_id=900, pid=999999, attempts=0, max_attempts=2)
    claimed = claim_next_job()
    assert claimed["id"] == jid and claimed["pid"] is None  # D2 already cleaned

    # _spawn fails before stamping pid -> the queue worker marks it back to queued.
    mark_job(jid, "queued", error="launch error: ensure_worktree failed")
    r = _row(jid)
    assert r["status"] == "queued"
    assert r["pid"] is None and r["run_id"] is None and r["started_at"] is None

    # Re-claim starts normally (no "partially started" wedge).
    again = claim_next_job()
    assert again is not None and again["id"] == jid
    assert again["status"] == "running" and again["pid"] is None


# --------------------------------------------------------------------------- #
# TC-10 — anti-regression: a healthy job's terminal outcomes are untouched
# --------------------------------------------------------------------------- #
def test_tc10_healthy_job_terminal_outcomes_unchanged():
    # done: run_id link kept, finished_at stamped, NO ownership reset.
    jid = _seed_job(status="running", run_id=500, pid=12345,
                    started_at="2026-06-17 10:00:00")
    mark_job(jid, "done", run_id=500)
    r = _row(jid)
    assert r["status"] == "done"
    assert r["run_id"] == 500          # link preserved for done
    assert r["finished_at"] is not None
    assert r["started_at"] == "2026-06-17 10:00:00"  # not cleared for terminal

    # failed: same — run_id kept, finished_at stamped.
    jid2 = _seed_job(status="running", run_id=501, pid=222,
                     started_at="2026-06-17 10:00:00")
    mark_job(jid2, "failed", run_id=501, error="boom")
    r2 = _row(jid2)
    assert r2["status"] == "failed"
    assert r2["run_id"] == 501
    assert r2["finished_at"] is not None
    assert r2["error"] == "boom"

    # A never-started healthy queued job is NOT flagged as impossible.
    healthy = _seed_job(status="queued")
    assert all(f["id"] != healthy for f in find_impossible_queued_jobs())