test(launcher): watchdog graceful kill ordering + timeout config + M-4 removal
Cover M-2: SIGTERM-before-SIGKILL ordering, graceful exit within grace skips SIGKILL, ProcessLookupError before SIGTERM is tolerated (no _record_kill), and _resolve_timeout per-agent override / default / malformed-JSON fallback. Cover M-4: _auto_merge_pr removed, _ensure_pr retained.
This commit is contained in:
@@ -7,6 +7,7 @@ Covers the audit-2026-06-02 fixes:
|
||||
the YAML frontmatter only (no fragile substring matching).
|
||||
"""
|
||||
import os
|
||||
import signal
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
@@ -20,6 +21,7 @@ os.environ["ORCH_PLANE_API_TOKEN"] = "test-token"
|
||||
|
||||
from src.agents.launcher import AgentLauncher
|
||||
from src.qg.checks import check_reviewer_verdict
|
||||
from src.config import settings
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -138,3 +140,141 @@ class TestCheckReviewerVerdict:
|
||||
passed, reason = check_reviewer_verdict("enduro-trails", "ET-999")
|
||||
assert passed is False
|
||||
assert "not found" in reason.lower()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ORCH-7 (M-4): dead code removed
|
||||
# ---------------------------------------------------------------------------
|
||||
class TestDeadCodeRemoved:
|
||||
"""M-4: _auto_merge_pr was never called (merge is the deployer's job) and is
|
||||
removed. _ensure_pr (used by the auto-advance path) must stay."""
|
||||
|
||||
def test_auto_merge_pr_is_gone(self):
|
||||
assert not hasattr(AgentLauncher, "_auto_merge_pr")
|
||||
|
||||
def test_ensure_pr_still_present(self):
|
||||
assert hasattr(AgentLauncher, "_ensure_pr")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ORCH-7 (M-2): configurable timeout + per-agent override
|
||||
# ---------------------------------------------------------------------------
|
||||
class TestResolveTimeout:
|
||||
"""M-2: _resolve_timeout honours a per-agent JSON override, else the default."""
|
||||
|
||||
def test_default_when_no_override(self, monkeypatch):
|
||||
monkeypatch.setattr(settings, "agent_timeout_seconds", 1800)
|
||||
monkeypatch.setattr(settings, "agent_timeout_overrides_json", "")
|
||||
assert AgentLauncher._resolve_timeout("developer") == 1800
|
||||
assert AgentLauncher._resolve_timeout(None) == 1800
|
||||
|
||||
def test_override_for_specific_agent(self, monkeypatch):
|
||||
monkeypatch.setattr(settings, "agent_timeout_seconds", 1800)
|
||||
monkeypatch.setattr(
|
||||
settings, "agent_timeout_overrides_json", '{"reviewer": 3600, "architect": 2700}'
|
||||
)
|
||||
assert AgentLauncher._resolve_timeout("reviewer") == 3600
|
||||
assert AgentLauncher._resolve_timeout("architect") == 2700
|
||||
# an agent not in the override map falls back to the default
|
||||
assert AgentLauncher._resolve_timeout("developer") == 1800
|
||||
|
||||
def test_malformed_override_falls_back_to_default(self, monkeypatch):
|
||||
monkeypatch.setattr(settings, "agent_timeout_seconds", 1800)
|
||||
monkeypatch.setattr(settings, "agent_timeout_overrides_json", "{not-json")
|
||||
# must not raise, must return the default
|
||||
assert AgentLauncher._resolve_timeout("reviewer") == 1800
|
||||
|
||||
|
||||
class TestWatchdogGracefulKill:
|
||||
"""M-2: SIGTERM -> grace -> SIGKILL ordering, with graceful-exit short-circuit
|
||||
and ProcessLookupError tolerance. The OS process is fully faked: we record the
|
||||
signals sent and decide liveness from a script, so no real process is touched."""
|
||||
|
||||
def _patch_db(self, monkeypatch):
|
||||
"""Stub get_db so _record_kill does not need a real DB."""
|
||||
class _Conn:
|
||||
def execute(self, *a, **k):
|
||||
return self
|
||||
def commit(self):
|
||||
pass
|
||||
def close(self):
|
||||
pass
|
||||
monkeypatch.setattr("src.agents.launcher.get_db", lambda: _Conn())
|
||||
|
||||
def test_sigterm_then_sigkill_after_grace(self, monkeypatch):
|
||||
"""Process stays alive through the whole grace window -> SIGTERM then SIGKILL."""
|
||||
self._patch_db(monkeypatch)
|
||||
monkeypatch.setattr(settings, "agent_kill_grace_seconds", 1)
|
||||
monkeypatch.setattr("src.agents.launcher.time.sleep", lambda s: None)
|
||||
|
||||
sent = []
|
||||
|
||||
def fake_kill(pid, sig):
|
||||
sent.append(sig)
|
||||
# signal 0 (liveness probe) -> always alive; never raise
|
||||
return None
|
||||
|
||||
monkeypatch.setattr("src.agents.launcher.os.kill", fake_kill)
|
||||
|
||||
launcher = AgentLauncher()
|
||||
launcher._watchdog(pid=4242, run_id=1, timeout=0, agent="developer")
|
||||
|
||||
assert signal.SIGTERM in sent
|
||||
assert signal.SIGKILL in sent
|
||||
# SIGTERM must come before SIGKILL
|
||||
assert sent.index(signal.SIGTERM) < sent.index(signal.SIGKILL)
|
||||
|
||||
def test_graceful_exit_in_grace_skips_sigkill(self, monkeypatch):
|
||||
"""Process dies during the grace window -> SIGKILL is NOT sent."""
|
||||
self._patch_db(monkeypatch)
|
||||
monkeypatch.setattr(settings, "agent_kill_grace_seconds", 5)
|
||||
monkeypatch.setattr("src.agents.launcher.time.sleep", lambda s: None)
|
||||
|
||||
sent = []
|
||||
state = {"alive": True, "probes": 0}
|
||||
|
||||
def fake_kill(pid, sig):
|
||||
if sig == 0:
|
||||
state["probes"] += 1
|
||||
# die on the 2nd liveness probe (within grace)
|
||||
if state["probes"] >= 2:
|
||||
raise ProcessLookupError
|
||||
return None
|
||||
sent.append(sig)
|
||||
return None
|
||||
|
||||
monkeypatch.setattr("src.agents.launcher.os.kill", fake_kill)
|
||||
|
||||
launcher = AgentLauncher()
|
||||
launcher._watchdog(pid=4242, run_id=2, timeout=0, agent="developer")
|
||||
|
||||
assert signal.SIGTERM in sent
|
||||
assert signal.SIGKILL not in sent
|
||||
|
||||
def test_already_dead_before_sigterm(self, monkeypatch):
|
||||
"""Process already gone at SIGTERM -> ProcessLookupError tolerated, no SIGKILL,
|
||||
and _record_kill is NOT called (the monitor's proc.wait owns the exit)."""
|
||||
self._patch_db(monkeypatch)
|
||||
monkeypatch.setattr("src.agents.launcher.time.sleep", lambda s: None)
|
||||
|
||||
sent = []
|
||||
|
||||
def fake_kill(pid, sig):
|
||||
if sig == signal.SIGTERM:
|
||||
raise ProcessLookupError
|
||||
sent.append(sig)
|
||||
return None
|
||||
|
||||
recorded = {"called": False}
|
||||
monkeypatch.setattr(
|
||||
AgentLauncher, "_record_kill",
|
||||
staticmethod(lambda rid: recorded.__setitem__("called", True)),
|
||||
)
|
||||
monkeypatch.setattr("src.agents.launcher.os.kill", fake_kill)
|
||||
|
||||
launcher = AgentLauncher()
|
||||
# must not raise
|
||||
launcher._watchdog(pid=4242, run_id=3, timeout=0, agent="developer")
|
||||
|
||||
assert signal.SIGKILL not in sent
|
||||
assert recorded["called"] is False
|
||||
|
||||
Reference in New Issue
Block a user