Files
orchestrator/src/preflight.py
Dev Agent c23f000c05 fix(preflight): check the binary the launcher actually spawns (ORCH-1)
Container ORCH_CLAUDE_BIN pointed at a non-existent /usr/bin/claude while the
launcher spawns the hardcoded /opt/claude-code/bin/claude.exe. Preflight now
follows AgentLauncher.CLAUDE_BIN (the genuinely executed path), so it no longer
falsely blocks every job in production.
2026-06-03 00:13:44 +03:00

107 lines
3.6 KiB
Python

"""ORCH-1 resilience: cheap preflight check (CLI / network available?).
Goal: before the worker claims a job, confirm the claude CLI binary and runtime
are reachable WITHOUT spending any tokens. We only do local/cheap checks:
1. os.path.exists(CLAUDE_BIN) -- instant
2. `claude --version` (timeout 5s) -- spawns CLI, does NOT call the API
The result is cached for `preflight_cache_ttl` seconds so we do not re-run
`claude --version` on every worker tick.
🚫 We deliberately do NOT do a prompt ping (ping->pong) — that would burn the
rate limit and add latency. Preflight is local-only.
"""
import os
import time
import logging
import subprocess
from .config import settings
# Module logger; check() emits a warning through it whenever preflight fails.
logger = logging.getLogger("orchestrator.preflight")
# Seconds `claude --version` may run before _run_version() reports a timeout.
_VERSION_TIMEOUT = 5
class _PreflightCache:
def __init__(self):
self.ts: float = 0.0
self.ok: bool = False
self.reason: str = "not checked yet"
# Module-level cache instance read and written by check() and reset_cache().
_cache = _PreflightCache()
def _claude_bin() -> str:
    """Resolve the path of the claude binary that preflight should check.

    Preflight must look at the same binary the launcher spawns, so
    ``AgentLauncher.CLAUDE_BIN`` wins whenever it is set -- even if that path
    does not exist on disk. A missing launcher path is returned as-is on
    purpose: the caller then reports it as "not found" instead of silently
    validating some other binary that would never be executed.

    Only when the launcher value is unavailable (import failure, attribute
    missing or empty) do we fall back to ``settings.claude_bin`` and finally
    to the built-in default path. (Note: the container's ORCH_CLAUDE_BIN may
    point elsewhere; preflight follows the launcher path, not that override.)

    Returns:
        The path string to check; never empty.
    """
    try:
        # Imported at call time rather than module import time
        # (presumably to avoid an import cycle -- TODO confirm).
        from .agents.launcher import AgentLauncher

        launcher_bin = getattr(AgentLauncher, "CLAUDE_BIN", None)
    except Exception:
        # Best effort: any failure to obtain the launcher value just means we
        # use the configured/default path below.
        launcher_bin = None

    # Single fallback chain: launcher path -> configured path -> default.
    return (
        launcher_bin
        or getattr(settings, "claude_bin", None)
        or "/opt/claude-code/bin/claude.exe"
    )
def _run_version(bin_path: str) -> tuple[bool, str]:
    """Spawn ``<bin_path> --version`` to prove the CLI starts locally.

    The command is intended as a purely local check (no API call, per the
    module's design notes).

    Args:
        bin_path: path of the executable to invoke.

    Returns:
        ``(True, banner)`` on exit code 0, where ``banner`` is at most 120
        characters of stdout (stderr if stdout is empty) or ``"ok"`` when
        both are blank. Otherwise ``(False, reason)`` describing the non-zero
        exit, the timeout, the missing binary, or any other error.
    """
    command = [bin_path, "--version"]
    try:
        proc = subprocess.run(
            command,
            timeout=_VERSION_TIMEOUT,
            text=True,
            capture_output=True,
        )
        if proc.returncode != 0:
            # Prefer stderr for the failure detail, falling back to stdout.
            detail = (proc.stderr or proc.stdout).strip()[:120]
            return False, f"--version exit {proc.returncode}: {detail}"
        banner = (proc.stdout or proc.stderr or "").strip()[:120]
        return True, banner or "ok"
    except subprocess.TimeoutExpired:
        return False, f"--version timed out after {_VERSION_TIMEOUT}s"
    except FileNotFoundError:
        return False, "claude binary not found (FileNotFoundError)"
    except Exception as exc:  # pragma: no cover - defensive
        return False, f"--version error: {exc}"
def _compute() -> tuple[bool, str]:
    """Run the uncached preflight: path existence first, then ``--version``.

    Returns:
        ``(False, "CLAUDE_BIN not found: <path>")`` when the resolved binary
        path does not exist; otherwise whatever ``_run_version`` reports for
        that path.
    """
    candidate = _claude_bin()
    if os.path.exists(candidate):
        # Binary is present on disk; confirm it actually runs.
        return _run_version(candidate)
    return False, f"CLAUDE_BIN not found: {candidate}"
def check(force: bool = False) -> tuple[bool, str]:
    """Return ``(ok, reason)`` for the preflight check, cached by TTL.

    A stored verdict is reused while it is younger than
    ``settings.preflight_cache_ttl`` seconds; otherwise the check is re-run
    and the cache is refreshed. Failures are logged at WARNING level each
    time they are (re)computed.

    Args:
        force: when True, bypass the cache and recompute (used by the breaker
            half-open probe / tests).

    Returns:
        ``(ok, reason)`` -- ``ok`` is True when preflight passed; ``reason``
        is the accompanying human-readable detail.
    """
    now = time.time()
    ttl = settings.preflight_cache_ttl
    if not force and _cache.ts > 0:
        age = now - _cache.ts
        # time.time() is wall-clock and can step backwards (e.g. a clock
        # adjustment). A negative age must count as stale; otherwise the old
        # verdict would stay pinned until the clock caught up again.
        if 0 <= age < ttl:
            return _cache.ok, _cache.reason

    ok, reason = _compute()
    # Stamp with the time captured before the check ran, as before.
    _cache.ts = now
    _cache.ok = ok
    _cache.reason = reason
    if not ok:
        logger.warning("Preflight FAIL: %s", reason)
    return ok, reason
def reset_cache() -> None:
    """Drop the cached verdict so the next ``check()`` recomputes it.

    Intended for tests and forced rechecks. The timestamp is zeroed, the
    verdict is set to failing, and the reason becomes ``"reset"``.
    """
    _cache.ts, _cache.ok, _cache.reason = 0.0, False, "reset"