Container ORCH_CLAUDE_BIN pointed at a non-existent /usr/bin/claude while the launcher spawns the hardcoded /opt/claude-code/bin/claude.exe. Preflight now follows AgentLauncher.CLAUDE_BIN (the genuinely executed path), so it no longer falsely blocks every job in production.
107 lines
3.6 KiB
Python
107 lines
3.6 KiB
Python
"""ORCH-1 resilience: cheap preflight check (CLI / network available?).
|
|
|
|
Goal: before the worker claims a job, confirm the claude CLI binary and runtime
|
|
are reachable WITHOUT spending any tokens. We only do local/cheap checks:
|
|
|
|
1. os.path.exists(CLAUDE_BIN) -- instant
|
|
2. `claude --version` (timeout 5s) -- spawns CLI, does NOT call the API
|
|
|
|
The result is cached for `preflight_cache_ttl` seconds so we do not re-run
|
|
`claude --version` on every worker tick.
|
|
|
|
🚫 We deliberately do NOT do a prompt ping (ping->pong) — that would burn the
|
|
rate limit and add latency. Preflight is local-only.
|
|
"""
|
|
import os
|
|
import time
|
|
import logging
|
|
import subprocess
|
|
|
|
from .config import settings
|
|
|
|
logger = logging.getLogger("orchestrator.preflight")
|
|
|
|
_VERSION_TIMEOUT = 5
|
|
|
|
|
|
class _PreflightCache:
    """Mutable holder for the outcome of the most recent preflight run."""

    # Timestamp of the last check; 0.0 means "nothing cached".
    ts: float
    # Whether the last check passed.
    ok: bool
    # Human-readable explanation accompanying ``ok``.
    reason: str

    def __init__(self):
        # Start in the "never checked" state.
        self.ts, self.ok, self.reason = 0.0, False, "not checked yet"
|
|
|
|
|
|
_cache = _PreflightCache()
|
|
|
|
|
|
def _claude_bin() -> str:
    """Resolve the claude binary that preflight should check.

    The result must match the binary the launcher actually spawns. The
    launcher hardcodes AgentLauncher.CLAUDE_BIN for the real Popen, so that
    value wins whenever it is set -- even if the file is currently missing,
    because preflight is then supposed to report exactly that path as not
    found. Only when the launcher value is unset (or the launcher module
    cannot be imported) do we fall back to settings.claude_bin and finally to
    a built-in default. (Note: the container's ORCH_CLAUDE_BIN may point
    elsewhere; preflight follows the path that is genuinely executed, not the
    unused env override.)

    Returns:
        The path of the claude executable to probe. Existence is NOT verified
        here; the caller performs that check.
    """
    default_bin = "/opt/claude-code/bin/claude.exe"
    try:
        # Imported lazily inside the function rather than at module import time.
        from .agents.launcher import AgentLauncher

        launcher_bin = getattr(AgentLauncher, "CLAUDE_BIN", None)
    except Exception:
        # Best effort: a broken/unimportable launcher must not crash
        # preflight; fall through to the configured/default path instead.
        launcher_bin = None
    return launcher_bin or getattr(settings, "claude_bin", None) or default_bin
|
|
|
|
|
|
def _run_version(bin_path: str) -> tuple[bool, str]:
    """Run `<bin_path> --version` to prove the CLI starts, without touching the API.

    Returns (ok, reason). On success the reason is the first 120 characters of
    the CLI output (or "ok" when it printed nothing); on failure it describes
    the exit code, timeout, or error that occurred.
    """
    argv = [bin_path, "--version"]
    try:
        proc = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=_VERSION_TIMEOUT,
        )
        if proc.returncode != 0:
            # Prefer stderr for the failure detail, fall back to stdout.
            detail = (proc.stderr or proc.stdout).strip()[:120]
            return False, f"--version exit {proc.returncode}: {detail}"
        # Prefer stdout for the version banner, fall back to stderr.
        banner = (proc.stdout or proc.stderr or "").strip()[:120]
        return True, banner or "ok"
    except subprocess.TimeoutExpired:
        return False, f"--version timed out after {_VERSION_TIMEOUT}s"
    except FileNotFoundError:
        return False, "claude binary not found (FileNotFoundError)"
    except Exception as exc:  # pragma: no cover - defensive
        return False, f"--version error: {exc}"
|
|
|
|
|
|
def _compute() -> tuple[bool, str]:
    """Run the uncached preflight: binary exists, then `--version` succeeds.

    Returns (ok, reason).
    """
    candidate = _claude_bin()
    if os.path.exists(candidate):
        # The file is there; make sure it actually runs.
        return _run_version(candidate)
    return False, f"CLAUDE_BIN not found: {candidate}"
|
|
|
|
|
|
def check(force: bool = False) -> tuple[bool, str]:
    """Return (ok, reason). Cached for preflight_cache_ttl seconds.

    force=True bypasses the cache (used by the breaker half-open probe / tests).

    The cached result is reused only while its age lies in [0, ttl). A
    negative age means the wall clock was stepped backwards since the last
    check; the entry is then treated as stale instead of being served until
    the clock catches up again.
    """
    now = time.time()
    ttl = settings.preflight_cache_ttl
    age = now - _cache.ts
    # _cache.ts == 0 means "nothing cached" (initial state or reset_cache()).
    if not force and _cache.ts > 0 and 0 <= age < ttl:
        return _cache.ok, _cache.reason

    ok, reason = _compute()
    # Stamp with the time taken before the probe ran, so the TTL is measured
    # from the start of the check.
    _cache.ts = now
    _cache.ok = ok
    _cache.reason = reason
    if not ok:
        # Lazy %-style args: same rendered text, formatted only if emitted.
        logger.warning("Preflight FAIL: %s", reason)
    return ok, reason
|
|
|
|
|
|
def reset_cache() -> None:
    """Drop any cached preflight result (tests / forced recheck)."""
    # ts == 0.0 marks the cache as empty, so the next check() recomputes.
    _cache.ts, _cache.ok, _cache.reason = 0.0, False, "reset"
|