"""ORCH-1 resilience: cheap preflight check (CLI / network available?). Goal: before the worker claims a job, confirm the claude CLI binary and runtime are reachable WITHOUT spending any tokens. We only do local/cheap checks: 1. os.path.exists(CLAUDE_BIN) -- instant 2. `claude --version` (timeout 5s) -- spawns CLI, does NOT call the API The result is cached for `preflight_cache_ttl` seconds so we do not re-run `claude --version` on every worker tick. 🚫 We deliberately do NOT do a prompt ping (ping->pong) — that would burn the rate limit and add latency. Preflight is local-only. """ import os import time import logging import subprocess from .config import settings logger = logging.getLogger("orchestrator.preflight") _VERSION_TIMEOUT = 5 class _PreflightCache: def __init__(self): self.ts: float = 0.0 self.ok: bool = False self.reason: str = "not checked yet" _cache = _PreflightCache() def _claude_bin() -> str: """Resolve the claude binary preflight should check. Must match the binary the launcher actually spawns. The launcher hardcodes AgentLauncher.CLAUDE_BIN for the real Popen, so we prefer that; we only fall back to settings.claude_bin / a default if it is somehow unset. (Note: the container's ORCH_CLAUDE_BIN may point elsewhere; preflight follows the path that is genuinely executed, not the unused env override.) """ try: from .agents.launcher import AgentLauncher launcher_bin = getattr(AgentLauncher, "CLAUDE_BIN", None) if launcher_bin and os.path.exists(launcher_bin): return launcher_bin # Launcher path not present -> fall back to configured/default. return launcher_bin or getattr(settings, "claude_bin", None) or "/opt/claude-code/bin/claude.exe" except Exception: return getattr(settings, "claude_bin", None) or "/opt/claude-code/bin/claude.exe" def _run_version(bin_path: str) -> tuple[bool, str]: """`claude --version` — proves the CLI runs without touching the API.""" try: r = subprocess.run( [bin_path, "--version"], capture_output=True, text=True, timeout=_VERSION_TIMEOUT, ) if r.returncode == 0: return True, (r.stdout or r.stderr or "").strip()[:120] or "ok" return False, f"--version exit {r.returncode}: {(r.stderr or r.stdout).strip()[:120]}" except subprocess.TimeoutExpired: return False, f"--version timed out after {_VERSION_TIMEOUT}s" except FileNotFoundError: return False, "claude binary not found (FileNotFoundError)" except Exception as e: # pragma: no cover - defensive return False, f"--version error: {e}" def _compute() -> tuple[bool, str]: bin_path = _claude_bin() if not os.path.exists(bin_path): return False, f"CLAUDE_BIN not found: {bin_path}" return _run_version(bin_path) def check(force: bool = False) -> tuple[bool, str]: """Return (ok, reason). Cached for preflight_cache_ttl seconds. force=True bypasses the cache (used by the breaker half-open probe / tests). """ now = time.time() ttl = settings.preflight_cache_ttl if not force and _cache.ts > 0 and (now - _cache.ts) < ttl: return _cache.ok, _cache.reason ok, reason = _compute() _cache.ts = now _cache.ok = ok _cache.reason = reason if not ok: logger.warning(f"Preflight FAIL: {reason}") return ok, reason def reset_cache() -> None: """Invalidate the cache (tests / forced recheck).""" _cache.ts = 0.0 _cache.ok = False _cache.reason = "reset"