feat(resilience): cheap preflight + 429/transient error classifier (ORCH-1)
preflight.py: cached CLAUDE_BIN exists + claude --version (no tokens, no prompt-ping). error_classifier.py: classify_log_file -> transient|permanent from log tail + Retry-After parsing.
This commit is contained in:
90
src/preflight.py
Normal file
90
src/preflight.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""ORCH-1 resilience: cheap preflight check (CLI / network available?).
|
||||
|
||||
Goal: before the worker claims a job, confirm the claude CLI binary and runtime
|
||||
are reachable WITHOUT spending any tokens. We only do local/cheap checks:
|
||||
|
||||
1. os.path.exists(CLAUDE_BIN) -- instant
|
||||
2. `claude --version` (timeout 5s) -- spawns CLI, does NOT call the API
|
||||
|
||||
The result is cached for `preflight_cache_ttl` seconds so we do not re-run
|
||||
`claude --version` on every worker tick.
|
||||
|
||||
🚫 We deliberately do NOT do a prompt ping (ping->pong) — that would burn the
|
||||
rate limit and add latency. Preflight is local-only.
|
||||
"""
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
from .config import settings
|
||||
|
||||
logger = logging.getLogger("orchestrator.preflight")
|
||||
|
||||
_VERSION_TIMEOUT = 5
|
||||
|
||||
|
||||
class _PreflightCache:
    """Mutable holder for the most recent preflight outcome.

    Attributes:
        ts: Time value stored by check() for the last computed result;
            0.0 means nothing usable is cached.
        ok: Whether the last stored result was a pass.
        reason: Human-readable detail accompanying the last stored result.
    """

    ts: float
    ok: bool
    reason: str

    def __init__(self):
        # Start in the "never checked" state: failing, with a zero timestamp.
        self.ts, self.ok, self.reason = 0.0, False, "not checked yet"


# Single module-wide cache instance read and written by check() / reset_cache().
_cache = _PreflightCache()
|
||||
|
||||
|
||||
def _claude_bin() -> str:
    """Return the claude CLI path from settings, or the built-in default.

    The default is used when `settings` has no `claude_bin` attribute or
    when that attribute is falsy (None, empty string).
    """
    configured = getattr(settings, "claude_bin", None)
    if configured:
        return configured
    return "/opt/claude-code/bin/claude.exe"
|
||||
|
||||
|
||||
def _run_version(bin_path: str) -> tuple[bool, str]:
    """Run `<bin_path> --version` to prove the CLI starts without touching the API.

    Args:
        bin_path: Path of the executable to invoke.

    Returns:
        (True, text) when the process exits with code 0, where text is the
        first 120 characters of stdout (falling back to stderr), or "ok" if
        both are empty.
        (False, text) when the process exits non-zero, exceeds
        _VERSION_TIMEOUT seconds, cannot be found, or fails in any other way.
    """
    try:
        proc = subprocess.run(
            [bin_path, "--version"],
            timeout=_VERSION_TIMEOUT,
            text=True,
            capture_output=True,
        )
        if proc.returncode != 0:
            # On failure prefer stderr, since that is where the error text usually is.
            detail = (proc.stderr or proc.stdout).strip()[:120]
            return False, f"--version exit {proc.returncode}: {detail}"
        banner = (proc.stdout or proc.stderr or "").strip()[:120]
        return True, banner or "ok"
    except subprocess.TimeoutExpired:
        return False, f"--version timed out after {_VERSION_TIMEOUT}s"
    except FileNotFoundError:
        return False, "claude binary not found (FileNotFoundError)"
    except Exception as exc:  # pragma: no cover - defensive
        return False, f"--version error: {exc}"
|
||||
|
||||
|
||||
def _compute() -> tuple[bool, str]:
    """Run the uncached preflight: path existence first, then `--version`.

    Returns:
        (False, reason) if the resolved binary path does not exist;
        otherwise whatever _run_version() reports for that path.
    """
    candidate = _claude_bin()
    if os.path.exists(candidate):
        return _run_version(candidate)
    return False, f"CLAUDE_BIN not found: {candidate}"
|
||||
|
||||
|
||||
def check(force: bool = False) -> tuple[bool, str]:
    """Return (ok, reason). Cached for preflight_cache_ttl seconds.

    Args:
        force: When True, bypass the cache and recompute (used by the
            breaker half-open probe / tests).

    Returns:
        A tuple (ok, reason): ok is True when the preflight passed, and
        reason is the human-readable detail produced by the check.
    """
    now = time.time()
    ttl = settings.preflight_cache_ttl
    if not force and _cache.ts > 0:
        age = now - _cache.ts
        # time.time() is wall-clock time and can step backwards (NTP / manual
        # adjustment). A negative age would otherwise always be "< ttl" and pin
        # a stale result until the clock caught up, so treat it as expired.
        if 0 <= age < ttl:
            return _cache.ok, _cache.reason
    ok, reason = _compute()
    _cache.ts = now
    _cache.ok = ok
    _cache.reason = reason
    if not ok:
        # Lazy %-style arguments: the message is only formatted if emitted.
        logger.warning("Preflight FAIL: %s", reason)
    return ok, reason
|
||||
|
||||
|
||||
def reset_cache() -> None:
    """Invalidate the cached preflight result (tests / forced recheck).

    Puts the shared cache back into a failing state with a zero timestamp,
    so the next non-forced check() recomputes instead of serving a cached
    value.
    """
    _cache.ts, _cache.ok, _cache.reason = 0.0, False, "reset"
|
||||
Reference in New Issue
Block a user