feat(resilience): cheap preflight + 429/transient error classifier (ORCH-1)

preflight.py: cached CLAUDE_BIN exists + claude --version (no tokens, no
prompt-ping). error_classifier.py: classify_log_file -> transient|permanent
from log tail + Retry-After parsing.
This commit is contained in:
Dev Agent
2026-06-03 00:12:17 +03:00
parent 0cd9b11fe0
commit 4ef87a3959
2 changed files with 177 additions and 0 deletions

90
src/preflight.py Normal file
View File

@@ -0,0 +1,90 @@
"""ORCH-1 resilience: cheap preflight check (CLI / network available?).
Goal: before the worker claims a job, confirm the claude CLI binary and runtime
are reachable WITHOUT spending any tokens. We only do local/cheap checks:
1. os.path.exists(CLAUDE_BIN) -- instant
2. `claude --version` (timeout 5s) -- spawns CLI, does NOT call the API
The result is cached for `preflight_cache_ttl` seconds so we do not re-run
`claude --version` on every worker tick.
🚫 We deliberately do NOT do a prompt ping (ping->pong) — that would burn the
rate limit and add latency. Preflight is local-only.
"""
import os
import time
import logging
import subprocess
from .config import settings
logger = logging.getLogger("orchestrator.preflight")
_VERSION_TIMEOUT = 5
class _PreflightCache:
def __init__(self):
self.ts: float = 0.0
self.ok: bool = False
self.reason: str = "not checked yet"
_cache = _PreflightCache()
def _claude_bin() -> str:
return getattr(settings, "claude_bin", None) or "/opt/claude-code/bin/claude.exe"
def _run_version(bin_path: str) -> tuple[bool, str]:
"""`claude --version` — proves the CLI runs without touching the API."""
try:
r = subprocess.run(
[bin_path, "--version"],
capture_output=True,
text=True,
timeout=_VERSION_TIMEOUT,
)
if r.returncode == 0:
return True, (r.stdout or r.stderr or "").strip()[:120] or "ok"
return False, f"--version exit {r.returncode}: {(r.stderr or r.stdout).strip()[:120]}"
except subprocess.TimeoutExpired:
return False, f"--version timed out after {_VERSION_TIMEOUT}s"
except FileNotFoundError:
return False, "claude binary not found (FileNotFoundError)"
except Exception as e: # pragma: no cover - defensive
return False, f"--version error: {e}"
def _compute() -> tuple[bool, str]:
bin_path = _claude_bin()
if not os.path.exists(bin_path):
return False, f"CLAUDE_BIN not found: {bin_path}"
return _run_version(bin_path)
def check(force: bool = False) -> tuple[bool, str]:
"""Return (ok, reason). Cached for preflight_cache_ttl seconds.
force=True bypasses the cache (used by the breaker half-open probe / tests).
"""
now = time.time()
ttl = settings.preflight_cache_ttl
if not force and _cache.ts > 0 and (now - _cache.ts) < ttl:
return _cache.ok, _cache.reason
ok, reason = _compute()
_cache.ts = now
_cache.ok = ok
_cache.reason = reason
if not ok:
logger.warning(f"Preflight FAIL: {reason}")
return ok, reason
def reset_cache() -> None:
"""Invalidate the cache (tests / forced recheck)."""
_cache.ts = 0.0
_cache.ok = False
_cache.reason = "reset"