"""ORCH-1 resilience: cheap preflight check (CLI / network available?). Goal: before the worker claims a job, confirm the claude CLI binary and runtime are reachable WITHOUT spending any tokens. We only do local/cheap checks: 1. os.path.exists(CLAUDE_BIN) -- instant 2. `claude --version` (timeout 5s) -- spawns CLI, does NOT call the API 3. auth check (ORCH-044, P1) -- read the local OAuth credentials file The result is cached for `preflight_cache_ttl` seconds so we do not re-run `claude --version` (or re-read the credentials file) on every worker tick. ๐Ÿšซ We deliberately do NOT do a prompt ping (ping->pong) โ€” that would burn the rate limit and add latency. Preflight is local-only. ORCH-044 (P1): `claude --version` answers successfully even when claude is NOT logged in (the version is local information), so version-only preflight was blind to auth. We add a token-free auth gate: read /.claude/.credentials.json and validate the OAuth token (presence + expiry). Combined with a post-factum `Not logged in` marker detection (is_auth_failure_text), this stops a logged-out instance from claiming jobs and silently dying with an empty run log. No network call is ever made here. """ import os import re import json import time import logging import subprocess from .config import settings logger = logging.getLogger("orchestrator.preflight") _VERSION_TIMEOUT = 5 # ORCH-044 (P1b): post-factum auth-failure markers. If an agent started under a # session that died/expired between preflight and spawn, these substrings in the # run log identify the auth failure so the launcher can invalidate the preflight # cache (forcing the next tick to re-evaluate auth proactively). _AUTH_FAIL_RE = re.compile( r"not logged in|please run\s*/login|invalid api key|unauthorized|\b401\b", re.IGNORECASE, ) class _PreflightCache: def __init__(self): self.ts: float = 0.0 self.ok: bool = False self.reason: str = "not checked yet" _cache = _PreflightCache() def _claude_bin() -> str: """Resolve the claude binary preflight should check. Must match the binary the launcher actually spawns. The launcher hardcodes AgentLauncher.CLAUDE_BIN for the real Popen, so we prefer that; we only fall back to settings.claude_bin / a default if it is somehow unset. (Note: the container's ORCH_CLAUDE_BIN may point elsewhere; preflight follows the path that is genuinely executed, not the unused env override.) """ try: from .agents.launcher import AgentLauncher launcher_bin = getattr(AgentLauncher, "CLAUDE_BIN", None) if launcher_bin and os.path.exists(launcher_bin): return launcher_bin # Launcher path not present -> fall back to configured/default. return launcher_bin or getattr(settings, "claude_bin", None) or "/opt/claude-code/bin/claude.exe" except Exception: return getattr(settings, "claude_bin", None) or "/opt/claude-code/bin/claude.exe" def _run_version(bin_path: str) -> tuple[bool, str]: """`claude --version` โ€” proves the CLI runs without touching the API.""" try: r = subprocess.run( [bin_path, "--version"], capture_output=True, text=True, timeout=_VERSION_TIMEOUT, ) if r.returncode == 0: return True, (r.stdout or r.stderr or "").strip()[:120] or "ok" return False, f"--version exit {r.returncode}: {(r.stderr or r.stdout).strip()[:120]}" except subprocess.TimeoutExpired: return False, f"--version timed out after {_VERSION_TIMEOUT}s" except FileNotFoundError: return False, "claude binary not found (FileNotFoundError)" except Exception as e: # pragma: no cover - defensive return False, f"--version error: {e}" def _agent_home() -> str: """Resolve the HOME the launcher actually spawns claude under (ORCH-044, TR-1.3). The auth credentials live under the *agent's* HOME (/home/slin), which the launcher injects into the claude subprocess env โ€” NOT the orchestrator process HOME. We mirror _claude_bin()'s "follow the genuinely executed path" approach by reading AgentLauncher.AGENT_HOME. Falls back to the known default if the launcher cannot be imported (e.g. isolated unit test). """ try: from .agents.launcher import AgentLauncher home = getattr(AgentLauncher, "AGENT_HOME", None) if home: return home except Exception: pass return "/home/slin" def _credentials_path() -> str: """Path to claude's OAuth credentials file (ORCH-044, P1). settings.claude_credentials_path wins when set; otherwise /.claude/.credentials.json. """ explicit = (getattr(settings, "claude_credentials_path", "") or "").strip() if explicit: return explicit return os.path.join(_agent_home(), ".claude", ".credentials.json") def _iso(epoch_ms) -> str: """Best-effort epoch-ms -> ISO-8601 UTC string (for human-readable reasons).""" try: from datetime import datetime, timezone return datetime.fromtimestamp(int(epoch_ms) / 1000, tz=timezone.utc).isoformat() except Exception: return str(epoch_ms) def is_auth_failure_text(text: str) -> bool: """ORCH-044 (P1b): True if `text` contains a claude auth-failure marker. Used post-factum on a run log so the launcher can tell an auth death apart from a generic failure and reset the preflight cache. Never raises. """ if not text: return False try: return bool(_AUTH_FAIL_RE.search(text)) except Exception: return False def _check_auth() -> tuple[bool, str]: """ORCH-044 (P1a): token-free local auth gate. Never raises. Steps (ADR-001 ยงP1): 1. credentials file missing / unreadable / invalid JSON -> not ok. 2. no claudeAiOauth block / accessToken -> not ok. 3. claudeAiOauth.expiresAt (epoch ms) <= now + skew -> expired -> not ok. 4. accessToken present but expiresAt absent/unparsable -> OK (cannot prove expiry; we do not manufacture false positives that would wedge the shared queue โ€” see ADR Risks R-1). Fail-safe: any unexpected error returns (False, ...) so a logged-out / broken state never claims a job (BR-2 / TR-3.5). This reads only a local file โ€” no network call, no token spend (BR-1 / AC-5). """ try: path = _credentials_path() if not os.path.exists(path): return False, f"claude not logged in: credentials missing ({path})" try: with open(path, "r", encoding="utf-8") as f: data = json.load(f) except (OSError, ValueError) as e: return False, f"claude not logged in: credentials unreadable ({e})" oauth = data.get("claudeAiOauth") if isinstance(data, dict) else None if not isinstance(oauth, dict) or not oauth.get("accessToken"): return False, "claude not logged in: no oauth token" expires = oauth.get("expiresAt") if expires is None: return True, "auth ok (no expiry recorded)" try: expires_ms = int(expires) except (TypeError, ValueError): return True, "auth ok (unparsable expiry)" skew_ms = int(getattr(settings, "auth_expiry_skew_seconds", 0) or 0) * 1000 now_ms = int(time.time() * 1000) if expires_ms <= now_ms + skew_ms: return False, f"OAuth token expired at {_iso(expires_ms)}" return True, "auth ok" except Exception as e: # pragma: no cover - defensive fail-safe return False, f"auth check error: {e}" def _compute() -> tuple[bool, str]: bin_path = _claude_bin() if not os.path.exists(bin_path): return False, f"CLAUDE_BIN not found: {bin_path}" ok, reason = _run_version(bin_path) if not ok: return ok, reason # ORCH-044 (P1): version is local info and answers even when logged out, so # gate on a token-free auth check too. Toggleable for emergencies. if getattr(settings, "preflight_check_auth", True): auth_ok, auth_reason = _check_auth() if not auth_ok: return False, auth_reason return True, reason def check(force: bool = False) -> tuple[bool, str]: """Return (ok, reason). Cached for preflight_cache_ttl seconds. force=True bypasses the cache (used by the breaker half-open probe / tests). """ now = time.time() ttl = settings.preflight_cache_ttl if not force and _cache.ts > 0 and (now - _cache.ts) < ttl: return _cache.ok, _cache.reason ok, reason = _compute() _cache.ts = now _cache.ok = ok _cache.reason = reason if not ok: logger.warning(f"Preflight FAIL: {reason}") return ok, reason def reset_cache() -> None: """Invalidate the cache (tests / forced recheck).""" _cache.ts = 0.0 _cache.ok = False _cache.reason = "reset"