feat(preflight): catch logged-out auth and treat empty result as failure
ORCH-044 closes two blind spots that let a single de-authenticated agent
stall the shared queue for all projects:
P1 — preflight auth gate. `claude --version` answers even when logged out,
so version-only preflight was blind to auth. Adds a token-free, network-free
check of <AGENT_HOME>/.claude/.credentials.json: missing/unreadable/no-oauth
or an expired `claudeAiOauth.expiresAt` (epoch ms, vs now + skew) => preflight
FAIL; absent expiry => OK (no false positives). Result is cached on the same
preflight_cache_ttl. Post-factum safety net: launcher detects auth markers
("not logged in" / "/login" / "unauthorized" / 401) in the run log and resets
the preflight cache so the next tick re-evaluates auth. Auth failure is a gate,
not a transient — it does not spin the circuit breaker. Emergency toggle
ORCH_PREFLIGHT_CHECK_AUTH=false restores version-only behaviour.
P3 — empty log / no result-JSON => job failed. exit_code==0 with an empty or
JSON-less run log no longer counts as success: a separate result_ok flag gates
stage advance + usage comments, fires a Telegram alert, and routes the job
through the normal transient/permanent failure path (exit_code integrity in
agent_runs preserved).
Scope: P2 (--effort) is intentionally excluded and tracked in ORCH-50.
New settings: ORCH_PREFLIGHT_CHECK_AUTH, ORCH_CLAUDE_CREDENTIALS_PATH,
ORCH_AUTH_EXPIRY_SKEW_SECONDS. Docs updated (INFRA.md, internals.md, CHANGELOG).
Refs: ORCH-044
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
133
src/preflight.py
133
src/preflight.py
@@ -5,14 +5,25 @@ are reachable WITHOUT spending any tokens. We only do local/cheap checks:
|
||||
|
||||
1. os.path.exists(CLAUDE_BIN) -- instant
|
||||
2. `claude --version` (timeout 5s) -- spawns CLI, does NOT call the API
|
||||
3. auth check (ORCH-044, P1) -- read the local OAuth credentials file
|
||||
|
||||
The result is cached for `preflight_cache_ttl` seconds so we do not re-run
|
||||
`claude --version` on every worker tick.
|
||||
`claude --version` (or re-read the credentials file) on every worker tick.
|
||||
|
||||
🚫 We deliberately do NOT do a prompt ping (ping->pong) — that would burn the
|
||||
rate limit and add latency. Preflight is local-only.
|
||||
|
||||
ORCH-044 (P1): `claude --version` answers successfully even when claude is NOT
|
||||
logged in (the version is local information), so version-only preflight was blind
|
||||
to auth. We add a token-free auth gate: read <AGENT_HOME>/.claude/.credentials.json
|
||||
and validate the OAuth token (presence + expiry). Combined with a post-factum
|
||||
`Not logged in` marker detection (is_auth_failure_text), this stops a logged-out
|
||||
instance from claiming jobs and silently dying with an empty run log. No network
|
||||
call is ever made here.
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
import subprocess
|
||||
@@ -23,6 +34,15 @@ logger = logging.getLogger("orchestrator.preflight")
|
||||
|
||||
_VERSION_TIMEOUT = 5
|
||||
|
||||
# ORCH-044 (P1b): post-factum auth-failure markers. If an agent started under a
|
||||
# session that died/expired between preflight and spawn, these substrings in the
|
||||
# run log identify the auth failure so the launcher can invalidate the preflight
|
||||
# cache (forcing the next tick to re-evaluate auth proactively).
|
||||
_AUTH_FAIL_RE = re.compile(
|
||||
r"not logged in|please run\s*/login|invalid api key|unauthorized|\b401\b",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
class _PreflightCache:
|
||||
def __init__(self):
|
||||
@@ -74,11 +94,120 @@ def _run_version(bin_path: str) -> tuple[bool, str]:
|
||||
return False, f"--version error: {e}"
|
||||
|
||||
|
||||
def _agent_home() -> str:
|
||||
"""Resolve the HOME the launcher actually spawns claude under (ORCH-044, TR-1.3).
|
||||
|
||||
The auth credentials live under the *agent's* HOME (/home/slin), which the
|
||||
launcher injects into the claude subprocess env — NOT the orchestrator
|
||||
process HOME. We mirror _claude_bin()'s "follow the genuinely executed path"
|
||||
approach by reading AgentLauncher.AGENT_HOME. Falls back to the known default
|
||||
if the launcher cannot be imported (e.g. isolated unit test).
|
||||
"""
|
||||
try:
|
||||
from .agents.launcher import AgentLauncher
|
||||
home = getattr(AgentLauncher, "AGENT_HOME", None)
|
||||
if home:
|
||||
return home
|
||||
except Exception:
|
||||
pass
|
||||
return "/home/slin"
|
||||
|
||||
|
||||
def _credentials_path() -> str:
|
||||
"""Path to claude's OAuth credentials file (ORCH-044, P1).
|
||||
|
||||
settings.claude_credentials_path wins when set; otherwise
|
||||
<AGENT_HOME>/.claude/.credentials.json.
|
||||
"""
|
||||
explicit = (getattr(settings, "claude_credentials_path", "") or "").strip()
|
||||
if explicit:
|
||||
return explicit
|
||||
return os.path.join(_agent_home(), ".claude", ".credentials.json")
|
||||
|
||||
|
||||
def _iso(epoch_ms) -> str:
|
||||
"""Best-effort epoch-ms -> ISO-8601 UTC string (for human-readable reasons)."""
|
||||
try:
|
||||
from datetime import datetime, timezone
|
||||
return datetime.fromtimestamp(int(epoch_ms) / 1000, tz=timezone.utc).isoformat()
|
||||
except Exception:
|
||||
return str(epoch_ms)
|
||||
|
||||
|
||||
def is_auth_failure_text(text: str) -> bool:
|
||||
"""ORCH-044 (P1b): True if `text` contains a claude auth-failure marker.
|
||||
|
||||
Used post-factum on a run log so the launcher can tell an auth death apart
|
||||
from a generic failure and reset the preflight cache. Never raises.
|
||||
"""
|
||||
if not text:
|
||||
return False
|
||||
try:
|
||||
return bool(_AUTH_FAIL_RE.search(text))
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _check_auth() -> tuple[bool, str]:
|
||||
"""ORCH-044 (P1a): token-free local auth gate. Never raises.
|
||||
|
||||
Steps (ADR-001 §P1):
|
||||
1. credentials file missing / unreadable / invalid JSON -> not ok.
|
||||
2. no claudeAiOauth block / accessToken -> not ok.
|
||||
3. claudeAiOauth.expiresAt (epoch ms) <= now + skew -> expired -> not ok.
|
||||
4. accessToken present but expiresAt absent/unparsable -> OK (cannot prove
|
||||
expiry; we do not manufacture false positives that would wedge the shared
|
||||
queue — see ADR Risks R-1).
|
||||
|
||||
Fail-safe: any unexpected error returns (False, ...) so a logged-out / broken
|
||||
state never claims a job (BR-2 / TR-3.5). This reads only a local file — no
|
||||
network call, no token spend (BR-1 / AC-5).
|
||||
"""
|
||||
try:
|
||||
path = _credentials_path()
|
||||
if not os.path.exists(path):
|
||||
return False, f"claude not logged in: credentials missing ({path})"
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
except (OSError, ValueError) as e:
|
||||
return False, f"claude not logged in: credentials unreadable ({e})"
|
||||
|
||||
oauth = data.get("claudeAiOauth") if isinstance(data, dict) else None
|
||||
if not isinstance(oauth, dict) or not oauth.get("accessToken"):
|
||||
return False, "claude not logged in: no oauth token"
|
||||
|
||||
expires = oauth.get("expiresAt")
|
||||
if expires is None:
|
||||
return True, "auth ok (no expiry recorded)"
|
||||
try:
|
||||
expires_ms = int(expires)
|
||||
except (TypeError, ValueError):
|
||||
return True, "auth ok (unparsable expiry)"
|
||||
|
||||
skew_ms = int(getattr(settings, "auth_expiry_skew_seconds", 0) or 0) * 1000
|
||||
now_ms = int(time.time() * 1000)
|
||||
if expires_ms <= now_ms + skew_ms:
|
||||
return False, f"OAuth token expired at {_iso(expires_ms)}"
|
||||
return True, "auth ok"
|
||||
except Exception as e: # pragma: no cover - defensive fail-safe
|
||||
return False, f"auth check error: {e}"
|
||||
|
||||
|
||||
def _compute() -> tuple[bool, str]:
|
||||
bin_path = _claude_bin()
|
||||
if not os.path.exists(bin_path):
|
||||
return False, f"CLAUDE_BIN not found: {bin_path}"
|
||||
return _run_version(bin_path)
|
||||
ok, reason = _run_version(bin_path)
|
||||
if not ok:
|
||||
return ok, reason
|
||||
# ORCH-044 (P1): version is local info and answers even when logged out, so
|
||||
# gate on a token-free auth check too. Toggleable for emergencies.
|
||||
if getattr(settings, "preflight_check_auth", True):
|
||||
auth_ok, auth_reason = _check_auth()
|
||||
if not auth_ok:
|
||||
return False, auth_reason
|
||||
return True, reason
|
||||
|
||||
|
||||
def check(force: bool = False) -> tuple[bool, str]:
|
||||
|
||||
Reference in New Issue
Block a user