feat(resilience): cheap preflight + 429/transient error classifier (ORCH-1)
preflight.py: cached CLAUDE_BIN exists + claude --version (no tokens, no prompt-ping). error_classifier.py: classify_log_file -> transient|permanent from log tail + Retry-After parsing.
This commit is contained in:
87
src/error_classifier.py
Normal file
87
src/error_classifier.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""ORCH-1 resilience: classify an agent failure as transient vs permanent.

Rate limits / overload / network blips cannot be reliably predicted in advance,
so we classify *after the run* by scanning the agent's combined stdout/stderr log
(B-2 sends both to /app/data/runs/<run_id>.log).

- transient -> 429 / rate limit / overloaded / network / quota-exhausted etc.
               => backoff + transient retry (separate counter, larger budget).
- permanent -> a genuine code fault / agent error
               => normal retries while attempts < max_attempts, then 'failed'.

Also extracts a Retry-After hint (seconds) when the server provided one.
"""
|
||||
import re
|
||||
|
||||
# Case-insensitive substrings/patterns that signal a transient/rate-limit issue.
|
||||
_TRANSIENT_PATTERNS = [
|
||||
r"\b429\b",
|
||||
r"rate[\s_-]*limit",
|
||||
r"rate_limit_error",
|
||||
r"overloaded",
|
||||
r"overloaded_error",
|
||||
r"too many requests",
|
||||
r"quota",
|
||||
r"insufficient[_\s-]*quota",
|
||||
r"retry[\s-]*after",
|
||||
r"service unavailable",
|
||||
r"\b503\b",
|
||||
r"\b529\b",
|
||||
r"timed out",
|
||||
r"timeout",
|
||||
r"connection (reset|refused|error|aborted)",
|
||||
r"temporarily unavailable",
|
||||
r"econnreset",
|
||||
r"etimedout",
|
||||
]
|
||||
|
||||
_TRANSIENT_RE = re.compile("|".join(_TRANSIENT_PATTERNS), re.IGNORECASE)
|
||||
|
||||
# Retry-After: header style ("Retry-After: 30") or JSON ("retry_after": 30) or
|
||||
# "retry after 30 seconds". Returns the integer seconds.
|
||||
_RETRY_AFTER_RE = re.compile(
|
||||
r"retry[\s_-]*after[\"']?\s*[:=]?\s*[\"']?\s*(\d+)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def classify_text(text: str) -> str:
|
||||
"""Return 'transient' or 'permanent' for a chunk of log/stderr text."""
|
||||
if not text:
|
||||
return "permanent"
|
||||
return "transient" if _TRANSIENT_RE.search(text) else "permanent"
|
||||
|
||||
|
||||
def parse_retry_after(text: str) -> int | None:
|
||||
"""Return Retry-After seconds if present in the text, else None."""
|
||||
if not text:
|
||||
return None
|
||||
m = _RETRY_AFTER_RE.search(text)
|
||||
if m:
|
||||
try:
|
||||
return int(m.group(1))
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def classify_log_file(path: str, tail_bytes: int = 16384) -> tuple[str, int | None]:
|
||||
"""Classify the tail of a log file.
|
||||
|
||||
Reads the last `tail_bytes` of the log (rate-limit messages appear near the
|
||||
end) and returns (classification, retry_after_seconds_or_None).
|
||||
On any read error, treats it as 'permanent' (no special backoff).
|
||||
"""
|
||||
if not path:
|
||||
return "permanent", None
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
try:
|
||||
f.seek(-tail_bytes, 2)
|
||||
except OSError:
|
||||
f.seek(0)
|
||||
data = f.read()
|
||||
text = data.decode("utf-8", errors="replace")
|
||||
except Exception:
|
||||
return "permanent", None
|
||||
return classify_text(text), parse_retry_after(text)
|
||||
Reference in New Issue
Block a user