feat(resilience): cheap preflight + 429/transient error classifier (ORCH-1)

preflight.py: cached CLAUDE_BIN exists + claude --version (no tokens, no
prompt-ping). error_classifier.py: classify_log_file -> transient|permanent
from log tail + Retry-After parsing.
This commit is contained in:
Dev Agent
2026-06-03 00:12:17 +03:00
parent 0cd9b11fe0
commit 4ef87a3959
2 changed files with 177 additions and 0 deletions

87
src/error_classifier.py Normal file
View File

@@ -0,0 +1,87 @@
"""ORCH-1 resilience: classify an agent failure as transient vs permanent.
Rate limits / overload / network blips cannot be reliably predicted in advance,
so we classify *after the run* by scanning the agent's combined stdout/stderr log
(B-2 sends both to /app/data/runs/<run_id>.log).
- transient -> 429 / rate limit / overloaded / network / quota-exhausted etc.
=> backoff + transient retry (separate counter, larger budget).
- permanent -> a genuine code fault / agent error
=> normal attempts < max_attempts, then 'failed'.
Also extracts a Retry-After hint (seconds) when the server provided one.
"""
import re
# Case-insensitive substrings/patterns that signal a transient/rate-limit issue.
_TRANSIENT_PATTERNS = [
r"\b429\b",
r"rate[\s_-]*limit",
r"rate_limit_error",
r"overloaded",
r"overloaded_error",
r"too many requests",
r"quota",
r"insufficient[_\s-]*quota",
r"retry[\s-]*after",
r"service unavailable",
r"\b503\b",
r"\b529\b",
r"timed out",
r"timeout",
r"connection (reset|refused|error|aborted)",
r"temporarily unavailable",
r"econnreset",
r"etimedout",
]
_TRANSIENT_RE = re.compile("|".join(_TRANSIENT_PATTERNS), re.IGNORECASE)
# Retry-After: header style ("Retry-After: 30") or JSON ("retry_after": 30) or
# "retry after 30 seconds". Returns the integer seconds.
_RETRY_AFTER_RE = re.compile(
r"retry[\s_-]*after[\"']?\s*[:=]?\s*[\"']?\s*(\d+)",
re.IGNORECASE,
)
def classify_text(text: str) -> str:
"""Return 'transient' or 'permanent' for a chunk of log/stderr text."""
if not text:
return "permanent"
return "transient" if _TRANSIENT_RE.search(text) else "permanent"
def parse_retry_after(text: str) -> int | None:
"""Return Retry-After seconds if present in the text, else None."""
if not text:
return None
m = _RETRY_AFTER_RE.search(text)
if m:
try:
return int(m.group(1))
except (TypeError, ValueError):
return None
return None
def classify_log_file(path: str, tail_bytes: int = 16384) -> tuple[str, int | None]:
"""Classify the tail of a log file.
Reads the last `tail_bytes` of the log (rate-limit messages appear near the
end) and returns (classification, retry_after_seconds_or_None).
On any read error, treats it as 'permanent' (no special backoff).
"""
if not path:
return "permanent", None
try:
with open(path, "rb") as f:
try:
f.seek(-tail_bytes, 2)
except OSError:
f.seek(0)
data = f.read()
text = data.decode("utf-8", errors="replace")
except Exception:
return "permanent", None
return classify_text(text), parse_retry_after(text)