"""ORCH-1 resilience: classify an agent failure as transient vs permanent. Rate limits / overload / network blips cannot be reliably predicted in advance, so we classify *after the run* by scanning the agent's combined stdout/stderr log (B-2 sends both to /app/data/runs/.log). - transient -> 429 / rate limit / overloaded / network / quota-exhausted etc. => backoff + transient retry (separate counter, larger budget). - permanent -> a genuine code fault / agent error => normal attempts < max_attempts, then 'failed'. Also extracts a Retry-After hint (seconds) when the server provided one. """ import re # Case-insensitive substrings/patterns that signal a transient/rate-limit issue. _TRANSIENT_PATTERNS = [ r"\b429\b", r"rate[\s_-]*limit", r"rate_limit_error", r"overloaded", r"overloaded_error", r"too many requests", r"quota", r"insufficient[_\s-]*quota", r"retry[\s-]*after", r"service unavailable", r"\b503\b", r"\b529\b", r"timed out", r"timeout", r"connection (reset|refused|error|aborted)", r"temporarily unavailable", r"econnreset", r"etimedout", ] _TRANSIENT_RE = re.compile("|".join(_TRANSIENT_PATTERNS), re.IGNORECASE) # Retry-After: header style ("Retry-After: 30") or JSON ("retry_after": 30) or # "retry after 30 seconds". Returns the integer seconds. _RETRY_AFTER_RE = re.compile( r"retry[\s_-]*after[\"']?\s*[:=]?\s*[\"']?\s*(\d+)", re.IGNORECASE, ) def classify_text(text: str) -> str: """Return 'transient' or 'permanent' for a chunk of log/stderr text.""" if not text: return "permanent" return "transient" if _TRANSIENT_RE.search(text) else "permanent" def parse_retry_after(text: str) -> int | None: """Return Retry-After seconds if present in the text, else None.""" if not text: return None m = _RETRY_AFTER_RE.search(text) if m: try: return int(m.group(1)) except (TypeError, ValueError): return None return None def classify_log_file(path: str, tail_bytes: int = 16384) -> tuple[str, int | None]: """Classify the tail of a log file. Reads the last `tail_bytes` of the log (rate-limit messages appear near the end) and returns (classification, retry_after_seconds_or_None). On any read error, treats it as 'permanent' (no special backoff). """ if not path: return "permanent", None try: with open(path, "rb") as f: try: f.seek(-tail_bytes, 2) except OSError: f.seek(0) data = f.read() text = data.decode("utf-8", errors="replace") except Exception: return "permanent", None return classify_text(text), parse_retry_after(text)