feat(launcher): classify failures, backoff transient retry, breaker outcome (ORCH-1)

_finalize_job classifies the run log: transient (429/overload) -> backoff requeue via mark_job_transient with separate transient_attempts budget honouring Retry-After; permanent -> normal attempts<max. on_outcome callback feeds the circuit breaker. _backoff_seconds = min(2^n*base, max) | Retry-After.
2026-06-03 00:12:17 +03:00
parent 4ef87a3959
commit 90fdd19394
1 changed files with 105 additions and 30 deletions
--- a/src/agents/launcher.py
+++ b/src/agents/launcher.py
@@ -364,18 +364,35 @@ class AgentLauncher:
        # ORCH-1: drive the job-queue status for queue-launched jobs only.
        # (Legacy direct launch() has job_id=None and is unaffected.)
        if job_id is not None:
-            self._finalize_job(job_id, agent, run_id, exit_code)
+            self._finalize_job(job_id, agent, run_id, exit_code, output_path=output_path)

-    def _finalize_job(self, job_id: int, agent: str, run_id: int, exit_code):
+    def _backoff_seconds(self, transient_attempts: int, retry_after: int = None) -> int:
+        """Exponential backoff for transient failures, honouring Retry-After.
+
+        backoff = min(2^transient_attempts * base, max). If the server sent a
+        Retry-After, use the larger of the two (never poll sooner than asked).
+        """
+        base = settings.backoff_base_seconds
+        cap = settings.backoff_max_seconds
+        backoff = min((2 ** max(transient_attempts, 0)) * base, cap)
+        if retry_after is not None and retry_after > 0:
+            backoff = max(backoff, min(retry_after, cap))
+        return int(backoff)
+
+    def _finalize_job(self, job_id: int, agent: str, run_id: int, exit_code, output_path=None):
        """ORCH-1: update the jobs row after the agent process finished.

-        exit_code == 0  -> done.
-        exit_code != 0  -> retry while attempts < max_attempts (back to 'queued'),
-                           otherwise 'failed' + Telegram notification.
-        attempts were already incremented at claim time, so `attempts` reflects how
-        many times this job has been picked up.
+        exit_code == 0  -> done (and resets the breaker streak via on_outcome).
+        exit_code != 0  -> classify the failure from the run log tail (token-free):
+          - TRANSIENT (429/overload/network): backoff-requeue with available_at in
+            the future + a SEPARATE transient_attempts budget
+            (settings.transient_max_attempts), honouring Retry-After. Reported to
+            the breaker so it opens after N consecutive transient failures.
+          - PERMANENT (code fault): ordinary attempts < max_attempts requeue,
+            otherwise 'failed' + Telegram.
        """
        from ..db import get_job, mark_job
+        from ..error_classifier import classify_log_file
        try:
            job = get_job(job_id)
            if not job:
@@ -383,35 +400,93 @@ class AgentLauncher:
            if exit_code == 0:
                mark_job(job_id, "done", run_id=run_id)
                logger.info(f"Job {job_id} ({agent}) done (run_id={run_id})")
+                self._record_outcome(transient=False, recovered=True)
                return

-            attempts = job.get("attempts", 0)
-            max_attempts = job.get("max_attempts", 2)
-            err = f"agent {agent} exit_code={exit_code} (run_id={run_id})"
-            if attempts < max_attempts:
-                mark_job(job_id, "queued", run_id=run_id, error=err)
-                logger.warning(
-                    f"Job {job_id} ({agent}) failed (exit={exit_code}), "
-                    f"requeued (attempt {attempts}/{max_attempts})"
-                )
+            # Classify the failure from the agent log tail (no token cost).
+            kind, retry_after = "permanent", None
+            log_path = output_path or f"/app/data/runs/{run_id}.log"
+            try:
+                kind, retry_after = classify_log_file(log_path)
+            except Exception:
+                pass
+
+            if kind == "transient":
+                self._finalize_transient(job_id, agent, run_id, exit_code, job, retry_after)
            else:
-                mark_job(job_id, "failed", run_id=run_id, error=err)
-                logger.error(
-                    f"Job {job_id} ({agent}) failed permanently after "
-                    f"{attempts} attempts (exit={exit_code})"
-                )
-                try:
-                    from ..notifications import send_telegram
-                    send_telegram(
-                        f"\U0001f6a8 Job {job_id} ({agent}, repo {job.get('repo')}) "
-                        f"failed after {attempts} attempts (exit={exit_code}). "
-                        f"Logs: /app/data/runs/{run_id}.log"
-                    )
-                except Exception:
-                    pass
+                self._finalize_permanent(job_id, agent, run_id, exit_code, job)
        except Exception as e:
            logger.error(f"Job {job_id}: _finalize_job error: {e}")

+    def _finalize_transient(self, job_id, agent, run_id, exit_code, job, retry_after):
+        """Transient (429/overload/net) failure -> backoff requeue or fail when budget out."""
+        from ..db import mark_job, mark_job_transient
+        tattempts = job.get("transient_attempts", 0)
+        tmax = settings.transient_max_attempts
+        err = (f"transient (429/overload) agent {agent} exit={exit_code} "
+               f"(run_id={run_id}); retry_after={retry_after}")
+        self._record_outcome(transient=True, recovered=False)
+        if tattempts < tmax:
+            backoff = self._backoff_seconds(tattempts + 1, retry_after)
+            mark_job_transient(job_id, backoff, error=err)
+            logger.warning(
+                f"Job {job_id} ({agent}) TRANSIENT fail (exit={exit_code}), "
+                f"backoff {backoff}s, transient_attempt {tattempts + 1}/{tmax}"
+            )
+        else:
+            mark_job(job_id, "failed", run_id=run_id, error=err)
+            logger.error(
+                f"Job {job_id} ({agent}) failed after {tattempts} transient attempts"
+            )
+            self._notify_failed(job_id, agent, job, run_id,
+                                f"transient (rate-limit) after {tattempts} attempts")
+
+    def _finalize_permanent(self, job_id, agent, run_id, exit_code, job):
+        """Permanent (code-fault) failure -> normal attempts<max requeue, then fail."""
+        from ..db import mark_job
+        attempts = job.get("attempts", 0)
+        max_attempts = job.get("max_attempts", 2)
+        err = f"agent {agent} exit_code={exit_code} (run_id={run_id})"
+        self._record_outcome(transient=False, recovered=False)
+        if attempts < max_attempts:
+            mark_job(job_id, "queued", run_id=run_id, error=err)
+            logger.warning(
+                f"Job {job_id} ({agent}) failed (exit={exit_code}), "
+                f"requeued (attempt {attempts}/{max_attempts})"
+            )
+        else:
+            mark_job(job_id, "failed", run_id=run_id, error=err)
+            logger.error(
+                f"Job {job_id} ({agent}) failed permanently after "
+                f"{attempts} attempts (exit={exit_code})"
+            )
+            self._notify_failed(job_id, agent, job, run_id,
+                                f"{attempts} attempts (exit={exit_code})")
+
+    def _notify_failed(self, job_id, agent, job, run_id, why):
+        try:
+            from ..notifications import send_telegram
+            send_telegram(
+                f"\U0001f6a8 Job {job_id} ({agent}, repo {job.get('repo')}) "
+                f"failed: {why}. Logs: /app/data/runs/{run_id}.log"
+            )
+        except Exception:
+            pass
+
+    def _record_outcome(self, transient: bool, recovered: bool):
+        """Forward the run outcome to the circuit breaker (if a worker is wired).
+
+        Decoupled via a settable callback (set by QueueWorker.start) so the launcher
+        does not hard-import the worker (avoids a cycle) and tests can run the
+        launcher standalone.
+        """
+        cb = getattr(self, "on_outcome", None)
+        if cb:
+            try:
+                cb(transient=transient, recovered=recovered)
+            except Exception:
+                pass
+
    def _try_advance_stage(self, run_id: int, agent: str, repo: str, branch: str):
        """After agent finishes successfully, check QG and advance stage if possible."""
        try: