From 98b47fe021e08fabacc2f0532bdf0562edcf618c Mon Sep 17 00:00:00 2001 From: claude-bot Date: Sat, 6 Jun 2026 08:11:27 +0000 Subject: [PATCH] feat(preflight): catch logged-out auth and treat empty result as failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ORCH-044 closes two blind spots that let a single de-authenticated agent stall the shared queue for all projects: P1 — preflight auth gate. `claude --version` answers even when logged out, so version-only preflight was blind to auth. Adds a token-free, network-free check of /.claude/.credentials.json: missing/unreadable/no-oauth or an expired `claudeAiOauth.expiresAt` (epoch ms, vs now + skew) => preflight FAIL; absent expiry => OK (no false positives). Result is cached on the same preflight_cache_ttl. Post-factum safety net: launcher detects auth markers ("not logged in" / "/login" / "unauthorized" / 401) in the run log and resets the preflight cache so the next tick re-evaluates auth. Auth failure is a gate, not a transient — it does not spin the circuit breaker. Emergency toggle ORCH_PREFLIGHT_CHECK_AUTH=false restores version-only behaviour. P3 — empty log / no result-JSON => job failed. exit_code==0 with an empty or JSON-less run log no longer counts as success: a separate result_ok flag gates stage advance + usage comments, fires a Telegram alert, and routes the job through the normal transient/permanent failure path (exit_code integrity in agent_runs preserved). Scope: P2 (--effort) is intentionally excluded and tracked in ORCH-50. New settings: ORCH_PREFLIGHT_CHECK_AUTH, ORCH_CLAUDE_CREDENTIALS_PATH, ORCH_AUTH_EXPIRY_SKEW_SECONDS. Docs updated (INFRA.md, internals.md, CHANGELOG). Refs: ORCH-044 Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 1 + docs/architecture/internals.md | 24 ++- docs/operations/INFRA.md | 16 ++ src/agents/launcher.py | 183 +++++++++++++++++--- src/config.py | 19 ++ src/preflight.py | 133 +++++++++++++- tests/test_empty_log_failure.py | 298 ++++++++++++++++++++++++++++++++ tests/test_preflight_auth.py | 246 ++++++++++++++++++++++++++ 8 files changed, 891 insertions(+), 29 deletions(-) create mode 100644 tests/test_empty_log_failure.py create mode 100644 tests/test_preflight_auth.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e3cbfb..7f71c11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ## [Unreleased] ### Added +- **Надёжность запуска агента: preflight ловит авторизацию + пустой результат = провал** (ORCH-044): закрыты две системные дыры, из-за которых разлогиненный/«быстро умерший» агент тихо вешал общую очередь всех проектов (инцидент ORCH-17). **P1 — preflight ловит auth (token-free, без сети/prompt-ping, BR-1):** после успешного `claude --version` (который отвечает даже когда claude разлогинен — версия локальна) `src/preflight.py` читает `/.claude/.credentials.json` и валидирует OAuth-токен — нет файла / битый JSON / нет `claudeAiOauth.accessToken` ⇒ FAIL; `claudeAiOauth.expiresAt` (epoch ms) `<= now + ORCH_AUTH_EXPIRY_SKEW_SECONDS` ⇒ протух ⇒ FAIL; нет `expiresAt` ⇒ OK (не плодим ложных срабатываний). Путь к credentials резолвится от `AgentLauncher.AGENT_HOME` (`/home/slin`, HOME под которым launcher реально спавнит claude), а не от HOME процесса орка (новый `_agent_home()`, зеркально `_claude_bin()`). Результат кешируется тем же `ORCH_PREFLIGHT_CACHE_TTL`. При `auth=fail` job не клеймится (`_drain_once` уже корректен при `ok=False`), reason виден в `/queue`. Защитная сетка постфактум: `_handle_auth_marker` детектит маркер разлогина в run-логе (`is_auth_failure_text`) и сбрасывает preflight-кеш, чтобы следующий тик переоценил auth (auth-провал НЕ transient, breaker не крутится). Новые настройки: `ORCH_PREFLIGHT_CHECK_AUTH` (тумблер, default true), `ORCH_CLAUDE_CREDENTIALS_PATH` (явный путь), `ORCH_AUTH_EXPIRY_SKEW_SECONDS`. **P3 — пустой лог / нет result-JSON ⇒ провал:** `exit_code==0` больше не считается успехом сам по себе — `_monitor_agent` валидирует результат (`_validate_result`: лог непустой + есть trailing result-JSON по контракту `usage._extract_last_json_object`); `success = exit 0 AND result_ok`. Только при `success` постится «успешный» status-коммент и вызывается `_try_advance_stage`; при `exit 0 & not result_ok` — Telegram-алерт, стадия НЕ двигается, `_finalize_job(result_ok=False)` маршрутизирует job в провал (`empty run log / no result JSON`: по умолчанию permanent → requeue/`failed`+алерт; transient-маркер в логе → transient-путь). Реальный `exit_code` пишется в `agent_runs` без искажения — решение done/fail несёт отдельный флаг `result_ok` (не подменённый код выхода). Итог: `exit 0` всегда завершается терминально/ретраябельно (`done`|`failed`|`queued`) — путь «быстрая смерть с exit 0 → вечный running» закрыт. ⛔ Scope: `--effort` (P2) исключён владельцем и вынесен в ORCH-50 — не трогался. ADR `docs/work-items/ORCH-044/06-adr/ADR-001-preflight-auth-and-empty-result-failure.md`. Тесты: `tests/test_preflight_auth.py`, `tests/test_empty_log_failure.py`. - **Дословный текст findings reviewer/tester встраивается в `task_desc` заворота** (ORCH-046): при откате на `development` строка `task_desc` (попадает в `.task-dev.md` developer-агента) теперь несёт суть претензий, а не только ссылку на файл — устраняет «испорченный телефон», из-за которого агент шёл «читать файл», терял ключевые P0/P1 / причину FAIL и заворачивался снова, выжигая `MAX_DEVELOPER_RETRIES` и токены. Новый defensive-модуль `src/review_parse.py` (контракт «never raise», как `src/frontmatter.py`): `extract_review_findings(path)` — дословные пункты P0/P1 из секции `## Findings` файла `12-review.md`; `extract_test_failures(path)` — релевантный фрагмент тела `13-test-report.md` (приоритет `## Вывод pytest` → FAIL-строки `## Результаты` → `## Итог`). Обе функции усекают результат до `MAX_FINDINGS_CHARS`/`MAX_FAILURES_CHARS` (≈2000) с маркером `…(truncated)`. Две rollback-ветки `src/stage_engine.py` (reviewer REQUEST_CHANGES, tester `check_tests_passed` FAIL) встраивают извлечённый текст и **сохраняют ссылку** на полный файл («Полный контекст»); при пустом/битом артефакте — graceful-фоллбэк на прежнюю ссылку-строку (никаких исключений в `advance_stage`). Tester-ветка дополнительно всегда включает `reason` гейта. Последовательность отката, `_developer_retry_count`, поля `AdvanceResult` и реестр `QG_CHECKS` не менялись. ADR `docs/work-items/ORCH-046/06-adr/ADR-001-embed-findings-in-task-desc.md`. Тесты: `tests/test_review_parse.py`, `tests/test_stage_engine.py::TestRollbackTaskDescEmbedding`. - **Поллинг с ретраем в quality-gate `check_ci_green`** (ORCH-045): гейт CI превращён из single-shot в polling, чтобы устранить race condition — раньше один опрос combined commit-status сразу после пуша developer-а ловил транзиентный `pending` (типично 1-3с, реальный кейс ORCH-017: опрос 17:58:54 → pending, CI дозеленел 17:58:55) и задача застревала насмерть без повторного опроса. Теперь: `success` → пропуск сразу; `failure`/`error` → провал сразу (терминально, ретрай бессмыслен); `pending`/unknown → `time.sleep` и повторный опрос до `ci_poll_max_attempts` раз; истечение попыток → явный `(False, "CI still pending after s")` (тупик больше не молчаливый); 404 → как раньше; транзиентная `httpx.HTTPError` на попытке логируется и ретраится в рамках бюджета. Параметры — новые настройки `ORCH_CI_POLL_MAX_ATTEMPTS` (12) и `ORCH_CI_POLL_INTERVAL_S` (10) в `src/config.py` (~2 мин ожидания pending). Сигнатура `check_ci_green(repo, branch)` и реестр `QG_CHECKS` не менялись; `check_tests_passed` не затронут. ADR `docs/architecture/adr/adr-0004-ci-poll-retry.md`. Тесты: `tests/test_qg.py::TestCheckCIGreen`. - **Прямые ссылки на BRD и Plane-таску в Telegram-уведомлении об апруве** (ORCH-017): пингующее сообщение `notify_approve_requested` теперь встраивает две HTML-``-ссылки — на `docs/work-items//01-brd.md` (Gitea branch-view: `gitea_public_url`→`gitea_url`) и на issue в Plane (`{web_base}/{workspace}/projects/{project_id}/issues/{plane_issue_id}/`). Новая настройка `ORCH_PLANE_WEB_URL` (внешний браузерный web-URL Plane; фолбэк на `plane_api_url`). **Loopback-guard:** если итоговый Plane web-base указывает на localhost/127.0.0.1/0.0.0.0/::1 или пуст — Plane-ссылка опускается (не выпускаем битый localhost-URL). Graceful degradation: каждая ссылка строится независимо и опускается при нехватке данных, сообщение и призыв «Переведите задачу в статус Approved …» сохраняются всегда; ровно одно пингующее сообщение, разделяемая `send_telegram` не тронута. Динамические подписи экранируются `html.escape`, `parse_mode=HTML` сохранён. ADR `docs/work-items/ORCH-017/06-adr/ADR-001-telegram-approve-links.md`. Тесты: `test_notify_approve_links.py`, `test_analysis_approve_flow_links.py`. diff --git a/docs/architecture/internals.md b/docs/architecture/internals.md index 3e01346..b130b51 100644 --- a/docs/architecture/internals.md +++ b/docs/architecture/internals.md @@ -88,7 +88,16 @@ claude.exe --print --system-prompt --allowedTools Read,Write,Edit,Bash 1. Записывает run в DB (agent_runs) 2. Запускает subprocess. **stdout/stderr перенаправляются СРАЗУ в файл `/app/data/runs/{id}.log` на уровне ОС** (Popen `stdout=log_fh`). Никакого PIPE в памяти оркестратора → нет PIPE-deadlock, нет потока-читателя, нет зомби (B-2). 3. Стартует **watchdog thread** (timeout 30 мин → SIGKILL по pid) -4. Стартует **monitor thread**: `proc.wait()` (гарантированный reap → реальный exit_code в БД) → закрывает log_fh → git commit/push → auto-advance +4. Стартует **monitor thread**: `proc.wait()` (гарантированный reap → реальный exit_code в БД) → закрывает log_fh → **валидация результата (ORCH-044)** → git commit/push → auto-advance + +**Валидация результата (ORCH-044, P3).** `exit_code==0` сам по себе НЕ считается успехом: claude может «быстро умереть» (разлогинен / флаг гасит stdout), оставив пустой или JSON-less лог, но выйдя с кодом 0 — раньше это было неотличимо от успеха (`done` + auto-advance по пустому результату). Теперь `_monitor_agent` вызывает `_validate_result(output_path)`: +- лог отсутствует / пустой (0 байт или только whitespace) ⇒ невалиден; +- нет парсящегося trailing result-JSON (тот же контракт, что usage-учёт — `usage._extract_last_json_object`) ⇒ невалиден; +- хелпер защитный (never-raise); при собственной ошибке — fail-safe в сторону провала. + +`success = (exit_code==0 AND result_ok)`. Реальный `exit_code` пишется в `agent_runs` без искажения; на решение done/fail влияет отдельный флаг `result_ok` (не подменённый код выхода). Только при `success`: постится «успешный» status-коммент и вызывается `_try_advance_stage`. При `exit_code==0 AND not result_ok`: шлётся Telegram-алерт о пустом/невалидном результате, стадия НЕ двигается, а `_finalize_job(result_ok=False)` маршрутизирует job в провал (`empty run log / no result JSON`): по умолчанию permanent (`attempts mark_job done - exit !=0 & attempts requeue (queued) - exit !=0 & attempts>=max -> failed + Telegram + _finalize_job(result_ok): + exit 0 & result_ok -> mark_job done + else (exit!=0 ИЛИ пустой результат): + attempts requeue (queued) + attempts>=max -> failed + Telegram ``` +> ORCH-044 (P3): `result_ok` отражает валидность run-лога (непустой + есть result-JSON). `exit 0` с пустым/невалидным результатом идёт в ветку провала, НЕ в `done` (см. §4 «Валидация результата»). + ### Таблица `jobs` | Колонка | Назначение | diff --git a/docs/operations/INFRA.md b/docs/operations/INFRA.md index 90bd8e0..f2dac6c 100644 --- a/docs/operations/INFRA.md +++ b/docs/operations/INFRA.md @@ -54,6 +54,9 @@ | `ORCH_AGENT_EFFORT_DEFAULT` | режим работы `--effort` по умолчанию (ORCH-41): low\|medium\|high\|xhigh\|max; дефолт `high` | | `ORCH_AGENT_EFFORT_` | per-agent effort; дефолт: думающие → high, tester/deployer → medium | | `ORCH_AGENT_FALLBACK_MODEL` | опц. фолбэк-модель при overloaded (`--fallback-model`); пусто → без флага | +| `ORCH_PREFLIGHT_CHECK_AUTH` | вкл/выкл token-free auth-проверку preflight (ORCH-044); дефолт `true`. Аварийный тумблер: `false` → preflight как до ORCH-044 (только `--version`) | +| `ORCH_CLAUDE_CREDENTIALS_PATH` | явный путь к `.credentials.json` (ORCH-044); пусто → `/.claude/.credentials.json`, где `AGENT_HOME=/home/slin` — HOME, под которым launcher реально спавнит claude (не HOME процесса орка) | +| `ORCH_AUTH_EXPIRY_SKEW_SECONDS` | запас на рассинхрон часов при сравнении `claudeAiOauth.expiresAt` (ORCH-044); дефолт `0` | | `DEPLOY_SSH_USER` / `_HOST` / `DEPLOY_HOOK_SCRIPT` | параметры деплой-хука | **Секреты — только в `.env` / `.env.staging` на хосте, в гит НЕ коммитятся.** Канон — `.env.example`, `.env.staging.example`. @@ -81,6 +84,19 @@ > ⚠️ Бюджет (ORCH-38): `claude-opus-4-8` дефолт в коде; реальное переключение прод-env делается отдельно после согласования. +## Preflight auth-гейт (`src/preflight.py`, ORCH-044) +`claude --version` отвечает успешно **даже когда claude разлогинен** (версия — локальная инфа), поэтому до ORCH-044 preflight был слеп к авторизации: разлогиненный инстанс клеймил job и тихо умирал с пустым логом, блокируя общую очередь всех проектов. + +ORCH-044 добавляет **token-free** проверку (без сети, без prompt-ping — BR-1): +1. **Проактивно (основной гейт):** после успешного `--version` читается `/.claude/.credentials.json` (путь — `ORCH_CLAUDE_CREDENTIALS_PATH` или дефолт от `AGENT_HOME=/home/slin`, **не** HOME процесса орка). Нет файла / битый JSON / нет `claudeAiOauth.accessToken` ⇒ `check()=(False, …)`. `claudeAiOauth.expiresAt` (epoch ms) `<= now + ORCH_AUTH_EXPIRY_SKEW_SECONDS` ⇒ протух ⇒ FAIL. Нет `expiresAt` ⇒ OK (не плодим ложные срабатывания). Результат кешируется тем же `ORCH_PREFLIGHT_CACHE_TTL`, что и `--version`. +2. **Постфактум (защитная сетка):** если агент всё же стартовал при протухшей сессии, launcher детектит маркер (`not logged in` / `please run /login` / `unauthorized` / `401`) в run-логе и сбрасывает preflight-кеш, чтобы следующий тик переоценил auth. Auth-провал **не** считается transient и **не** крутит circuit breaker — гейт здесь preflight. + +При `auth=fail` job **не клеймится** (`_drain_once` уже корректен при `ok=False`), reason виден в `/queue` (`preflight_reason`). Аварийный тумблер `ORCH_PREFLIGHT_CHECK_AUTH=false` возвращает version-only поведение. + +> ⚠️ Риск ложноположительного auth-fail (R-1): неверный путь к credentials заблокирует клейм **всех** проектов (общая очередь). Митигация: единый источник `AGENT_HOME`, тумблер, обязательная проверка на staging (8501) перед прод-деплоем. ADR — `docs/work-items/ORCH-044/06-adr/ADR-001-preflight-auth-and-empty-result-failure.md`. + +> ℹ️ `--effort` (P2) в ORCH-044 **не трогается** — вынесен в ORCH-50. + ## ⚠️ Self-hosting — оркестратор дорабатывает САМ СЕБЯ **Факт:** прод-инстанс `orchestrator` (8500) — ОДИН на ВСЕ прод-проекты (enduro-trails + orchestrator), с ОБЩЕЙ БД `./data/orchestrator.db` и общей очередью задач (ORCH-1). diff --git a/src/agents/launcher.py b/src/agents/launcher.py index 43d8019..816601a 100644 --- a/src/agents/launcher.py +++ b/src/agents/launcher.py @@ -185,6 +185,10 @@ class AgentLauncher: } CLAUDE_BIN = "/opt/claude-code/bin/claude.exe" + # ORCH-044 (P1): HOME the claude subprocess actually runs under. preflight + # resolves the OAuth credentials path from this (NOT the orchestrator process + # HOME), so keep this single source of truth in sync with the spawn env below. + AGENT_HOME = "/home/slin" # ORCH-7 (M-2): timeout is now configurable. AGENT_TIMEOUT stays as a # backward-compatible alias for the default; the actual value (and per-agent # overrides) live in settings and are resolved via _resolve_timeout(). @@ -323,7 +327,7 @@ class AgentLauncher: stderr=subprocess.STDOUT, env={ **os.environ, - "HOME": "/home/slin", + "HOME": self.AGENT_HOME, "GIT_AUTHOR_NAME": "claude-bot", "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local", "GIT_COMMITTER_NAME": "claude-bot", @@ -492,6 +496,21 @@ class AgentLauncher: notify_agent_finished(run_id, agent, exit_code, task_id=_task_id, duration_s=_duration_s) + # ORCH-044 (P3): a clean exit_code==0 is NOT enough — claude can die fast + # (logged out, killed flag) leaving an empty / JSON-less log while still + # exiting 0. Validate the result; only (exit 0 AND result_ok) is success. + # The real exit_code is still recorded above without distortion; this flag + # drives the done/fail decision (ADR-001 §P3 / A4). + result_ok, result_reason = (True, "ok") + if exit_code == 0: + result_ok, result_reason = self._validate_result(output_path) + if not result_ok: + logger.warning( + f"Agent run_id={run_id} ({agent}) exited 0 but result invalid: " + f"{result_reason}" + ) + success = (exit_code == 0 and result_ok) + # Feature 4: parse token usage / cost from the (json) run log and record # it on the agent_runs row. Never fatal — a garbled/missing JSON records # NULLs and logs a warning so a broken run can't crash the monitor. @@ -510,7 +529,7 @@ class AgentLauncher: try: git_env = { **os.environ, - "HOME": "/home/slin", + "HOME": self.AGENT_HOME, "GIT_AUTHOR_NAME": "claude-bot", "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local", "GIT_COMMITTER_NAME": "claude-bot", @@ -593,11 +612,34 @@ class AgentLauncher: from ..notifications import send_telegram send_telegram(f"\u26a0\ufe0f {_wid}: Agent {agent} failed (exit_code={exit_code}). Check logs: /app/data/runs/{run_id}.log") + # ORCH-044 (P3): exit 0 with an empty/invalid result is a failure, not a + # success — alert (like other failures) and DO NOT post a success comment + # or advance the stage. The job-queue finalize below routes it to + # failed/retry. (AC-10/11/12.) + if exit_code == 0 and not success: + try: + conn = get_db() + task_row = conn.execute( + "SELECT work_item_id FROM tasks WHERE repo=? AND branch=?", + (repo, branch), + ).fetchone() + conn.close() + _wid = task_row[0] if task_row else None + from ..notifications import send_telegram + send_telegram( + f"⚠️ {_wid or repo}: Agent {agent} exited 0 but produced " + f"an empty/invalid result ({result_reason}). " + f"Logs: /app/data/runs/{run_id}.log" + ) + except Exception as e: + logger.warning(f"run_id={run_id}: empty-result alert failed: {e}") + # Feature 4 + ORCH-016: post the unified per-agent status comment under # that agent's bot, threading the wall-clock duration we just measured # straight through (ADR-001 §6: explicit param wins over DB fallback). # The deployer finishing the task also posts the per-task usage summary. - if exit_code == 0: + # ORCH-044 (P3): only on real success (exit 0 AND valid result). + if success: try: self._post_usage_comments( run_id, agent, repo, branch, _usage, duration_s=_duration_s @@ -605,14 +647,81 @@ class AgentLauncher: except Exception as e: logger.warning(f"run_id={run_id}: usage comment failed: {e}") - # Auto-advance stage if agent finished successfully and QG passes - if exit_code == 0: + # Auto-advance stage if agent finished successfully and QG passes. + # ORCH-044 (P3): suppressed when the result was empty/invalid. + if success: self._try_advance_stage(run_id, agent, repo, branch) # ORCH-1: drive the job-queue status for queue-launched jobs only. # (Legacy direct launch() has job_id=None and is unaffected.) + # ORCH-044 (P3): result_ok lets _finalize_job treat an empty-result exit 0 + # as a failure rather than 'done'. if job_id is not None: - self._finalize_job(job_id, agent, run_id, exit_code, output_path=output_path) + self._finalize_job( + job_id, agent, run_id, exit_code, + output_path=output_path, result_ok=result_ok, + ) + + @staticmethod + def _validate_result(output_path) -> tuple[bool, str]: + """ORCH-044 (P3): is the run log a real result, or an empty/JSON-less death? + + Returns (ok, reason). A run counts as a valid result only when the log + exists, is non-empty (not just whitespace), AND carries a parseable + trailing result-JSON object — the same contract usage accounting uses + (usage._extract_last_json_object). claude --output-format json always + emits exactly such an object on a real run, so its absence means the agent + died before producing anything. + + Never raises: any error is treated as an invalid result (fail-safe toward + failing the job rather than silently passing — TR-3.5). + """ + try: + if not output_path: + return False, "no output path" + if not os.path.exists(output_path): + return False, "run log missing" + if os.path.getsize(output_path) == 0: + return False, "empty run log (0 bytes)" + with open(output_path, "r", encoding="utf-8", errors="replace") as f: + text = f.read() + if not text.strip(): + return False, "empty run log (whitespace only)" + from ..usage import _extract_last_json_object + if _extract_last_json_object(text) is None: + return False, "no result JSON in run log" + return True, "result ok" + except Exception as e: # pragma: no cover - defensive fail-safe + return False, f"result validation error: {e}" + + def _handle_auth_marker(self, log_path) -> bool: + """ORCH-044 (P1b): post-factum auth-failure detection (defensive net). + + If an agent died because the session was logged out / expired between + preflight and spawn, reset the preflight cache so the NEXT worker tick + re-evaluates auth proactively (fast re-login pickup, or continued gating + if still broken). Auth failure is deliberately NOT treated as transient + and does NOT crank the circuit breaker — preflight is the right gate here. + Returns True if an auth marker was found. Never raises. + """ + try: + from .. import preflight + with open(log_path, "rb") as f: + try: + f.seek(-16384, 2) + except OSError: + f.seek(0) + text = f.read().decode("utf-8", errors="replace") + if preflight.is_auth_failure_text(text): + logger.warning( + f"Auth-failure marker in {log_path}; resetting preflight cache " + f"so the next tick re-checks auth" + ) + preflight.reset_cache() + return True + except Exception: + pass + return False def _backoff_seconds(self, transient_attempts: int, retry_after: int = None) -> int: """Exponential backoff for transient failures, honouring Retry-After. @@ -627,17 +736,21 @@ class AgentLauncher: backoff = max(backoff, min(retry_after, cap)) return int(backoff) - def _finalize_job(self, job_id: int, agent: str, run_id: int, exit_code, output_path=None): + def _finalize_job(self, job_id: int, agent: str, run_id: int, exit_code, + output_path=None, result_ok: bool = True): """ORCH-1: update the jobs row after the agent process finished. - exit_code == 0 -> done (and resets the breaker streak via on_outcome). - exit_code != 0 -> classify the failure from the run log tail (token-free): + success = (exit_code == 0 AND result_ok) -> done (resets the breaker + streak via on_outcome). ORCH-044 (P3): result_ok==False means + exit 0 but the run log was empty / had no result-JSON, so it is + routed through the failure path below, NOT marked done. + otherwise -> classify the failure from the run log tail (token-free): - TRANSIENT (429/overload/network): backoff-requeue with available_at in the future + a SEPARATE transient_attempts budget (settings.transient_max_attempts), honouring Retry-After. Reported to the breaker so it opens after N consecutive transient failures. - - PERMANENT (code fault): ordinary attempts < max_attempts requeue, - otherwise 'failed' + Telegram. + - PERMANENT (code fault, incl. the empty-result case): ordinary + attempts < max_attempts requeue, otherwise 'failed' + Telegram. """ from ..db import get_job, mark_job from ..error_classifier import classify_log_file @@ -645,34 +758,55 @@ class AgentLauncher: job = get_job(job_id) if not job: return - if exit_code == 0: + if exit_code == 0 and result_ok: mark_job(job_id, "done", run_id=run_id) logger.info(f"Job {job_id} ({agent}) done (run_id={run_id})") self._record_outcome(transient=False, recovered=True) return + log_path = output_path or f"/app/data/runs/{run_id}.log" + + # ORCH-044 (P1b): if the failure was an auth death, invalidate the + # preflight cache so the next tick re-gates on auth proactively. + self._handle_auth_marker(log_path) + + # ORCH-044 (P3): informative error for the empty/invalid-result case + # (exit 0 but no usable result). Defaults to permanent (it is not a + # 429/overload) unless the log carries a transient marker (TR-3.3). + empty_result = (exit_code == 0 and not result_ok) + override_err = ( + f"empty run log / no result JSON (run_id={run_id})" + if empty_result else None + ) + # Classify the failure from the agent log tail (no token cost). kind, retry_after = "permanent", None - log_path = output_path or f"/app/data/runs/{run_id}.log" try: kind, retry_after = classify_log_file(log_path) except Exception: pass if kind == "transient": - self._finalize_transient(job_id, agent, run_id, exit_code, job, retry_after) + self._finalize_transient(job_id, agent, run_id, exit_code, job, + retry_after, error=override_err) else: - self._finalize_permanent(job_id, agent, run_id, exit_code, job) + self._finalize_permanent(job_id, agent, run_id, exit_code, job, + error=override_err) except Exception as e: logger.error(f"Job {job_id}: _finalize_job error: {e}") - def _finalize_transient(self, job_id, agent, run_id, exit_code, job, retry_after): - """Transient (429/overload/net) failure -> backoff requeue or fail when budget out.""" + def _finalize_transient(self, job_id, agent, run_id, exit_code, job, retry_after, + error: str | None = None): + """Transient (429/overload/net) failure -> backoff requeue or fail when budget out. + + ORCH-044 (P3): `error`, when provided, overrides the default transient + message (used for the empty-result case so the reason is informative). + """ from ..db import mark_job, mark_job_transient tattempts = job.get("transient_attempts", 0) tmax = settings.transient_max_attempts - err = (f"transient (429/overload) agent {agent} exit={exit_code} " - f"(run_id={run_id}); retry_after={retry_after}") + err = error or (f"transient (429/overload) agent {agent} exit={exit_code} " + f"(run_id={run_id}); retry_after={retry_after}") self._record_outcome(transient=True, recovered=False) if tattempts < tmax: backoff = self._backoff_seconds(tattempts + 1, retry_after) @@ -689,12 +823,17 @@ class AgentLauncher: self._notify_failed(job_id, agent, job, run_id, f"transient (rate-limit) after {tattempts} attempts") - def _finalize_permanent(self, job_id, agent, run_id, exit_code, job): - """Permanent (code-fault) failure -> normal attempts normal attempts consecutive transient failures that OPEN the breaker. # breaker_pause_seconds -> how long the breaker stays open before half-open. preflight_cache_ttl: int = 45 + # ORCH-044 (P1): token-free preflight auth gate. After `claude --version` + # succeeds, preflight also checks that claude is logged in by reading the + # local OAuth credentials file (no network / no prompt-ping — BR-1). + # preflight_check_auth -> master toggle (env ORCH_PREFLIGHT_CHECK_AUTH). + # Emergency off-switch if the check ever + # false-positives and wedges the shared queue. + # claude_credentials_path -> explicit path to .credentials.json + # (env ORCH_CLAUDE_CREDENTIALS_PATH). Empty -> + # /.claude/.credentials.json, where + # AGENT_HOME is the HOME the launcher really + # spawns claude under (/home/slin), NOT the + # orchestrator process env. + # auth_expiry_skew_seconds -> clock-drift slack when comparing + # claudeAiOauth.expiresAt (env + # ORCH_AUTH_EXPIRY_SKEW_SECONDS); a token within + # this many seconds of now is treated as expired. + preflight_check_auth: bool = True + claude_credentials_path: str = "" + auth_expiry_skew_seconds: int = 0 backoff_base_seconds: int = 10 backoff_max_seconds: int = 600 transient_max_attempts: int = 5 diff --git a/src/preflight.py b/src/preflight.py index 316fdce..f9d0f02 100644 --- a/src/preflight.py +++ b/src/preflight.py @@ -5,14 +5,25 @@ are reachable WITHOUT spending any tokens. We only do local/cheap checks: 1. os.path.exists(CLAUDE_BIN) -- instant 2. `claude --version` (timeout 5s) -- spawns CLI, does NOT call the API + 3. auth check (ORCH-044, P1) -- read the local OAuth credentials file The result is cached for `preflight_cache_ttl` seconds so we do not re-run -`claude --version` on every worker tick. +`claude --version` (or re-read the credentials file) on every worker tick. 🚫 We deliberately do NOT do a prompt ping (ping->pong) — that would burn the rate limit and add latency. Preflight is local-only. + +ORCH-044 (P1): `claude --version` answers successfully even when claude is NOT +logged in (the version is local information), so version-only preflight was blind +to auth. We add a token-free auth gate: read /.claude/.credentials.json +and validate the OAuth token (presence + expiry). Combined with a post-factum +`Not logged in` marker detection (is_auth_failure_text), this stops a logged-out +instance from claiming jobs and silently dying with an empty run log. No network +call is ever made here. """ import os +import re +import json import time import logging import subprocess @@ -23,6 +34,15 @@ logger = logging.getLogger("orchestrator.preflight") _VERSION_TIMEOUT = 5 +# ORCH-044 (P1b): post-factum auth-failure markers. If an agent started under a +# session that died/expired between preflight and spawn, these substrings in the +# run log identify the auth failure so the launcher can invalidate the preflight +# cache (forcing the next tick to re-evaluate auth proactively). +_AUTH_FAIL_RE = re.compile( + r"not logged in|please run\s*/login|invalid api key|unauthorized|\b401\b", + re.IGNORECASE, +) + class _PreflightCache: def __init__(self): @@ -74,11 +94,120 @@ def _run_version(bin_path: str) -> tuple[bool, str]: return False, f"--version error: {e}" +def _agent_home() -> str: + """Resolve the HOME the launcher actually spawns claude under (ORCH-044, TR-1.3). + + The auth credentials live under the *agent's* HOME (/home/slin), which the + launcher injects into the claude subprocess env — NOT the orchestrator + process HOME. We mirror _claude_bin()'s "follow the genuinely executed path" + approach by reading AgentLauncher.AGENT_HOME. Falls back to the known default + if the launcher cannot be imported (e.g. isolated unit test). + """ + try: + from .agents.launcher import AgentLauncher + home = getattr(AgentLauncher, "AGENT_HOME", None) + if home: + return home + except Exception: + pass + return "/home/slin" + + +def _credentials_path() -> str: + """Path to claude's OAuth credentials file (ORCH-044, P1). + + settings.claude_credentials_path wins when set; otherwise + /.claude/.credentials.json. + """ + explicit = (getattr(settings, "claude_credentials_path", "") or "").strip() + if explicit: + return explicit + return os.path.join(_agent_home(), ".claude", ".credentials.json") + + +def _iso(epoch_ms) -> str: + """Best-effort epoch-ms -> ISO-8601 UTC string (for human-readable reasons).""" + try: + from datetime import datetime, timezone + return datetime.fromtimestamp(int(epoch_ms) / 1000, tz=timezone.utc).isoformat() + except Exception: + return str(epoch_ms) + + +def is_auth_failure_text(text: str) -> bool: + """ORCH-044 (P1b): True if `text` contains a claude auth-failure marker. + + Used post-factum on a run log so the launcher can tell an auth death apart + from a generic failure and reset the preflight cache. Never raises. + """ + if not text: + return False + try: + return bool(_AUTH_FAIL_RE.search(text)) + except Exception: + return False + + +def _check_auth() -> tuple[bool, str]: + """ORCH-044 (P1a): token-free local auth gate. Never raises. + + Steps (ADR-001 §P1): + 1. credentials file missing / unreadable / invalid JSON -> not ok. + 2. no claudeAiOauth block / accessToken -> not ok. + 3. claudeAiOauth.expiresAt (epoch ms) <= now + skew -> expired -> not ok. + 4. accessToken present but expiresAt absent/unparsable -> OK (cannot prove + expiry; we do not manufacture false positives that would wedge the shared + queue — see ADR Risks R-1). + + Fail-safe: any unexpected error returns (False, ...) so a logged-out / broken + state never claims a job (BR-2 / TR-3.5). This reads only a local file — no + network call, no token spend (BR-1 / AC-5). + """ + try: + path = _credentials_path() + if not os.path.exists(path): + return False, f"claude not logged in: credentials missing ({path})" + try: + with open(path, "r", encoding="utf-8") as f: + data = json.load(f) + except (OSError, ValueError) as e: + return False, f"claude not logged in: credentials unreadable ({e})" + + oauth = data.get("claudeAiOauth") if isinstance(data, dict) else None + if not isinstance(oauth, dict) or not oauth.get("accessToken"): + return False, "claude not logged in: no oauth token" + + expires = oauth.get("expiresAt") + if expires is None: + return True, "auth ok (no expiry recorded)" + try: + expires_ms = int(expires) + except (TypeError, ValueError): + return True, "auth ok (unparsable expiry)" + + skew_ms = int(getattr(settings, "auth_expiry_skew_seconds", 0) or 0) * 1000 + now_ms = int(time.time() * 1000) + if expires_ms <= now_ms + skew_ms: + return False, f"OAuth token expired at {_iso(expires_ms)}" + return True, "auth ok" + except Exception as e: # pragma: no cover - defensive fail-safe + return False, f"auth check error: {e}" + + def _compute() -> tuple[bool, str]: bin_path = _claude_bin() if not os.path.exists(bin_path): return False, f"CLAUDE_BIN not found: {bin_path}" - return _run_version(bin_path) + ok, reason = _run_version(bin_path) + if not ok: + return ok, reason + # ORCH-044 (P1): version is local info and answers even when logged out, so + # gate on a token-free auth check too. Toggleable for emergencies. + if getattr(settings, "preflight_check_auth", True): + auth_ok, auth_reason = _check_auth() + if not auth_ok: + return False, auth_reason + return True, reason def check(force: bool = False) -> tuple[bool, str]: diff --git a/tests/test_empty_log_failure.py b/tests/test_empty_log_failure.py new file mode 100644 index 0000000..4d014e2 --- /dev/null +++ b/tests/test_empty_log_failure.py @@ -0,0 +1,298 @@ +"""ORCH-044 (P3): empty run log / no result-JSON at exit 0 == failure. + +claude can exit 0 yet leave an empty (or JSON-less) run log — e.g. it died fast +because the session was logged out, or a flag silenced stdout. Before ORCH-044 +that looked identical to success: job -> done, stage auto-advanced. Now the +launcher validates the result; only (exit 0 AND valid result-JSON) is a success. + +No real claude/Popen is spawned. The git/usage/notify side effects of +_monitor_agent are stubbed; DB is a fresh per-test sqlite. +""" +import os +import tempfile + +import pytest + +_test_db = os.path.join(tempfile.gettempdir(), "test_orchestrator_empty_log.db") +os.environ["ORCH_DB_PATH"] = _test_db +os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir() +os.environ["ORCH_GITEA_TOKEN"] = "test-token" +os.environ["ORCH_PLANE_API_TOKEN"] = "test-token" + +import src.db as db +from src.db import init_db, enqueue_job, claim_next_job, get_job +from src import preflight +from src.agents.launcher import AgentLauncher + + +VALID_RESULT_LOG = ( + "some preamble text from the agent run...\n" + '{"type":"result","subtype":"success","usage":' + '{"input_tokens":120,"output_tokens":45},"total_cost_usd":0.12}\n' +) + + +@pytest.fixture(autouse=True) +def fresh_db(tmp_path, monkeypatch): + monkeypatch.setattr(db.settings, "db_path", str(tmp_path / "res.db")) + init_db() + preflight.reset_cache() + yield + + +# =========================================================================== +# _validate_result — the result-JSON contract (TR-3.1) +# =========================================================================== +class TestValidateResult: + def test_missing_path(self): + ok, reason = AgentLauncher._validate_result(None) + assert ok is False + + def test_missing_file(self, tmp_path): + ok, reason = AgentLauncher._validate_result(str(tmp_path / "nope.log")) + assert ok is False + assert "missing" in reason.lower() + + def test_empty_file(self, tmp_path): + p = tmp_path / "empty.log" + p.write_text("") + ok, reason = AgentLauncher._validate_result(str(p)) + assert ok is False + assert "empty" in reason.lower() + + def test_whitespace_only(self, tmp_path): + p = tmp_path / "ws.log" + p.write_text(" \n\t\n") + ok, _ = AgentLauncher._validate_result(str(p)) + assert ok is False + + def test_no_json(self, tmp_path): + p = tmp_path / "garbage.log" + p.write_text("this is not json at all, just noise\n") + ok, reason = AgentLauncher._validate_result(str(p)) + assert ok is False + assert "json" in reason.lower() + + def test_valid_result_json(self, tmp_path): + p = tmp_path / "good.log" + p.write_text(VALID_RESULT_LOG) + ok, _ = AgentLauncher._validate_result(str(p)) + assert ok is True + + +# =========================================================================== +# _finalize_job — job state under result_ok (TC-12/13/15/16/17) +# =========================================================================== +class TestFinalizeJobResultOk: + def _spy_telegram(self, monkeypatch): + sent = [] + monkeypatch.setattr("src.notifications.send_telegram", + lambda *a, **k: sent.append(a[0] if a else "")) + return sent + + # TC-15 / AC-13: valid result -> done (no regression). + def test_valid_result_done(self, tmp_path, monkeypatch): + self._spy_telegram(monkeypatch) + log = tmp_path / "1.log" + log.write_text(VALID_RESULT_LOG) + jid = enqueue_job("developer", "r") + claim_next_job() + AgentLauncher()._finalize_job(jid, "developer", run_id=1, exit_code=0, + output_path=str(log), result_ok=True) + assert get_job(jid)["status"] == "done" + + # TC-12 / AC-10: exit 0 + empty log -> NOT done; terminal failed + alert. + def test_empty_log_exit0_terminal_failed_alerts(self, tmp_path, monkeypatch): + sent = self._spy_telegram(monkeypatch) + log = tmp_path / "2.log" + log.write_text("") # 0 bytes + # max_attempts=1 -> after the claim (attempts=1) the budget is spent -> + # the permanent path goes straight to 'failed' and alerts. + jid = enqueue_job("developer", "r", max_attempts=1) + claim_next_job() + AgentLauncher()._finalize_job(jid, "developer", run_id=2, exit_code=0, + output_path=str(log), result_ok=False) + job = get_job(jid) + assert job["status"] == "failed" + assert job["status"] != "done" + assert "empty run log" in (job["error"] or "") + assert sent, "a Telegram alert must be sent on terminal failure" + + # TC-13 / AC-11: exit 0 + JSON-less log -> failure (here: requeue). + def test_garbage_log_exit0_not_done(self, tmp_path, monkeypatch): + self._spy_telegram(monkeypatch) + log = tmp_path / "3.log" + log.write_text("noise, no json here\n") + jid = enqueue_job("developer", "r", max_attempts=2) + claim_next_job() + AgentLauncher()._finalize_job(jid, "developer", run_id=3, exit_code=0, + output_path=str(log), result_ok=False) + job = get_job(jid) + assert job["status"] != "done" + assert job["status"] == "queued" # retry budget remained + assert "no result JSON" in (job["error"] or "") + + # TC-16 / AC-14: exit 0 + empty log never leaves the job 'running'. + def test_never_running_after_empty_result(self, tmp_path, monkeypatch): + self._spy_telegram(monkeypatch) + log = tmp_path / "4.log" + log.write_text("") + jid = enqueue_job("developer", "r", max_attempts=2) + claim_next_job() + assert get_job(jid)["status"] == "running" # claimed + AgentLauncher()._finalize_job(jid, "developer", run_id=4, exit_code=0, + output_path=str(log), result_ok=False) + assert get_job(jid)["status"] in ("failed", "queued") + + # TC-17 / TR-3.3: empty result defaults to permanent (no backoff, no + # transient budget burn). + def test_empty_result_defaults_permanent(self, tmp_path, monkeypatch): + self._spy_telegram(monkeypatch) + log = tmp_path / "5.log" + log.write_text("") # no transient marker + jid = enqueue_job("developer", "r", max_attempts=2) + claim_next_job() + AgentLauncher()._finalize_job(jid, "developer", run_id=5, exit_code=0, + output_path=str(log), result_ok=False) + job = get_job(jid) + assert job["status"] == "queued" + assert job["transient_attempts"] == 0 # NOT transient + assert job["available_at"] is None # no backoff gate + + # TC-17 / TR-3.3: a transient marker in the log routes to the transient path. + def test_empty_result_with_transient_marker_goes_transient(self, tmp_path, monkeypatch): + self._spy_telegram(monkeypatch) + log = tmp_path / "6.log" + log.write_text("overloaded_error: 429 rate limit. Retry-After: 12\n") + jid = enqueue_job("developer", "r", max_attempts=2) + claim_next_job() + AgentLauncher()._finalize_job(jid, "developer", run_id=6, exit_code=0, + output_path=str(log), result_ok=False) + job = get_job(jid) + assert job["status"] == "queued" + assert job["transient_attempts"] == 1 # transient path taken + assert job["available_at"] is not None # backoff gate set + + +# =========================================================================== +# _monitor_agent — success gating (TC-14/15) + auth-marker reset (P1b) +# =========================================================================== +class _FakeProc: + def __init__(self, exit_code): + self._ec = exit_code + self.pid = 4242 + + def wait(self): + return self._ec + + +def _seed_task_and_run(repo, branch, agent="developer", work_item_id="ORCH-001"): + conn = db.get_db() + conn.execute( + "INSERT INTO tasks (work_item_id, repo, branch, stage) VALUES (?,?,?,?)", + (work_item_id, repo, branch, "development"), + ) + cur = conn.execute( + "INSERT INTO agent_runs (task_id, agent) VALUES ((SELECT id FROM tasks " + "WHERE repo=? AND branch=?), ?)", + (repo, branch, agent), + ) + run_id = cur.lastrowid + conn.commit() + conn.close() + return run_id + + +class TestMonitorAgentGating: + def _patch_monitor_env(self, monkeypatch, tmp_path): + """Stub the heavy side effects of _monitor_agent (git/usage/notify).""" + monkeypatch.setattr("src.agents.launcher.notify_agent_finished", + lambda *a, **k: None) + monkeypatch.setattr("src.agents.launcher.get_worktree_path", + lambda repo, branch: str(tmp_path)) + + class _R: + returncode = 0 + stdout = "" # "no changes to commit" -> skips git add/commit/push + stderr = "" + + monkeypatch.setattr("src.agents.launcher.subprocess.run", + lambda *a, **k: _R()) + + def test_success_advances_and_comments(self, tmp_path, monkeypatch): + self._patch_monitor_env(monkeypatch, tmp_path) + run_id = _seed_task_and_run("r", "feature/x") + log = tmp_path / f"{run_id}.log" + log.write_text(VALID_RESULT_LOG) + + spy = {"post": 0, "advance": 0, "finalize": None, "alert": 0} + monkeypatch.setattr("src.notifications.send_telegram", + lambda *a, **k: spy.__setitem__("alert", spy["alert"] + 1)) + + lr = AgentLauncher() + monkeypatch.setattr(lr, "_post_usage_comments", + lambda *a, **k: spy.__setitem__("post", spy["post"] + 1)) + monkeypatch.setattr(lr, "_try_advance_stage", + lambda *a, **k: spy.__setitem__("advance", spy["advance"] + 1)) + monkeypatch.setattr(lr, "_finalize_job", + lambda *a, **k: spy.__setitem__("finalize", k.get("result_ok"))) + + lr._monitor_agent(_FakeProc(0), run_id, "developer", "r", "feature/x", + output_path=str(log), log_fh=None, job_id=99) + + assert spy["post"] == 1 + assert spy["advance"] == 1 + assert spy["finalize"] is True + assert spy["alert"] == 0 # no empty-result alert on a valid run + + # TC-14 / AC-12: empty result -> no advance, no success comment, alert sent. + def test_empty_result_suppresses_advance_and_comment(self, tmp_path, monkeypatch): + self._patch_monitor_env(monkeypatch, tmp_path) + run_id = _seed_task_and_run("r", "feature/y") + log = tmp_path / f"{run_id}.log" + log.write_text("") # empty -> invalid result + + spy = {"post": 0, "advance": 0, "finalize": None, "alert": 0} + monkeypatch.setattr("src.notifications.send_telegram", + lambda *a, **k: spy.__setitem__("alert", spy["alert"] + 1)) + + lr = AgentLauncher() + monkeypatch.setattr(lr, "_post_usage_comments", + lambda *a, **k: spy.__setitem__("post", spy["post"] + 1)) + monkeypatch.setattr(lr, "_try_advance_stage", + lambda *a, **k: spy.__setitem__("advance", spy["advance"] + 1)) + monkeypatch.setattr(lr, "_finalize_job", + lambda *a, **k: spy.__setitem__("finalize", k.get("result_ok"))) + + lr._monitor_agent(_FakeProc(0), run_id, "developer", "r", "feature/y", + output_path=str(log), log_fh=None, job_id=99) + + assert spy["post"] == 0 # no success comment + assert spy["advance"] == 0 # stage NOT advanced + assert spy["finalize"] is False # finalize told the result was invalid + assert spy["alert"] == 1 # empty-result alert fired + + +# =========================================================================== +# _handle_auth_marker — post-factum auth detection resets preflight cache (P1b) +# =========================================================================== +class TestAuthMarkerHandling: + def test_auth_marker_resets_preflight_cache(self, tmp_path, monkeypatch): + log = tmp_path / "auth.log" + log.write_text("Error: Not logged in. Please run /login\n") + reset = {"n": 0} + monkeypatch.setattr(preflight, "reset_cache", + lambda: reset.__setitem__("n", reset["n"] + 1)) + found = AgentLauncher()._handle_auth_marker(str(log)) + assert found is True + assert reset["n"] == 1 + + def test_no_auth_marker_no_reset(self, tmp_path, monkeypatch): + log = tmp_path / "plain.log" + log.write_text("Traceback: ValueError somewhere\n") + reset = {"n": 0} + monkeypatch.setattr(preflight, "reset_cache", + lambda: reset.__setitem__("n", reset["n"] + 1)) + found = AgentLauncher()._handle_auth_marker(str(log)) + assert found is False + assert reset["n"] == 0 diff --git a/tests/test_preflight_auth.py b/tests/test_preflight_auth.py new file mode 100644 index 0000000..403a557 --- /dev/null +++ b/tests/test_preflight_auth.py @@ -0,0 +1,246 @@ +"""ORCH-044 (P1): token-free preflight auth gate. + +`claude --version` answers even when claude is logged OUT, so version-only +preflight was blind to auth. These tests cover the new local credentials check: +missing / expired / valid token, broken JSON fail-safe, no network, caching, +HOME-correct path resolution, and the queue-worker claim gate. + +No real claude/Popen is spawned: `_run_version` is stubbed and credentials live +in tmp files. DB is a fresh per-test sqlite (mirrors tests/test_resilience.py). +""" +import os +import json +import socket +import tempfile + +import pytest + +_test_db = os.path.join(tempfile.gettempdir(), "test_orchestrator_preflight_auth.db") +os.environ["ORCH_DB_PATH"] = _test_db +os.environ["ORCH_REPOS_DIR"] = tempfile.gettempdir() +os.environ["ORCH_GITEA_TOKEN"] = "test-token" +os.environ["ORCH_PLANE_API_TOKEN"] = "test-token" + +import src.db as db +from src.db import init_db, enqueue_job, get_job, count_running_jobs +from src import preflight +from src.queue_worker import QueueWorker +from src.agents.launcher import AgentLauncher + + +@pytest.fixture(autouse=True) +def fresh_db(tmp_path, monkeypatch): + monkeypatch.setattr(db.settings, "db_path", str(tmp_path / "res.db")) + init_db() + preflight.reset_cache() + # auth check on by default; large TTL unless a test overrides it. + monkeypatch.setattr(preflight.settings, "preflight_check_auth", True) + yield + + +def _fake_bin(monkeypatch, tmp_path): + """A bin path that exists + a --version that always succeeds (auth-agnostic).""" + b = tmp_path / "claude" + b.write_text("#!/bin/sh\necho v1\n") + monkeypatch.setattr(preflight, "_claude_bin", lambda: str(b)) + monkeypatch.setattr(preflight, "_run_version", lambda b: (True, "1.2.3")) + + +def _write_creds(tmp_path, *, expires_ms=None, access_token="tok", oauth=True, + raw=None): + path = tmp_path / ".credentials.json" + if raw is not None: + path.write_text(raw) + return path + body = {} + if oauth: + oa = {"accessToken": access_token} + if expires_ms is not None: + oa["expiresAt"] = expires_ms + body["claudeAiOauth"] = oa + path.write_text(json.dumps(body)) + return path + + +# --------------------------------------------------------------------------- +# TC-01 / AC-1: not logged in (no credentials file) -> FAIL +# --------------------------------------------------------------------------- +def test_missing_credentials_fails(monkeypatch, tmp_path): + _fake_bin(monkeypatch, tmp_path) + monkeypatch.setattr(preflight, "_credentials_path", + lambda: str(tmp_path / "nope.json")) + ok, reason = preflight.check(force=True) + assert ok is False + assert "logged in" in reason.lower() or "credentials" in reason.lower() + + +# --------------------------------------------------------------------------- +# TC-02 / AC-2: expired OAuth token -> FAIL +# --------------------------------------------------------------------------- +def test_expired_token_fails(monkeypatch, tmp_path): + _fake_bin(monkeypatch, tmp_path) + past = (int(__import__("time").time()) - 3600) * 1000 # 1h ago, epoch ms + creds = _write_creds(tmp_path, expires_ms=past) + monkeypatch.setattr(preflight, "_credentials_path", lambda: str(creds)) + ok, reason = preflight.check(force=True) + assert ok is False + assert "expired" in reason.lower() + + +# --------------------------------------------------------------------------- +# TC-03 / AC-3: valid login -> OK (no regression) +# --------------------------------------------------------------------------- +def test_valid_login_ok(monkeypatch, tmp_path): + _fake_bin(monkeypatch, tmp_path) + future = (int(__import__("time").time()) + 3600) * 1000 # 1h ahead + creds = _write_creds(tmp_path, expires_ms=future) + monkeypatch.setattr(preflight, "_credentials_path", lambda: str(creds)) + ok, reason = preflight.check(force=True) + assert ok is True + + +def test_token_without_expiry_is_ok(monkeypatch, tmp_path): + # accessToken present but no expiresAt -> cannot prove expiry -> OK (ADR §P1.5). + _fake_bin(monkeypatch, tmp_path) + creds = _write_creds(tmp_path, expires_ms=None) + monkeypatch.setattr(preflight, "_credentials_path", lambda: str(creds)) + ok, _ = preflight.check(force=True) + assert ok is True + + +# --------------------------------------------------------------------------- +# TC-04 / AC-1: broken / unreadable credentials JSON -> FAIL (no exception) +# --------------------------------------------------------------------------- +def test_broken_json_fails_without_raising(monkeypatch, tmp_path): + _fake_bin(monkeypatch, tmp_path) + creds = _write_creds(tmp_path, raw="{ this is not valid json ") + monkeypatch.setattr(preflight, "_credentials_path", lambda: str(creds)) + ok, reason = preflight.check(force=True) # must not raise + assert ok is False + assert "logged in" in reason.lower() or "unreadable" in reason.lower() + + +def test_no_oauth_block_fails(monkeypatch, tmp_path): + _fake_bin(monkeypatch, tmp_path) + creds = _write_creds(tmp_path, oauth=False) + monkeypatch.setattr(preflight, "_credentials_path", lambda: str(creds)) + ok, reason = preflight.check(force=True) + assert ok is False + assert "oauth" in reason.lower() or "logged in" in reason.lower() + + +# --------------------------------------------------------------------------- +# TC-05 / AC-5: token-free — no network call in the auth path +# --------------------------------------------------------------------------- +def test_auth_check_makes_no_network_call(monkeypatch, tmp_path): + _fake_bin(monkeypatch, tmp_path) + future = (int(__import__("time").time()) + 3600) * 1000 + creds = _write_creds(tmp_path, expires_ms=future) + monkeypatch.setattr(preflight, "_credentials_path", lambda: str(creds)) + + def _no_net(*a, **k): + raise AssertionError("token-free auth check must not open a socket") + + monkeypatch.setattr(socket, "socket", _no_net) + ok, _ = preflight.check(force=True) + assert ok is True + + +# --------------------------------------------------------------------------- +# TC-06 / AC-6: auth result cached within preflight_cache_ttl +# --------------------------------------------------------------------------- +def test_auth_result_cached_within_ttl(monkeypatch, tmp_path): + _fake_bin(monkeypatch, tmp_path) + monkeypatch.setattr(preflight.settings, "preflight_cache_ttl", 999) + + calls = {"n": 0} + real = preflight._check_auth + + future = (int(__import__("time").time()) + 3600) * 1000 + creds = _write_creds(tmp_path, expires_ms=future) + monkeypatch.setattr(preflight, "_credentials_path", lambda: str(creds)) + + def counting(): + calls["n"] += 1 + return real() + + monkeypatch.setattr(preflight, "_check_auth", counting) + preflight.reset_cache() + preflight.check() # miss -> reads creds + preflight.check() # cached -> no re-read + preflight.check() + assert calls["n"] == 1 + + +# --------------------------------------------------------------------------- +# TC-07 / TR-1.3: credentials path resolves from AGENT_HOME, not process env +# --------------------------------------------------------------------------- +def test_credentials_path_follows_agent_home(monkeypatch, tmp_path): + agent_home = tmp_path / "agent_home" + agent_home.mkdir() + monkeypatch.setattr(AgentLauncher, "AGENT_HOME", str(agent_home)) + monkeypatch.setattr(preflight.settings, "claude_credentials_path", "") + # The orchestrator process HOME points somewhere else entirely. + monkeypatch.setenv("HOME", str(tmp_path / "orchestrator_home")) + + resolved = preflight._credentials_path() + assert resolved == str(agent_home / ".claude" / ".credentials.json") + assert str(tmp_path / "orchestrator_home") not in resolved + + +def test_explicit_credentials_path_wins(monkeypatch, tmp_path): + monkeypatch.setattr(preflight.settings, "claude_credentials_path", + str(tmp_path / "explicit.json")) + assert preflight._credentials_path() == str(tmp_path / "explicit.json") + + +# --------------------------------------------------------------------------- +# TC-08 / AC-4: auth-fail blocks the queue-worker claim +# --------------------------------------------------------------------------- +def test_worker_does_not_claim_when_auth_fails(monkeypatch, tmp_path): + _fake_bin(monkeypatch, tmp_path) + monkeypatch.setattr(preflight, "_credentials_path", + lambda: str(tmp_path / "missing.json")) # not logged in + called = {"launch": False} + monkeypatch.setattr("src.queue_worker.launcher.launch_job", + lambda job: called.__setitem__("launch", True)) + + jid = enqueue_job("analyst", "r") + w = QueueWorker(max_concurrency=1, poll_interval=0.01) + w._drain_once() + + assert called["launch"] is False + assert get_job(jid)["status"] == "queued" + assert count_running_jobs() == 0 + assert w.last_preflight_ok is False + assert "logged in" in w.last_preflight_reason.lower() \ + or "credentials" in w.last_preflight_reason.lower() + + +# --------------------------------------------------------------------------- +# Toggle off: preflight_check_auth=False keeps the old version-only behaviour +# --------------------------------------------------------------------------- +def test_auth_toggle_off_skips_check(monkeypatch, tmp_path): + _fake_bin(monkeypatch, tmp_path) + monkeypatch.setattr(preflight.settings, "preflight_check_auth", False) + monkeypatch.setattr(preflight, "_credentials_path", + lambda: str(tmp_path / "missing.json")) + ok, _ = preflight.check(force=True) + assert ok is True # auth not consulted -> version-only pass + + +# --------------------------------------------------------------------------- +# is_auth_failure_text: post-factum marker detection (P1b) +# --------------------------------------------------------------------------- +@pytest.mark.parametrize("text", [ + "Error: Not logged in. Please run /login", + "401 Unauthorized", + "invalid api key provided", +]) +def test_is_auth_failure_text_positive(text): + assert preflight.is_auth_failure_text(text) is True + + +@pytest.mark.parametrize("text", ["", "429 rate limit", "Traceback ValueError"]) +def test_is_auth_failure_text_negative(text): + assert preflight.is_auth_failure_text(text) is False