Конвейер продвигается только входящими webhook; потерянное событие (502 на ребилде, отсутствие ретраев у Plane/Gitea, неразрезолвленный sha→branch) оставляет задачу молча застрявшей (класс инцидента ORCH-044). Новый фоновый daemon-поток src/reconciler.py (паттерн queue_worker) доигрывает пропущенный переход через те же штатные гейты/обработчики, что и webhook: - F-1 gate-side: для задач stage≠done, без активного job и age(updated_at) ≥ grace_for_stage(stage) — read-only пред-оценка канонического QG; зелёный → stage_engine.advance_stage(..., finished_agent=None); красный → тишина (спам нотификаций структурно невозможен). analysis F-1 не трогает (человеческий гейт). - F-2 plane-side: опрос Plane API per-project (plane_sync.list_issues_by_state, курсорная пагинация, never-raise) → реплей In Progress/Approved/Rejected через существующие handle_status_start/handle_verdict (async из sync-потока, asyncio.run). - F-3: усиление sha→branch в handle_ci_status — БД-fallback по единственной development-задаче repo (неоднозначность → не резолвим), debug→info. - Анти-дубль на создании (db.create_task_atomic под process-wide Lock): гонка reconcile↔webhook не плодит второй task/branch/worktree/analyst-job (AC-4). - F-4 observability: лог-строка разблокировки + Telegram + блок reconcile в /queue. Старт/стоп в main.lifespan (после worker.start() / перед worker.stop()), restart-safe, never-raise на единицу работы. Kill-switches ORCH_RECONCILE_ENABLED / ORCH_RECONCILE_PLANE_ENABLED + grace-настройки. Схема БД и реестры STAGE_TRANSITIONS/QG_CHECKS не менялись. Тесты: test_reconciler.py, test_reconciler_plane.py, test_gitea_sha_resolve.py, test_config.py (33 новых, 563 всего зелёные). Документация обновлена (golden source): architecture/README.md, INFRA.md, README.md, CHANGELOG.md, adr-0007 → accepted. Refs: ORCH-053 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
195 lines
9.9 KiB
Python
195 lines
9.9 KiB
Python
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
|
|
|
|
class Settings(BaseSettings):
    """Orchestrator configuration.

    Values are read from environment variables carrying the ``ORCH_`` prefix
    and from a ``.env`` file (see ``model_config`` at the end of the class);
    anything not provided falls back to the defaults declared below.
    """

    # Plane
    plane_api_url: str = "http://localhost:8091"
    # ORCH-017: external (browser) web URL of Plane for clickable issue links in
    # notifications, e.g. https://plane.example.org. Falls back to plane_api_url,
    # but a loopback fallback (localhost/127.0.0.1) is treated as "no web URL" and
    # the Plane link is omitted (see notifications._build_plane_issue_link).
    plane_web_url: str = ""
    plane_api_token: str = ""
    plane_workspace_slug: str = ""
    plane_webhook_secret: str = ""
    plane_project_id: str = ""

    # Per-agent Plane bot tokens (feat: per-agent comment authorship).
    # When set, add_comment posts under the matching bot so Plane shows the
    # real author (Analyst/Architect/...). Empty -> fallback to plane_api_token.
    plane_bot_analyst: str = ""
    plane_bot_architect: str = ""
    plane_bot_developer: str = ""
    plane_bot_reviewer: str = ""
    plane_bot_tester: str = ""
    plane_bot_deployer: str = ""
    plane_bot_stream: str = ""

    # Gitea
    gitea_url: str = "http://localhost:3000"
    gitea_public_url: str = ""  # external URL for clickable links in comments; falls back to gitea_url
    gitea_token: str = ""
    gitea_webhook_secret: str = ""
    gitea_owner: str = "admin"
    default_repo: str = "enduro-trails"

    # ORCH-6: multi-repo project registry. JSON array of
    # {plane_project_id, repo, work_item_prefix, name}.
    # Empty -> built-in default registry in src/projects.py.
    projects_json: str = ""

    # Claude CLI
    claude_bin: str = "/opt/claude-code/bin/claude.exe"
    repos_dir: str = "/repos"
    host_repos_dir: str = "/home/slin/repos"
    worktrees_dir: str = "/repos/_wt"  # ORCH-2 / S-4: isolated worktree per task/branch

    # DB
    db_path: str = "/app/data/orchestrator.db"

    # ORCH-1 (F-2b): persistent job queue / background worker.
    #   max_concurrency     -> max agent jobs running in parallel (env ORCH_MAX_CONCURRENCY)
    #   queue_poll_interval -> worker loop poll seconds (env ORCH_QUEUE_POLL_INTERVAL)
    max_concurrency: int = 1
    queue_poll_interval: float = 2.0

    # ORCH-1b (resilience): preflight + 429/rate-limit + backoff + circuit breaker.
    #   preflight_cache_ttl    -> cache the cheap CLI/network preflight result (seconds);
    #                             the worker does NOT re-run `claude --version` more often
    #                             than this (env ORCH_PREFLIGHT_CACHE_TTL).
    #   backoff_base_seconds   -> base for exponential transient backoff.
    #   backoff_max_seconds    -> ceiling for the transient backoff.
    #   transient_max_attempts -> retry budget for transient (429/overload/network)
    #                             failures, separate from code-fault `attempts`.
    #   breaker_threshold      -> consecutive transient failures that OPEN the breaker.
    #   breaker_pause_seconds  -> how long the breaker stays open before half-open.
    preflight_cache_ttl: int = 45
    backoff_base_seconds: int = 10
    backoff_max_seconds: int = 600
    transient_max_attempts: int = 5
    breaker_threshold: int = 3
    breaker_pause_seconds: int = 300

    # ORCH-7 (M-2): agent timeout + graceful kill.
    #   agent_timeout_seconds        -> default per-agent wall-clock budget; the watchdog
    #                                   kills the run after this (env ORCH_AGENT_TIMEOUT_SECONDS).
    #   agent_kill_grace_seconds     -> pause between SIGTERM and SIGKILL so claude can
    #                                   flush artifacts before the hard kill
    #                                   (env ORCH_AGENT_KILL_GRACE_SECONDS).
    #   agent_timeout_overrides_json -> optional per-agent override JSON object,
    #                                   e.g. {"reviewer": 3600, "architect": 2700}
    #                                   (env ORCH_AGENT_TIMEOUT_OVERRIDES_JSON).
    agent_timeout_seconds: int = 1800
    agent_kill_grace_seconds: int = 20
    agent_timeout_overrides_json: str = ""

    # ORCH-41: per-agent LLM model. Empty -> agent_model_default. Resolution order:
    # project-override (projects_json agent_models) > ORCH_AGENT_MODEL_<AGENT> >
    # agent_model_default > CLI default (no --model flag). Default is 4-8 because
    # 4-7 == 4-8 in price (Slava 05.06); do NOT hardcode the version anywhere else.
    agent_model_default: str = "claude-opus-4-8"
    agent_model_analyst: str = ""
    agent_model_architect: str = ""
    agent_model_developer: str = ""
    agent_model_reviewer: str = ""
    agent_model_tester: str = ""
    agent_model_deployer: str = ""

    # ORCH-41: per-agent effort / reasoning level: low|medium|high|xhigh|max.
    # Empty -> agent_effort_default. Same resolution order as model. Default split:
    # thinking agents (analyst/architect/developer/reviewer) -> high; mechanical
    # agents (tester/deployer) -> medium.
    agent_effort_default: str = "high"
    agent_effort_analyst: str = "high"
    agent_effort_architect: str = "high"
    agent_effort_developer: str = "high"
    agent_effort_reviewer: str = "high"
    agent_effort_tester: str = "medium"
    agent_effort_deployer: str = "medium"

    # ORCH-41: optional per-agent fallback model used when the primary is
    # overloaded (--fallback-model, works with --print). Empty -> no flag.
    agent_fallback_model: str = ""

    # L-2: run-log rotation. Old per-run logs in <data>/runs/*.log are pruned at
    # app startup (best-effort). A *.log is removed if it is older than
    # log_keep_days OR not within the log_keep_max most-recent logs (whichever
    # hits first). Only *.log files are touched; the active run log is skipped.
    #   log_keep_days -> max age in days (env ORCH_LOG_KEEP_DAYS).
    #   log_keep_max  -> max number of newest logs to retain (env ORCH_LOG_KEEP_MAX).
    log_keep_days: int = 30
    log_keep_max: int = 500

    # ORCH-045: quality-gate CI poll/retry. check_ci_green polls the Gitea
    # combined commit status up to ci_poll_max_attempts times, sleeping
    # ci_poll_interval_s between attempts, to ride out a transient pending
    # state right after the developer push (race fix, see ORCH-017).
    #   ci_poll_max_attempts -> max status polls (env ORCH_CI_POLL_MAX_ATTEMPTS)
    #   ci_poll_interval_s   -> seconds between polls (env ORCH_CI_POLL_INTERVAL_S)
    ci_poll_max_attempts: int = 12
    ci_poll_interval_s: int = 10

    # ORCH-043: merge-gate (auto-rebase + re-test + merge-lock) on the
    # deploy-staging -> deploy edge. A deterministic sub-gate (no LLM) that
    # catches the branch up to the CURRENT origin/main, re-tests it,
    # and serialises merges so two green branches can't break main.
    #   merge_gate_enabled       -> global kill-switch; False -> no-op pass for the
    #                               whole gate (staged rollout, env ORCH_MERGE_GATE_ENABLED).
    #   merge_gate_repos         -> CSV of repos where the gate is REAL; empty means
    #                               only the self-hosting repo (orchestrator). Other
    #                               repos -> conditional no-op (mirrors ORCH-35 staging).
    #   merge_retest_timeout_s   -> wall-clock budget for the post-rebase re-test.
    #   merge_retest_target      -> pytest target for the re-test (portability across repos).
    #   merge_lock_timeout_s     -> max lease age; an older lease is reclaimed (crash backstop).
    #   merge_defer_delay_s      -> delay before re-running the gate when the lock is busy.
    #   merge_defer_max_attempts -> defer retries before escalation (avoids livelock).
    merge_gate_enabled: bool = True
    merge_gate_repos: str = ""
    merge_retest_timeout_s: int = 600
    merge_retest_target: str = "tests/"
    merge_lock_timeout_s: int = 300
    merge_defer_delay_s: int = 60
    merge_defer_max_attempts: int = 5

    # ORCH-053: stuck-task reconciler (sweeper for lost webhooks). A background
    # daemon thread reconciles the "source of truth (gate / Plane) != task stage"
    # drift left behind by a dropped webhook (502 on rebuild, no Plane/Gitea
    # retries, unresolved sha->branch). See docs/architecture/adr/adr-0007-reconciler.md.
    #   reconcile_enabled              -> global kill-switch (self-hosting safety,
    #                                     staged rollout, env ORCH_RECONCILE_ENABLED).
    #   reconcile_interval_s           -> background sweep period (seconds).
    #   reconcile_plane_enabled        -> separate flag for the F-2 Plane-API poll so
    #                                     only the plane branch can be muted.
    #   reconcile_grace_default_s      -> default "stuck" threshold on tasks.updated_at.
    #   reconcile_grace_overrides_json -> JSON object of per-stage thresholds, e.g.
    #                                     {"analysis": 1800, "development": 300}. Invalid
    #                                     JSON -> default (mirrors agent_timeout_overrides_json).
    #   reconcile_notify_unblock       -> send a Telegram message when a stuck task is
    #                                     unblocked (F-4 observability).
    reconcile_enabled: bool = True
    reconcile_interval_s: int = 120
    reconcile_plane_enabled: bool = True
    reconcile_grace_default_s: int = 600
    reconcile_grace_overrides_json: str = ""
    reconcile_notify_unblock: bool = True

    # Telegram notifications
    telegram_bot_token: str = ""
    telegram_chat_id: str = ""

    # ORCH-042: live task-tracker mode.
    #   edit -> the card is edited in place (editMessageText), the DEFAULT
    #           (previous behaviour).
    #   bump -> on update the old message is deleted and the card is re-sent at
    #           the bottom of the chat (deleteMessage + sendMessage + repoint
    #           message_id), silently (disable_notification). One card per task
    #           in both modes.
    # An unknown/empty value is treated as edit (see notifications).
    tracker_mode: str = "edit"

    # pydantic v2 configuration; replaces the nested `class Config`, which
    # pydantic v2 only keeps as a deprecated compatibility shim.
    model_config = SettingsConfigDict(
        env_prefix="ORCH_",
        env_file=".env",
    )
|
|
|
|
|
|
# Module-level settings instance, constructed once when this module is imported.
settings = Settings()
|