from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Orchestrator configuration loaded from the environment and ``.env``.

    Every field is populated from an environment variable named ``ORCH_`` plus
    the field name (e.g. ``max_concurrency`` <- ``ORCH_MAX_CONCURRENCY``), or
    from the ``.env`` file; the value declared here is the default used when
    neither source provides one.
    """

    # Plane
    plane_api_url: str = "http://localhost:8091"
    # ORCH-017: external (browser) web URL of Plane for clickable issue links in
    # notifications, e.g. https://plane.example.org. Falls back to plane_api_url,
    # but a loopback fallback (localhost/127.0.0.1) is treated as "no web URL" and
    # the Plane link is omitted (see notifications._build_plane_issue_link).
    plane_web_url: str = ""
    plane_api_token: str = ""
    plane_workspace_slug: str = ""
    plane_webhook_secret: str = ""
    plane_project_id: str = ""
    # Per-agent Plane bot tokens (feat: per-agent comment authorship).
    # When set, add_comment posts under the matching bot so Plane shows the
    # real author (Analyst/Architect/...). Empty -> fallback to plane_api_token.
    plane_bot_analyst: str = ""
    plane_bot_architect: str = ""
    plane_bot_developer: str = ""
    plane_bot_reviewer: str = ""
    plane_bot_tester: str = ""
    plane_bot_deployer: str = ""
    plane_bot_stream: str = ""

    # Gitea
    gitea_url: str = "http://localhost:3000"
    # External URL for clickable links in comments; falls back to gitea_url.
    gitea_public_url: str = ""
    gitea_token: str = ""
    gitea_webhook_secret: str = ""
    gitea_owner: str = "admin"
    default_repo: str = "enduro-trails"

    # ORCH-6: multi-repo project registry. JSON array of
    # {plane_project_id, repo, work_item_prefix, name}.
    # Empty -> built-in default registry in src/projects.py.
    projects_json: str = ""

    # Claude CLI
    claude_bin: str = "/opt/claude-code/bin/claude.exe"
    repos_dir: str = "/repos"
    host_repos_dir: str = "/home/slin/repos"
    # ORCH-2 / S-4: isolated worktree per task/branch.
    worktrees_dir: str = "/repos/_wt"

    # DB
    db_path: str = "/app/data/orchestrator.db"

    # ORCH-1 (F-2b): persistent job queue / background worker.
    #   max_concurrency     -> max agent jobs running in parallel
    #                          (env ORCH_MAX_CONCURRENCY)
    #   queue_poll_interval -> worker loop poll seconds
    #                          (env ORCH_QUEUE_POLL_INTERVAL)
    max_concurrency: int = 1
    queue_poll_interval: float = 2.0

    # ORCH-1b (resilience): preflight + 429/rate-limit + backoff + circuit breaker.
    #   preflight_cache_ttl    -> cache the cheap CLI/network preflight result
    #                             (seconds); the worker does NOT re-run
    #                             `claude --version` more often than this
    #                             (env ORCH_PREFLIGHT_CACHE_TTL).
    #   backoff_base_seconds   -> base for exponential transient backoff.
    #   backoff_max_seconds    -> ceiling for the transient backoff.
    #   transient_max_attempts -> retry budget for transient
    #                             (429/overload/network) failures, separate from
    #                             code-fault `attempts`.
    #   breaker_threshold      -> consecutive transient failures that OPEN the
    #                             breaker.
    #   breaker_pause_seconds  -> how long the breaker stays open before
    #                             half-open.
    preflight_cache_ttl: int = 45
    backoff_base_seconds: int = 10
    backoff_max_seconds: int = 600
    transient_max_attempts: int = 5
    breaker_threshold: int = 3
    breaker_pause_seconds: int = 300

    # ORCH-7 (M-2): agent timeout + graceful kill.
    #   agent_timeout_seconds        -> default per-agent wall-clock budget; the
    #                                   watchdog kills the run after this
    #                                   (env ORCH_AGENT_TIMEOUT_SECONDS).
    #   agent_kill_grace_seconds     -> pause between SIGTERM and SIGKILL so
    #                                   claude can flush artifacts before the
    #                                   hard kill
    #                                   (env ORCH_AGENT_KILL_GRACE_SECONDS).
    #   agent_timeout_overrides_json -> optional per-agent override JSON object,
    #                                   e.g. {"reviewer": 3600, "architect": 2700}
    #                                   (env ORCH_AGENT_TIMEOUT_OVERRIDES_JSON).
    agent_timeout_seconds: int = 1800
    agent_kill_grace_seconds: int = 20
    agent_timeout_overrides_json: str = ""

    # ORCH-41: per-agent LLM model. Empty -> agent_model_default. Resolution order:
    # project-override (projects_json agent_models) > per-agent env var
    # ORCH_AGENT_MODEL_* (e.g. ORCH_AGENT_MODEL_REVIEWER) > agent_model_default >
    # CLI default (no --model flag). Default is 4-8 because 4-7 == 4-8 in price
    # (Slava 05.06); do NOT hardcode the version anywhere else.
    agent_model_default: str = "claude-opus-4-8"
    agent_model_analyst: str = ""
    agent_model_architect: str = ""
    agent_model_developer: str = ""
    agent_model_reviewer: str = ""
    agent_model_tester: str = ""
    agent_model_deployer: str = ""

    # ORCH-41: per-agent effort / reasoning level: low|medium|high|xhigh|max.
    # Empty -> agent_effort_default. Same resolution order as model. Default split:
    # thinking agents (analyst/architect/developer/reviewer) -> high; mechanical
    # agents (tester/deployer) -> medium.
    agent_effort_default: str = "high"
    agent_effort_analyst: str = "high"
    agent_effort_architect: str = "high"
    agent_effort_developer: str = "high"
    agent_effort_reviewer: str = "high"
    agent_effort_tester: str = "medium"
    agent_effort_deployer: str = "medium"

    # ORCH-41: optional per-agent fallback model used when the primary is
    # overloaded (--fallback-model, works with --print). Empty -> no flag.
    agent_fallback_model: str = ""

    # L-2: run-log rotation. Old per-run logs in /runs/*.log are pruned at
    # app startup (best-effort). A *.log is removed if it is older than
    # log_keep_days OR not within the log_keep_max most-recent logs (whichever
    # hits first). Only *.log files are touched; the active run log is skipped.
    #   log_keep_days -> max age in days (env ORCH_LOG_KEEP_DAYS).
    #   log_keep_max  -> max number of newest logs to retain
    #                    (env ORCH_LOG_KEEP_MAX).
    log_keep_days: int = 30
    log_keep_max: int = 500

    # ORCH-045: quality-gate CI poll/retry. check_ci_green polls the Gitea
    # combined commit status up to ci_poll_max_attempts times, sleeping
    # ci_poll_interval_s between attempts, to ride out a transient pending
    # state right after the developer push (race fix, see ORCH-017).
    #   ci_poll_max_attempts -> max status polls (env ORCH_CI_POLL_MAX_ATTEMPTS)
    #   ci_poll_interval_s   -> seconds between polls
    #                           (env ORCH_CI_POLL_INTERVAL_S)
    ci_poll_max_attempts: int = 12
    ci_poll_interval_s: int = 10

    # ORCH-043: merge-gate (auto-rebase + re-test + merge-lock) on the
    # deploy-staging -> deploy edge. A deterministic sub-gate (no LLM) that
    # brings the branch up to date with the CURRENT origin/main, re-tests it,
    # and serialises merges so two green branches can't break main.
    #   merge_gate_enabled       -> global kill-switch; False -> no-op pass for
    #                               the whole gate (staged rollout,
    #                               env ORCH_MERGE_GATE_ENABLED).
    #   merge_gate_repos         -> CSV of repos where the gate is REAL; empty
    #                               means only the self-hosting repo
    #                               (orchestrator). Other repos -> conditional
    #                               no-op (mirrors ORCH-35 staging).
    #   merge_retest_timeout_s   -> wall-clock budget for the post-rebase re-test.
    #   merge_retest_target      -> pytest target for the re-test (portability
    #                               across repos).
    #   merge_lock_timeout_s     -> max lease age; an older lease is reclaimed
    #                               (crash backstop).
    #   merge_defer_delay_s      -> delay before re-running the gate when the
    #                               lock is busy.
    #   merge_defer_max_attempts -> defer retries before escalation (avoids
    #                               livelock).
    merge_gate_enabled: bool = True
    merge_gate_repos: str = ""
    merge_retest_timeout_s: int = 600
    merge_retest_target: str = "tests/"
    merge_lock_timeout_s: int = 300
    merge_defer_delay_s: int = 60
    merge_defer_max_attempts: int = 5

    # ORCH-036: executable self-deploy (deploy stage drives the host hook).
    # The `deploy` stage for the self-hosting repo is turned into a REAL prod
    # restart via a detached host process, gated by a manual approve. Three-phase
    # design (ADR-001): A=approve-request, B=initiate (human Approved), C=finalizer
    # maps the hook exit-code -> deploy_status. Non-self repos are unaffected.
    #
    #   self_deploy_enabled           -> global kill-switch; False -> no Phase
    #                                    A/B/C interception (the legacy
    #                                    synchronous deployer path runs for
    #                                    everyone, env ORCH_SELF_DEPLOY_ENABLED).
    #   self_deploy_repos             -> CSV of repos where executable
    #                                    self-deploy is REAL; empty -> only the
    #                                    self-hosting repo (orchestrator).
    #                                    Mirrors merge_gate_repos.
    #   deploy_require_manual_approve -> require a human Approved before the prod
    #                                    restart (BR-5). Default true; NOT toggled
    #                                    in ORCH-36 (AC-12). false -> Phase A
    #                                    initiates immediately (structural
    #                                    branch, off by default).
    #   deploy_finalize_delay_s       -> delay before the first finalize poll;
    #                                    must be > the hook health-loop (~60s) so
    #                                    the verdict usually exists on the first
    #                                    poll.
    #   deploy_finalize_max_attempts  -> bounded finalize-defer budget
    #                                    (anti-livelock).
    # ssh / hook target (detached prod restart; real values live on the host):
    #   deploy_ssh_user / deploy_ssh_host -> ssh target for the host hook
    #                                        (INFRA P-2).
    #   deploy_hook_script                -> path to the hook ON THE HOST
    #                                        (relative to repo).
    #   deploy_host_repo_path             -> orchestrator clone path on the host.
    # prod overrides passed to the hook for build-once (retag staging image ->
    # prod):
    #   deploy_prod_source_image    -> image validated on staging (retagged, no
    #                                  rebuild).
    #   deploy_prod_target_service / _port / _image / _compose_profile
    #                               -> prod profile.
    #   deploy_prod_prev_image_file -> prod prev-image snapshot (separate from
    #                                  staging).
    self_deploy_enabled: bool = True
    self_deploy_repos: str = ""
    deploy_require_manual_approve: bool = True
    deploy_finalize_delay_s: int = 90
    deploy_finalize_max_attempts: int = 10
    deploy_ssh_user: str = "slin"
    deploy_ssh_host: str = ""
    deploy_hook_script: str = "scripts/orchestrator-deploy-hook.sh"
    deploy_host_repo_path: str = "/home/slin/repos/orchestrator"
    deploy_prod_source_image: str = "orchestrator-orchestrator-staging"
    deploy_prod_target_service: str = "orchestrator"
    deploy_prod_target_port: int = 8500
    deploy_prod_target_image: str = "orchestrator-orchestrator"
    deploy_prod_compose_profile: str = ""
    deploy_prod_prev_image_file: str = ".deploy-prev-image-prod"

    # ORCH-058: staging-image provenance before the BUILD-ONCE retag to prod.
    # Closes the INV-FRESH gap (ADR-001): the BUILD-ONCE retag (ORCH-36) promotes
    # the staging image to prod WITHOUT a rebuild, assuming the staging image is
    # fresh — a guarantee the pipeline never had (a stale image could be silently
    # promoted, LESSONS_ORCH-036 §4). Two complementary layers, self-hosting only:
    #   A (liveness): the QG sub-check check_staging_image_fresh rebuilds the
    #     staging image from the VALIDATED commit (worktree HEAD after merge-gate)
    #     and recreates 8501 on the deploy-staging -> deploy edge, so we validate
    #     and promote ONE artefact.
    #   B (safety): build_deploy_command passes EXPECTED_REVISION and the hook
    #     fail-closes (exit 1) if SOURCE_IMAGE's revision label !=
    #     EXPECTED_REVISION before `docker tag`, making a silent stale promote
    #     structurally impossible.
    #
    #   image_freshness_enabled -> SINGLE kill-switch for the WHOLE feature
    #                              (A + B together; never "B without A" = a
    #                              deadlock). False -> legacy ORCH-36 behaviour
    #                              (BUILD-ONCE, no guard, no EXPECTED_REVISION).
    #                              Env ORCH_IMAGE_FRESHNESS_ENABLED.
    #   image_freshness_repos   -> CSV of repos where the feature is REAL; empty
    #                              -> only the self-hosting repo (orchestrator).
    #                              Mirrors self_deploy_repos / merge_gate_repos.
    image_freshness_enabled: bool = True
    image_freshness_repos: str = ""

    # ORCH-022: security-gate (secret-scanning + dependency audit) on the
    # deploy-staging -> deploy edge, run FIRST among the edge sub-gates (cheap to
    # fail before the expensive rebase/rebuild). Deterministic (no LLM): gitleaks
    # (offline secret-scan) + pip-audit (OSV/PyPI dependency audit), verdict in the
    # versioned 17-security-report.md frontmatter; FAIL -> rollback to development +
    # developer-retry (cap MAX_DEVELOPER_RETRIES). See ADR-001-security-gate.md.
    #   security_gate_enabled          -> SINGLE kill-switch; False -> pipeline
    #                                     1:1 as before ORCH-022 for everyone.
    #                                     Env ORCH_SECURITY_GATE_ENABLED.
    #   security_gate_repos            -> CSV of repos where the gate is REAL;
    #                                     empty -> only the self-hosting repo
    #                                     (orchestrator). Mirrors
    #                                     merge_gate_repos / image_freshness_repos.
    #   security_dep_block_severity    -> CVE severity threshold that BLOCKS
    #                                     (CRITICAL > HIGH > MEDIUM > LOW); below
    #                                     it / UNKNOWN -> a warning only
    #                                     (anti-loop ADR-001 Р-4).
    #   security_scan_timeout_s        -> per external scanner call timeout
    #                                     (mirrors merge_retest_timeout_s).
    #   security_dep_audit_fail_closed -> strict mode: an unreachable CVE feed ->
    #                                     FAIL instead of the default fail-open +
    #                                     warning (Р-3). Default False
    #                                     (anti-loop ORCH-061).
    #   security_secrets_block         -> a found secret blocks (always True by
    #                                     default; the offline secrets guarantee
    #                                     is unconditional, BR-2).
    security_gate_enabled: bool = True
    security_gate_repos: str = ""
    security_dep_block_severity: str = "HIGH"
    security_scan_timeout_s: int = 300
    security_dep_audit_fail_closed: bool = False
    security_secrets_block: bool = True

    # ORCH-061: tolerate KNOWN sandbox-infra FAILs (C9a/C9b) in the staging suite.
    # The self-hosting deploy-staging stage looped because scripts/staging_check.py
    # exited non-zero on ANY failed check, so two infra-only failures (sandbox bot
    # accounts not members of the sandbox Plane project) produced staging_status:
    # FAILED -> rollback deploy-staging -> development -> loop.
    #   True  -> a run whose ONLY failures are allowlisted sandbox-infra checks
    #            (C9a/C9b) is waived to SUCCESS; ANY real pipeline check that fails
    #            still fails closed -> FAILED -> rollback (safety net intact, FR-4).
    #   False -> 1:1 pre-ORCH-061 strict behaviour: any FAIL -> FAILED -> rollback.
    # Default True (mirrors merge_gate_enabled / image_freshness_enabled /
    # self_deploy_enabled): the safety net holds regardless of the flag; the flag
    # exists to instantly restore legacy strictness without a code redeploy. Lives
    # in .env.staging (ORCH_ prefix) so it is reachable inside orchestrator-staging.
    # Env ORCH_STAGING_INFRA_TOLERANCE_ENABLED.
    staging_infra_tolerance_enabled: bool = True

    # ORCH-053: stuck-task reconciler (sweeper for lost webhooks). A background
    # daemon thread reconciles the "source of truth (gate / Plane) != task stage"
    # drift left behind by a dropped webhook (502 on rebuild, no Plane/Gitea
    # retries, unresolved sha->branch).
    # See docs/architecture/adr/adr-0007-reconciler.md.
    #   reconcile_enabled              -> global kill-switch (self-hosting
    #                                     safety, staged rollout,
    #                                     env ORCH_RECONCILE_ENABLED).
    #   reconcile_interval_s           -> background sweep period (seconds).
    #   reconcile_plane_enabled        -> separate flag for the F-2 Plane-API
    #                                     poll so only the plane branch can be
    #                                     muted.
    #   reconcile_grace_default_s      -> default "stuck" threshold on
    #                                     tasks.updated_at.
    #   reconcile_grace_overrides_json -> JSON object of per-stage thresholds,
    #                                     e.g. {"analysis": 1800,
    #                                     "development": 300}. Invalid JSON ->
    #                                     default (mirrors
    #                                     agent_timeout_overrides_json).
    #   reconcile_notify_unblock       -> send a Telegram message when a stuck
    #                                     task is unblocked (F-4 observability).
    #   reconcile_skip_blocked_enabled -> ORCH-060 Guard 2: skip F-1
    #                                     reconciliation of issues a human moved
    #                                     to Blocked / Needs Input (per-candidate
    #                                     Plane state lookup). Disabling it mutes
    #                                     ONLY the networked Guard 2; Guard 1
    #                                     (escalated-by-retries, local +
    #                                     deterministic) is always active. Manual
    #                                     escape hatch during a Plane outage.
    reconcile_enabled: bool = True
    reconcile_interval_s: int = 120
    reconcile_plane_enabled: bool = True
    reconcile_grace_default_s: int = 600
    reconcile_grace_overrides_json: str = ""
    reconcile_notify_unblock: bool = True
    reconcile_skip_blocked_enabled: bool = True

    # ORCH-068: TTL for the per-project Plane states cache (_STATES_CACHE in
    # plane_sync). Historically the cache lived for the whole process lifetime,
    # so a status added to Plane after start was never seen without a restart
    # ("stale set -> no pipeline action"). With a TTL the entry self-heals by
    # re-fetching /states/ after it expires (invalidation reuses the existing
    # reload_project_states() primitive — no duplicated reset logic).
    #   plane_states_ttl_s (env ORCH_PLANE_STATES_TTL_S):
    #     >0 -> seconds before a cache entry is re-fetched (default 300 = 5 min);
    #      0 -> disable TTL -> strictly the previous lifetime cache (back-compat
    #           escape hatch). get_project_states return shape is unchanged.
    plane_states_ttl_s: int = 300

    # ORCH-021: post-deploy production monitoring + degradation reaction. After
    # the terminal deploy->done transition for an applicable repo, a reserved-agent
    # `post-deploy-monitor` job (no LLM, modelled on deploy-finalizer) probes prod
    # over a window and reacts to a degradation the restart-time health-check
    # missed (class "green deploy, red prod", precedent ET-8). State is in sentinel
    # files (.post-deploy-state-//), no DB migration. See
    # docs/architecture/adr/adr-0010-post-deploy-monitor.md.
    #   post_deploy_monitor_enabled -> global kill-switch (BR-8); False -> the
    #                                  pipeline is 1:1 as before ORCH-021
    #                                  (no arm).
    #   post_deploy_repos           -> CSV of repos where monitoring is REAL;
    #                                  empty -> only the self-hosting repo
    #                                  (orchestrator). Mirrors
    #                                  self_deploy_repos / merge_gate_repos.
    #   post_deploy_window_s        -> observation window length (~15 min, BR-1).
    #   post_deploy_interval_s      -> seconds between probe ticks.
    #   post_deploy_fail_threshold  -> N CONSECUTIVE health failures -> DEGRADED.
    #   post_deploy_5xx_threshold   -> window 5xx ratio above this -> DEGRADED.
    #   post_deploy_auto_rollback   -> globally allow auto-rollback; True acts
    #                                  ONLY for non-self repos. For self-hosting
    #                                  the reaction is ALWAYS ALERT_ONLY (BR-5) —
    #                                  a tick NEVER restarts the prod
    #                                  orchestrator container.
    #   post_deploy_base_url        -> base URL of the observed prod instance.
    # Rollback target params reuse the existing deploy_prod_* settings (no dupes).
    post_deploy_monitor_enabled: bool = True
    post_deploy_repos: str = ""
    post_deploy_window_s: int = 900
    post_deploy_interval_s: int = 30
    post_deploy_fail_threshold: int = 3
    post_deploy_5xx_threshold: float = 0.5
    post_deploy_auto_rollback: bool = False
    post_deploy_base_url: str = "http://localhost:8500"

    # ORCH-065: job-reaper + proactive merge-lease reclaim. A background daemon
    # thread (modelled on the reconciler) makes "the monitor thread / process died
    # while a job/lease was held" self-heal WITHOUT a restart. Status (done/queued/
    # failed) is otherwise only ever set by launcher._monitor_agent -> _finalize_job
    # inside the live process; a death there left the jobs row 'running' forever and
    # (at max_concurrency=1) wedged the queue of EVERY project (incidents 07.06: jobs
    # 236/239/242/254). The same thread proactively reclaims a stale/dead merge-lease
    # (ORCH-043) instead of waiting for the lazy TTL on the next foreign acquire. See
    # docs/architecture/adr/adr-0011-job-reaper-lease-reclaim.md.
    #   reaper_enabled          -> global kill-switch (false -> strictly prior
    #                              behaviour; only the startup
    #                              requeue_running_jobs remains).
    #   reaper_interval_s       -> background scan period (seconds).
    #   reaper_dead_ticks       -> Tier-1: consecutive ticks a job's pid must be
    #                              dead before it is reaped (>=2
    #                              anti-false-positive; a live long-running agent
    #                              is NEVER reaped).
    #   reaper_max_running_s    -> Tier-3 backstop ceiling: a job 'running' longer
    #                              than this is reaped even when liveness is
    #                              unknowable. MUST be > max agent_timeout + grace
    #                              so a legit agent is safe.
    #   reaper_finalize_grace_s -> Tier-2 anti-false-positive: a LIVE monitor
    #                              writes agent_runs.exit_code FIRST, THEN does
    #                              git commit/push + PR + Plane usage comments
    #                              (seconds..minutes) and only then _finalize_job.
    #                              The agent pid is already dead in that window,
    #                              so pid cannot tell "monitor died" from "monitor
    #                              still finalizing". A job is reaped via Tier-2
    #                              only once exit_code has been recorded for at
    #                              least this many seconds (MUST be > the max
    #                              finalization window).
    #   lease_reclaim_enabled   -> kill-switch for the proactive stale/dead lease
    #                              reclaim (false -> only the legacy lazy TTL
    #                              reclaim in acquire).
    #   (reuse) merge_lock_timeout_s -> lease TTL; merge_gate_repos -> reclaim
    #                                   scope.
    reaper_enabled: bool = True
    reaper_interval_s: int = 60
    reaper_dead_ticks: int = 2
    reaper_max_running_s: int = 3600
    reaper_finalize_grace_s: int = 300
    lease_reclaim_enabled: bool = True

    # ORCH-071: merge-verify under-gate on the `deploy -> done` edge. For the
    # self-hosting repo the `deploy` stage runs the DETERMINISTIC self-deploy path
    # (Phase A/B/C), where the LLM `deployer` agent — historically the ONLY actor
    # that merged the feature PR into `main` — never runs. Result: a "green" deploy
    # could reach `done` while the PR stayed `open` (phantom merge, postmortem
    # LESSONS_2026-06-08). This under-gate (an inline hook in advance_stage, NOT a
    # new STAGE_TRANSITIONS edge or registered QG) runs a deterministic merge-actor
    # + post-deploy verification before `done`: not-merged -> alert + HOLD (no
    # done), merged -> normal advance. Mirrors merge_gate_* / image_freshness_*
    # rollout.
    #   merge_verify_enabled   -> global kill-switch; False -> strictly the prior
    #                             behaviour (no merge/verify),
    #                             env ORCH_MERGE_VERIFY_ENABLED.
    #   merge_verify_repos     -> CSV of repos where the under-gate is REAL;
    #                             empty -> only the self-hosting repo
    #                             (orchestrator). Mirrors merge_gate_repos /
    #                             self_deploy_repos.
    #   merge_pr_timeout_s     -> per Gitea merge/list HTTP call timeout.
    #   merge_verify_timeout_s -> git fetch/merge-base timeout for the ancestor
    #                             check.
    merge_verify_enabled: bool = True
    merge_verify_repos: str = ""
    merge_pr_timeout_s: int = 60
    merge_verify_timeout_s: int = 60

    # Telegram notifications
    telegram_bot_token: str = ""
    telegram_chat_id: str = ""
    # ORCH-042: live task-tracker mode.
    #   edit -> the card is edited in place (editMessageText); the DEFAULT
    #           (previous behaviour).
    #   bump -> on update the old message is deleted and the card is re-sent at
    #           the bottom of the chat (deleteMessage + sendMessage + repoint
    #           message_id), silently (disable_notification).
    # There is one card per task in both modes. An unknown/empty value is
    # treated as edit (see notifications).
    tracker_mode: str = "edit"

    # Declarative settings config; replaces the nested `class Config`, whose
    # class-based form is deprecated in pydantic v2. Same keys, same values.
    model_config = SettingsConfigDict(env_prefix="ORCH_", env_file=".env")


# Module-level singleton: settings are resolved once, at import time.
settings = Settings()