Files
orchestrator/src/config.py
claude-bot 1ada41f272 fix(effort): per-role floor for --effort resolution + developer→xhigh
resolve_agent_effort returned '' for all agents in prod because empty
ORCH_AGENT_EFFORT_*= env vars clobber pydantic class-defaults, leaving no
non-empty floor to fall back to -> --effort never reached the Claude CLI.

Add a level-4 per-role floor in resolve_agent_effort (src/agents/launcher.py):
_agent_effort_floor reads the declared class-default of agent_effort_<agent>
(model_fields[...].default), which a present-but-empty env cannot override.
Floor applies only when levels 1-3 are empty and BEFORE validation, so a typo
(non-empty) still drops to '' (never-break ORCH-41) and explicit env/override
still wins (priority preserved). config.py: agent_effort_developer high->xhigh
(single source of truth; floor follows automatically).

Refs: ORCH-081

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-08 22:50:47 +03:00

496 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from pydantic import field_validator
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
# Plane
plane_api_url: str = "http://localhost:8091"
# ORCH-017: external (browser) web URL of Plane for clickable issue links in
# notifications, e.g. https://plane.example.org. Falls back to plane_api_url,
# but a loopback fallback (localhost/127.0.0.1) is treated as "no web URL" and
# the Plane link is omitted (see notifications._build_plane_issue_link).
plane_web_url: str = ""
plane_api_token: str = ""
plane_workspace_slug: str = ""
plane_webhook_secret: str = ""
plane_project_id: str = ""
# Per-agent Plane bot tokens (feat: per-agent comment authorship).
# When set, add_comment posts under the matching bot so Plane shows the
# real author (Analyst/Architect/...). Empty -> fallback to plane_api_token.
plane_bot_analyst: str = ""
plane_bot_architect: str = ""
plane_bot_developer: str = ""
plane_bot_reviewer: str = ""
plane_bot_tester: str = ""
plane_bot_deployer: str = ""
plane_bot_stream: str = ""
# Gitea
gitea_url: str = "http://localhost:3000"
gitea_public_url: str = "" # external URL for clickable links in comments; falls back to gitea_url
gitea_token: str = ""
gitea_webhook_secret: str = ""
gitea_owner: str = "admin"
default_repo: str = "enduro-trails"
# ORCH-6: multi-repo project registry. JSON array of
# {plane_project_id, repo, work_item_prefix, name}.
# Empty -> built-in default registry in src/projects.py.
projects_json: str = ""
# Claude CLI
claude_bin: str = "/opt/claude-code/bin/claude.exe"
repos_dir: str = "/repos"
host_repos_dir: str = "/home/slin/repos"
worktrees_dir: str = "/repos/_wt" # ORCH-2 / S-4: isolated worktree per task/branch
# DB
db_path: str = "/app/data/orchestrator.db"
# ORCH-1 (F-2b): persistent job queue / background worker.
# max_concurrency -> max agent jobs running in parallel (env ORCH_MAX_CONCURRENCY)
# queue_poll_interval -> worker loop poll seconds (env ORCH_QUEUE_POLL_INTERVAL)
max_concurrency: int = 1
queue_poll_interval: float = 2.0
# ORCH-1b (resilience): preflight + 429/rate-limit + backoff + circuit breaker.
# preflight_cache_ttl -> cache the cheap CLI/network preflight result (seconds);
# the worker does NOT re-run `claude --version` more often
# than this (env ORCH_PREFLIGHT_CACHE_TTL).
# backoff_base_seconds -> base for exponential transient backoff.
# backoff_max_seconds -> ceiling for the transient backoff.
# transient_max_attempts -> retry budget for transient (429/overload/network)
# failures, separate from code-fault `attempts`.
# breaker_threshold -> consecutive transient failures that OPEN the breaker.
# breaker_pause_seconds -> how long the breaker stays open before half-open.
preflight_cache_ttl: int = 45
backoff_base_seconds: int = 10
backoff_max_seconds: int = 600
transient_max_attempts: int = 5
breaker_threshold: int = 3
breaker_pause_seconds: int = 300
# ORCH-7 (M-2): agent timeout + graceful kill.
# agent_timeout_seconds -> default per-agent wall-clock budget; the watchdog
# kills the run after this (env ORCH_AGENT_TIMEOUT_SECONDS).
# agent_kill_grace_seconds-> pause between SIGTERM and SIGKILL so claude can
# flush artifacts before the hard kill
# (env ORCH_AGENT_KILL_GRACE_SECONDS).
# agent_timeout_overrides_json -> optional per-agent override JSON object,
# e.g. {"reviewer": 3600, "architect": 2700}
# (env ORCH_AGENT_TIMEOUT_OVERRIDES_JSON).
agent_timeout_seconds: int = 1800
agent_kill_grace_seconds: int = 20
agent_timeout_overrides_json: str = ""
# ORCH-41: per-agent LLM model. Empty -> agent_model_default. Resolution order:
# project-override (projects_json agent_models) > ORCH_AGENT_MODEL_<AGENT> >
# agent_model_default > CLI default (no --model flag). Default is 4-8 because
# 4-7 == 4-8 in price (Slava 05.06); do NOT hardcode the version anywhere else.
agent_model_default: str = "claude-opus-4-8"
agent_model_analyst: str = ""
agent_model_architect: str = ""
agent_model_developer: str = ""
agent_model_reviewer: str = ""
agent_model_tester: str = ""
agent_model_deployer: str = ""
# ORCH-41: per-agent effort / reasoning level: low|medium|high|xhigh|max.
# Empty -> agent_effort_default. Same resolution order as model. Default split
# (ORCH-081/ORCH-52h): thinking agents (analyst/architect/reviewer) -> high;
# developer -> xhigh (coding/agentic role, Opus 4.8 canon); mechanical agents
# (tester/deployer) -> medium. These class-defaults are ALSO the per-role floor
# used by resolve_agent_effort when the env is empty (single source of truth).
agent_effort_default: str = "high"
agent_effort_analyst: str = "high"
agent_effort_architect: str = "high"
agent_effort_developer: str = "xhigh"
agent_effort_reviewer: str = "high"
agent_effort_tester: str = "medium"
agent_effort_deployer: str = "medium"
# ORCH-41: optional fallback model used when the primary is overloaded
# (--fallback-model, works with --print). Empty -> no flag. A single setting:
# unlike model/effort, no per-agent variants are declared in this class.
agent_fallback_model: str = ""
# L-2: run-log rotation. Old per-run logs in <data>/runs/*.log are pruned at
# app startup (best-effort). A *.log is removed if it is older than
# log_keep_days OR not within the log_keep_max most-recent logs (whichever
# hits first). Only *.log files are touched; the active run log is skipped.
# log_keep_days -> max age in days (env ORCH_LOG_KEEP_DAYS).
# log_keep_max -> max number of newest logs to retain (env ORCH_LOG_KEEP_MAX).
log_keep_days: int = 30
log_keep_max: int = 500
# ORCH-045: quality-gate CI poll/retry. check_ci_green polls the Gitea
# combined commit status up to ci_poll_max_attempts times, sleeping
# ci_poll_interval_s between attempts, to ride out a transient pending
# state right after the developer push (race fix, see ORCH-017).
# ci_poll_max_attempts -> max status polls (env ORCH_CI_POLL_MAX_ATTEMPTS)
# ci_poll_interval_s -> seconds between polls (env ORCH_CI_POLL_INTERVAL_S)
ci_poll_max_attempts: int = 12
ci_poll_interval_s: int = 10
# ORCH-043: merge-gate (auto-rebase + re-test + merge-lock) on the
# deploy-staging -> deploy edge. A deterministic sub-gate (no LLM) that
# brings the branch up to date with the CURRENT origin/main, re-tests it,
# and serialises merges so two green branches can't break main.
# merge_gate_enabled -> global kill-switch; False -> no-op pass for the
# whole gate (staged rollout, env ORCH_MERGE_GATE_ENABLED).
# merge_gate_repos -> CSV of repos where the gate is REAL; empty means
# only the self-hosting repo (orchestrator). Other
# repos -> conditional no-op (mirrors ORCH-35 staging).
# merge_retest_timeout_s -> wall-clock budget for the post-rebase re-test.
# merge_retest_target -> pytest target for the re-test (portability across repos).
# merge_lock_timeout_s -> max lease age; an older lease is reclaimed (crash backstop).
# merge_defer_delay_s -> delay before re-running the gate when the lock is busy.
# merge_defer_max_attempts -> defer retries before escalation (avoids livelock).
merge_gate_enabled: bool = True
merge_gate_repos: str = ""
merge_retest_timeout_s: int = 600
merge_retest_target: str = "tests/"
merge_lock_timeout_s: int = 300
merge_defer_delay_s: int = 60
merge_defer_max_attempts: int = 5
# ORCH-036: executable self-deploy (deploy stage drives the host hook).
# The `deploy` stage for the self-hosting repo is turned into a REAL prod
# restart via a detached host process, gated by a manual approve. Three-phase
# design (ADR-001): A=approve-request, B=initiate (human Approved), C=finalizer
# maps the hook exit-code -> deploy_status. Non-self repos are unaffected.
#
# self_deploy_enabled -> global kill-switch; False -> no Phase A/B/C
# interception (the legacy synchronous deployer
# path runs for everyone, env ORCH_SELF_DEPLOY_ENABLED).
# self_deploy_repos -> CSV of repos where executable self-deploy is
# REAL; empty -> only the self-hosting repo
# (orchestrator). Mirrors merge_gate_repos.
# deploy_require_manual_approve -> require a human Approved before the prod
# restart (BR-5). Default true; NOT toggled in
# ORCH-36 (AC-12). false -> Phase A initiates
# immediately (structural branch, off by default).
# deploy_finalize_delay_s -> delay before the first finalize poll; must be
# > the hook health-loop (~60s) so the verdict
# usually exists on the first poll.
# deploy_finalize_max_attempts -> bounded finalize-defer budget (anti-livelock).
# ssh / hook target (detached prod restart; real values live on the host):
# deploy_ssh_user / deploy_ssh_host -> ssh target for the host hook (INFRA P-2).
# deploy_hook_script -> path to the hook ON THE HOST (relative to repo).
# deploy_host_repo_path -> orchestrator clone path on the host.
# prod overrides passed to the hook for build-once (retag staging image -> prod):
# deploy_prod_source_image -> image validated on staging (retagged, no rebuild).
# deploy_prod_target_service / _port / _image / _compose_profile -> prod profile.
# deploy_prod_prev_image_file -> prod prev-image snapshot (separate from staging).
self_deploy_enabled: bool = True
self_deploy_repos: str = ""
deploy_require_manual_approve: bool = True
deploy_finalize_delay_s: int = 90
deploy_finalize_max_attempts: int = 10
deploy_ssh_user: str = "slin"
deploy_ssh_host: str = ""
deploy_hook_script: str = "scripts/orchestrator-deploy-hook.sh"
deploy_host_repo_path: str = "/home/slin/repos/orchestrator"
deploy_prod_source_image: str = "orchestrator-orchestrator-staging"
deploy_prod_target_service: str = "orchestrator"
deploy_prod_target_port: int = 8500
deploy_prod_target_image: str = "orchestrator-orchestrator"
deploy_prod_compose_profile: str = ""
deploy_prod_prev_image_file: str = ".deploy-prev-image-prod"
# ORCH-058: staging-image provenance before the BUILD-ONCE retag to prod.
# Closes the INV-FRESH gap (ADR-001): the BUILD-ONCE retag (ORCH-36) promotes
# the staging image to prod WITHOUT a rebuild, assuming the staging image is
# fresh — a guarantee the pipeline never had (a stale image could be silently
# promoted, LESSONS_ORCH-036 §4). Two complementary layers, self-hosting only:
# A (liveness): the QG sub-check check_staging_image_fresh rebuilds the
# staging image from the VALIDATED commit (worktree HEAD after merge-gate)
# and recreates 8501 on the deploy-staging -> deploy edge, so we validate
# and promote ONE artefact.
# B (safety): build_deploy_command passes EXPECTED_REVISION and the hook
# fail-closes (exit 1) if SOURCE_IMAGE's revision label != EXPECTED_REVISION
# before `docker tag`, making a silent stale promote structurally impossible.
#
# image_freshness_enabled -> SINGLE kill-switch for the WHOLE feature (A + B
# together; never "B without A" = a deadlock). False
# -> legacy ORCH-36 behaviour (BUILD-ONCE, no guard,
# no EXPECTED_REVISION). Env ORCH_IMAGE_FRESHNESS_ENABLED.
# image_freshness_repos -> CSV of repos where the feature is REAL; empty ->
# only the self-hosting repo (orchestrator). Mirrors
# self_deploy_repos / merge_gate_repos.
image_freshness_enabled: bool = True
image_freshness_repos: str = ""
# ORCH-022: security-gate (secret-scanning + dependency audit) on the
# deploy-staging -> deploy edge, run FIRST among the edge sub-gates (cheap to
# fail before the expensive rebase/rebuild). Deterministic (no LLM): gitleaks
# (offline secret-scan) + pip-audit (OSV/PyPI dependency audit), verdict in the
# versioned 17-security-report.md frontmatter; FAIL -> rollback to development +
# developer-retry (cap MAX_DEVELOPER_RETRIES). See ADR-001-security-gate.md.
# security_gate_enabled -> SINGLE kill-switch; False -> pipeline 1:1 as
# before ORCH-022 for everyone. Env
# ORCH_SECURITY_GATE_ENABLED.
# security_gate_repos -> CSV of repos where the gate is REAL; empty ->
# only the self-hosting repo (orchestrator).
# Mirrors merge_gate_repos / image_freshness_repos.
# security_dep_block_severity -> CVE severity threshold that BLOCKS (CRITICAL >
# HIGH > MEDIUM > LOW); below it / UNKNOWN -> a
# warning only (anti-loop ADR-001 Р-4).
# security_scan_timeout_s -> per external scanner call timeout (mirrors
# merge_retest_timeout_s).
# security_dep_audit_fail_closed -> strict mode: an unreachable CVE feed -> FAIL
# instead of the default fail-open + warning
# (Р-3). Default False (anti-loop ORCH-061).
# security_secrets_block -> a found secret blocks (always True by default;
# the offline secrets guarantee is unconditional,
# BR-2).
security_gate_enabled: bool = True
security_gate_repos: str = ""
security_dep_block_severity: str = "HIGH"
security_scan_timeout_s: int = 300
security_dep_audit_fail_closed: bool = False
security_secrets_block: bool = True
# ORCH-061: tolerate KNOWN sandbox-infra FAILs (C9a/C9b) in the staging suite.
# The self-hosting deploy-staging stage looped because scripts/staging_check.py
# exited non-zero on ANY failed check, so two infra-only failures (sandbox bot
# accounts not members of the sandbox Plane project) produced staging_status:
# FAILED -> rollback deploy-staging -> development -> loop.
# True -> a run whose ONLY failures are allowlisted sandbox-infra checks
# (C9a/C9b) is waived to SUCCESS; ANY real pipeline check that fails
# still fails closed -> FAILED -> rollback (safety net intact, FR-4).
# False -> 1:1 pre-ORCH-061 strict behaviour: any FAIL -> FAILED -> rollback.
# Default True (mirrors merge_gate_enabled / image_freshness_enabled /
# self_deploy_enabled): the safety net holds regardless of the flag; the flag
# exists to instantly restore legacy strictness without a code redeploy. Lives
# in .env.staging (ORCH_ prefix) so it is reachable inside orchestrator-staging.
# Env ORCH_STAGING_INFRA_TOLERANCE_ENABLED.
staging_infra_tolerance_enabled: bool = True
# ORCH-053: stuck-task reconciler (sweeper for lost webhooks). A background
# daemon thread reconciles the "source of truth (gate / Plane) != task stage"
# drift left behind by a dropped webhook (502 on rebuild, no Plane/Gitea
# retries, unresolved sha->branch). See docs/architecture/adr/adr-0007-reconciler.md.
# reconcile_enabled -> global kill-switch (self-hosting safety,
# staged rollout, env ORCH_RECONCILE_ENABLED).
# reconcile_interval_s -> background sweep period (seconds).
# reconcile_plane_enabled -> separate flag for the F-2 Plane-API poll so
# only the plane branch can be muted.
# reconcile_grace_default_s -> default "stuck" threshold on tasks.updated_at.
# reconcile_grace_overrides_json -> JSON object of per-stage thresholds, e.g.
# {"analysis": 1800, "development": 300}. Invalid
# JSON -> default (mirrors agent_timeout_overrides_json).
# reconcile_notify_unblock -> send a Telegram message when a stuck task is
# unblocked (F-4 observability).
# reconcile_skip_blocked_enabled -> ORCH-060 Guard 2: skip F-1 reconciliation of
# issues a human moved to Blocked / Needs Input
# (per-candidate Plane state lookup). Disabling it
# mutes ONLY the networked Guard 2; Guard 1
# (escalated-by-retries, local + deterministic) is
# always active. Manual escape hatch during a Plane
# outage.
reconcile_enabled: bool = True
reconcile_interval_s: int = 120
reconcile_plane_enabled: bool = True
reconcile_grace_default_s: int = 600
reconcile_grace_overrides_json: str = ""
reconcile_notify_unblock: bool = True
reconcile_skip_blocked_enabled: bool = True
# ORCH-068: TTL for the per-project Plane states cache (_STATES_CACHE in
# plane_sync). Historically the cache lived for the whole process lifetime,
# so a status added to Plane after start was never seen without a restart
# ("stale set -> no pipeline action"). With a TTL the entry self-heals by
# re-fetching /states/ after it expires (invalidation reuses the existing
# reload_project_states() primitive — no duplicated reset logic).
# plane_states_ttl_s (env ORCH_PLANE_STATES_TTL_S):
# >0 -> seconds before a cache entry is re-fetched (default 300 = 5 min);
# 0 -> disable TTL -> strictly the previous lifetime cache (back-compat
# escape hatch). get_project_states return shape is unchanged.
plane_states_ttl_s: int = 300
# ORCH-021: post-deploy production monitoring + degradation reaction. After
# the terminal deploy->done transition for an applicable repo, a reserved-agent
# `post-deploy-monitor` job (no LLM, modelled on deploy-finalizer) probes prod
# over a window and reacts to a degradation the restart-time health-check
# missed (class "green deploy, red prod", precedent ET-8). State is in sentinel
# files (.post-deploy-state-<repo>/<wi>/), no DB migration. See
# docs/architecture/adr/adr-0010-post-deploy-monitor.md.
# post_deploy_monitor_enabled -> global kill-switch (BR-8); False -> the
# pipeline is 1:1 as before ORCH-021 (no arm).
# post_deploy_repos -> CSV of repos where monitoring is REAL; empty
# -> only the self-hosting repo (orchestrator).
# Mirrors self_deploy_repos / merge_gate_repos.
# post_deploy_window_s -> observation window length (~15 min, BR-1).
# post_deploy_interval_s -> seconds between probe ticks.
# post_deploy_fail_threshold -> N CONSECUTIVE health failures -> DEGRADED.
# post_deploy_5xx_threshold -> window 5xx ratio above this -> DEGRADED.
# post_deploy_auto_rollback -> globally allow auto-rollback; True acts ONLY
# for non-self repos. For self-hosting the
# reaction is ALWAYS ALERT_ONLY (BR-5) — a tick
# NEVER restarts the prod orchestrator container.
# post_deploy_base_url -> base URL of the observed prod instance.
# Rollback target params reuse the existing deploy_prod_* settings (no dupes).
post_deploy_monitor_enabled: bool = True
post_deploy_repos: str = ""
post_deploy_window_s: int = 900
post_deploy_interval_s: int = 30
post_deploy_fail_threshold: int = 3
post_deploy_5xx_threshold: float = 0.5
post_deploy_auto_rollback: bool = False
post_deploy_base_url: str = "http://localhost:8500"
# ORCH-065: job-reaper + proactive merge-lease reclaim. A background daemon
# thread (modelled on the reconciler) makes "the monitor thread / process died
# while a job/lease was held" self-heal WITHOUT a restart. Status (done/queued/
# failed) is otherwise only ever set by launcher._monitor_agent -> _finalize_job
# inside the live process; a death there left the jobs row 'running' forever and
# (at max_concurrency=1) wedged the queue of EVERY project (incidents 07.06: jobs
# 236/239/242/254). The same thread proactively reclaims a stale/dead merge-lease
# (ORCH-043) instead of waiting for the lazy TTL on the next foreign acquire. See
# docs/architecture/adr/adr-0011-job-reaper-lease-reclaim.md.
# reaper_enabled -> global kill-switch (false -> strictly prior behaviour;
# only the startup requeue_running_jobs remains).
# reaper_interval_s -> background scan period (seconds).
# reaper_dead_ticks -> Tier-1: consecutive ticks a job's pid must be dead
# before it is reaped (>=2 anti-false-positive; a live
# long-running agent is NEVER reaped).
# reaper_max_running_s -> Tier-3 backstop ceiling: a job 'running' longer than
# this is reaped even when liveness is unknowable. MUST be
# > max agent_timeout + grace so a legit agent is safe.
# reaper_finalize_grace_s -> Tier-2 anti-false-positive: a LIVE monitor writes
# agent_runs.exit_code FIRST, THEN does git commit/push +
# PR + Plane usage comments (seconds..minutes) and only
# then _finalize_job. The agent pid is already dead in
# that window, so pid cannot tell "monitor died" from
# "monitor still finalizing". A job is reaped via Tier-2
# only once exit_code has been recorded for at least this
# many seconds (MUST be > the max finalization window).
# lease_reclaim_enabled -> kill-switch for the proactive stale/dead lease reclaim
# (false -> only the legacy lazy TTL reclaim in acquire).
# (reuse) merge_lock_timeout_s -> lease TTL; merge_gate_repos -> reclaim scope.
reaper_enabled: bool = True
reaper_interval_s: int = 60
reaper_dead_ticks: int = 2
reaper_max_running_s: int = 3600
reaper_finalize_grace_s: int = 300
lease_reclaim_enabled: bool = True
# ORCH-071: merge-verify under-gate on the `deploy -> done` edge. For the
# self-hosting repo the `deploy` stage runs the DETERMINISTIC self-deploy path
# (Phase A/B/C), where the LLM `deployer` agent — historically the ONLY actor
# that merged the feature PR into `main` — never runs. Result: a "green" deploy
# could reach `done` while the PR stayed `open` (phantom merge, postmortem
# LESSONS_2026-06-08). This under-gate (an inline hook in advance_stage, NOT a new
# STAGE_TRANSITIONS edge or registered QG) runs a deterministic merge-actor +
# post-deploy verification before `done`: not-merged -> alert + HOLD (no done),
# merged -> normal advance. Mirrors merge_gate_* / image_freshness_* rollout.
# merge_verify_enabled -> global kill-switch; False -> strictly the prior
# behaviour (no merge/verify), env ORCH_MERGE_VERIFY_ENABLED.
# merge_verify_repos -> CSV of repos where the under-gate is REAL; empty ->
# only the self-hosting repo (orchestrator). Mirrors
# merge_gate_repos / self_deploy_repos.
# merge_pr_timeout_s -> per Gitea merge/list HTTP call timeout.
# merge_verify_timeout_s-> git fetch/merge-base timeout for the ancestor check.
merge_verify_enabled: bool = True
merge_verify_repos: str = ""
merge_pr_timeout_s: int = 60
merge_verify_timeout_s: int = 60
# ORCH-026: intra-repo merge serialisation (Level A) + declarative task
# dependencies (Level B). Level A reuses the ORCH-043/065 merge-lease window
# (no new mechanism) — the merge-lease already serialises "merge -> main-updated"
# per repo; the ONLY new behaviour is an unconditional pre-merge rebase. Level B
# adds a new ADDITIVE job_deps table + a NOT EXISTS gate in claim_next_job. Both
# features are inert without data (no applicable repo / no declared deps) ->
# zero regression for enduro-trails.
# premerge_rebase_always -> Level A (A-2): when True, check_branch_mergeable
# ALWAYS rebases the task branch onto the CURRENT
# origin/main UNDER the merge-lease (not only when
# branch_is_behind_main) — a deterministic anti-phantom
# that does not depend on the ancestor check's precision.
# auto_rebase_onto_main is a cheap no-op on an already
# up-to-date branch (rc 0, push up-to-date, CI not
# retriggered). Scope = merge_gate_repos (empty ->
# self-hosting). Kill-switch (False -> exactly the
# ORCH-043 behaviour: rebase only when behind). Env
# ORCH_PREMERGE_REBASE_ALWAYS.
# task_deps_enabled -> Level B (B-2): global kill-switch for the scheduler
# dependency gate. False -> claim_next_job is 1:1 as
# ORCH-1 (the NOT EXISTS clause is omitted). Inert when
# job_deps is empty. Env ORCH_TASK_DEPS_ENABLED.
# task_deps_source -> declaration source: db|plane|hybrid (default db).
# The scheduler ALWAYS reads the DB cache (offline-safe
# hot path); plane/hybrid additionally ingest Plane
# `blocked-by` relations into job_deps at task creation.
# Env ORCH_TASK_DEPS_SOURCE.
premerge_rebase_always: bool = True
task_deps_enabled: bool = True
task_deps_source: str = "db"
# ORCH-073 (ADR-001 Р-4): main-integrity regression guard. After the merge-verify
# under-gate confirms the deployed SHA is an ancestor of origin/main (FR-1), a
# secondary deterministic (no-LLM) guard checks that a declarative set of markers
# for recently-merged tasks (MAIN_REGRESSION_MARKERS in merge_gate.py) is still
# present in origin/main — i.e. a CHANGELOG-rebase or phantom-merge did not silently
# roll back a neighbouring task's code. A missing marker (deterministic count==0) ->
# ALERT + HOLD (task stays on `deploy`, NOT done); an infra/git error on the grep
# itself -> fail-OPEN (do not block done; SHA-in-main remains the primary gate).
# regression_guard_enabled -> kill-switch (env ORCH_REGRESSION_GUARD_ENABLED);
# reuses the merge_verify_applies scope (self-hosting /
# merge_verify_repos), so non-self repos are a no-op.
regression_guard_enabled: bool = True
# Telegram notifications
telegram_bot_token: str = ""
telegram_chat_id: str = ""
# ORCH-042: live task-tracker mode.
# bump (DEFAULT since ORCH-067) -> on update the old message is deleted and
# the card is re-sent at the bottom of the chat (deleteMessage + sendMessage
# + repoint message_id), silently (disable_notification).
# edit -> the card is edited in place (editMessageText); available via
# ORCH_TRACKER_MODE=edit.
# One card per task in both modes. An unknown/empty value is treated as
# edit (see notifications).
tracker_mode: str = "bump"
# ORCH-067 (ADR Р-2/Р-3/Р-4): best-effort live overlay for the card's status
# line. It fills in the Plane-status branches that cannot be told apart offline
# from tasks.stage (Needs Input / Blocked / Rejected / Cancelled / Deploying /
# Monitoring after Deploy) by reading the LIVE Plane status with a short timeout
# and a TTL cache. The offline core (stage -> status, In Review from brd-clock)
# always works without the network; the overlay only supplements it and NEVER
# blocks the pipeline.
# tracker_live_status -> kill-switch (False -> offline core only).
# tracker_live_status_ttl_s -> TTL of the per-issue live-uuid cache (hot-path protection).
# tracker_live_status_timeout_s -> timeout of a single live GET on the render path.
tracker_live_status: bool = True
tracker_live_status_ttl_s: int = 60
tracker_live_status_timeout_s: int = 3
# ORCH-069: QG-0 upper title-length limit (entry gate _qg0_errors). The 80-char
# cap was a hygiene limit, not structural (slug is cut to [:30] independently,
# DB title TEXT is unbounded). Configurable via env ORCH_QG0_TITLE_MAX; default
# 200 (was hardcoded 80). Invalid/empty value -> default (graceful, no crash).
qg0_title_max: int = 200
@field_validator("qg0_title_max", mode="before")
@classmethod
def _qg0_title_max_default(cls, v):
    """Coerce the raw ``qg0_title_max`` input to an int, falling back to 200.

    Registered in "before" mode, so it receives the raw input (typically the
    ORCH_QG0_TITLE_MAX env string) ahead of the field's own int validation.

    Args:
        v: Raw input value; may be None, a string, or a number.

    Returns:
        ``int(v)`` when the conversion succeeds; 200 when ``v`` is None, an
        empty / whitespace-only string, or not convertible to int.

    Graceful by design (ORCH-069 AC-3, self-hosting safety): a bad value must
    not crash the process on startup, so conversion errors are never raised.
    """
    # Mirrors the declared default of qg0_title_max; keep the two in sync.
    fallback = 200
    if v is None or (isinstance(v, str) and not v.strip()):
        return fallback
    try:
        # NOTE(review): zero / negative integers are returned as-is; whether
        # the QG-0 gate treats them sensibly is not visible here -- confirm.
        return int(v)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")), which the original
        # (TypeError, ValueError) pair let escape despite "never raises".
        return fallback
class Config:
    # Settings-source options consumed by the enclosing settings class.
    # NOTE(review): pydantic v2 documents `model_config = SettingsConfigDict(...)`
    # as the preferred form and deprecates class-based config -- consider
    # migrating in a separate change.

    # Values are additionally loaded from this dotenv file.
    env_file = ".env"
    # Environment variables are matched with this prefix
    # (e.g. ORCH_MAX_CONCURRENCY for max_concurrency).
    env_prefix = "ORCH_"
# Module-level settings instance, constructed once when this module is first
# imported; values come from the class defaults above, overridden by
# ORCH_-prefixed environment variables / the .env file (see env_prefix and
# env_file on the Settings class).
settings = Settings()