resolve_agent_effort returned '' for all agents in prod because empty ORCH_AGENT_EFFORT_*= env vars clobber pydantic class-defaults, leaving no non-empty floor to fall back to -> --effort never reached the Claude CLI. Add a level-4 per-role floor in resolve_agent_effort (src/agents/launcher.py): _agent_effort_floor reads the declared class-default of agent_effort_<agent> (model_fields[...].default), which a present-but-empty env cannot override. Floor applies only when levels 1-3 are empty and BEFORE validation, so a typo (non-empty) still drops to '' (never-break ORCH-41) and explicit env/override still wins (priority preserved). config.py: agent_effort_developer high->xhigh (single source of truth; floor follows automatically). Refs: ORCH-081 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
496 lines
30 KiB
Python
496 lines
30 KiB
Python
from pydantic import field_validator
|
||
from pydantic_settings import BaseSettings
|
||
|
||
|
||
class Settings(BaseSettings):
    """Orchestrator configuration loaded from the environment.

    Each attribute is one setting together with its in-code default. Values
    are read from environment variables carrying the ``ORCH_`` prefix and from
    the ``.env`` file (see the nested ``Config``). The comment block above
    each group of fields records the owning task id and what every knob does.
    """

    # ------------------------------------------------------------------
    # Plane
    # ------------------------------------------------------------------
    plane_api_url: str = "http://localhost:8091"
    # ORCH-017: external (browser) web URL of Plane for clickable issue links in
    # notifications, e.g. https://plane.example.org. Falls back to plane_api_url,
    # but a loopback fallback (localhost/127.0.0.1) is treated as "no web URL" and
    # the Plane link is omitted (see notifications._build_plane_issue_link).
    plane_web_url: str = ""
    plane_api_token: str = ""
    plane_workspace_slug: str = ""
    plane_webhook_secret: str = ""
    plane_project_id: str = ""

    # Per-agent Plane bot tokens (feat: per-agent comment authorship).
    # When set, add_comment posts under the matching bot so Plane shows the
    # real author (Analyst/Architect/...). Empty -> fallback to plane_api_token.
    plane_bot_analyst: str = ""
    plane_bot_architect: str = ""
    plane_bot_developer: str = ""
    plane_bot_reviewer: str = ""
    plane_bot_tester: str = ""
    plane_bot_deployer: str = ""
    plane_bot_stream: str = ""

    # ------------------------------------------------------------------
    # Gitea
    # ------------------------------------------------------------------
    gitea_url: str = "http://localhost:3000"
    # External URL for clickable links in comments; falls back to gitea_url.
    gitea_public_url: str = ""
    gitea_token: str = ""
    gitea_webhook_secret: str = ""
    gitea_owner: str = "admin"
    default_repo: str = "enduro-trails"

    # ORCH-6: multi-repo project registry. JSON array of
    # {plane_project_id, repo, work_item_prefix, name}.
    # Empty -> built-in default registry in src/projects.py.
    projects_json: str = ""

    # ------------------------------------------------------------------
    # Claude CLI
    # ------------------------------------------------------------------
    claude_bin: str = "/opt/claude-code/bin/claude.exe"
    repos_dir: str = "/repos"
    host_repos_dir: str = "/home/slin/repos"
    # ORCH-2 / S-4: isolated worktree per task/branch.
    worktrees_dir: str = "/repos/_wt"

    # ------------------------------------------------------------------
    # DB
    # ------------------------------------------------------------------
    db_path: str = "/app/data/orchestrator.db"

    # ORCH-1 (F-2b): persistent job queue / background worker.
    #   max_concurrency     -> max agent jobs running in parallel
    #                          (env ORCH_MAX_CONCURRENCY)
    #   queue_poll_interval -> worker loop poll seconds
    #                          (env ORCH_QUEUE_POLL_INTERVAL)
    max_concurrency: int = 1
    queue_poll_interval: float = 2.0

    # ORCH-1b (resilience): preflight + 429/rate-limit + backoff + circuit breaker.
    #   preflight_cache_ttl    -> cache the cheap CLI/network preflight result
    #                             (seconds); the worker does NOT re-run
    #                             `claude --version` more often than this
    #                             (env ORCH_PREFLIGHT_CACHE_TTL).
    #   backoff_base_seconds   -> base for exponential transient backoff.
    #   backoff_max_seconds    -> ceiling for the transient backoff.
    #   transient_max_attempts -> retry budget for transient (429/overload/network)
    #                             failures, separate from code-fault `attempts`.
    #   breaker_threshold      -> consecutive transient failures that OPEN the breaker.
    #   breaker_pause_seconds  -> how long the breaker stays open before half-open.
    preflight_cache_ttl: int = 45
    backoff_base_seconds: int = 10
    backoff_max_seconds: int = 600
    transient_max_attempts: int = 5
    breaker_threshold: int = 3
    breaker_pause_seconds: int = 300

    # ORCH-7 (M-2): agent timeout + graceful kill.
    #   agent_timeout_seconds        -> default per-agent wall-clock budget; the
    #                                   watchdog kills the run after this
    #                                   (env ORCH_AGENT_TIMEOUT_SECONDS).
    #   agent_kill_grace_seconds     -> pause between SIGTERM and SIGKILL so claude
    #                                   can flush artifacts before the hard kill
    #                                   (env ORCH_AGENT_KILL_GRACE_SECONDS).
    #   agent_timeout_overrides_json -> optional per-agent override JSON object,
    #                                   e.g. {"reviewer": 3600, "architect": 2700}
    #                                   (env ORCH_AGENT_TIMEOUT_OVERRIDES_JSON).
    agent_timeout_seconds: int = 1800
    agent_kill_grace_seconds: int = 20
    agent_timeout_overrides_json: str = ""

    # ORCH-41: per-agent LLM model. Empty -> agent_model_default. Resolution order:
    # project-override (projects_json agent_models) > ORCH_AGENT_MODEL_<AGENT> >
    # agent_model_default > CLI default (no --model flag). Default is 4-8 because
    # 4-7 == 4-8 in price (Slava 05.06); do NOT hardcode the version anywhere else.
    agent_model_default: str = "claude-opus-4-8"
    agent_model_analyst: str = ""
    agent_model_architect: str = ""
    agent_model_developer: str = ""
    agent_model_reviewer: str = ""
    agent_model_tester: str = ""
    agent_model_deployer: str = ""

    # ORCH-41: per-agent effort / reasoning level: low|medium|high|xhigh|max.
    # Empty -> agent_effort_default. Same resolution order as model. Default split
    # (ORCH-081/ORCH-52h): thinking agents (analyst/architect/reviewer) -> high;
    # developer -> xhigh (coding/agentic role, Opus 4.8 canon); mechanical agents
    # (tester/deployer) -> medium. These class-defaults are ALSO the per-role floor
    # used by resolve_agent_effort when the env is empty (single source of truth).
    agent_effort_default: str = "high"
    agent_effort_analyst: str = "high"
    agent_effort_architect: str = "high"
    agent_effort_developer: str = "xhigh"
    agent_effort_reviewer: str = "high"
    agent_effort_tester: str = "medium"
    agent_effort_deployer: str = "medium"

    # ORCH-41: optional per-agent fallback model used when the primary is
    # overloaded (--fallback-model, works with --print). Empty -> no flag.
    agent_fallback_model: str = ""

    # L-2: run-log rotation. Old per-run logs in <data>/runs/*.log are pruned at
    # app startup (best-effort). A *.log is removed if it is older than
    # log_keep_days OR not within the log_keep_max most-recent logs (whichever
    # hits first). Only *.log files are touched; the active run log is skipped.
    #   log_keep_days -> max age in days (env ORCH_LOG_KEEP_DAYS).
    #   log_keep_max  -> max number of newest logs to retain (env ORCH_LOG_KEEP_MAX).
    log_keep_days: int = 30
    log_keep_max: int = 500

    # ORCH-045: quality-gate CI poll/retry. check_ci_green polls the Gitea
    # combined commit status up to ci_poll_max_attempts times, sleeping
    # ci_poll_interval_s between attempts, to ride out a transient pending
    # state right after the developer push (race fix, see ORCH-017).
    #   ci_poll_max_attempts -> max status polls (env ORCH_CI_POLL_MAX_ATTEMPTS)
    #   ci_poll_interval_s   -> seconds between polls (env ORCH_CI_POLL_INTERVAL_S)
    ci_poll_max_attempts: int = 12
    ci_poll_interval_s: int = 10

    # ORCH-043: merge-gate (auto-rebase + re-test + merge-lock) on the
    # deploy-staging -> deploy edge. A deterministic sub-gate (no LLM) that
    # brings the branch up to date with the CURRENT origin/main, re-tests it,
    # and serialises merges so two green branches can't break main.
    #   merge_gate_enabled       -> global kill-switch; False -> no-op pass for the
    #                               whole gate (staged rollout,
    #                               env ORCH_MERGE_GATE_ENABLED).
    #   merge_gate_repos         -> CSV of repos where the gate is REAL; empty means
    #                               only the self-hosting repo (orchestrator). Other
    #                               repos -> conditional no-op (mirrors ORCH-35 staging).
    #   merge_retest_timeout_s   -> wall-clock budget for the post-rebase re-test.
    #   merge_retest_target      -> pytest target for the re-test (portability
    #                               across repos).
    #   merge_lock_timeout_s     -> max lease age; an older lease is reclaimed
    #                               (crash backstop).
    #   merge_defer_delay_s      -> delay before re-running the gate when the lock
    #                               is busy.
    #   merge_defer_max_attempts -> defer retries before escalation (avoids livelock).
    merge_gate_enabled: bool = True
    merge_gate_repos: str = ""
    merge_retest_timeout_s: int = 600
    merge_retest_target: str = "tests/"
    merge_lock_timeout_s: int = 300
    merge_defer_delay_s: int = 60
    merge_defer_max_attempts: int = 5

    # ORCH-036: executable self-deploy (deploy stage drives the host hook).
    # The `deploy` stage for the self-hosting repo is turned into a REAL prod
    # restart via a detached host process, gated by a manual approve. Three-phase
    # design (ADR-001): A=approve-request, B=initiate (human Approved), C=finalizer
    # maps the hook exit-code -> deploy_status. Non-self repos are unaffected.
    #
    #   self_deploy_enabled           -> global kill-switch; False -> no Phase A/B/C
    #                                    interception (the legacy synchronous deployer
    #                                    path runs for everyone,
    #                                    env ORCH_SELF_DEPLOY_ENABLED).
    #   self_deploy_repos             -> CSV of repos where executable self-deploy is
    #                                    REAL; empty -> only the self-hosting repo
    #                                    (orchestrator). Mirrors merge_gate_repos.
    #   deploy_require_manual_approve -> require a human Approved before the prod
    #                                    restart (BR-5). Default true; NOT toggled in
    #                                    ORCH-36 (AC-12). false -> Phase A initiates
    #                                    immediately (structural branch, off by default).
    #   deploy_finalize_delay_s       -> delay before the first finalize poll; must be
    #                                    > the hook health-loop (~60s) so the verdict
    #                                    usually exists on the first poll.
    #   deploy_finalize_max_attempts  -> bounded finalize-defer budget (anti-livelock).
    # ssh / hook target (detached prod restart; real values live on the host):
    #   deploy_ssh_user / deploy_ssh_host -> ssh target for the host hook (INFRA P-2).
    #   deploy_hook_script            -> path to the hook ON THE HOST (relative to repo).
    #   deploy_host_repo_path         -> orchestrator clone path on the host.
    # prod overrides passed to the hook for build-once (retag staging image -> prod):
    #   deploy_prod_source_image      -> image validated on staging (retagged, no rebuild).
    #   deploy_prod_target_service / _port / _image / _compose_profile -> prod profile.
    #   deploy_prod_prev_image_file   -> prod prev-image snapshot (separate from staging).
    self_deploy_enabled: bool = True
    self_deploy_repos: str = ""
    deploy_require_manual_approve: bool = True
    deploy_finalize_delay_s: int = 90
    deploy_finalize_max_attempts: int = 10
    deploy_ssh_user: str = "slin"
    deploy_ssh_host: str = ""
    deploy_hook_script: str = "scripts/orchestrator-deploy-hook.sh"
    deploy_host_repo_path: str = "/home/slin/repos/orchestrator"
    deploy_prod_source_image: str = "orchestrator-orchestrator-staging"
    deploy_prod_target_service: str = "orchestrator"
    deploy_prod_target_port: int = 8500
    deploy_prod_target_image: str = "orchestrator-orchestrator"
    deploy_prod_compose_profile: str = ""
    deploy_prod_prev_image_file: str = ".deploy-prev-image-prod"

    # ORCH-058: staging-image provenance before the BUILD-ONCE retag to prod.
    # Closes the INV-FRESH gap (ADR-001): the BUILD-ONCE retag (ORCH-36) promotes
    # the staging image to prod WITHOUT a rebuild, assuming the staging image is
    # fresh — a guarantee the pipeline never had (a stale image could be silently
    # promoted, LESSONS_ORCH-036 §4). Two complementary layers, self-hosting only:
    #   A (liveness): the QG sub-check check_staging_image_fresh rebuilds the
    #     staging image from the VALIDATED commit (worktree HEAD after merge-gate)
    #     and recreates 8501 on the deploy-staging -> deploy edge, so we validate
    #     and promote ONE artefact.
    #   B (safety): build_deploy_command passes EXPECTED_REVISION and the hook
    #     fail-closes (exit 1) if SOURCE_IMAGE's revision label != EXPECTED_REVISION
    #     before `docker tag`, making a silent stale promote structurally impossible.
    #
    #   image_freshness_enabled -> SINGLE kill-switch for the WHOLE feature (A + B
    #                              together; never "B without A" = a deadlock). False
    #                              -> legacy ORCH-36 behaviour (BUILD-ONCE, no guard,
    #                              no EXPECTED_REVISION).
    #                              Env ORCH_IMAGE_FRESHNESS_ENABLED.
    #   image_freshness_repos   -> CSV of repos where the feature is REAL; empty ->
    #                              only the self-hosting repo (orchestrator). Mirrors
    #                              self_deploy_repos / merge_gate_repos.
    image_freshness_enabled: bool = True
    image_freshness_repos: str = ""

    # ORCH-022: security-gate (secret-scanning + dependency audit) on the
    # deploy-staging -> deploy edge, run FIRST among the edge sub-gates (cheap to
    # fail before the expensive rebase/rebuild). Deterministic (no LLM): gitleaks
    # (offline secret-scan) + pip-audit (OSV/PyPI dependency audit), verdict in the
    # versioned 17-security-report.md frontmatter; FAIL -> rollback to development +
    # developer-retry (cap MAX_DEVELOPER_RETRIES). See ADR-001-security-gate.md.
    #   security_gate_enabled          -> SINGLE kill-switch; False -> pipeline 1:1 as
    #                                     before ORCH-022 for everyone.
    #                                     Env ORCH_SECURITY_GATE_ENABLED.
    #   security_gate_repos            -> CSV of repos where the gate is REAL; empty ->
    #                                     only the self-hosting repo (orchestrator).
    #                                     Mirrors merge_gate_repos / image_freshness_repos.
    #   security_dep_block_severity    -> CVE severity threshold that BLOCKS (CRITICAL >
    #                                     HIGH > MEDIUM > LOW); below it / UNKNOWN -> a
    #                                     warning only (anti-loop ADR-001 Р-4).
    #   security_scan_timeout_s        -> per external scanner call timeout (mirrors
    #                                     merge_retest_timeout_s).
    #   security_dep_audit_fail_closed -> strict mode: an unreachable CVE feed -> FAIL
    #                                     instead of the default fail-open + warning
    #                                     (Р-3). Default False (anti-loop ORCH-061).
    #   security_secrets_block         -> a found secret blocks (always True by default;
    #                                     the offline secrets guarantee is unconditional,
    #                                     BR-2).
    security_gate_enabled: bool = True
    security_gate_repos: str = ""
    security_dep_block_severity: str = "HIGH"
    security_scan_timeout_s: int = 300
    security_dep_audit_fail_closed: bool = False
    security_secrets_block: bool = True

    # ORCH-061: tolerate KNOWN sandbox-infra FAILs (C9a/C9b) in the staging suite.
    # The self-hosting deploy-staging stage looped because scripts/staging_check.py
    # exited non-zero on ANY failed check, so two infra-only failures (sandbox bot
    # accounts not members of the sandbox Plane project) produced staging_status:
    # FAILED -> rollback deploy-staging -> development -> loop.
    #   True  -> a run whose ONLY failures are allowlisted sandbox-infra checks
    #            (C9a/C9b) is waived to SUCCESS; ANY real pipeline check that fails
    #            still fails closed -> FAILED -> rollback (safety net intact, FR-4).
    #   False -> 1:1 pre-ORCH-061 strict behaviour: any FAIL -> FAILED -> rollback.
    # Default True (mirrors merge_gate_enabled / image_freshness_enabled /
    # self_deploy_enabled): the safety net holds regardless of the flag; the flag
    # exists to instantly restore legacy strictness without a code redeploy. Lives
    # in .env.staging (ORCH_ prefix) so it is reachable inside orchestrator-staging.
    # Env ORCH_STAGING_INFRA_TOLERANCE_ENABLED.
    staging_infra_tolerance_enabled: bool = True

    # ORCH-053: stuck-task reconciler (sweeper for lost webhooks). A background
    # daemon thread reconciles the "source of truth (gate / Plane) != task stage"
    # drift left behind by a dropped webhook (502 on rebuild, no Plane/Gitea
    # retries, unresolved sha->branch).
    # See docs/architecture/adr/adr-0007-reconciler.md.
    #   reconcile_enabled              -> global kill-switch (self-hosting safety,
    #                                     staged rollout, env ORCH_RECONCILE_ENABLED).
    #   reconcile_interval_s           -> background sweep period (seconds).
    #   reconcile_plane_enabled        -> separate flag for the F-2 Plane-API poll so
    #                                     only the plane branch can be muted.
    #   reconcile_grace_default_s      -> default "stuck" threshold on tasks.updated_at.
    #   reconcile_grace_overrides_json -> JSON object of per-stage thresholds, e.g.
    #                                     {"analysis": 1800, "development": 300}. Invalid
    #                                     JSON -> default (mirrors
    #                                     agent_timeout_overrides_json).
    #   reconcile_notify_unblock       -> send a Telegram message when a stuck task is
    #                                     unblocked (F-4 observability).
    #   reconcile_skip_blocked_enabled -> ORCH-060 Guard 2: skip F-1 reconciliation of
    #                                     issues a human moved to Blocked / Needs Input
    #                                     (per-candidate Plane state lookup). Disabling it
    #                                     mutes ONLY the networked Guard 2; Guard 1
    #                                     (escalated-by-retries, local + deterministic) is
    #                                     always active. Manual escape hatch during a Plane
    #                                     outage.
    reconcile_enabled: bool = True
    reconcile_interval_s: int = 120
    reconcile_plane_enabled: bool = True
    reconcile_grace_default_s: int = 600
    reconcile_grace_overrides_json: str = ""
    reconcile_notify_unblock: bool = True
    reconcile_skip_blocked_enabled: bool = True

    # ORCH-068: TTL for the per-project Plane states cache (_STATES_CACHE in
    # plane_sync). Historically the cache lived for the whole process lifetime,
    # so a status added to Plane after start was never seen without a restart
    # ("stale set -> no pipeline action"). With a TTL the entry self-heals by
    # re-fetching /states/ after it expires (invalidation reuses the existing
    # reload_project_states() primitive — no duplicated reset logic).
    #   plane_states_ttl_s (env ORCH_PLANE_STATES_TTL_S):
    #     >0 -> seconds before a cache entry is re-fetched (default 300 = 5 min);
    #      0 -> disable TTL -> strictly the previous lifetime cache (back-compat
    #           escape hatch). get_project_states return shape is unchanged.
    plane_states_ttl_s: int = 300

    # ORCH-021: post-deploy production monitoring + degradation reaction. After
    # the terminal deploy->done transition for an applicable repo, a reserved-agent
    # `post-deploy-monitor` job (no LLM, modelled on deploy-finalizer) probes prod
    # over a window and reacts to a degradation the restart-time health-check
    # missed (class "green deploy, red prod", precedent ET-8). State is in sentinel
    # files (.post-deploy-state-<repo>/<wi>/), no DB migration. See
    # docs/architecture/adr/adr-0010-post-deploy-monitor.md.
    #   post_deploy_monitor_enabled -> global kill-switch (BR-8); False -> the
    #                                  pipeline is 1:1 as before ORCH-021 (no arm).
    #   post_deploy_repos           -> CSV of repos where monitoring is REAL; empty
    #                                  -> only the self-hosting repo (orchestrator).
    #                                  Mirrors self_deploy_repos / merge_gate_repos.
    #   post_deploy_window_s        -> observation window length (~15 min, BR-1).
    #   post_deploy_interval_s      -> seconds between probe ticks.
    #   post_deploy_fail_threshold  -> N CONSECUTIVE health failures -> DEGRADED.
    #   post_deploy_5xx_threshold   -> window 5xx ratio above this -> DEGRADED.
    #   post_deploy_auto_rollback   -> globally allow auto-rollback; True acts ONLY
    #                                  for non-self repos. For self-hosting the
    #                                  reaction is ALWAYS ALERT_ONLY (BR-5) — a tick
    #                                  NEVER restarts the prod orchestrator container.
    #   post_deploy_base_url        -> base URL of the observed prod instance.
    # Rollback target params reuse the existing deploy_prod_* settings (no dupes).
    post_deploy_monitor_enabled: bool = True
    post_deploy_repos: str = ""
    post_deploy_window_s: int = 900
    post_deploy_interval_s: int = 30
    post_deploy_fail_threshold: int = 3
    post_deploy_5xx_threshold: float = 0.5
    post_deploy_auto_rollback: bool = False
    post_deploy_base_url: str = "http://localhost:8500"

    # ORCH-065: job-reaper + proactive merge-lease reclaim. A background daemon
    # thread (modelled on the reconciler) makes "the monitor thread / process died
    # while a job/lease was held" self-heal WITHOUT a restart. Status (done/queued/
    # failed) is otherwise only ever set by launcher._monitor_agent -> _finalize_job
    # inside the live process; a death there left the jobs row 'running' forever and
    # (at max_concurrency=1) wedged the queue of EVERY project (incidents 07.06: jobs
    # 236/239/242/254). The same thread proactively reclaims a stale/dead merge-lease
    # (ORCH-043) instead of waiting for the lazy TTL on the next foreign acquire. See
    # docs/architecture/adr/adr-0011-job-reaper-lease-reclaim.md.
    #   reaper_enabled          -> global kill-switch (false -> strictly prior
    #                              behaviour; only the startup requeue_running_jobs
    #                              remains).
    #   reaper_interval_s       -> background scan period (seconds).
    #   reaper_dead_ticks       -> Tier-1: consecutive ticks a job's pid must be dead
    #                              before it is reaped (>=2 anti-false-positive; a live
    #                              long-running agent is NEVER reaped).
    #   reaper_max_running_s    -> Tier-3 backstop ceiling: a job 'running' longer than
    #                              this is reaped even when liveness is unknowable.
    #                              MUST be > max agent_timeout + grace so a legit
    #                              agent is safe.
    #   reaper_finalize_grace_s -> Tier-2 anti-false-positive: a LIVE monitor writes
    #                              agent_runs.exit_code FIRST, THEN does git commit/push
    #                              + PR + Plane usage comments (seconds..minutes) and
    #                              only then _finalize_job. The agent pid is already
    #                              dead in that window, so pid cannot tell "monitor
    #                              died" from "monitor still finalizing". A job is
    #                              reaped via Tier-2 only once exit_code has been
    #                              recorded for at least this many seconds (MUST be >
    #                              the max finalization window).
    #   lease_reclaim_enabled   -> kill-switch for the proactive stale/dead lease
    #                              reclaim (false -> only the legacy lazy TTL reclaim
    #                              in acquire).
    #   (reuse) merge_lock_timeout_s -> lease TTL; merge_gate_repos -> reclaim scope.
    reaper_enabled: bool = True
    reaper_interval_s: int = 60
    reaper_dead_ticks: int = 2
    reaper_max_running_s: int = 3600
    reaper_finalize_grace_s: int = 300
    lease_reclaim_enabled: bool = True

    # ORCH-071: merge-verify under-gate on the `deploy -> done` edge. For the
    # self-hosting repo the `deploy` stage runs the DETERMINISTIC self-deploy path
    # (Phase A/B/C), where the LLM `deployer` agent — historically the ONLY actor
    # that merged the feature PR into `main` — never runs. Result: a "green" deploy
    # could reach `done` while the PR stayed `open` (phantom merge, postmortem
    # LESSONS_2026-06-08). This under-gate (an inline hook in advance_stage, NOT a
    # new STAGE_TRANSITIONS edge or registered QG) runs a deterministic merge-actor +
    # post-deploy verification before `done`: not-merged -> alert + HOLD (no done),
    # merged -> normal advance. Mirrors merge_gate_* / image_freshness_* rollout.
    #   merge_verify_enabled   -> global kill-switch; False -> strictly the prior
    #                             behaviour (no merge/verify),
    #                             env ORCH_MERGE_VERIFY_ENABLED.
    #   merge_verify_repos     -> CSV of repos where the under-gate is REAL; empty ->
    #                             only the self-hosting repo (orchestrator). Mirrors
    #                             merge_gate_repos / self_deploy_repos.
    #   merge_pr_timeout_s     -> per Gitea merge/list HTTP call timeout.
    #   merge_verify_timeout_s -> git fetch/merge-base timeout for the ancestor check.
    merge_verify_enabled: bool = True
    merge_verify_repos: str = ""
    merge_pr_timeout_s: int = 60
    merge_verify_timeout_s: int = 60

    # ORCH-026: intra-repo merge serialisation (Level A) + declarative task
    # dependencies (Level B). Level A reuses the ORCH-043/065 merge-lease window
    # (no new mechanism) — the merge-lease already serialises "merge -> main-updated"
    # per repo; the ONLY new behaviour is an unconditional pre-merge rebase. Level B
    # adds a new ADDITIVE job_deps table + a NOT EXISTS gate in claim_next_job. Both
    # features are inert without data (no applicable repo / no declared deps) ->
    # zero regression for enduro-trails.
    #   premerge_rebase_always -> Level A (A-2): when True, check_branch_mergeable
    #                             ALWAYS rebases the task branch onto the CURRENT
    #                             origin/main UNDER the merge-lease (not only when
    #                             branch_is_behind_main) — a deterministic anti-phantom
    #                             that does not depend on the ancestor check's
    #                             precision. auto_rebase_onto_main is a cheap no-op on
    #                             an already up-to-date branch (rc 0, push up-to-date,
    #                             CI not retriggered). Scope = merge_gate_repos (empty
    #                             -> self-hosting). Kill-switch (False -> exactly the
    #                             ORCH-043 behaviour: rebase only when behind).
    #                             Env ORCH_PREMERGE_REBASE_ALWAYS.
    #   task_deps_enabled      -> Level B (B-2): global kill-switch for the scheduler
    #                             dependency gate. False -> claim_next_job is 1:1 as
    #                             ORCH-1 (the NOT EXISTS clause is omitted). Inert when
    #                             job_deps is empty. Env ORCH_TASK_DEPS_ENABLED.
    #   task_deps_source       -> declaration source: db|plane|hybrid (default db).
    #                             The scheduler ALWAYS reads the DB cache (offline-safe
    #                             hot path); plane/hybrid additionally ingest Plane
    #                             `blocked-by` relations into job_deps at task creation.
    #                             Env ORCH_TASK_DEPS_SOURCE.
    premerge_rebase_always: bool = True
    task_deps_enabled: bool = True
    task_deps_source: str = "db"

    # ORCH-073 (ADR-001 Р-4): main-integrity regression guard. After the merge-verify
    # under-gate confirms the deployed SHA is an ancestor of origin/main (FR-1), a
    # secondary deterministic (no-LLM) guard checks that a declarative set of markers
    # for recently-merged tasks (MAIN_REGRESSION_MARKERS in merge_gate.py) is still
    # present in origin/main — i.e. a CHANGELOG-rebase or phantom-merge did not
    # silently roll back a neighbouring task's code. A missing marker (deterministic
    # count==0) -> ALERT + HOLD (task stays on `deploy`, NOT done); an infra/git
    # error on the grep itself -> fail-OPEN (do not block done; SHA-in-main remains
    # the primary gate).
    #   regression_guard_enabled -> kill-switch (env ORCH_REGRESSION_GUARD_ENABLED);
    #                               reuses the merge_verify_applies scope (self-hosting
    #                               / merge_verify_repos), so non-self repos are a no-op.
    regression_guard_enabled: bool = True

    # ------------------------------------------------------------------
    # Telegram notifications
    # ------------------------------------------------------------------
    telegram_bot_token: str = ""
    telegram_chat_id: str = ""

    # ORCH-042: live task-tracker mode.
    #   bump (DEFAULT since ORCH-067) -> on update the old message is deleted and
    #        the card is re-sent at the bottom of the chat (deleteMessage +
    #        sendMessage + repoint message_id), silently (disable_notification).
    #   edit -> the card is edited in place (editMessageText); available via
    #        ORCH_TRACKER_MODE=edit.
    # One card per task in both modes. An unknown/empty value is treated as
    # edit (see notifications).
    tracker_mode: str = "bump"

    # ORCH-067 (ADR Р-2/Р-3/Р-4): best-effort live overlay for the card's status
    # line. It fills in the Plane-status branches that cannot be told apart
    # offline from tasks.stage (Needs Input / Blocked / Rejected / Cancelled /
    # Deploying / Monitoring after Deploy) by reading the LIVE Plane status with a
    # short timeout and a TTL cache. The offline core (stage -> status, In Review
    # from brd-clock) always works without the network; the overlay only
    # complements it and NEVER blocks the pipeline.
    #   tracker_live_status           -> kill-switch (False -> offline core only).
    #   tracker_live_status_ttl_s     -> TTL of the per-issue live-uuid cache
    #                                    (hot-path protection).
    #   tracker_live_status_timeout_s -> timeout of a single live GET on the
    #                                    render path.
    tracker_live_status: bool = True
    tracker_live_status_ttl_s: int = 60
    tracker_live_status_timeout_s: int = 3

    # ORCH-069: QG-0 upper title-length limit (entry gate _qg0_errors). The 80-char
    # cap was a hygiene limit, not structural (slug is cut to [:30] independently,
    # DB title TEXT is unbounded). Configurable via env ORCH_QG0_TITLE_MAX; default
    # 200 (was hardcoded 80). Invalid/empty value -> default (graceful, no crash).
    qg0_title_max: int = 200

    @field_validator("qg0_title_max", mode="before")
    @classmethod
    def _qg0_title_max_default(cls, v):
        """Coerce the raw ``qg0_title_max`` input to ``int``, defaulting to 200.

        Registered with ``mode="before"``, so ``v`` is the raw input (for
        example the env string) rather than an already-validated ``int``.

        Returns:
            200 when ``v`` is ``None``, an empty or whitespace-only string, or a
            value ``int()`` cannot convert; otherwise ``int(v)``.
        """
        # Graceful (ORCH-069 AC-3): empty / non-numeric env -> default 200, the
        # process must not crash on startup. Never raises (self-hosting safety).
        # The literal 200 must stay in sync with the field default above.
        if v is None or (isinstance(v, str) and v.strip() == ""):
            return 200
        try:
            return int(v)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int(float("inf")) — without it the "never raises"
            # guarantee did not hold for an infinite float passed in code.
            return 200
        # NOTE(review): zero and negative integers pass through unchanged —
        # confirm the QG-0 gate tolerates them or clamp here.

    class Config:
        # Settings are read from env vars named ORCH_<FIELD> and from ./.env.
        env_prefix = "ORCH_"
        env_file = ".env"
|
||
|
||
|
||
# Shared module-level instance: built once when this module is imported, so the
# ORCH_-prefixed environment variables and .env are read at import time.
settings = Settings()
|