import logging import re from pydantic import Field, field_validator from pydantic_settings import BaseSettings class Settings(BaseSettings): # Plane plane_api_url: str = "http://localhost:8091" # ORCH-017: external (browser) web URL of Plane for clickable issue links in # notifications, e.g. https://plane.example.org. Falls back to plane_api_url, # but a loopback fallback (localhost/127.0.0.1) is treated as "no web URL" and # the Plane link is omitted (see notifications._build_plane_issue_link). plane_web_url: str = "" plane_api_token: str = "" plane_workspace_slug: str = "" plane_webhook_secret: str = "" plane_project_id: str = "" # Per-agent Plane bot tokens (feat: per-agent comment authorship). # When set, add_comment posts under the matching bot so Plane shows the # real author (Analyst/Architect/...). Empty -> fallback to plane_api_token. plane_bot_analyst: str = "" plane_bot_architect: str = "" plane_bot_developer: str = "" plane_bot_reviewer: str = "" plane_bot_tester: str = "" plane_bot_deployer: str = "" plane_bot_stream: str = "" # Gitea gitea_url: str = "http://localhost:3000" gitea_public_url: str = "" # external URL for clickable links in comments; falls back to gitea_url gitea_token: str = "" gitea_webhook_secret: str = "" gitea_owner: str = "admin" default_repo: str = "enduro-trails" # ORCH-6: multi-repo project registry. JSON array of # {plane_project_id, repo, work_item_prefix, name}. # Empty -> built-in default registry in src/projects.py. projects_json: str = "" # Claude CLI claude_bin: str = "/opt/claude-code/bin/claude.exe" repos_dir: str = "/repos" host_repos_dir: str = "/home/slin/repos" worktrees_dir: str = "/repos/_wt" # ORCH-2 / S-4: isolated worktree per task/branch # ORCH-087: base dir for per-run agent logs (/.log). Lifted out # of the hardcoded '/app/data/runs' so tests (and any non-container host) can point # it at a writable path; default preserves the container layout. 
runs_dir: str = "/app/data/runs" # DB db_path: str = "/app/data/orchestrator.db" # ORCH-101 (replication foundation, ADR-001 D2/D4): host-parametrization keys. # config.py is the ONLY legitimate home of host-specific literals in src/** # (BR-1); every default below equals the current production value, so an # absent/unchanged .env keeps behaviour byte-for-byte (BR-5, kill-switch # nature — no extra flag is introduced, NFR-2). # agent_home_dir -> HOME of every actor subprocess env (agent CLI Popen + # git commit/push in agents/launcher, self-deploy # finalizer, post-deploy monitor). The SAME env name is # interpolated by docker-compose.yml as the target of # the .claude/.claude.json/.ssh mounts and wired into # Dockerfile ARG APP_HOME — one env name per fact (D1); # the ORCH-040 uid/HOME/mounts group moves together. # Env ORCH_AGENT_HOME_DIR. # agent_git_name -> GIT_AUTHOR/COMMITTER_NAME of agent commits (the # customer-visible identity). Env ORCH_AGENT_GIT_NAME. # git_email_domain -> domain of ALL actor git emails, built as # f"{name}@{git_email_domain}"; name = agent_git_name # for agents, and the PLATFORM literals # deploy-finalizer / post-deploy-monitor for system # actors (their names are not host-specific, D2). # Env ORCH_GIT_EMAIL_DOMAIN. # staging_port -> port of the staging instance (8501). Replaces the # module constant image_freshness._STAGING_PORT; the # SAME env name is interpolated into the staging # compose `command:` so both readers see one fact (D1). # Fail-closed guard in check_staging_image_fresh: # staging_port == deploy_prod_target_port -> the # freshness path REFUSES to run (ORCH-058 AC-9 made # executable, D4). Env ORCH_STAGING_PORT. agent_home_dir: str = "/home/slin" agent_git_name: str = "claude-bot" git_email_domain: str = "mva154.local" staging_port: int = 8501 # ORCH-1 (F-2b): persistent job queue / background worker. 
# max_concurrency -> max agent jobs running in parallel (env ORCH_MAX_CONCURRENCY) # queue_poll_interval -> worker loop poll seconds (env ORCH_QUEUE_POLL_INTERVAL) max_concurrency: int = 1 queue_poll_interval: float = 2.0 # ORCH-1b (resilience): preflight + 429/rate-limit + backoff + circuit breaker. # preflight_cache_ttl -> cache the cheap CLI/network preflight result (seconds); # the worker does NOT re-run `claude --version` more often # than this (env ORCH_PREFLIGHT_CACHE_TTL). # backoff_base_seconds -> base for exponential transient backoff. # backoff_max_seconds -> ceiling for the transient backoff. # transient_max_attempts -> retry budget for transient (429/overload/network) # failures, separate from code-fault `attempts`. # breaker_threshold -> consecutive transient failures that OPEN the breaker. # breaker_pause_seconds -> how long the breaker stays open before half-open. preflight_cache_ttl: int = 45 backoff_base_seconds: int = 10 backoff_max_seconds: int = 600 transient_max_attempts: int = 5 breaker_threshold: int = 3 breaker_pause_seconds: int = 300 # ORCH-7 (M-2): agent timeout + graceful kill. # agent_timeout_seconds -> default per-agent wall-clock budget; the watchdog # kills the run after this (env ORCH_AGENT_TIMEOUT_SECONDS). # agent_kill_grace_seconds-> pause between SIGTERM and SIGKILL so claude can # flush artifacts before the hard kill # (env ORCH_AGENT_KILL_GRACE_SECONDS). # agent_timeout_overrides_json -> optional per-agent override JSON object, # e.g. {"reviewer": 3600, "architect": 2700} # (env ORCH_AGENT_TIMEOUT_OVERRIDES_JSON). agent_timeout_seconds: int = 1800 agent_kill_grace_seconds: int = 20 agent_timeout_overrides_json: str = "" # ORCH-41: per-agent LLM model. Empty -> agent_model_default. Resolution order: # project-override (projects_json agent_models) > ORCH_AGENT_MODEL_ > # agent_model_default > CLI default (no --model flag). Default is 4-8 because # 4-7 == 4-8 in price (Slava 05.06); do NOT hardcode the version anywhere else. 
agent_model_default: str = "claude-opus-4-8" agent_model_analyst: str = "" agent_model_architect: str = "" agent_model_developer: str = "" agent_model_reviewer: str = "" agent_model_tester: str = "" agent_model_deployer: str = "" # ORCH-41: per-agent effort / reasoning level: low|medium|high|xhigh|max. # Empty -> agent_effort_default. Same resolution order as model. Default split # (ORCH-081/ORCH-52h): thinking agents (analyst/architect/reviewer) -> high; # developer -> xhigh (coding/agentic role, Opus 4.8 canon); mechanical agents # (tester/deployer) -> medium. These class-defaults are ALSO the per-role floor # used by resolve_agent_effort when the env is empty (single source of truth). agent_effort_default: str = "high" agent_effort_analyst: str = "high" agent_effort_architect: str = "high" agent_effort_developer: str = "xhigh" agent_effort_reviewer: str = "high" agent_effort_tester: str = "medium" agent_effort_deployer: str = "medium" # ORCH-41: optional per-agent fallback model used when the primary is # overloaded (--fallback-model, works with --print). Empty -> no flag. agent_fallback_model: str = "" # L-2: run-log rotation. Old per-run logs in /runs/*.log are pruned at # app startup (best-effort). A *.log is removed if it is older than # log_keep_days OR not within the log_keep_max most-recent logs (whichever # hits first). Only *.log files are touched; the active run log is skipped. # log_keep_days -> max age in days (env ORCH_LOG_KEEP_DAYS). # log_keep_max -> max number of newest logs to retain (env ORCH_LOG_KEEP_MAX). log_keep_days: int = 30 log_keep_max: int = 500 # ORCH-045: quality-gate CI poll/retry. check_ci_green polls the Gitea # combined commit status up to ci_poll_max_attempts times, sleeping # ci_poll_interval_s between attempts, to ride out a transient pending # state right after the developer push (race fix, see ORCH-017). 
# ci_poll_max_attempts -> max status polls (env ORCH_CI_POLL_MAX_ATTEMPTS) # ci_poll_interval_s -> seconds between polls (env ORCH_CI_POLL_INTERVAL_S) ci_poll_max_attempts: int = 12 ci_poll_interval_s: int = 10 # ORCH-043: merge-gate (auto-rebase + re-test + merge-lock) on the # deploy-staging -> deploy edge. A deterministic sub-gate (no LLM) that # catches the up-to-date branch up to the CURRENT origin/main, re-tests it, # and serialises merges so two green branches can't break main. # merge_gate_enabled -> global kill-switch; False -> no-op pass for the # whole gate (staged rollout, env ORCH_MERGE_GATE_ENABLED). # merge_gate_repos -> CSV of repos where the gate is REAL; empty means # only the self-hosting repo (orchestrator). Other # repos -> conditional no-op (mirrors ORCH-35 staging). # merge_retest_timeout_s -> wall-clock budget for the post-rebase re-test. # merge_retest_target -> pytest target for the re-test (portability across repos). # merge_lock_timeout_s -> max lease age; an older lease is reclaimed (crash backstop). # merge_defer_delay_s -> delay before re-running the gate when the lock is busy. # merge_defer_max_attempts -> defer retries before escalation (avoids livelock). merge_gate_enabled: bool = True merge_gate_repos: str = "" merge_retest_timeout_s: int = 600 merge_retest_target: str = "tests/" merge_lock_timeout_s: int = 300 merge_defer_delay_s: int = 60 merge_defer_max_attempts: int = 5 # ORCH-036: executable self-deploy (deploy stage drives the host hook). # The `deploy` stage for the self-hosting repo is turned into a REAL prod # restart via a detached host process, gated by a manual approve. Three-phase # design (ADR-001): A=approve-request, B=initiate (human Approved), C=finalizer # maps the hook exit-code -> deploy_status. Non-self repos are unaffected. # # self_deploy_enabled -> global kill-switch; False -> no Phase A/B/C # interception (the legacy synchronous deployer # path runs for everyone, env ORCH_SELF_DEPLOY_ENABLED). 
# self_deploy_repos -> CSV of repos where executable self-deploy is # REAL; empty -> only the self-hosting repo # (orchestrator). Mirrors merge_gate_repos. # deploy_require_manual_approve -> require a human Approved before the prod # restart (BR-5). Default true; NOT toggled in # ORCH-36 (AC-12). false -> Phase A initiates # immediately (structural branch, off by default). # deploy_finalize_delay_s -> delay before the first finalize poll; must be # > the hook health-loop (~60s) so the verdict # usually exists on the first poll. # deploy_finalize_max_attempts -> bounded finalize-defer budget (anti-livelock). # ssh / hook target (detached prod restart; real values live on the host): # deploy_ssh_user / deploy_ssh_host -> ssh target for the host hook (INFRA P-2). # deploy_hook_script -> path to the hook ON THE HOST (relative to repo). # deploy_host_repo_path -> orchestrator clone path on the host. # prod overrides passed to the hook for build-once (retag staging image -> prod): # deploy_prod_source_image -> image validated on staging (retagged, no rebuild). # deploy_prod_target_service / _port / _image / _compose_profile -> prod profile. # deploy_prod_prev_image_file -> prod prev-image snapshot (separate from staging). self_deploy_enabled: bool = True self_deploy_repos: str = "" deploy_require_manual_approve: bool = True deploy_finalize_delay_s: int = 90 deploy_finalize_max_attempts: int = 10 deploy_ssh_user: str = "slin" deploy_ssh_host: str = "" deploy_hook_script: str = "scripts/orchestrator-deploy-hook.sh" deploy_host_repo_path: str = "/home/slin/repos/orchestrator" deploy_prod_source_image: str = "orchestrator-orchestrator-staging" deploy_prod_target_service: str = "orchestrator" deploy_prod_target_port: int = 8500 deploy_prod_target_image: str = "orchestrator-orchestrator" deploy_prod_compose_profile: str = "" deploy_prod_prev_image_file: str = ".deploy-prev-image-prod" # ORCH-058: staging-image provenance before the BUILD-ONCE retag to prod. 
# Closes the INV-FRESH gap (ADR-001): the BUILD-ONCE retag (ORCH-36) promotes # the staging image to prod WITHOUT a rebuild, assuming the staging image is # fresh — a guarantee the pipeline never had (a stale image could be silently # promoted, LESSONS_ORCH-036 §4). Two complementary layers, self-hosting only: # A (liveness): the QG sub-check check_staging_image_fresh rebuilds the # staging image from the VALIDATED commit (worktree HEAD after merge-gate) # and recreates 8501 on the deploy-staging -> deploy edge, so we validate # and promote ONE artefact. # B (safety): build_deploy_command passes EXPECTED_REVISION and the hook # fail-closes (exit 1) if SOURCE_IMAGE's revision label != EXPECTED_REVISION # before `docker tag`, making a silent stale promote structurally impossible. # # image_freshness_enabled -> SINGLE kill-switch for the WHOLE feature (A + B # together; never "B without A" = a deadlock). False # -> legacy ORCH-36 behaviour (BUILD-ONCE, no guard, # no EXPECTED_REVISION). Env ORCH_IMAGE_FRESHNESS_ENABLED. # image_freshness_repos -> CSV of repos where the feature is REAL; empty -> # only the self-hosting repo (orchestrator). Mirrors # self_deploy_repos / merge_gate_repos. image_freshness_enabled: bool = True image_freshness_repos: str = "" # ORCH-022: security-gate (secret-scanning + dependency audit) on the # deploy-staging -> deploy edge, run FIRST among the edge sub-gates (cheap to # fail before the expensive rebase/rebuild). Deterministic (no LLM): gitleaks # (offline secret-scan) + pip-audit (OSV/PyPI dependency audit), verdict in the # versioned 17-security-report.md frontmatter; FAIL -> rollback to development + # developer-retry (cap MAX_DEVELOPER_RETRIES). See ADR-001-security-gate.md. # security_gate_enabled -> SINGLE kill-switch; False -> pipeline 1:1 as # before ORCH-022 for everyone. Env # ORCH_SECURITY_GATE_ENABLED. # security_gate_repos -> CSV of repos where the gate is REAL; empty -> # only the self-hosting repo (orchestrator). 
# Mirrors merge_gate_repos / image_freshness_repos. # security_dep_block_severity -> CVE severity threshold that BLOCKS (CRITICAL > # HIGH > MEDIUM > LOW); below it / UNKNOWN -> a # warning only (anti-loop ADR-001 Р-4). # security_scan_timeout_s -> per external scanner call timeout (mirrors # merge_retest_timeout_s). # security_dep_audit_fail_closed -> strict mode: an unreachable CVE feed -> FAIL # instead of the default fail-open + warning # (Р-3). Default False (anti-loop ORCH-061). # security_secrets_block -> a found secret blocks (always True by default; # the offline secrets guarantee is unconditional, # BR-2). security_gate_enabled: bool = True security_gate_repos: str = "" security_dep_block_severity: str = "HIGH" security_scan_timeout_s: int = 300 security_dep_audit_fail_closed: bool = False security_secrets_block: bool = True # ORCH-027: deterministic test-coverage gate on the deploy-staging -> deploy edge # (AFTER the merge-gate, BEFORE image-freshness). Measures line coverage of src/ # under pytest-cov in the per-branch worktree, compares to an absolute floor and/or # the ratchet baseline of `main`, and FAILs (rollback to development + developer # retry) on degradation. Leaf src/coverage_gate.py (never-raise); machine verdict in # 18-coverage-report.md frontmatter (coverage_status:). See ADR-001-coverage-gate.md. # coverage_gate_enabled -> SINGLE kill-switch; False -> pipeline 1:1 as before # ORCH-027 for everyone. Env ORCH_COVERAGE_GATE_ENABLED. # coverage_gate_repos -> CSV of repos where the gate is REAL; empty -> only # the self-hosting repo (orchestrator). Mirrors # security_gate_repos / image_freshness_repos. # coverage_min_percent -> absolute floor (% line coverage) for policy # absolute/both. Default 0.0 -> safe rollout: the # ratchet baseline drives no-regression, the floor # never false-fails day one. # coverage_policy -> absolute | baseline | both (default both): which # condition(s) must hold (D3). 
# coverage_epsilon -> small non-negative noise tolerance (%) so jitter at # the boundary does not bounce a task (NFR-4). # coverage_tool_fail_closed -> strict mode: a coverage-tool error -> FAIL instead # of the default fail-open + warning (FR-6). Default # False (anti-loop, precedent ORCH-061/022). # coverage_run_timeout_s -> wall-clock budget for the pytest --cov run (mirrors # merge_retest_timeout_s / security_scan_timeout_s). coverage_gate_enabled: bool = True coverage_gate_repos: str = "" coverage_min_percent: float = 0.0 coverage_policy: str = "both" coverage_epsilon: float = 0.5 coverage_tool_fail_closed: bool = False coverage_run_timeout_s: int = 900 # ORCH-098 (FND/F2): machine lessons-journal — additive `lessons` table + leaf # src/lessons.py (never-raise observer, modelled on serial_gate/coverage_gate/ # metrics). The journal is an OBSERVER, never a Quality Gate: writing a lesson # never influences any repo's pipeline, so — UNLIKE the gate leaves — it has NO # `*_repos` scope (it records lessons about ANY repo, incl. enduro-trails; the # repo cut lives on the READ side, get(repo=...)). The only regulator is a single # global kill-switch (ADR-001 D2). See ADR-001-lessons-journal.md / adr-0033. # lessons_enabled -> SINGLE kill-switch (env ORCH_LESSONS_ENABLED). # False -> record/get/update/snapshot inert (no DB # access), endpoints return {"enabled": false}, # auto-record injections no-op. Default True. # lessons_query_limit_default-> default LIMIT for GET /lessons / get() when the # caller passes none. # lessons_dedup_window_s -> auto-record dedup window (s): a second auto lesson # with the same (work_item_id, lesson_type, stage) # inside this window is suppressed (D4). Manual # records are never deduped. Default 3600 (1h). lessons_enabled: bool = True lessons_query_limit_default: int = 100 lessons_dedup_window_s: int = 3600 # ORCH-057: legacy root-owned file ownership detect + actionable worktree error # (follow-up ORCH-040).
Three additive, kill-switch-reversible layers: (1) an # actionable RuntimeError in git_worktree.ensure_worktree when a worktree fails # to be created because of legacy root-owned files (Permission denied), (2) a # cheap, TTL-cached, never-raise detect leaf src/fs_normalize.py that finds files # with uid != target_uid across the infra roots (/repos/_wt, /.git, data/runs) # and surfaces a startup WARNING/Telegram + GET /queue fs_ownership block, (3) an # opt-in chown (normalize) ONLY when the process has CAP_CHOWN/root (under uid 1000 # a no-op + honest log; the real fix is the operator procedure in INFRA.md). No # STAGE_TRANSITIONS / QG_CHECKS / check_* / machine-verdict / schema change. See # ADR-001-legacy-ownership-normalization.md / adr-0031. # fs_normalize_enabled -> SINGLE kill-switch; False -> all code inert, behaviour # 1:1 as before ORCH-057 (the actionable error too). # Env ORCH_FS_NORMALIZE_ENABLED. # fs_normalize_repos -> CSV of repos the layer is REAL for; empty -> only the # self-hosting repo (orchestrator). Mirrors coverage_gate_repos. # fs_target_uid -> target uid fallback when os.getuid() is unavailable. # fs_normalize_auto -> detect-only (False) | attempt chown when privileged (True). # fs_scan_roots -> CSV override of the scan roots (empty -> default roots). # fs_scan_cache_ttl_s -> TTL of the detect cache (mirrors preflight_cache_ttl). fs_normalize_enabled: bool = True fs_normalize_repos: str = "" fs_target_uid: int = 1000 fs_normalize_auto: bool = False fs_scan_roots: str = "" fs_scan_cache_ttl_s: int = 300 # ORCH-061: tolerate KNOWN sandbox-infra FAILs (C9a/C9b) in the staging suite. # The self-hosting deploy-staging stage looped because scripts/staging_check.py # exited non-zero on ANY failed check, so two infra-only failures (sandbox bot # accounts not members of the sandbox Plane project) produced staging_status: # FAILED -> rollback deploy-staging -> development -> loop. 
# True -> a run whose ONLY failures are allowlisted sandbox-infra checks # (C9a/C9b) is waived to SUCCESS; ANY real pipeline check that fails # still fails closed -> FAILED -> rollback (safety net intact, FR-4). # False -> 1:1 pre-ORCH-061 strict behaviour: any FAIL -> FAILED -> rollback. # Default True (mirrors merge_gate_enabled / image_freshness_enabled / # self_deploy_enabled): the safety net holds regardless of the flag; the flag # exists to instantly restore legacy strictness without a code redeploy. Lives # in .env.staging (ORCH_ prefix) so it is reachable inside orchestrator-staging. # Env ORCH_STAGING_INFRA_TOLERANCE_ENABLED. staging_infra_tolerance_enabled: bool = True # ORCH-053: stuck-task reconciler (sweeper for lost webhooks). A background # daemon thread reconciles the "source of truth (gate / Plane) != task stage" # drift left behind by a dropped webhook (502 on rebuild, no Plane/Gitea # retries, unresolved sha->branch). See docs/architecture/adr/adr-0007-reconciler.md. # reconcile_enabled -> global kill-switch (self-hosting safety, # staged rollout, env ORCH_RECONCILE_ENABLED). # reconcile_interval_s -> background sweep period (seconds). # reconcile_plane_enabled -> separate flag for the F-2 Plane-API poll so # only the plane branch can be muted. # reconcile_grace_default_s -> default "stuck" threshold on tasks.updated_at. # reconcile_grace_overrides_json -> JSON object of per-stage thresholds, e.g. # {"analysis": 1800, "development": 300}. Invalid # JSON -> default (mirrors agent_timeout_overrides_json). # reconcile_notify_unblock -> send a Telegram message when a stuck task is # unblocked (F-4 observability). # reconcile_skip_blocked_enabled -> ORCH-060 Guard 2: skip F-1 reconciliation of # issues a human moved to Blocked / Needs Input # (per-candidate Plane state lookup). Disabling it # mutes ONLY the networked Guard 2; Guard 1 # (escalated-by-retries, local + deterministic) is # always active. Manual escape hatch during a Plane # outage. 
reconcile_enabled: bool = True reconcile_interval_s: int = 120 reconcile_plane_enabled: bool = True reconcile_grace_default_s: int = 600 reconcile_grace_overrides_json: str = "" reconcile_notify_unblock: bool = True reconcile_skip_blocked_enabled: bool = True # ORCH-068: TTL for the per-project Plane states cache (_STATES_CACHE in # plane_sync). Historically the cache lived for the whole process lifetime, # so a status added to Plane after start was never seen without a restart # ("stale set -> no pipeline action"). With a TTL the entry self-heals by # re-fetching /states/ after it expires (invalidation reuses the existing # reload_project_states() primitive — no duplicated reset logic). # plane_states_ttl_s (env ORCH_PLANE_STATES_TTL_S): # >0 -> seconds before a cache entry is re-fetched (default 300 = 5 min); # 0 -> disable TTL -> strictly the previous lifetime cache (back-compat # escape hatch). get_project_states return shape is unchanged. plane_states_ttl_s: int = 300 # ORCH-021: post-deploy production monitoring + degradation reaction. After # the terminal deploy->done transition for an applicable repo, a reserved-agent # `post-deploy-monitor` job (no LLM, modelled on deploy-finalizer) probes prod # over a window and reacts to a degradation the restart-time health-check # missed (class "green deploy, red prod", precedent ET-8). State is in sentinel # files (.post-deploy-state-//), no DB migration. See # docs/architecture/adr/adr-0010-post-deploy-monitor.md. # post_deploy_monitor_enabled -> global kill-switch (BR-8); False -> the # pipeline is 1:1 as before ORCH-021 (no arm). # post_deploy_repos -> CSV of repos where monitoring is REAL; empty # -> only the self-hosting repo (orchestrator). # Mirrors self_deploy_repos / merge_gate_repos. # post_deploy_window_s -> observation window length (~15 min, BR-1). # post_deploy_interval_s -> seconds between probe ticks. # post_deploy_fail_threshold -> N CONSECUTIVE health failures -> DEGRADED. 
# post_deploy_5xx_threshold -> window 5xx ratio above this -> DEGRADED. # post_deploy_auto_rollback -> globally allow auto-rollback; True acts ONLY # for non-self repos. For self-hosting the # reaction is ALWAYS ALERT_ONLY (BR-5) — a tick # NEVER restarts the prod orchestrator container. # post_deploy_base_url -> base URL of the observed prod instance. # Rollback target params reuse the existing deploy_prod_* settings (no dupes). post_deploy_monitor_enabled: bool = True post_deploy_repos: str = "" post_deploy_window_s: int = 900 post_deploy_interval_s: int = 30 post_deploy_fail_threshold: int = 3 post_deploy_5xx_threshold: float = 0.5 post_deploy_auto_rollback: bool = False post_deploy_base_url: str = "http://localhost:8500" # ORCH-065: job-reaper + proactive merge-lease reclaim. A background daemon # thread (modelled on the reconciler) makes "the monitor thread / process died # while a job/lease was held" self-heal WITHOUT a restart. Status (done/queued/ # failed) is otherwise only ever set by launcher._monitor_agent -> _finalize_job # inside the live process; a death there left the jobs row 'running' forever and # (at max_concurrency=1) wedged the queue of EVERY project (incidents 07.06: jobs # 236/239/242/254). The same thread proactively reclaims a stale/dead merge-lease # (ORCH-043) instead of waiting for the lazy TTL on the next foreign acquire. See # docs/architecture/adr/adr-0011-job-reaper-lease-reclaim.md. # reaper_enabled -> global kill-switch (false -> strictly prior behaviour; # only the startup requeue_running_jobs remains). # reaper_interval_s -> background scan period (seconds). # reaper_dead_ticks -> Tier-1: consecutive ticks a job's pid must be dead # before it is reaped (>=2 anti-false-positive; a live # long-running agent is NEVER reaped). # reaper_max_running_s -> Tier-3 backstop ceiling: a job 'running' longer than # this is reaped even when liveness is unknowable. MUST be # > max agent_timeout + grace so a legit agent is safe. 
# reaper_finalize_grace_s -> Tier-2 anti-false-positive: a LIVE monitor writes # agent_runs.exit_code FIRST, THEN does git commit/push + # PR + Plane usage comments (seconds..minutes) and only # then _finalize_job. The agent pid is already dead in # that window, so pid cannot tell "monitor died" from # "monitor still finalizing". A job is reaped via Tier-2 # only once exit_code has been recorded for at least this # many seconds (MUST be > the max finalization window). # lease_reclaim_enabled -> kill-switch for the proactive stale/dead lease reclaim # (false -> only the legacy lazy TTL reclaim in acquire). # (reuse) merge_lock_timeout_s -> lease TTL; merge_gate_repos -> reclaim scope. reaper_enabled: bool = True reaper_interval_s: int = 60 reaper_dead_ticks: int = 2 reaper_max_running_s: int = 3600 reaper_finalize_grace_s: int = 300 lease_reclaim_enabled: bool = True # ORCH-063: disk-watchdog — background heartbeat that measures host-FS fill via # the mounted bind-paths and Telegram-alerts the operator at >= threshold. On # 07.06.2026 the mva154 host disk silently hit 100% and stalled the WHOLE # self-hosting pipeline; the watchdog is the missing proactive signal. Modelled # on reconciler/job_reaper (daemon thread, start/stop in main.lifespan, /queue # snapshot, never-raise). Anti-spam state is in-memory (no DB migration). # disk_monitor_enabled -> kill-switch; False -> the daemon does not start # (zero regression), env ORCH_DISK_MONITOR_ENABLED. # disk_monitor_interval_s -> heartbeat measurement period, seconds (order of # minutes; cheap shutil.disk_usage, no df subprocess). # disk_monitor_threshold_pct -> fill % that triggers the alert (Owner-fixed 85). # disk_monitor_realert_s -> min interval between repeat alerts while still # above threshold (anti-spam cooldown, ~6h). # disk_monitor_paths -> CSV of monitored HOST bind-paths (NOT overlay /); # empty -> the default set (/repos, /app/data). 
# Defensive validation (ADR-001 D7): threshold out of 1..100 or a non-positive
# interval -> default + warning (the process never crashes on a bad env value).
disk_monitor_enabled: bool = True
disk_monitor_interval_s: int = 300
disk_monitor_threshold_pct: int = 85
disk_monitor_realert_s: int = 21600
disk_monitor_paths: str = "/repos,/app/data"


@field_validator(
    "disk_monitor_interval_s", "disk_monitor_realert_s", mode="before"
)
@classmethod
def _disk_positive_int(cls, v, info):
    """Coerce a disk-watchdog interval to a positive int, else the field default.

    Registered with mode="before", so it sees the raw (env) value and replaces
    a bad one instead of letting validation fail.

    Args:
        v: raw value for the field (typically a string from the environment).
        info: validation info; ``info.field_name`` selects the fallback.

    Returns:
        ``int(v)`` when it is > 0; otherwise the field's default
        (300 for the interval, 21600 for the re-alert cooldown).
        None / blank string is treated as "unset" and falls back silently;
        a non-numeric or non-positive value falls back with a warning.
    """
    _defaults = {"disk_monitor_interval_s": 300, "disk_monitor_realert_s": 21600}
    fallback = _defaults.get(info.field_name, 1)
    if v is None or (isinstance(v, str) and v.strip() == ""):
        return fallback
    log = logging.getLogger("orchestrator.config")
    try:
        iv = int(v)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")). A typo such as "5m" must be
        # visible to the operator, not swallowed.
        log.warning(
            "%s must be an integer > 0, got %r; falling back to %s",
            info.field_name, v, fallback,
        )
        return fallback
    if iv <= 0:
        log.warning(
            "%s must be > 0, got %s; falling back to %s",
            info.field_name, v, fallback,
        )
        return fallback
    return iv


@field_validator("disk_monitor_threshold_pct", mode="before")
@classmethod
def _disk_threshold_pct(cls, v):
    """Coerce the disk-fill alert threshold to an int percentage in 1..100.

    Args:
        v: raw value for ``disk_monitor_threshold_pct``.

    Returns:
        ``int(v)`` when 1 <= value <= 100; otherwise the default 85.
        None / blank string falls back silently; a non-numeric or
        out-of-range value falls back with a warning.
    """
    if v is None or (isinstance(v, str) and v.strip() == ""):
        return 85
    log = logging.getLogger("orchestrator.config")
    try:
        iv = int(v)
    except (TypeError, ValueError, OverflowError):
        log.warning(
            "disk_monitor_threshold_pct must be an integer in 1..100, "
            "got %r; using 85", v
        )
        return 85
    if 1 <= iv <= 100:
        return iv
    log.warning(
        "disk_monitor_threshold_pct must be 1..100, got %s; using 85", v
    )
    return 85


# ORCH-062: build-cache-pruner — the "second half" of the disk-watchdog
# (ORCH-063): watchdog SIGNALS, pruner CLEANS. A background daemon thread
# modelled 1:1 on disk_watchdog (start/stop in main.lifespan, /queue snapshot,
# never-raise, kill-switch) that periodically runs `docker builder prune` on
# the HOST over ssh (the container ships no docker CLI — same channel as
# image_freshness/self_deploy).
# Touches ONLY the BuildKit build cache: never
# images/containers of running services, never restarts the docker daemon or
# the prod container (self-hosting safety). State (last run / result) is
# in-memory, best-effort — no DB migration. ADR-001 D1..D7.
#   build_cache_prune_enabled       -> kill-switch; False -> daemon does not
#                                      start (1:1 as before), env *_ENABLED.
#   build_cache_prune_interval_s    -> tick period, seconds (order of hours).
#   build_cache_prune_until         -> retention age for warm cache
#                                      (`docker builder prune --filter until=<value>`).
#   build_cache_prune_all           -> add `-a` (ALWAYS paired with until).
#   build_cache_prune_timeout_s     -> bound on the ssh command, seconds.
#   build_cache_prune_notify_min_gb -> Telegram when reclaimed >= N GB; 0 -> silent.
# Defensive validation (ADR-001 D4): a non-positive / non-numeric interval or
# timeout -> default + warning; an `until` not matching ^\d+[smhdw]?$ -> "24h";
# a negative notify threshold -> 0. A bad env value NEVER crashes the start.
build_cache_prune_enabled: bool = True
build_cache_prune_interval_s: int = 21600
build_cache_prune_until: str = "24h"
build_cache_prune_all: bool = False
build_cache_prune_timeout_s: int = 120
build_cache_prune_notify_min_gb: float = 0.0


@field_validator(
    "build_cache_prune_interval_s", "build_cache_prune_timeout_s", mode="before"
)
@classmethod
def _bcp_positive_int(cls, v, info):
    """Coerce a pruner interval/timeout to a positive int, else the field default.

    Registered with mode="before", so it sees the raw (env) value and replaces
    a bad one instead of letting validation fail.

    Args:
        v: raw value for the field (typically a string from the environment).
        info: validation info; ``info.field_name`` selects the fallback.

    Returns:
        ``int(v)`` when it is > 0; otherwise the field's default
        (21600 for the interval, 120 for the timeout). None / blank string is
        treated as "unset" and falls back silently; a non-numeric or
        non-positive value falls back with a warning (ADR-001 D4).
    """
    _defaults = {
        "build_cache_prune_interval_s": 21600,
        "build_cache_prune_timeout_s": 120,
    }
    fallback = _defaults.get(info.field_name, 1)
    if v is None or (isinstance(v, str) and v.strip() == ""):
        return fallback
    log = logging.getLogger("orchestrator.config")
    try:
        iv = int(v)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")). The documented contract is
        # "default + warning" for non-numeric input, so log it.
        log.warning(
            "%s must be an integer > 0, got %r; falling back to %s",
            info.field_name, v, fallback,
        )
        return fallback
    if iv <= 0:
        log.warning(
            "%s must be > 0, got %s; falling back to %s",
            info.field_name, v, fallback,
        )
        return fallback
    return iv


@field_validator("build_cache_prune_until", mode="before")
@classmethod
def _bcp_until(cls, v):
    """Validate the `until` retention filter: ASCII digits + optional s/m/h/d/w.

    Args:
        v: raw value for ``build_cache_prune_until``.

    Returns:
        The stripped value when it matches; otherwise the safe default "24h"
        (keeps warm cache, BR-2). None falls back silently; anything else
        that does not match (including a blank string) falls back with a
        warning.
    """
    if v is None:
        return "24h"
    try:
        s = str(v).strip()
    except (TypeError, ValueError):
        return "24h"
    # re.ASCII: without it `\d` also accepts non-ASCII decimal digits, which
    # must not reach the command line built from this value.
    # NOTE(review): Go's time.ParseDuration has no `d`/`w` units — confirm how
    # the pruner / docker treats `d`, `w` and unit-less values.
    if re.fullmatch(r"\d+[smhdw]?", s, flags=re.ASCII):
        return s
    logging.getLogger("orchestrator.config").warning(
        "build_cache_prune_until must match ^\\d+[smhdw]?$, got %r; using 24h", v
    )
    return "24h"


@field_validator("build_cache_prune_notify_min_gb", mode="before")
@classmethod
def _bcp_notify_min_gb(cls, v):
    """Coerce the notify threshold (GB) to a non-negative float.

    Args:
        v: raw value for ``build_cache_prune_notify_min_gb``.

    Returns:
        ``float(v)`` when it is >= 0; otherwise 0.0 (notifications silent).
        None / blank string falls back silently; a negative, NaN or
        non-numeric value falls back with a warning.
    """
    if v is None or (isinstance(v, str) and v.strip() == ""):
        return 0.0
    log = logging.getLogger("orchestrator.config")
    try:
        fv = float(v)
    except (TypeError, ValueError, OverflowError):
        log.warning(
            "build_cache_prune_notify_min_gb must be a number >= 0, "
            "got %r; using 0", v
        )
        return 0.0
    if fv >= 0:
        return fv
    # Reached for negatives and for NaN (NaN >= 0 is False).
    log.warning(
        "build_cache_prune_notify_min_gb must be >= 0, got %s; using 0", v
    )
    return 0.0


# ORCH-071: merge-verify under-gate on the `deploy -> done` edge. For the
# self-hosting repo the `deploy` stage runs the DETERMINISTIC self-deploy path
# (Phase A/B/C), where the LLM `deployer` agent — historically the ONLY actor
# that merged the feature PR into `main` — never runs. Result: a "green" deploy
# could reach `done` while the PR stayed `open` (phantom merge, postmortem
# LESSONS_2026-06-08). This under-gate (an inline hook in advance_stage, NOT a
# new STAGE_TRANSITIONS edge or registered QG) runs a deterministic merge-actor +
# post-deploy verification before `done`: not-merged -> alert + HOLD (no done),
# merged -> normal advance. Mirrors merge_gate_* / image_freshness_* rollout.
# merge_verify_enabled -> global kill-switch; False -> strictly the prior # behaviour (no merge/verify), env ORCH_MERGE_VERIFY_ENABLED. # merge_verify_repos -> CSV of repos where the under-gate is REAL; empty -> # only the self-hosting repo (orchestrator). Mirrors # merge_gate_repos / self_deploy_repos. # merge_pr_timeout_s -> per Gitea merge/list HTTP call timeout. # merge_verify_timeout_s-> git fetch/merge-base timeout for the ancestor check. merge_verify_enabled: bool = True merge_verify_repos: str = "" merge_pr_timeout_s: int = 60 merge_verify_timeout_s: int = 60 # ORCH-093: deterministic merge-actor retry of TRANSIENT Gitea merge errors. # The incident ORCH-063 had a green self-deploy + an open, mergeable PR, yet # POST /pulls/{n}/merge returned HTTP 405 ("Please try again later") because # Gitea was still recomputing `mergeable` right after the push — the one-shot # merge_pr returned False, the ORCH-071/081 backstop HELD the task on `deploy`, # and a human had to re-merge by hand. merge_pr now wraps ONLY the mutating # POST in a bounded exponential-backoff retry-loop on TRANSIENT outcomes # (405/408/5xx/network-timeout, and 409|422 while the PR is still mergeable); # TERMINAL outcomes (403/404/real conflict) -> fast honest False (the HOLD # protection is unchanged). Mirrors the ci_poll_* idiom of check_ci_green. # merge_retry_enabled -> kill-switch; False -> exactly one POST # (byte-for-byte the prior one-shot behaviour, # env ORCH_MERGE_RETRY_ENABLED). # merge_retry_max_attempts -> max POST attempts on a transient outcome # (env ORCH_MERGE_RETRY_MAX_ATTEMPTS). # merge_retry_backoff_base_s -> exponential backoff base seconds # (env ORCH_MERGE_RETRY_BACKOFF_BASE_S). # merge_retry_backoff_max_s -> per-sleep backoff ceiling seconds; total sleep # is bounded by (N-1) * max so the monitor-thread # is never wedged (env ORCH_MERGE_RETRY_BACKOFF_MAX_S). 
    merge_retry_enabled: bool = True
    merge_retry_max_attempts: int = 3
    merge_retry_backoff_base_s: int = 2
    merge_retry_backoff_max_s: int = 5

    # ORCH-026: intra-repo merge serialisation (Level A) + declarative task
    # dependencies (Level B). Level A reuses the ORCH-043/065 merge-lease window
    # (no new mechanism) — the merge-lease already serialises "merge -> main-updated"
    # per repo; the ONLY new behaviour is an unconditional pre-merge rebase. Level B
    # adds a new ADDITIVE job_deps table + a NOT EXISTS gate in claim_next_job. Both
    # features are inert without data (no applicable repo / no declared deps) ->
    # zero regression for enduro-trails.
    #   premerge_rebase_always -> Level A (A-2): when True, check_branch_mergeable
    #                             ALWAYS rebases the task branch onto the CURRENT
    #                             origin/main UNDER the merge-lease (not only when
    #                             branch_is_behind_main) — a deterministic anti-phantom
    #                             that does not depend on the ancestor check's precision.
    #                             auto_rebase_onto_main is a cheap no-op on an already
    #                             up-to-date branch (rc 0, push up-to-date, CI not
    #                             retriggered). Scope = merge_gate_repos (empty ->
    #                             self-hosting). Kill-switch (False -> exactly the
    #                             ORCH-043 behaviour: rebase only when behind). Env
    #                             ORCH_PREMERGE_REBASE_ALWAYS.
    #   task_deps_enabled      -> Level B (B-2): global kill-switch for the scheduler
    #                             dependency gate. False -> claim_next_job is 1:1 as
    #                             ORCH-1 (the NOT EXISTS clause is omitted). Inert when
    #                             job_deps is empty. Env ORCH_TASK_DEPS_ENABLED.
    #   task_deps_source       -> declaration source: db|plane|hybrid (default db).
    #                             The scheduler ALWAYS reads the DB cache (offline-safe
    #                             hot path); plane/hybrid additionally ingest Plane
    #                             `blocked-by` relations into job_deps at task creation.
    #                             Env ORCH_TASK_DEPS_SOURCE.
    premerge_rebase_always: bool = True
    task_deps_enabled: bool = True
    task_deps_source: str = "db"

    # ORCH-088 (Stage 1, serial e2e): per-repo serial gate. A new task's analyst-job
    # does NOT enter analysis (no branch cut, no analyst agent) while the same repo
    # has another unfinished task (tasks.stage != 'done') OR the repo is frozen
    # (repo_freeze). The gate lives in claim_next_job (offline-safe hot path, like
    # the ORCH-026 dep-gate) + the branch cut is deferred from start_pipeline to the
    # analyst-job claim (launcher) so the branch base is always a fresh origin/main
    # that already contains the predecessor (anti-stale-base, AC-6). All additive,
    # never-raise, restart-safe; STAGE_TRANSITIONS / QG_CHECKS unchanged. See
    # docs/work-items/ORCH-088/06-adr/ADR-001-serial-gate.md.
    #   serial_gate_enabled        -> kill-switch (env ORCH_SERIAL_GATE_ENABLED).
    #                                 False -> claim_next_job AND start_pipeline are 1:1
    #                                 as before ORCH-088 (clause omitted, branch cut in
    #                                 start_pipeline) — zero regression (AC-7).
    #   serial_gate_repos          -> CSV scope (env ORCH_SERIAL_GATE_REPOS). Empty ->
    #                                 applies to ALL registered repos (D5); non-empty ->
    #                                 only the listed repos. Repo tokens are sanitised
    #                                 (^[A-Za-z0-9._-]+$) before being embedded in SQL.
    #   serial_gate_freeze_enabled -> independent tumbler for the FR-5 rollback-freeze
    #                                 layer (env ORCH_SERIAL_GATE_FREEZE_ENABLED). False
    #                                 -> freeze is neither set (post-deploy DEGRADED) nor
    #                                 consulted in the claim gate.
    serial_gate_enabled: bool = True
    serial_gate_repos: str = ""
    serial_gate_freeze_enabled: bool = True

    # ORCH-090: STOP-status task cancellation (stop active agent + full progress
    # reset) and the relaunch-hole close. A new logical Plane key `stop` (fail-closed,
    # absent from _DEFAULT_STATES) routes to a cancel handler that drives the task to
    # the new system-terminal state `cancelled` (stage + durable). Additive,
    # never-raise, restart-safe; STAGE_TRANSITIONS / QG_CHECKS / check_* / existing
    # status semantics are NOT touched. See
    # docs/work-items/ORCH-090/06-adr/ADR-001-stop-cancel-task.md and the cross-cutting
    # docs/architecture/adr/adr-0026-stop-cancel-task.md.
# stop_status_enabled -> kill-switch (env ORCH_STOP_STATUS_ENABLED). False -> # STOP handling AND the relaunch-hole gate are inert # (behaviour strictly as before ORCH-090 — zero # regression, AC-8). # stop_status_repos -> CSV scope (env ORCH_STOP_STATUS_REPOS). Empty -> applies # to ALL repos (cancellation is meaningful for enduro too); # non-empty -> only the listed repos. Tokens are sanitised # (^[A-Za-z0-9._-]+$) by the cancel leaf. stop_status_enabled: bool = True stop_status_repos: str = "" # ORCH-094: terminal-window-aware guard for deploy-phase Plane status setters. # A task with DB stage='done' (and 0 active jobs) was flapping in Plane between # `Awaiting Deploy` and `Monitoring after Deploy` instead of holding `Done`, # because the three deploy-phase setters (set_issue_awaiting_deploy / # set_issue_deploying / set_issue_monitoring) are terminal-blind: any stale / # duplicate / unknown caller under the bot token re-stamps an intermediate # deploy status over the terminal Done. ORCH-094 puts a single low choke-point # guard on the entry of those three setters (leaf src/deploy_status_guard.py): # for a task whose DB stage is terminal it converges to Done idempotently # (CONVERGE_DONE), EXCEPT the legitimate post-deploy `Monitoring` while the # window is still active (ARMED & not DONE). Additive, never-raise; reads the # existing tasks.stage (no migration); STAGE_TRANSITIONS / QG_CHECKS / # machine-verdict keys are NOT touched. See # docs/work-items/ORCH-094/06-adr/ADR-001-terminal-window-aware-deploy-status-guard.md # and the cross-cutting docs/architecture/adr/adr-0028-…md. # deploy_status_guard_enabled -> kill-switch (env ORCH_DEPLOY_STATUS_GUARD_ENABLED). # False -> the setters are terminal-blind, behaviour # strictly 1:1 as before ORCH-094 (zero regression). # deploy_status_guard_repos -> CSV scope (env ORCH_DEPLOY_STATUS_GUARD_REPOS). 
    #                                  Empty -> applies ONLY to the self-hosting repo
    #                                  (orchestrator), where deploy-phase statuses are set
    #                                  at all; non-empty -> only the listed repos. Tokens
    #                                  are sanitised (^[A-Za-z0-9._-]+$) by the guard leaf.
    deploy_status_guard_enabled: bool = True
    deploy_status_guard_repos: str = ""

    # ORCH-073 (ADR-001 Р-4): main-integrity regression guard. After the merge-verify
    # under-gate confirms the deployed SHA is an ancestor of origin/main (FR-1), a
    # secondary deterministic (no-LLM) guard checks that a declarative set of markers
    # for recently-merged tasks (MAIN_REGRESSION_MARKERS in merge_gate.py) is still
    # present in origin/main — i.e. a CHANGELOG-rebase or phantom-merge did not silently
    # roll back a neighbouring task's code. A missing marker (deterministic count==0) ->
    # ALERT + HOLD (task stays on `deploy`, NOT done); an infra/git error on the grep
    # itself -> fail-OPEN (do not block done; SHA-in-main remains the primary gate).
    #   regression_guard_enabled -> kill-switch (env ORCH_REGRESSION_GUARD_ENABLED);
    #                               reuses the merge_verify_applies scope (self-hosting /
    #                               merge_verify_repos), so non-self repos are a no-op.
    regression_guard_enabled: bool = True

    # ORCH-082 (ADR-001 Р-5): guarantee an open code-PR BEFORE the deterministic
    # merge_pr inside the merge-verify under-gate. The pipeline never guaranteed the
    # branch had an open PR (head==branch, base==main) at merge time — PRs are created
    # ONLY on the developer path with a fresh worktree commit (launcher._ensure_pr),
    # so a branch (e.g. after a manual main restore / a bounce with no new commits)
    # could reach merge-verify PR-less -> merge_pr returns "no open PR" -> a FALSE HOLD
    # that ORCH-073 fail-closed correctly catches but should never have to. The
    # idempotent leaf-actor merge_gate.ensure_open_pr creates/finds the code-PR BEFORE
    # merge_pr; ORCH-073's SHA-in-main proof is untouched and stays authoritative.
    #   merge_verify_autocreate_pr_enabled -> kill-switch (env
    #       ORCH_MERGE_VERIFY_AUTOCREATE_PR_ENABLED). False -> exactly the pre-ORCH-082
    #       behaviour (no auto-create; "no open PR" -> HOLD as before). Reuses the
    #       merge_verify_applies scope (self-hosting / merge_verify_repos) — no separate
    #       *_repos, since auto-create is semantically inseparable from merge-verify.
    merge_verify_autocreate_pr_enabled: bool = True

    # ORCH-089: auto-mode by Plane labels — autoApprove (BRD gate) + autoDeploy
    # (prod-deploy gate). Two HUMAN gates of the pipeline (analysis: wait for a
    # manual Approved; deploy Phase A: wait for a manual Confirm Deploy) are the
    # only blockers of an autonomous batch run (epic ORCH-088). ORCH-089 lifts ONLY
    # those two human decisions — selectively (a Plane label on the issue),
    # declaratively, reversibly, WITHOUT touching a single technical check. Additive
    # leaf (src/labels.py, never-raise) + two point insertions + flags;
    # STAGE_TRANSITIONS / QG_CHECKS / check_* / DB schema are NOT touched. See
    # docs/work-items/ORCH-089/06-adr/ADR-001-auto-label-gates.md.
    #   auto_label_enabled      -> global kill-switch for BOTH auto-modes (env
    #                              ORCH_AUTO_LABEL_ENABLED). False -> strictly the prior
    #                              behaviour (both gates manual), AND no new network call
    #                              on the gates (applies() returns False first, before
    #                              has_label is consulted) — zero regression (AC-8).
    #   auto_approve_label      -> Plane label name for the BRD gate (env
    #                              ORCH_AUTO_APPROVE_LABEL).
    #   auto_deploy_label       -> Plane label name for the deploy gate (env
    #                              ORCH_AUTO_DEPLOY_LABEL).
    #   auto_label_repos        -> CSV scope (env ORCH_AUTO_LABEL_REPOS). Empty ->
    #                              self-hosting only (orchestrator), the safe default
    #                              (the autoDeploy insertion lives in Phase A, which only
    #                              exists for the self-hosting repo). Non-empty -> only
    #                              the listed repos.
    #   auto_label_states_ttl_s -> TTL (seconds) of the per-project label-map cache
    #                              (mirrors plane_states_ttl_s); 0 -> lifetime cache.
    auto_label_enabled: bool = True
    auto_approve_label: str = "autoApprove"
    auto_deploy_label: str = "autoDeploy"
    auto_label_repos: str = ""
    auto_label_states_ttl_s: int = 300

    # ORCH-019: bug-fast-track — a cheaper/shorter pipeline route for bug-fix tasks.
    # A task carrying the Plane label `bug_fast_track_label` (default `Bug`) skips
    # the whole `architecture` stage (one opus `architect` run + ADR + the
    # check_architecture_done exit-gate): the routing-override in advance_stage maps
    # the analysis -> architecture edge to analysis -> development for a task whose
    # tasks.track == 'bug'. EVERY Quality Gate / sub-gate (CI/review/tester/staging/
    # deploy + security/merge/coverage/image-freshness/merge-verify) runs UNCHANGED
    # — the route is a scheduler property, NOT a gate (root invariant NFR-1).
    # Recognition reuses the proven ORCH-089 label apparatus (labels.has_label ->
    # plane_sync), read ONLY in start_pipeline (never in the hot claim_next_job).
    # Additive leaf (src/bug_fast_track.py, never-raise) + an additive idempotent
    # tasks.track column; STAGE_TRANSITIONS / QG_CHECKS / check_* / verdict-keys are
    # NOT touched. fail-safe -> full cycle on any error/ambiguity/disabled flag. See
    # docs/work-items/ORCH-019/06-adr/ADR-001-bug-fast-track.md and the cross-cutting
    # docs/architecture/adr/adr-0032-bug-fast-track.md.
    #   bug_fast_track_enabled -> kill-switch (env ORCH_BUG_FAST_TRACK_ENABLED).
    #                             False -> start_pipeline AND advance_stage are 1:1 as
    #                             before ORCH-019 (skips_architecture always False,
    #                             has_label never consulted) — zero regression (AC-6).
    #   bug_fast_track_label   -> Plane label name that activates the track (env
    #                             ORCH_BUG_FAST_TRACK_LABEL; default `Bug`).
    #   bug_fast_track_repos   -> CSV scope (env ORCH_BUG_FAST_TRACK_REPOS). Empty ->
    #                             self-hosting only (orchestrator), the safe default
    #                             (D6); non-empty -> only the listed repos.
    bug_fast_track_enabled: bool = True
    bug_fast_track_label: str = "Bug"
    bug_fast_track_repos: str = ""

    # Telegram notifications
    telegram_bot_token: str = ""
    telegram_chat_id: str = ""
    # ORCH-042: mode of the task live-tracker.
    #   bump (the DEFAULT since ORCH-067) -> on update the old message is deleted and
    #        the card is re-sent at the bottom of the chat (deleteMessage + sendMessage
    #        + repoint message_id), silently (disable_notification).
    #   edit -> the card is edited in place (editMessageText); available via
    #        ORCH_TRACKER_MODE=edit.
    # One card per task in both modes. An unknown/empty value is
    # treated as edit (see notifications).
    tracker_mode: str = "bump"

    # ORCH-067 (ADR Р-2/Р-3/Р-4): best-effort live overlay for the card's status
    # line. It fills in the Plane-status branches that cannot be told apart offline
    # from tasks.stage (Needs Input / Blocked / Rejected / Cancelled / Deploying /
    # Monitoring after Deploy) by reading the LIVE Plane status with a short timeout
    # and a TTL cache. The offline core (stage -> status, In Review from the
    # brd-clock) always works without the network; the overlay only complements it
    # and NEVER blocks the pipeline.
    #   tracker_live_status           -> kill-switch (False -> offline core only).
    #   tracker_live_status_ttl_s     -> TTL of the per-issue live-uuid cache
    #                                    (hot-path protection).
    #   tracker_live_status_timeout_s -> timeout of a single live GET on the render path.
    tracker_live_status: bool = True
    tracker_live_status_ttl_s: int = 60
    tracker_live_status_timeout_s: int = 3

    # ORCH-087 (BR-G5, ADR-001 Р-6): cap for the human BRD-review time shown on the
    # done card ("твоё {review}"). The brd_review clock can stay open for hours on a
    # desync (In Review -> Backlog), which made "твоё время" report anomalous stalls
    # (ORCH-087: 392m). Above this cap the value is shown capped with a "~" marker so
    # an abnormal stall is never presented as real human review time. Env
    # ORCH_TRACKER_BRD_REVIEW_CAP_S; default 7200s (2h). 0/negative -> no cap.
tracker_brd_review_cap_s: int = 7200 # ORCH-076 (ORCH-52c, FR-2 / D3): kill-switch for STRICT frontmatter-schema # validation. The unified frontmatter contract (src/frontmatter.py) ships a # machine-checkable schema validator (REQUIRED_FIELDS), but by DEFAULT it is # warning-only and never influences any gate's boolean verdict (maybe_warn_schema # is inert). This flag is RESERVED for a future tightening (ORCH-52d, when agents # start emitting the full schema). It MUST stay False in prod / .env.staging — # otherwise ORCH-52c would self-block its own deploy (its docs predate the # schema). Env ORCH_FRONTMATTER_VALIDATION_STRICT; default False (zero behaviour # change). See docs/_standards/HANDOFF_PROTOCOL.md. frontmatter_validation_strict: bool = False # ORCH-069: QG-0 upper title-length limit (entry gate _qg0_errors). The 80-char # cap was a hygiene limit, not structural (slug is cut to [:30] independently, # DB title TEXT is unbounded). Configurable via env ORCH_QG0_TITLE_MAX; default # 200 (was hardcoded 80). Invalid/empty value -> default (graceful, no crash). qg0_title_max: int = 200 # ORCH-099 (D8): operator off-switch for the read-only GET /metrics endpoint. # The env var is ORCH_METRICS_ENABLED (explicit validation_alias — the documented # contract name, ADR-001 D8 / README — overriding the default ORCH_ + field-name # mapping so the documented switch actually controls the flag). Default True -> # the endpoint is available out of the box (zero regression vs BRD). False -> # /metrics returns a minimal parsable body {"schema_version": 1, "enabled": false} # (200, NOT 404) so the F1b sidecar sees the off-switch explicitly. The endpoint # is inert / read-only anyway; the flag is a cheap self-hosting insurance on the # shared prod instance. 
metrics_endpoint_enabled: bool = Field(True, validation_alias="ORCH_METRICS_ENABLED") @field_validator("qg0_title_max", mode="before") @classmethod def _qg0_title_max_default(cls, v): # Graceful (ORCH-069 AC-3): empty / non-numeric env -> default 200, the # process must not crash on startup. Never raises (self-hosting safety). try: if v is None or (isinstance(v, str) and v.strip() == ""): return 200 return int(v) except (TypeError, ValueError): return 200 class Config: env_prefix = "ORCH_" env_file = ".env" settings = Settings()