# orchestrator/src/config.py

import logging
import re

from pydantic import field_validator
from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    # Plane
    plane_api_url: str = "http://localhost:8091"
    # ORCH-017: external (browser) web URL of Plane for clickable issue links in
    # notifications, e.g. https://plane.example.org. Falls back to plane_api_url,
    # but a loopback fallback (localhost/127.0.0.1) is treated as "no web URL" and
    # the Plane link is omitted (see notifications._build_plane_issue_link).
    plane_web_url: str = ""
    plane_api_token: str = ""
    plane_workspace_slug: str = ""
    plane_webhook_secret: str = ""
    plane_project_id: str = ""

    # Per-agent Plane bot tokens (feat: per-agent comment authorship).
    # When set, add_comment posts under the matching bot so Plane shows the
    # real author (Analyst/Architect/...). Empty -> fallback to plane_api_token.
    plane_bot_analyst: str = ""
    plane_bot_architect: str = ""
    plane_bot_developer: str = ""
    plane_bot_reviewer: str = ""
    plane_bot_tester: str = ""
    plane_bot_deployer: str = ""
    plane_bot_stream: str = ""

    # Gitea
    gitea_url: str = "http://localhost:3000"
    gitea_public_url: str = ""  # external URL for clickable links in comments; falls back to gitea_url
    gitea_token: str = ""
    gitea_webhook_secret: str = ""
    gitea_owner: str = "admin"
    default_repo: str = "enduro-trails"

    # ORCH-6: multi-repo project registry. JSON array of
    #   {plane_project_id, repo, work_item_prefix, name}.
    # Empty -> built-in default registry in src/projects.py.
    projects_json: str = ""

    # Claude CLI
    claude_bin: str = "/opt/claude-code/bin/claude.exe"
    repos_dir: str = "/repos"
    host_repos_dir: str = "/home/slin/repos"
    worktrees_dir: str = "/repos/_wt"  # ORCH-2 / S-4: isolated worktree per task/branch
    # ORCH-087: base dir for per-run agent logs (<runs_dir>/<run_id>.log). Lifted out
    # of the hardcoded '/app/data/runs' so tests (and any non-container host) can point
    # it at a writable path; default preserves the container layout.
    runs_dir: str = "/app/data/runs"

    # DB
    db_path: str = "/app/data/orchestrator.db"

    # ORCH-1 (F-2b): persistent job queue / background worker.
    # max_concurrency  -> max agent jobs running in parallel (env ORCH_MAX_CONCURRENCY)
    # queue_poll_interval -> worker loop poll seconds (env ORCH_QUEUE_POLL_INTERVAL)
    max_concurrency: int = 1
    queue_poll_interval: float = 2.0

    # ORCH-1b (resilience): preflight + 429/rate-limit + backoff + circuit breaker.
    # preflight_cache_ttl  -> cache the cheap CLI/network preflight result (seconds);
    #                         the worker does NOT re-run `claude --version` more often
    #                         than this (env ORCH_PREFLIGHT_CACHE_TTL).
    # backoff_base_seconds -> base for exponential transient backoff.
    # backoff_max_seconds  -> ceiling for the transient backoff.
    # transient_max_attempts -> retry budget for transient (429/overload/network)
    #                         failures, separate from code-fault `attempts`.
    # breaker_threshold    -> consecutive transient failures that OPEN the breaker.
    # breaker_pause_seconds -> how long the breaker stays open before half-open.
    preflight_cache_ttl: int = 45
    backoff_base_seconds: int = 10
    backoff_max_seconds: int = 600
    transient_max_attempts: int = 5
    breaker_threshold: int = 3
    breaker_pause_seconds: int = 300

    # ORCH-7 (M-2): agent timeout + graceful kill.
    # agent_timeout_seconds   -> default per-agent wall-clock budget; the watchdog
    #                            kills the run after this (env ORCH_AGENT_TIMEOUT_SECONDS).
    # agent_kill_grace_seconds-> pause between SIGTERM and SIGKILL so claude can
    #                            flush artifacts before the hard kill
    #                            (env ORCH_AGENT_KILL_GRACE_SECONDS).
    # agent_timeout_overrides_json -> optional per-agent override JSON object,
    #                            e.g. {"reviewer": 3600, "architect": 2700}
    #                            (env ORCH_AGENT_TIMEOUT_OVERRIDES_JSON).
    agent_timeout_seconds: int = 1800
    agent_kill_grace_seconds: int = 20
    agent_timeout_overrides_json: str = ""

    # ORCH-41: per-agent LLM model. Empty -> agent_model_default. Resolution order:
    # project-override (projects_json agent_models) > ORCH_AGENT_MODEL_<AGENT> >
    # agent_model_default > CLI default (no --model flag). Default is 4-8 because
    # 4-7 == 4-8 in price (Slava 05.06); do NOT hardcode the version anywhere else.
    agent_model_default: str = "claude-opus-4-8"
    agent_model_analyst: str = ""
    agent_model_architect: str = ""
    agent_model_developer: str = ""
    agent_model_reviewer: str = ""
    agent_model_tester: str = ""
    agent_model_deployer: str = ""

    # ORCH-41: per-agent effort / reasoning level: low|medium|high|xhigh|max.
    # Empty -> agent_effort_default. Same resolution order as model. Default split
    # (ORCH-081/ORCH-52h): thinking agents (analyst/architect/reviewer) -> high;
    # developer -> xhigh (coding/agentic role, Opus 4.8 canon); mechanical agents
    # (tester/deployer) -> medium. These class-defaults are ALSO the per-role floor
    # used by resolve_agent_effort when the env is empty (single source of truth).
    agent_effort_default: str = "high"
    agent_effort_analyst: str = "high"
    agent_effort_architect: str = "high"
    agent_effort_developer: str = "xhigh"
    agent_effort_reviewer: str = "high"
    agent_effort_tester: str = "medium"
    agent_effort_deployer: str = "medium"

    # ORCH-41: optional per-agent fallback model used when the primary is
    # overloaded (--fallback-model, works with --print). Empty -> no flag.
    agent_fallback_model: str = ""

    # L-2: run-log rotation. Old per-run logs in <data>/runs/*.log are pruned at
    # app startup (best-effort). A *.log is removed if it is older than
    # log_keep_days OR not within the log_keep_max most-recent logs (whichever
    # hits first). Only *.log files are touched; the active run log is skipped.
    #   log_keep_days -> max age in days (env ORCH_LOG_KEEP_DAYS).
    #   log_keep_max  -> max number of newest logs to retain (env ORCH_LOG_KEEP_MAX).
    log_keep_days: int = 30
    log_keep_max: int = 500


    # ORCH-045: quality-gate CI poll/retry. check_ci_green polls the Gitea
    # combined commit status up to ci_poll_max_attempts times, sleeping
    # ci_poll_interval_s between attempts, to ride out a transient pending
    # state right after the developer push (race fix, see ORCH-017).
    #   ci_poll_max_attempts -> max status polls (env ORCH_CI_POLL_MAX_ATTEMPTS)
    #   ci_poll_interval_s   -> seconds between polls (env ORCH_CI_POLL_INTERVAL_S)
    ci_poll_max_attempts: int = 12
    ci_poll_interval_s: int = 10

    # ORCH-043: merge-gate (auto-rebase + re-test + merge-lock) on the
    # deploy-staging -> deploy edge. A deterministic sub-gate (no LLM) that
    # brings the branch up to date with the CURRENT origin/main, re-tests it,
    # and serialises merges so two green branches can't break main.
    #   merge_gate_enabled     -> global kill-switch; False -> no-op pass for the
    #                             whole gate (staged rollout, env ORCH_MERGE_GATE_ENABLED).
    #   merge_gate_repos       -> CSV of repos where the gate is REAL; empty means
    #                             only the self-hosting repo (orchestrator). Other
    #                             repos -> conditional no-op (mirrors ORCH-35 staging).
    #   merge_retest_timeout_s -> wall-clock budget for the post-rebase re-test.
    #   merge_retest_target    -> pytest target for the re-test (portability across repos).
    #   merge_lock_timeout_s   -> max lease age; an older lease is reclaimed (crash backstop).
    #   merge_defer_delay_s    -> delay before re-running the gate when the lock is busy.
    #   merge_defer_max_attempts -> defer retries before escalation (avoids livelock).
    merge_gate_enabled: bool = True
    merge_gate_repos: str = ""
    merge_retest_timeout_s: int = 600
    merge_retest_target: str = "tests/"
    merge_lock_timeout_s: int = 300
    merge_defer_delay_s: int = 60
    merge_defer_max_attempts: int = 5

    # ORCH-036: executable self-deploy (deploy stage drives the host hook).
    # The `deploy` stage for the self-hosting repo is turned into a REAL prod
    # restart via a detached host process, gated by a manual approve. Three-phase
    # design (ADR-001): A=approve-request, B=initiate (human Approved), C=finalizer
    # maps the hook exit-code -> deploy_status. Non-self repos are unaffected.
    #
    #   self_deploy_enabled            -> global kill-switch; False -> no Phase A/B/C
    #                                     interception (the legacy synchronous deployer
    #                                     path runs for everyone, env ORCH_SELF_DEPLOY_ENABLED).
    #   self_deploy_repos              -> CSV of repos where executable self-deploy is
    #                                     REAL; empty -> only the self-hosting repo
    #                                     (orchestrator). Mirrors merge_gate_repos.
    #   deploy_require_manual_approve  -> require a human Approved before the prod
    #                                     restart (BR-5). Default true; NOT toggled in
    #                                     ORCH-36 (AC-12). false -> Phase A initiates
    #                                     immediately (structural branch, off by default).
    #   deploy_finalize_delay_s        -> delay before the first finalize poll; must be
    #                                     > the hook health-loop (~60s) so the verdict
    #                                     usually exists on the first poll.
    #   deploy_finalize_max_attempts   -> bounded finalize-defer budget (anti-livelock).
    # ssh / hook target (detached prod restart; real values live on the host):
    #   deploy_ssh_user / deploy_ssh_host -> ssh target for the host hook (INFRA P-2).
    #   deploy_hook_script             -> path to the hook ON THE HOST (relative to repo).
    #   deploy_host_repo_path          -> orchestrator clone path on the host.
    # prod overrides passed to the hook for build-once (retag staging image -> prod):
    #   deploy_prod_source_image       -> image validated on staging (retagged, no rebuild).
    #   deploy_prod_target_service / _port / _image / _compose_profile -> prod profile.
    #   deploy_prod_prev_image_file    -> prod prev-image snapshot (separate from staging).
    self_deploy_enabled: bool = True
    self_deploy_repos: str = ""
    deploy_require_manual_approve: bool = True
    deploy_finalize_delay_s: int = 90
    deploy_finalize_max_attempts: int = 10
    deploy_ssh_user: str = "slin"
    deploy_ssh_host: str = ""
    deploy_hook_script: str = "scripts/orchestrator-deploy-hook.sh"
    deploy_host_repo_path: str = "/home/slin/repos/orchestrator"
    deploy_prod_source_image: str = "orchestrator-orchestrator-staging"
    deploy_prod_target_service: str = "orchestrator"
    deploy_prod_target_port: int = 8500
    deploy_prod_target_image: str = "orchestrator-orchestrator"
    deploy_prod_compose_profile: str = ""
    deploy_prod_prev_image_file: str = ".deploy-prev-image-prod"

    # ORCH-058: staging-image provenance before the BUILD-ONCE retag to prod.
    # Closes the INV-FRESH gap (ADR-001): the BUILD-ONCE retag (ORCH-36) promotes
    # the staging image to prod WITHOUT a rebuild, assuming the staging image is
    # fresh — a guarantee the pipeline never had (a stale image could be silently
    # promoted, LESSONS_ORCH-036 §4). Two complementary layers, self-hosting only:
    #   A (liveness): the QG sub-check check_staging_image_fresh rebuilds the
    #     staging image from the VALIDATED commit (worktree HEAD after merge-gate)
    #     and recreates 8501 on the deploy-staging -> deploy edge, so we validate
    #     and promote ONE artefact.
    #   B (safety): build_deploy_command passes EXPECTED_REVISION and the hook
    #     fail-closes (exit 1) if SOURCE_IMAGE's revision label != EXPECTED_REVISION
    #     before `docker tag`, making a silent stale promote structurally impossible.
    #
    #   image_freshness_enabled -> SINGLE kill-switch for the WHOLE feature (A + B
    #                              together; never "B without A" = a deadlock). False
    #                              -> legacy ORCH-36 behaviour (BUILD-ONCE, no guard,
    #                              no EXPECTED_REVISION). Env ORCH_IMAGE_FRESHNESS_ENABLED.
    #   image_freshness_repos   -> CSV of repos where the feature is REAL; empty ->
    #                              only the self-hosting repo (orchestrator). Mirrors
    #                              self_deploy_repos / merge_gate_repos.
    image_freshness_enabled: bool = True
    image_freshness_repos: str = ""

    # ORCH-022: security-gate (secret-scanning + dependency audit) on the
    # deploy-staging -> deploy edge, run FIRST among the edge sub-gates (cheap to
    # fail before the expensive rebase/rebuild). Deterministic (no LLM): gitleaks
    # (offline secret-scan) + pip-audit (OSV/PyPI dependency audit), verdict in the
    # versioned 17-security-report.md frontmatter; FAIL -> rollback to development +
    # developer-retry (cap MAX_DEVELOPER_RETRIES). See ADR-001-security-gate.md.
    #   security_gate_enabled        -> SINGLE kill-switch; False -> pipeline 1:1 as
    #                                   before ORCH-022 for everyone. Env
    #                                   ORCH_SECURITY_GATE_ENABLED.
    #   security_gate_repos          -> CSV of repos where the gate is REAL; empty ->
    #                                   only the self-hosting repo (orchestrator).
    #                                   Mirrors merge_gate_repos / image_freshness_repos.
    #   security_dep_block_severity  -> CVE severity threshold that BLOCKS (CRITICAL >
    #                                   HIGH > MEDIUM > LOW); below it / UNKNOWN -> a
    #                                   warning only (anti-loop ADR-001 Р-4).
    #   security_scan_timeout_s      -> per external scanner call timeout (mirrors
    #                                   merge_retest_timeout_s).
    #   security_dep_audit_fail_closed -> strict mode: an unreachable CVE feed -> FAIL
    #                                   instead of the default fail-open + warning
    #                                   (Р-3). Default False (anti-loop ORCH-061).
    #   security_secrets_block       -> a found secret blocks (always True by default;
    #                                   the offline secrets guarantee is unconditional,
    #                                   BR-2).
    security_gate_enabled: bool = True
    security_gate_repos: str = ""
    security_dep_block_severity: str = "HIGH"
    security_scan_timeout_s: int = 300
    security_dep_audit_fail_closed: bool = False
    security_secrets_block: bool = True

    # ORCH-061: tolerate KNOWN sandbox-infra FAILs (C9a/C9b) in the staging suite.
    # The self-hosting deploy-staging stage looped because scripts/staging_check.py
    # exited non-zero on ANY failed check, so two infra-only failures (sandbox bot
    # accounts not members of the sandbox Plane project) produced staging_status:
    # FAILED -> rollback deploy-staging -> development -> loop.
    #   True  -> a run whose ONLY failures are allowlisted sandbox-infra checks
    #            (C9a/C9b) is waived to SUCCESS; ANY real pipeline check that fails
    #            still fails closed -> FAILED -> rollback (safety net intact, FR-4).
    #   False -> 1:1 pre-ORCH-061 strict behaviour: any FAIL -> FAILED -> rollback.
    # Default True (mirrors merge_gate_enabled / image_freshness_enabled /
    # self_deploy_enabled): the safety net holds regardless of the flag; the flag
    # exists to instantly restore legacy strictness without a code redeploy. Lives
    # in .env.staging (ORCH_ prefix) so it is reachable inside orchestrator-staging.
    # Env ORCH_STAGING_INFRA_TOLERANCE_ENABLED.
    staging_infra_tolerance_enabled: bool = True

    # ORCH-053: stuck-task reconciler (sweeper for lost webhooks). A background
    # daemon thread reconciles the "source of truth (gate / Plane) != task stage"
    # drift left behind by a dropped webhook (502 on rebuild, no Plane/Gitea
    # retries, unresolved sha->branch). See docs/architecture/adr/adr-0007-reconciler.md.
    #   reconcile_enabled            -> global kill-switch (self-hosting safety,
    #                                   staged rollout, env ORCH_RECONCILE_ENABLED).
    #   reconcile_interval_s         -> background sweep period (seconds).
    #   reconcile_plane_enabled      -> separate flag for the F-2 Plane-API poll so
    #                                   only the plane branch can be muted.
    #   reconcile_grace_default_s    -> default "stuck" threshold on tasks.updated_at.
    #   reconcile_grace_overrides_json -> JSON object of per-stage thresholds, e.g.
    #                                   {"analysis": 1800, "development": 300}. Invalid
    #                                   JSON -> default (mirrors agent_timeout_overrides_json).
    #   reconcile_notify_unblock     -> send a Telegram message when a stuck task is
    #                                   unblocked (F-4 observability).
    #   reconcile_skip_blocked_enabled -> ORCH-060 Guard 2: skip F-1 reconciliation of
    #                                   issues a human moved to Blocked / Needs Input
    #                                   (per-candidate Plane state lookup). Disabling it
    #                                   mutes ONLY the networked Guard 2; Guard 1
    #                                   (escalated-by-retries, local + deterministic) is
    #                                   always active. Manual escape hatch during a Plane
    #                                   outage.
    reconcile_enabled: bool = True
    reconcile_interval_s: int = 120
    reconcile_plane_enabled: bool = True
    reconcile_grace_default_s: int = 600
    reconcile_grace_overrides_json: str = ""
    reconcile_notify_unblock: bool = True
    reconcile_skip_blocked_enabled: bool = True

    # ORCH-068: TTL for the per-project Plane states cache (_STATES_CACHE in
    # plane_sync). Historically the cache lived for the whole process lifetime,
    # so a status added to Plane after start was never seen without a restart
    # ("stale set -> no pipeline action"). With a TTL the entry self-heals by
    # re-fetching /states/ after it expires (invalidation reuses the existing
    # reload_project_states() primitive — no duplicated reset logic).
    #   plane_states_ttl_s (env ORCH_PLANE_STATES_TTL_S):
    #     >0 -> seconds before a cache entry is re-fetched (default 300 = 5 min);
    #      0 -> disable TTL -> strictly the previous lifetime cache (back-compat
    #           escape hatch). get_project_states return shape is unchanged.
    plane_states_ttl_s: int = 300

    # ORCH-021: post-deploy production monitoring + degradation reaction. After
    # the terminal deploy->done transition for an applicable repo, a reserved-agent
    # `post-deploy-monitor` job (no LLM, modelled on deploy-finalizer) probes prod
    # over a window and reacts to a degradation the restart-time health-check
    # missed (class "green deploy, red prod", precedent ET-8). State is in sentinel
    # files (.post-deploy-state-<repo>/<wi>/), no DB migration. See
    # docs/architecture/adr/adr-0010-post-deploy-monitor.md.
    #   post_deploy_monitor_enabled -> global kill-switch (BR-8); False -> the
    #                                  pipeline is 1:1 as before ORCH-021 (no arm).
    #   post_deploy_repos           -> CSV of repos where monitoring is REAL; empty
    #                                  -> only the self-hosting repo (orchestrator).
    #                                  Mirrors self_deploy_repos / merge_gate_repos.
    #   post_deploy_window_s        -> observation window length (~15 min, BR-1).
    #   post_deploy_interval_s      -> seconds between probe ticks.
    #   post_deploy_fail_threshold  -> N CONSECUTIVE health failures -> DEGRADED.
    #   post_deploy_5xx_threshold   -> window 5xx ratio above this -> DEGRADED.
    #   post_deploy_auto_rollback   -> globally allow auto-rollback; True acts ONLY
    #                                  for non-self repos. For self-hosting the
    #                                  reaction is ALWAYS ALERT_ONLY (BR-5) — a tick
    #                                  NEVER restarts the prod orchestrator container.
    #   post_deploy_base_url        -> base URL of the observed prod instance.
    #   Rollback target params reuse the existing deploy_prod_* settings (no dupes).
    post_deploy_monitor_enabled: bool = True
    post_deploy_repos: str = ""
    post_deploy_window_s: int = 900
    post_deploy_interval_s: int = 30
    post_deploy_fail_threshold: int = 3
    post_deploy_5xx_threshold: float = 0.5
    post_deploy_auto_rollback: bool = False
    post_deploy_base_url: str = "http://localhost:8500"

    # ORCH-065: job-reaper + proactive merge-lease reclaim. A background daemon
    # thread (modelled on the reconciler) makes "the monitor thread / process died
    # while a job/lease was held" self-heal WITHOUT a restart. Status (done/queued/
    # failed) is otherwise only ever set by launcher._monitor_agent -> _finalize_job
    # inside the live process; a death there left the jobs row 'running' forever and
    # (at max_concurrency=1) wedged the queue of EVERY project (incidents 07.06: jobs
    # 236/239/242/254). The same thread proactively reclaims a stale/dead merge-lease
    # (ORCH-043) instead of waiting for the lazy TTL on the next foreign acquire. See
    # docs/architecture/adr/adr-0011-job-reaper-lease-reclaim.md.
    #   reaper_enabled       -> global kill-switch (false -> strictly prior behaviour;
    #                           only the startup requeue_running_jobs remains).
    #   reaper_interval_s    -> background scan period (seconds).
    #   reaper_dead_ticks    -> Tier-1: consecutive ticks a job's pid must be dead
    #                           before it is reaped (>=2 anti-false-positive; a live
    #                           long-running agent is NEVER reaped).
    #   reaper_max_running_s  -> Tier-3 backstop ceiling: a job 'running' longer than
    #                           this is reaped even when liveness is unknowable. MUST be
    #                           > max agent_timeout + grace so a legit agent is safe.
    #   reaper_finalize_grace_s -> Tier-2 anti-false-positive: a LIVE monitor writes
    #                           agent_runs.exit_code FIRST, THEN does git commit/push +
    #                           PR + Plane usage comments (seconds..minutes) and only
    #                           then _finalize_job. The agent pid is already dead in
    #                           that window, so pid cannot tell "monitor died" from
    #                           "monitor still finalizing". A job is reaped via Tier-2
    #                           only once exit_code has been recorded for at least this
    #                           many seconds (MUST be > the max finalization window).
    #   lease_reclaim_enabled -> kill-switch for the proactive stale/dead lease reclaim
    #                           (false -> only the legacy lazy TTL reclaim in acquire).
    # (reuse) merge_lock_timeout_s -> lease TTL; merge_gate_repos -> reclaim scope.
    reaper_enabled: bool = True
    reaper_interval_s: int = 60
    reaper_dead_ticks: int = 2
    reaper_max_running_s: int = 3600
    reaper_finalize_grace_s: int = 300
    lease_reclaim_enabled: bool = True

    # ORCH-063: disk-watchdog — background heartbeat that measures host-FS fill via
    # the mounted bind-paths and Telegram-alerts the operator at >= threshold. On
    # 07.06.2026 the mva154 host disk silently hit 100% and stalled the WHOLE
    # self-hosting pipeline; the watchdog is the missing proactive signal. Modelled
    # on reconciler/job_reaper (daemon thread, start/stop in main.lifespan, /queue
    # snapshot, never-raise). Anti-spam state is in-memory (no DB migration).
    #   disk_monitor_enabled       -> kill-switch; False -> the daemon does not start
    #                                 (zero regression), env ORCH_DISK_MONITOR_ENABLED.
    #   disk_monitor_interval_s    -> heartbeat measurement period, seconds (order of
    #                                 minutes; cheap shutil.disk_usage, no df subprocess).
    #   disk_monitor_threshold_pct -> fill % that triggers the alert (Owner-fixed 85).
    #   disk_monitor_realert_s     -> min interval between repeat alerts while still
    #                                 above threshold (anti-spam cooldown, ~6h).
    #   disk_monitor_paths         -> CSV of monitored HOST bind-paths (NOT overlay /);
    #                                 empty -> the default set (/repos, /app/data).
    # Defensive validation (ADR-001 D7): threshold out of 1..100 or a non-positive
    # interval -> default + warning (the process never crashes on a bad env value).
    disk_monitor_enabled: bool = True
    disk_monitor_interval_s: int = 300
    disk_monitor_threshold_pct: int = 85
    disk_monitor_realert_s: int = 21600
    disk_monitor_paths: str = "/repos,/app/data"

    @field_validator(
        "disk_monitor_interval_s", "disk_monitor_realert_s", mode="before"
    )
    @classmethod
    def _disk_positive_int(cls, v, info):
        # Non-positive / non-numeric interval -> the field default (never crash).
        _defaults = {"disk_monitor_interval_s": 300, "disk_monitor_realert_s": 21600}
        fallback = _defaults.get(info.field_name, 1)
        try:
            if v is None or (isinstance(v, str) and v.strip() == ""):
                return fallback
            iv = int(v)
            if iv <= 0:
                logging.getLogger("orchestrator.config").warning(
                    "%s must be > 0, got %s; falling back to %s",
                    info.field_name, v, fallback,
                )
                return fallback
            return iv
        except (TypeError, ValueError):
            return fallback

    @field_validator("disk_monitor_threshold_pct", mode="before")
    @classmethod
    def _disk_threshold_pct(cls, v):
        # Threshold must be a percentage in 1..100; otherwise -> default 85.
        try:
            if v is None or (isinstance(v, str) and v.strip() == ""):
                return 85
            iv = int(v)
            if 1 <= iv <= 100:
                return iv
            logging.getLogger("orchestrator.config").warning(
                "disk_monitor_threshold_pct must be 1..100, got %s; using 85", v
            )
            return 85
        except (TypeError, ValueError):
            return 85

    # ORCH-062: build-cache-pruner — the "second half" of the disk-watchdog
    # (ORCH-063): watchdog SIGNALS, pruner CLEANS. A background daemon thread
    # modelled 1:1 on disk_watchdog (start/stop in main.lifespan, /queue snapshot,
    # never-raise, kill-switch) that periodically runs `docker builder prune` on
    # the HOST over ssh (the container ships no docker CLI — same channel as
    # image_freshness/self_deploy). Touches ONLY the BuildKit build cache: never
    # images/containers of running services, never restarts the docker daemon or
    # the prod container (self-hosting safety). State (last run / result) is
    # in-memory, best-effort — no DB migration. ADR-001 D1..D7.
    #   build_cache_prune_enabled       -> kill-switch; False -> daemon does not
    #                                      start (1:1 as before), env *_ENABLED.
    #   build_cache_prune_interval_s    -> tick period, seconds (order of hours).
    #   build_cache_prune_until         -> retention age for warm cache
    #                                      (`docker builder prune --filter until=`).
    #   build_cache_prune_all           -> add `-a` (ALWAYS paired with until).
    #   build_cache_prune_timeout_s     -> bound on the ssh command, seconds.
    #   build_cache_prune_notify_min_gb -> Telegram when reclaimed >= N GB; 0 -> silent.
    # Defensive validation (ADR-001 D4): a non-positive / non-numeric interval or
    # timeout -> default + warning; an `until` not matching ^\d+[smhdw]?$ -> "24h";
    # a negative notify threshold -> 0. A bad env value NEVER crashes the start.
    build_cache_prune_enabled: bool = True
    build_cache_prune_interval_s: int = 21600
    build_cache_prune_until: str = "24h"
    build_cache_prune_all: bool = False
    build_cache_prune_timeout_s: int = 120
    build_cache_prune_notify_min_gb: float = 0.0

    @field_validator(
        "build_cache_prune_interval_s", "build_cache_prune_timeout_s", mode="before"
    )
    @classmethod
    def _bcp_positive_int(cls, v, info):
        # Non-positive / non-numeric -> the field default (never crash the start).
        _defaults = {
            "build_cache_prune_interval_s": 21600,
            "build_cache_prune_timeout_s": 120,
        }
        fallback = _defaults.get(info.field_name, 1)
        try:
            if v is None or (isinstance(v, str) and v.strip() == ""):
                return fallback
            iv = int(v)
            if iv <= 0:
                logging.getLogger("orchestrator.config").warning(
                    "%s must be > 0, got %s; falling back to %s",
                    info.field_name, v, fallback,
                )
                return fallback
            return iv
        except (TypeError, ValueError):
            return fallback

    @field_validator("build_cache_prune_until", mode="before")
    @classmethod
    def _bcp_until(cls, v):
        # A docker `until` filter: digits + optional unit (s/m/h/d/w). Anything
        # else -> the safe default "24h" (keeps warm cache, BR-2).
        try:
            if v is None:
                return "24h"
            s = str(v).strip()
            if s and re.match(r"^\d+[smhdw]?$", s):
                return s
            logging.getLogger("orchestrator.config").warning(
                "build_cache_prune_until must match ^\\d+[smhdw]?$, got %r; using 24h", v
            )
            return "24h"
        except (TypeError, ValueError):
            return "24h"

    @field_validator("build_cache_prune_notify_min_gb", mode="before")
    @classmethod
    def _bcp_notify_min_gb(cls, v):
        # A non-negative GB threshold; negative / non-numeric -> 0 (silent).
        try:
            if v is None or (isinstance(v, str) and v.strip() == ""):
                return 0.0
            fv = float(v)
            return fv if fv >= 0 else 0.0
        except (TypeError, ValueError):
            return 0.0

    # ORCH-071: merge-verify under-gate on the `deploy -> done` edge. For the
    # self-hosting repo the `deploy` stage runs the DETERMINISTIC self-deploy path
    # (Phase A/B/C), where the LLM `deployer` agent — historically the ONLY actor
    # that merged the feature PR into `main` — never runs. Result: a "green" deploy
    # could reach `done` while the PR stayed `open` (phantom merge, postmortem
    # LESSONS_2026-06-08). This under-gate (an inline hook in advance_stage, NOT a new
    # STAGE_TRANSITIONS edge or registered QG) runs a deterministic merge-actor +
    # post-deploy verification before `done`: not-merged -> alert + HOLD (no done),
    # merged -> normal advance. Mirrors merge_gate_* / image_freshness_* rollout.
    #   merge_verify_enabled  -> global kill-switch; False -> strictly the prior
    #                            behaviour (no merge/verify), env ORCH_MERGE_VERIFY_ENABLED.
    #   merge_verify_repos    -> CSV of repos where the under-gate is REAL; empty ->
    #                            only the self-hosting repo (orchestrator). Mirrors
    #                            merge_gate_repos / self_deploy_repos.
    #   merge_pr_timeout_s    -> per Gitea merge/list HTTP call timeout.
    #   merge_verify_timeout_s-> git fetch/merge-base timeout for the ancestor check.
    merge_verify_enabled: bool = True
    merge_verify_repos: str = ""
    merge_pr_timeout_s: int = 60
    merge_verify_timeout_s: int = 60

    # ORCH-026: intra-repo merge serialisation (Level A) + declarative task
    # dependencies (Level B). Level A reuses the ORCH-043/065 merge-lease window
    # (no new mechanism) — the merge-lease already serialises "merge -> main-updated"
    # per repo; the ONLY new behaviour is an unconditional pre-merge rebase. Level B
    # adds a new ADDITIVE job_deps table + a NOT EXISTS gate in claim_next_job. Both
    # features are inert without data (no applicable repo / no declared deps) ->
    # zero regression for enduro-trails.
    #   premerge_rebase_always -> Level A (A-2): when True, check_branch_mergeable
    #                             ALWAYS rebases the task branch onto the CURRENT
    #                             origin/main UNDER the merge-lease (not only when
    #                             branch_is_behind_main) — a deterministic anti-phantom
    #                             that does not depend on the ancestor check's precision.
    #                             auto_rebase_onto_main is a cheap no-op on an already
    #                             up-to-date branch (rc 0, push up-to-date, CI not
    #                             retriggered). Scope = merge_gate_repos (empty ->
    #                             self-hosting). Kill-switch (False -> exactly the
    #                             ORCH-043 behaviour: rebase only when behind). Env
    #                             ORCH_PREMERGE_REBASE_ALWAYS.
    #   task_deps_enabled      -> Level B (B-2): global kill-switch for the scheduler
    #                             dependency gate. False -> claim_next_job is 1:1 as
    #                             ORCH-1 (the NOT EXISTS clause is omitted). Inert when
    #                             job_deps is empty. Env ORCH_TASK_DEPS_ENABLED.
    #   task_deps_source       -> declaration source: db|plane|hybrid (default db).
    #                             The scheduler ALWAYS reads the DB cache (offline-safe
    #                             hot path); plane/hybrid additionally ingest Plane
    #                             `blocked-by` relations into job_deps at task creation.
    #                             Env ORCH_TASK_DEPS_SOURCE.
    premerge_rebase_always: bool = True
    task_deps_enabled: bool = True
    task_deps_source: str = "db"

    # ORCH-088 (Phase 1, serial e2e): per-repo serial gate. A new task's analyst-job
    # does NOT enter analysis (no branch cut, no analyst agent) while the same repo
    # has another unfinished task (tasks.stage != 'done') OR the repo is frozen
    # (repo_freeze). The gate lives in claim_next_job (offline-safe hot path, like
    # the ORCH-026 dep-gate) + the branch cut is deferred from start_pipeline to the
    # analyst-job claim (launcher) so the branch base is always a fresh origin/main
    # that already contains the predecessor (anti-stale-base, AC-6). All additive,
    # never-raise, restart-safe; STAGE_TRANSITIONS / QG_CHECKS unchanged. See
    # docs/work-items/ORCH-088/06-adr/ADR-001-serial-gate.md.
    #   serial_gate_enabled       -> kill-switch (env ORCH_SERIAL_GATE_ENABLED).
    #                                False -> claim_next_job AND start_pipeline are 1:1
    #                                as before ORCH-088 (clause omitted, branch cut in
    #                                start_pipeline) — zero regression (AC-7).
    #   serial_gate_repos         -> CSV scope (env ORCH_SERIAL_GATE_REPOS). Empty ->
    #                                applies to ALL registered repos (D5); non-empty ->
    #                                only the listed repos. Repo tokens are sanitised
    #                                (^[A-Za-z0-9._-]+$) before being embedded in SQL.
    #   serial_gate_freeze_enabled-> independent tumbler for the FR-5 rollback-freeze
    #                                layer (env ORCH_SERIAL_GATE_FREEZE_ENABLED). False
    #                                -> freeze is neither set (post-deploy DEGRADED) nor
    #                                consulted in the claim gate.
    serial_gate_enabled: bool = True
    serial_gate_repos: str = ""
    serial_gate_freeze_enabled: bool = True

    # ORCH-090: STOP-status task cancellation (stop active agent + full progress
    # reset) and the relaunch-hole close. A new logical Plane key `stop` (fail-closed,
    # absent from _DEFAULT_STATES) routes to a cancel handler that drives the task to
    # the new system-terminal state `cancelled` (stage + durable). Additive,
    # never-raise, restart-safe; STAGE_TRANSITIONS / QG_CHECKS / check_* / existing
    # status semantics are NOT touched. See
    # docs/work-items/ORCH-090/06-adr/ADR-001-stop-cancel-task.md and the cross-cutting
    # docs/architecture/adr/adr-0026-stop-cancel-task.md.
    #   stop_status_enabled -> kill-switch (env ORCH_STOP_STATUS_ENABLED). False ->
    #                          STOP handling AND the relaunch-hole gate are inert
    #                          (behaviour strictly as before ORCH-090 — zero
    #                          regression, AC-8).
    #   stop_status_repos   -> CSV scope (env ORCH_STOP_STATUS_REPOS). Empty -> applies
    #                          to ALL repos (cancellation is meaningful for enduro too);
    #                          non-empty -> only the listed repos. Tokens are sanitised
    #                          (^[A-Za-z0-9._-]+$) by the cancel leaf.
    stop_status_enabled: bool = True
    stop_status_repos: str = ""

    # ORCH-073 (ADR-001 Р-4): main-integrity regression guard. After the merge-verify
    # under-gate confirms the deployed SHA is an ancestor of origin/main (FR-1), a
    # secondary deterministic (no-LLM) guard checks that a declarative set of markers
    # for recently-merged tasks (MAIN_REGRESSION_MARKERS in merge_gate.py) is still
    # present in origin/main — i.e. a CHANGELOG-rebase or phantom-merge did not silently
    # roll back a neighbouring task's code. A missing marker (deterministic count==0) ->
    # ALERT + HOLD (task stays on `deploy`, NOT done); an infra/git error on the grep
    # itself -> fail-OPEN (do not block done; SHA-in-main remains the primary gate).
    #   regression_guard_enabled -> kill-switch (env ORCH_REGRESSION_GUARD_ENABLED);
    #                               reuses the merge_verify_applies scope (self-hosting /
    #                               merge_verify_repos), so non-self repos are a no-op.
    regression_guard_enabled: bool = True

    # ORCH-082 (ADR-001 Р-5): guarantee an open code-PR BEFORE the deterministic
    # merge_pr inside the merge-verify under-gate. The pipeline never guaranteed the
    # branch had an open PR (head==branch, base==main) at merge time — PRs are created
    # ONLY on the developer path with a fresh worktree commit (launcher._ensure_pr),
    # so a branch (e.g. after a manual main restore / a bounce with no new commits)
    # could reach merge-verify PR-less -> merge_pr returns "no open PR" -> a FALSE HOLD
    # that ORCH-073 fail-closed correctly catches but should never have to. The
    # idempotent leaf-actor merge_gate.ensure_open_pr creates/finds the code-PR BEFORE
    # merge_pr; ORCH-073's SHA-in-main proof is untouched and stays authoritative.
    #   merge_verify_autocreate_pr_enabled -> kill-switch (env
    #       ORCH_MERGE_VERIFY_AUTOCREATE_PR_ENABLED). False -> exactly the pre-ORCH-082
    #       behaviour (no auto-create; "no open PR" -> HOLD as before). Reuses the
    #       merge_verify_applies scope (self-hosting / merge_verify_repos) — no separate
    #       *_repos, since auto-create is semantically inseparable from merge-verify.
    merge_verify_autocreate_pr_enabled: bool = True

    # ORCH-089: auto-mode by Plane labels — autoApprove (BRD gate) + autoDeploy
    # (prod-deploy gate). Two HUMAN gates of the pipeline (analysis: wait for a
    # manual Approved; deploy Phase A: wait for a manual Confirm Deploy) are the
    # only blockers of an autonomous batch run (epic ORCH-088). ORCH-089 lifts ONLY
    # those two human decisions — selectively (a Plane label on the issue),
    # declaratively, reversibly, WITHOUT touching a single technical check. Additive
    # leaf (src/labels.py, never-raise) + two point insertions + flags;
    # STAGE_TRANSITIONS / QG_CHECKS / check_* / DB schema are NOT touched. See
    # docs/work-items/ORCH-089/06-adr/ADR-001-auto-label-gates.md.
    #   auto_label_enabled  -> global kill-switch for BOTH auto-modes (env
    #                          ORCH_AUTO_LABEL_ENABLED). False -> strictly the prior
    #                          behaviour (both gates manual), AND no new network call
    #                          on the gates (applies() returns False first, before
    #                          has_label is consulted) — zero regression (AC-8).
    #   auto_approve_label  -> Plane label name for the BRD gate (env
    #                          ORCH_AUTO_APPROVE_LABEL).
    #   auto_deploy_label   -> Plane label name for the deploy gate (env
    #                          ORCH_AUTO_DEPLOY_LABEL).
    #   auto_label_repos    -> CSV scope (env ORCH_AUTO_LABEL_REPOS). Empty ->
    #                          self-hosting only (orchestrator), the safe default
    #                          (the autoDeploy insertion lives in Phase A, which only
    #                          exists for the self-hosting repo). Non-empty -> only
    #                          the listed repos.
    #   auto_label_states_ttl_s -> TTL (seconds) of the per-project label-map cache
    #                          (mirrors plane_states_ttl_s); 0 -> lifetime cache.
    auto_label_enabled: bool = True
    auto_approve_label: str = "autoApprove"
    auto_deploy_label: str = "autoDeploy"
    auto_label_repos: str = ""
    auto_label_states_ttl_s: int = 300

    # Telegram notifications
    telegram_bot_token: str = ""
    telegram_chat_id: str = ""

    # ORCH-042: mode of the task live tracker.
    #   bump (the DEFAULT since ORCH-067) -> on update the old message is deleted
    #           and the card is re-sent at the bottom of the chat (deleteMessage +
    #           sendMessage + repoint message_id), silently (disable_notification).
    #   edit -> the card is edited in place (editMessageText); available via
    #           ORCH_TRACKER_MODE=edit.
    #   One card per task in both modes. An unknown/empty value is treated as
    #   edit (see notifications).
    tracker_mode: str = "bump"

    # ORCH-067 (ADR Р-2/Р-3/Р-4): best-effort live overlay for the card's status
    # line. Fills in the Plane-status branches that cannot be told apart offline
    # from tasks.stage (Needs Input / Blocked / Rejected / Cancelled / Deploying /
    # Monitoring after Deploy) by reading the LIVE Plane status with a short
    # timeout and a TTL cache. The offline core (stage -> status, In Review from
    # the brd-clock) always works without the network; the overlay only
    # complements it and NEVER blocks the pipeline.
    #   tracker_live_status         -> kill-switch (False -> offline core only).
    #   tracker_live_status_ttl_s   -> TTL of the per-issue live-uuid cache (hot-path protection).
    #   tracker_live_status_timeout_s -> timeout of a single live GET on the render path.
    tracker_live_status: bool = True
    tracker_live_status_ttl_s: int = 60
    tracker_live_status_timeout_s: int = 3

    # ORCH-087 (BR-G5, ADR-001 Р-6): cap for the human BRD-review time shown on the
    # done card ("твоё {review}"). The brd_review clock can stay open for hours on a
    # desync (In Review -> Backlog), which made "твоё время" report anomalous stalls
    # (ORCH-087: 392m). Above this cap the value is shown capped with a "~" marker so
    # an abnormal stall is never presented as real human review time. Env
    # ORCH_TRACKER_BRD_REVIEW_CAP_S; default 7200s (2h). 0/negative -> no cap.
    tracker_brd_review_cap_s: int = 7200

    # ORCH-076 (ORCH-52c, FR-2 / D3): kill-switch for STRICT frontmatter-schema
    # validation. The unified frontmatter contract (src/frontmatter.py) ships a
    # machine-checkable schema validator (REQUIRED_FIELDS), but by DEFAULT it is
    # warning-only and never influences any gate's boolean verdict (maybe_warn_schema
    # is inert). This flag is RESERVED for a future tightening (ORCH-52d, when agents
    # start emitting the full schema). It MUST stay False in prod / .env.staging —
    # otherwise ORCH-52c would self-block its own deploy (its docs predate the
    # schema). Env ORCH_FRONTMATTER_VALIDATION_STRICT; default False (zero behaviour
    # change). See docs/_standards/HANDOFF_PROTOCOL.md.
    frontmatter_validation_strict: bool = False

    # ORCH-069: QG-0 upper title-length limit (entry gate _qg0_errors). The 80-char
    # cap was a hygiene limit, not structural (slug is cut to [:30] independently,
    # DB title TEXT is unbounded). Configurable via env ORCH_QG0_TITLE_MAX; default
    # 200 (was hardcoded 80). Invalid/empty value -> default (graceful, no crash).
    qg0_title_max: int = 200

    @field_validator("qg0_title_max", mode="before")
    @classmethod
    def _qg0_title_max_default(cls, v):
        # Graceful (ORCH-069 AC-3): empty / non-numeric env -> default 200, the
        # process must not crash on startup. Never raises (self-hosting safety).
        try:
            if v is None or (isinstance(v, str) and v.strip() == ""):
                return 200
            return int(v)
        except (TypeError, ValueError):
            return 200

    class Config:
        # Settings-loading options for the enclosing settings class.
        # A local ".env" file is configured as a source of values.
        env_file = ".env"
        # Environment variable names carry the "ORCH_" prefix
        # (e.g. ORCH_QG0_TITLE_MAX for qg0_title_max).
        env_prefix = "ORCH_"


# Module-level Settings instance, constructed when this module is imported.
settings = Settings()