# orchestrator/src/config.py

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """Runtime configuration for the orchestrator.

    Each field is read from the environment variable ``ORCH_<FIELD_NAME>``
    (for example ``max_concurrency`` <- ``ORCH_MAX_CONCURRENCY``) or from a
    ``.env`` file, as declared in ``model_config`` at the bottom of the class.
    The literal assigned to a field is the default used when no value is
    supplied.
    """

    # Plane
    plane_api_url: str = "http://localhost:8091"
    # ORCH-017: external (browser) web URL of Plane for clickable issue links in
    # notifications, e.g. https://plane.example.org. Falls back to plane_api_url,
    # but a loopback fallback (localhost/127.0.0.1) is treated as "no web URL" and
    # the Plane link is omitted (see notifications._build_plane_issue_link).
    plane_web_url: str = ""
    plane_api_token: str = ""
    plane_workspace_slug: str = ""
    plane_webhook_secret: str = ""
    plane_project_id: str = ""

    # Per-agent Plane bot tokens (feat: per-agent comment authorship).
    # When set, add_comment posts under the matching bot so Plane shows the
    # real author (Analyst/Architect/...). Empty -> fallback to plane_api_token.
    plane_bot_analyst: str = ""
    plane_bot_architect: str = ""
    plane_bot_developer: str = ""
    plane_bot_reviewer: str = ""
    plane_bot_tester: str = ""
    plane_bot_deployer: str = ""
    plane_bot_stream: str = ""

    # Gitea
    gitea_url: str = "http://localhost:3000"
    # External URL for clickable links in comments; falls back to gitea_url.
    gitea_public_url: str = ""
    gitea_token: str = ""
    gitea_webhook_secret: str = ""
    gitea_owner: str = "admin"
    default_repo: str = "enduro-trails"

    # ORCH-6: multi-repo project registry. JSON array of
    #   {plane_project_id, repo, work_item_prefix, name}.
    # Empty -> built-in default registry in src/projects.py.
    projects_json: str = ""

    # Claude CLI
    claude_bin: str = "/opt/claude-code/bin/claude.exe"
    repos_dir: str = "/repos"
    host_repos_dir: str = "/home/slin/repos"
    # ORCH-2 / S-4: isolated worktree per task/branch.
    worktrees_dir: str = "/repos/_wt"

    # DB
    db_path: str = "/app/data/orchestrator.db"

    # ORCH-1 (F-2b): persistent job queue / background worker.
    # max_concurrency  -> max agent jobs running in parallel (env ORCH_MAX_CONCURRENCY)
    # queue_poll_interval -> worker loop poll seconds (env ORCH_QUEUE_POLL_INTERVAL)
    max_concurrency: int = 1
    queue_poll_interval: float = 2.0

    # ORCH-1b (resilience): preflight + 429/rate-limit + backoff + circuit breaker.
    # preflight_cache_ttl  -> cache the cheap CLI/network preflight result (seconds);
    #                         the worker does NOT re-run `claude --version` more often
    #                         than this (env ORCH_PREFLIGHT_CACHE_TTL).
    # backoff_base_seconds -> base for exponential transient backoff.
    # backoff_max_seconds  -> ceiling for the transient backoff.
    # transient_max_attempts -> retry budget for transient (429/overload/network)
    #                         failures, separate from code-fault `attempts`.
    # breaker_threshold    -> consecutive transient failures that OPEN the breaker.
    # breaker_pause_seconds -> how long the breaker stays open before half-open.
    preflight_cache_ttl: int = 45
    # ORCH-044 (P1): token-free preflight auth gate. After `claude --version`
    # succeeds, preflight also checks that claude is logged in by reading the
    # local OAuth credentials file (no network / no prompt-ping — BR-1).
    #   preflight_check_auth     -> master toggle (env ORCH_PREFLIGHT_CHECK_AUTH).
    #                               Emergency off-switch if the check ever
    #                               false-positives and wedges the shared queue.
    #   claude_credentials_path  -> explicit path to .credentials.json
    #                               (env ORCH_CLAUDE_CREDENTIALS_PATH). Empty ->
    #                               <AGENT_HOME>/.claude/.credentials.json, where
    #                               AGENT_HOME is the HOME the launcher really
    #                               spawns claude under (/home/slin), NOT the
    #                               orchestrator process env.
    #   auth_expiry_skew_seconds -> clock-drift slack when comparing
    #                               claudeAiOauth.expiresAt (env
    #                               ORCH_AUTH_EXPIRY_SKEW_SECONDS); a token within
    #                               this many seconds of now is treated as expired.
    preflight_check_auth: bool = True
    claude_credentials_path: str = ""
    auth_expiry_skew_seconds: int = 0
    backoff_base_seconds: int = 10
    backoff_max_seconds: int = 600
    transient_max_attempts: int = 5
    breaker_threshold: int = 3
    breaker_pause_seconds: int = 300

    # ORCH-7 (M-2): agent timeout + graceful kill.
    # agent_timeout_seconds   -> default per-agent wall-clock budget; the watchdog
    #                            kills the run after this (env ORCH_AGENT_TIMEOUT_SECONDS).
    # agent_kill_grace_seconds-> pause between SIGTERM and SIGKILL so claude can
    #                            flush artifacts before the hard kill
    #                            (env ORCH_AGENT_KILL_GRACE_SECONDS).
    # agent_timeout_overrides_json -> optional per-agent override JSON object,
    #                            e.g. {"reviewer": 3600, "architect": 2700}
    #                            (env ORCH_AGENT_TIMEOUT_OVERRIDES_JSON).
    agent_timeout_seconds: int = 1800
    agent_kill_grace_seconds: int = 20
    agent_timeout_overrides_json: str = ""

    # ORCH-41: per-agent LLM model. Empty -> agent_model_default. Resolution order:
    # project-override (projects_json agent_models) > ORCH_AGENT_MODEL_<AGENT> >
    # agent_model_default > CLI default (no --model flag). Default is 4-8 because
    # 4-7 == 4-8 in price (Slava 05.06); do NOT hardcode the version anywhere else.
    agent_model_default: str = "claude-opus-4-8"
    agent_model_analyst: str = ""
    agent_model_architect: str = ""
    agent_model_developer: str = ""
    agent_model_reviewer: str = ""
    agent_model_tester: str = ""
    agent_model_deployer: str = ""

    # ORCH-41: per-agent effort / reasoning level: low|medium|high|xhigh|max.
    # Empty -> agent_effort_default. Same resolution order as model. Default split:
    # thinking agents (analyst/architect/developer/reviewer) -> high; mechanical
    # agents (tester/deployer) -> medium.
    agent_effort_default: str = "high"
    agent_effort_analyst: str = "high"
    agent_effort_architect: str = "high"
    agent_effort_developer: str = "high"
    agent_effort_reviewer: str = "high"
    agent_effort_tester: str = "medium"
    agent_effort_deployer: str = "medium"

    # ORCH-41: optional fallback model used when the primary is overloaded
    # (--fallback-model, works with --print). Empty -> no flag.
    # NOTE(review): only this single field is defined here, so the value is
    # shared by all agents unless a per-agent override exists elsewhere.
    agent_fallback_model: str = ""

    # L-2: run-log rotation. Old per-run logs in <data>/runs/*.log are pruned at
    # app startup (best-effort). A *.log is removed if it is older than
    # log_keep_days OR not within the log_keep_max most-recent logs (whichever
    # hits first). Only *.log files are touched; the active run log is skipped.
    #   log_keep_days -> max age in days (env ORCH_LOG_KEEP_DAYS).
    #   log_keep_max  -> max number of newest logs to retain (env ORCH_LOG_KEEP_MAX).
    log_keep_days: int = 30
    log_keep_max: int = 500

    # ORCH-045: quality-gate CI poll/retry. check_ci_green polls the Gitea
    # combined commit status up to ci_poll_max_attempts times, sleeping
    # ci_poll_interval_s between attempts, to ride out a transient pending
    # state right after the developer push (race fix, see ORCH-017).
    #   ci_poll_max_attempts -> max status polls (env ORCH_CI_POLL_MAX_ATTEMPTS)
    #   ci_poll_interval_s   -> seconds between polls (env ORCH_CI_POLL_INTERVAL_S)
    ci_poll_max_attempts: int = 12
    ci_poll_interval_s: int = 10

    # Telegram notifications
    telegram_bot_token: str = ""
    telegram_chat_id: str = ""

    # Pydantic v2 settings configuration. This replaces the v1-style nested
    # `class Config`, which Pydantic v2 still honours but reports as deprecated.
    # A plain mapping is accepted here (the config type is a TypedDict), so no
    # additional import is required; the keys and values are unchanged.
    model_config = {
        "env_prefix": "ORCH_",
        "env_file": ".env",
    }


# Shared configuration instance, built once when this module is first imported.
settings: Settings = Settings()