# orchestrator/.env.example

ORCH_PLANE_API_URL=http://plane-app-api-1:8000
# External (browser) web URL of Plane for clickable issue links in notifications
# (ORCH-017). Empty -> falls back to ORCH_PLANE_API_URL; if that fallback is a
# loopback address it is treated as "no web URL" and the Plane link is omitted.
# Example: https://plane.example.org
ORCH_PLANE_WEB_URL=
ORCH_PLANE_API_TOKEN=
ORCH_PLANE_WORKSPACE_SLUG=
# Webhook secrets (ORCH_PLANE_WEBHOOK_SECRET, ORCH_GITEA_WEBHOOK_SECRET) are
# GENERATED PER HOST: python3 scripts/gen_secrets.py
# (ORCH-101 / AC-5: production secrets are NEVER copied to a new host).
ORCH_PLANE_WEBHOOK_SECRET=
ORCH_GITEA_URL=http://localhost:3000
# External (browser) URL of Gitea for clickable Branch/PR links in comments;
# empty -> falls back to ORCH_GITEA_URL.
ORCH_GITEA_PUBLIC_URL=
ORCH_GITEA_TOKEN=
ORCH_GITEA_WEBHOOK_SECRET=
ORCH_GITEA_OWNER=admin
# Per-agent Plane bot tokens (optional): when set, comments are posted under
# the matching bot so Plane shows the real author; empty -> ORCH_PLANE_API_TOKEN.
ORCH_PLANE_BOT_ANALYST=
ORCH_PLANE_BOT_ARCHITECT=
ORCH_PLANE_BOT_DEVELOPER=
ORCH_PLANE_BOT_REVIEWER=
ORCH_PLANE_BOT_TESTER=
ORCH_PLANE_BOT_DEPLOYER=
ORCH_PLANE_BOT_STREAM=
# Telegram live-tracker / alerts (empty -> notifications are logged, not sent).
ORCH_TELEGRAM_BOT_TOKEN=
ORCH_TELEGRAM_CHAT_ID=
# ORCH-6: project registry — JSON array of {plane_project_id, repo,
# work_item_prefix, name}. Empty -> built-in default registry (src/projects.py)
# whose Plane UUIDs belong to the ORIGINAL host. On a NEW host this key is
# MANDATORY (ORCH-101 replication checklist, docs/operations/REPLICATION.md).
ORCH_PROJECTS_JSON=
ORCH_CLAUDE_BIN=/usr/bin/claude
ORCH_DB_PATH=/app/data/orchestrator.db

# ── ORCH-101: host parametrization (replication foundation, ADR-001 D1–D7) ───
# Every host-specific value lives HERE (defaults = the current production host;
# an empty/absent value keeps behaviour 1:1). The same names are read by BOTH
# pydantic Settings (env_file) and docker-compose ${VAR:-default} interpolation
# (compose reads .env/shell, NOT a service's env_file). Full variable map and
# the new-host procedure: docs/operations/REPLICATION.md.
#   AGENT_HOME_DIR -> HOME of all actor subprocesses (agents/finalizer/monitor)
#                     AND the target of the .claude/.claude.json/.ssh mounts AND
#                     Dockerfile ARG APP_HOME (ORCH-040 group moves together).
#   AGENT_GIT_NAME / GIT_EMAIL_DOMAIN -> git identity of agent commits; system
#                     actors keep platform names deploy-finalizer/post-deploy-
#                     monitor under the same domain.
#   STAGING_PORT   -> staging instance port; image_freshness fail-closes when it
#                     equals the prod port (ORCH-058 AC-9 guard).
#   HOST_*         -> host-side sources of the bind mounts (repos, ~/.claude,
#                     ~/.claude.json, ssh keydir, claude-code dist, node binary).
#   RUN_UID/RUN_GID/DOCKER_GID -> container uid:gid + host docker group for
#                     docker.sock access (group_add; pitfall «МИНА 1» / "MINE 1", ORCH-040).
ORCH_AGENT_HOME_DIR=/home/slin
ORCH_AGENT_GIT_NAME=claude-bot
ORCH_GIT_EMAIL_DOMAIN=mva154.local
ORCH_STAGING_PORT=8501
ORCH_HOST_REPOS_DIR=/home/slin/repos
ORCH_HOST_CLAUDE_DIR=/home/slin/.claude
ORCH_HOST_CLAUDE_JSON=/home/slin/.claude.json
ORCH_HOST_SSH_DIR=/home/slin/.orchestrator-ssh
ORCH_HOST_CLAUDE_CODE_DIR=/usr/lib/node_modules/@anthropic-ai/claude-code
ORCH_HOST_NODE_BIN=/usr/bin/node
ORCH_RUN_UID=1000
ORCH_RUN_GID=1000
ORCH_DOCKER_GID=999

# ── Agent model / effort / fallback (ORCH-41, validation ORCH-74) ─────────────
# Per-agent LLM model + reasoning effort, resolved by launcher.resolve_agent_*.
# Resolution priority (per agent): project-override (projects_json agent_models/
# agent_efforts) > ORCH_AGENT_MODEL_<AGENT> / ORCH_AGENT_EFFORT_<AGENT> >
# ORCH_AGENT_MODEL_DEFAULT / ORCH_AGENT_EFFORT_DEFAULT > CLI default (no flag).
# The frontmatter `model:` in .openclaw/agents/*.md is DESCRIPTIVE only and is NOT
# read — config below is the single source of truth for the model (ORCH-74 G1).
#
# ORCH-74 (G2): a resolved MODEL name is validated (^claude-…$ format check) before
# it reaches --model. A structurally invalid name (typo, gpt-4, empty) is logged and
# the next valid level is used (in the limit: no --model flag). Forward-compatible:
# a future claude-* version passes without editing any allowlist. EFFORT is validated
# against low|medium|high|xhigh|max (ORCH-41); an invalid effort is dropped.
#
# All 6 agents resolve to claude-opus-4-8 (model-routing G3 NOT enabled). Leave the
# per-agent overrides empty to use the default. Do NOT hardcode the model version
# anywhere except ORCH_AGENT_MODEL_DEFAULT.
ORCH_AGENT_MODEL_DEFAULT=claude-opus-4-8
ORCH_AGENT_MODEL_ANALYST=
ORCH_AGENT_MODEL_ARCHITECT=
ORCH_AGENT_MODEL_DEVELOPER=
ORCH_AGENT_MODEL_REVIEWER=
ORCH_AGENT_MODEL_TESTER=
ORCH_AGENT_MODEL_DEPLOYER=
# Effort split (ORCH-081/ORCH-52h): thinking agents (analyst/architect/reviewer)
# -> high; developer -> xhigh (coding/agentic role, Opus 4.8 canon); mechanical
# agents (tester/deployer) -> medium. NB: an empty ORCH_AGENT_EFFORT_*= no longer
# zeroes the effort — the launcher falls back to a per-role floor (= the config.py
# class-default) so each role still runs at its canonical level (ORCH-081).
ORCH_AGENT_EFFORT_DEFAULT=high
ORCH_AGENT_EFFORT_ANALYST=high
ORCH_AGENT_EFFORT_ARCHITECT=high
ORCH_AGENT_EFFORT_DEVELOPER=xhigh
ORCH_AGENT_EFFORT_REVIEWER=high
ORCH_AGENT_EFFORT_TESTER=medium
ORCH_AGENT_EFFORT_DEPLOYER=medium
# Optional --fallback-model used when the primary is overloaded. Empty -> no flag
# (G4 NOT enabled, ADR-001 ORCH-74: determinism — all agents stay on opus-4-8). A
# non-empty value is validated by the SAME predicate as the model; a typo is dropped.
ORCH_AGENT_FALLBACK_MODEL=
# ORCH-042/ORCH-067: live-tracker mode. bump (DEFAULT since ORCH-067) -> on every
# update the old card is deleted and a fresh one is sent silently to the BOTTOM of
# the chat (deleteMessage + sendMessage + repoint), so the current status is always
# the last message in an active chat. edit -> the task card is edited in place
# (editMessageText). One card per task in both modes. Any value other than "bump"
# (incl. empty/garbage) -> edit.
ORCH_TRACKER_MODE=bump
# ORCH-067: best-effort live-overlay for the card status line. The offline core
# (stage -> Plane status, In Review from the brd-clock) always works without network;
# the overlay only fills in branches indistinguishable offline (Needs Input / Blocked /
# Rejected / Cancelled / Deploying / Monitoring after Deploy) by reading the LIVE Plane
# status with a short timeout + per-issue TTL cache. It NEVER blocks the pipeline and
# NEVER raises.
#   LIVE_STATUS         -> kill-switch (false -> offline core only).
#   LIVE_STATUS_TTL_S   -> TTL (seconds) of the per-issue live-uuid cache (hot-path guard).
#   LIVE_STATUS_TIMEOUT_S -> timeout (seconds) of a single live-GET on the render path.
ORCH_TRACKER_LIVE_STATUS=true
ORCH_TRACKER_LIVE_STATUS_TTL_S=60
ORCH_TRACKER_LIVE_STATUS_TIMEOUT_S=3
# ORCH-043: merge-gate (auto-rebase onto current origin/main + re-test + merge-lock)
# on the deploy-staging -> deploy edge. Deterministic sub-gate (no LLM) that catches
# the branch up to the CURRENT origin/main, re-tests it, and serialises merges so two
# green parallel branches can't break main.
#   ENABLED   -> global kill-switch (false -> whole gate is a no-op pass).
#   REPOS     -> CSV of repos where the gate is REAL; empty -> only the self-hosting
#                repo (orchestrator); other repos -> conditional no-op (mirrors ORCH-35).
#   RETEST_TIMEOUT_S -> wall-clock budget for the post-rebase re-test.
#   RETEST_TARGET    -> pytest target for the re-test.
#   LOCK_TIMEOUT_S   -> max merge-lease age before a stale lease is reclaimed.
#   DEFER_DELAY_S    -> delay before re-running the gate when the lock is busy.
#   DEFER_MAX_ATTEMPTS -> defer retries before escalation (avoids livelock).
ORCH_MERGE_GATE_ENABLED=true
ORCH_MERGE_GATE_REPOS=
ORCH_MERGE_RETEST_TIMEOUT_S=600
ORCH_MERGE_RETEST_TARGET=tests/
ORCH_MERGE_LOCK_TIMEOUT_S=300
ORCH_MERGE_DEFER_DELAY_S=60
ORCH_MERGE_DEFER_MAX_ATTEMPTS=5
# ORCH-026 Level A: unconditional pre-merge rebase. With the flag ON (default),
# check_branch_mergeable ALWAYS rebases the branch onto origin/main under the held
# merge-lease (not only when behind) — a deterministic structural anti-phantom on
# the scheduler edge. No-op on an up-to-date branch (rebase keeps HEAD, force-with-
# lease -> "Everything up-to-date", CI not triggered). Scope = ORCH_MERGE_GATE_REPOS.
#   PREMERGE_REBASE_ALWAYS=false -> strictly pre-ORCH-026 (rebase only when behind).
ORCH_PREMERGE_REBASE_ALWAYS=true
# ORCH-026 Level B: declarative task dependencies ("B waits for A"). claim_next_job
# gates jobs whose depends-on tasks are not yet 'done' (additive job_deps table,
# NOT EXISTS) WITHOUT occupying a max_concurrency slot. Inert on an empty job_deps.
#   TASK_DEPS_ENABLED=false -> claim query is 1:1 the ORCH-1 query (no gate).
#   TASK_DEPS_SOURCE=db|plane|hybrid -> declaration source; db (default) never calls
#     Plane on the hot path; plane/hybrid ingest Plane `blocked-by` relations and
#     cache them into job_deps (the scheduler then reads only the DB).
ORCH_TASK_DEPS_ENABLED=true
ORCH_TASK_DEPS_SOURCE=db
# ORCH-088 (Stage 1, serial e2e): per-repo serial gate. A NEW task's analyst-job does
# NOT enter analysis (no branch cut, no analyst) while the same repo has an EARLIER
# unfinished task (FIFO, tasks.id < the job's task) OR the repo is frozen. The branch
# cut is DEFERRED from start_pipeline to the analyst-job claim so its base is a fresh
# origin/main already containing the predecessor (anti-stale-base). Gate lives in
# claim_next_job (offline hot-path, fail-OPEN on error); freeze (FR-5) is a durable
# repo_freeze row set on post-deploy DEGRADED, cleared manually via
# POST /serial-gate/unfreeze?repo=<repo>. Leaf src/serial_gate.py (never-raise).
#   SERIAL_GATE_ENABLED=false -> claim AND start_pipeline are 1:1 as before ORCH-088.
#   SERIAL_GATE_REPOS (CSV) -> scope; EMPTY = ALL repos (not self-hosting-only).
#   SERIAL_GATE_FREEZE_ENABLED=false -> the rollback-freeze layer is off (not set/read).
ORCH_SERIAL_GATE_ENABLED=true
ORCH_SERIAL_GATE_REPOS=
ORCH_SERIAL_GATE_FREEZE_ENABLED=true
# ORCH-090: STOP-status task cancellation (stop active agent + full progress reset)
# and the relaunch-hole close. A dedicated Plane "STOP" status (logical key `stop`,
# fail-closed: absent from _DEFAULT_STATES, so a board without the status -> no-op)
# routes to a cancel handler that drives the task to the system-terminal state
# `cancelled` (stop agent via the graceful SIGTERM cascade, cancel all jobs, remove
# worktree + delete the remote feature branch [never main / never force-push],
# tombstone the natural keys for a clean re-create via "To Analyse"; docs preserved).
# STOP during a critical merge/deploy window is DEFERRED until the irreversible step
# finishes honestly. The relaunch-hole gate restricts the "To Analyse" agent relaunch
# to the `analysis` stage (the sole Needs-Input owner). Additive, never-raise.
# Infra precondition: create a "STOP" status with the `cancelled` group on the ORCH
# board (07-infra-requirements.md). Leaf src/cancel.py.
#   STOP_STATUS_ENABLED=false -> STOP handling AND the relaunch-hole gate are inert
#                                (behaviour strictly as before ORCH-090).
#   STOP_STATUS_REPOS (CSV)   -> scope; EMPTY = ALL repos (cancellation is meaningful
#                                for enduro too).
ORCH_STOP_STATUS_ENABLED=true
ORCH_STOP_STATUS_REPOS=
# ORCH-019: bug-fast-track — a cheaper/shorter pipeline route for bug-fix tasks.
# A task carrying the Plane `Bug` label skips the whole `architecture` stage; EVERY
# Quality Gate / sub-gate runs UNCHANGED (route is a scheduler property, not a gate).
# Additive, never-raise, fail-safe -> full cycle. Infra precondition: create a `Bug`
# label on the ORCH board (its absence = full cycle, fail-safe). Leaf src/bug_fast_track.py.
#   BUG_FAST_TRACK_ENABLED=false -> start_pipeline AND advance_stage are 1:1 as before
#                                   ORCH-019 (zero regression).
#   BUG_FAST_TRACK_LABEL         -> Plane label that activates the track (default `Bug`).
#   BUG_FAST_TRACK_REPOS (CSV)   -> scope; EMPTY = self-hosting only (orchestrator).
ORCH_BUG_FAST_TRACK_ENABLED=true
ORCH_BUG_FAST_TRACK_LABEL=Bug
ORCH_BUG_FAST_TRACK_REPOS=
# ORCH-094: terminal-window-aware guard for the three deploy-phase Plane status
# setters (set_issue_awaiting_deploy / set_issue_deploying / set_issue_monitoring).
# A DB stage=done task converges to Done idempotently instead of flapping
# Awaiting <-> Monitoring, EXCEPT the legitimate post-deploy Monitoring while the
# window is active (ARMED & not DONE). Leaf src/deploy_status_guard.py, never-raise;
# STAGE_TRANSITIONS / QG_CHECKS / machine-verdict keys untouched (no DB migration).
#   DEPLOY_STATUS_GUARD_ENABLED=false -> setters are terminal-blind (1:1 pre-ORCH-094).
#   DEPLOY_STATUS_GUARD_REPOS (CSV)   -> scope; EMPTY = self-hosting only (orchestrator),
#                                        the only repo where deploy-phase statuses are set.
ORCH_DEPLOY_STATUS_GUARD_ENABLED=true
ORCH_DEPLOY_STATUS_GUARD_REPOS=
# ORCH-071/073: merge-verify under-gate on the `deploy -> done` edge (inline hook in
# advance_stage, NOT a new STAGE_TRANSITIONS edge / registered QG). A deterministic
# merge-actor merges the feature code-PR via the Gitea PR-merge API (never push/
# force-push to main), then `done` is allowed ONLY when the deployed SHA is proven an
# ancestor of origin/main (ORCH-073 FR-1: SHA-in-main is the single criterion; a
# merged PR alone no longer confirms). A secondary regression guard then checks a
# declarative marker set (MAIN_REGRESSION_MARKERS) is still in origin/main; a missing
# marker -> alert + HOLD (NOT done), a git error of the grep itself -> fail-open.
#   MERGE_VERIFY_ENABLED      -> global kill-switch (false -> strictly pre-ORCH-071).
#   MERGE_VERIFY_REPOS        -> CSV of repos where the under-gate is REAL; empty ->
#                                only the self-hosting repo (orchestrator); non-self -> no-op.
#   MERGE_PR_TIMEOUT_S        -> per Gitea list/merge HTTP call timeout.
#   MERGE_VERIFY_TIMEOUT_S    -> git fetch/merge-base timeout for the ancestor + marker checks.
#   REGRESSION_GUARD_ENABLED  -> kill-switch for the ORCH-073 main-integrity regression
#                                guard (false -> SHA-in-main alone gates done); reuses the
#                                merge-verify scope, so non-self repos are a no-op.
#   MERGE_VERIFY_AUTOCREATE_PR_ENABLED -> ORCH-082: guarantee an open code-PR
#                                (head==branch, base==main) via merge_gate.ensure_open_pr
#                                BEFORE the deterministic merge_pr (fixes the false HOLD
#                                "no open PR"). false -> exactly pre-ORCH-082 behaviour.
#                                Reuses the merge-verify scope; non-self repos -> no-op.
ORCH_MERGE_VERIFY_ENABLED=true
ORCH_MERGE_VERIFY_REPOS=
ORCH_MERGE_PR_TIMEOUT_S=60
ORCH_MERGE_VERIFY_TIMEOUT_S=60
ORCH_REGRESSION_GUARD_ENABLED=true
ORCH_MERGE_VERIFY_AUTOCREATE_PR_ENABLED=true
# ORCH-093: deterministic merge-actor retry of TRANSIENT Gitea merge errors. merge_pr
# wraps ONLY the mutating POST /pulls/{n}/merge in a bounded exponential-backoff
# retry-loop on transient outcomes (405 "try again later" / 408 / 5xx / network /
# timeout, and 409|422 while the PR is still mergeable); terminal outcomes
# (403/404/real conflict) -> fast honest False (the ORCH-071/081 HOLD backstop is
# unchanged). Fixes the ORCH-063 false HOLD + manual re-merge. The already-in-main
# guard (no commits beyond origin/main -> no garbage PR) is always-on under
# MERGE_VERIFY_AUTOCREATE_PR_ENABLED (no separate flag).
#   MERGE_RETRY_ENABLED        -> kill-switch; false -> exactly one POST (one-shot, prior behaviour).
#   MERGE_RETRY_MAX_ATTEMPTS   -> max POST attempts on a transient outcome.
#   MERGE_RETRY_BACKOFF_BASE_S -> exponential backoff base seconds (sleep = base*2^(i-1)).
#   MERGE_RETRY_BACKOFF_MAX_S  -> per-sleep backoff ceiling seconds (bounds total wait).
ORCH_MERGE_RETRY_ENABLED=true
ORCH_MERGE_RETRY_MAX_ATTEMPTS=3
ORCH_MERGE_RETRY_BACKOFF_BASE_S=2
ORCH_MERGE_RETRY_BACKOFF_MAX_S=5
# ORCH-036: executable self-deploy of the `deploy` stage. For the self-hosting repo
# (orchestrator) the stage REALLY restarts prod (8500) via a detached host hook;
# deploy_status: SUCCESS means proven health-ok, not an LLM declaration. Three
# deterministic phases (A: request approve, B: human Approved -> detached deploy,
# C: finalizer maps hook exit-code -> deploy_status). Non-self repos: unchanged
# synchronous ssh deploy. SECRETS / host paths live ONLY on the host — do NOT commit.
#   SELF_DEPLOY_ENABLED -> global kill-switch (false -> legacy synchronous deploy for all).
#   SELF_DEPLOY_REPOS   -> CSV of repos where Phase A/B/C is REAL; empty -> only the
#                          self-hosting repo (orchestrator); others -> no-op (mirrors ORCH-35).
#   DEPLOY_REQUIRE_MANUAL_APPROVE -> require a human Plane "Approved" before the prod
#                          deploy (true on rollout; full auto is ORCH-54).
#   DEPLOY_FINALIZE_DELAY_S       -> delay before the first/each finalize poll (>= hook+health).
#   DEPLOY_FINALIZE_MAX_ATTEMPTS  -> bounded finalize-defer budget (anti-livelock).
#   DEPLOY_SSH_USER / DEPLOY_SSH_HOST -> ssh target for the host hook (DEPLOY_SSH_HOST
#                          empty -> detached deploy will NOT launch; set on the host).
#   DEPLOY_HOOK_SCRIPT            -> path to the hook ON THE HOST (relative to the repo).
#   DEPLOY_HOST_REPO_PATH         -> orchestrator clone path on the host.
#   DEPLOY_PROD_SOURCE_IMAGE      -> staging-validated image, retagged build-once (no rebuild).
#   DEPLOY_PROD_TARGET_SERVICE / _PORT / _IMAGE / _COMPOSE_PROFILE -> prod compose profile.
#   DEPLOY_PROD_PREV_IMAGE_FILE   -> prod prev-image snapshot (separate from staging's).
ORCH_SELF_DEPLOY_ENABLED=true
ORCH_SELF_DEPLOY_REPOS=
ORCH_DEPLOY_REQUIRE_MANUAL_APPROVE=true
ORCH_DEPLOY_FINALIZE_DELAY_S=90
ORCH_DEPLOY_FINALIZE_MAX_ATTEMPTS=10
ORCH_DEPLOY_SSH_USER=slin
ORCH_DEPLOY_SSH_HOST=
ORCH_DEPLOY_HOOK_SCRIPT=scripts/orchestrator-deploy-hook.sh
ORCH_DEPLOY_HOST_REPO_PATH=/home/slin/repos/orchestrator
ORCH_DEPLOY_PROD_SOURCE_IMAGE=orchestrator-orchestrator-staging
ORCH_DEPLOY_PROD_TARGET_SERVICE=orchestrator
ORCH_DEPLOY_PROD_TARGET_PORT=8500
ORCH_DEPLOY_PROD_TARGET_IMAGE=orchestrator-orchestrator
ORCH_DEPLOY_PROD_COMPOSE_PROFILE=
ORCH_DEPLOY_PROD_PREV_IMAGE_FILE=.deploy-prev-image-prod

# ORCH-058: staging-image provenance before the BUILD-ONCE prod retag (INV-FRESH).
# Guarantees the staging image promoted to prod is the EXACT artefact rebuilt from the
# validated commit — two layers, self-hosting only:
#   A (liveness): QG sub-check `check_staging_image_fresh` on the deploy-staging->deploy
#     edge rebuilds orchestrator-orchestrator-staging from the validated commit + recreates
#     8501; FAIL -> rollback to development. (builds/recreate STAGING only, never prod.)
#   B (safety):  the Dockerfile stamps `org.opencontainers.image.revision`; the prod hook
#     fail-closes (exit 1) before `docker tag` if SOURCE_IMAGE's label != EXPECTED_REVISION.
#   ENABLED -> single kill-switch for A+B as a WHOLE (never "B without A"); false -> legacy.
#   REPOS   -> CSV of repos where the gate is REAL; empty -> only self-hosting (orchestrator).
ORCH_IMAGE_FRESHNESS_ENABLED=true
ORCH_IMAGE_FRESHNESS_REPOS=

# ORCH-061: staging-verdict tolerance to sandbox-infra-only FAILs. The self-hosting
# orchestrator looped on deploy-staging because staging_check.py exited 1 on ANY FAIL,
# so two infra-only checks (C9a sandbox branch / C9b analyst-job — caused by SANDBOX
# bot accounts not being members of the sandbox Plane project, NOT a pipeline regression)
# forced staging_status: FAILED -> rollback -> loop. With this ON, C9a/C9b are WAIVED
# to SUCCESS when every REAL check is green; any REAL failure still fails closed.
#   true (default) -> tolerant; false -> legacy strict (1:1 pre-ORCH-061, any FAIL rolls back).
# Lives in .env.staging (the staging instance). CLI --strict overrides this per-run.
ORCH_STAGING_INFRA_TOLERANCE_ENABLED=true

# ORCH-053: stuck-task reconciler (sweeper for lost webhooks). A background daemon
# replays a missed stage transition through the SAME gates/handlers a webhook would,
# fixing tasks that got stuck on a dropped event (502 on rebuild, no Plane/Gitea
# retries, unresolved sha->branch).
#   ENABLED            -> global kill-switch (self-hosting safety / staged rollout).
#   PLANE_ENABLED      -> separate flag for the F-2 Plane-API poll (mute only F-2).
#   INTERVAL_S         -> background sweep period (seconds).
#   GRACE_DEFAULT_S    -> default "stuck" threshold on tasks.updated_at (seconds).
#   GRACE_OVERRIDES_JSON -> per-stage thresholds, e.g. {"development":300}; bad JSON -> default.
#   NOTIFY_UNBLOCK     -> send a Telegram message when a stuck task is unblocked.
#   SKIP_BLOCKED_ENABLED -> ORCH-060 F-1 Guard 2: skip reconciling issues a human moved
#                        to Blocked / Needs Input (per-candidate Plane state lookup).
#                        false mutes ONLY the networked Guard 2; Guard 1 (escalated by
#                        developer retries, local+deterministic) is always active.
ORCH_RECONCILE_ENABLED=true
ORCH_RECONCILE_PLANE_ENABLED=true
ORCH_RECONCILE_INTERVAL_S=120
ORCH_RECONCILE_GRACE_DEFAULT_S=600
ORCH_RECONCILE_GRACE_OVERRIDES_JSON=
ORCH_RECONCILE_NOTIFY_UNBLOCK=true
ORCH_RECONCILE_SKIP_BLOCKED_ENABLED=true

# ORCH-068: TTL (seconds) for the per-project Plane states cache (plane_sync
# _STATES_CACHE). Historically the cache lived for the whole process lifetime,
# so a status added to Plane after start was invisible until a restart
# ("stale set -> no pipeline action"). With a TTL the entry self-heals by
# re-fetching /states/ once it expires (reuses reload_project_states()).
#   >0  -> re-fetch after this many seconds (default 300 = 5 min);
#   0   -> disable TTL -> strictly the previous lifetime cache (back-compat).
ORCH_PLANE_STATES_TTL_S=300

# ORCH-065: job-reaper + proactive merge-lease reclaim. A background daemon thread
# (src/job_reaper.py, started LAST in main.lifespan after requeue_running_jobs) reaps
# zombie 'running' jobs whose monitor/process died before writing the terminal status
# (one zombie at max_concurrency=1 blocks the whole shared queue) and periodically
# reclaims dead/stale merge-leases. Liveness is three-tier: Tier-1 dead jobs.pid
# (os.kill(pid,0)) after REAPER_DEAD_TICKS consecutive dead ticks (anti-false-positive
# for a live agent); Tier-2 agent_runs.exit_code recorded but job still 'running'
# (only after a REAPER_FINALIZE_GRACE_S finalization grace, so a live monitor still
# doing git push / PR / Plane comments is never reaped); Tier-3 backstop after
# REAPER_MAX_RUNNING_S. The terminal flip carries an atomic status='running' guard and
# precedes any advance/enqueue (claim-before-act) so it never double-processes/-advances
# a row racing a late monitor or requeue_running_jobs.
#   REAPER_ENABLED          -> global kill-switch (false -> strictly prior behaviour).
#   REAPER_INTERVAL_S       -> background scan period (seconds).
#   REAPER_DEAD_TICKS       -> consecutive dead-pid ticks before reaping (Tier-1, >=2).
#   REAPER_MAX_RUNNING_S    -> Tier-3 backstop ceiling; must exceed max agent_timeout+grace.
#   REAPER_FINALIZE_GRACE_S -> Tier-2 grace: how long agent_runs.exit_code must have been
#                              recorded before a still-'running' job is reaped; MUST exceed
#                              the max finalization window (git push + PR + Plane comments).
#   LEASE_RECLAIM_ENABLED   -> kill-switch for the proactive stale/dead lease reclaim
#                              (false -> only the legacy lazy TTL reclaim in acquire_merge_lease).
# (reuse) ORCH_MERGE_LOCK_TIMEOUT_S -> lease TTL; ORCH_MERGE_GATE_REPOS -> reclaim scope.
ORCH_REAPER_ENABLED=true
ORCH_REAPER_INTERVAL_S=60
ORCH_REAPER_DEAD_TICKS=2
ORCH_REAPER_MAX_RUNNING_S=3600
ORCH_REAPER_FINALIZE_GRACE_S=300
ORCH_LEASE_RECLAIM_ENABLED=true

# ORCH-063: disk-watchdog — background heartbeat that measures HOST-FS fill via the
# mounted bind-paths (/repos, /app/data) with shutil.disk_usage (NOT the container
# overlay /) and Telegram-alerts the operator at >= threshold. On 07.06.2026 the
# mva154 host disk silently hit 100% and stalled the WHOLE self-hosting pipeline;
# this is the missing proactive signal. Daemon thread modelled on reconciler/reaper
# (start/stop in main.lifespan, /queue snapshot, never-raise). Anti-spam state is
# in-memory (no DB migration); the watchdog only READS fill and SENDS Telegram — it
# never touches the disk/container or restarts prod (self-hosting safety).
#   DISK_MONITOR_ENABLED       -> kill-switch; false -> the daemon does not start (1:1 as before).
#   DISK_MONITOR_INTERVAL_S    -> heartbeat measurement period, seconds (order of minutes).
#   DISK_MONITOR_THRESHOLD_PCT -> fill % that triggers the alert (Owner-fixed 85; valid 1..100).
#   DISK_MONITOR_REALERT_S     -> cooldown between repeat alerts while above threshold (~6h).
#   DISK_MONITOR_PATHS         -> CSV of monitored HOST bind-paths; empty -> /repos,/app/data.
ORCH_DISK_MONITOR_ENABLED=true
ORCH_DISK_MONITOR_INTERVAL_S=300
ORCH_DISK_MONITOR_THRESHOLD_PCT=85
ORCH_DISK_MONITOR_REALERT_S=21600
ORCH_DISK_MONITOR_PATHS=/repos,/app/data

# ORCH-062: build-cache-pruner — the "second half" of the disk-watchdog
# (watchdog SIGNALS, pruner CLEANS). A daemon thread modelled on disk_watchdog
# that periodically runs STRICTLY `docker builder prune -f --filter until=<until>`
# on the HOST over ssh (BuildKit GC). Touches ONLY the build cache: never
# images/containers of running services, never restarts the docker daemon or the
# prod container (self-hosting safety). State is in-memory (no DB migration). No
# ssh host configured -> the tick is a no-op. See docs/operations/INFRA.md.
#   BUILD_CACHE_PRUNE_ENABLED       -> kill-switch; false -> the daemon does not start (1:1 as before).
#   BUILD_CACHE_PRUNE_INTERVAL_S    -> tick period, seconds (order of hours; default ~6h). >0, else default.
#   BUILD_CACHE_PRUNE_UNTIL         -> retention age for the warm cache (`--filter until=`); ^\d+[smhdw]?$, else 24h.
#   BUILD_CACHE_PRUNE_ALL           -> add `-a` (ALWAYS paired with until); default false.
#   BUILD_CACHE_PRUNE_TIMEOUT_S     -> bound on the ssh command, seconds. >0, else default.
#   BUILD_CACHE_PRUNE_NOTIFY_MIN_GB -> Telegram when reclaimed >= N GB; 0 -> silent.
ORCH_BUILD_CACHE_PRUNE_ENABLED=true
ORCH_BUILD_CACHE_PRUNE_INTERVAL_S=21600
ORCH_BUILD_CACHE_PRUNE_UNTIL=24h
ORCH_BUILD_CACHE_PRUNE_ALL=false
ORCH_BUILD_CACHE_PRUNE_TIMEOUT_S=120
ORCH_BUILD_CACHE_PRUNE_NOTIFY_MIN_GB=0

# ORCH-022: security-gate (secret-scanning + dependency audit) on the
# deploy-staging -> deploy edge, run FIRST among the edge sub-gates. Deterministic
# (no LLM): gitleaks (offline secret-scan, pinned Go binary in the image) + pip-audit
# (OSV/PyPI CVE audit). Verdict in the versioned 17-security-report.md frontmatter;
# FAIL -> rollback to development + developer-retry (cap 3). See ADR-001.
#   GATE_ENABLED          -> global kill-switch; false -> pipeline 1:1 as before ORCH-022.
#   GATE_REPOS            -> CSV of repos where the gate is REAL; empty -> only self-hosting.
#   DEP_BLOCK_SEVERITY    -> CVE severity that BLOCKS (CRITICAL>HIGH>MEDIUM>LOW); below /
#                            UNKNOWN -> warning only (anti-loop).
#   SCAN_TIMEOUT_S        -> per external scanner call timeout.
#   DEP_AUDIT_FAIL_CLOSED -> strict mode: unreachable CVE feed -> FAIL instead of the
#                            default fail-open + warning (anti-loop). Default false.
#   SECRETS_BLOCK         -> a found secret blocks (true by default; the offline
#                            secrets guarantee is unconditional).
ORCH_SECURITY_GATE_ENABLED=true
ORCH_SECURITY_GATE_REPOS=
ORCH_SECURITY_DEP_BLOCK_SEVERITY=HIGH
ORCH_SECURITY_SCAN_TIMEOUT_S=300
ORCH_SECURITY_DEP_AUDIT_FAIL_CLOSED=false
ORCH_SECURITY_SECRETS_BLOCK=true

# ORCH-027: coverage-gate (deterministic test-coverage) on the deploy-staging ->
# deploy edge, run AFTER the merge-gate and BEFORE image-freshness. Measures line
# coverage of src/ with pytest-cov in the per-branch worktree, compares to an absolute
# floor and/or the ratchet baseline of `main`; FAIL -> rollback to development +
# developer-retry (cap 3). Verdict in the 18-coverage-report.md frontmatter
# (coverage_status:). See ADR-001-coverage-gate.md.
#   GATE_ENABLED       -> global kill-switch; false -> pipeline 1:1 as before ORCH-027.
#   GATE_REPOS         -> CSV of repos where the gate is REAL; empty -> only self-hosting.
#   MIN_PERCENT        -> absolute floor (% line coverage) for policy absolute/both.
#   POLICY             -> absolute | baseline | both (default both).
#   EPSILON            -> noise tolerance (%) at the boundary (anti-flap).
#   TOOL_FAIL_CLOSED   -> strict mode: a coverage-tool error -> FAIL instead of the
#                         default fail-open + warning (anti-loop). Default false.
#   RUN_TIMEOUT_S      -> wall-clock budget for the pytest --cov run.
ORCH_COVERAGE_GATE_ENABLED=true
ORCH_COVERAGE_GATE_REPOS=
ORCH_COVERAGE_MIN_PERCENT=0.0
ORCH_COVERAGE_POLICY=both
ORCH_COVERAGE_EPSILON=0.5
ORCH_COVERAGE_TOOL_FAIL_CLOSED=false
ORCH_COVERAGE_RUN_TIMEOUT_S=900

# ORCH-057 (follow-up ORCH-040): legacy root-owned ownership detect + actionable
# worktree error. After the uid migration (user: "1000:1000") legacy root:root files
# in /repos broke worktree creation under uid 1000 with a raw "Permission denied".
# Three additive, kill-switch-reversible layers: an actionable RuntimeError in
# ensure_worktree, a cheap never-raise detect leaf (src/fs_normalize.py) with a
# startup WARNING/Telegram + GET /queue fs_ownership block, and an opt-in chown ONLY
# when privileged (under uid 1000 a no-op; the real fix is the operator procedure in
# docs/operations/INFRA.md, section «Миграция uid» ("uid migration")). No
# STAGE_TRANSITIONS / QG_CHECKS / schema change.
#   ENABLED        -> kill-switch; false -> all code inert, behaviour 1:1 as before
#                     ORCH-057 (the actionable error too).
#   REPOS          -> CSV of repos the layer is REAL for; empty -> self-hosting only.
#   TARGET_UID     -> target uid fallback when os.getuid() is unavailable.
#   NORMALIZE_AUTO -> detect-only (false) | attempt chown when privileged (true).
#   SCAN_ROOTS     -> CSV override of the scan roots (empty -> default roots).
#   SCAN_CACHE_TTL_S -> TTL of the detect cache (mirrors ORCH_PREFLIGHT_CACHE_TTL).
ORCH_FS_NORMALIZE_ENABLED=true
ORCH_FS_NORMALIZE_REPOS=
ORCH_FS_TARGET_UID=1000
ORCH_FS_NORMALIZE_AUTO=false
ORCH_FS_SCAN_ROOTS=
ORCH_FS_SCAN_CACHE_TTL_S=300

# ORCH-099 (FND/F1a): operator off-switch for the read-only GET /metrics endpoint
# (raw-signal snapshot for the F1b sidecar). Default true -> available out of the
# box. false -> /metrics returns a minimal parsable body {"schema_version":1,
# "enabled":false} (200, not 404). The endpoint is inert / read-only anyway.
ORCH_METRICS_ENABLED=true

# ORCH-021: post-deploy production monitoring + degradation reaction. After the
# terminal deploy->done transition for an applicable repo, a reserved-agent job
# `post-deploy-monitor` (no LLM, modelled on deploy-finalizer) probes prod over a
# window and reacts to a degradation the restart-time health-check missed (class
# "green deploy, red prod", precedent ET-8). State is in sentinel files
# (.post-deploy-state-<repo>/<wi>/), no DB migration.
#   MONITOR_ENABLED  -> global kill-switch; false -> pipeline is 1:1 as before ORCH-021.
#   REPOS            -> CSV of repos where monitoring is REAL; empty -> only self-hosting.
#   WINDOW_S         -> observation window length (~15 min).
#   INTERVAL_S       -> seconds between probe ticks.
#   FAIL_THRESHOLD   -> N CONSECUTIVE health failures -> DEGRADED.
#   5XX_THRESHOLD    -> window 5xx ratio above this -> DEGRADED.
#   AUTO_ROLLBACK    -> allow auto-rollback; acts ONLY for non-self repos. Self-hosting
#                       is ALWAYS ALERT_ONLY (a tick NEVER restarts the prod container).
#   BASE_URL         -> base URL of the observed prod instance.
ORCH_POST_DEPLOY_MONITOR_ENABLED=true
ORCH_POST_DEPLOY_REPOS=
ORCH_POST_DEPLOY_WINDOW_S=900
ORCH_POST_DEPLOY_INTERVAL_S=30
ORCH_POST_DEPLOY_FAIL_THRESHOLD=3
ORCH_POST_DEPLOY_5XX_THRESHOLD=0.5
ORCH_POST_DEPLOY_AUTO_ROLLBACK=false
ORCH_POST_DEPLOY_BASE_URL=http://localhost:8500

# ── QG-0 entry validation (ORCH-069) ──────────────────────────────────────────
# Upper title-length limit for the QG-0 entry gate (_qg0_errors). The old 80-char
# cap was a hygiene limit, not structural (slug is cut to [:30] independently, the
# DB title TEXT is unbounded). Default 200. An invalid/empty value gracefully
# degrades to 200 (the process never crashes on startup).
ORCH_QG0_TITLE_MAX=200

# ── ORCH-100 (FND/F1b): sidecar-watchdog (orchestrator-watchdog container) ─────
# The monitoring brain runs in a SEPARATE container with its OWN config. These
# keys are read by the watchdog package (watchdog/config.py), NOT by the
# orchestrator. At runtime they live in `.env.watchdog` (env_file of the
# orchestrator-watchdog service); this block is the canon. NO real secrets here.
#   ENABLED        -> kill-switch; false (or not starting the service) -> inert.
#   INTERVAL_S     -> seconds between ticks.
#   HTTP_TIMEOUT_S -> per-request timeout (metrics / pings / docker / telegram).
#   COOLDOWN_S     -> re-alert throttle for a sustained signal (anti-spam).
#   METRICS_URL    -> orchestrator /metrics (host-network -> 127.0.0.1:8500).
#   ORCH_DOWN_TICKS-> K consecutive /metrics failures before "орк не отвечает"
#                     ("orchestrator is not responding").
#   MEM_PCT        -> host memory used-% threshold.
#   DISK_CRIT_*    -> OPT-IN independent disk CEILING (disk_watchdog/ORCH-063 owns
#                     the 85% alert; this is a higher ceiling on the sidecar's own
#                     channel, OFF by default -> no double disk-alert, AC-5/D6).
#   DISK_PATHS     -> host paths measured for the opt-in ceiling.
#   AGENT_HUNG_MIN -> runtime minutes before an agent with ~0 CPU is "hung".
#   AGENT_CPU_FLOOR-> CPU fraction below which a long-running agent counts as hung.
#   STAGE_STUCK_MIN-> minutes a task may sit in one stage before alerting.
#   QUEUE_DEPTH    -> queued-job depth threshold.
#   CONTAINERS     -> CSV of container names to watch (status != running/healthy).
#   DOCKER_SOCK    -> path to the read-only docker.sock inside the container.
#   DEPS           -> CSV of name=url dependency pings (empty -> no pings).
#   TG_BOT_TOKEN / TG_CHAT_ID -> the sidecar's OWN Telegram bot/chat (independent
#                     of the orchestrator's; absent -> logs, does not send).
WATCHDOG_ENABLED=true
WATCHDOG_INTERVAL_S=30
WATCHDOG_HTTP_TIMEOUT_S=5
WATCHDOG_COOLDOWN_S=1800
WATCHDOG_METRICS_URL=http://127.0.0.1:8500/metrics
WATCHDOG_ORCH_DOWN_TICKS=3
WATCHDOG_MEM_PCT=90
WATCHDOG_DISK_CRIT_ENABLED=false
WATCHDOG_DISK_CRIT_PCT=97
WATCHDOG_DISK_PATHS=/repos,/app/data
WATCHDOG_AGENT_HUNG_MIN=20
WATCHDOG_AGENT_CPU_FLOOR=0.01
WATCHDOG_STAGE_STUCK_MIN=120
WATCHDOG_QUEUE_DEPTH=20
WATCHDOG_CONTAINERS=orchestrator
WATCHDOG_DOCKER_SOCK=/var/run/docker.sock
WATCHDOG_DEPS=
WATCHDOG_TG_BOT_TOKEN=
WATCHDOG_TG_CHAT_ID=