# orchestrator/.env.example

# Core integration settings (Plane, Gitea, Claude CLI, local paths). Keys left empty
# below have no example value and are expected to be supplied per deployment.
# Base URL of the Plane API.
ORCH_PLANE_API_URL=http://plane-app-api-1:8000
# External (browser) web URL of Plane for clickable issue links in notifications
# (ORCH-017). Falls back to ORCH_PLANE_API_URL; a loopback fallback is treated as
# "no web URL" and the Plane link is omitted. Example: https://plane.example.org
ORCH_PLANE_WEB_URL=
# Plane API token, workspace slug and webhook secret.
ORCH_PLANE_API_TOKEN=
ORCH_PLANE_WORKSPACE_SLUG=
ORCH_PLANE_WEBHOOK_SECRET=
# Gitea base URL, API token and webhook secret.
ORCH_GITEA_URL=http://localhost:3000
ORCH_GITEA_TOKEN=
ORCH_GITEA_WEBHOOK_SECRET=
# Path to the Claude CLI binary.
ORCH_CLAUDE_BIN=/usr/bin/claude
# Repos directory — presumably where the working clones live; confirm against the code.
ORCH_REPOS_DIR=/home/slin/repos
# Path to the orchestrator database file.
ORCH_DB_PATH=/app/data/orchestrator.db
# ORCH-042: live-tracker mode. One card per task in both modes.
#   edit (DEFAULT) -> the task card is edited in place (editMessageText).
#   bump           -> on every update the old card is deleted and a fresh one is sent
#                     silently to the BOTTOM of the chat (deleteMessage + sendMessage +
#                     repoint).
# Any value other than "bump" is treated as edit.
ORCH_TRACKER_MODE=edit
# ORCH-043: merge-gate (auto-rebase onto current origin/main + re-test + merge-lock)
# on the deploy-staging -> deploy edge. Deterministic sub-gate (no LLM) that brings
# the branch up to date with the CURRENT origin/main, re-tests it, and serialises
# merges so two green parallel branches can't break main.
# Names below omit the ORCH_MERGE_ prefix; *_S values are in seconds.
#   GATE_ENABLED -> global kill-switch (false -> whole gate is a no-op pass).
#   GATE_REPOS   -> CSV of repos where the gate is REAL; empty -> only the self-hosting
#                   repo (orchestrator); other repos -> conditional no-op (mirrors ORCH-035).
#   RETEST_TIMEOUT_S -> wall-clock budget for the post-rebase re-test.
#   RETEST_TARGET    -> pytest target for the re-test.
#   LOCK_TIMEOUT_S   -> max merge-lease age before a stale lease is reclaimed.
#   DEFER_DELAY_S    -> delay before re-running the gate when the lock is busy.
#   DEFER_MAX_ATTEMPTS -> defer retries before escalation (avoids livelock).
ORCH_MERGE_GATE_ENABLED=true
ORCH_MERGE_GATE_REPOS=
ORCH_MERGE_RETEST_TIMEOUT_S=600
ORCH_MERGE_RETEST_TARGET=tests/
ORCH_MERGE_LOCK_TIMEOUT_S=300
ORCH_MERGE_DEFER_DELAY_S=60
ORCH_MERGE_DEFER_MAX_ATTEMPTS=5
# ORCH-036: executable self-deploy of the `deploy` stage. For the self-hosting repo
# (orchestrator) the stage REALLY restarts prod (8500) via a detached host hook;
# deploy_status: SUCCESS means proven health-ok, not an LLM declaration. Three
# deterministic phases (A: request approval, B: human Approved -> detached deploy,
# C: finalizer maps hook exit-code -> deploy_status). Non-self repos: unchanged
# synchronous ssh deploy. SECRETS / host paths live ONLY on the host — do NOT commit.
# NOTE(review): the user name and /home/... paths in the sample values below look
# host-specific; treat them as placeholders and confirm they are meant to be committed.
# Names below omit the ORCH_ prefix; *_S values are in seconds.
#   SELF_DEPLOY_ENABLED -> global kill-switch (false -> legacy synchronous deploy for all).
#   SELF_DEPLOY_REPOS   -> CSV of repos where Phase A/B/C is REAL; empty -> only the
#                          self-hosting repo (orchestrator); others -> no-op (mirrors ORCH-035).
#   DEPLOY_REQUIRE_MANUAL_APPROVE -> require a human Plane "Approved" before the prod
#                          deploy (true on rollout; full auto is ORCH-054).
#   DEPLOY_FINALIZE_DELAY_S       -> delay before the first/each finalize poll (>= hook+health).
#   DEPLOY_FINALIZE_MAX_ATTEMPTS  -> bounded finalize-defer budget (anti-livelock).
#   DEPLOY_SSH_USER / DEPLOY_SSH_HOST -> ssh target for the host hook (DEPLOY_SSH_HOST
#                          empty -> detached deploy will NOT launch; set on the host).
#   DEPLOY_HOOK_SCRIPT            -> path to the hook ON THE HOST (relative to the repo).
#   DEPLOY_HOST_REPO_PATH         -> orchestrator clone path on the host.
#   DEPLOY_PROD_SOURCE_IMAGE      -> staging-validated image, retagged build-once (no rebuild).
#   DEPLOY_PROD_TARGET_SERVICE / _PORT / _IMAGE -> prod target service, port and image.
#   DEPLOY_PROD_COMPOSE_PROFILE   -> prod compose profile (note: no TARGET_ in this name).
#   DEPLOY_PROD_PREV_IMAGE_FILE   -> prod prev-image snapshot (separate from staging's).
ORCH_SELF_DEPLOY_ENABLED=true
ORCH_SELF_DEPLOY_REPOS=
ORCH_DEPLOY_REQUIRE_MANUAL_APPROVE=true
ORCH_DEPLOY_FINALIZE_DELAY_S=90
ORCH_DEPLOY_FINALIZE_MAX_ATTEMPTS=10
ORCH_DEPLOY_SSH_USER=slin
ORCH_DEPLOY_SSH_HOST=
ORCH_DEPLOY_HOOK_SCRIPT=scripts/orchestrator-deploy-hook.sh
ORCH_DEPLOY_HOST_REPO_PATH=/home/slin/repos/orchestrator
ORCH_DEPLOY_PROD_SOURCE_IMAGE=orchestrator-orchestrator-staging
ORCH_DEPLOY_PROD_TARGET_SERVICE=orchestrator
ORCH_DEPLOY_PROD_TARGET_PORT=8500
ORCH_DEPLOY_PROD_TARGET_IMAGE=orchestrator-orchestrator
ORCH_DEPLOY_PROD_COMPOSE_PROFILE=
ORCH_DEPLOY_PROD_PREV_IMAGE_FILE=.deploy-prev-image-prod

# ORCH-058: staging-image provenance before the BUILD-ONCE prod retag (INV-FRESH).
# Guarantees the staging image promoted to prod is the EXACT artefact rebuilt from the
# validated commit — two layers, self-hosting only:
#   A (liveness): QG sub-check `check_staging_image_fresh` on the deploy-staging->deploy
#     edge rebuilds orchestrator-orchestrator-staging from the validated commit + recreates
#     8501; FAIL -> rollback to development. (Builds/recreates STAGING only, never prod.)
#   B (safety):  the Dockerfile stamps `org.opencontainers.image.revision`; the prod hook
#     fail-closes (exit 1) before `docker tag` if SOURCE_IMAGE's label != EXPECTED_REVISION.
# Names below omit the ORCH_IMAGE_FRESHNESS_ prefix.
#   ENABLED -> single kill-switch for A+B as a WHOLE (never "B without A"); false -> legacy.
#   REPOS   -> CSV of repos where the gate is REAL; empty -> only self-hosting (orchestrator).
ORCH_IMAGE_FRESHNESS_ENABLED=true
ORCH_IMAGE_FRESHNESS_REPOS=

# ORCH-061: staging-verdict tolerance to sandbox-infra-only FAILs. The self-hosting
# orchestrator looped on deploy-staging because staging_check.py exited 1 on ANY FAIL,
# so two infra-only checks (C9a sandbox branch / C9b analyst-job — caused by SANDBOX
# bot accounts not being members of the sandbox Plane project, NOT a pipeline
# regression) forced staging_status: FAILED -> rollback -> loop. With this ON, C9a/C9b
# are WAIVED to SUCCESS when every REAL check is green; any REAL failure still fails
# closed.
#   true (default) -> tolerant.
#   false          -> legacy strict (1:1 pre-ORCH-061, any FAIL rolls back).
# Lives in .env.staging (the staging instance). CLI --strict overrides this per-run.
ORCH_STAGING_INFRA_TOLERANCE_ENABLED=true

# ORCH-053: stuck-task reconciler (sweeper for lost webhooks). A background daemon
# replays a missed stage transition through the SAME gates/handlers a webhook would,
# fixing tasks that got stuck on a dropped event (502 on rebuild, no Plane/Gitea
# retries, unresolved sha->branch).
# Names below omit the ORCH_RECONCILE_ prefix.
#   ENABLED            -> global kill-switch (self-hosting safety / staged rollout).
#   PLANE_ENABLED      -> separate flag for the F-2 Plane-API poll (mute only F-2).
#   INTERVAL_S         -> background sweep period (seconds).
#   GRACE_DEFAULT_S    -> default "stuck" threshold on tasks.updated_at (seconds).
#   GRACE_OVERRIDES_JSON -> per-stage thresholds, e.g. {"development":300}; bad JSON -> default.
#   NOTIFY_UNBLOCK     -> send a Telegram message when a stuck task is unblocked.
#   SKIP_BLOCKED_ENABLED -> ORCH-060 F-1 Guard 2: skip reconciling issues a human moved
#                        to Blocked / Needs Input (per-candidate Plane state lookup).
#                        false mutes ONLY the networked Guard 2; Guard 1 (escalated by
#                        developer retries, local+deterministic) is always active.
ORCH_RECONCILE_ENABLED=true
ORCH_RECONCILE_PLANE_ENABLED=true
ORCH_RECONCILE_INTERVAL_S=120
ORCH_RECONCILE_GRACE_DEFAULT_S=600
ORCH_RECONCILE_GRACE_OVERRIDES_JSON=
ORCH_RECONCILE_NOTIFY_UNBLOCK=true
ORCH_RECONCILE_SKIP_BLOCKED_ENABLED=true

# ORCH-065: job-reaper + proactive merge-lease reclaim. A background daemon thread
# (src/job_reaper.py, started LAST in main.lifespan after requeue_running_jobs) reaps
# zombie 'running' jobs whose monitor/process died before writing the terminal status
# (one zombie at max_concurrency=1 blocks the whole shared queue) and periodically
# reclaims dead/stale merge-leases. Liveness is three-tier:
#   Tier-1: dead jobs.pid (os.kill(pid,0)) after REAPER_DEAD_TICKS consecutive dead
#           ticks (anti-false-positive for a live agent).
#   Tier-2: agent_runs.exit_code recorded but job still 'running' (only after a
#           REAPER_FINALIZE_GRACE_S finalization grace, so a live monitor still doing
#           git push / PR / Plane comments is never reaped).
#   Tier-3: backstop after REAPER_MAX_RUNNING_S.
# The terminal flip carries an atomic status='running' guard and precedes any
# advance/enqueue (claim-before-act) so it never double-processes/-advances a row
# racing a late monitor or requeue_running_jobs.
# Names below omit the ORCH_ prefix.
#   REAPER_ENABLED          -> global kill-switch (false -> strictly prior behaviour).
#   REAPER_INTERVAL_S       -> background scan period (seconds).
#   REAPER_DEAD_TICKS       -> consecutive dead-pid ticks before reaping (Tier-1, >=2).
#   REAPER_MAX_RUNNING_S    -> Tier-3 backstop ceiling; must exceed max agent_timeout+grace.
#   REAPER_FINALIZE_GRACE_S -> Tier-2 grace: how long agent_runs.exit_code must have been
#                              recorded before a still-'running' job is reaped; MUST exceed
#                              the max finalization window (git push + PR + Plane comments).
#   LEASE_RECLAIM_ENABLED   -> kill-switch for the proactive stale/dead lease reclaim
#                              (false -> only the legacy lazy TTL reclaim in acquire_merge_lease).
# Reused from the merge-gate settings: ORCH_MERGE_LOCK_TIMEOUT_S -> lease TTL;
# ORCH_MERGE_GATE_REPOS -> reclaim scope.
ORCH_REAPER_ENABLED=true
ORCH_REAPER_INTERVAL_S=60
ORCH_REAPER_DEAD_TICKS=2
ORCH_REAPER_MAX_RUNNING_S=3600
ORCH_REAPER_FINALIZE_GRACE_S=300
ORCH_LEASE_RECLAIM_ENABLED=true

# ORCH-022: security-gate (secret-scanning + dependency audit) on the
# deploy-staging -> deploy edge, run FIRST among the edge sub-gates. Deterministic
# (no LLM): gitleaks (offline secret-scan, pinned Go binary in the image) + pip-audit
# (OSV/PyPI CVE audit). Verdict in the versioned 17-security-report.md frontmatter;
# FAIL -> rollback to development + developer-retry (cap 3). See ADR-001.
# Names below omit the ORCH_SECURITY_ prefix.
#   GATE_ENABLED          -> global kill-switch; false -> pipeline 1:1 as before ORCH-022.
#   GATE_REPOS            -> CSV of repos where the gate is REAL; empty -> only self-hosting.
#   DEP_BLOCK_SEVERITY    -> CVE severity that BLOCKS (CRITICAL>HIGH>MEDIUM>LOW); below /
#                            UNKNOWN -> warning only (anti-loop).
#   SCAN_TIMEOUT_S        -> per external scanner call timeout (seconds).
#   DEP_AUDIT_FAIL_CLOSED -> strict mode: unreachable CVE feed -> FAIL instead of the
#                            default fail-open + warning (anti-loop). Default false.
#   SECRETS_BLOCK         -> a found secret blocks (true by default; the offline
#                            secrets guarantee is unconditional).
ORCH_SECURITY_GATE_ENABLED=true
ORCH_SECURITY_GATE_REPOS=
ORCH_SECURITY_DEP_BLOCK_SEVERITY=HIGH
ORCH_SECURITY_SCAN_TIMEOUT_S=300
ORCH_SECURITY_DEP_AUDIT_FAIL_CLOSED=false
ORCH_SECURITY_SECRETS_BLOCK=true

# ORCH-021: post-deploy production monitoring + degradation reaction. After the
# terminal deploy->done transition for an applicable repo, a reserved-agent job
# `post-deploy-monitor` (no LLM, modelled on deploy-finalizer) probes prod over a
# window and reacts to a degradation the restart-time health-check missed (class
# "green deploy, red prod", precedent ET-8). State is in sentinel files
# (.post-deploy-state-<repo>/<wi>/), no DB migration.
# Names below omit the ORCH_POST_DEPLOY_ prefix.
#   MONITOR_ENABLED  -> global kill-switch; false -> pipeline is 1:1 as before ORCH-021.
#   REPOS            -> CSV of repos where monitoring is REAL; empty -> only self-hosting.
#   WINDOW_S         -> observation window length in seconds (~15 min).
#   INTERVAL_S       -> seconds between probe ticks.
#   FAIL_THRESHOLD   -> N CONSECUTIVE health failures -> DEGRADED.
#   5XX_THRESHOLD    -> window 5xx ratio above this -> DEGRADED.
#   AUTO_ROLLBACK    -> allow auto-rollback; acts ONLY for non-self repos. Self-hosting
#                       is ALWAYS ALERT_ONLY (a tick NEVER restarts the prod container).
#   BASE_URL         -> base URL of the observed prod instance.
ORCH_POST_DEPLOY_MONITOR_ENABLED=true
ORCH_POST_DEPLOY_REPOS=
ORCH_POST_DEPLOY_WINDOW_S=900
ORCH_POST_DEPLOY_INTERVAL_S=30
ORCH_POST_DEPLOY_FAIL_THRESHOLD=3
ORCH_POST_DEPLOY_5XX_THRESHOLD=0.5
ORCH_POST_DEPLOY_AUTO_ROLLBACK=false
ORCH_POST_DEPLOY_BASE_URL=http://localhost:8500