ORCH_PLANE_API_URL=http://plane-app-api-1:8000
# External (browser) web URL of Plane for clickable issue links in notifications
# (ORCH-017). Falls back to ORCH_PLANE_API_URL; a loopback fallback is treated as
# "no web URL" and the Plane link is omitted. Example: https://plane.example.org
ORCH_PLANE_WEB_URL=
ORCH_PLANE_API_TOKEN=
ORCH_PLANE_WORKSPACE_SLUG=
ORCH_PLANE_WEBHOOK_SECRET=

ORCH_GITEA_URL=http://localhost:3000
ORCH_GITEA_TOKEN=
ORCH_GITEA_WEBHOOK_SECRET=

ORCH_CLAUDE_BIN=/usr/bin/claude
ORCH_REPOS_DIR=/home/slin/repos
ORCH_DB_PATH=/app/data/orchestrator.db

# ORCH-042: live-tracker mode. edit (DEFAULT) -> the task card is edited in place
# (editMessageText). bump -> on every update the old card is deleted and a fresh
# one is sent silently to the BOTTOM of the chat (deleteMessage + sendMessage +
# repoint). One card per task in both modes. Any value other than "bump" -> edit.
ORCH_TRACKER_MODE=edit

# ORCH-043: merge-gate (auto-rebase onto current origin/main + re-test + merge-lock)
# on the deploy-staging -> deploy edge. Deterministic sub-gate (no LLM) that catches
# the branch up to the CURRENT origin/main, re-tests it, and serialises merges so two
# green parallel branches can't break main.
#   ENABLED            -> global kill-switch (false -> whole gate is a no-op pass).
#   REPOS              -> CSV of repos where the gate is REAL; empty -> only the
#                         self-hosting repo (orchestrator); other repos ->
#                         conditional no-op (mirrors ORCH-35).
#   RETEST_TIMEOUT_S   -> wall-clock budget for the post-rebase re-test.
#   RETEST_TARGET      -> pytest target for the re-test.
#   LOCK_TIMEOUT_S     -> max merge-lease age before a stale lease is reclaimed.
#   DEFER_DELAY_S      -> delay before re-running the gate when the lock is busy.
#   DEFER_MAX_ATTEMPTS -> defer retries before escalation (avoids livelock).
ORCH_MERGE_GATE_ENABLED=true
ORCH_MERGE_GATE_REPOS=
ORCH_MERGE_RETEST_TIMEOUT_S=600
ORCH_MERGE_RETEST_TARGET=tests/
ORCH_MERGE_LOCK_TIMEOUT_S=300
ORCH_MERGE_DEFER_DELAY_S=60
ORCH_MERGE_DEFER_MAX_ATTEMPTS=5

# ORCH-036: executable self-deploy of the `deploy` stage. For the self-hosting repo
# (orchestrator) the stage REALLY restarts prod (8500) via a detached host hook;
# deploy_status: SUCCESS means proven health-ok, not an LLM declaration. Three
# deterministic phases (A: request approve, B: human Approved -> detached deploy,
# C: finalizer maps hook exit-code -> deploy_status). Non-self repos: unchanged
# synchronous ssh deploy. SECRETS / host paths live ONLY on the host — do NOT commit.
#   SELF_DEPLOY_ENABLED           -> global kill-switch (false -> legacy synchronous
#                                    deploy for all).
#   SELF_DEPLOY_REPOS             -> CSV of repos where Phase A/B/C is REAL; empty ->
#                                    only the self-hosting repo (orchestrator);
#                                    others -> no-op (mirrors ORCH-35).
#   DEPLOY_REQUIRE_MANUAL_APPROVE -> require a human Plane "Approved" before the prod
#                                    deploy (true on rollout; full auto is ORCH-54).
#   DEPLOY_FINALIZE_DELAY_S       -> delay before the first/each finalize poll
#                                    (>= hook+health).
#   DEPLOY_FINALIZE_MAX_ATTEMPTS  -> bounded finalize-defer budget (anti-livelock).
#   DEPLOY_SSH_USER / DEPLOY_SSH_HOST -> ssh target for the host hook (DEPLOY_SSH_HOST
#                                    empty -> detached deploy will NOT launch; set on
#                                    the host).
#   DEPLOY_HOOK_SCRIPT            -> path to the hook ON THE HOST (relative to the repo).
#   DEPLOY_HOST_REPO_PATH         -> orchestrator clone path on the host.
#   DEPLOY_PROD_SOURCE_IMAGE      -> staging-validated image, retagged build-once
#                                    (no rebuild).
#   DEPLOY_PROD_TARGET_SERVICE / _PORT / _IMAGE / _COMPOSE_PROFILE -> prod compose profile.
#   DEPLOY_PROD_PREV_IMAGE_FILE   -> prod prev-image snapshot (separate from staging's).
ORCH_SELF_DEPLOY_ENABLED=true
ORCH_SELF_DEPLOY_REPOS=
ORCH_DEPLOY_REQUIRE_MANUAL_APPROVE=true
ORCH_DEPLOY_FINALIZE_DELAY_S=90
ORCH_DEPLOY_FINALIZE_MAX_ATTEMPTS=10
ORCH_DEPLOY_SSH_USER=slin
ORCH_DEPLOY_SSH_HOST=
ORCH_DEPLOY_HOOK_SCRIPT=scripts/orchestrator-deploy-hook.sh
ORCH_DEPLOY_HOST_REPO_PATH=/home/slin/repos/orchestrator
ORCH_DEPLOY_PROD_SOURCE_IMAGE=orchestrator-orchestrator-staging
ORCH_DEPLOY_PROD_TARGET_SERVICE=orchestrator
ORCH_DEPLOY_PROD_TARGET_PORT=8500
ORCH_DEPLOY_PROD_TARGET_IMAGE=orchestrator-orchestrator
ORCH_DEPLOY_PROD_COMPOSE_PROFILE=
ORCH_DEPLOY_PROD_PREV_IMAGE_FILE=.deploy-prev-image-prod

# ORCH-058: staging-image provenance before the BUILD-ONCE prod retag (INV-FRESH).
# Guarantees the staging image promoted to prod is the EXACT artefact rebuilt from the
# validated commit — two layers, self-hosting only:
#   A (liveness): QG sub-check `check_staging_image_fresh` on the deploy-staging->deploy
#      edge rebuilds orchestrator-orchestrator-staging from the validated commit +
#      recreates 8501; FAIL -> rollback to development. (builds/recreate STAGING only,
#      never prod.)
#   B (safety): the Dockerfile stamps `org.opencontainers.image.revision`; the prod hook
#      fail-closes (exit 1) before `docker tag` if SOURCE_IMAGE's label != EXPECTED_REVISION.
#   ENABLED -> single kill-switch for A+B as a WHOLE (never "B without A"); false -> legacy.
#   REPOS   -> CSV of repos where the gate is REAL; empty -> only self-hosting (orchestrator).
ORCH_IMAGE_FRESHNESS_ENABLED=true
ORCH_IMAGE_FRESHNESS_REPOS=

# ORCH-061: staging-verdict tolerance to sandbox-infra-only FAILs. The self-hosting
# orchestrator looped on deploy-staging because staging_check.py exited 1 on ANY FAIL,
# so two infra-only checks (C9a sandbox branch / C9b analyst-job — caused by SANDBOX
# bot accounts not being members of the sandbox Plane project, NOT a pipeline regression)
# forced staging_status: FAILED -> rollback -> loop.
# With this ON, C9a/C9b are WAIVED to SUCCESS when every REAL check is green; any REAL
# failure still fails closed.
#   true (default) -> tolerant; false -> legacy strict (1:1 pre-ORCH-061, any FAIL rolls back).
# Lives in .env.staging (the staging instance). CLI --strict overrides this per-run.
ORCH_STAGING_INFRA_TOLERANCE_ENABLED=true

# ORCH-053: stuck-task reconciler (sweeper for lost webhooks). A background daemon
# replays a missed stage transition through the SAME gates/handlers a webhook would,
# fixing tasks that got stuck on a dropped event (502 on rebuild, no Plane/Gitea
# retries, unresolved sha->branch).
#   ENABLED              -> global kill-switch (self-hosting safety / staged rollout).
#   PLANE_ENABLED        -> separate flag for the F-2 Plane-API poll (mute only F-2).
#   INTERVAL_S           -> background sweep period (seconds).
#   GRACE_DEFAULT_S      -> default "stuck" threshold on tasks.updated_at (seconds).
#   GRACE_OVERRIDES_JSON -> per-stage thresholds, e.g. {"development":300}; bad JSON -> default.
#   NOTIFY_UNBLOCK       -> send a Telegram message when a stuck task is unblocked.
#   SKIP_BLOCKED_ENABLED -> ORCH-060 F-1 Guard 2: skip reconciling issues a human moved
#                           to Blocked / Needs Input (per-candidate Plane state lookup).
#                           false mutes ONLY the networked Guard 2; Guard 1 (escalated by
#                           developer retries, local+deterministic) is always active.
ORCH_RECONCILE_ENABLED=true
ORCH_RECONCILE_PLANE_ENABLED=true
ORCH_RECONCILE_INTERVAL_S=120
ORCH_RECONCILE_GRACE_DEFAULT_S=600
ORCH_RECONCILE_GRACE_OVERRIDES_JSON=
ORCH_RECONCILE_NOTIFY_UNBLOCK=true
ORCH_RECONCILE_SKIP_BLOCKED_ENABLED=true

# ORCH-065: job-reaper + proactive merge-lease reclaim. A background daemon thread
# (src/job_reaper.py, started LAST in main.lifespan after requeue_running_jobs) reaps
# zombie 'running' jobs whose monitor/process died before writing the terminal status
# (one zombie at max_concurrency=1 blocks the whole shared queue) and periodically
# reclaims dead/stale merge-leases.
# Liveness is three-tier: Tier-1 dead jobs.pid (os.kill(pid,0)) after REAPER_DEAD_TICKS
# consecutive dead ticks (anti-false-positive for a live agent); Tier-2
# agent_runs.exit_code recorded but job still 'running'; Tier-3 backstop after
# REAPER_MAX_RUNNING_S. The terminal flip carries an atomic status='running' guard so
# it never double-processes a row racing requeue_running_jobs.
#   REAPER_ENABLED        -> global kill-switch (false -> strictly prior behaviour).
#   REAPER_INTERVAL_S     -> background scan period (seconds).
#   REAPER_DEAD_TICKS     -> consecutive dead-pid ticks before reaping (Tier-1, >=2).
#   REAPER_MAX_RUNNING_S  -> Tier-3 backstop ceiling; must exceed max agent_timeout+grace.
#   LEASE_RECLAIM_ENABLED -> kill-switch for the proactive stale/dead lease reclaim
#                            (false -> only the legacy lazy TTL reclaim in acquire_merge_lease).
#   (reuse) ORCH_MERGE_LOCK_TIMEOUT_S -> lease TTL; ORCH_MERGE_GATE_REPOS -> reclaim scope.
ORCH_REAPER_ENABLED=true
ORCH_REAPER_INTERVAL_S=60
ORCH_REAPER_DEAD_TICKS=2
ORCH_REAPER_MAX_RUNNING_S=3600
ORCH_LEASE_RECLAIM_ENABLED=true

# ORCH-021: post-deploy production monitoring + degradation reaction. After the
# terminal deploy->done transition for an applicable repo, a reserved-agent job
# `post-deploy-monitor` (no LLM, modelled on deploy-finalizer) probes prod over a
# window and reacts to a degradation the restart-time health-check missed (class
# "green deploy, red prod", precedent ET-8). State is in sentinel files
# (.post-deploy-state-//), no DB migration.
#   MONITOR_ENABLED -> global kill-switch; false -> pipeline is 1:1 as before ORCH-021.
#   REPOS           -> CSV of repos where monitoring is REAL; empty -> only self-hosting.
#   WINDOW_S        -> observation window length (~15 min).
#   INTERVAL_S      -> seconds between probe ticks.
#   FAIL_THRESHOLD  -> N CONSECUTIVE health failures -> DEGRADED.
#   5XX_THRESHOLD   -> window 5xx ratio above this -> DEGRADED.
#   AUTO_ROLLBACK   -> allow auto-rollback; acts ONLY for non-self repos.
#                      Self-hosting is ALWAYS ALERT_ONLY (a tick NEVER restarts the
#                      prod container).
#   BASE_URL        -> base URL of the observed prod instance.
ORCH_POST_DEPLOY_MONITOR_ENABLED=true
ORCH_POST_DEPLOY_REPOS=
ORCH_POST_DEPLOY_WINDOW_S=900
ORCH_POST_DEPLOY_INTERVAL_S=30
ORCH_POST_DEPLOY_FAIL_THRESHOLD=3
ORCH_POST_DEPLOY_5XX_THRESHOLD=0.5
ORCH_POST_DEPLOY_AUTO_ROLLBACK=false
ORCH_POST_DEPLOY_BASE_URL=http://localhost:8500