"""FastAPI entry point of the multi-agent orchestrator.

Wires the webhook routers, runs the startup recovery steps and starts/stops
the background daemons through the ``lifespan`` context manager, and exposes
a handful of read-only observability endpoints plus a few operator actions.
"""
from fastapi import FastAPI
from contextlib import asynccontextmanager
import logging

from .db import init_db
from .webhooks.plane import router as plane_router
from .webhooks.gitea import router as gitea_router

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)

# Shared application logger (same logger name the startup code has always used).
_log = logging.getLogger("orchestrator")


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup recovery, start the background daemons, stop them on exit.

    Startup order: DB init -> orphan-run recovery (M-1) -> queue-recovery
    (ORCH-1) -> stale merge-lease reclaim (ORCH-065) -> run-log rotation (L-2)
    -> legacy-ownership detect (ORCH-057) -> worker -> reconciler -> reaper ->
    disk-watchdog -> build-cache-pruner. Shutdown stops the daemons in the
    reverse order.
    """
    init_db()

    # M-1: proper orphan-recovery.
    # An orphan = an agent_run with no finished_at that is older than the recovery
    # window. After a uvicorn restart the monitor thread is gone, so its child claude
    # process (if any) was reparented to init; we cannot kill it by pid (pid is not
    # persisted). Instead of silently writing exit=-1, we: enumerate each orphan,
    # mark it exit=-1, log a warning per run, and notify so a human can check/restart.
    log = _log
    from .db import get_db

    conn = get_db()
    try:
        orphan_rows = conn.execute(
            "SELECT id, task_id, agent FROM agent_runs "
            "WHERE finished_at IS NULL AND started_at < datetime('now', '-35 minutes')"
        ).fetchall()
        for row in orphan_rows:
            run_id, task_id, agent = row[0], row[1], row[2]
            conn.execute(
                "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=-1 WHERE id=?",
                (run_id,),
            )
            log.warning(
                f"Orphan run {run_id} (task {task_id}, agent {agent}) recovered — "
                f"manual check needed (process may have been killed on restart)"
            )
        conn.commit()
    finally:
        # Always release the connection, even when a statement above raises
        # (the exception itself still propagates, as before).
        conn.close()

    if orphan_rows:
        try:
            from .notifications import send_telegram

            ids = ", ".join(str(r[0]) for r in orphan_rows)
            send_telegram(
                f"\u26a0\ufe0f Orchestrator restart: {len(orphan_rows)} orphaned agent run(s) "
                f"(run_id: {ids}) marked exit=-1. Нужна ручная проверка/перезапуск."
            )
        except Exception as e:  # noqa: BLE001
            # Best-effort notification: never fatal, but no longer silent.
            log.warning("Orphan-recovery notification failed: %s", e)
        log.warning(f"Recovered {len(orphan_rows)} orphaned agent runs")

    # ORCH-1 (F-2b): queue-recovery. Any job left in 'running' status belongs to a
    # worker that died on the previous restart -> put it back to 'queued' so the
    # worker re-picks it up (restart-safe, no lost work). Runs AFTER M-1.
    from .db import requeue_running_jobs

    requeued = requeue_running_jobs()
    if requeued:
        log.warning(f"Queue-recovery: requeued {requeued} running job(s) after restart")

    # ORCH-065: proactive startup reclaim of dead/stale merge-leases, next to the
    # queue-recovery above. A lease held by the previous (now dead) process pid is
    # released at once instead of waiting for the TTL / a foreign acquire so the
    # next merge is not blocked. Conditional (merge_gate_repos / self-hosting) and
    # gated by ORCH_LEASE_RECLAIM_ENABLED; never raises.
    try:
        from .job_reaper import reclaim_all_stale_leases

        reclaimed = reclaim_all_stale_leases()
        if reclaimed:
            log.warning(f"Startup lease-reclaim: reclaimed {reclaimed} stale merge-lease(s)")
    except Exception as e:
        log.warning(f"Startup lease-reclaim skipped: {e}")

    # L-2: rotate old per-run logs at startup (best-effort; never fatal).
    try:
        import os as _os
        from .config import settings as _settings
        from .agents.launcher import prune_run_logs

        _runs_dir = _os.path.join(_os.path.dirname(_settings.db_path), "runs")
        _removed = prune_run_logs(
            _runs_dir,
            keep_days=_settings.log_keep_days,
            keep_max=_settings.log_keep_max,
        )
        if _removed:
            log.info(f"Log rotation: pruned {_removed} old run log(s) from {_runs_dir}")
    except Exception as e:
        log.warning(f"Log rotation skipped: {e}")

    # ORCH-057 (D3 / FR-3): best-effort legacy-ownership detect. Surfaces a
    # PROACTIVE operator signal (WARNING + Telegram) when /repos still holds
    # root-owned files after the uid migration, BEFORE a task fails on launch.
    # never-fatal (mirrors lease-reclaim / log-rotation above): a detect error must
    # not crash the start of the shared instance. The actual "clear, early" failure
    # is delivered by the actionable error in ensure_worktree (D1) — claim is NOT
    # blocked (ADR-001 D3). Honours ORCH_FS_NORMALIZE_ENABLED inside scan_ownership.
    try:
        from .fs_normalize import scan_ownership, healing_command, normalize
        from .config import settings as _fs_settings

        scan = scan_ownership()
        if scan.mismatch:
            log.warning(
                "FS-ownership mismatch: %d root(s) with files not owned by uid %s "
                "(%s; sample: %s). Heal: %s",
                len(scan.roots_mismatch),
                scan.target_uid,
                ", ".join(scan.roots_mismatch),
                scan.sample_path,
                healing_command(),
            )
            try:
                from .notifications import send_telegram

                send_telegram(
                    "⚠️ Orchestrator: обнаружены legacy root-owned файлы в "
                    f"{', '.join(scan.roots_mismatch)} (uid != {scan.target_uid}). "
                    f"Первый запуск задачи может упасть на создании worktree. "
                    f"Лечение: {healing_command()}"
                )
            except Exception as e:  # noqa: BLE001
                # Best-effort notification: never fatal, but no longer silent.
                log.warning("FS-ownership notification failed: %s", e)
            # D4 / FR-4: opt-in auto-chown ONLY when privileged (no-op under uid 1000).
            # NOTE(review): nesting under `if scan.mismatch:` was reconstructed from a
            # whitespace-collapsed source — confirm against the repository version.
            if getattr(_fs_settings, "fs_normalize_auto", False):
                try:
                    res = normalize()
                    log.warning("FS-ownership auto-normalize: %s", res.get("note"))
                except Exception as e:  # noqa: BLE001
                    log.warning("FS-ownership auto-normalize skipped: %s", e)
    except Exception as e:
        log.warning(f"FS-ownership detect skipped: {e}")

    # Start the background job-queue worker (ORCH-1).
    from .queue_worker import worker

    worker.start()

    # ORCH-053: start the stuck-task reconciler AFTER the worker so its active-job
    # guard sees a fully-initialised queue. Kill-switch: ORCH_RECONCILE_ENABLED.
    from .reconciler import reconciler

    reconciler.start()

    # ORCH-065: start the job-reaper LAST (after requeue_running_jobs + the worker
    # + the reconciler) so its atomic status='running' guard never races the
    # startup requeue. It reaps zombie jobs and periodically reclaims stale
    # merge-leases. Kill-switch: ORCH_REAPER_ENABLED.
    from .job_reaper import reaper

    reaper.start()

    # ORCH-063: start the disk-watchdog LAST (after the reaper). It is independent
    # of the queue/DB — it only reads host-FS fill and Telegram-alerts at >=
    # threshold — so the order is not critical, but we follow the daemon
    # convention. Honours the kill-switch ORCH_DISK_MONITOR_ENABLED (start() is a
    # no-op when disabled, so behaviour is 1:1 as before).
    from .disk_watchdog import disk_watchdog

    disk_watchdog.start()

    # ORCH-062: start the build-cache-pruner LAST, right after the disk-watchdog
    # (D7). It is the "second half" of the watchdog (watchdog signals, pruner
    # cleans): a daemon thread that periodically runs `docker builder prune` on
    # the host over ssh. Honours the kill-switch ORCH_BUILD_CACHE_PRUNE_ENABLED
    # (start() is a no-op when disabled, so behaviour is 1:1 as before).
    from .build_cache_pruner import build_cache_pruner

    build_cache_pruner.start()

    try:
        yield
    finally:
        # ORCH-062: stop the build-cache-pruner first (reverse of startup, D7).
        build_cache_pruner.stop()
        # ORCH-063: stop the disk-watchdog next (reverse of startup).
        disk_watchdog.stop()
        # Graceful shutdown order mirrors startup in reverse: stop the reaper
        # first, then the reconciler (it must not enqueue new work while the
        # worker is winding down), then the worker. Running agents keep going;
        # their jobs are requeued on next start via queue-recovery if the
        # process dies.
        reaper.stop()
        reconciler.stop()
        worker.stop()


app = FastAPI(title="Multi-Agent Orchestrator", lifespan=lifespan)
app.include_router(plane_router, prefix="/webhook")
app.include_router(gitea_router, prefix="/webhook")


@app.get("/health")
async def health():
    """Liveness probe: return a static ok payload."""
    return {"status": "ok", "service": "orchestrator"}


@app.get("/status")
async def status():
    """Return up to 10 most recently created tasks whose stage is not 'done'."""
    from .db import get_db

    conn = get_db()
    try:
        tasks = conn.execute(
            "SELECT * FROM tasks WHERE stage != 'done' ORDER BY created_at DESC LIMIT 10"
        ).fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()
    return {"active_tasks": [dict(t) for t in tasks]}


@app.get("/queue")
async def queue():
    """ORCH-1: job-queue observability — status counts + recent jobs."""
    from .db import job_status_counts, recent_jobs
    from .queue_worker import worker
    from .reconciler import reconciler
    from .job_reaper import reaper
    from . import post_deploy
    from . import merge_gate
    from . import task_deps
    from . import serial_gate
    from . import coverage_gate
    from . import fs_normalize
    from . import labels
    from . import cancel
    from . import bug_fast_track
    from .disk_watchdog import disk_watchdog
    from .build_cache_pruner import build_cache_pruner

    return {
        "counts": job_status_counts(),
        "max_concurrency": worker.max_concurrency,
        "poll_interval": worker.poll_interval,
        "resilience": worker.status(),
        "reconcile": reconciler.status(),
        "reaper": reaper.status(),
        "post_deploy": post_deploy.status(),
        "merge_verify": merge_gate.merge_verify_status(),
        # ORCH-026 (G-2): declarative task-dependency observability (read-only,
        # NOT a source of truth) — declared edges, blocked tasks, detected cycle.
        "task_deps": task_deps.snapshot(),
        # ORCH-088 (D9 / AC-10): per-repo serial-gate observability (read-only) —
        # active task, queued/waiting analyst-jobs, freeze state. Additive block.
        "serial_gate": serial_gate.snapshot(),
        # ORCH-027 (FR-7 / AC-9): coverage-gate observability (read-only) —
        # kill-switch, scope, policy/floor/epsilon, per-repo baselines. Additive block.
        "coverage": coverage_gate.snapshot(),
        # ORCH-057 (D6 / AC-4): legacy-ownership detect observability (read-only) —
        # kill-switch, scope, target_uid, mismatch + affected roots (TTL-cached scan).
        # Additive block; never-raise.
        "fs_ownership": fs_normalize.snapshot(),
        # ORCH-089 (D7): auto-mode-by-label observability (read-only) — kill-switch,
        # label names, scope. Additive block.
        "auto_labels": labels.snapshot(),
        # ORCH-090 (AC-10): STOP-cancellation observability (read-only) — kill-switch,
        # repo scope, cancelled/deferred counts, recent cancellations. Additive block;
        # never-raise.
        "stop": cancel.snapshot(),
        # ORCH-019 (FR-7 / AC-7): bug-fast-track observability (read-only) —
        # kill-switch, label, scope, bug-task counts + the structural savings metric
        # (architecture stages skipped). Additive block; never-raise.
        "bug_fast_track": bug_fast_track.snapshot(),
        # ORCH-063 (FR-6 / AC-7): disk-watchdog observability (read-only) —
        # enabled, threshold, interval, last measurement per host-path. Additive
        # block; never-raise (status() returns {"enabled": ...} minimum on error).
        "disk_monitor": disk_watchdog.status(),
        # ORCH-062 (FR-4 / AC-7): build-cache-pruner observability (read-only) —
        # enabled, interval, retention (until), last run + best-effort reclaimed /
        # last error. Additive block; never-raise (status() returns {"enabled":
        # ...} minimum on error).
        "build_cache_prune": build_cache_pruner.status(),
        "recent": recent_jobs(10),
    }


@app.get("/metrics")
async def metrics():
    """ORCH-099 (FND/F1a): lightweight read-only raw-signal snapshot for the F1b sidecar.

    A versioned JSON envelope (``schema_version`` / ``generated_at`` / ``clk_tck``)
    with four raw-signal sections — ``stages`` (active task stages + age),
    ``queue`` (counts / retries / breaker / concurrency), ``agents``
    (agent-liveness: pid / runtime / cpu_ticks), ``cost`` (per-run + aggregate
    tokens/cost). The orchestrator emits ONLY raw signal it alone knows; the
    stateful arbiter (thresholds / deltas / alerts) is the separate sidecar
    (BRD §1).

    Thin wrapper over ``metrics.build_metrics()`` (in the style of GET /queue):
    the collector is already strictly read-only and never-raise, so no extra
    error handling is needed here. Same access level as ``/queue`` / ``/status``.
    The format is the documented contract for the sidecar
    (docs/architecture/README.md).
    """
    from . import metrics as metrics_mod

    return metrics_mod.build_metrics()


@app.post("/serial-gate/unfreeze")
async def serial_gate_unfreeze(repo: str = ""):
    """ORCH-088 (FR-5, ADR-001 D4): manually clear a per-repo rollback-freeze.

    A freeze set by the post-deploy monitor (DEGRADED) keeps the serial gate
    CLOSED for the repo until an operator explicitly clears it here. Idempotent:
    clearing an already-clear repo reports ``cleared: 0``. The next queued
    analyst-job is then claimable on the next scheduler tick (no restart needed).

    Alternative manual path (documented in README):
    ``UPDATE repo_freeze SET cleared_at=datetime('now') WHERE repo=? AND
    cleared_at IS NULL``.

    Returns ``{"ok": False, "error": "missing 'repo'", ...}`` when ``repo`` is
    empty or blank; otherwise ``{"ok": True, "repo", "cleared", "frozen"}``.
    """
    from . import serial_gate

    if not repo or not repo.strip():
        return {"ok": False, "error": "missing 'repo'", "repo": repo, "cleared": 0}
    repo = repo.strip()
    cleared = serial_gate.clear_repo_freeze(repo)
    frozen = serial_gate.is_repo_frozen(repo)
    if cleared:
        try:
            from .notifications import send_telegram

            send_telegram(
                f"🔥 {repo}: пакет РАЗМОРОЖЕН вручную ({cleared} запис(ь/и) снято). "
                f"Следующая задача репо стартует на ближайшем цикле."
            )
        except Exception as e:  # noqa: BLE001
            # Best-effort notification: never fatal, but no longer silent.
            _log.warning("Serial-gate unfreeze notification failed: %s", e)
    return {"ok": True, "repo": repo, "cleared": cleared, "frozen": frozen}


@app.post("/fs-normalize/check")
async def fs_normalize_check(normalize: bool = False):
    """ORCH-057 (D6 / AC-4): force a fresh legacy-ownership detect (bypass the
    TTL cache) and return the snapshot.

    Modelled on ``POST /serial-gate/unfreeze``. ``normalize=true`` additionally
    attempts an opt-in ``chown`` — a no-op under uid 1000 (the prod-self case),
    effective only when the process is privileged (D4). The real fix remains the
    operator procedure (docs/operations/INFRA.md, section «Миграция uid» — "uid
    migration"). Read-only/never-raise otherwise.
    """
    from . import fs_normalize as _fs

    scan = _fs.scan_ownership(force=True)
    out = {"ok": True, "scan": scan.to_dict(), "healing": _fs.healing_command()}
    if normalize:
        out["normalize"] = _fs.normalize()
        # Re-scan so the returned snapshot reflects any change a privileged run made.
        out["scan"] = _fs.scan_ownership(force=True).to_dict()
    return out


@app.post("/coverage/baseline")
async def coverage_set_baseline(repo: str = "", value: float | None = None):
    """ORCH-027 (D8): manually set/override the per-repo coverage baseline.

    For a legitimate one-off coverage drop (e.g. removing a large tested module)
    the operator sets the baseline directly here (modelled on
    ``POST /serial-gate/unfreeze``) instead of waiting for the upward-only
    ratchet. Unlike the ratchet this CAN lower the baseline. Alternative without
    this endpoint: temporarily flip ``ORCH_COVERAGE_POLICY=absolute``.

    Returns an ``{"ok": False, "error": ...}`` payload when ``repo`` is
    empty/blank or ``value`` is missing; otherwise the result of the update and
    the baseline read back from the DB.
    """
    from . import db

    if not repo or not repo.strip():
        return {"ok": False, "error": "missing 'repo'", "repo": repo}
    if value is None:
        return {"ok": False, "error": "missing 'value'", "repo": repo}
    repo = repo.strip()
    ok = db.set_coverage_baseline(repo, value, sha="manual-override")
    return {"ok": ok, "repo": repo, "baseline": db.get_coverage_baseline(repo)}


@app.post("/bug-fast-track/escalate")
async def bug_fast_track_escalate(work_item: str = ""):
    """ORCH-019 (FR-5 / AC-5, ADR-001 D5): escalate a bug-fast-track task to the
    full cycle (return it to the route WITH `architecture`).

    Operator path for a bug that turned out to be complex / architectural /
    visual (needs an ADR or a mock): reset ``tasks.track`` 'bug' -> 'full'.
    Apply while the task is still in `analysis` (before its exit) — the next
    advance_stage then routes analysis -> architecture normally. Modelled on
    ``POST /serial-gate/unfreeze`` / ``POST /coverage/baseline``. The Telegram
    and Plane notifications are best-effort: a failure is logged, not raised.
    """
    from . import db

    if not work_item or not work_item.strip():
        return {"ok": False, "error": "missing 'work_item'", "work_item": work_item}
    work_item = work_item.strip()
    task = db.get_task_by_work_item_id(work_item)
    if not task:
        return {"ok": False, "error": "unknown work_item", "work_item": work_item}
    prev_track = task.get("track") or "full"
    db.set_task_track(task["id"], "full")
    # Notify only on a real bug -> full transition (not when already 'full').
    if prev_track == "bug":
        try:
            from .notifications import send_telegram

            send_telegram(
                f"🐞➡️ {work_item}: эскалация в ПОЛНЫЙ цикл "
                f"(багфикс-трек снят, стадия architecture восстановлена)."
            )
        except Exception as e:  # noqa: BLE001
            # Best-effort notification: never fatal, but no longer silent.
            _log.warning("Bug-fast-track escalate Telegram notification failed: %s", e)
        try:
            from .plane_sync import add_comment

            add_comment(
                work_item,
                "🐞➡️ Эскалация: задача возвращена в полный цикл "
                "(багфикс-трек снят, стадия architecture восстановлена).",
                author="analyst",
            )
        except Exception as e:  # noqa: BLE001
            # Best-effort Plane comment: never fatal, but no longer silent.
            _log.warning("Bug-fast-track escalate Plane comment failed: %s", e)
    return {"ok": True, "work_item": work_item, "track": "full", "was": prev_track}