From 212352997e192769442feeee9427c4bea7391e91 Mon Sep 17 00:00:00 2001
From: Dev Agent
Date: Tue, 2 Jun 2026 20:12:29 +0300
Subject: [PATCH] fix(main): proper orphan recovery with per-run warning + notify (M-1)

---
 src/main.py | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/main.py b/src/main.py
index 7d84883..f1952e8 100644
--- a/src/main.py
+++ b/src/main.py
@@ -15,17 +15,42 @@ logging.basicConfig(
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     init_db()
-    # Recover orphaned runs
+    # M-1: proper orphan-recovery.
+    # An orphan = an agent_run with no finished_at that is older than the recovery
+    # window. After a uvicorn restart the monitor thread is gone, so its child claude
+    # process (if any) was reparented to init; we cannot kill it by pid (pid is not
+    # persisted). Instead of silently writing exit=-1, we: enumerate each orphan,
+    # mark it exit=-1, log a warning per run, and notify so a human can check/restart.
+    log = logging.getLogger('orchestrator')
     from .db import get_db
     conn = get_db()
-    orphans = conn.execute(
-        "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=-1 "
+    orphan_rows = conn.execute(
+        "SELECT id, task_id, agent FROM agent_runs "
         "WHERE finished_at IS NULL AND started_at < datetime('now', '-35 minutes')"
-    ).rowcount
+    ).fetchall()
+    for row in orphan_rows:
+        run_id, task_id, agent = row[0], row[1], row[2]
+        conn.execute(
+            "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=-1 WHERE id=?",
+            (run_id,),
+        )
+        log.warning(
+            f"Orphan run {run_id} (task {task_id}, agent {agent}) recovered — "
+            f"manual check needed (process may have been killed on restart)"
+        )
     conn.commit()
     conn.close()
-    if orphans:
-        logging.getLogger('orchestrator').warning(f'Recovered {orphans} orphaned agent runs')
+    if orphan_rows:
+        try:
+            from .notifications import send_telegram
+            ids = ", ".join(str(r[0]) for r in orphan_rows)
+            send_telegram(
+                f"\u26a0\ufe0f Orchestrator restart: {len(orphan_rows)} orphaned agent run(s) "
+                f"(run_id: {ids}) marked exit=-1. Нужна ручная проверка/перезапуск."
+            )
+        except Exception:
+            pass
+        log.warning(f"Recovered {len(orphan_rows)} orphaned agent runs")
     yield