fix(main): proper orphan recovery with per-run warning + notify (M-1)
This commit is contained in:
37
src/main.py
37
src/main.py
@@ -15,17 +15,42 @@ logging.basicConfig(
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
init_db()
|
||||
# Recover orphaned runs
|
||||
# M-1: proper orphan-recovery.
|
||||
# An orphan = an agent_run with no finished_at that is older than the recovery
|
||||
# window. After a uvicorn restart the monitor thread is gone, so its child claude
|
||||
# process (if any) was reparented to init; we cannot kill it by pid (pid is not
|
||||
# persisted). Instead of silently writing exit=-1, we: enumerate each orphan,
|
||||
# mark it exit=-1, log a warning per run, and notify so a human can check/restart.
|
||||
log = logging.getLogger('orchestrator')
|
||||
from .db import get_db
|
||||
conn = get_db()
|
||||
orphans = conn.execute(
|
||||
"UPDATE agent_runs SET finished_at=datetime('now'), exit_code=-1 "
|
||||
orphan_rows = conn.execute(
|
||||
"SELECT id, task_id, agent FROM agent_runs "
|
||||
"WHERE finished_at IS NULL AND started_at < datetime('now', '-35 minutes')"
|
||||
).rowcount
|
||||
).fetchall()
|
||||
for row in orphan_rows:
|
||||
run_id, task_id, agent = row[0], row[1], row[2]
|
||||
conn.execute(
|
||||
"UPDATE agent_runs SET finished_at=datetime('now'), exit_code=-1 WHERE id=?",
|
||||
(run_id,),
|
||||
)
|
||||
log.warning(
|
||||
f"Orphan run {run_id} (task {task_id}, agent {agent}) recovered — "
|
||||
f"manual check needed (process may have been killed on restart)"
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
if orphans:
|
||||
logging.getLogger('orchestrator').warning(f'Recovered {orphans} orphaned agent runs')
|
||||
if orphan_rows:
|
||||
try:
|
||||
from .notifications import send_telegram
|
||||
ids = ", ".join(str(r[0]) for r in orphan_rows)
|
||||
send_telegram(
|
||||
f"\u26a0\ufe0f Orchestrator restart: {len(orphan_rows)} orphaned agent run(s) "
|
||||
f"(run_id: {ids}) marked exit=-1. Нужна ручная проверка/перезапуск."
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
log.warning(f"Recovered {len(orphan_rows)} orphaned agent runs")
|
||||
yield
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user