Files
orchestrator/src/agents/launcher.py
claude-bot 81fc2df8a8 fix(launcher): runs log dir from settings, not hardcoded /app (CI fix)
test_spawn_stamps_resolved_effort упал в CI с PermissionError на '/app':
launcher._spawn хардкодил output_path='/app/data/runs/{run_id}.log' и
os.makedirs('/app/data/runs'). В контейнере /app есть, на CI-хосте
(act_runner hostexecutor) — нет, makedirs бросает -> красный CI.

Фикс корня (не только теста): базовый каталог per-run логов вынесен в
Settings.runs_dir (env ORCH_RUNS_DIR, дефолт '/app/data/runs' = прод 1:1).
Новый хелпер _run_log_path(run_id) — единый источник пути, использован в
_spawn + три прежних inline-строки логов/алертов. Тест monkeypatch-ит
settings.runs_dir на tmp_path -> окружение-независим (проверено прогоном
с принудительно недоступным /app). pytest tests/ -q: 1090 passed.

STAGE_TRANSITIONS/QG_CHECKS/схема БД не тронуты. Docs: README env-таблица,
CHANGELOG.

Refs: ORCH-087
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 10:06:17 +03:00

1146 lines
51 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import subprocess
import os
import json
import logging
import re
import threading
import signal
import time
from ..config import settings
from ..db import get_db, get_task_by_repo_branch, update_task_stage, enqueue_job
from ..stages import get_next_stage, get_qg_for_stage, get_agent_for_stage
from ..git_worktree import ensure_worktree, get_worktree_path
from ..qg.checks import QG_CHECKS
from ..notifications import notify_stage_change, notify_qg_failure, notify_agent_started, notify_agent_finished, notify_approve_requested
from ..plane_sync import notify_stage_change as plane_notify_stage, add_comment as plane_add_comment
logger = logging.getLogger("orchestrator.launcher")
# ORCH-41: the --effort values accepted by the Claude CLI. An effort outside
# this set is treated as misconfiguration: resolve_agent_effort logs a warning
# and returns "" so no --effort flag is built; the bad value is never passed
# through to the CLI.
VALID_EFFORTS = frozenset({"low", "medium", "high", "xhigh", "max"})
# ORCH-074 (G2): structural validity check for a Claude CLI model name. We use a
# FORMAT check (^claude-…$), not a static allowlist, on purpose: an allowlist
# recreates the exact rot we kill in G1 — it silently drops a CORRECT newer model
# (e.g. claude-opus-4-9) the day Anthropic ships it (never-break working against
# the operator). The final authority on whether a model exists is the Claude CLI
# itself, not our code; a format check is forward-compatible (new versions pass
# without code edits) while still catching the real failure classes: another
# provider (gpt-4), empty/whitespace, garbage chars, wrong prefix (claud-opus-typo).
# The claude- prefix is hardcoded here because the orchestrator is bound to the
# Claude CLI (CLAUDE_BIN); the canonical model VERSION lives ONLY in
# settings.agent_model_default, never here. See ADR-001 (ORCH-074).
_MODEL_NAME_RE = re.compile(r"^claude-[a-z0-9.-]+$")


def is_valid_model(name: str) -> bool:
    """ORCH-074 (G2): True iff ``name`` is a structurally valid Claude model name.

    A valid name, after ``strip()``, is non-empty, starts with ``claude-`` and
    contains only lowercase letters, digits, dots and dashes. Anything else
    (empty/whitespace, another provider like ``gpt-4``, a wrong prefix, illegal
    characters) is invalid. This is the single predicate used by BOTH
    ``resolve_agent_model`` and the inline ``--fallback-model`` read in ``_spawn``
    so a typo can never reach the CLI (never-break). It is a structural guard, not
    a registry of existing models — a structurally valid typo (``claude-opus-typo``)
    is left for the CLI to reject.

    Args:
        name: Candidate model name. Annotated ``str``, but any object is
            accepted: a non-string (``None``, a number, bytes, a list coming
            from a malformed config) is simply reported as invalid.

    Returns:
        ``True`` when the stripped name matches ``^claude-[a-z0-9.-]+$``,
        ``False`` otherwise.

    Never raises.
    """
    # A non-str value has no usable ``strip()`` (or, for bytes, cannot be matched
    # against a str pattern); rejecting it up front is what keeps the
    # "never raises" promise true for garbage coming from config files.
    if not isinstance(name, str):
        return False
    # An empty / whitespace-only name strips to "" and fails the ``+`` quantifier.
    return bool(_MODEL_NAME_RE.match(name.strip()))
# ORCH-061: action stages whose success is an ACTION (restart/retag), not a src
# edit — so "no changes to commit" is EXPECTED there, not under-delivery (FR-3).
_ACTION_STAGES = frozenset({"deploy-staging", "deploy"})
def action_stage_no_changes_note(stage, repo) -> str | None:
"""ORCH-061 (FR-3 / FR-7): observability for an empty diff on an action stage.
The ``deploy-staging`` / ``deploy`` stages are actions (restart / retag), not
code edits, so the post-run "no changes to commit" is the NORMAL case there —
advancement is decided by the agent exit-code + the staging/deploy gate verdict,
NEVER by the presence of a commit (FR-3 / AC-4). This is a PURE decision used
only to emit an explicit log line distinguishing an expected action-stage no-op
from a code-stage no-op; it has no effect on stage advancement.
Returns an explicit note string when the empty diff is expected (an action
stage of a self-deploy repo), else ``None``. Never raises.
"""
try:
if stage in _ACTION_STAGES:
from ..self_deploy import self_deploy_applies
if self_deploy_applies(repo):
return f"{stage}: no code changes (expected on action stage)"
return None
except Exception: # noqa: BLE001 - observability only, never raise
return None
def _resolve_agent_attr(agent, project_id, project_map_attr, env_attr_prefix,
                        default_attr):
    """ORCH-41: resolve one per-agent setting through the shared priority chain.

    The first non-empty value wins, in this order:
      1. ``ProjectConfig.<project_map_attr>[agent]``   (per-project override)
      2. ``settings.<env_attr_prefix><agent>``          (per-agent env value)
      3. ``settings.<default_attr>``                    (global default)
      4. ``""``                                         (no flag -> CLI default)

    Args:
        agent: Agent role name, used as the map key and the settings suffix.
        project_id: Plane project uuid. It is looked up through
            ``get_project_by_plane_id``; a falsy id or an unknown project
            skips level 1.
        project_map_attr: Name of the per-project mapping attribute.
        env_attr_prefix: Prefix of the per-agent attribute on ``settings``.
        default_attr: Name of the global-default attribute on ``settings``.

    Returns:
        The first truthy value found, or ``""`` when every level is empty. A
        settings attribute that does not exist counts as empty.
    """
    # Level 1: per-project override.
    if project_id:
        from ..projects import get_project_by_plane_id

        project = get_project_by_plane_id(project_id)
        if project is not None:
            project_value = getattr(project, project_map_attr, {}).get(agent)
            if project_value:
                return project_value

    # Levels 2 and 3: per-agent env value first, then the global default.
    for attr_name in (f"{env_attr_prefix}{agent}", default_attr):
        configured = getattr(settings, attr_name, "")
        if configured:
            return configured

    # Level 4: nothing configured -> caller omits the flag.
    return ""
def _agent_model_candidates(agent: str, project_id: str = None):
    """Lazily yield the non-empty model candidates in ORCH-41 priority order.

    The order is project override, then ``settings.agent_model_<agent>``, then
    ``settings.agent_model_default``. It is a generator (rather than a
    "first non-empty wins" lookup) so that ``resolve_agent_model`` can validate
    each level and move on to the next one when a level holds an invalid name
    (ORCH-074 G2). Levels that are empty or missing are not yielded at all.

    Args:
        agent: Agent role name.
        project_id: Plane project uuid; falsy or unknown skips the project level.

    Yields:
        Each truthy candidate value, highest priority first.
    """
    if project_id:
        from ..projects import get_project_by_plane_id

        project = get_project_by_plane_id(project_id)
        if project is not None:
            project_model = getattr(project, "agent_models", {}).get(agent)
            if project_model:
                yield project_model

    # Per-agent env value, then the global default; each is read only when the
    # consumer asks for the next candidate.
    for attr_name in (f"agent_model_{agent}", "agent_model_default"):
        configured = getattr(settings, attr_name, "")
        if configured:
            yield configured
def resolve_agent_model(agent: str, project_id: str = None) -> str:
    """ORCH-41: resolve the LLM model for an agent (optionally per-project).

    ORCH-074 (G2): every candidate is validated with ``is_valid_model`` BEFORE
    it is returned. An invalid (structurally garbage) value at any level is
    logged and SKIPPED — resolution falls through to the next valid level
    (project-override invalid -> per-agent env -> default).

    Args:
        agent: Agent role name.
        project_id: Plane project uuid used for the per-project override;
            ``None`` / unknown means env and default levels only.

    Returns:
        The first valid model name with surrounding whitespace removed, or
        ``""`` when no level yields a valid name (the caller then omits
        ``--model`` and the CLI default applies).

    The ORCH-41 priority order and signature are unchanged; validation is
    layered on top. Never returns garbage that could reach ``--model``.
    """
    for value in _agent_model_candidates(agent, project_id):
        if is_valid_model(value):
            # is_valid_model judges the STRIPPED name, so return that same
            # stripped form: a value such as "claude-x\n" is accepted by the
            # validator, and handing back the raw string would put a newline
            # into the shell command that _spawn builds around --model.
            return value.strip()
        logger.warning(
            f"Invalid model name '{value}' for agent '{agent}' "
            f"(expected '^claude-…'); skipping to next resolution level / CLI default"
        )
    return ""
def _agent_effort_floor(agent: str) -> str:
    """ORCH-081 (ORCH-52h): per-role fallback ("floor") for --effort resolution.

    Reads the DECLARED default of the ``agent_effort_<agent>`` field from the
    settings class' ``model_fields`` rather than the live settings value. The
    declared default is fixed in the class body, so it stays available even
    when a present-but-empty env var (``ORCH_AGENT_EFFORT_<ROLE>=``) has
    clobbered the runtime value to ``""``. config.py therefore remains the
    single source of truth: changing a role's default there moves this floor
    with it, with no second map to keep in sync (ADR-001).

    Args:
        agent: Agent role name.

    Returns:
        The truthy declared default of ``agent_effort_<agent>`` if that field
        exists; otherwise the truthy declared default of
        ``agent_effort_default``; otherwise ``""``.
    """
    declared = type(settings).model_fields
    candidates = (
        declared.get(f"agent_effort_{agent}"),
        declared.get("agent_effort_default"),
    )
    for info in candidates:
        # Skip a field that is not declared, or whose declared default is empty.
        if info is None or not info.default:
            continue
        return info.default
    return ""
def resolve_agent_effort(agent: str, project_id: str = None) -> str:
    """ORCH-41: resolve the --effort level for an agent (optionally per-project).

    Resolution order (ORCH-081 / ADR-001 adds the last level):
      1. project override  (``agent_efforts[agent]`` on the project config)
      2. per-agent env     (``settings.agent_effort_<agent>``)
      3. global default    (``settings.agent_effort_default``)
      4. per-role floor    (declared class default, via ``_agent_effort_floor``)

    The floor is consulted only when levels 1-3 are all empty — the production
    failure mode where empty ``ORCH_AGENT_EFFORT_*=`` variables blank out every
    default. Because it is applied before validation and only to an empty
    result, it cannot hide a typo: an explicit bad value such as ``turbo`` is
    non-empty, bypasses the floor, and is rejected below.

    Args:
        agent: Agent role name.
        project_id: Plane project uuid for the per-project override.

    Returns:
        A member of ``VALID_EFFORTS``, or an empty value when nothing is
        configured or the configured value is invalid (a warning is logged in
        the invalid case and no ``--effort`` flag should be emitted).
    """
    configured = _resolve_agent_attr(
        agent,
        project_id,
        project_map_attr="agent_efforts",
        env_attr_prefix="agent_effort_",
        default_attr="agent_effort_default",
    )
    # Empty levels 1-3 -> fall back to the per-role floor. A non-empty typo
    # never takes this path, so it still gets validated (and dropped) below.
    effort = configured or _agent_effort_floor(agent)

    if not effort or effort in VALID_EFFORTS:
        return effort

    logger.warning(
        f"Invalid effort '{effort}' for agent '{agent}' "
        f"(allowed: {sorted(VALID_EFFORTS)}); omitting --effort"
    )
    return ""
def _run_log_path(run_id):
    """Build the path of one run's agent log: ``<settings.runs_dir>/<run_id>.log``.

    ORCH-087: the one place that knows where per-run logs live. Deriving the
    path from ``settings.runs_dir`` (instead of a hardcoded ``/app/data/runs``)
    lets ``_spawn`` write logs on hosts where ``/app`` is not accessible, such
    as CI runners outside the container.

    Args:
        run_id: Identifier of the ``agent_runs`` row; used as the file stem.

    Returns:
        ``settings.runs_dir`` joined with ``"<run_id>.log"``.
    """
    log_name = f"{run_id}.log"
    return os.path.join(settings.runs_dir, log_name)
def prune_run_logs(runs_dir, keep_days=30, keep_max=500, active_paths=None):
    """L-2: best-effort rotation of the per-run logs in ``<runs_dir>/*.log``.

    Only regular ``*.log`` files directly inside ``runs_dir`` are candidates;
    other files and subdirectories are never touched. Candidates are ranked
    newest-first by modification time and a candidate is deleted when it is
    older than ``keep_days`` days OR its rank falls outside the ``keep_max``
    newest candidates. Paths listed in ``active_paths`` are excluded before
    ranking, so they are always kept and do not count towards ``keep_max``.

    Args:
        runs_dir: Directory that holds the run logs.
        keep_days: Maximum age, in days, of a log that is kept.
        keep_max: Maximum number of (non-active) logs that are kept.
        active_paths: Optional iterable of log paths that must never be
            removed (compared by ``os.path.realpath``).

    Returns:
        The number of files actually removed; ``0`` when ``runs_dir`` is not a
        directory.

    Never raises: every error is logged as a warning and swallowed, so log
    rotation cannot take the application down.
    """
    removed = 0
    try:
        # Resolve protected paths once; fall back to the raw value if a path
        # cannot be resolved.
        protected = set()
        for active_path in (active_paths or []):
            try:
                protected.add(os.path.realpath(active_path))
            except Exception:
                protected.add(active_path)

        if not os.path.isdir(runs_dir):
            return 0

        # Collect (path, mtime) for every eligible log file.
        candidates = []
        for entry in os.listdir(runs_dir):
            if not entry.endswith(".log"):
                continue
            full_path = os.path.join(runs_dir, entry)
            if not os.path.isfile(full_path):
                continue
            if os.path.realpath(full_path) in protected:
                continue
            try:
                modified = os.path.getmtime(full_path)
            except OSError:
                # The file vanished or is unreadable: not our problem.
                continue
            candidates.append((full_path, modified))

        newest_first = sorted(candidates, key=lambda item: item[1], reverse=True)
        oldest_allowed = time.time() - keep_days * 86400

        for rank, (full_path, modified) in enumerate(newest_first):
            expired = modified < oldest_allowed
            surplus = rank >= keep_max
            if not (expired or surplus):
                continue
            try:
                os.remove(full_path)
            except OSError as e:
                logger.warning(f"prune_run_logs: failed to remove {full_path}: {e}")
            else:
                removed += 1
    except Exception as e:
        logger.warning(f"prune_run_logs failed for {runs_dir}: {e}")
    return removed
class AgentLauncher:
    """Launch Claude CLI agents directly (binary mounted into container)."""

    # Per-role launch configuration, keyed by agent role name. _spawn rejects
    # any agent that has no entry here ("Unknown agent"). For each role:
    #   system_prompt - file whose content is passed as --system-prompt
    #   task_file     - file whose content is passed as the prompt; also the
    #                   file _spawn writes the task content into
    #   allowed_tools - value passed to --allowedTools
    # Both file paths are relative: the spawned command first cd-s into the
    # task's worktree and then reads them with `cat`.
    AGENT_CONFIGS = {
        "analyst": {
            "system_prompt": ".openclaw/agents/analyst.md",
            "task_file": ".task.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "architect": {
            "system_prompt": ".openclaw/agents/architect.md",
            "task_file": ".task-arch.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "developer": {
            "system_prompt": ".openclaw/agents/developer.md",
            "task_file": ".task-dev.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "reviewer": {
            "system_prompt": ".openclaw/agents/reviewer.md",
            "task_file": ".task-review.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "tester": {
            "system_prompt": ".openclaw/agents/tester.md",
            "task_file": ".task-test.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "deployer": {
            "task_file": ".task-deploy.md",
            "system_prompt": ".openclaw/agents/deployer.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
    }
    # Path of the Claude CLI executable that _spawn puts into the command line.
    CLAUDE_BIN = "/opt/claude-code/bin/claude.exe"
    # ORCH-7 (M-2): timeout is now configurable. AGENT_TIMEOUT stays as a
    # backward-compatible alias for the default; the actual value (and per-agent
    # overrides) live in settings and are resolved via _resolve_timeout().
    # Being a class attribute, it is read from settings once, when the class
    # body is executed.
    AGENT_TIMEOUT = settings.agent_timeout_seconds
def launch(self, agent: str, repo: str, task_content: str = None, task_id: int = None) -> int:
    """Launch a Claude CLI agent directly (legacy synchronous entry point).

    Retained for backward compatibility with direct callers and existing
    tests. The ORCH-1 job queue goes through ``launch_job()`` instead; both
    entry points delegate to ``_spawn()``.

    Args:
        agent: Agent role; must be a key of ``AGENT_CONFIGS``.
        repo: Repository name.
        task_content: Optional text to write into the agent's task file.
        task_id: Optional id of the task this run belongs to.

    Returns:
        The id of the ``agent_runs`` row created for this run.
    """
    # A direct launch has no queue job behind it, hence job_id=None.
    run_id = self._spawn(agent, repo, task_content, task_id, job_id=None)
    return run_id
def launch_job(self, job: dict) -> int:
    """ORCH-1: launch the agent for a claimed queue job.

    Uses the same spawn path as ``launch()`` but passes ``job['id']`` along so
    the monitor can update the job's status and link ``jobs.run_id`` to the
    ``agent_runs`` row.

    Two reserved agent names are DETERMINISTIC (no-LLM) jobs and are
    intercepted before ``_spawn`` (which would reject them as unknown agents):
      * ``deploy-finalizer``     (ORCH-036) -> ``_run_deploy_finalizer_job``
      * ``post-deploy-monitor``  (ORCH-021) -> ``_run_post_deploy_monitor_job``
    Both run synchronously and create no ``agent_runs`` row.

    Args:
        job: Queue job row; ``agent``, ``repo`` and ``id`` are required for a
            regular agent, ``task_content`` and ``task_id`` are optional.

    Returns:
        The ``agent_run_id`` for a regular agent, ``None`` for the two
        reserved deterministic jobs.
    """
    reserved_agent = job.get("agent")
    if reserved_agent == "deploy-finalizer":
        return self._run_deploy_finalizer_job(job)
    if reserved_agent == "post-deploy-monitor":
        return self._run_post_deploy_monitor_job(job)

    # Regular LLM agent: unpack the job row and hand it to the shared spawner.
    agent = job["agent"]
    repo = job["repo"]
    task_content = job.get("task_content")
    task_id = job.get("task_id")
    return self._spawn(agent, repo, task_content, task_id, job_id=job["id"])
def _run_deploy_finalizer_job(self, job: dict):
    """ORCH-036 Phase C: run the deterministic deploy finalizer for a job.

    This is not an LLM spawn — there is no subprocess and no monitor thread —
    so the jobs row is marked ``done`` / ``failed`` right here. Every error is
    contained so that a finalizer fault cannot wedge the worker.

    Args:
        job: Queue job row; ``job["id"]`` identifies the jobs row to update.

    Returns:
        Always ``None`` (no ``agent_runs`` row is created).
    """
    from ..db import mark_job
    from .. import stage_engine

    try:
        stage_engine.run_deploy_finalizer(job)
        mark_job(job["id"], "done")
        logger.info(f"deploy-finalizer job {job['id']} done")
    except Exception as e:
        logger.error(f"deploy-finalizer job {job['id']} failed: {e}")
        try:
            mark_job(job["id"], "failed", error=f"deploy-finalizer error: {e}")
        except Exception as mark_err:
            # Still best-effort (never re-raised), but no longer silent: if the
            # row could not be marked failed, the operator needs a trace of it.
            logger.warning(
                f"deploy-finalizer job {job['id']}: could not mark job failed: {mark_err}"
            )
    return None
def _run_post_deploy_monitor_job(self, job: dict):
    """ORCH-021: run one deterministic post-deploy monitor tick for a job.

    This is not an LLM spawn — there is no subprocess and no monitor thread —
    so the jobs row is marked ``done`` / ``failed`` right here. Every error is
    contained so that a monitor fault can never wedge the worker or starve
    other projects (AC-16).

    Args:
        job: Queue job row; ``job["id"]`` identifies the jobs row to update.

    Returns:
        Always ``None`` (no ``agent_runs`` row is created).
    """
    from ..db import mark_job
    from .. import stage_engine

    try:
        stage_engine.run_post_deploy_monitor(job)
        mark_job(job["id"], "done")
        logger.info(f"post-deploy-monitor job {job['id']} done")
    except Exception as e:
        logger.error(f"post-deploy-monitor job {job['id']} failed: {e}")
        try:
            mark_job(job["id"], "failed", error=f"post-deploy-monitor error: {e}")
        except Exception as mark_err:
            # Still best-effort (never re-raised), but no longer silent: if the
            # row could not be marked failed, the operator needs a trace of it.
            logger.warning(
                f"post-deploy-monitor job {job['id']}: could not mark job failed: {mark_err}"
            )
    return None
def _spawn(self, agent: str, repo: str, task_content: str = None,
           task_id: int = None, job_id: int = None) -> int:
    """Shared spawn implementation for launch() and launch_job().

    Steps, in order: validate agent and repo; resolve the task branch and the
    Plane project; make sure the per-branch worktree exists; optionally write
    the task file; insert the ``agent_runs`` row; build the Claude CLI command;
    start it as a background process whose output goes to the per-run log;
    start the watchdog and monitor threads.

    When ``job_id`` is set, the monitor/watchdog drive the jobs table status
    (ORCH-1). The claude-CLI Popen logic (B-2) and worktree/task-file logic
    (B-1 / ORCH-2) are unchanged.

    Args:
        agent: Agent role; must be a key of ``AGENT_CONFIGS``.
        repo: Repository name (a directory under ``settings.repos_dir``).
        task_content: Optional text written into the agent's task file.
        task_id: Optional task id; selects the branch (default ``"main"``).
        job_id: Optional queue job id to link to this run.

    Returns:
        The id of the new ``agent_runs`` row.

    Raises:
        ValueError: ``agent`` is not configured.
        FileNotFoundError: the repository clone does not exist.
    """
    # Local import: only needed here, to quote the worktree path for the shell.
    import shlex

    config = self.AGENT_CONFIGS.get(agent)
    if not config:
        raise ValueError(f"Unknown agent: {agent}")

    # Main clone lives at /repos/<repo>; the agent works in an isolated worktree
    # (ORCH-2 / S-4) so concurrent tasks never fight over a shared checkout.
    local_repo_path = os.path.join(settings.repos_dir, repo)
    if not os.path.isdir(local_repo_path):
        raise FileNotFoundError(f"Repo not found: {local_repo_path}")

    # Determine branch (needed before we touch the worktree / task file).
    _br_row = get_db().execute("SELECT branch FROM tasks WHERE id=?", (task_id,)).fetchone() if task_id else None
    agent_branch = _br_row[0] if _br_row else "main"

    # ORCH-41: resolve the Plane project uuid for this repo so per-project
    # model/effort overrides apply. Unknown repo -> None (env/default only).
    from ..projects import get_project_by_repo
    _proj = get_project_by_repo(repo)
    project_id = _proj.plane_project_id if _proj else None

    # Ensure the per-branch worktree exists and is on the right branch.
    work_path = ensure_worktree(repo, agent_branch)

    # Write task file if content provided (B-1: direct write; now into the worktree).
    if task_content:
        self._write_task_file(repo, agent_branch, config["task_file"], task_content)

    # Record run in DB
    conn = get_db()
    cursor = conn.execute(
        "INSERT INTO agent_runs (task_id, agent) VALUES (?, ?)",
        (task_id, agent),
    )
    run_id = cursor.lastrowid
    conn.commit()

    # ORCH-1: link this job to the agent_runs row and stamp started_at.
    if job_id is not None:
        conn.execute(
            "UPDATE jobs SET run_id = ?, started_at = datetime('now') WHERE id = ?",
            (run_id, job_id),
        )
        conn.commit()

    # Prepare output log path
    output_path = _run_log_path(run_id)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Build the claude command
    task_file = config["task_file"]
    system_prompt = config["system_prompt"]
    allowed_tools = config["allowed_tools"]

    # ORCH-41: model + effort + optional fallback are resolved from config
    # (project-override > per-agent env > default), not hardcoded in AGENT_CONFIGS.
    model = resolve_agent_model(agent, project_id)
    effort = resolve_agent_effort(agent, project_id)

    # ORCH-087 (BR-EFF): stamp the REAL --effort value onto this agent_runs row
    # in the moment of launch. The CLI does not echo effort in its result JSON,
    # so this is the only reliable source for the tracker's "· model · effort"
    # line. Empty resolve (no --effort flag) -> NULL so the suffix is omitted.
    # Reuses the still-open conn; never blocks the launch.
    try:
        conn.execute(
            "UPDATE agent_runs SET effort=? WHERE id=?",
            (effort or None, run_id),
        )
        conn.commit()
    except Exception as e:
        logger.warning(f"effort stamp failed for run_id={run_id}: {e}")

    model_flag = f"--model {model} " if model else ""
    effort_flag = f"--effort {effort} " if effort else ""

    # ORCH-074 (G2): agent_fallback_model is read directly here, bypassing
    # resolve_agent_model, so the same validator must guard this point too —
    # otherwise a typo in ORCH_AGENT_FALLBACK_MODEL would slip into
    # --fallback-model (never-break violation). Empty value -> no flag, exactly
    # as before (is_valid_model("") is False but the `if fb` short-circuits).
    fb = settings.agent_fallback_model
    if fb and not is_valid_model(fb):
        logger.warning(
            f"Invalid fallback model '{fb}'; dropping --fallback-model"
        )
        fb = ""
    # The validator accepts a name with surrounding whitespace (it checks the
    # stripped form), so strip before use: a stray newline would otherwise end
    # up inside the shell command string below and split it.
    fb = fb.strip() if fb else ""
    fb_flag = f"--fallback-model {fb} " if fb else ""

    # No git fetch/checkout here: ensure_worktree() already put the worktree on
    # the right branch. The agent simply runs inside its isolated work_path.
    # Feature 4 (token usage): --output-format json makes claude emit a single
    # result JSON (with usage + total_cost_usd) at the end of stdout. The log
    # still captures it; _monitor_agent parses the trailing JSON after the run
    # to record per-agent tokens/cost. _monitor_agent's failure handling keys
    # off the process exit_code (not stdout shape), so this is safe.
    #
    # work_path is derived from the repo and branch names, so it is quoted for
    # the shell. shlex.quote returns a path made only of safe characters
    # unchanged, so the command is identical for ordinary paths.
    cmd = (
        f'cd {shlex.quote(str(work_path))} && '
        f'{self.CLAUDE_BIN} --print '
        f'--output-format json '
        f'{model_flag}{effort_flag}{fb_flag}'
        f'"$(cat {task_file})" '
        f'--system-prompt "$(cat {system_prompt})" '
        f'--allowedTools {allowed_tools}'
    )
    logger.info(f"Launching agent '{agent}' for repo '{repo}', run_id={run_id}")

    # Launch as background process.
    # B-2 fix: redirect stdout/stderr straight to the log file at the OS level.
    # No PIPE in the orchestrator process -> no PIPE deadlock, no reader thread,
    # no zombies. log_fh is closed by _monitor_agent after proc.wait().
    log_fh = open(output_path, "w")
    try:
        proc = subprocess.Popen(
            ["bash", "-c", cmd],
            stdout=log_fh,
            stderr=subprocess.STDOUT,
            env={
                **os.environ,
                "HOME": "/home/slin",
                "GIT_AUTHOR_NAME": "claude-bot",
                "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local",
                "GIT_COMMITTER_NAME": "claude-bot",
                "GIT_COMMITTER_EMAIL": "claude-bot@mva154.local",
            },
        )
    except BaseException:
        # No process means no monitor thread, and the monitor is the only
        # place that closes log_fh. Close it here so a failed launch does not
        # leak the file handle, then let the original error propagate.
        log_fh.close()
        raise

    # Update DB with output path
    conn.execute(
        "UPDATE agent_runs SET output_path = ? WHERE id = ?",
        (output_path, run_id),
    )
    # ORCH-065: stamp the agent process pid onto the job row so the job-reaper
    # can probe liveness (os.kill(pid, 0)). proc.pid only exists after Popen,
    # so this is a second UPDATE next to run_id/started_at (set above in _spawn).
    if job_id is not None:
        conn.execute(
            "UPDATE jobs SET pid = ? WHERE id = ?",
            (proc.pid, job_id),
        )
    conn.commit()
    conn.close()

    # Start timeout watchdog
    t = threading.Thread(
        target=self._watchdog,
        args=(proc.pid, run_id),
        kwargs={"job_id": job_id, "agent": agent},
        daemon=True,
    )
    t.start()

    # Start monitor thread (waits for completion, commits, pushes)
    # agent_branch already computed above
    m = threading.Thread(
        target=self._monitor_agent,
        args=(proc, run_id, agent, repo, agent_branch, output_path, log_fh),
        kwargs={"job_id": job_id},
        daemon=True,
    )
    m.start()

    logger.info(f"Agent '{agent}' launched, pid={proc.pid}, run_id={run_id}")
    notify_agent_started(run_id, agent, task_id)
    return run_id
@staticmethod
def _resolve_timeout(agent: str = None) -> int:
    """ORCH-7 (M-2): resolve the wall-clock timeout for an agent.

    A per-agent override from ``settings.agent_timeout_overrides_json`` (a
    JSON object like ``{"reviewer": 3600}``) wins; otherwise the global default
    ``settings.agent_timeout_seconds`` is used. A bad override never bricks
    runs: it is logged and the default is used instead. "Bad" covers
    malformed JSON, a value that cannot be converted to ``int`` (including a
    JSON ``Infinity``, which ``int()`` rejects with OverflowError), and a
    zero or negative value (which would kill the agent immediately or make
    ``time.sleep`` fail in the watchdog).

    Args:
        agent: Agent role name; ``None`` / empty means "use the default".

    Returns:
        The timeout in seconds.
    """
    default = settings.agent_timeout_seconds
    raw = (settings.agent_timeout_overrides_json or "").strip()
    if agent and raw:
        try:
            overrides = json.loads(raw)
            if isinstance(overrides, dict) and agent in overrides:
                override = int(overrides[agent])
                if override > 0:
                    return override
                logger.warning(
                    f"Non-positive timeout override {override} for agent "
                    f"'{agent}', using default"
                )
        except (ValueError, TypeError, OverflowError) as e:
            # json.JSONDecodeError is a ValueError; OverflowError comes from
            # int(float('inf')) when the JSON contains Infinity.
            logger.warning(f"Invalid agent_timeout_overrides_json, using default: {e}")
    return default
def _watchdog(self, pid: int, run_id: int, timeout: int = None,
              job_id: int = None, agent: str = None):
    """Kill the agent process if it is still running after its timeout.

    ORCH-1: on a timeout-kill the monitor's proc.wait() returns the kill exit
    code and drives the job retry/fail logic, so the watchdog itself only needs
    to terminate the process and record the agent_runs exit. ``job_id`` is
    accepted for symmetry and is not used.

    ORCH-7 (M-2): graceful shutdown. Instead of an immediate SIGKILL (which cuts
    claude off mid-write and leaves half-written artifacts), send SIGTERM first,
    give the process up to ``settings.agent_kill_grace_seconds`` to flush and
    exit on its own, and only SIGKILL if it is still alive after the grace
    window. If the process exits during the grace window, SIGKILL is NOT sent.

    "Process is gone" is tolerated at every step. That is signalled by
    ProcessLookupError, and also by PermissionError: the latter means the PID
    now belongs to a process we may not signal, i.e. our agent has exited and
    its PID was recycled. The recorded exit_code stays -9 to match the
    existing retry/fail contract regardless of which signal reaped it.

    NOTE(review): the watchdog always sleeps the full timeout and then signals
    by bare PID. If the agent finished earlier and the PID was recycled by a
    process of the SAME user, that unrelated process would be signalled —
    consider checking whether the run has already finished before phase 1.

    Args:
        pid: PID of the spawned agent process.
        run_id: ``agent_runs`` row id, used for logging and the kill record.
        timeout: Seconds to wait; resolved via ``_resolve_timeout(agent)``
            when ``None``.
        job_id: Queue job id (unused).
        agent: Agent role, used only to resolve the timeout.
    """
    # Either exception means there is no agent process of ours left to signal.
    gone = (ProcessLookupError, PermissionError)

    if timeout is None:
        timeout = self._resolve_timeout(agent)
    time.sleep(timeout)

    # Phase 1: SIGTERM (graceful). If the process is already gone, we're done.
    try:
        os.kill(pid, signal.SIGTERM)
        logger.warning(
            f"Agent run_id={run_id} exceeded {timeout}s timeout: sent SIGTERM "
            f"(pid={pid}), grace={settings.agent_kill_grace_seconds}s"
        )
    except gone:
        logger.info(f"Agent run_id={run_id} already exited before SIGTERM")
        return  # nothing to record: the monitor's proc.wait() owns the exit

    # Phase 2: poll for graceful exit within the grace window.
    grace = settings.agent_kill_grace_seconds
    poll_interval = 0.5
    waited = 0.0
    while waited < grace:
        time.sleep(poll_interval)
        waited += poll_interval
        try:
            os.kill(pid, 0)  # signal 0 = liveness probe, does not kill
        except gone:
            logger.info(
                f"Agent run_id={run_id} exited gracefully after SIGTERM "
                f"({waited:.1f}s); no SIGKILL needed"
            )
            self._record_kill(run_id)
            return

    # Phase 3: still alive -> hard SIGKILL.
    try:
        os.kill(pid, signal.SIGKILL)
        logger.warning(
            f"Agent run_id={run_id} did not exit within {grace}s grace: sent SIGKILL"
        )
    except gone:
        logger.info(f"Agent run_id={run_id} exited just before SIGKILL")
    self._record_kill(run_id)
@staticmethod
def _record_kill(run_id: int):
    """Stamp the agent_runs row as timeout-killed (exit_code=-9).

    ORCH-1: -9 is the existing kill-exit contract the monitor/retry logic keys
    off, so it is kept stable whether the reap came from SIGTERM or SIGKILL.
    ``finished_at`` is set to the current database time in the same UPDATE.

    Args:
        run_id: Id of the ``agent_runs`` row to update.
    """
    conn = get_db()
    try:
        conn.execute(
            "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=-9 WHERE id=?",
            (run_id,),
        )
        conn.commit()
    finally:
        # Close even when the UPDATE/commit fails, so a DB error in the
        # watchdog thread does not leak the connection.
        conn.close()
def _monitor_agent(self, proc, run_id, agent, repo, branch, output_path=None, log_fh=None, job_id=None):
    """Wait for the agent to finish, then run all post-run bookkeeping.

    B-2 fix: stdout already goes straight to the log file via Popen, so we just
    block on proc.wait() (guaranteed reap -> no zombie, real exit_code) and then
    close the log file handle. No PIPE, no select loop, no startup timeout here
    (the watchdog still enforces the overall AGENT_TIMEOUT by pid).

    After the process exits, in this order:
      1. record finished_at / exit_code on the agent_runs row and send the
         "agent finished" notification;
      2. parse and record token usage from the run log (never fatal);
      3. commit and push changes made in the per-branch worktree (errors are
         logged, never raised);
      4. on a deployer failure, move the task back to "development", mark the
         issue blocked and notify;
      5. on any other agent failure, send a Telegram alert with the log path;
      6. on success, post the usage comment and try to advance the stage;
      7. for queue-launched runs, finalize the jobs row.

    Args:
        proc: The process object returned by subprocess.Popen in _spawn.
        run_id: Id of the agent_runs row for this run.
        agent: Agent role name.
        repo: Repository name.
        branch: Branch the agent worked on (its worktree is already on it).
        output_path: Path of the per-run log, used for usage parsing.
        log_fh: Open log file handle to close once the process has exited.
        job_id: Queue job id, or None for a legacy direct launch.
    """
    import time as _time
    # Duration is measured from the start of this monitor thread, which _spawn
    # starts right after launching the process.
    _start_ts = _time.time()
    exit_code = proc.wait()
    if log_fh is not None:
        try:
            log_fh.close()
        except Exception:
            pass
    _duration_s = int(_time.time() - _start_ts)
    logger.info(f"Agent run_id={run_id} ({agent}) finished with exit_code={exit_code}")

    # Update DB
    conn = get_db()
    conn.execute(
        "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=? WHERE id=?",
        (exit_code, run_id),
    )
    conn.commit()
    # Get task_id for notification
    _row = conn.execute("SELECT task_id FROM agent_runs WHERE id=?", (run_id,)).fetchone()
    _task_id = _row[0] if _row else None
    conn.close()
    notify_agent_finished(run_id, agent, exit_code, task_id=_task_id, duration_s=_duration_s)

    # Feature 4: parse token usage / cost from the (json) run log and record
    # it on the agent_runs row. Never fatal — a garbled/missing JSON records
    # NULLs and logs a warning so a broken run can't crash the monitor.
    try:
        from ..usage import parse_usage_from_log, record_usage
        _usage = parse_usage_from_log(output_path) if output_path else None
        record_usage(run_id, _usage)
    except Exception as e:
        logger.warning(f"run_id={run_id}: usage accounting failed: {e}")
        _usage = None

    # Commit and push any changes — in the per-branch worktree (ORCH-2 / S-4),
    # NOT in the shared /repos/<repo>. The worktree is already on `branch`
    # (ensure_worktree did the checkout), so no checkout is needed here.
    repo_path = get_worktree_path(repo, branch)
    try:
        # Same bot identity / HOME override that _spawn gives the agent process.
        git_env = {
            **os.environ,
            "HOME": "/home/slin",
            "GIT_AUTHOR_NAME": "claude-bot",
            "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local",
            "GIT_COMMITTER_NAME": "claude-bot",
            "GIT_COMMITTER_EMAIL": "claude-bot@mva154.local",
        }
        # Non-empty porcelain output means the worktree has something to commit.
        result = subprocess.run(
            ["git", "-C", repo_path, "status", "--porcelain"],
            capture_output=True, text=True, timeout=10, env=git_env
        )
        if result.stdout.strip():
            # Add docs/ always
            subprocess.run(
                ["git", "-C", repo_path, "add", "docs/"],
                capture_output=True, text=True, timeout=10, env=git_env
            )
            # Add src/ and tests/ for developer
            if agent == "developer":
                subprocess.run(
                    ["git", "-C", repo_path, "add", "src/", "tests/"],
                    capture_output=True, text=True, timeout=10, env=git_env
                )
            # Commit
            commit_result = subprocess.run(
                ["git", "-C", repo_path, "commit", "-m",
                 f"{agent}(ET): auto-commit from {agent} run_id={run_id}"],
                capture_output=True, text=True, timeout=30, env=git_env
            )
            if commit_result.returncode == 0:
                push_result = subprocess.run(
                    ["git", "-C", repo_path, "push", "origin", branch],
                    capture_output=True, text=True, timeout=60, env=git_env
                )
                if push_result.returncode == 0:
                    logger.info(f"Agent run_id={run_id}: committed and pushed to {branch}")
                    # Auto-create PR after developer pushes
                    if agent == "developer":
                        self._ensure_pr(repo, branch, run_id)
                else:
                    logger.error(f"Agent run_id={run_id}: push failed: {push_result.stderr}")
            else:
                # Also reached when nothing under the staged paths changed.
                logger.warning(f"Agent run_id={run_id}: commit failed: {commit_result.stderr}")
        else:
            logger.info(f"Agent run_id={run_id}: no changes to commit")
            # ORCH-061: on a self-deploy action stage (deploy-staging/deploy)
            # an empty diff is EXPECTED (action, not a src edit). Emit an
            # explicit observability line so an operator can tell this apart
            # from a code-stage no-op. Does NOT affect advancement (decided by
            # exit-code + gate verdict, never by a commit existing).
            try:
                _t = get_task_by_repo_branch(repo, branch)
                _stage = _t["stage"] if _t else None
                _note = action_stage_no_changes_note(_stage, repo)
                if _note:
                    logger.info(f"Agent run_id={run_id}: {_note}")
            except Exception as _e:
                logger.debug(
                    f"Agent run_id={run_id}: action-stage no-changes note "
                    f"skipped: {_e}"
                )
    except Exception as e:
        # Includes subprocess.TimeoutExpired from any of the git calls above.
        logger.error(f"Agent run_id={run_id}: post-run git failed: {e}")

    # Handle deployer failure (smoke/healthcheck failed) — Task 7
    # NOTE(review): nothing in this branch is wrapped in try/except, so an
    # error raised by a notification call would end this thread before
    # _finalize_job runs — confirm that is acceptable.
    if exit_code != 0 and agent == "deployer":
        conn = get_db()
        task_row = conn.execute(
            "SELECT id, work_item_id FROM tasks WHERE repo=? AND branch=?",
            (repo, branch),
        ).fetchone()
        conn.close()
        if task_row:
            _tid, _wid = task_row
            # Send the task back to development and announce the transition.
            update_task_stage(_tid, "development")
            notify_stage_change(_tid, "deploy", "development")
            plane_notify_stage(_wid, "deploy", "development")
            from ..plane_sync import set_issue_blocked
            set_issue_blocked(_wid)
            plane_add_comment(
                _wid,
                "\u274c Deploy FAILED (smoke/healthcheck). Rolled back. Developer \u043d\u0443\u0436\u0435\u043d \u0434\u043b\u044f \u0444\u0438\u043a\u0441\u0430.",
                author="deployer",
            )
            from ..notifications import send_telegram, link_for
            send_telegram(f"\U0001f6a8 {link_for(_wid)}: Deploy failed! Rolled back. Needs fix.")

    # Failure alert for every non-deployer agent. The condition fires for ANY
    # non-zero exit code, not only the kill codes (-9 / 137); the
    # `not in (None,)` part only excludes a None exit code.
    if exit_code != 0 and exit_code not in (None,):
        conn = get_db()
        task_row = conn.execute(
            "SELECT id, work_item_id FROM tasks WHERE repo=? AND branch=?",
            (repo, branch),
        ).fetchone()
        conn.close()
        if task_row and agent != "deployer":  # deployer handled above
            _tid, _wid = task_row
            from ..notifications import send_telegram, link_for
            send_telegram(f"\u26a0\ufe0f {link_for(_wid, _tid)}: Agent {agent} failed (exit_code={exit_code}). Check logs: {_run_log_path(run_id)}")

    # Feature 4 + ORCH-016: post the unified per-agent status comment under
    # that agent's bot, threading the wall-clock duration we just measured
    # straight through (ADR-001 §6: explicit param wins over DB fallback).
    # The deployer finishing the task also posts the per-task usage summary.
    if exit_code == 0:
        try:
            self._post_usage_comments(
                run_id, agent, repo, branch, _usage, duration_s=_duration_s
            )
        except Exception as e:
            logger.warning(f"run_id={run_id}: usage comment failed: {e}")

    # Auto-advance stage if agent finished successfully and QG passes
    if exit_code == 0:
        self._try_advance_stage(run_id, agent, repo, branch)

    # ORCH-1: drive the job-queue status for queue-launched jobs only.
    # (Legacy direct launch() has job_id=None and is unaffected.)
    if job_id is not None:
        self._finalize_job(job_id, agent, run_id, exit_code, output_path=output_path)
def _backoff_seconds(self, transient_attempts: int, retry_after: int = None) -> int:
    """Compute the requeue delay (seconds) after a transient failure.

    The delay is ``2 ** transient_attempts * settings.backoff_base_seconds``,
    limited to ``settings.backoff_max_seconds``. A negative attempt count is
    treated as zero.

    A positive ``retry_after`` hint lifts the delay to at least that value,
    but the hint is clamped to the same cap first, so the returned delay
    never exceeds ``settings.backoff_max_seconds`` even when the server asked
    for a longer pause.
    """
    step = settings.backoff_base_seconds
    ceiling = settings.backoff_max_seconds
    exponent = max(transient_attempts, 0)
    delay = min((2 ** exponent) * step, ceiling)
    hint_usable = retry_after is not None and retry_after > 0
    if hint_usable:
        clamped_hint = min(retry_after, ceiling)
        delay = max(delay, clamped_hint)
    return int(delay)
def _finalize_job(self, job_id: int, agent: str, run_id: int, exit_code, output_path=None):
    """ORCH-1: update the jobs row after the agent process finished.

    exit_code == 0 -> the job is marked 'done' and a recovered outcome is
    forwarded to the circuit breaker via ``_record_outcome``.

    exit_code != 0 -> the failure is classified from the run log
    (``output_path`` when given, otherwise ``_run_log_path(run_id)``):
      - "transient": handled by ``_finalize_transient`` (backoff requeue with
        a separate transient-attempts budget, honouring Retry-After).
      - anything else: handled by ``_finalize_permanent`` (ordinary
        attempts < max_attempts requeue, otherwise 'failed' + Telegram).

    If the classifier itself fails, the failure is treated as permanent and
    the reason is logged, so a broken classifier or an unreadable log cannot
    silently consume the ordinary attempts budget.

    Never raises: any error while finalizing is logged and swallowed. A job
    row that cannot be found is ignored.
    """
    from ..db import get_job, mark_job
    from ..error_classifier import classify_log_file
    try:
        job = get_job(job_id)
        if not job:
            return
        if exit_code == 0:
            mark_job(job_id, "done", run_id=run_id)
            logger.info(f"Job {job_id} ({agent}) done (run_id={run_id})")
            self._record_outcome(transient=False, recovered=True)
            return
        # Classify the failure from the agent log tail (no token cost).
        kind, retry_after = "permanent", None
        log_path = output_path or _run_log_path(run_id)
        try:
            kind, retry_after = classify_log_file(log_path)
        except Exception as e:
            # Best-effort: keep the "permanent" default, but leave a trace so a
            # misclassified rate-limit failure can be diagnosed afterwards.
            logger.warning(
                f"Job {job_id}: failure classification failed for "
                f"{log_path}: {e}; treating as permanent"
            )
        if kind == "transient":
            self._finalize_transient(job_id, agent, run_id, exit_code, job, retry_after)
        else:
            self._finalize_permanent(job_id, agent, run_id, exit_code, job)
    except Exception as e:
        logger.error(f"Job {job_id}: _finalize_job error: {e}")
def _finalize_transient(self, job_id, agent, run_id, exit_code, job, retry_after):
    """Handle a transient (429/overload/network) failure of a queued job.

    The outcome is always reported to the circuit breaker first. While the
    job's ``transient_attempts`` is below ``settings.transient_max_attempts``
    the job is requeued through ``mark_job_transient`` with a delay from
    ``_backoff_seconds``; once the budget is used up the job is marked
    'failed' and a failure notification is sent.
    """
    from ..db import mark_job, mark_job_transient
    used = job.get("transient_attempts", 0)
    budget = settings.transient_max_attempts
    reason = (f"transient (429/overload) agent {agent} exit={exit_code} "
              f"(run_id={run_id}); retry_after={retry_after}")
    self._record_outcome(transient=True, recovered=False)

    budget_left = used < budget
    if not budget_left:
        # Transient budget exhausted: give up and alert.
        mark_job(job_id, "failed", run_id=run_id, error=reason)
        logger.error(
            f"Job {job_id} ({agent}) failed after {used} transient attempts"
        )
        self._notify_failed(job_id, agent, job, run_id,
                            f"transient (rate-limit) after {used} attempts")
        return

    # The delay is computed for the attempt that is about to be scheduled.
    delay = self._backoff_seconds(used + 1, retry_after)
    mark_job_transient(job_id, delay, error=reason)
    logger.warning(
        f"Job {job_id} ({agent}) TRANSIENT fail (exit={exit_code}), "
        f"backoff {delay}s, transient_attempt {used + 1}/{budget}"
    )
def _finalize_permanent(self, job_id, agent, run_id, exit_code, job):
    """Handle a permanent (code-fault) failure of a queued job.

    The outcome is always reported to the circuit breaker first. While the
    job's ``attempts`` is below its ``max_attempts`` (2 when the job does not
    carry one) the job goes back to 'queued'; otherwise it is marked 'failed'
    and a failure notification is sent.
    """
    from ..db import mark_job
    tries = job.get("attempts", 0)
    limit = job.get("max_attempts", 2)
    reason = f"agent {agent} exit_code={exit_code} (run_id={run_id})"
    self._record_outcome(transient=False, recovered=False)

    may_retry = tries < limit
    if not may_retry:
        # Ordinary attempts exhausted: give up and alert.
        mark_job(job_id, "failed", run_id=run_id, error=reason)
        logger.error(
            f"Job {job_id} ({agent}) failed permanently after "
            f"{tries} attempts (exit={exit_code})"
        )
        self._notify_failed(job_id, agent, job, run_id,
                            f"{tries} attempts (exit={exit_code})")
        return

    mark_job(job_id, "queued", run_id=run_id, error=reason)
    logger.warning(
        f"Job {job_id} ({agent}) failed (exit={exit_code}), "
        f"requeued (attempt {tries}/{limit})"
    )
def _notify_failed(self, job_id, agent, job, run_id, why):
    """Send a best-effort Telegram alert for a job that was marked 'failed'.

    The message names the job, the agent, the job's repo, the reason ``why``
    and the run log path from ``_run_log_path(run_id)``.

    Never raises: a notification problem must not break job finalization.
    The problem is logged instead of being dropped silently, so a lost alert
    is still visible in the orchestrator log.
    """
    try:
        from ..notifications import send_telegram
        send_telegram(
            f"\U0001f6a8 Job {job_id} ({agent}, repo {job.get('repo')}) "
            f"failed: {why}. Logs: {_run_log_path(run_id)}"
        )
    except Exception as e:
        logger.warning(f"Job {job_id}: failure notification not sent: {e}")
def _record_outcome(self, transient: bool, recovered: bool):
"""Forward the run outcome to the circuit breaker (if a worker is wired).
Decoupled via a settable callback (set by QueueWorker.start) so the launcher
does not hard-import the worker (avoids a cycle) and tests can run the
launcher standalone.
"""
cb = getattr(self, "on_outcome", None)
if cb:
try:
cb(transient=transient, recovered=recovered)
except Exception:
pass
def _try_advance_stage(self, run_id: int, agent: str, repo: str, branch: str):
    """After an agent finishes successfully, advance the stage via the unified engine.

    ORCH-4 / M-3: the stage-advance logic lives in
    src/stage_engine.advance_stage(); this method is a thin wrapper that
    looks the task up by (repo, branch) and delegates. ``agent`` is forwarded
    as ``finished_agent`` so the engine can apply its per-agent rollback
    branches. Nothing happens when no task row matches.

    Never raises: any failure is logged with the run id. The DB connection is
    closed even when the lookup query fails.
    """
    try:
        conn = get_db()
        try:
            task_row = conn.execute(
                "SELECT id, stage, work_item_id FROM tasks WHERE repo=? AND branch=?",
                (repo, branch),
            ).fetchone()
        finally:
            # Release the connection on the error path too.
            conn.close()
        if not task_row:
            return
        task_id, current_stage, work_item_id = task_row
        from ..stage_engine import advance_stage
        advance_stage(
            task_id=task_id,
            current_stage=current_stage,
            repo=repo,
            work_item_id=work_item_id,
            branch=branch,
            finished_agent=agent,
        )
    except Exception as e:
        logger.error(f"Auto-advance failed for run_id={run_id}: {e}")
def _post_usage_comments(self, run_id, agent, repo, branch, usage, duration_s=None):
    """Feature 4 + ORCH-016: post the unified per-agent status comment.

    - For a task that has a work_item_id: a per-agent finish comment built by
      ``usage.build_status_comment(...)`` and authored by the finishing agent.
      Per the original design notes the comment carries:
        * a single-line header (icon + role + per-stage description),
        * a machine verdict line for reviewer / tester / deployer (when the
          relevant frontmatter is present in the worktree),
        * the agent's wall-clock duration (``duration_s`` as measured by the
          caller),
        * an HTML <ul> of artifact links scoped per agent,
        * a ``<sub>`` token/cost tail.
    - When the deployer finishes: additionally a per-task summary from
      ``usage.task_summary_comment(task_id)``, authored by the deployer.

    ``stage`` is read from the task row and passed to the helper (the
    original note says it selects between 14-deploy-log.md for prod and
    15-staging-log.md for staging).

    Returns without posting when no task row matches (repo, branch) or the
    task has no work_item_id. Errors are NOT swallowed here; the caller wraps
    the call. The DB connection is closed even when the lookup query fails.
    """
    from ..usage import build_status_comment, task_summary_comment
    from ..git_worktree import get_worktree_path
    conn = get_db()
    try:
        row = conn.execute(
            "SELECT id, work_item_id, stage FROM tasks WHERE repo=? AND branch=?",
            (repo, branch),
        ).fetchone()
    finally:
        # Release the connection on the error path too.
        conn.close()
    if not row:
        return
    task_id, work_item_id, stage = row[0], row[1], row[2]
    if not work_item_id:
        return
    # Observability: every agent's finish comment links its artifact(s)
    # (reviewer->12-review, tester->13-test-report, deployer->14- or 15-,
    # architect->ADR, developer->PR/branch). For the developer we resolve the
    # open PR number so the link points straight at it.
    pr_number = None
    if agent == "developer":
        pr_number = self._open_pr_number(repo, branch)
    # Best-effort worktree path: drives AC-8 (skip missing artifacts) and
    # the verdict frontmatter read. Falls back to None on lookup error so
    # the comment still goes out without the verdict line / file probe.
    try:
        worktree_root = get_worktree_path(repo, branch)
    except Exception:
        worktree_root = None
    plane_add_comment(
        work_item_id,
        build_status_comment(
            agent,
            repo=repo,
            branch=branch,
            work_item_id=work_item_id,
            pr_number=pr_number,
            stage=stage,
            usage=usage,
            duration_s=duration_s,
            task_id=task_id,
            worktree_root=worktree_root,
        ),
        author=agent,
    )
    if agent == "deployer":
        plane_add_comment(
            work_item_id, task_summary_comment(task_id), author="deployer"
        )
def _open_pr_number(self, repo: str, branch: str):
    """Return the number of the open PR whose head is `branch`, or None. Never raises.

    The open pull requests of ``repo`` are listed through the Gitea API and
    matched against ``branch`` on the client side (the PR's ``head.ref`` or
    ``head.label``). The ``head`` query parameter is still sent, but the
    result no longer depends on the server honouring it: taking the first
    returned entry unchecked could yield the number of an unrelated open PR.

    Only the first page of results is inspected. Any error (network, bad
    status, unexpected payload) results in None and a debug log line.
    """
    try:
        import httpx
        owner = settings.gitea_owner
        headers = {"Authorization": f"token {settings.gitea_token}"}
        resp = httpx.get(
            f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}/pulls",
            params={"state": "open", "head": branch},
            headers=headers, timeout=5,
        )
        if resp.status_code != 200:
            return None
        prs = resp.json()
        if not isinstance(prs, list):
            return None
        for pr in prs:
            head = pr.get("head") if isinstance(pr, dict) else None
            if not isinstance(head, dict):
                continue
            # Accept either the plain ref or the label form of the head branch.
            if branch in (head.get("ref"), head.get("label")):
                return pr.get("number")
    except Exception as e:
        logger.debug(f"_open_pr_number({repo}, {branch}) failed: {e}")
    return None
def _ensure_pr(self, repo: str, branch: str, run_id: int):
    """Make sure an open code-PR exists for ``branch``; return its number or None.

    ORCH-082 (ADR-001 R-4): the work is delegated to
    ``merge_gate.ensure_open_pr``, the single idempotent PR-creation actor, so
    PR creation lives in one place and reports the same
    created/existed/failed outcomes (G3). Per the original notes, the caller
    invokes this only on the developer path with a fresh worktree commit, and
    the actor applies the same ``head==branch AND base==main`` filter as
    ``merge_pr``, so both refer to the same code-PR.

    Returns the PR number as an int for the ``created`` / ``existed``
    outcomes, None when the reported detail is not a number, and None (with
    an error log) for any other outcome.
    """
    from .. import merge_gate
    outcome, info = merge_gate.ensure_open_pr(repo, branch)
    logger.info(f"_ensure_pr({branch}, run_id={run_id}) -> {outcome} ({info})")
    if outcome not in ("created", "existed"):
        logger.error(f"Failed to ensure PR for {branch}: {info}")
        return None
    try:
        pr_number = int(info)
    except (TypeError, ValueError):
        # The actor reported success but the detail is not a usable number.
        pr_number = None
    return pr_number
def _write_task_file(self, repo: str, branch: str, task_file: str, content: str):
    """Write the task file directly into the task's worktree.

    B-1 fix: no docker, a plain open(). ORCH-2/S-4: the target is the
    per-branch worktree returned by ``get_worktree_path(repo, branch)``
    (/repos/_wt/<repo>/<branch>), not the shared /repos/<repo>, so the agent
    reads its task brief from its own isolated working copy.

    Raises:
        RuntimeError: when the file cannot be written. The underlying OSError
            is attached as the explicit cause.
    """
    work_path = get_worktree_path(repo, branch)  # /repos/_wt/<repo>/<branch>
    full_path = os.path.join(work_path, task_file)
    try:
        with open(full_path, "w", encoding="utf-8") as f:
            f.write(content)
        logger.info(f"Task file written: {full_path} ({len(content)} bytes)")
    except OSError as e:
        logger.error(f"Failed to write task file {full_path}: {e}")
        # Chain explicitly so the traceback shows the OSError as the cause.
        raise RuntimeError(f"Failed to write task file: {e}") from e
# Module-level AgentLauncher instance, constructed once when this module is imported.
launcher = AgentLauncher()