test_spawn_stamps_resolved_effort упал в CI с PermissionError на '/app':
launcher._spawn хардкодил output_path='/app/data/runs/{run_id}.log' и
os.makedirs('/app/data/runs'). В контейнере /app есть, на CI-хосте
(act_runner hostexecutor) — нет, makedirs бросает -> красный CI.
Фикс корня (не только теста): базовый каталог per-run логов вынесен в
Settings.runs_dir (env ORCH_RUNS_DIR, дефолт '/app/data/runs' = прод 1:1).
Новый хелпер _run_log_path(run_id) — единый источник пути, использован в
_spawn + три прежних inline-строки логов/алертов. Тест monkeypatch-ит
settings.runs_dir на tmp_path -> окружение-независим (проверено прогоном
с принудительно недоступным /app). pytest tests/ -q: 1090 passed.
STAGE_TRANSITIONS/QG_CHECKS/схема БД не тронуты. Docs: README env-таблица,
CHANGELOG.
Refs: ORCH-087
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1146 lines
51 KiB
Python
1146 lines
51 KiB
Python
import subprocess
|
||
import os
|
||
import json
|
||
import logging
|
||
import re
|
||
import threading
|
||
import signal
|
||
import time
|
||
from ..config import settings
|
||
from ..db import get_db, get_task_by_repo_branch, update_task_stage, enqueue_job
|
||
from ..stages import get_next_stage, get_qg_for_stage, get_agent_for_stage
|
||
from ..git_worktree import ensure_worktree, get_worktree_path
|
||
from ..qg.checks import QG_CHECKS
|
||
from ..notifications import notify_stage_change, notify_qg_failure, notify_agent_started, notify_agent_finished, notify_approve_requested
|
||
from ..plane_sync import notify_stage_change as plane_notify_stage, add_comment as plane_add_comment
|
||
|
||
# Module-level logger for the launcher; used by the resolver helpers and by
# the AgentLauncher methods in this module.
logger = logging.getLogger("orchestrator.launcher")
|
||
|
||
# ORCH-41: valid --effort values accepted by the Claude CLI. An effort that is
# not in this set is treated as misconfiguration: logged and dropped (no flag),
# never passed through to the CLI.
# resolve_agent_effort() checks its resolved value against this set before the
# value is turned into an --effort flag.
VALID_EFFORTS = frozenset({"low", "medium", "high", "xhigh", "max"})
|
||
|
||
# ORCH-074 (G2): structural validity check for a Claude CLI model name. We use a
# FORMAT check (^claude-…$), not a static allowlist, on purpose: an allowlist
# recreates the exact rot we kill in G1 — it silently drops a CORRECT newer model
# (e.g. claude-opus-4-9) the day Anthropic ships it (never-break working against
# the operator). The final authority on whether a model exists is the Claude CLI
# itself, not our code; a format check is forward-compatible (new versions pass
# without code edits) while still catching the real failure classes: another
# provider (gpt-4), empty/whitespace, garbage chars, wrong prefix (claud-opus-typo).
# The claude- prefix is hardcoded here because the orchestrator is bound to the
# Claude CLI (CLAUDE_BIN); the canonical model VERSION lives ONLY in
# settings.agent_model_default, never here. See ADR-001 (ORCH-074).
_MODEL_NAME_RE = re.compile(r"^claude-[a-z0-9.-]+$")


def is_valid_model(name: str) -> bool:
    """ORCH-074 (G2): True iff ``name`` is a structurally valid Claude model name.

    A valid name, after ``strip()``, is non-empty, starts with ``claude-`` and
    contains only lowercase letters, digits, dots and dashes. Anything else
    (empty/whitespace, another provider like ``gpt-4``, a wrong prefix, illegal
    characters) is invalid. This is the single predicate used by BOTH
    ``resolve_agent_model`` and the inline ``--fallback-model`` read in ``_spawn``
    so a typo can never reach the CLI (never-break). It is a structural guard, not
    a registry of existing models — a structurally valid typo (``claude-opus-typo``)
    is left for the CLI to reject.

    Args:
        name: Candidate model name. Expected to be a ``str``; any other type
            (``None``, numbers, lists, bytes, ...) is reported as invalid.

    Returns:
        ``True`` when the stripped name matches ``_MODEL_NAME_RE``, else ``False``.

    Never raises.
    """
    # A non-string value has no usable .strip() (or, for bytes, cannot be
    # matched by a str pattern). Rejecting it here is what makes the
    # "never raises" promise hold for values that did not come from a str field.
    if not isinstance(name, str) or not name:
        return False
    return bool(_MODEL_NAME_RE.match(name.strip()))
|
||
|
||
# ORCH-061: action stages whose success is an ACTION (restart/retag), not a src
|
||
# edit — so "no changes to commit" is EXPECTED there, not under-delivery (FR-3).
|
||
_ACTION_STAGES = frozenset({"deploy-staging", "deploy"})
|
||
|
||
|
||
def action_stage_no_changes_note(stage, repo) -> str | None:
|
||
"""ORCH-061 (FR-3 / FR-7): observability for an empty diff on an action stage.
|
||
|
||
The ``deploy-staging`` / ``deploy`` stages are actions (restart / retag), not
|
||
code edits, so the post-run "no changes to commit" is the NORMAL case there —
|
||
advancement is decided by the agent exit-code + the staging/deploy gate verdict,
|
||
NEVER by the presence of a commit (FR-3 / AC-4). This is a PURE decision used
|
||
only to emit an explicit log line distinguishing an expected action-stage no-op
|
||
from a code-stage no-op; it has no effect on stage advancement.
|
||
|
||
Returns an explicit note string when the empty diff is expected (an action
|
||
stage of a self-deploy repo), else ``None``. Never raises.
|
||
"""
|
||
try:
|
||
if stage in _ACTION_STAGES:
|
||
from ..self_deploy import self_deploy_applies
|
||
if self_deploy_applies(repo):
|
||
return f"{stage}: no code changes (expected on action stage)"
|
||
return None
|
||
except Exception: # noqa: BLE001 - observability only, never raise
|
||
return None
|
||
|
||
|
||
def _resolve_agent_attr(agent, project_id, project_map_attr, env_attr_prefix,
                        default_attr):
    """ORCH-41 shared resolver; the first non-empty level wins.

    Levels, highest priority first:
      1. ``ProjectConfig.<project_map_attr>[agent]`` (per-project override)
      2. ``settings.<env_attr_prefix><agent>``       (per-agent setting)
      3. ``settings.<default_attr>``                 (global default)
      4. ``""``                                      (no flag -> CLI default)

    ``project_id`` is looked up through ``get_project_by_plane_id``; a falsy
    id or an unknown project skips level 1. A settings attribute that does not
    exist (for example for an unknown agent name) reads as ``""`` and is
    skipped as well.
    """
    # Level 1: per-project override.
    if project_id:
        from ..projects import get_project_by_plane_id

        project = get_project_by_plane_id(project_id)
        if project is not None:
            project_value = getattr(project, project_map_attr, {}).get(agent)
            if project_value:
                return project_value

    # Levels 2 and 3: per-agent setting first, then the global default. Each
    # attribute is read only when the previous level turned out empty.
    for settings_attr in (f"{env_attr_prefix}{agent}", default_attr):
        configured = getattr(settings, settings_attr, "")
        if configured:
            return configured

    # Level 4: nothing configured -> CLI default.
    return ""
|
||
|
||
|
||
def _agent_model_candidates(agent: str, project_id: str = None):
    """Lazily yield the non-empty model values in ORCH-41 priority order.

    The order matches ``_resolve_agent_attr`` (project override, then the
    per-agent setting, then the global default). It is a generator so that
    ``resolve_agent_model`` can validate each level and move on to the next
    one when a value is invalid (ORCH-074 G2). Empty levels yield nothing.
    """
    if project_id:
        from ..projects import get_project_by_plane_id

        project = get_project_by_plane_id(project_id)
        project_model = (
            getattr(project, "agent_models", {}).get(agent)
            if project is not None
            else None
        )
        if project_model:
            yield project_model

    # Per-agent setting, then global default; a missing attribute reads as "".
    for settings_attr in (f"agent_model_{agent}", "agent_model_default"):
        configured = getattr(settings, settings_attr, "")
        if configured:
            yield configured
|
||
|
||
|
||
def resolve_agent_model(agent: str, project_id: str = None) -> str:
    """ORCH-41: pick the LLM model for an agent (optionally per-project).

    ORCH-074 (G2): each candidate from ``_agent_model_candidates`` is checked
    with ``is_valid_model``. The first candidate that passes is returned; a
    candidate that fails is logged at warning level and skipped, so
    resolution falls through to the next level.

    Returns:
        The first structurally valid model name, or ``""`` when no level
        supplies one (the caller then omits ``--model`` and the CLI default
        applies).
    """
    for candidate in _agent_model_candidates(agent, project_id):
        if not is_valid_model(candidate):
            logger.warning(
                f"Invalid model name '{candidate}' for agent '{agent}' "
                f"(expected '^claude-…'); skipping to next resolution level / CLI default"
            )
            continue
        return candidate
    return ""
|
||
|
||
|
||
def _agent_effort_floor(agent: str) -> str:
    """ORCH-081 (ORCH-52h): per-role non-empty floor for --effort resolution.

    Reads the DECLARED default of the ``agent_effort_<agent>`` field from
    ``type(settings).model_fields``, i.e. the value written in the Settings
    class body rather than whatever the live ``settings`` instance holds. That
    makes it usable as a floor when a present-but-empty
    ``ORCH_AGENT_EFFORT_<ROLE>=`` has blanked the instance value. Because the
    defaults are read from the settings class, there is no second map to keep
    in sync (ADR-001).

    For an agent without an ``agent_effort_<agent>`` field, or whose declared
    default is empty, the declared default of ``agent_effort_default`` is
    used instead.

    Returns:
        The first non-empty declared default, or ``""`` if neither field
        supplies one.
    """
    declared = type(settings).model_fields
    lookup_order = (f"agent_effort_{agent}", "agent_effort_default")
    for field in map(declared.get, lookup_order):
        if field is None:
            continue
        floor = field.default
        if floor:
            return floor
    return ""
|
||
|
||
|
||
def resolve_agent_effort(agent: str, project_id: str = None) -> str:
    """ORCH-41: pick the --effort level for an agent (optionally per-project).

    Resolution order (ORCH-081 / ADR-001 added the last level):
      1. project override  (``agent_efforts[agent]`` on the project config)
      2. per-agent setting (``settings.agent_effort_<agent>``)
      3. global default    (``settings.agent_effort_default``)
      4. per-role floor    (``_agent_effort_floor(agent)``)

    The floor is consulted only when levels 1-3 all come back empty (the case
    ORCH-081 targets: empty ``ORCH_AGENT_EFFORT_*=`` variables). It is applied
    before validation, but a non-empty invalid value such as ``turbo`` never
    reaches it: that value goes straight to the ``VALID_EFFORTS`` check, is
    logged, and is replaced by ``""``.

    Returns:
        A member of ``VALID_EFFORTS``, or an empty value when nothing is
        configured or the configured value is invalid (the caller then omits
        ``--effort``).
    """
    configured = _resolve_agent_attr(
        agent,
        project_id,
        project_map_attr="agent_efforts",
        env_attr_prefix="agent_effort_",
        default_attr="agent_effort_default",
    )
    # Levels 1-3 empty -> fall back to the declared per-role default. A typo
    # is non-empty, so it skips this fallback and is rejected below.
    effort = configured or _agent_effort_floor(agent)

    if not effort or effort in VALID_EFFORTS:
        return effort

    logger.warning(
        f"Invalid effort '{effort}' for agent '{agent}' "
        f"(allowed: {sorted(VALID_EFFORTS)}); omitting --effort"
    )
    return ""
|
||
|
||
|
||
def _run_log_path(run_id):
    """Return ``<settings.runs_dir>/<run_id>.log``, the log path of one agent run.

    ORCH-087: every caller obtains the path here, so it always follows
    ``settings.runs_dir`` instead of a hardcoded ``/app/data/runs``. That keeps
    ``_spawn`` usable on hosts (such as CI) where ``/app`` is not accessible.
    """
    log_name = "{}.log".format(run_id)
    return os.path.join(settings.runs_dir, log_name)
|
||
|
||
|
||
def prune_run_logs(runs_dir, keep_days=30, keep_max=500, active_paths=None):
    """L-2: best-effort rotation of the per-run logs in ``<runs_dir>/*.log``.

    Only regular files whose name ends in ``.log`` and that sit directly in
    ``runs_dir`` are candidates; other files and subdirectories are ignored.
    Files listed in ``active_paths`` (compared by real path) are excluded
    before ranking, so they are never removed and do not count towards
    ``keep_max``.

    The remaining candidates are ranked newest-first by mtime. A candidate is
    deleted when its mtime is older than ``keep_days`` days OR its rank is
    ``keep_max`` or beyond.

    Args:
        runs_dir: Directory holding the run logs.
        keep_days: Maximum age, in days, of a log that may be kept.
        keep_max: Maximum number of (non-active) logs that may be kept.
        active_paths: Optional iterable of log paths that must survive.

    Returns:
        Number of files actually removed. Errors are logged and swallowed, so
        this function does not raise.
    """
    deleted = 0
    try:
        # Normalise the protected paths first; fall back to the raw value if
        # realpath() cannot handle it.
        protected = set()
        for active in (active_paths or []):
            try:
                protected.add(os.path.realpath(active))
            except Exception:
                protected.add(active)

        if not os.path.isdir(runs_dir):
            return 0

        # Collect (path, mtime) for every prunable log.
        candidates = []
        for entry in os.listdir(runs_dir):
            if not entry.endswith(".log"):
                continue
            full_path = os.path.join(runs_dir, entry)
            if not os.path.isfile(full_path):
                continue
            if os.path.realpath(full_path) in protected:
                continue
            try:
                modified = os.path.getmtime(full_path)
            except OSError:
                # The file vanished or became unreadable between the calls.
                continue
            candidates.append((full_path, modified))

        newest_first = sorted(candidates, key=lambda item: item[1], reverse=True)
        oldest_allowed = time.time() - keep_days * 86400

        for rank, (full_path, modified) in enumerate(newest_first):
            expired = modified < oldest_allowed
            surplus = rank >= keep_max
            if not (expired or surplus):
                continue
            try:
                os.remove(full_path)
            except OSError as e:
                logger.warning(f"prune_run_logs: failed to remove {full_path}: {e}")
            else:
                deleted += 1
    except Exception as e:
        logger.warning(f"prune_run_logs failed for {runs_dir}: {e}")
    return deleted
|
||
|
||
|
||
class AgentLauncher:
|
||
"""Launch Claude CLI agents directly (binary mounted into container)."""
|
||
|
||
# Per-role launch configuration, keyed by agent name. _spawn() rejects any
# agent that has no entry here ("Unknown agent") and reads three keys:
#   system_prompt - file whose contents are passed via --system-prompt
#   task_file     - file whose contents become the prompt; also the file that
#                   task_content is written to before the run
#   allowed_tools - value passed to --allowedTools
# Both file names are used relative to the agent's worktree (the command
# changes into the worktree before reading them).
AGENT_CONFIGS = {
    "analyst": {
        "system_prompt": ".openclaw/agents/analyst.md",
        "task_file": ".task.md",
        "allowed_tools": "Read,Write,Edit,Bash",
    },
    "architect": {
        "system_prompt": ".openclaw/agents/architect.md",
        "task_file": ".task-arch.md",
        "allowed_tools": "Read,Write,Edit,Bash",
    },
    "developer": {
        "system_prompt": ".openclaw/agents/developer.md",
        "task_file": ".task-dev.md",
        "allowed_tools": "Read,Write,Edit,Bash",
    },
    "reviewer": {
        "system_prompt": ".openclaw/agents/reviewer.md",
        "task_file": ".task-review.md",
        "allowed_tools": "Read,Write,Edit,Bash",
    },
    "tester": {
        "system_prompt": ".openclaw/agents/tester.md",
        "task_file": ".task-test.md",
        "allowed_tools": "Read,Write,Edit,Bash",
    },
    "deployer": {
        "task_file": ".task-deploy.md",
        "system_prompt": ".openclaw/agents/deployer.md",
        "allowed_tools": "Read,Write,Edit,Bash",
    },
}

# Path of the Claude CLI binary that _spawn() executes.
CLAUDE_BIN = "/opt/claude-code/bin/claude.exe"
# ORCH-7 (M-2): timeout is now configurable. AGENT_TIMEOUT stays as a
# backward-compatible alias for the default; the actual value (and per-agent
# overrides) live in settings and are resolved via _resolve_timeout().
# NOTE: this alias is evaluated once, when the class body runs, so it does not
# follow later changes to settings.agent_timeout_seconds.
AGENT_TIMEOUT = settings.agent_timeout_seconds
|
||
|
||
def launch(self, agent: str, repo: str, task_content: str = None, task_id: int = None) -> int:
    """Launch a Claude CLI agent directly (legacy synchronous entry point).

    Retained for direct callers and existing tests. Queue jobs (ORCH-1) enter
    through ``launch_job()``; both paths delegate to ``_spawn()``, and this one
    always passes ``job_id=None``.

    Args:
        agent: Agent role; must be a key of ``AGENT_CONFIGS`` (analyst,
            architect, developer, reviewer, tester, deployer).
        repo: Repository name.
        task_content: Optional text to write into the agent's task file.
        task_id: Optional task id to associate with the run.

    Returns:
        The ``agent_runs`` id returned by ``_spawn()``.
    """
    run_id = self._spawn(agent, repo, task_content, task_id, job_id=None)
    return run_id
|
||
|
||
def launch_job(self, job: dict) -> int:
    """ORCH-1: launch the agent for a claimed queue job.

    Ordinary agents go through ``_spawn()`` exactly as in ``launch()``, with
    ``job['id']`` passed along as ``job_id`` so the job row can be linked to
    the ``agent_runs`` row. In that case the agent_run id is returned.

    Two reserved agent names are deterministic (no-LLM) jobs and are
    intercepted BEFORE ``_spawn()``, which would otherwise reject them as
    "Unknown agent":

    * ``deploy-finalizer`` (ORCH-036) -> ``_run_deploy_finalizer_job``
    * ``post-deploy-monitor`` (ORCH-021) -> ``_run_post_deploy_monitor_job``

    For those the handler's result is returned; the handlers in this class
    return ``None`` because no agent_run row is created.
    """
    deterministic_handlers = {
        "deploy-finalizer": self._run_deploy_finalizer_job,
        "post-deploy-monitor": self._run_post_deploy_monitor_job,
    }
    handler = deterministic_handlers.get(job.get("agent"))
    if handler is not None:
        return handler(job)

    agent = job["agent"]
    repo = job["repo"]
    task_content = job.get("task_content")
    task_id = job.get("task_id")
    return self._spawn(agent, repo, task_content, task_id, job_id=job["id"])
|
||
|
||
def _run_deploy_finalizer_job(self, job: dict):
    """ORCH-036 Phase C: run the deterministic deploy finalizer for a job.

    Not an LLM spawn — there is no subprocess/monitor, so the jobs row is
    marked done/failed here. Every error is contained so a finalizer fault
    cannot wedge the worker: the failure is logged with its traceback, and a
    failure to record the ``failed`` status is logged as well instead of
    being dropped silently.

    Args:
        job: The claimed queue job; its ``id`` is used to update the jobs row.

    Returns:
        Always ``None`` (no agent_run row exists for this kind of job).
    """
    from ..db import mark_job
    from .. import stage_engine

    # Read the id without indexing so that building a log message inside the
    # error handler can never raise a KeyError of its own.
    job_id = job.get("id")
    try:
        stage_engine.run_deploy_finalizer(job)
        mark_job(job["id"], "done")
        logger.info(f"deploy-finalizer job {job_id} done")
    except Exception as e:
        # exc_info keeps the traceback; the message text alone rarely says
        # where a deterministic job went wrong.
        logger.error(f"deploy-finalizer job {job_id} failed: {e}", exc_info=True)
        try:
            mark_job(job_id, "failed", error=f"deploy-finalizer error: {e}")
        except Exception as mark_err:
            # Still best-effort, but visible: without this line a job stuck
            # in its in-flight status leaves no trace in the logs.
            logger.error(
                f"deploy-finalizer job {job_id}: could not mark job failed: {mark_err}"
            )
    return None
|
||
|
||
def _run_post_deploy_monitor_job(self, job: dict):
    """ORCH-021: run one deterministic post-deploy monitor tick for a job.

    Not an LLM spawn — there is no subprocess/monitor, so the jobs row is
    marked done/failed here. Every error is contained so a monitor fault can
    never wedge the worker / starve other projects (AC-16): the failure is
    logged with its traceback, and a failure to record the ``failed`` status
    is logged as well instead of being dropped silently.

    Args:
        job: The claimed queue job; its ``id`` is used to update the jobs row.

    Returns:
        Always ``None`` (no agent_run row exists for this kind of job).
    """
    from ..db import mark_job
    from .. import stage_engine

    # Read the id without indexing so that building a log message inside the
    # error handler can never raise a KeyError of its own.
    job_id = job.get("id")
    try:
        stage_engine.run_post_deploy_monitor(job)
        mark_job(job["id"], "done")
        logger.info(f"post-deploy-monitor job {job_id} done")
    except Exception as e:
        # exc_info keeps the traceback for post-mortem debugging.
        logger.error(f"post-deploy-monitor job {job_id} failed: {e}", exc_info=True)
        try:
            mark_job(job_id, "failed", error=f"post-deploy-monitor error: {e}")
        except Exception as mark_err:
            # Still best-effort, but visible: without this line a job stuck
            # in its in-flight status leaves no trace in the logs.
            logger.error(
                f"post-deploy-monitor job {job_id}: could not mark job failed: {mark_err}"
            )
    return None
|
||
|
||
def _spawn(self, agent: str, repo: str, task_content: str = None,
           task_id: int = None, job_id: int = None) -> int:
    """Shared spawn implementation for launch() and launch_job().

    When job_id is set, the monitor/watchdog drive the jobs table status
    (ORCH-1). The claude-CLI Popen logic (B-2) and worktree/task-file logic
    (B-1 / ORCH-2) are unchanged.

    Steps: validate the agent and repo, resolve the task branch and project,
    ensure the worktree, optionally write the task file, insert the
    ``agent_runs`` row, start the CLI with stdout/stderr redirected to the
    per-run log, then start the watchdog and monitor threads.

    Args:
        agent: Agent role; must be a key of ``AGENT_CONFIGS``.
        repo: Repository name under ``settings.repos_dir``.
        task_content: Optional text written to the agent's task file.
        task_id: Optional task id; also used to look up the branch.
        job_id: Optional queue job id to link to the run.

    Returns:
        The id of the new ``agent_runs`` row.

    Raises:
        ValueError: ``agent`` has no entry in ``AGENT_CONFIGS``.
        FileNotFoundError: the repo directory does not exist.
    """
    # Function-scope stdlib import: only needed to quote the worktree path.
    import shlex

    config = self.AGENT_CONFIGS.get(agent)
    if not config:
        raise ValueError(f"Unknown agent: {agent}")

    # Main clone lives at /repos/<repo>; the agent works in an isolated worktree
    # (ORCH-2 / S-4) so concurrent tasks never fight over a shared checkout.
    local_repo_path = os.path.join(settings.repos_dir, repo)
    if not os.path.isdir(local_repo_path):
        raise FileNotFoundError(f"Repo not found: {local_repo_path}")

    # Determine branch (needed before we touch the worktree / task file).
    _br_row = get_db().execute("SELECT branch FROM tasks WHERE id=?", (task_id,)).fetchone() if task_id else None
    agent_branch = _br_row[0] if _br_row else "main"

    # ORCH-41: resolve the Plane project uuid for this repo so per-project
    # model/effort overrides apply. Unknown repo -> None (env/default only).
    from ..projects import get_project_by_repo
    _proj = get_project_by_repo(repo)
    project_id = _proj.plane_project_id if _proj else None

    # Ensure the per-branch worktree exists and is on the right branch.
    work_path = ensure_worktree(repo, agent_branch)

    # Write task file if content provided (B-1: direct write; now into the worktree).
    if task_content:
        self._write_task_file(repo, agent_branch, config["task_file"], task_content)

    # Record run in DB. The connection is closed in the `finally` below so
    # that a failure anywhere before the normal close (for example makedirs
    # on an unwritable runs_dir, or Popen itself) does not leave it open.
    conn = get_db()
    try:
        cursor = conn.execute(
            "INSERT INTO agent_runs (task_id, agent) VALUES (?, ?)",
            (task_id, agent),
        )
        run_id = cursor.lastrowid
        conn.commit()

        # ORCH-1: link this job to the agent_runs row and stamp started_at.
        if job_id is not None:
            conn.execute(
                "UPDATE jobs SET run_id = ?, started_at = datetime('now') WHERE id = ?",
                (run_id, job_id),
            )
            conn.commit()

        # Prepare output log path
        output_path = _run_log_path(run_id)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Build the claude command
        task_file = config["task_file"]
        system_prompt = config["system_prompt"]
        allowed_tools = config["allowed_tools"]

        # ORCH-41: model + effort + optional fallback are resolved from config
        # (project-override > per-agent env > default), not hardcoded in AGENT_CONFIGS.
        model = resolve_agent_model(agent, project_id)
        effort = resolve_agent_effort(agent, project_id)
        # ORCH-087 (BR-EFF): stamp the REAL --effort value onto this agent_runs row
        # in the moment of launch. The CLI does not echo effort in its result JSON,
        # so this is the only reliable source for the tracker's "· model · effort"
        # line. Empty resolve (no --effort flag) -> NULL so the suffix is omitted.
        # Reuses the still-open conn; never blocks the launch.
        try:
            conn.execute(
                "UPDATE agent_runs SET effort=? WHERE id=?",
                (effort or None, run_id),
            )
            conn.commit()
        except Exception as e:
            logger.warning(f"effort stamp failed for run_id={run_id}: {e}")
        model_flag = f"--model {model} " if model else ""
        effort_flag = f"--effort {effort} " if effort else ""
        # ORCH-074 (G2): agent_fallback_model is read directly here, bypassing
        # resolve_agent_model, so the same validator must guard this point too —
        # otherwise a typo in ORCH_AGENT_FALLBACK_MODEL would slip into
        # --fallback-model (never-break violation). Empty value -> no flag, exactly
        # as before (is_valid_model("") is False but the `if fb` short-circuits).
        fb = settings.agent_fallback_model
        if fb and not is_valid_model(fb):
            logger.warning(
                f"Invalid fallback model '{fb}'; dropping --fallback-model"
            )
            fb = ""
        fb_flag = f"--fallback-model {fb} " if fb else ""

        # No git fetch/checkout here: ensure_worktree() already put the worktree on
        # the right branch. The agent simply runs inside its isolated work_path.
        # Feature 4 (token usage): --output-format json makes claude emit a single
        # result JSON (with usage + total_cost_usd) at the end of stdout. The log
        # still captures it; _monitor_agent parses the trailing JSON after the run
        # to record per-agent tokens/cost. _monitor_agent's failure handling keys
        # off the process exit_code (not stdout shape), so this is safe.
        #
        # The worktree path is shell-quoted: the string below is executed by
        # `bash -c`, so a path containing spaces or shell metacharacters would
        # otherwise be word-split or interpreted. shlex.quote() returns a
        # plain path unchanged, so the command is identical in the common case.
        quoted_work_path = shlex.quote(str(work_path))
        cmd = (
            f'cd {quoted_work_path} && '
            f'{self.CLAUDE_BIN} --print '
            f'--output-format json '
            f'{model_flag}{effort_flag}{fb_flag}'
            f'"$(cat {task_file})" '
            f'--system-prompt "$(cat {system_prompt})" '
            f'--allowedTools {allowed_tools}'
        )

        logger.info(f"Launching agent '{agent}' for repo '{repo}', run_id={run_id}")

        # Launch as background process.
        # B-2 fix: redirect stdout/stderr straight to the log file at the OS level.
        # No PIPE in the orchestrator process -> no PIPE deadlock, no reader thread,
        # no zombies. log_fh is closed by _monitor_agent after proc.wait().
        log_fh = open(output_path, "w")
        try:
            proc = subprocess.Popen(
                ["bash", "-c", cmd],
                stdout=log_fh,
                stderr=subprocess.STDOUT,
                env={
                    **os.environ,
                    "HOME": "/home/slin",
                    "GIT_AUTHOR_NAME": "claude-bot",
                    "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local",
                    "GIT_COMMITTER_NAME": "claude-bot",
                    "GIT_COMMITTER_EMAIL": "claude-bot@mva154.local",
                },
            )
        except BaseException:
            # No process was started, so no monitor thread will ever close
            # the handle: release it here and let the error propagate.
            log_fh.close()
            raise

        # Update DB with output path
        conn.execute(
            "UPDATE agent_runs SET output_path = ? WHERE id = ?",
            (output_path, run_id),
        )
        # ORCH-065: stamp the agent process pid onto the job row so the job-reaper
        # can probe liveness (os.kill(pid, 0)). proc.pid only exists after Popen,
        # so this is a second UPDATE next to run_id/started_at (set above in _spawn).
        if job_id is not None:
            conn.execute(
                "UPDATE jobs SET pid = ? WHERE id = ?",
                (proc.pid, job_id),
            )
        conn.commit()
    finally:
        conn.close()

    # Start timeout watchdog
    t = threading.Thread(
        target=self._watchdog,
        args=(proc.pid, run_id),
        kwargs={"job_id": job_id, "agent": agent},
        daemon=True,
    )
    t.start()

    # Start monitor thread (waits for completion, commits, pushes)
    # agent_branch already computed above
    m = threading.Thread(
        target=self._monitor_agent,
        args=(proc, run_id, agent, repo, agent_branch, output_path, log_fh),
        kwargs={"job_id": job_id},
        daemon=True,
    )
    m.start()

    logger.info(f"Agent '{agent}' launched, pid={proc.pid}, run_id={run_id}")
    notify_agent_started(run_id, agent, task_id)
    return run_id
|
||
|
||
@staticmethod
def _resolve_timeout(agent: str = None) -> int:
    """ORCH-7 (M-2): resolve the wall-clock timeout for an agent.

    A per-agent override from ``settings.agent_timeout_overrides_json`` (a
    JSON object like ``{"reviewer": 3600}``) wins; otherwise the global
    default ``settings.agent_timeout_seconds`` is used. A bad override never
    bricks runs: it is logged and the default is returned when

    * the JSON is malformed or the value cannot be converted to ``int``
      (including ``Infinity`` / overflowing floats, which ``json.loads``
      accepts and ``int()`` rejects with ``OverflowError``), or
    * the value is zero or negative. The watchdog passes the result to
      ``time.sleep()``, which raises on a negative value, and a zero timeout
      would signal the agent immediately.

    Args:
        agent: Agent role to look up; ``None`` or empty selects the default.

    Returns:
        Timeout in seconds.
    """
    default = settings.agent_timeout_seconds
    raw = (settings.agent_timeout_overrides_json or "").strip()
    if not (agent and raw):
        return default

    try:
        overrides = json.loads(raw)
        if isinstance(overrides, dict) and agent in overrides:
            override = int(overrides[agent])
            if override > 0:
                return override
            logger.warning(
                f"Non-positive timeout override for agent '{agent}' "
                f"({override}), using default"
            )
    except (ValueError, TypeError, OverflowError) as e:
        logger.warning(f"Invalid agent_timeout_overrides_json, using default: {e}")
    return default
|
||
|
||
def _watchdog(self, pid: int, run_id: int, timeout: int = None,
              job_id: int = None, agent: str = None):
    """Kill agent if it exceeds its timeout.

    ORCH-1: on a timeout-kill the monitor's proc.wait() returns the kill exit
    code and drives the job retry/fail logic, so the watchdog itself only needs
    to terminate the process and record the agent_runs exit. job_id is accepted
    for symmetry.

    ORCH-7 (M-2): graceful shutdown. Instead of an immediate SIGKILL (which cuts
    claude off mid-write and leaves half-written artifacts), send SIGTERM first,
    give the process up to settings.agent_kill_grace_seconds to flush and exit on
    its own, and only SIGKILL if it is still alive after the grace window. If the
    process exits during the grace window, SIGKILL is NOT sent.
    ProcessLookupError is tolerated at every step (the process may already be
    gone). The recorded exit_code stays -9 to match the existing retry/fail
    contract regardless of which signal actually reaped it.

    PID-reuse guard: this thread sleeps for the whole timeout even when the
    agent finished long ago. Once the finished process has been reaped, the OS
    may hand the same pid to an unrelated process. Before any signal is sent,
    the agent_runs row is therefore checked: if finished_at is already set,
    the watchdog exits without signalling. The check is best-effort — if the
    DB cannot be read, the signal path runs as before.

    Args:
        pid: Process id of the spawned agent process.
        run_id: agent_runs id of the run being guarded.
        timeout: Seconds to wait; resolved via _resolve_timeout(agent) if None.
        job_id: Unused here; accepted for symmetry with the monitor.
        agent: Agent role, used only to resolve the timeout.
    """
    if timeout is None:
        timeout = self._resolve_timeout(agent)
    time.sleep(timeout)

    # Phase 0: do not signal a pid whose run has already been recorded as
    # finished (see the PID-reuse note in the docstring). A small window
    # remains between this check and os.kill(), but it shrinks the exposure
    # from the whole timeout to the time between two statements.
    try:
        conn = get_db()
        try:
            row = conn.execute(
                "SELECT finished_at FROM agent_runs WHERE id=?", (run_id,)
            ).fetchone()
        finally:
            conn.close()
    except Exception as e:
        logger.warning(
            f"Agent run_id={run_id}: watchdog could not read run state, "
            f"falling back to signalling pid={pid}: {e}"
        )
        row = None
    if row is not None and row[0] is not None:
        logger.info(
            f"Agent run_id={run_id} already finished before the {timeout}s timeout; "
            f"watchdog exiting without signalling pid={pid}"
        )
        return  # nothing to record: the monitor's proc.wait() owns the exit

    # Phase 1: SIGTERM (graceful). If the process is already gone, we're done.
    try:
        os.kill(pid, signal.SIGTERM)
        logger.warning(
            f"Agent run_id={run_id} exceeded {timeout}s timeout: sent SIGTERM "
            f"(pid={pid}), grace={settings.agent_kill_grace_seconds}s"
        )
    except ProcessLookupError:
        logger.info(f"Agent run_id={run_id} already exited before SIGTERM")
        return  # nothing to record: the monitor's proc.wait() owns the exit

    # Phase 2: poll for graceful exit within the grace window.
    grace = settings.agent_kill_grace_seconds
    poll_interval = 0.5
    waited = 0.0
    while waited < grace:
        time.sleep(poll_interval)
        waited += poll_interval
        try:
            os.kill(pid, 0)  # signal 0 = liveness probe, does not kill
        except ProcessLookupError:
            logger.info(
                f"Agent run_id={run_id} exited gracefully after SIGTERM "
                f"({waited:.1f}s); no SIGKILL needed"
            )
            self._record_kill(run_id)
            return

    # Phase 3: still alive -> hard SIGKILL.
    try:
        os.kill(pid, signal.SIGKILL)
        logger.warning(
            f"Agent run_id={run_id} did not exit within {grace}s grace: sent SIGKILL"
        )
    except ProcessLookupError:
        logger.info(f"Agent run_id={run_id} exited just before SIGKILL")
    self._record_kill(run_id)
|
||
|
||
@staticmethod
def _record_kill(run_id: int):
    """Stamp the agent_runs row as timeout-killed (exit_code=-9).

    ORCH-1: -9 is the existing kill-exit contract the monitor/retry logic keys
    off, so we keep it stable whether the reap came from SIGTERM or SIGKILL.

    Args:
        run_id: agent_runs id of the run that the watchdog terminated.

    Raises:
        Whatever the DB layer raises; the connection is closed either way.
    """
    conn = get_db()
    try:
        conn.execute(
            "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=-9 WHERE id=?",
            (run_id,),
        )
        conn.commit()
    finally:
        # Close even when the UPDATE or commit fails, so an error here does
        # not leave the connection open.
        conn.close()
|
||
|
||
def _monitor_agent(self, proc, run_id, agent, repo, branch, output_path=None, log_fh=None, job_id=None):
    """Wait for agent to finish, commit+push results, update DB.

    B-2 fix: stdout already goes straight to the log file via Popen, so we just
    block on proc.wait() (guaranteed reap -> no zombie, real exit_code) and then
    close the log file handle. No PIPE, no select loop, no startup timeout here
    (the watchdog still enforces the overall AGENT_TIMEOUT by pid).

    After the process exits, the steps run in this order:
      1. stamp ``finished_at`` / ``exit_code`` on the agent_runs row and send
         the agent-finished notification;
      2. parse and record token usage from the run log (best-effort);
      3. commit and push changes from the per-branch worktree (best-effort);
      4. on a failed deployer run, move the task back to ``development`` and
         alert; on any other failed run, send a Telegram alert;
      5. on success, post the usage/status comments and try to advance the stage;
      6. when ``job_id`` is given, finalize the queue job.

    Args:
        proc: Handle of the running agent process; only ``proc.wait()`` is
            called on it here.
        run_id: ``agent_runs.id`` of this run.
        agent: Agent role name (compared against ``"developer"`` and
            ``"deployer"`` below).
        repo: Repository name; together with ``branch`` it selects the
            worktree and the ``tasks`` row.
        branch: Branch name the agent worked on.
        output_path: Path of the run log used for usage parsing and passed on
            to ``_finalize_job``; ``None`` skips usage parsing.
        log_fh: Open log file handle to close once the process has exited,
            or ``None``.
        job_id: Queue job id; ``None`` for a legacy direct launch, in which
            case no job row is touched.
    """
    import time as _time
    # The clock starts when monitoring starts, so the reported duration is the
    # time spent waiting in this method rather than time since process spawn.
    _start_ts = _time.time()

    exit_code = proc.wait()
    if log_fh is not None:
        # Best-effort close: a failure to close the log must not abort the
        # post-run bookkeeping below.
        try:
            log_fh.close()
        except Exception:
            pass
    _duration_s = int(_time.time() - _start_ts)
    logger.info(f"Agent run_id={run_id} ({agent}) finished with exit_code={exit_code}")

    # Update DB: record the finish time and the real exit code of the run.
    conn = get_db()
    conn.execute(
        "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=? WHERE id=?",
        (exit_code, run_id),
    )
    conn.commit()

    # Get task_id for notification (None when the run row cannot be found).
    _row = conn.execute("SELECT task_id FROM agent_runs WHERE id=?", (run_id,)).fetchone()
    _task_id = _row[0] if _row else None
    conn.close()

    notify_agent_finished(run_id, agent, exit_code, task_id=_task_id, duration_s=_duration_s)

    # Feature 4: parse token usage / cost from the (json) run log and record
    # it on the agent_runs row. Never fatal — a garbled/missing JSON records
    # NULLs and logs a warning so a broken run can't crash the monitor.
    try:
        from ..usage import parse_usage_from_log, record_usage
        _usage = parse_usage_from_log(output_path) if output_path else None
        record_usage(run_id, _usage)
    except Exception as e:
        logger.warning(f"run_id={run_id}: usage accounting failed: {e}")
        # Reset so the usage comment further down never sees a half-built value.
        _usage = None

    # Commit and push any changes — in the per-branch worktree (ORCH-2 / S-4),
    # NOT in the shared /repos/<repo>. The worktree is already on `branch`
    # (ensure_worktree did the checkout), so no checkout is needed here.
    repo_path = get_worktree_path(repo, branch)
    try:
        # Inherit the current environment, overriding HOME and the git
        # author/committer identity used for the auto-commit.
        git_env = {
            **os.environ,
            "HOME": "/home/slin",
            "GIT_AUTHOR_NAME": "claude-bot",
            "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local",
            "GIT_COMMITTER_NAME": "claude-bot",
            "GIT_COMMITTER_EMAIL": "claude-bot@mva154.local",
        }
        # Non-empty porcelain output means the worktree has changes.
        result = subprocess.run(
            ["git", "-C", repo_path, "status", "--porcelain"],
            capture_output=True, text=True, timeout=10, env=git_env
        )
        if result.stdout.strip():
            # Add docs/ always
            subprocess.run(
                ["git", "-C", repo_path, "add", "docs/"],
                capture_output=True, text=True, timeout=10, env=git_env
            )
            # Add src/ and tests/ for developer
            if agent == "developer":
                subprocess.run(
                    ["git", "-C", repo_path, "add", "src/", "tests/"],
                    capture_output=True, text=True, timeout=10, env=git_env
                )
            # Commit. The return codes of the `git add` calls above are not
            # checked; a failed or empty staging step surfaces here as a
            # non-zero commit return code.
            commit_result = subprocess.run(
                ["git", "-C", repo_path, "commit", "-m",
                 f"{agent}(ET): auto-commit from {agent} run_id={run_id}"],
                capture_output=True, text=True, timeout=30, env=git_env
            )
            if commit_result.returncode == 0:
                push_result = subprocess.run(
                    ["git", "-C", repo_path, "push", "origin", branch],
                    capture_output=True, text=True, timeout=60, env=git_env
                )
                if push_result.returncode == 0:
                    logger.info(f"Agent run_id={run_id}: committed and pushed to {branch}")
                    # Auto-create PR after developer pushes
                    if agent == "developer":
                        self._ensure_pr(repo, branch, run_id)
                else:
                    logger.error(f"Agent run_id={run_id}: push failed: {push_result.stderr}")
            else:
                logger.warning(f"Agent run_id={run_id}: commit failed: {commit_result.stderr}")
        else:
            logger.info(f"Agent run_id={run_id}: no changes to commit")
            # ORCH-061: on a self-deploy action stage (deploy-staging/deploy)
            # an empty diff is EXPECTED (action, not a src edit). Emit an
            # explicit observability line so an operator can tell this apart
            # from a code-stage no-op. Does NOT affect advancement (decided by
            # exit-code + gate verdict, never by a commit existing).
            try:
                _t = get_task_by_repo_branch(repo, branch)
                _stage = _t["stage"] if _t else None
                _note = action_stage_no_changes_note(_stage, repo)
                if _note:
                    logger.info(f"Agent run_id={run_id}: {_note}")
            except Exception as _e:
                logger.debug(
                    f"Agent run_id={run_id}: action-stage no-changes note "
                    f"skipped: {_e}"
                )
    except Exception as e:
        # Any git/subprocess error (including a timeout) is logged and
        # swallowed so the remaining post-run steps still execute.
        logger.error(f"Agent run_id={run_id}: post-run git failed: {e}")

    # Handle deployer failure (smoke/healthcheck failed) — Task 7
    if exit_code != 0 and agent == "deployer":
        conn = get_db()
        task_row = conn.execute(
            "SELECT id, work_item_id FROM tasks WHERE repo=? AND branch=?",
            (repo, branch),
        ).fetchone()
        conn.close()
        if task_row:
            _tid, _wid = task_row
            # Send the task back to development. Note the "from" stage passed
            # to both notifiers is the literal "deploy".
            update_task_stage(_tid, "development")
            notify_stage_change(_tid, "deploy", "development")
            plane_notify_stage(_wid, "deploy", "development")
            from ..plane_sync import set_issue_blocked
            set_issue_blocked(_wid)
            plane_add_comment(
                _wid,
                "\u274c Deploy FAILED (smoke/healthcheck). Rolled back. Developer \u043d\u0443\u0436\u0435\u043d \u0434\u043b\u044f \u0444\u0438\u043a\u0441\u0430.",
                author="deployer",
            )
            from ..notifications import send_telegram, link_for
            send_telegram(f"\U0001f6a8 {link_for(_wid)}: Deploy failed! Rolled back. Needs fix.")

    # Alert on any non-zero exit code (this also covers kill exit codes such
    # as -9 or 137). The None check is a defensive guard.
    if exit_code != 0 and exit_code not in (None,):
        conn = get_db()
        task_row = conn.execute(
            "SELECT id, work_item_id FROM tasks WHERE repo=? AND branch=?",
            (repo, branch),
        ).fetchone()
        conn.close()
        if task_row and agent != "deployer":  # deployer handled above
            _tid, _wid = task_row
            from ..notifications import send_telegram, link_for
            send_telegram(f"\u26a0\ufe0f {link_for(_wid, _tid)}: Agent {agent} failed (exit_code={exit_code}). Check logs: {_run_log_path(run_id)}")

    # Feature 4 + ORCH-016: post the unified per-agent status comment under
    # that agent's bot, threading the wall-clock duration we just measured
    # straight through (ADR-001 §6: explicit param wins over DB fallback).
    # The deployer finishing the task also posts the per-task usage summary.
    if exit_code == 0:
        try:
            self._post_usage_comments(
                run_id, agent, repo, branch, _usage, duration_s=_duration_s
            )
        except Exception as e:
            # A failed comment must not block stage advancement below.
            logger.warning(f"run_id={run_id}: usage comment failed: {e}")

    # Auto-advance stage if agent finished successfully and QG passes
    if exit_code == 0:
        self._try_advance_stage(run_id, agent, repo, branch)

    # ORCH-1: drive the job-queue status for queue-launched jobs only.
    # (Legacy direct launch() has job_id=None and is unaffected.)
    if job_id is not None:
        self._finalize_job(job_id, agent, run_id, exit_code, output_path=output_path)
|
||
|
||
def _backoff_seconds(self, transient_attempts: int, retry_after: int = None) -> int:
    """Compute the requeue delay, in seconds, after a transient failure.

    The delay doubles with every transient attempt, starting from
    ``settings.backoff_base_seconds`` and never exceeding
    ``settings.backoff_max_seconds``; a negative attempt count counts as 0.

    A positive server-supplied ``retry_after`` lifts the delay to at least that
    value, but the hint itself is clamped to the same ceiling first, so the
    result never exceeds ``settings.backoff_max_seconds``.

    Args:
        transient_attempts: Number of transient attempts to scale the delay by.
        retry_after: Optional Retry-After hint in seconds; ``None`` or a
            non-positive value is ignored.

    Returns:
        The delay as an ``int`` number of seconds.
    """
    base = settings.backoff_base_seconds
    cap = settings.backoff_max_seconds
    exponent = max(transient_attempts, 0)
    delay = min((2 ** exponent) * base, cap)

    # No usable server hint: the exponential delay stands on its own.
    if retry_after is None or not retry_after > 0:
        return int(delay)

    hinted = min(retry_after, cap)
    return int(max(delay, hinted))
|
||
|
||
def _finalize_job(self, job_id: int, agent: str, run_id: int, exit_code, output_path=None):
    """ORCH-1: settle the jobs row once the agent process has exited.

    Outcomes:
      * unknown ``job_id`` -> nothing happens;
      * ``exit_code == 0`` -> the job is marked ``done`` and a recovered,
        non-transient outcome is reported via ``_record_outcome``;
      * any other exit code -> the failure is classified from the run log
        (no token cost) and handed to ``_finalize_transient`` or
        ``_finalize_permanent``. When classification itself raises, the failure
        is treated as permanent.

    Args:
        job_id: Queue job id to update.
        agent: Agent role name, used in log lines and passed to the handlers.
        run_id: ``agent_runs.id`` of the finished run.
        exit_code: Exit code of the agent process.
        output_path: Run log path; when falsy the path is derived from
            ``run_id`` via ``_run_log_path``.

    Errors raised inside the settlement logic are logged, not propagated.
    """
    from ..db import get_job, mark_job
    from ..error_classifier import classify_log_file
    try:
        job_row = get_job(job_id)
        if not job_row:
            return

        if exit_code == 0:
            mark_job(job_id, "done", run_id=run_id)
            logger.info(f"Job {job_id} ({agent}) done (run_id={run_id})")
            self._record_outcome(transient=False, recovered=True)
            return

        # Default verdict, kept whenever the classifier cannot be consulted.
        failure_kind = "permanent"
        retry_hint = None
        log_file = output_path or _run_log_path(run_id)
        try:
            failure_kind, retry_hint = classify_log_file(log_file)
        except Exception:
            pass

        if failure_kind == "transient":
            self._finalize_transient(job_id, agent, run_id, exit_code, job_row, retry_hint)
            return
        self._finalize_permanent(job_id, agent, run_id, exit_code, job_row)
    except Exception as e:
        logger.error(f"Job {job_id}: _finalize_job error: {e}")
|
||
|
||
def _finalize_transient(self, job_id, agent, run_id, exit_code, job, retry_after):
    """Settle a job whose run failed transiently (429 / overload / network).

    The outcome is always reported to ``_record_outcome`` as transient. While
    the job's ``transient_attempts`` is below
    ``settings.transient_max_attempts`` the job is requeued with a backoff
    delay; once that budget is used up it is marked ``failed`` and an alert is
    sent via ``_notify_failed``.

    Args:
        job_id: Queue job id.
        agent: Agent role name.
        run_id: ``agent_runs.id`` of the failed run.
        exit_code: Exit code of the agent process.
        job: Job record; read with ``.get`` for ``transient_attempts``.
        retry_after: Retry-After hint forwarded to ``_backoff_seconds``.
    """
    from ..db import mark_job, mark_job_transient

    used = job.get("transient_attempts", 0)
    budget = settings.transient_max_attempts
    failure_text = (f"transient (429/overload) agent {agent} exit={exit_code} "
                    f"(run_id={run_id}); retry_after={retry_after}")
    self._record_outcome(transient=True, recovered=False)

    if used < budget:
        # Budget left: requeue with a delay sized for the upcoming attempt.
        delay = self._backoff_seconds(used + 1, retry_after)
        mark_job_transient(job_id, delay, error=failure_text)
        logger.warning(
            f"Job {job_id} ({agent}) TRANSIENT fail (exit={exit_code}), "
            f"backoff {delay}s, transient_attempt {used + 1}/{budget}"
        )
        return

    # Budget exhausted: give up on the job and raise an alert.
    mark_job(job_id, "failed", run_id=run_id, error=failure_text)
    logger.error(
        f"Job {job_id} ({agent}) failed after {used} transient attempts"
    )
    self._notify_failed(job_id, agent, job, run_id,
                        f"transient (rate-limit) after {used} attempts")
|
||
|
||
def _finalize_permanent(self, job_id, agent, run_id, exit_code, job):
    """Settle a job whose run failed permanently (code fault).

    The outcome is always reported to ``_record_outcome`` as non-transient and
    not recovered. While the job's ``attempts`` is below its ``max_attempts``
    (2 when the record has no such key) the job goes back to ``queued``;
    otherwise it is marked ``failed`` and an alert is sent via
    ``_notify_failed``.

    Args:
        job_id: Queue job id.
        agent: Agent role name.
        run_id: ``agent_runs.id`` of the failed run.
        exit_code: Exit code of the agent process.
        job: Job record; read with ``.get`` for ``attempts`` / ``max_attempts``.
    """
    from ..db import mark_job

    tried = job.get("attempts", 0)
    limit = job.get("max_attempts", 2)
    failure_text = f"agent {agent} exit_code={exit_code} (run_id={run_id})"
    self._record_outcome(transient=False, recovered=False)

    if tried < limit:
        # Attempts remain: hand the job back to the queue.
        mark_job(job_id, "queued", run_id=run_id, error=failure_text)
        logger.warning(
            f"Job {job_id} ({agent}) failed (exit={exit_code}), "
            f"requeued (attempt {tried}/{limit})"
        )
        return

    # Out of attempts: terminal failure plus alert.
    mark_job(job_id, "failed", run_id=run_id, error=failure_text)
    logger.error(
        f"Job {job_id} ({agent}) failed permanently after "
        f"{tried} attempts (exit={exit_code})"
    )
    self._notify_failed(job_id, agent, job, run_id,
                        f"{tried} attempts (exit={exit_code})")
|
||
|
||
def _notify_failed(self, job_id, agent, job, run_id, why):
|
||
try:
|
||
from ..notifications import send_telegram
|
||
send_telegram(
|
||
f"\U0001f6a8 Job {job_id} ({agent}, repo {job.get('repo')}) "
|
||
f"failed: {why}. Logs: {_run_log_path(run_id)}"
|
||
)
|
||
except Exception:
|
||
pass
|
||
|
||
def _record_outcome(self, transient: bool, recovered: bool):
|
||
"""Forward the run outcome to the circuit breaker (if a worker is wired).
|
||
|
||
Decoupled via a settable callback (set by QueueWorker.start) so the launcher
|
||
does not hard-import the worker (avoids a cycle) and tests can run the
|
||
launcher standalone.
|
||
"""
|
||
cb = getattr(self, "on_outcome", None)
|
||
if cb:
|
||
try:
|
||
cb(transient=transient, recovered=recovered)
|
||
except Exception:
|
||
pass
|
||
|
||
def _try_advance_stage(self, run_id: int, agent: str, repo: str, branch: str):
    """Advance the task's stage through the unified stage engine.

    ORCH-4 / M-3: this is a thin wrapper around
    ``stage_engine.advance_stage()``. It resolves the task row by
    (repo, branch) and delegates, forwarding ``agent`` as ``finished_agent`` so
    the engine's per-agent rollback branches keep working.

    Args:
        run_id: ``agent_runs.id``; only used in the error log line.
        agent: Role name of the agent that just finished.
        repo: Repository name of the task.
        branch: Branch name of the task.

    Does nothing when no task row matches. Any exception is logged and
    swallowed.
    """
    try:
        db = get_db()
        found = db.execute(
            "SELECT id, stage, work_item_id FROM tasks WHERE repo=? AND branch=?",
            (repo, branch),
        ).fetchone()
        db.close()

        if found:
            task_id, current_stage, work_item_id = found
            from ..stage_engine import advance_stage
            advance_stage(
                task_id=task_id,
                current_stage=current_stage,
                repo=repo,
                work_item_id=work_item_id,
                branch=branch,
                finished_agent=agent,
            )
    except Exception as e:
        logger.error(f"Auto-advance failed for run_id={run_id}: {e}")
|
||
|
||
|
||
def _post_usage_comments(self, run_id, agent, repo, branch, usage, duration_s=None):
    """Feature 4 + ORCH-016: publish the per-agent status comment.

    For a task that has a ``work_item_id``, one comment built by
    ``usage.build_status_comment(...)`` is posted under the finishing agent's
    name. When the finishing agent is the deployer, a second comment with the
    per-task summary (``usage.task_summary_comment``) follows, authored by the
    deployer.

    The task's ``stage`` is read from the task row and handed to the comment
    builder, as are the open PR number (looked up for the developer only) and
    the worktree path (``None`` when the lookup raises).

    Args:
        run_id: ``agent_runs.id`` of the finished run (not read in this body).
        agent: Role name of the finishing agent; also the comment author.
        repo: Repository name of the task.
        branch: Branch name of the task.
        usage: Parsed usage data passed through to the comment builder.
        duration_s: Measured wall-clock duration passed through to the builder.

    Returns early, posting nothing, when no task row matches or the task has
    no ``work_item_id``.
    """
    from ..usage import build_status_comment, task_summary_comment
    from ..git_worktree import get_worktree_path

    db = get_db()
    task = db.execute(
        "SELECT id, work_item_id, stage FROM tasks WHERE repo=? AND branch=?",
        (repo, branch),
    ).fetchone()
    db.close()
    if not task:
        return

    task_id = task[0]
    work_item_id = task[1]
    stage = task[2]
    if not work_item_id:
        return

    # Only the developer's comment links to a PR, so only then look one up.
    pr_number = self._open_pr_number(repo, branch) if agent == "developer" else None

    # Best-effort worktree path: on a lookup error the comment still goes out,
    # just without anything that needs the worktree.
    try:
        worktree_root = get_worktree_path(repo, branch)
    except Exception:
        worktree_root = None

    status_body = build_status_comment(
        agent,
        repo=repo,
        branch=branch,
        work_item_id=work_item_id,
        pr_number=pr_number,
        stage=stage,
        usage=usage,
        duration_s=duration_s,
        task_id=task_id,
        worktree_root=worktree_root,
    )
    plane_add_comment(work_item_id, status_body, author=agent)

    if agent == "deployer":
        plane_add_comment(
            work_item_id, task_summary_comment(task_id), author="deployer"
        )
|
||
|
||
def _open_pr_number(self, repo: str, branch: str):
|
||
"""Return the open PR number for `branch`, or None. Never raises."""
|
||
try:
|
||
import httpx
|
||
owner = settings.gitea_owner
|
||
headers = {"Authorization": f"token {settings.gitea_token}"}
|
||
resp = httpx.get(
|
||
f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}/pulls",
|
||
params={"state": "open", "head": branch},
|
||
headers=headers, timeout=5,
|
||
)
|
||
if resp.status_code == 200:
|
||
prs = resp.json()
|
||
if prs:
|
||
return prs[0].get("number")
|
||
except Exception:
|
||
pass
|
||
return None
|
||
|
||
def _ensure_pr(self, repo: str, branch: str, run_id: int):
    """Make sure an open code-PR exists for ``branch``; return its number or None.

    ORCH-082 (ADR-001 Р-4): the work is delegated to the single idempotent
    PR-creation actor ``merge_gate.ensure_open_pr`` so that PR creation lives in
    one place and reports the same created/existed/failed outcomes (G3). The
    call trigger is unchanged: `_monitor_agent` invokes this only on the
    developer path after a successful push. The actor applies the same
    ``head==branch AND base==main`` filter as ``merge_pr``, so the PR created
    here and the one merge-verify merges are the same code-PR.

    Args:
        repo: Repository name.
        branch: Head branch of the PR.
        run_id: ``agent_runs.id``; only used in the log line.

    Returns:
        The PR number as ``int`` for a ``created`` / ``existed`` outcome whose
        detail converts to an integer; ``None`` otherwise (best-effort
        contract). A failed outcome is also logged as an error.
    """
    from .. import merge_gate

    status, detail = merge_gate.ensure_open_pr(repo, branch)
    logger.info(f"_ensure_pr({branch}, run_id={run_id}) -> {status} ({detail})")

    if status not in ("created", "existed"):
        logger.error(f"Failed to ensure PR for {branch}: {detail}")
        return None

    try:
        return int(detail)
    except (TypeError, ValueError):
        # Successful outcome, but the detail is not a usable PR number.
        return None
|
||
|
||
def _write_task_file(self, repo: str, branch: str, task_file: str, content: str):
    """Write task file directly into the task's worktree.

    B-1 fix: no docker (direct open()). ORCH-2/S-4: the target is the per-branch
    worktree (/repos/_wt/<repo>/<branch>), not the shared /repos/<repo>, so the
    agent reads the task ZADANIE from its own isolated working copy.
    Raise on failure instead of silently swallowing errors.

    Args:
        repo: Repository name, used to resolve the worktree.
        branch: Branch name, used to resolve the worktree.
        task_file: Path of the task file relative to the worktree root.
        content: Text to write; the file is written as UTF-8 and truncated
            if it already exists.

    Raises:
        RuntimeError: When the file cannot be written. The original
            ``OSError`` is attached as ``__cause__`` so the underlying errno
            and path stay visible in tracebacks.
    """
    work_path = get_worktree_path(repo, branch)  # /repos/_wt/<repo>/<branch>
    full_path = os.path.join(work_path, task_file)
    try:
        with open(full_path, "w", encoding="utf-8") as f:
            f.write(content)
        # NOTE: len(content) is a character count; the on-disk size can be
        # larger for non-ASCII text.
        logger.info(f"Task file written: {full_path} ({len(content)} bytes)")
    except OSError as e:
        logger.error(f"Failed to write task file {full_path}: {e}")
        raise RuntimeError(f"Failed to write task file: {e}") from e
|
||
|
||
|
||
# Module-level AgentLauncher instance, created when this module is imported.
launcher = AgentLauncher()
|