_finalize_job classifies a failed run from its log. Transient failures (429/overload/network) are requeued with backoff via mark_job_transient, using a separate transient_attempts budget and honouring Retry-After; permanent failures use the normal attempts < max_attempts requeue. The on_outcome callback feeds the circuit breaker. _backoff_seconds returns min(2^n * base, max), raised to Retry-After (itself capped at max) when the server sent one.
767 lines
36 KiB
Python
767 lines
36 KiB
Python
import logging
import os
import shlex
import signal
import subprocess
import threading
import time

from ..config import settings
from ..db import get_db, get_task_by_repo_branch, update_task_stage, enqueue_job
from ..stages import get_next_stage, get_qg_for_stage, get_agent_for_stage
from ..git_worktree import ensure_worktree, get_worktree_path
from ..qg.checks import QG_CHECKS
from ..notifications import notify_stage_change, notify_qg_failure, notify_agent_started, notify_agent_finished, notify_approve_requested
from ..plane_sync import notify_stage_change as plane_notify_stage, add_comment as plane_add_comment
|
|
|
|
# Module-level logger used by every AgentLauncher method in this file.
logger = logging.getLogger("orchestrator.launcher")
|
|
|
|
|
|
class AgentLauncher:
    """Launch Claude CLI agents directly (binary mounted into container).

    Each launch records an ``agent_runs`` row, starts the CLI in the task's
    per-branch worktree with its output redirected to a log file, and starts
    two daemon threads: a watchdog that enforces ``AGENT_TIMEOUT`` and a
    monitor that waits for the process, commits/pushes the results, advances
    the task stage and (for queue-launched runs) finalizes the job row.

    An optional ``on_outcome`` attribute (a callable taking ``transient`` and
    ``recovered`` keyword arguments) may be set on the instance; when present
    it is called with the outcome of every queue job.
    """

    # Per-role launch configuration. Paths are relative to the agent's
    # worktree; "model" is optional and, when present, is passed as --model.
    AGENT_CONFIGS = {
        "analyst": {
            "system_prompt": ".openclaw/agents/analyst.md",
            "task_file": ".task.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "architect": {
            "system_prompt": ".openclaw/agents/architect.md",
            "task_file": ".task-arch.md",
            "allowed_tools": "Read,Write,Edit,Bash",
            "model": "opus",
        },
        "developer": {
            "system_prompt": ".openclaw/agents/developer.md",
            "task_file": ".task-dev.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "reviewer": {
            "system_prompt": ".openclaw/agents/reviewer.md",
            "task_file": ".task-review.md",
            "allowed_tools": "Read,Write,Edit,Bash",
            "model": "opus",
        },
        "tester": {
            "system_prompt": ".openclaw/agents/tester.md",
            "task_file": ".task-test.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "deployer": {
            "task_file": ".task-deploy.md",
            "system_prompt": ".openclaw/agents/deployer.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
    }

    CLAUDE_BIN = "/opt/claude-code/bin/claude.exe"
    AGENT_TIMEOUT = 1800  # 30 minutes

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _agent_env() -> dict:
        """Return the environment used for the agent process and post-run git.

        The current process environment with HOME and the git author/committer
        identity overridden, so commits are attributed to ``claude-bot``.
        """
        return {
            **os.environ,
            "HOME": "/home/slin",
            "GIT_AUTHOR_NAME": "claude-bot",
            "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local",
            "GIT_COMMITTER_NAME": "claude-bot",
            "GIT_COMMITTER_EMAIL": "claude-bot@mva154.local",
        }

    @staticmethod
    def _move_stage(task_id, work_item_id, old_stage, new_stage):
        """Persist a stage change and send both stage-change notifications."""
        update_task_stage(task_id, new_stage)
        notify_stage_change(task_id, old_stage, new_stage)
        plane_notify_stage(work_item_id, old_stage, new_stage)

    @staticmethod
    def _developer_run_count(task_id) -> int:
        """Return how many developer runs are recorded for ``task_id``."""
        conn = get_db()
        try:
            return conn.execute(
                "SELECT COUNT(*) FROM agent_runs WHERE task_id=? AND agent='developer'",
                (task_id,)
            ).fetchone()[0]
        finally:
            conn.close()

    # ------------------------------------------------------------------
    # Launch entry points
    # ------------------------------------------------------------------

    def launch(self, agent: str, repo: str, task_content: str = None, task_id: int = None) -> int:
        """
        Launch a Claude CLI agent directly (legacy synchronous path).

        Kept for backward compatibility (direct callers / existing tests). The
        ORCH-1 job queue uses launch_job() instead, but both share _spawn().

        Args:
            agent: Agent role (analyst, architect, developer, reviewer, tester,
                deployer) - must be a key of AGENT_CONFIGS
            repo: Repository name
            task_content: Optional task content to write to task file
            task_id: Optional task ID to associate with this run

        Returns:
            agent_run_id from DB

        Raises:
            ValueError: ``agent`` is not a key of AGENT_CONFIGS.
            FileNotFoundError: the repository directory does not exist.
        """
        return self._spawn(agent, repo, task_content, task_id, job_id=None)

    def launch_job(self, job: dict) -> int:
        """ORCH-1: launch an agent for a claimed queue job.

        Same spawn path as launch(), but threads job['id'] through so the monitor
        can update the job's status (done / requeue / failed) and link jobs.run_id
        to the agent_runs row. Returns the agent_run_id.

        ``job`` must contain 'agent', 'repo' and 'id'; 'task_content' and
        'task_id' are optional.
        """
        return self._spawn(
            job["agent"],
            job["repo"],
            job.get("task_content"),
            job.get("task_id"),
            job_id=job["id"],
        )

    def _spawn(self, agent: str, repo: str, task_content: str = None,
               task_id: int = None, job_id: int = None) -> int:
        """Shared spawn implementation for launch() and launch_job().

        Resolves the task branch ("main" when there is no task row), ensures
        the per-branch worktree, optionally writes the task file, records the
        run, starts the CLI with output redirected to the run log, and starts
        the watchdog and monitor threads.

        When job_id is set, the monitor drives the jobs table status (ORCH-1).

        Returns:
            The id of the new agent_runs row.

        Raises:
            ValueError: unknown agent role.
            FileNotFoundError: /<repos_dir>/<repo> is not a directory.
        """
        config = self.AGENT_CONFIGS.get(agent)
        if not config:
            raise ValueError(f"Unknown agent: {agent}")

        # Main clone lives at /repos/<repo>; the agent works in an isolated worktree
        # (ORCH-2 / S-4) so concurrent tasks never fight over a shared checkout.
        local_repo_path = os.path.join(settings.repos_dir, repo)
        if not os.path.isdir(local_repo_path):
            raise FileNotFoundError(f"Repo not found: {local_repo_path}")

        # Determine branch (needed before we touch the worktree / task file).
        agent_branch = "main"
        if task_id:
            branch_conn = get_db()
            try:
                branch_row = branch_conn.execute(
                    "SELECT branch FROM tasks WHERE id=?", (task_id,)
                ).fetchone()
            finally:
                # The lookup connection used to be dropped without closing.
                branch_conn.close()
            if branch_row:
                agent_branch = branch_row[0]

        # Ensure the per-branch worktree exists and is on the right branch.
        work_path = ensure_worktree(repo, agent_branch)

        # Write task file if content provided (B-1: direct write; now into the worktree).
        if task_content:
            self._write_task_file(repo, agent_branch, config["task_file"], task_content)

        task_file = config["task_file"]
        system_prompt = config["system_prompt"]
        allowed_tools = config["allowed_tools"]
        model = config.get("model", "")
        model_flag = f"--model {model} " if model else ""

        conn = get_db()
        try:
            # Record run in DB
            cursor = conn.execute(
                "INSERT INTO agent_runs (task_id, agent) VALUES (?, ?)",
                (task_id, agent),
            )
            run_id = cursor.lastrowid
            conn.commit()

            # ORCH-1: link this job to the agent_runs row and stamp started_at.
            if job_id is not None:
                conn.execute(
                    "UPDATE jobs SET run_id = ?, started_at = datetime('now') WHERE id = ?",
                    (run_id, job_id),
                )
                conn.commit()

            # Prepare output log path
            output_path = f"/app/data/runs/{run_id}.log"
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

            # No git fetch/checkout here: ensure_worktree() already put the worktree on
            # the right branch. The agent simply runs inside its isolated work_path.
            #
            # Paths are shell-quoted because the worktree path is derived from
            # the branch name and is interpolated into a `bash -c` string.
            # `exec` replaces the shell with the CLI so that proc.pid (the pid
            # the watchdog signals) is the agent itself, not a wrapper shell.
            cmd = (
                f'cd {shlex.quote(str(work_path))} && '
                f'exec {self.CLAUDE_BIN} --print '
                f'{model_flag}'
                f'"$(cat {shlex.quote(task_file)})" '
                f'--system-prompt "$(cat {shlex.quote(system_prompt)})" '
                f'--allowedTools {allowed_tools}'
            )

            logger.info(f"Launching agent '{agent}' for repo '{repo}', run_id={run_id}")

            # Launch as background process.
            # B-2 fix: redirect stdout/stderr straight to the log file at the OS level.
            # No PIPE in the orchestrator process -> no PIPE deadlock, no reader thread,
            # no zombies. log_fh is closed by _monitor_agent after proc.wait().
            log_fh = open(output_path, "w")
            try:
                proc = subprocess.Popen(
                    ["bash", "-c", cmd],
                    stdout=log_fh,
                    stderr=subprocess.STDOUT,
                    env=self._agent_env(),
                )
            except Exception:
                # No monitor thread will exist to close the handle.
                log_fh.close()
                raise

            # Update DB with output path
            conn.execute(
                "UPDATE agent_runs SET output_path = ? WHERE id = ?",
                (output_path, run_id),
            )
            conn.commit()
        finally:
            conn.close()

        # Start timeout watchdog
        watchdog_thread = threading.Thread(
            target=self._watchdog,
            args=(proc.pid, run_id),
            kwargs={"job_id": job_id},
            daemon=True,
        )
        watchdog_thread.start()

        # Start monitor thread (waits for completion, commits, pushes)
        monitor_thread = threading.Thread(
            target=self._monitor_agent,
            args=(proc, run_id, agent, repo, agent_branch, output_path, log_fh),
            kwargs={"job_id": job_id},
            daemon=True,
        )
        monitor_thread.start()

        logger.info(f"Agent '{agent}' launched, pid={proc.pid}, run_id={run_id}")
        notify_agent_started(run_id, agent, task_id)
        return run_id

    # ------------------------------------------------------------------
    # Background threads
    # ------------------------------------------------------------------

    def _watchdog(self, pid: int, run_id: int, timeout: int = None, job_id: int = None):
        """Kill agent if it exceeds timeout.

        Sleeps for ``timeout`` seconds (AGENT_TIMEOUT when None), then SIGKILLs
        ``pid`` and records exit_code=-9 on the agent_runs row.

        The run's ``finished_at`` is checked first: the sleep is long, and if
        the agent finished in the meantime its pid may have been reassigned
        to an unrelated process that must not be killed.

        ORCH-1: on a timeout-kill the monitor's proc.wait() returns the kill exit
        code and drives the job retry/fail logic, so the watchdog itself only needs
        to SIGKILL and record the agent_runs exit. job_id is accepted for symmetry.
        """
        if timeout is None:
            timeout = self.AGENT_TIMEOUT
        time.sleep(timeout)

        try:
            conn = get_db()
            try:
                row = conn.execute(
                    "SELECT finished_at FROM agent_runs WHERE id=?", (run_id,)
                ).fetchone()
            finally:
                conn.close()
            already_finished = row is not None and row[0] is not None
        except Exception:
            # A DB problem must not disable the timeout; fall through to kill.
            logger.exception(f"Agent run_id={run_id}: watchdog could not read run state")
            already_finished = False

        if already_finished:
            return

        try:
            os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
            return  # Already finished

        logger.warning(f"Agent run_id={run_id} killed after {timeout}s timeout")
        conn = get_db()
        try:
            conn.execute(
                "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=-9 WHERE id=?",
                (run_id,),
            )
            conn.commit()
        finally:
            conn.close()

    def _monitor_agent(self, proc, run_id, agent, repo, branch, output_path=None, log_fh=None, job_id=None):
        """Wait for agent to finish, commit+push results, update DB.

        B-2 fix: stdout already goes straight to the log file via Popen, so we just
        block on proc.wait() (guaranteed reap -> no zombie, real exit_code) and then
        close the log file handle. No PIPE, no select loop, no startup timeout here
        (the watchdog still enforces the overall AGENT_TIMEOUT by pid).

        After the process exits this records the exit code, sends the
        "finished" notification, commits/pushes worktree changes, then either
        advances the stage (exit 0) or runs failure handling. For
        queue-launched runs the job row is finalized in a ``finally`` so an
        error in any of those steps cannot leave the job unfinalized.
        """
        start_ts = time.time()

        exit_code = proc.wait()
        if log_fh is not None:
            try:
                log_fh.close()
            except Exception:
                pass
        duration_s = int(time.time() - start_ts)
        logger.info(f"Agent run_id={run_id} ({agent}) finished with exit_code={exit_code}")

        try:
            # Update DB and fetch task_id for the notification.
            conn = get_db()
            try:
                conn.execute(
                    "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=? WHERE id=?",
                    (exit_code, run_id),
                )
                conn.commit()
                run_row = conn.execute(
                    "SELECT task_id FROM agent_runs WHERE id=?", (run_id,)
                ).fetchone()
            finally:
                conn.close()
            task_id = run_row[0] if run_row else None

            notify_agent_finished(run_id, agent, exit_code, task_id=task_id, duration_s=duration_s)

            self._commit_and_push(run_id, agent, repo, branch)

            if exit_code == 0:
                # Auto-advance stage if agent finished successfully and QG passes
                self._try_advance_stage(run_id, agent, repo, branch)
            else:
                self._handle_agent_failure(run_id, agent, repo, branch, exit_code)
        finally:
            # ORCH-1: drive the job-queue status for queue-launched jobs only.
            # (Legacy direct launch() has job_id=None and is unaffected.)
            if job_id is not None:
                self._finalize_job(job_id, agent, run_id, exit_code, output_path=output_path)

    def _commit_and_push(self, run_id, agent, repo, branch):
        """Commit and push any agent changes from the per-branch worktree.

        Works in the worktree (ORCH-2 / S-4), NOT in the shared /repos/<repo>.
        The worktree is already on ``branch`` (ensure_worktree did the
        checkout), so no checkout is needed. ``docs/`` is always staged;
        ``src/`` and ``tests/`` only for the developer. After a successful
        developer push a pull request is ensured. Git errors are logged, never
        raised.
        """
        repo_path = get_worktree_path(repo, branch)
        try:
            git_env = self._agent_env()

            def git(*args, timeout=10):
                return subprocess.run(
                    ["git", "-C", repo_path, *args],
                    capture_output=True, text=True, timeout=timeout, env=git_env
                )

            if not git("status", "--porcelain").stdout.strip():
                logger.info(f"Agent run_id={run_id}: no changes to commit")
                return

            # Add docs/ always
            git("add", "docs/")
            # Add src/ and tests/ for developer
            if agent == "developer":
                git("add", "src/", "tests/")

            commit_result = git(
                "commit", "-m",
                f"{agent}(ET): auto-commit from {agent} run_id={run_id}",
                timeout=30,
            )
            if commit_result.returncode != 0:
                logger.warning(f"Agent run_id={run_id}: commit failed: {commit_result.stderr}")
                return

            push_result = git("push", "origin", branch, timeout=60)
            if push_result.returncode != 0:
                logger.error(f"Agent run_id={run_id}: push failed: {push_result.stderr}")
                return

            logger.info(f"Agent run_id={run_id}: committed and pushed to {branch}")
            # Auto-create PR after developer pushes
            if agent == "developer":
                self._ensure_pr(repo, branch, run_id)
        except Exception as e:
            logger.error(f"Agent run_id={run_id}: post-run git failed: {e}")

    def _handle_agent_failure(self, run_id, agent, repo, branch, exit_code):
        """React to a non-zero agent exit for the task on (repo, branch).

        A failed deployer (smoke/healthcheck failed - Task 7) rolls the task
        back to "development", marks the issue blocked and alerts. Any other
        failed agent only triggers a Telegram alert pointing at the run log.
        Does nothing when no task row matches.
        """
        conn = get_db()
        try:
            task_row = conn.execute(
                "SELECT id, work_item_id FROM tasks WHERE repo=? AND branch=?",
                (repo, branch),
            ).fetchone()
        finally:
            conn.close()
        if not task_row:
            return

        task_id, work_item_id = task_row
        if agent == "deployer":
            self._move_stage(task_id, work_item_id, "deploy", "development")
            from ..plane_sync import set_issue_blocked
            set_issue_blocked(work_item_id)
            plane_add_comment(
                work_item_id,
                "\u274c Deploy FAILED (smoke/healthcheck). Rolled back. Developer \u043d\u0443\u0436\u0435\u043d \u0434\u043b\u044f \u0444\u0438\u043a\u0441\u0430."
            )
            from ..notifications import send_telegram
            send_telegram(f"\U0001f6a8 {work_item_id}: Deploy failed! Rolled back. Needs fix.")
        elif exit_code is not None:
            from ..notifications import send_telegram
            send_telegram(f"\u26a0\ufe0f {work_item_id}: Agent {agent} failed (exit_code={exit_code}). Check logs: /app/data/runs/{run_id}.log")

    # ------------------------------------------------------------------
    # Job-queue finalization (ORCH-1)
    # ------------------------------------------------------------------

    def _backoff_seconds(self, transient_attempts: int, retry_after: int = None) -> int:
        """Exponential backoff for transient failures, honouring Retry-After.

        backoff = min(2^transient_attempts * base, max), with negative attempt
        counts treated as 0. If the server sent a positive Retry-After, the
        result is raised to it - but Retry-After itself is capped at
        settings.backoff_max_seconds, so the returned delay never exceeds the
        configured maximum.
        """
        base = settings.backoff_base_seconds
        cap = settings.backoff_max_seconds
        backoff = min((2 ** max(transient_attempts, 0)) * base, cap)
        if retry_after is not None and retry_after > 0:
            backoff = max(backoff, min(retry_after, cap))
        return int(backoff)

    def _finalize_job(self, job_id: int, agent: str, run_id: int, exit_code, output_path=None):
        """ORCH-1: update the jobs row after the agent process finished.

        exit_code == 0 -> done (and resets the breaker streak via on_outcome).
        exit_code != 0 -> classify the failure from the run log tail (token-free):
          - TRANSIENT (429/overload/network): backoff-requeue with available_at in
            the future + a SEPARATE transient_attempts budget
            (settings.transient_max_attempts), honouring Retry-After. Reported to
            the breaker so it opens after N consecutive transient failures.
          - PERMANENT (code fault): ordinary attempts < max_attempts requeue,
            otherwise 'failed' + Telegram.

        If classification itself fails the failure is treated as permanent.
        Errors are logged and never raised.
        """
        from ..db import get_job, mark_job
        from ..error_classifier import classify_log_file
        try:
            job = get_job(job_id)
            if not job:
                return
            if exit_code == 0:
                mark_job(job_id, "done", run_id=run_id)
                logger.info(f"Job {job_id} ({agent}) done (run_id={run_id})")
                self._record_outcome(transient=False, recovered=True)
                return

            # Classify the failure from the agent log tail (no token cost).
            kind, retry_after = "permanent", None
            log_path = output_path or f"/app/data/runs/{run_id}.log"
            try:
                kind, retry_after = classify_log_file(log_path)
            except Exception:
                # Best-effort: keep the "permanent" default, but leave a trace.
                logger.warning(
                    f"Job {job_id}: could not classify {log_path}, treating as permanent",
                    exc_info=True,
                )

            if kind == "transient":
                self._finalize_transient(job_id, agent, run_id, exit_code, job, retry_after)
            else:
                self._finalize_permanent(job_id, agent, run_id, exit_code, job)
        except Exception as e:
            logger.error(f"Job {job_id}: _finalize_job error: {e}")

    def _finalize_transient(self, job_id, agent, run_id, exit_code, job, retry_after):
        """Transient (429/overload/net) failure -> backoff requeue or fail when budget out."""
        from ..db import mark_job, mark_job_transient
        # `or 0` also covers a stored NULL, which .get()'s default would not.
        tattempts = job.get("transient_attempts") or 0
        tmax = settings.transient_max_attempts
        err = (f"transient (429/overload) agent {agent} exit={exit_code} "
               f"(run_id={run_id}); retry_after={retry_after}")
        self._record_outcome(transient=True, recovered=False)
        if tattempts < tmax:
            backoff = self._backoff_seconds(tattempts + 1, retry_after)
            mark_job_transient(job_id, backoff, error=err)
            logger.warning(
                f"Job {job_id} ({agent}) TRANSIENT fail (exit={exit_code}), "
                f"backoff {backoff}s, transient_attempt {tattempts + 1}/{tmax}"
            )
        else:
            mark_job(job_id, "failed", run_id=run_id, error=err)
            logger.error(
                f"Job {job_id} ({agent}) failed after {tattempts} transient attempts"
            )
            self._notify_failed(job_id, agent, job, run_id,
                                f"transient (rate-limit) after {tattempts} attempts")

    def _finalize_permanent(self, job_id, agent, run_id, exit_code, job):
        """Permanent (code-fault) failure -> normal attempts<max requeue, then fail."""
        from ..db import mark_job
        # `or 0` also covers a stored NULL, which .get()'s default would not.
        attempts = job.get("attempts") or 0
        max_attempts = job.get("max_attempts", 2)
        err = f"agent {agent} exit_code={exit_code} (run_id={run_id})"
        self._record_outcome(transient=False, recovered=False)
        if attempts < max_attempts:
            mark_job(job_id, "queued", run_id=run_id, error=err)
            logger.warning(
                f"Job {job_id} ({agent}) failed (exit={exit_code}), "
                f"requeued (attempt {attempts}/{max_attempts})"
            )
        else:
            mark_job(job_id, "failed", run_id=run_id, error=err)
            logger.error(
                f"Job {job_id} ({agent}) failed permanently after "
                f"{attempts} attempts (exit={exit_code})"
            )
            self._notify_failed(job_id, agent, job, run_id,
                                f"{attempts} attempts (exit={exit_code})")

    def _notify_failed(self, job_id, agent, job, run_id, why):
        """Send a best-effort Telegram alert for a job that ended as 'failed'.

        Never raises: a notification problem must not disturb job bookkeeping.
        """
        try:
            from ..notifications import send_telegram
            send_telegram(
                f"\U0001f6a8 Job {job_id} ({agent}, repo {job.get('repo')}) "
                f"failed: {why}. Logs: /app/data/runs/{run_id}.log"
            )
        except Exception:
            try:
                logger.warning(f"Job {job_id}: failure notification not sent", exc_info=True)
            except Exception:
                pass

    def _record_outcome(self, transient: bool, recovered: bool):
        """Forward the run outcome to the circuit breaker (if a worker is wired).

        Decoupled via a settable callback (set by QueueWorker.start) so the launcher
        does not hard-import the worker (avoids a cycle) and tests can run the
        launcher standalone. A failing callback is logged and otherwise ignored.
        """
        cb = getattr(self, "on_outcome", None)
        if not cb:
            return
        try:
            cb(transient=transient, recovered=recovered)
        except Exception:
            try:
                logger.warning("on_outcome callback failed", exc_info=True)
            except Exception:
                pass

    # ------------------------------------------------------------------
    # Stage advancement
    # ------------------------------------------------------------------

    def _try_advance_stage(self, run_id: int, agent: str, repo: str, branch: str):
        """After agent finishes successfully, check QG and advance stage if possible.

        Looks up the task for (repo, branch) and the quality gate (QG) of its
        current stage:

        - no task, or no next stage: nothing happens;
        - "check_analysis_approved": needs human approval, so the stage is
          never advanced here; a finished analyst triggers the approval
          request / questions handling instead;
        - other registered QG: run it; on failure apply the agent-specific
          rollback (reviewer REQUEST_CHANGES, tester failure, architect
          conflict) and stop;
        - QG named but not registered in QG_CHECKS: stop without advancing;
        - no QG, or QG passed: advance and enqueue the next stage's agent.

        All errors are logged, never raised.
        """
        try:
            conn = get_db()
            try:
                task_row = conn.execute(
                    "SELECT id, stage, work_item_id FROM tasks WHERE repo=? AND branch=?",
                    (repo, branch),
                ).fetchone()
            finally:
                conn.close()
            if not task_row:
                return

            task_id, current_stage, work_item_id = task_row
            qg_name = get_qg_for_stage(current_stage)
            next_stage = get_next_stage(current_stage)

            if not next_stage:
                return

            # Run QG check if defined
            if qg_name and qg_name in QG_CHECKS:
                check_fn = QG_CHECKS[qg_name]
                if qg_name in ("check_analysis_approved",):
                    # Requires human approval - post request comment if analyst just finished
                    if agent == "analyst" and work_item_id:
                        self._handle_analyst_finished(task_id, repo, branch, work_item_id)
                    return
                elif qg_name in ("check_ci_green", "check_tests_local"):
                    # (repo, branch) signature - already worktree-aware.
                    passed, reason = check_fn(repo, branch)
                else:
                    # Artifact checks (check_tests_passed, check_architecture_done,
                    # etc.) - pass branch so they read from the worktree.
                    passed, reason = check_fn(repo, work_item_id or "", branch)

                if not passed:
                    logger.info(f"Task {task_id}: QG '{qg_name}' not passed after {agent}: {reason}")
                    # If reviewer says REQUEST_CHANGES, rollback to development
                    if agent == "reviewer" and "REQUEST_CHANGES" in reason:
                        self._move_stage(task_id, work_item_id, current_stage, "development")
                        # Count retries
                        retry_count = self._developer_run_count(task_id)
                        if retry_count < 3:
                            task_desc = (
                                f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
                                f"Stage: development\nNote: REQUEST_CHANGES from reviewer "
                                f"(attempt {retry_count+1}/3). Fix findings in "
                                f"docs/work-items/{work_item_id}/12-review.md"
                            )
                            new_job = enqueue_job("developer", repo, task_desc, task_id=task_id)
                            logger.info(f"Task {task_id}: reviewer REQUEST_CHANGES, enqueued developer (job_id={new_job})")
                        else:
                            from ..notifications import send_telegram
                            send_telegram(f"\u26a0\ufe0f {work_item_id}: Max developer retries (3) reached. Manual intervention needed.")
                            logger.error(f"Task {task_id}: max retries reached")

                    # Task 6: Tester FAIL -> rollback to development
                    if agent == "tester" and qg_name == "check_tests_passed":
                        self._move_stage(task_id, work_item_id, current_stage, "development")
                        from ..plane_sync import set_issue_in_progress
                        set_issue_in_progress(work_item_id)
                        plane_add_comment(
                            work_item_id,
                            f"\u274c \u0422\u0435\u0441\u0442\u044b \u043d\u0435 \u043f\u0440\u043e\u0448\u043b\u0438: {reason}. Developer \u043f\u0435\u0440\u0435\u0437\u0430\u043f\u0443\u0449\u0435\u043d \u0434\u043b\u044f \u0444\u0438\u043a\u0441\u0430."
                        )
                        retry_count = self._developer_run_count(task_id)
                        if retry_count < 3:
                            task_desc = (
                                f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
                                f"Stage: development\nNote: Tests FAILED. "
                                f"Fix failures described in docs/work-items/{work_item_id}/13-test-report.md"
                            )
                            new_job = enqueue_job("developer", repo, task_desc, task_id=task_id)
                            logger.info(f"Task {task_id}: tester FAIL, enqueued developer (job_id={new_job})")
                        else:
                            from ..notifications import send_telegram
                            from ..plane_sync import set_issue_blocked
                            set_issue_blocked(work_item_id)
                            send_telegram(f"\U0001f6a8 {work_item_id}: Tests still failing after 3 developer retries. Manual intervention needed.")

                    # Task 8: Architect conflict -> rollback to analysis
                    if agent == "architect" and qg_name == "check_architecture_done":
                        conflict_path = os.path.join(
                            get_worktree_path(repo, branch),
                            f"docs/work-items/{work_item_id}/10-conflict.md"
                        )
                        if os.path.isfile(conflict_path):
                            self._move_stage(task_id, work_item_id, current_stage, "analysis")
                            from ..plane_sync import set_issue_in_progress
                            set_issue_in_progress(work_item_id)
                            # Only the first 500 characters go into the comment.
                            with open(conflict_path, "r", encoding="utf-8") as cf:
                                conflict_text = cf.read()[:500]
                            plane_add_comment(
                                work_item_id,
                                f"\u26a0\ufe0f Architect \u043d\u0430\u0448\u0451\u043b \u043a\u043e\u043d\u0444\u043b\u0438\u043a\u0442 \u0441 \u0422\u0417. \u0412\u043e\u0437\u0432\u0440\u0430\u0442 \u0432 Analysis.\n\n{conflict_text}"
                            )
                            task_desc = (
                                f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
                                f"Stage: analysis\nNote: Architect conflict. Revise TRZ. "
                                f"See docs/work-items/{work_item_id}/10-conflict.md"
                            )
                            new_job = enqueue_job("analyst", repo, task_desc, task_id=task_id)
                            logger.info(f"Task {task_id}: architect conflict, enqueued analyst (job_id={new_job})")

                    return
            elif qg_name:
                # A gate is required for this stage but no check is registered.
                return

            # Advance stage
            self._move_stage(task_id, work_item_id, current_stage, next_stage)
            logger.info(f"Task {task_id}: {current_stage} -> {next_stage} (auto-advance after {agent})")

            # Launch next agent if defined
            next_agent = get_agent_for_stage(next_stage)
            if next_agent:
                task_desc = f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\nStage: {next_stage}"
                new_job_id = enqueue_job(next_agent, repo, task_desc, task_id=task_id)
                logger.info(f"Task {task_id}: enqueued '{next_agent}' (job_id={new_job_id})")

        except Exception as e:
            logger.error(f"Auto-advance failed for run_id={run_id}: {e}")

    def _handle_analyst_finished(self, task_id, repo, branch, work_item_id):
        """Post the follow-up for a finished analyst run awaiting approval.

        Uses the "check_analysis_complete" check (when registered) to decide:
        artifacts complete -> issue goes to review and approval is requested;
        otherwise, if the analyst left a questions file in the worktree, the
        issue is marked as needing input and the questions are posted;
        otherwise a warning comment is posted.
        """
        files_check = QG_CHECKS.get("check_analysis_complete")
        if not files_check:
            return

        files_ok, _ = files_check(repo, work_item_id, branch)
        if files_ok:
            # Full artifacts ready -> In Review
            from ..plane_sync import set_issue_in_review
            set_issue_in_review(work_item_id)
            plane_add_comment(
                work_item_id,
                "\U0001f4cb BRD/\u0422\u0417/AC/TestPlan \u0433\u043e\u0442\u043e\u0432\u044b. "
                "\u041f\u0440\u043e\u0448\u0443 review \u0438 \u0440\u0435\u0430\u043a\u0446\u0438\u044e :approved: \u0434\u043b\u044f \u043f\u0440\u043e\u0434\u0432\u0438\u0436\u0435\u043d\u0438\u044f \u0432 Architecture."
            )
            notify_approve_requested(task_id)
            logger.info(f"Task {task_id}: analyst finished, requested :approved: in Plane")
            return

        # Check if questions file exists (in the task worktree)
        questions_path = os.path.join(
            get_worktree_path(repo, branch),
            f"docs/work-items/{work_item_id}/01-questions.md"
        )
        if os.path.isfile(questions_path):
            # Analyst has questions -> Needs Input
            from ..plane_sync import set_issue_needs_input
            set_issue_needs_input(work_item_id)
            with open(questions_path, "r", encoding="utf-8") as qf:
                questions_text = qf.read()
            plane_add_comment(
                work_item_id,
                f"\u2753 Analyst \u043d\u0443\u0436\u0434\u0430\u0435\u0442\u0441\u044f \u0432 \u0443\u0442\u043e\u0447\u043d\u0435\u043d\u0438\u0438:\n\n{questions_text}"
            )
            from ..notifications import send_telegram
            send_telegram(
                f"\u2753 {work_item_id}: Analyst \u0437\u0430\u0434\u0430\u0451\u0442 \u0432\u043e\u043f\u0440\u043e\u0441\u044b. \u041e\u0442\u0432\u0435\u0442\u044c \u0432 Plane."
            )
        else:
            # No artifacts and no questions
            plane_add_comment(
                work_item_id,
                "\u26a0\ufe0f Analyst \u0437\u0430\u0432\u0435\u0440\u0448\u0438\u043b\u0441\u044f \u0431\u0435\u0437 \u0430\u0440\u0442\u0435\u0444\u0430\u043a\u0442\u043e\u0432 \u0438 \u0431\u0435\u0437 \u0432\u043e\u043f\u0440\u043e\u0441\u043e\u0432. \u041f\u0440\u043e\u0432\u0435\u0440\u044c\u0442\u0435 \u043b\u043e\u0433."
            )

    # ------------------------------------------------------------------
    # Pull requests (Gitea)
    # ------------------------------------------------------------------

    def _find_open_pr(self, repo: str, branch: str):
        """Return the number of the open PR whose head is ``branch``, or None.

        The ``head`` query parameter is still sent, but the result is also
        filtered here on ``head.ref`` so that a server that ignores the
        parameter cannot make us pick an unrelated open PR. Entries without
        head information are accepted as matches (previous behaviour).
        Only the first page of results is inspected.

        Raises whatever httpx raises on transport or HTTP errors.
        """
        import httpx
        resp = httpx.get(
            f"{settings.gitea_url}/api/v1/repos/{settings.gitea_owner}/{repo}/pulls",
            params={"state": "open", "head": branch},
            headers={"Authorization": f"token {settings.gitea_token}"},
            timeout=10
        )
        resp.raise_for_status()
        for pr in resp.json():
            head = pr.get("head") or {}
            if head.get("ref", branch) == branch:
                return pr["number"]
        return None

    def _ensure_pr(self, repo: str, branch: str, run_id: int):
        """Return the open PR number for ``branch``, creating the PR if needed.

        A new PR targets "main" and is titled after the last path component
        of the branch name. Returns None (after logging) on any error.
        """
        import httpx
        owner = settings.gitea_owner
        headers = {"Authorization": f"token {settings.gitea_token}"}
        base_url = f"{settings.gitea_url}/api/v1"
        try:
            existing = self._find_open_pr(repo, branch)
            if existing:
                return existing
            parts = branch.split("/")
            title = parts[-1] if parts else branch
            resp = httpx.post(
                f"{base_url}/repos/{owner}/{repo}/pulls",
                json={"title": f"feat: {title}", "head": branch, "base": "main",
                      "body": f"Auto-created by orchestrator after developer run_id={run_id}"},
                headers=headers, timeout=10
            )
            resp.raise_for_status()
            pr_number = resp.json()["number"]
            logger.info(f"Created PR #{pr_number} for {branch}")
            return pr_number
        except Exception as e:
            logger.error(f"Failed to create PR for {branch}: {e}")
            return None

    def _auto_merge_pr(self, repo: str, branch: str, task_id: int, work_item_id: str):
        """Merge the PR for ``branch`` and move the task from "deploy" to "done".

        Creates the PR first when none is open. Returns True when the merge
        request answered 200/204; otherwise logs, alerts on an HTTP-level
        failure, and returns False. Never raises for HTTP/transport errors.
        """
        import httpx
        owner = settings.gitea_owner
        headers = {"Authorization": f"token {settings.gitea_token}"}
        base_url = f"{settings.gitea_url}/api/v1"
        try:
            pr_number = self._find_open_pr(repo, branch)
            if not pr_number:
                pr_number = self._ensure_pr(repo, branch, 0)
                if not pr_number:
                    return False
            resp = httpx.post(
                f"{base_url}/repos/{owner}/{repo}/pulls/{pr_number}/merge",
                json={"Do": "merge"},
                headers=headers, timeout=30
            )
            if resp.status_code in (200, 204):
                logger.info(f"PR #{pr_number} merged for {branch}")
                self._move_stage(task_id, work_item_id, "deploy", "done")
                from ..notifications import send_telegram
                send_telegram(f"\u2705 {work_item_id}: PR #{pr_number} merged! deploy -> done. Task complete.")
                return True
            else:
                logger.error(f"Merge failed for PR #{pr_number}: {resp.status_code} {resp.text}")
                from ..notifications import send_telegram
                send_telegram(f"\u26a0\ufe0f {work_item_id}: Auto-merge failed (HTTP {resp.status_code}). Manual merge needed.")
                return False
        except Exception as e:
            logger.error(f"Auto-merge failed for {branch}: {e}")
            return False

    # ------------------------------------------------------------------
    # Task file
    # ------------------------------------------------------------------

    def _write_task_file(self, repo: str, branch: str, task_file: str, content: str):
        """Write task file directly into the task's worktree.

        B-1 fix: no docker (direct open()). ORCH-2/S-4: the target is the per-branch
        worktree (/repos/_wt/<repo>/<branch>), not the shared /repos/<repo>, so the
        agent reads its task from its own isolated working copy.

        Raises:
            RuntimeError: the file could not be written (chained to the
                underlying OSError) - errors are never silently swallowed.
        """
        work_path = get_worktree_path(repo, branch)  # /repos/_wt/<repo>/<branch>
        full_path = os.path.join(work_path, task_file)
        try:
            with open(full_path, "w", encoding="utf-8") as f:
                f.write(content)
        except OSError as e:
            logger.error(f"Failed to write task file {full_path}: {e}")
            raise RuntimeError(f"Failed to write task file: {e}") from e
        # Report encoded size: task text is not ASCII-only, so len(content)
        # (characters) would under-report the bytes on disk.
        logger.info(f"Task file written: {full_path} ({len(content.encode('utf-8'))} bytes)")
|
|
|
|
|
|
# Module-level AgentLauncher instance, created at import time.
# NOTE(review): presumably the shared instance other modules import - confirm.
launcher = AgentLauncher()
|