Files
orchestrator/src/agents/launcher.py
Dev Agent 1ebe8afc23 feat(worktree): git worktree per task to isolate shared /repos (ORCH-2 / S-4)
- add src/git_worktree.py: ensure/remove/get_worktree_path
- config: worktrees_dir=/repos/_wt
- launcher: agent runs in per-branch worktree; task-file + commit/push in worktree; no shared checkout
- qg/checks: read artifacts + run make test from worktree (branch arg, backward-compatible)
- webhooks/plane: pass branch into QG dispatch; review fallback from worktree
- webhooks/gitea: keep read-only branch --contains in main clone (documented)
- tests: test_git_worktree.py (isolation) + update test_launcher write-task-file
- docs: ARCHITECTURE worktree section + BUGFIXES_2026-06-02_ORCH2

Preserves B-1/B-2/S-1/S-5 fixes (paths now point at worktree).
2026-06-02 21:12:06 +03:00

598 lines
29 KiB
Python

import subprocess
import os
import logging
import threading
import signal
from ..config import settings
from ..db import get_db, get_task_by_repo_branch, update_task_stage
from ..stages import get_next_stage, get_qg_for_stage, get_agent_for_stage
from ..git_worktree import ensure_worktree, get_worktree_path
from ..qg.checks import QG_CHECKS
from ..notifications import notify_stage_change, notify_qg_failure, notify_agent_started, notify_agent_finished, notify_approve_requested
from ..plane_sync import notify_stage_change as plane_notify_stage, add_comment as plane_add_comment
# Module-level logger shared by everything in this file. The name is a fixed
# string ("orchestrator.launcher") rather than being derived from __name__.
logger = logging.getLogger("orchestrator.launcher")
class AgentLauncher:
"""Launch Claude CLI agents directly (binary mounted into container)."""
# Per-role launch settings, keyed by the agent role name passed to launch().
# Keys used by launch():
#   "system_prompt" - file whose contents are passed via --system-prompt
#   "task_file"     - file whose contents are passed as the prompt; launch()
#                     also writes task_content into it
#   "allowed_tools" - value passed verbatim to --allowedTools
#   "model"         - optional; when present it is passed via --model
# Both file paths are resolved relative to the directory the agent runs in
# (launch() changes into the task's worktree before reading them).
AGENT_CONFIGS = {
"analyst": {
"system_prompt": ".openclaw/agents/analyst.md",
"task_file": ".task.md",
"allowed_tools": "Read,Write,Edit,Bash",
},
"architect": {
"system_prompt": ".openclaw/agents/architect.md",
"task_file": ".task-arch.md",
"allowed_tools": "Read,Write,Edit,Bash",
"model": "opus",
},
"developer": {
"system_prompt": ".openclaw/agents/developer.md",
"task_file": ".task-dev.md",
"allowed_tools": "Read,Write,Edit,Bash",
},
"reviewer": {
"system_prompt": ".openclaw/agents/reviewer.md",
"task_file": ".task-review.md",
"allowed_tools": "Read,Write,Edit,Bash",
"model": "opus",
},
"tester": {
"system_prompt": ".openclaw/agents/tester.md",
"task_file": ".task-test.md",
"allowed_tools": "Read,Write,Edit,Bash",
},
"deployer": {
"task_file": ".task-deploy.md",
"system_prompt": ".openclaw/agents/deployer.md",
"allowed_tools": "Read,Write,Edit,Bash",
},
}
# Absolute path of the Claude CLI executable that launch() invokes.
CLAUDE_BIN = "/opt/claude-code/bin/claude.exe"
# Default number of seconds _watchdog() waits before killing an agent.
AGENT_TIMEOUT = 1800 # 30 minutes
def launch(self, agent: str, repo: str, task_content: str = None, task_id: int = None) -> int:
    """
    Launch a Claude CLI agent as a background process in the task's worktree.

    The agent's stdout/stderr go straight to /app/data/runs/<run_id>.log.
    Two daemon threads are started: a timeout watchdog and a monitor that
    waits for completion and then commits/pushes the results.

    Args:
        agent: Agent role (analyst, architect, developer, reviewer, tester, deployer)
        repo: Repository name
        task_content: Optional task content to write to task file
        task_id: Optional task ID to associate with this run
    Returns:
        agent_run_id from DB
    Raises:
        ValueError: If `agent` is not a key of AGENT_CONFIGS.
        FileNotFoundError: If the main clone of `repo` does not exist.
        RuntimeError: If the task file cannot be written (from _write_task_file).
    """
    import shlex

    config = self.AGENT_CONFIGS.get(agent)
    if not config:
        raise ValueError(f"Unknown agent: {agent}")

    # Main clone lives at /repos/<repo>; the agent works in an isolated worktree
    # (ORCH-2 / S-4) so concurrent tasks never fight over a shared checkout.
    local_repo_path = os.path.join(settings.repos_dir, repo)
    if not os.path.isdir(local_repo_path):
        raise FileNotFoundError(f"Repo not found: {local_repo_path}")

    # Determine branch (needed before we touch the worktree / task file).
    # The lookup connection is closed explicitly so it is not leaked.
    agent_branch = "main"
    if task_id:
        lookup_conn = get_db()
        try:
            branch_row = lookup_conn.execute(
                "SELECT branch FROM tasks WHERE id=?", (task_id,)
            ).fetchone()
        finally:
            lookup_conn.close()
        if branch_row:
            agent_branch = branch_row[0]

    # Ensure the per-branch worktree exists and is on the right branch.
    work_path = ensure_worktree(repo, agent_branch)

    # Write task file if content provided (B-1: direct write; now into the worktree).
    if task_content:
        self._write_task_file(repo, agent_branch, config["task_file"], task_content)

    conn = get_db()
    try:
        # Record run in DB
        cursor = conn.execute(
            "INSERT INTO agent_runs (task_id, agent) VALUES (?, ?)",
            (task_id, agent),
        )
        run_id = cursor.lastrowid
        conn.commit()

        # Prepare output log path
        output_path = f"/app/data/runs/{run_id}.log"
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Build the claude command.
        # - The working directory is set through Popen(cwd=...) instead of a
        #   `cd <path> &&` prefix, so the branch-derived worktree path is never
        #   interpreted by the shell (spaces / metacharacters are harmless).
        # - Every interpolated value is shell-quoted.
        # - `exec` makes bash replace itself with the agent, so proc.pid is the
        #   agent's own pid and the watchdog's SIGKILL reaches the agent rather
        #   than only a wrapper shell.
        task_file = config["task_file"]
        system_prompt = config["system_prompt"]
        allowed_tools = config["allowed_tools"]
        model = config.get("model", "")
        model_flag = f"--model {shlex.quote(model)} " if model else ""
        # No git fetch/checkout here: ensure_worktree() already put the worktree on
        # the right branch. The agent simply runs inside its isolated work_path.
        cmd = (
            f'exec {shlex.quote(self.CLAUDE_BIN)} --print '
            f'{model_flag}'
            f'"$(cat {shlex.quote(task_file)})" '
            f'--system-prompt "$(cat {shlex.quote(system_prompt)})" '
            f'--allowedTools {shlex.quote(allowed_tools)}'
        )
        logger.info(f"Launching agent '{agent}' for repo '{repo}', run_id={run_id}")

        # Launch as background process.
        # B-2 fix: redirect stdout/stderr straight to the log file at the OS level.
        # No PIPE in the orchestrator process -> no PIPE deadlock, no reader thread,
        # no zombies. log_fh is closed by _monitor_agent after proc.wait().
        log_fh = open(output_path, "w")
        try:
            proc = subprocess.Popen(
                ["bash", "-c", cmd],
                cwd=work_path,
                stdout=log_fh,
                stderr=subprocess.STDOUT,
                env={
                    **os.environ,
                    "HOME": "/home/slin",
                    "GIT_AUTHOR_NAME": "claude-bot",
                    "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local",
                    "GIT_COMMITTER_NAME": "claude-bot",
                    "GIT_COMMITTER_EMAIL": "claude-bot@mva154.local",
                },
            )
        except Exception:
            # Nothing will ever monitor this run, so release the log handle
            # here before propagating the failure to the caller.
            log_fh.close()
            raise

        # Update DB with output path
        conn.execute(
            "UPDATE agent_runs SET output_path = ? WHERE id = ?",
            (output_path, run_id),
        )
        conn.commit()
    finally:
        conn.close()

    # Start timeout watchdog
    t = threading.Thread(
        target=self._watchdog,
        args=(proc.pid, run_id),
        daemon=True,
    )
    t.start()

    # Start monitor thread (waits for completion, commits, pushes)
    m = threading.Thread(
        target=self._monitor_agent,
        args=(proc, run_id, agent, repo, agent_branch, output_path, log_fh),
        daemon=True,
    )
    m.start()

    logger.info(f"Agent '{agent}' launched, pid={proc.pid}, run_id={run_id}")
    notify_agent_started(run_id, agent, task_id)
    return run_id
def _watchdog(self, pid: int, run_id: int, timeout: int = None):
    """Kill agent if it exceeds timeout.

    Sleeps for `timeout` seconds (AGENT_TIMEOUT when None), then sends SIGKILL
    to `pid` and records exit_code=-9 for the run, unless the run has already
    been recorded as finished.

    Args:
        pid: Process id of the launched agent.
        run_id: agent_runs row id of the run being watched.
        timeout: Seconds to wait before killing; defaults to AGENT_TIMEOUT.
    """
    import time
    if timeout is None:
        timeout = self.AGENT_TIMEOUT
    time.sleep(timeout)

    conn = get_db()
    try:
        # PID-reuse guard: the monitor thread reaps the process and sets
        # finished_at as soon as the agent exits. By the time this timer
        # fires the pid may belong to an unrelated process, so never signal
        # it when the run is already marked finished.
        row = conn.execute(
            "SELECT finished_at FROM agent_runs WHERE id=?",
            (run_id,),
        ).fetchone()
        if row is not None and row[0] is not None:
            return
        try:
            os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
            return  # Already finished
        logger.warning(f"Agent run_id={run_id} killed after {timeout}s timeout")
        conn.execute(
            "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=-9 WHERE id=?",
            (run_id,),
        )
        conn.commit()
    finally:
        conn.close()
def _monitor_agent(self, proc, run_id, agent, repo, branch, output_path=None, log_fh=None):
    """Wait for agent to finish, commit+push results, update DB.

    B-2 fix: stdout already goes straight to the log file via Popen, so we just
    block on proc.wait() (guaranteed reap -> no zombie, real exit_code) and then
    close the log file handle. No PIPE, no select loop, no startup timeout here
    (the watchdog still enforces the overall AGENT_TIMEOUT by pid).

    Order of operations after the process exits: record the exit code, send the
    "agent finished" notification, auto-commit/push the worktree, then either
    handle the failure (non-zero exit) or try to advance the task stage.

    Args:
        proc: The subprocess.Popen object of the running agent.
        run_id: agent_runs row id.
        agent: Agent role name.
        repo: Repository name.
        branch: Branch the agent worked on (selects the worktree).
        output_path: Path of the run log; accepted for interface compatibility,
            not used here.
        log_fh: Open log file handle to close once the process has exited.
    """
    import time as _time

    started_at = _time.time()
    exit_code = proc.wait()
    if log_fh is not None:
        try:
            log_fh.close()
        except Exception as close_err:
            # Best effort: a failed close must not stop post-run processing.
            logger.warning(f"Agent run_id={run_id}: closing log file failed: {close_err}")
    duration_s = int(_time.time() - started_at)
    logger.info(f"Agent run_id={run_id} ({agent}) finished with exit_code={exit_code}")

    # Update DB and fetch task_id for the notification.
    conn = get_db()
    try:
        conn.execute(
            "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=? WHERE id=?",
            (exit_code, run_id),
        )
        conn.commit()
        run_row = conn.execute("SELECT task_id FROM agent_runs WHERE id=?", (run_id,)).fetchone()
    finally:
        conn.close()
    task_id = run_row[0] if run_row else None
    notify_agent_finished(run_id, agent, exit_code, task_id=task_id, duration_s=duration_s)

    self._commit_and_push_results(run_id, agent, repo, branch)

    if exit_code != 0:
        self._handle_agent_failure(run_id, agent, repo, branch, exit_code)
    else:
        # Auto-advance stage if agent finished successfully and QG passes
        self._try_advance_stage(run_id, agent, repo, branch)

def _commit_and_push_results(self, run_id, agent, repo, branch):
    """Stage, commit and push the agent's output from the per-branch worktree.

    Works in the worktree (ORCH-2 / S-4), NOT in the shared /repos/<repo>. The
    worktree is already on `branch` (ensure_worktree did the checkout), so no
    checkout is needed. Git failures are logged and never raised.
    """
    repo_path = get_worktree_path(repo, branch)
    try:
        git_env = {
            **os.environ,
            "HOME": "/home/slin",
            "GIT_AUTHOR_NAME": "claude-bot",
            "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local",
            "GIT_COMMITTER_NAME": "claude-bot",
            "GIT_COMMITTER_EMAIL": "claude-bot@mva154.local",
        }
        status = subprocess.run(
            ["git", "-C", repo_path, "status", "--porcelain"],
            capture_output=True, text=True, timeout=10, env=git_env
        )
        if not status.stdout.strip():
            logger.info(f"Agent run_id={run_id}: no changes to commit")
            return

        # docs/ always; src/ and tests/ for developer. Each path is staged with
        # its own `git add`: a single invocation aborts without staging
        # anything when one of the pathspecs matches no files (e.g. a repo
        # without tests/), which would silently drop the other directories.
        stage_paths = ["docs/"]
        if agent == "developer":
            stage_paths += ["src/", "tests/"]
        for rel_path in stage_paths:
            subprocess.run(
                ["git", "-C", repo_path, "add", rel_path],
                capture_output=True, text=True, timeout=10, env=git_env
            )

        # `git diff --cached --quiet` exits 0 when nothing is staged. The
        # worktree can be dirty only because of files we never stage (such as
        # the task file); skip the commit then instead of logging a bogus
        # "commit failed" warning.
        staged = subprocess.run(
            ["git", "-C", repo_path, "diff", "--cached", "--quiet"],
            capture_output=True, text=True, timeout=10, env=git_env
        )
        if staged.returncode == 0:
            logger.info(f"Agent run_id={run_id}: no changes to commit")
            return

        commit_result = subprocess.run(
            ["git", "-C", repo_path, "commit", "-m",
             f"{agent}(ET): auto-commit from {agent} run_id={run_id}"],
            capture_output=True, text=True, timeout=30, env=git_env
        )
        if commit_result.returncode != 0:
            logger.warning(f"Agent run_id={run_id}: commit failed: {commit_result.stderr}")
            return

        push_result = subprocess.run(
            ["git", "-C", repo_path, "push", "origin", branch],
            capture_output=True, text=True, timeout=60, env=git_env
        )
        if push_result.returncode != 0:
            logger.error(f"Agent run_id={run_id}: push failed: {push_result.stderr}")
            return

        logger.info(f"Agent run_id={run_id}: committed and pushed to {branch}")
        # Auto-create PR after developer pushes
        if agent == "developer":
            self._ensure_pr(repo, branch, run_id)
    except Exception as e:
        logger.exception(f"Agent run_id={run_id}: post-run git failed: {e}")

def _handle_agent_failure(self, run_id, agent, repo, branch, exit_code):
    """React to a non-zero agent exit code.

    The task is looked up once by (repo, branch); without a matching task
    nothing happens. A failed deployer (smoke/healthcheck failed, Task 7) rolls
    the task back to "development", marks the issue blocked and notifies. Any
    other failed agent only triggers a Telegram warning pointing at the run log.
    """
    conn = get_db()
    try:
        task_row = conn.execute(
            "SELECT id, work_item_id FROM tasks WHERE repo=? AND branch=?",
            (repo, branch),
        ).fetchone()
    finally:
        conn.close()
    if not task_row:
        return

    _tid, _wid = task_row
    from ..notifications import send_telegram
    if agent == "deployer":
        update_task_stage(_tid, "development")
        notify_stage_change(_tid, "deploy", "development")
        plane_notify_stage(_wid, "deploy", "development")
        from ..plane_sync import set_issue_blocked
        set_issue_blocked(_wid)
        plane_add_comment(
            _wid,
            "\u274c Deploy FAILED (smoke/healthcheck). Rolled back. Developer \u043d\u0443\u0436\u0435\u043d \u0434\u043b\u044f \u0444\u0438\u043a\u0441\u0430."
        )
        send_telegram(f"\U0001f6a8 {_wid}: Deploy failed! Rolled back. Needs fix.")
    else:
        # Also covers a watchdog kill (exit_code -9).
        send_telegram(f"\u26a0\ufe0f {_wid}: Agent {agent} failed (exit_code={exit_code}). Check logs: /app/data/runs/{run_id}.log")
def _try_advance_stage(self, run_id: int, agent: str, repo: str, branch: str):
"""After agent finishes successfully, check QG and advance stage if possible.

The task is looked up by (repo, branch). The quality gate (QG) name and the
next stage are resolved from the task's current stage. If the QG passes, or
no QG is defined for the stage, the task moves to the next stage and that
stage's agent (if any) is launched. If the QG does not pass, the task is not
advanced; depending on the agent it may be rolled back and an earlier agent
relaunched. Any exception raised along the way is caught and logged.

Args:
    run_id: agent_runs row id of the run that just finished (used for logging).
    agent: Role name of the agent that just finished.
    repo: Repository name.
    branch: Task branch; also selects the worktree that artifacts are read from.
"""
try:
conn = get_db()
task_row = conn.execute(
"SELECT id, stage, work_item_id FROM tasks WHERE repo=? AND branch=?",
(repo, branch),
).fetchone()
conn.close()
# No task registered for this (repo, branch): nothing to advance.
if not task_row:
return
task_id, current_stage, work_item_id = task_row
qg_name = get_qg_for_stage(current_stage)
next_stage = get_next_stage(current_stage)
# No next stage for the current stage: nothing to advance to.
if not next_stage:
return
# Run QG check if defined
if qg_name and qg_name in QG_CHECKS:
check_fn = QG_CHECKS[qg_name]
if qg_name in ("check_analysis_approved",):
# Requires human approval - post request comment if analyst just finished
# This gate is never evaluated here: the branch only reports the
# analyst's outcome to Plane and then returns without advancing.
if agent == "analyst" and qg_name == "check_analysis_approved" and work_item_id:
files_check = QG_CHECKS.get("check_analysis_complete")
if files_check:
files_ok, _ = files_check(repo, work_item_id, branch)
if files_ok:
# Full artifacts ready -> In Review
from ..plane_sync import set_issue_in_review
set_issue_in_review(work_item_id)
plane_add_comment(
work_item_id,
"\U0001f4cb BRD/\u0422\u0417/AC/TestPlan \u0433\u043e\u0442\u043e\u0432\u044b. "
"\u041f\u0440\u043e\u0448\u0443 review \u0438 \u0440\u0435\u0430\u043a\u0446\u0438\u044e :approved: \u0434\u043b\u044f \u043f\u0440\u043e\u0434\u0432\u0438\u0436\u0435\u043d\u0438\u044f \u0432 Architecture."
)
notify_approve_requested(task_id)
logger.info(f"Task {task_id}: analyst finished, requested :approved: in Plane")
else:
# Check if questions file exists (in the task worktree)
import os as _os
questions_path = _os.path.join(
get_worktree_path(repo, branch),
f"docs/work-items/{work_item_id}/01-questions.md"
)
if _os.path.isfile(questions_path):
# Analyst has questions -> Needs Input
from ..plane_sync import set_issue_needs_input
set_issue_needs_input(work_item_id)
with open(questions_path, "r") as qf:
questions_text = qf.read()
plane_add_comment(
work_item_id,
f"\u2753 Analyst \u043d\u0443\u0436\u0434\u0430\u0435\u0442\u0441\u044f \u0432 \u0443\u0442\u043e\u0447\u043d\u0435\u043d\u0438\u0438:\n\n{questions_text}"
)
from ..notifications import send_telegram
send_telegram(
f"\u2753 {work_item_id}: Analyst \u0437\u0430\u0434\u0430\u0451\u0442 \u0432\u043e\u043f\u0440\u043e\u0441\u044b. \u041e\u0442\u0432\u0435\u0442\u044c \u0432 Plane."
)
else:
# No artifacts and no questions
plane_add_comment(
work_item_id,
"\u26a0\ufe0f Analyst \u0437\u0430\u0432\u0435\u0440\u0448\u0438\u043b\u0441\u044f \u0431\u0435\u0437 \u0430\u0440\u0442\u0435\u0444\u0430\u043a\u0442\u043e\u0432 \u0438 \u0431\u0435\u0437 \u0432\u043e\u043f\u0440\u043e\u0441\u043e\u0432. \u041f\u0440\u043e\u0432\u0435\u0440\u044c\u0442\u0435 \u043b\u043e\u0433."
)
return
elif qg_name in ("check_ci_green", "check_tests_local"):
# (repo, branch) signature — already worktree-aware.
passed, reason = check_fn(repo, branch)
elif qg_name == "check_tests_passed":
# Artifact check — pass branch so it reads from the worktree.
# NOTE(review): this call is identical to the one in the `else` branch
# below, so the two branches could be merged.
passed, reason = check_fn(repo, work_item_id or "", branch)
else:
# Other artifact checks (check_architecture_done, etc.) — worktree-aware.
passed, reason = check_fn(repo, work_item_id or "", branch)
if not passed:
logger.info(f"Task {task_id}: QG '{qg_name}' not passed after {agent}: {reason}")
# If reviewer says REQUEST_CHANGES, rollback to development
if agent == "reviewer" and "REQUEST_CHANGES" in reason:
update_task_stage(task_id, "development")
notify_stage_change(task_id, current_stage, "development")
plane_notify_stage(work_item_id, current_stage, "development")
# Count retries
# The count covers every developer run recorded for this task, not only
# the runs that followed a rejection.
conn2 = get_db()
retry_count = conn2.execute(
"SELECT COUNT(*) FROM agent_runs WHERE task_id=? AND agent='developer'",
(task_id,)
).fetchone()[0]
conn2.close()
if retry_count < 3:
task_desc = (
f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
f"Stage: development\nNote: REQUEST_CHANGES from reviewer "
f"(attempt {retry_count+1}/3). Fix findings in "
f"docs/work-items/{work_item_id}/12-review.md"
)
new_run = self.launch("developer", repo, task_desc, task_id=task_id)
logger.info(f"Task {task_id}: reviewer REQUEST_CHANGES, relaunched developer (run_id={new_run})")
else:
# NOTE(review): unlike the tester path below, the issue is not marked
# blocked here when the retry limit is hit - confirm this is intended.
from ..notifications import send_telegram
send_telegram(f"\u26a0\ufe0f {work_item_id}: Max developer retries (3) reached. Manual intervention needed.")
logger.error(f"Task {task_id}: max retries reached")
# Task 6: Tester FAIL -> rollback to development
if agent == "tester" and qg_name == "check_tests_passed" and not passed:
update_task_stage(task_id, "development")
notify_stage_change(task_id, current_stage, "development")
plane_notify_stage(work_item_id, current_stage, "development")
from ..plane_sync import set_issue_in_progress
set_issue_in_progress(work_item_id)
# NOTE(review): this comment announces a developer relaunch before the
# retry limit below is checked; at the limit no developer is launched.
plane_add_comment(
work_item_id,
f"\u274c \u0422\u0435\u0441\u0442\u044b \u043d\u0435 \u043f\u0440\u043e\u0448\u043b\u0438: {reason}. Developer \u043f\u0435\u0440\u0435\u0437\u0430\u043f\u0443\u0449\u0435\u043d \u0434\u043b\u044f \u0444\u0438\u043a\u0441\u0430."
)
conn2 = get_db()
retry_count = conn2.execute(
"SELECT COUNT(*) FROM agent_runs WHERE task_id=? AND agent='developer'",
(task_id,)
).fetchone()[0]
conn2.close()
if retry_count < 3:
task_desc = (
f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
f"Stage: development\nNote: Tests FAILED. "
f"Fix failures described in docs/work-items/{work_item_id}/13-test-report.md"
)
new_run = self.launch("developer", repo, task_desc, task_id=task_id)
logger.info(f"Task {task_id}: tester FAIL, relaunched developer (run_id={new_run})")
else:
from ..notifications import send_telegram
from ..plane_sync import set_issue_blocked
set_issue_blocked(work_item_id)
send_telegram(f"\U0001f6a8 {work_item_id}: Tests still failing after 3 developer retries. Manual intervention needed.")
# Task 8: Architect conflict -> rollback to analysis
if agent == "architect" and qg_name == "check_architecture_done" and not passed:
import os as _os
conflict_path = _os.path.join(
get_worktree_path(repo, branch),
f"docs/work-items/{work_item_id}/10-conflict.md"
)
if _os.path.isfile(conflict_path):
update_task_stage(task_id, "analysis")
notify_stage_change(task_id, current_stage, "analysis")
plane_notify_stage(work_item_id, current_stage, "analysis")
from ..plane_sync import set_issue_in_progress
set_issue_in_progress(work_item_id)
# Only the first 500 characters of the conflict file go into the comment.
with open(conflict_path, "r") as cf:
conflict_text = cf.read()[:500]
plane_add_comment(
work_item_id,
f"\u26a0\ufe0f Architect \u043d\u0430\u0448\u0451\u043b \u043a\u043e\u043d\u0444\u043b\u0438\u043a\u0442 \u0441 \u0422\u0417. \u0412\u043e\u0437\u0432\u0440\u0430\u0442 \u0432 Analysis.\n\n{conflict_text}"
)
task_desc = (
f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
f"Stage: analysis\nNote: Architect conflict. Revise TRZ. "
f"See docs/work-items/{work_item_id}/10-conflict.md"
)
new_run = self.launch("analyst", repo, task_desc, task_id=task_id)
logger.info(f"Task {task_id}: architect conflict, relaunched analyst")
return
# QG not passed: never fall through to the stage advance below.
return
elif qg_name:
# A QG is named for this stage but is not registered in QG_CHECKS: do not
# advance. NOTE(review): this returns silently - consider logging it.
return
# Advance stage
update_task_stage(task_id, next_stage)
notify_stage_change(task_id, current_stage, next_stage)
plane_notify_stage(work_item_id, current_stage, next_stage)
logger.info(f"Task {task_id}: {current_stage} -> {next_stage} (auto-advance after {agent})")
# Launch next agent if defined
next_agent = get_agent_for_stage(next_stage)
if next_agent:
task_desc = f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\nStage: {next_stage}"
new_run_id = self.launch(next_agent, repo, task_desc, task_id=task_id)
logger.info(f"Task {task_id}: launched '{next_agent}' (run_id={new_run_id})")
# Broad catch: any failure above is logged here instead of propagating.
except Exception as e:
logger.error(f"Auto-advance failed for run_id={run_id}: {e}")
def _ensure_pr(self, repo: str, branch: str, run_id: int):
    """Return the number of the open PR for `branch`, creating one if needed.

    Args:
        repo: Repository name (under settings.gitea_owner).
        branch: Head branch of the pull request.
        run_id: Developer run id, mentioned in the body of a newly created PR.
    Returns:
        The PR number, or None if any request fails (the error is logged).
    """
    import httpx
    owner = settings.gitea_owner
    headers = {"Authorization": f"token {settings.gitea_token}"}
    base_url = f"{settings.gitea_url}/api/v1"
    try:
        resp = httpx.get(
            f"{base_url}/repos/{owner}/{repo}/pulls",
            params={"state": "open", "head": branch},
            headers=headers, timeout=10
        )
        resp.raise_for_status()
        # Do not trust the server-side `head` filter: if the server ignores it,
        # the response holds every open PR and the first entry may belong to
        # another task. Match on the PR's head ref explicitly.
        # NOTE(review): only the first page of results is inspected.
        for pr in resp.json():
            if (pr.get("head") or {}).get("ref") == branch:
                return pr["number"]

        # Title is the last path component of the branch name.
        title = branch.rsplit("/", 1)[-1]
        resp = httpx.post(
            f"{base_url}/repos/{owner}/{repo}/pulls",
            json={"title": f"feat: {title}", "head": branch, "base": "main",
                  "body": f"Auto-created by orchestrator after developer run_id={run_id}"},
            headers=headers, timeout=10
        )
        resp.raise_for_status()
        pr_number = resp.json()["number"]
        logger.info(f"Created PR #{pr_number} for {branch}")
        return pr_number
    except Exception as e:
        logger.error(f"Failed to create PR for {branch}: {e}")
        return None
def _auto_merge_pr(self, repo: str, branch: str, task_id: int, work_item_id: str):
    """Merge the open PR for `branch` and move the task from deploy to done.

    If no open PR exists for the branch, one is created via _ensure_pr first.

    Args:
        repo: Repository name (under settings.gitea_owner).
        branch: Head branch whose PR should be merged.
        task_id: Task whose stage is set to "done" after a successful merge.
        work_item_id: Work item id used in Plane/Telegram notifications.
    Returns:
        True if the merge request returned HTTP 200/204, False otherwise
        (including when any request raises; the error is logged).
    """
    import httpx
    owner = settings.gitea_owner
    headers = {"Authorization": f"token {settings.gitea_token}"}
    base_url = f"{settings.gitea_url}/api/v1"
    try:
        resp = httpx.get(
            f"{base_url}/repos/{owner}/{repo}/pulls",
            params={"state": "open", "head": branch},
            headers=headers, timeout=10
        )
        resp.raise_for_status()
        # Merging is irreversible, so never merge "the first open PR": if the
        # server ignores the `head` filter that could be another task's PR.
        # Only accept a PR whose head ref is exactly this branch.
        matching = [
            pr for pr in resp.json()
            if (pr.get("head") or {}).get("ref") == branch
        ]
        if matching:
            pr_number = matching[0]["number"]
        else:
            pr_number = self._ensure_pr(repo, branch, 0)
            if not pr_number:
                return False
        resp = httpx.post(
            f"{base_url}/repos/{owner}/{repo}/pulls/{pr_number}/merge",
            json={"Do": "merge"},
            headers=headers, timeout=30
        )
        if resp.status_code in (200, 204):
            logger.info(f"PR #{pr_number} merged for {branch}")
            update_task_stage(task_id, "done")
            notify_stage_change(task_id, "deploy", "done")
            plane_notify_stage(work_item_id, "deploy", "done")
            from ..notifications import send_telegram
            send_telegram(f"\u2705 {work_item_id}: PR #{pr_number} merged! deploy -> done. Task complete.")
            return True
        else:
            logger.error(f"Merge failed for PR #{pr_number}: {resp.status_code} {resp.text}")
            from ..notifications import send_telegram
            send_telegram(f"\u26a0\ufe0f {work_item_id}: Auto-merge failed (HTTP {resp.status_code}). Manual merge needed.")
            return False
    except Exception as e:
        logger.error(f"Auto-merge failed for {branch}: {e}")
        return False
def _write_task_file(self, repo: str, branch: str, task_file: str, content: str):
    """Write task file directly into the task's worktree.

    B-1 fix: no docker (direct open()). ORCH-2/S-4: the target is the per-branch
    worktree (/repos/_wt/<repo>/<branch>), not the shared /repos/<repo>, so the
    agent reads its task from its own isolated working copy.

    Args:
        repo: Repository name.
        branch: Task branch; together with `repo` it selects the worktree.
        task_file: File name relative to the worktree root.
        content: Text to write (UTF-8); an existing file is overwritten.
    Raises:
        RuntimeError: If the file cannot be written. The original OSError is
            chained as the cause instead of being silently swallowed.
    """
    work_path = get_worktree_path(repo, branch)  # /repos/_wt/<repo>/<branch>
    full_path = os.path.join(work_path, task_file)
    try:
        with open(full_path, "w", encoding="utf-8") as f:
            f.write(content)
    except OSError as e:
        logger.error(f"Failed to write task file {full_path}: {e}")
        # `from e` keeps the original errno/traceback available to callers.
        raise RuntimeError(f"Failed to write task file: {e}") from e
    logger.info(f"Task file written: {full_path} ({len(content)} bytes)")
# Module-level AgentLauncher instance, created when this module is imported.
launcher = AgentLauncher()