# orchestrator/src/agents/launcher.py

import subprocess
import os
import logging
import threading
import signal
from ..config import settings
from ..db import get_db, get_task_by_repo_branch, update_task_stage, enqueue_job
from ..stages import get_next_stage, get_qg_for_stage, get_agent_for_stage
from ..git_worktree import ensure_worktree, get_worktree_path
from ..qg.checks import QG_CHECKS
from ..notifications import notify_stage_change, notify_qg_failure, notify_agent_started, notify_agent_finished, notify_approve_requested
from ..plane_sync import notify_stage_change as plane_notify_stage, add_comment as plane_add_comment

logger = logging.getLogger("orchestrator.launcher")


class AgentLauncher:
    """Launch Claude CLI agents directly (binary mounted into container).

    Each launch runs the CLI through ``bash -c`` inside a per-branch git
    worktree, redirects its output to a per-run log file, and starts two daemon
    threads: a watchdog that enforces AGENT_TIMEOUT and a monitor that waits for
    the process and performs the post-run steps.
    """

    # Per-agent launch settings, keyed by agent role. _spawn() reads:
    #   system_prompt - file whose contents are passed to --system-prompt
    #   task_file     - file whose contents are passed as the prompt; also the
    #                   file the task content is written to before launch
    #   allowed_tools - value passed to --allowedTools
    #   model         - optional; when present it is passed as --model
    # Both file paths are relative: the command `cd`s into the worktree first.
    AGENT_CONFIGS = {
        "analyst": {
            "system_prompt": ".openclaw/agents/analyst.md",
            "task_file": ".task.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "architect": {
            "system_prompt": ".openclaw/agents/architect.md",
            "task_file": ".task-arch.md",
            "allowed_tools": "Read,Write,Edit,Bash",
            "model": "opus",
        },
        "developer": {
            "system_prompt": ".openclaw/agents/developer.md",
            "task_file": ".task-dev.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "reviewer": {
            "system_prompt": ".openclaw/agents/reviewer.md",
            "task_file": ".task-review.md",
            "allowed_tools": "Read,Write,Edit,Bash",
            "model": "opus",
        },
        "tester": {
            "system_prompt": ".openclaw/agents/tester.md",
            "task_file": ".task-test.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "deployer": {
            "task_file": ".task-deploy.md",
            "system_prompt": ".openclaw/agents/deployer.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
    }

    # Path of the Claude CLI executable that _spawn() puts on the command line.
    CLAUDE_BIN = "/opt/claude-code/bin/claude.exe"
    # Default watchdog timeout in seconds; used when _watchdog() gets timeout=None.
    AGENT_TIMEOUT = 1800  # 30 minutes

    def launch(self, agent: str, repo: str, task_content: str = None, task_id: int = None) -> int:
        """
        Launch a Claude CLI agent directly (legacy synchronous path).

        Kept for backward compatibility (direct callers / existing tests). The
        ORCH-1 job queue uses launch_job() instead, but both share _spawn().

        Args:
            agent: Agent role (analyst, architect, developer, reviewer, tester)
            repo: Repository name
            task_content: Optional task content to write to task file
            task_id: Optional task ID to associate with this run

        Returns:
            agent_run_id from DB
        """
        return self._spawn(agent, repo, task_content, task_id, job_id=None)

    def launch_job(self, job: dict) -> int:
        """ORCH-1: launch an agent for a claimed queue job.

        Same spawn path as launch(), but threads job['id'] through so the monitor
        can update the job's status (done / requeue / failed) and link jobs.run_id
        to the agent_runs row. Returns the agent_run_id.
        """
        return self._spawn(
            job["agent"],
            job["repo"],
            job.get("task_content"),
            job.get("task_id"),
            job_id=job["id"],
        )

    def _spawn(self, agent: str, repo: str, task_content: str = None,
               task_id: int = None, job_id: int = None) -> int:
        """Shared spawn implementation for launch() and launch_job().

        When job_id is set, the monitor/watchdog drive the jobs table status
        (ORCH-1). The claude-CLI Popen logic (B-2) and worktree/task-file logic
        (B-1 / ORCH-2) are unchanged.
        """
        config = self.AGENT_CONFIGS.get(agent)
        if not config:
            raise ValueError(f"Unknown agent: {agent}")

        # Main clone lives at /repos/<repo>; the agent works in an isolated worktree
        # (ORCH-2 / S-4) so concurrent tasks never fight over a shared checkout.
        local_repo_path = os.path.join(settings.repos_dir, repo)
        if not os.path.isdir(local_repo_path):
            raise FileNotFoundError(f"Repo not found: {local_repo_path}")

        # Determine branch (needed before we touch the worktree / task file).
        _br_row = get_db().execute("SELECT branch FROM tasks WHERE id=?", (task_id,)).fetchone() if task_id else None
        agent_branch = _br_row[0] if _br_row else "main"

        # Ensure the per-branch worktree exists and is on the right branch.
        work_path = ensure_worktree(repo, agent_branch)

        # Write task file if content provided (B-1: direct write; now into the worktree).
        if task_content:
            self._write_task_file(repo, agent_branch, config["task_file"], task_content)

        # Record run in DB
        conn = get_db()
        cursor = conn.execute(
            "INSERT INTO agent_runs (task_id, agent) VALUES (?, ?)",
            (task_id, agent),
        )
        run_id = cursor.lastrowid
        conn.commit()

        # ORCH-1: link this job to the agent_runs row and stamp started_at.
        if job_id is not None:
            conn.execute(
                "UPDATE jobs SET run_id = ?, started_at = datetime('now') WHERE id = ?",
                (run_id, job_id),
            )
            conn.commit()

        # Prepare output log path
        output_path = f"/app/data/runs/{run_id}.log"
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Build the claude command
        task_file = config["task_file"]
        system_prompt = config["system_prompt"]
        allowed_tools = config["allowed_tools"]

        model = config.get("model", "")
        model_flag = f"--model {model} " if model else ""

        # No git fetch/checkout here: ensure_worktree() already put the worktree on
        # the right branch. The agent simply runs inside its isolated work_path.
        cmd = (
            f'cd {work_path} && '
            f'{self.CLAUDE_BIN} --print '
            f'{model_flag}'
            f'"$(cat {task_file})" '
            f'--system-prompt "$(cat {system_prompt})" '
            f'--allowedTools {allowed_tools}'
        )

        logger.info(f"Launching agent '{agent}' for repo '{repo}', run_id={run_id}")

        # Launch as background process.
        # B-2 fix: redirect stdout/stderr straight to the log file at the OS level.
        # No PIPE in the orchestrator process -> no PIPE deadlock, no reader thread,
        # no zombies. log_fh is closed by _monitor_agent after proc.wait().
        log_fh = open(output_path, "w")
        proc = subprocess.Popen(
            ["bash", "-c", cmd],
            stdout=log_fh,
            stderr=subprocess.STDOUT,
            env={
                    **os.environ,
                    "HOME": "/home/slin",
                    "GIT_AUTHOR_NAME": "claude-bot",
                    "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local",
                    "GIT_COMMITTER_NAME": "claude-bot",
                    "GIT_COMMITTER_EMAIL": "claude-bot@mva154.local",
                },
            )

        # Update DB with output path
        conn.execute(
            "UPDATE agent_runs SET output_path = ? WHERE id = ?",
            (output_path, run_id),
        )
        conn.commit()
        conn.close()

        # Start timeout watchdog
        t = threading.Thread(
            target=self._watchdog,
            args=(proc.pid, run_id),
            kwargs={"job_id": job_id},
            daemon=True,
        )
        t.start()

        # Start monitor thread (waits for completion, commits, pushes)
        # agent_branch already computed above
        m = threading.Thread(
            target=self._monitor_agent,
            args=(proc, run_id, agent, repo, agent_branch, output_path, log_fh),
            kwargs={"job_id": job_id},
            daemon=True,
        )
        m.start()

        logger.info(f"Agent '{agent}' launched, pid={proc.pid}, run_id={run_id}")
        notify_agent_started(run_id, agent, task_id)
        return run_id

    def _watchdog(self, pid: int, run_id: int, timeout: int = None, job_id: int = None):
        """Kill agent if it exceeds timeout.

        ORCH-1: on a timeout-kill the monitor's proc.wait() returns the kill exit
        code and drives the job retry/fail logic, so the watchdog itself only needs
        to SIGKILL and record the agent_runs exit. job_id is accepted for symmetry.
        """
        import time
        if timeout is None:
            timeout = self.AGENT_TIMEOUT
        time.sleep(timeout)
        try:
            os.kill(pid, signal.SIGKILL)
            logger.warning(f"Agent run_id={run_id} killed after {timeout}s timeout")
            conn = get_db()
            conn.execute(
                "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=-9 WHERE id=?",
                (run_id,),
            )
            conn.commit()
            conn.close()
        except ProcessLookupError:
            pass  # Already finished

    def _monitor_agent(self, proc, run_id, agent, repo, branch, output_path=None, log_fh=None, job_id=None):
        """Wait for agent to finish, commit+push results, update DB.

        B-2 fix: stdout already goes straight to the log file via Popen, so we just
        block on proc.wait() (guaranteed reap -> no zombie, real exit_code) and then
        close the log file handle. No PIPE, no select loop, no startup timeout here
        (the watchdog still enforces the overall AGENT_TIMEOUT by pid).
        """
        import time as _time
        _start_ts = _time.time()

        exit_code = proc.wait()
        if log_fh is not None:
            try:
                log_fh.close()
            except Exception:
                pass
        _duration_s = int(_time.time() - _start_ts)
        logger.info(f"Agent run_id={run_id} ({agent}) finished with exit_code={exit_code}")

        # Update DB
        conn = get_db()
        conn.execute(
            "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=? WHERE id=?",
            (exit_code, run_id),
        )
        conn.commit()

        # Get task_id for notification
        _row = conn.execute("SELECT task_id FROM agent_runs WHERE id=?", (run_id,)).fetchone()
        _task_id = _row[0] if _row else None
        conn.close()

        notify_agent_finished(run_id, agent, exit_code, task_id=_task_id, duration_s=_duration_s)

        # Commit and push any changes — in the per-branch worktree (ORCH-2 / S-4),
        # NOT in the shared /repos/<repo>. The worktree is already on `branch`
        # (ensure_worktree did the checkout), so no checkout is needed here.
        repo_path = get_worktree_path(repo, branch)
        try:
            git_env = {
                **os.environ,
                "HOME": "/home/slin",
                "GIT_AUTHOR_NAME": "claude-bot",
                "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local",
                "GIT_COMMITTER_NAME": "claude-bot",
                "GIT_COMMITTER_EMAIL": "claude-bot@mva154.local",
            }
            result = subprocess.run(
                ["git", "-C", repo_path, "status", "--porcelain"],
                capture_output=True, text=True, timeout=10, env=git_env
            )
            if result.stdout.strip():
                # Add docs/ always
                subprocess.run(
                    ["git", "-C", repo_path, "add", "docs/"],
                    capture_output=True, text=True, timeout=10, env=git_env
                )
                # Add src/ and tests/ for developer
                if agent == "developer":
                    subprocess.run(
                        ["git", "-C", repo_path, "add", "src/", "tests/"],
                        capture_output=True, text=True, timeout=10, env=git_env
                    )
                # Commit
                commit_result = subprocess.run(
                    ["git", "-C", repo_path, "commit", "-m",
                     f"{agent}(ET): auto-commit from {agent} run_id={run_id}"],
                    capture_output=True, text=True, timeout=30, env=git_env
                )
                if commit_result.returncode == 0:
                    push_result = subprocess.run(
                        ["git", "-C", repo_path, "push", "origin", branch],
                        capture_output=True, text=True, timeout=60, env=git_env
                    )
                    if push_result.returncode == 0:
                        logger.info(f"Agent run_id={run_id}: committed and pushed to {branch}")
                        # Auto-create PR after developer pushes
                        if agent == "developer":
                            self._ensure_pr(repo, branch, run_id)
                    else:
                        logger.error(f"Agent run_id={run_id}: push failed: {push_result.stderr}")
                else:
                    logger.warning(f"Agent run_id={run_id}: commit failed: {commit_result.stderr}")
            else:
                logger.info(f"Agent run_id={run_id}: no changes to commit")
        except Exception as e:
            logger.error(f"Agent run_id={run_id}: post-run git failed: {e}")

        # Handle deployer failure (smoke/healthcheck failed) — Task 7
        if exit_code != 0 and agent == "deployer":
            conn = get_db()
            task_row = conn.execute(
                "SELECT id, work_item_id FROM tasks WHERE repo=? AND branch=?",
                (repo, branch),
            ).fetchone()
            conn.close()
            if task_row:
                _tid, _wid = task_row
                update_task_stage(_tid, "development")
                notify_stage_change(_tid, "deploy", "development")
                plane_notify_stage(_wid, "deploy", "development")
                from ..plane_sync import set_issue_blocked
                set_issue_blocked(_wid)
                plane_add_comment(
                    _wid,
                    "\u274c Deploy FAILED (smoke/healthcheck). Rolled back. Developer \u043d\u0443\u0436\u0435\u043d \u0434\u043b\u044f \u0444\u0438\u043a\u0441\u0430."
                )
                from ..notifications import send_telegram
                send_telegram(f"\U0001f6a8 {_wid}: Deploy failed! Rolled back. Needs fix.")

        # Notify on startup timeout (exit_code from kill = -9 or 137)
        if exit_code != 0 and exit_code not in (None,):
            conn = get_db()
            task_row = conn.execute(
                "SELECT id, work_item_id FROM tasks WHERE repo=? AND branch=?",
                (repo, branch),
            ).fetchone()
            conn.close()
            if task_row and agent != "deployer":  # deployer handled above
                _tid, _wid = task_row
                from ..notifications import send_telegram
                send_telegram(f"\u26a0\ufe0f {_wid}: Agent {agent} failed (exit_code={exit_code}). Check logs: /app/data/runs/{run_id}.log")

        # Auto-advance stage if agent finished successfully and QG passes
        if exit_code == 0:
            self._try_advance_stage(run_id, agent, repo, branch)

        # ORCH-1: drive the job-queue status for queue-launched jobs only.
        # (Legacy direct launch() has job_id=None and is unaffected.)
        if job_id is not None:
            self._finalize_job(job_id, agent, run_id, exit_code, output_path=output_path)

    def _backoff_seconds(self, transient_attempts: int, retry_after: int = None) -> int:
        """Exponential backoff for transient failures, honouring Retry-After.

        backoff = min(2^transient_attempts * base, max). If the server sent a
        Retry-After, use the larger of the two (never poll sooner than asked).
        """
        base = settings.backoff_base_seconds
        cap = settings.backoff_max_seconds
        backoff = min((2 ** max(transient_attempts, 0)) * base, cap)
        if retry_after is not None and retry_after > 0:
            backoff = max(backoff, min(retry_after, cap))
        return int(backoff)

    def _finalize_job(self, job_id: int, agent: str, run_id: int, exit_code, output_path=None):
        """ORCH-1: settle the jobs row once the agent process has exited.

        Does nothing when the job no longer exists. Otherwise:
          - exit_code == 0: mark the job 'done' and report a recovered outcome.
          - exit_code != 0: classify the failure from the run log (output_path,
            or /app/data/runs/<run_id>.log when not given) and hand over to
            _finalize_transient() or _finalize_permanent(). If classification
            itself fails, the failure is treated as permanent.

        Errors raised while settling are logged and not propagated.
        """
        from ..db import get_job, mark_job
        from ..error_classifier import classify_log_file
        try:
            job = get_job(job_id)
            if not job:
                return

            if exit_code != 0:
                # Classify the failure from the agent log tail (no token cost).
                log_path = output_path if output_path else f"/app/data/runs/{run_id}.log"
                try:
                    kind, retry_after = classify_log_file(log_path)
                except Exception:
                    kind, retry_after = "permanent", None

                if kind != "transient":
                    self._finalize_permanent(job_id, agent, run_id, exit_code, job)
                else:
                    self._finalize_transient(job_id, agent, run_id, exit_code, job, retry_after)
                return

            # Clean exit.
            mark_job(job_id, "done", run_id=run_id)
            logger.info(f"Job {job_id} ({agent}) done (run_id={run_id})")
            self._record_outcome(transient=False, recovered=True)
        except Exception as e:
            logger.error(f"Job {job_id}: _finalize_job error: {e}")

    def _finalize_transient(self, job_id, agent, run_id, exit_code, job, retry_after):
        """Handle a transient failure: requeue with backoff, or fail for good.

        The outcome is always reported as transient. While the job's
        transient_attempts is below settings.transient_max_attempts the job is
        requeued with a delay from _backoff_seconds(); once the budget is used
        up the job is marked 'failed' and a failure notification is sent.
        """
        from ..db import mark_job, mark_job_transient
        used = job.get("transient_attempts", 0)
        budget = settings.transient_max_attempts
        err = (f"transient (429/overload) agent {agent} exit={exit_code} "
               f"(run_id={run_id}); retry_after={retry_after}")
        self._record_outcome(transient=True, recovered=False)

        if not used < budget:
            # Budget exhausted: give up on this job.
            mark_job(job_id, "failed", run_id=run_id, error=err)
            logger.error(
                f"Job {job_id} ({agent}) failed after {used} transient attempts"
            )
            self._notify_failed(job_id, agent, job, run_id,
                                f"transient (rate-limit) after {used} attempts")
            return

        next_attempt = used + 1
        backoff = self._backoff_seconds(next_attempt, retry_after)
        mark_job_transient(job_id, backoff, error=err)
        logger.warning(
            f"Job {job_id} ({agent}) TRANSIENT fail (exit={exit_code}), "
            f"backoff {backoff}s, transient_attempt {next_attempt}/{budget}"
        )

    def _finalize_permanent(self, job_id, agent, run_id, exit_code, job):
        """Handle a permanent failure: requeue while attempts remain, else fail.

        The outcome is always reported as non-transient and not recovered.
        While the job's attempts is below its max_attempts (2 when the job
        does not carry one) the job goes back to 'queued'; otherwise it is
        marked 'failed' and a failure notification is sent.
        """
        from ..db import mark_job
        tried = job.get("attempts", 0)
        limit = job.get("max_attempts", 2)
        err = f"agent {agent} exit_code={exit_code} (run_id={run_id})"
        self._record_outcome(transient=False, recovered=False)

        if not tried < limit:
            # No attempts left.
            mark_job(job_id, "failed", run_id=run_id, error=err)
            logger.error(
                f"Job {job_id} ({agent}) failed permanently after "
                f"{tried} attempts (exit={exit_code})"
            )
            self._notify_failed(job_id, agent, job, run_id,
                                f"{tried} attempts (exit={exit_code})")
            return

        mark_job(job_id, "queued", run_id=run_id, error=err)
        logger.warning(
            f"Job {job_id} ({agent}) failed (exit={exit_code}), "
            f"requeued (attempt {tried}/{limit})"
        )

    def _notify_failed(self, job_id, agent, job, run_id, why):
        try:
            from ..notifications import send_telegram
            send_telegram(
                f"\U0001f6a8 Job {job_id} ({agent}, repo {job.get('repo')}) "
                f"failed: {why}. Logs: /app/data/runs/{run_id}.log"
            )
        except Exception:
            pass

    def _record_outcome(self, transient: bool, recovered: bool):
        """Forward the run outcome to the circuit breaker (if a worker is wired).

        Decoupled via a settable callback (set by QueueWorker.start) so the launcher
        does not hard-import the worker (avoids a cycle) and tests can run the
        launcher standalone.
        """
        cb = getattr(self, "on_outcome", None)
        if cb:
            try:
                cb(transient=transient, recovered=recovered)
            except Exception:
                pass

    def _try_advance_stage(self, run_id: int, agent: str, repo: str, branch: str):
        """After agent finishes successfully, check QG and advance stage if possible.

        Looks up the task by (repo, branch), finds the quality gate (QG) and the
        next stage for the task's current stage, and then:

          - returns without doing anything when there is no task or no next stage;
          - for the human-approval gate (check_analysis_approved): never advances;
            when the analyst just finished it either requests approval (artifacts
            complete), forwards the analyst's questions, or reports that nothing
            was produced;
          - for every other gate found in QG_CHECKS: runs it; on failure it may
            roll the task back (reviewer REQUEST_CHANGES, tester failure,
            architect conflict) and enqueue the agent that has to fix things,
            then returns;
          - returns when a gate is named but not present in QG_CHECKS;
          - otherwise moves the task to the next stage and enqueues that stage's
            agent, if one is defined.

        Any exception is logged and swallowed; nothing is returned.

        Args:
            run_id: agent_runs.id of the finished run (used for logging only).
            agent: Role of the agent that just finished.
            repo: Repository name.
            branch: Task branch.
        """
        try:
            conn = get_db()
            task_row = conn.execute(
                "SELECT id, stage, work_item_id FROM tasks WHERE repo=? AND branch=?",
                (repo, branch),
            ).fetchone()
            conn.close()
            if not task_row:
                return

            task_id, current_stage, work_item_id = task_row
            qg_name = get_qg_for_stage(current_stage)
            next_stage = get_next_stage(current_stage)

            # Nothing after the current stage: nothing to advance to.
            if not next_stage:
                return

            # Run QG check if defined
            if qg_name and qg_name in QG_CHECKS:
                check_fn = QG_CHECKS[qg_name]
                if qg_name in ("check_analysis_approved",):
                    # Requires human approval - post request comment if analyst just finished
                    if agent == "analyst" and qg_name == "check_analysis_approved" and work_item_id:
                        files_check = QG_CHECKS.get("check_analysis_complete")
                        if files_check:
                            files_ok, _ = files_check(repo, work_item_id, branch)
                            if files_ok:
                                # Full artifacts ready -> In Review
                                from ..plane_sync import set_issue_in_review
                                set_issue_in_review(work_item_id)
                                # Comment text (ru): "BRD/spec/AC/TestPlan are ready. Please
                                # review and react with :approved: to advance to Architecture."
                                plane_add_comment(
                                    work_item_id,
                                    "\U0001f4cb BRD/\u0422\u0417/AC/TestPlan \u0433\u043e\u0442\u043e\u0432\u044b. "
                                    "\u041f\u0440\u043e\u0448\u0443 review \u0438 \u0440\u0435\u0430\u043a\u0446\u0438\u044e :approved: \u0434\u043b\u044f \u043f\u0440\u043e\u0434\u0432\u0438\u0436\u0435\u043d\u0438\u044f \u0432 Architecture."
                                )
                                notify_approve_requested(task_id)
                                logger.info(f"Task {task_id}: analyst finished, requested :approved: in Plane")
                            else:
                                # Check if questions file exists (in the task worktree)
                                import os as _os
                                questions_path = _os.path.join(
                                    get_worktree_path(repo, branch),
                                    f"docs/work-items/{work_item_id}/01-questions.md"
                                )
                                if _os.path.isfile(questions_path):
                                    # Analyst has questions -> Needs Input
                                    from ..plane_sync import set_issue_needs_input
                                    set_issue_needs_input(work_item_id)
                                    with open(questions_path, "r") as qf:
                                        questions_text = qf.read()
                                    # Comment text (ru): "Analyst needs clarification:" + file contents.
                                    plane_add_comment(
                                        work_item_id,
                                        f"\u2753 Analyst \u043d\u0443\u0436\u0434\u0430\u0435\u0442\u0441\u044f \u0432 \u0443\u0442\u043e\u0447\u043d\u0435\u043d\u0438\u0438:\n\n{questions_text}"
                                    )
                                    from ..notifications import send_telegram
                                    # Telegram text (ru): "Analyst is asking questions. Answer in Plane."
                                    send_telegram(
                                        f"\u2753 {work_item_id}: Analyst \u0437\u0430\u0434\u0430\u0451\u0442 \u0432\u043e\u043f\u0440\u043e\u0441\u044b. \u041e\u0442\u0432\u0435\u0442\u044c \u0432 Plane."
                                    )
                                else:
                                    # No artifacts and no questions.
                                    # Comment text (ru): "Analyst finished with no artifacts
                                    # and no questions. Check the log."
                                    plane_add_comment(
                                        work_item_id,
                                        "\u26a0\ufe0f Analyst \u0437\u0430\u0432\u0435\u0440\u0448\u0438\u043b\u0441\u044f \u0431\u0435\u0437 \u0430\u0440\u0442\u0435\u0444\u0430\u043a\u0442\u043e\u0432 \u0438 \u0431\u0435\u0437 \u0432\u043e\u043f\u0440\u043e\u0441\u043e\u0432. \u041f\u0440\u043e\u0432\u0435\u0440\u044c\u0442\u0435 \u043b\u043e\u0433."
                                    )
                    # The approval gate never auto-advances; stop here in every case.
                    return
                elif qg_name in ("check_ci_green", "check_tests_local"):
                    # (repo, branch) signature — already worktree-aware.
                    passed, reason = check_fn(repo, branch)
                elif qg_name == "check_tests_passed":
                    # Artifact check — pass branch so it reads from the worktree.
                    passed, reason = check_fn(repo, work_item_id or "", branch)
                else:
                    # Other artifact checks (check_architecture_done, etc.) — worktree-aware.
                    # NOTE(review): this call is identical to the check_tests_passed branch above.
                    passed, reason = check_fn(repo, work_item_id or "", branch)

                if not passed:
                    logger.info(f"Task {task_id}: QG '{qg_name}' not passed after {agent}: {reason}")
                    # If reviewer says REQUEST_CHANGES, rollback to development
                    if agent == "reviewer" and "REQUEST_CHANGES" in reason:
                        update_task_stage(task_id, "development")
                        notify_stage_change(task_id, current_stage, "development")
                        plane_notify_stage(work_item_id, current_stage, "development")
                        # Count retries: every developer run recorded for this task counts.
                        conn2 = get_db()
                        retry_count = conn2.execute(
                            "SELECT COUNT(*) FROM agent_runs WHERE task_id=? AND agent='developer'",
                            (task_id,)
                        ).fetchone()[0]
                        conn2.close()
                        if retry_count < 3:
                            task_desc = (
                                f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
                                f"Stage: development\nNote: REQUEST_CHANGES from reviewer "
                                f"(attempt {retry_count+1}/3). Fix findings in "
                                f"docs/work-items/{work_item_id}/12-review.md"
                            )
                            new_job = enqueue_job("developer", repo, task_desc, task_id=task_id)
                            logger.info(f"Task {task_id}: reviewer REQUEST_CHANGES, enqueued developer (job_id={new_job})")
                        else:
                            # Retry budget used up: alert a human instead of enqueuing again.
                            from ..notifications import send_telegram
                            send_telegram(f"\u26a0\ufe0f {work_item_id}: Max developer retries (3) reached. Manual intervention needed.")
                            logger.error(f"Task {task_id}: max retries reached")

                    # Task 6: Tester FAIL -> rollback to development
                    # (`not passed` is always true here; this block sits inside `if not passed`.)
                    if agent == "tester" and qg_name == "check_tests_passed" and not passed:
                        update_task_stage(task_id, "development")
                        notify_stage_change(task_id, current_stage, "development")
                        plane_notify_stage(work_item_id, current_stage, "development")
                        from ..plane_sync import set_issue_in_progress
                        set_issue_in_progress(work_item_id)
                        # Comment text (ru): "Tests did not pass: <reason>. Developer
                        # restarted for the fix."
                        plane_add_comment(
                            work_item_id,
                            f"\u274c \u0422\u0435\u0441\u0442\u044b \u043d\u0435 \u043f\u0440\u043e\u0448\u043b\u0438: {reason}. Developer \u043f\u0435\u0440\u0435\u0437\u0430\u043f\u0443\u0449\u0435\u043d \u0434\u043b\u044f \u0444\u0438\u043a\u0441\u0430."
                        )
                        # Same retry accounting as the reviewer path above.
                        conn2 = get_db()
                        retry_count = conn2.execute(
                            "SELECT COUNT(*) FROM agent_runs WHERE task_id=? AND agent='developer'",
                            (task_id,)
                        ).fetchone()[0]
                        conn2.close()
                        if retry_count < 3:
                            task_desc = (
                                f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
                                f"Stage: development\nNote: Tests FAILED. "
                                f"Fix failures described in docs/work-items/{work_item_id}/13-test-report.md"
                            )
                            new_job = enqueue_job("developer", repo, task_desc, task_id=task_id)
                            logger.info(f"Task {task_id}: tester FAIL, enqueued developer (job_id={new_job})")
                        else:
                            # Retry budget used up: mark the issue blocked and alert a human.
                            from ..notifications import send_telegram
                            from ..plane_sync import set_issue_blocked
                            set_issue_blocked(work_item_id)
                            send_telegram(f"\U0001f6a8 {work_item_id}: Tests still failing after 3 developer retries. Manual intervention needed.")

                    # Task 8: Architect conflict -> rollback to analysis
                    # Only when the architect left a 10-conflict.md file in the worktree.
                    if agent == "architect" and qg_name == "check_architecture_done" and not passed:
                        import os as _os
                        conflict_path = _os.path.join(
                            get_worktree_path(repo, branch),
                            f"docs/work-items/{work_item_id}/10-conflict.md"
                        )
                        if _os.path.isfile(conflict_path):
                            update_task_stage(task_id, "analysis")
                            notify_stage_change(task_id, current_stage, "analysis")
                            plane_notify_stage(work_item_id, current_stage, "analysis")
                            from ..plane_sync import set_issue_in_progress
                            set_issue_in_progress(work_item_id)
                            # Only the first 500 characters of the conflict file are quoted.
                            with open(conflict_path, "r") as cf:
                                conflict_text = cf.read()[:500]
                            # Comment text (ru): "Architect found a conflict with the spec.
                            # Returning to Analysis." + excerpt.
                            plane_add_comment(
                                work_item_id,
                                f"\u26a0\ufe0f Architect \u043d\u0430\u0448\u0451\u043b \u043a\u043e\u043d\u0444\u043b\u0438\u043a\u0442 \u0441 \u0422\u0417. \u0412\u043e\u0437\u0432\u0440\u0430\u0442 \u0432 Analysis.\n\n{conflict_text}"
                            )
                            task_desc = (
                                f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\n"
                                f"Stage: analysis\nNote: Architect conflict. Revise TRZ. "
                                f"See docs/work-items/{work_item_id}/10-conflict.md"
                            )
                            new_job = enqueue_job("analyst", repo, task_desc, task_id=task_id)
                            logger.info(f"Task {task_id}: architect conflict, enqueued analyst (job_id={new_job})")
                            return

                    # Gate failed: never advance, whether or not a rollback happened.
                    return
            elif qg_name:
                # A gate is named for this stage but has no entry in QG_CHECKS:
                # do not advance.
                return

            # Advance stage (gate passed, or the stage has no gate).
            update_task_stage(task_id, next_stage)
            notify_stage_change(task_id, current_stage, next_stage)
            plane_notify_stage(work_item_id, current_stage, next_stage)
            logger.info(f"Task {task_id}: {current_stage} -> {next_stage} (auto-advance after {agent})")

            # Launch next agent if defined
            next_agent = get_agent_for_stage(next_stage)
            if next_agent:
                task_desc = f"Work item: {work_item_id}\nRepo: {repo}\nBranch: {branch}\nStage: {next_stage}"
                new_job_id = enqueue_job(next_agent, repo, task_desc, task_id=task_id)
                logger.info(f"Task {task_id}: enqueued '{next_agent}' (job_id={new_job_id})")

        except Exception as e:
            logger.error(f"Auto-advance failed for run_id={run_id}: {e}")


    def _ensure_pr(self, repo: str, branch: str, run_id: int):
        """Return the number of the open PR whose head is ``branch``, creating it if needed.

        Queries the Gitea API (``settings.gitea_url``, owner ``settings.gitea_owner``)
        for open pull requests in ``repo``. If none of them has ``branch`` as its
        head, a new PR against ``main`` is opened with the title
        ``feat: <last path segment of branch>``.

        Args:
            repo: Repository name (without owner).
            branch: Head branch of the pull request.
            run_id: Run id, only embedded in the body of a newly created PR.

        Returns:
            The PR number, or ``None`` if any HTTP call failed. Errors are logged
            and never propagated to the caller.
        """
        import httpx
        owner = settings.gitea_owner
        headers = {"Authorization": f"token {settings.gitea_token}"}
        pulls_url = f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}/pulls"
        try:
            resp = httpx.get(
                pulls_url,
                params={"state": "open", "head": branch},
                headers=headers, timeout=10
            )
            resp.raise_for_status()
            # Do not rely on the server applying the ``head`` query filter: if it
            # is ignored, the response holds *all* open PRs and the first entry
            # may belong to another branch. Match on the head ref explicitly.
            # NOTE(review): Gitea's list-pulls endpoint does not appear to
            # document a ``head`` filter -- confirm against the deployed version.
            for pr in resp.json():
                if (pr.get("head") or {}).get("ref") == branch:
                    return pr["number"]

            # No PR for this branch yet: create one. The title uses only the
            # last path segment of the branch name (e.g. "feature/x" -> "x").
            title = branch.rsplit("/", 1)[-1]
            resp = httpx.post(
                pulls_url,
                json={"title": f"feat: {title}", "head": branch, "base": "main",
                      "body": f"Auto-created by orchestrator after developer run_id={run_id}"},
                headers=headers, timeout=10
            )
            resp.raise_for_status()
            pr_number = resp.json()["number"]
            logger.info(f"Created PR #{pr_number} for {branch}")
            return pr_number
        except Exception as e:
            # Best-effort: callers treat None as "no PR available".
            logger.error(f"Failed to create PR for {branch}: {e}")
            return None

    def _auto_merge_pr(self, repo: str, branch: str, task_id: int, work_item_id: str):
        """Merge the open PR for ``branch`` and, on success, move the task to ``done``.

        Finds the open pull request whose head is ``branch`` (creating one via
        ``_ensure_pr`` when none exists) and asks Gitea to merge it. On a 200/204
        response the task stage is set to ``done``, stage-change notifications
        are sent (``deploy`` -> ``done``) and a Telegram message is posted. On
        any other status a Telegram warning asks for a manual merge.

        Args:
            repo: Repository name (without owner).
            branch: Head branch of the pull request to merge.
            task_id: Task whose stage is advanced after a successful merge.
            work_item_id: Work item identifier used in notifications.

        Returns:
            ``True`` if the PR was merged, ``False`` otherwise (no PR could be
            found or created, the merge was rejected, or an exception occurred;
            exceptions are logged, not raised).
        """
        import httpx
        owner = settings.gitea_owner
        headers = {"Authorization": f"token {settings.gitea_token}"}
        pulls_url = f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}/pulls"
        try:
            from ..notifications import send_telegram
            resp = httpx.get(
                pulls_url,
                params={"state": "open", "head": branch},
                headers=headers, timeout=10
            )
            resp.raise_for_status()
            # Select the PR by its head ref instead of blindly taking the first
            # entry: if the server ignores the ``head`` query filter, the first
            # open PR may belong to a different branch and must not be merged.
            # NOTE(review): Gitea's list-pulls endpoint does not appear to
            # document a ``head`` filter -- confirm against the deployed version.
            pr_number = next(
                (pr["number"] for pr in resp.json()
                 if (pr.get("head") or {}).get("ref") == branch),
                None,
            )
            if pr_number is None:
                # No PR for this branch yet; run_id=0 marks "not tied to a run".
                pr_number = self._ensure_pr(repo, branch, 0)
                if not pr_number:
                    return False

            resp = httpx.post(
                f"{pulls_url}/{pr_number}/merge",
                json={"Do": "merge"},
                headers=headers, timeout=30
            )
            if resp.status_code not in (200, 204):
                logger.error(f"Merge failed for PR #{pr_number}: {resp.status_code} {resp.text}")
                send_telegram(f"\u26a0\ufe0f {work_item_id}: Auto-merge failed (HTTP {resp.status_code}). Manual merge needed.")
                return False

            logger.info(f"PR #{pr_number} merged for {branch}")
            update_task_stage(task_id, "done")
            notify_stage_change(task_id, "deploy", "done")
            plane_notify_stage(work_item_id, "deploy", "done")
            send_telegram(f"\u2705 {work_item_id}: PR #{pr_number} merged! deploy -> done. Task complete.")
            return True
        except Exception as e:
            logger.error(f"Auto-merge failed for {branch}: {e}")
            return False

    def _write_task_file(self, repo: str, branch: str, task_file: str, content: str):
        """Write task file directly into the task's worktree.

        B-1 fix: no docker (direct open()). ORCH-2/S-4: the target is the per-branch
        worktree (/repos/_wt/<repo>/<branch>), not the shared /repos/<repo>, so the
        agent reads the task assignment from its own isolated working copy.
        Raise on failure instead of silently swallowing errors.

        Args:
            repo: Repository name, used to resolve the worktree path.
            branch: Branch name, used to resolve the worktree path.
            task_file: File name relative to the worktree root.
            content: Text to write (UTF-8); any existing file is overwritten.

        Raises:
            RuntimeError: If the file cannot be written. The underlying
                ``OSError`` is attached as ``__cause__``.
        """
        work_path = get_worktree_path(repo, branch)  # /repos/_wt/<repo>/<branch>
        full_path = os.path.join(work_path, task_file)
        try:
            with open(full_path, "w", encoding="utf-8") as f:
                f.write(content)
            logger.info(f"Task file written: {full_path} ({len(content)} bytes)")
        except OSError as e:
            logger.error(f"Failed to write task file {full_path}: {e}")
            # Chain explicitly so the original OSError (errno, path) stays
            # visible in tracebacks as the direct cause.
            raise RuntimeError(f"Failed to write task file: {e}") from e


# Module-level AgentLauncher instance, created when this module is imported.
launcher = AgentLauncher()