orchestrator/src/agents/launcher.py

import subprocess
import os
import json
import logging
import re
import threading
import signal
import time
from ..config import settings
from ..db import get_db, get_task_by_repo_branch, update_task_stage, enqueue_job
from ..stages import get_next_stage, get_qg_for_stage, get_agent_for_stage
from ..git_worktree import ensure_worktree, get_worktree_path
from ..qg.checks import QG_CHECKS
from ..notifications import notify_stage_change, notify_qg_failure, notify_agent_started, notify_agent_finished, notify_approve_requested
from ..plane_sync import notify_stage_change as plane_notify_stage, add_comment as plane_add_comment

logger = logging.getLogger("orchestrator.launcher")

# ORCH-41: valid --effort values accepted by the Claude CLI. An effort that is
# not in this set is treated as misconfiguration: logged and dropped (no flag),
# never passed through to the CLI.
VALID_EFFORTS = frozenset({"low", "medium", "high", "xhigh", "max"})

# ORCH-074 (G2): structural validity check for a Claude CLI model name. We use a
# FORMAT check (^claude-…$), not a static allowlist, on purpose: an allowlist
# recreates the exact rot we kill in G1 — it silently drops a CORRECT newer model
# (e.g. claude-opus-4-9) the day Anthropic ships it (never-break working against
# the operator). The final authority on whether a model exists is the Claude CLI
# itself, not our code; a format check is forward-compatible (new versions pass
# without code edits) while still catching the real failure classes: another
# provider (gpt-4), empty/whitespace, garbage chars, wrong prefix (claud-opus-typo).
# The claude- prefix is hardcoded here because the orchestrator is bound to the
# Claude CLI (CLAUDE_BIN); the canonical model VERSION lives ONLY in
# settings.agent_model_default, never here. See ADR-001 (ORCH-074).
_MODEL_NAME_RE = re.compile(r"^claude-[a-z0-9.-]+$")


def is_valid_model(name: str) -> bool:
    """ORCH-074 (G2): True iff ``name`` is a structurally valid Claude model name.

    A valid name, after ``strip()``, is non-empty, starts with ``claude-`` and
    contains only lowercase letters, digits, dots and dashes. Anything else
    (empty/whitespace, another provider like ``gpt-4``, a wrong prefix, illegal
    characters, or a value that is not a string at all) is invalid. This is the
    single predicate used by BOTH ``resolve_agent_model`` and the inline
    ``--fallback-model`` read in ``_spawn`` so a typo can never reach the CLI
    (never-break). It is a structural guard, not a registry of existing models —
    a structurally valid typo (``claude-opus-typo``) is left for the CLI to
    reject.

    Args:
        name: candidate model name; surrounding whitespace is ignored for the
            check (the caller is responsible for stripping before use).

    Returns:
        True when the stripped name matches ``^claude-[a-z0-9.-]+$``.

    Never raises.
    """
    # A non-string (None, or a number/list coming from a hand-edited JSON
    # override) can never be a model name. Reject it up front: calling
    # ``.strip()`` on it would raise AttributeError and break the never-raises
    # contract above.
    if not isinstance(name, str):
        return False
    # fullmatch (rather than match + ``$``) guarantees the WHOLE stripped string
    # is the model name; an empty string simply fails to match.
    return bool(_MODEL_NAME_RE.fullmatch(name.strip()))

# ORCH-061: action stages whose success is an ACTION (restart/retag), not a src
# edit — so "no changes to commit" is EXPECTED there, not under-delivery (FR-3).
_ACTION_STAGES = frozenset({"deploy-staging", "deploy"})


def action_stage_no_changes_note(stage, repo) -> str | None:
    """ORCH-061 (FR-3 / FR-7): observability for an empty diff on an action stage.

    The ``deploy-staging`` / ``deploy`` stages are actions (restart / retag), not
    code edits, so the post-run "no changes to commit" is the NORMAL case there —
    advancement is decided by the agent exit-code + the staging/deploy gate verdict,
    NEVER by the presence of a commit (FR-3 / AC-4). This is a PURE decision used
    only to emit an explicit log line distinguishing an expected action-stage no-op
    from a code-stage no-op; it has no effect on stage advancement.

    Returns an explicit note string when the empty diff is expected (an action
    stage of a self-deploy repo), else ``None``. Never raises.
    """
    try:
        if stage in _ACTION_STAGES:
            from ..self_deploy import self_deploy_applies
            if self_deploy_applies(repo):
                return f"{stage}: no code changes (expected on action stage)"
        return None
    except Exception:  # noqa: BLE001 - observability only, never raise
        return None


def _resolve_agent_attr(agent, project_id, project_map_attr, env_attr_prefix,
                        default_attr):
    """ORCH-41 shared resolver with priority:
      1. ProjectConfig.<project_map_attr>[agent]  (per-project override)
      2. settings.<env_attr_prefix><agent>        (per-agent env, if non-empty)
      3. settings.<default_attr>                  (global default)
      4. ""                                       (no flag -> CLI default)

    project_id is the Plane project uuid. It is resolved to a ProjectConfig via
    the registry; an unknown / empty id simply skips level 1. A missing per-agent
    settings attribute (e.g. unknown agent name) skips level 2.

    Args:
        agent: agent role name used as the lookup key at levels 1 and 2.
        project_id: Plane project uuid, or a falsy value to skip level 1.
        project_map_attr: name of the per-agent mapping attribute on the
            project config (e.g. ``"agent_efforts"``).
        env_attr_prefix: prefix of the per-agent settings attribute.
        default_attr: name of the global-default settings attribute.

    Returns:
        The first non-empty value found, or ``""`` when every level is empty.
    """
    # Level 1: per-project override.
    if project_id:
        from ..projects import get_project_by_plane_id
        proj = get_project_by_plane_id(project_id)
        if proj is not None:
            # ``or {}`` covers a project whose mapping attribute exists but is
            # None (e.g. an explicit null in the project JSON): the getattr
            # default only applies when the attribute is MISSING, so without
            # this guard ``None.get`` would raise and break resolution.
            overrides = getattr(proj, project_map_attr, None) or {}
            override = overrides.get(agent)
            if override:
                return override

    # Level 2: per-agent env (settings.<prefix><agent>), if defined & non-empty.
    per_agent = getattr(settings, f"{env_attr_prefix}{agent}", "")
    if per_agent:
        return per_agent

    # Level 3: global default.
    default = getattr(settings, default_attr, "")
    if default:
        return default

    # Level 4: nothing -> CLI default.
    return ""


def _agent_model_candidates(agent: str, project_id: str = None):
    """Yield non-empty model candidates in ORCH-41 priority order.

    Same priority as _resolve_agent_attr (project-override > per-agent env >
    global default), but as a generator so resolve_agent_model can validate each
    level and SKIP an invalid one (ORCH-074 G2) instead of returning the first
    non-empty value blindly. Empty levels are simply not yielded.

    Args:
        agent: agent role name used as the lookup key.
        project_id: Plane project uuid, or a falsy value to skip the
            per-project level.

    Yields:
        Each non-empty candidate value, highest priority first. Values are NOT
        validated here.
    """
    if project_id:
        from ..projects import get_project_by_plane_id
        proj = get_project_by_plane_id(project_id)
        if proj is not None:
            # ``or {}`` covers a project whose ``agent_models`` attribute exists
            # but is None; the getattr default only kicks in when the attribute
            # is missing, so ``None.get`` would otherwise raise mid-resolution.
            models = getattr(proj, "agent_models", None) or {}
            override = models.get(agent)
            if override:
                yield override
    per_agent = getattr(settings, f"agent_model_{agent}", "")
    if per_agent:
        yield per_agent
    default = getattr(settings, "agent_model_default", "")
    if default:
        yield default


def resolve_agent_model(agent: str, project_id: str = None) -> str:
    """ORCH-41: resolve the LLM model for an agent (optionally per-project).

    ORCH-074 (G2): the resolved name is validated with is_valid_model BEFORE it is
    returned. An invalid (structurally garbage) value at any level is logged and
    SKIPPED — resolution falls through to the next valid level (project-override
    invalid -> per-agent env -> default); if no level yields a valid name the
    function returns "" so the caller omits --model and the CLI default applies.
    The ORCH-41 priority order and signature are unchanged; validation is layered
    on top.

    Args:
        agent: agent role name.
        project_id: Plane project uuid for per-project overrides, or None.

    Returns:
        The first structurally valid model name with surrounding whitespace
        removed, or ``""`` when no level yields a valid name.
    """
    for value in _agent_model_candidates(agent, project_id):
        if is_valid_model(value):
            # is_valid_model checks the STRIPPED form, so return that same form.
            # Returning the raw value would let surrounding whitespace (notably
            # a trailing newline from an env file) reach the shell command built
            # in _spawn, where a newline terminates the command early.
            return value.strip()
        logger.warning(
            f"Invalid model name '{value}' for agent '{agent}' "
            f"(expected '^claude-…'); skipping to next resolution level / CLI default"
        )
    return ""


def _agent_effort_floor(agent: str) -> str:
    """ORCH-081 (ORCH-52h): per-role non-empty floor for --effort resolution.

    Looks up the DECLARED class-level default of the ``agent_effort_<agent>``
    field on the Settings class. That default lives in the class body, so it is
    unaffected by the value currently loaded on the ``settings`` instance (for
    example one zeroed by an empty ``ORCH_AGENT_EFFORT_<ROLE>=`` env var).

    config.py stays the single source of truth: changing a role's declared
    default there changes the floor here with no second map to maintain
    (ADR-001).

    An agent without its own ``agent_effort_<agent>`` field falls back to the
    declared default of ``agent_effort_default``.

    Returns:
        The first non-empty declared default found, or ``""`` if neither field
        declares one.
    """
    declared = type(settings).model_fields
    # Role-specific field first, then the global default field.
    candidates = (
        declared.get(f"agent_effort_{agent}"),
        declared.get("agent_effort_default"),
    )
    for info in candidates:
        if info is None:
            continue
        if info.default:
            return info.default
    return ""


def resolve_agent_effort(agent: str, project_id: str = None) -> str:
    """ORCH-41: resolve the --effort level for an agent (optionally per-project).

    Resolution order (ORCH-081 / ADR-001 added the last level):
      1. project-override (projects_json.agent_efforts[agent])
      2. per-agent env    (settings.agent_effort_<agent>)
      3. global default   (settings.agent_effort_default)
      4. per-role FLOOR   (class-default of agent_effort_<agent>)

    Level 4 is consulted only when levels 1-3 all resolve to an empty value
    (the production bug: present-but-empty ``ORCH_AGENT_EFFORT_*=`` variables
    blanking every default).

    Because the floor replaces only an EMPTY result, it cannot hide a typo: an
    explicit bad value such as ``turbo`` is non-empty, bypasses the floor, fails
    the VALID_EFFORTS check below, and is logged and dropped.

    Returns:
        A member of VALID_EFFORTS, or ``""`` meaning "omit --effort".
    """
    resolved = _resolve_agent_attr(
        agent,
        project_id,
        project_map_attr="agent_efforts",
        env_attr_prefix="agent_effort_",
        default_attr="agent_effort_default",
    )
    # Empty after levels 1-3 -> use the declared per-role floor instead.
    effort = resolved if resolved else _agent_effort_floor(agent)

    # Accept an empty result (no flag) or any allowed value as-is.
    if not effort or effort in VALID_EFFORTS:
        return effort

    logger.warning(
        f"Invalid effort '{effort}' for agent '{agent}' "
        f"(allowed: {sorted(VALID_EFFORTS)}); omitting --effort"
    )
    return ""


def _run_log_path(run_id):
    """Return the path of the per-run agent log for ``run_id``.

    ORCH-087: the directory always comes from ``settings.runs_dir`` (never a
    hardcoded ``/app/data/runs``), so every caller agrees on the location and
    ``_spawn`` can still write logs on hosts where ``/app`` is not accessible.

    Returns:
        ``<settings.runs_dir>/<run_id>.log``.
    """
    log_name = f"{run_id}.log"
    return os.path.join(settings.runs_dir, log_name)


def prune_run_logs(runs_dir, keep_days=30, keep_max=500, active_paths=None):
    """L-2: best-effort rotation of per-run logs (<runs_dir>/*.log).

    Candidates are regular files named ``*.log`` directly inside ``runs_dir``;
    other files and subdirectories are never touched, and any file whose real
    path is listed in ``active_paths`` is always kept. Candidates are ranked
    newest-first by mtime, and one is deleted when it is older than
    ``keep_days`` days OR its rank falls outside the newest ``keep_max``.

    Args:
        runs_dir: directory holding the run logs.
        keep_days: maximum age in days before a log is deleted.
        keep_max: maximum number of (non-active) logs to retain.
        active_paths: optional iterable of log paths that must survive.

    Returns:
        Number of files actually removed. Never raises: every error is logged
        and swallowed so rotation cannot take the application down.
    """
    deleted = 0
    try:
        # Resolve protected paths once; fall back to the raw value if a path
        # cannot be resolved.
        protected = set()
        for candidate in (active_paths or ()):
            try:
                protected.add(os.path.realpath(candidate))
            except Exception:
                protected.add(candidate)

        if not os.path.isdir(runs_dir):
            return 0

        # Collect (path, mtime) for every eligible log file.
        entries = []
        for entry in os.listdir(runs_dir):
            if not entry.endswith(".log"):
                continue
            full = os.path.join(runs_dir, entry)
            if not os.path.isfile(full) or os.path.realpath(full) in protected:
                continue
            try:
                entries.append((full, os.path.getmtime(full)))
            except OSError:
                # Vanished or unreadable between listing and stat: skip it.
                continue

        # Newest first, so the rank doubles as the "how many are newer" count.
        entries.sort(key=lambda pair: pair[1], reverse=True)

        oldest_allowed = time.time() - keep_days * 86400
        for rank, (full, modified) in enumerate(entries):
            if rank < keep_max and modified >= oldest_allowed:
                continue  # recent enough and within the retention count
            try:
                os.remove(full)
            except OSError as e:
                logger.warning(f"prune_run_logs: failed to remove {full}: {e}")
            else:
                deleted += 1
    except Exception as e:
        logger.warning(f"prune_run_logs failed for {runs_dir}: {e}")
    return deleted


class AgentLauncher:
    """Launch Claude CLI agents directly (binary mounted into container).

    Each launch runs the CLI as a background subprocess inside a per-branch git
    worktree, records the run in the ``agent_runs`` table, and starts two daemon
    threads: a timeout watchdog and a monitor that waits for the process to
    finish.
    """

    # Per-role launch configuration, looked up by agent name in _spawn (an
    # unknown name raises ValueError there). For every role:
    #   system_prompt - file whose contents are passed via --system-prompt
    #   task_file     - file whose contents become the prompt; also the file
    #                   the task content is written to before the run
    #   allowed_tools - value passed verbatim to --allowedTools
    # Both paths are relative: _spawn reads them after `cd`-ing into the
    # agent's worktree.
    AGENT_CONFIGS = {
        "analyst": {
            "system_prompt": ".openclaw/agents/analyst.md",
            "task_file": ".task.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "architect": {
            "system_prompt": ".openclaw/agents/architect.md",
            "task_file": ".task-arch.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "developer": {
            "system_prompt": ".openclaw/agents/developer.md",
            "task_file": ".task-dev.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "reviewer": {
            "system_prompt": ".openclaw/agents/reviewer.md",
            "task_file": ".task-review.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "tester": {
            "system_prompt": ".openclaw/agents/tester.md",
            "task_file": ".task-test.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
        "deployer": {
            "task_file": ".task-deploy.md",
            "system_prompt": ".openclaw/agents/deployer.md",
            "allowed_tools": "Read,Write,Edit,Bash",
        },
    }

    # Path of the Claude CLI executable that _spawn invokes.
    CLAUDE_BIN = "/opt/claude-code/bin/claude.exe"
    # ORCH-7 (M-2): timeout is now configurable. AGENT_TIMEOUT stays as a
    # backward-compatible alias for the default; the actual value (and per-agent
    # overrides) live in settings and are resolved via _resolve_timeout().
    # NOTE: evaluated once, when the class body runs; later changes to
    # settings.agent_timeout_seconds are not reflected in this alias.
    AGENT_TIMEOUT = settings.agent_timeout_seconds

    def launch(self, agent: str, repo: str, task_content: str = None, task_id: int = None) -> int:
        """
        Launch a Claude CLI agent directly (legacy synchronous path).

        Kept for backward compatibility (direct callers / existing tests). The
        ORCH-1 job queue uses launch_job() instead, but both share _spawn().
        This path passes ``job_id=None``, so no ``jobs`` row is linked to or
        updated for the run.

        Args:
            agent: Agent role; must be a key of AGENT_CONFIGS (analyst,
                architect, developer, reviewer, tester, deployer)
            repo: Repository name (a directory under settings.repos_dir)
            task_content: Optional task content to write to task file
            task_id: Optional task ID to associate with this run

        Returns:
            agent_run_id from DB

        Raises:
            ValueError: if ``agent`` is not a known role.
            FileNotFoundError: if the repository directory does not exist.
        """
        # Thin wrapper: all the work happens in the shared _spawn().
        return self._spawn(agent, repo, task_content, task_id, job_id=None)

    def launch_job(self, job: dict) -> int:
        """ORCH-1: launch an agent for a claimed queue job.

        Uses the same spawn path as launch(), but passes ``job['id']`` along so
        the job row can be linked to the ``agent_runs`` row and have its status
        driven by the monitor (done / requeue / failed).

        Two reserved agent names are NOT LLM spawns and are intercepted before
        _spawn (which would reject them as "Unknown agent"):

        * ``deploy-finalizer`` (ORCH-036) — deterministic deploy finalizer.
        * ``post-deploy-monitor`` (ORCH-021) — one deterministic observation
          tick.

        Both run synchronously and manage the jobs row status themselves.

        Returns:
            The agent_run_id for a real agent launch; ``None`` for the two
            reserved deterministic jobs (they create no agent_runs row).
        """
        reserved_agent = job.get("agent")
        if reserved_agent == "deploy-finalizer":
            return self._run_deploy_finalizer_job(job)
        if reserved_agent == "post-deploy-monitor":
            return self._run_post_deploy_monitor_job(job)

        # Regular LLM agent: required keys are read with [] so a malformed job
        # fails loudly; optional ones default to None.
        agent_name = job["agent"]
        repo_name = job["repo"]
        content = job.get("task_content")
        task = job.get("task_id")
        return self._spawn(agent_name, repo_name, content, task, job_id=job["id"])

    def _run_deploy_finalizer_job(self, job: dict):
        """ORCH-036 Phase C: run the deterministic deploy finalizer for a job.

        Not an LLM spawn — there is no subprocess/monitor, so the jobs row is
        marked done/failed here. Any error is contained so a finalizer fault
        cannot wedge the worker.

        Args:
            job: the claimed queue job; its ``id`` identifies the jobs row.

        Returns:
            Always ``None`` (no agent_runs row is created).
        """
        from ..db import mark_job
        from .. import stage_engine

        # Read the id once with .get(): the error handler below formats it into
        # its log message, and a ``job['id']`` KeyError raised INSIDE the handler
        # would escape this method and defeat the containment guarantee.
        job_id = job.get("id")
        try:
            stage_engine.run_deploy_finalizer(job)
            mark_job(job_id, "done")
            logger.info(f"deploy-finalizer job {job_id} done")
        except Exception as e:
            logger.error(f"deploy-finalizer job {job_id} failed: {e}")
            try:
                mark_job(job_id, "failed", error=f"deploy-finalizer error: {e}")
            except Exception as mark_err:
                # Still best-effort (never re-raised), but no longer silent: a
                # job that could not be marked failed stays in its previous
                # state, and that must be visible in the logs.
                logger.error(
                    f"deploy-finalizer job {job_id}: could not mark job failed: {mark_err}"
                )
        return None

    def _run_post_deploy_monitor_job(self, job: dict):
        """ORCH-021: run one deterministic post-deploy monitor tick for a job.

        Not an LLM spawn — there is no subprocess/monitor, so the jobs row is
        marked done/failed here. Any error is contained so a monitor fault can
        never wedge the worker / starve other projects (AC-16).

        Args:
            job: the claimed queue job; its ``id`` identifies the jobs row.

        Returns:
            Always ``None`` (no agent_runs row is created).
        """
        from ..db import mark_job
        from .. import stage_engine

        # Read the id once with .get(): the error handler below formats it into
        # its log message, and a ``job['id']`` KeyError raised INSIDE the handler
        # would escape this method and defeat the containment guarantee.
        job_id = job.get("id")
        try:
            stage_engine.run_post_deploy_monitor(job)
            mark_job(job_id, "done")
            logger.info(f"post-deploy-monitor job {job_id} done")
        except Exception as e:
            logger.error(f"post-deploy-monitor job {job_id} failed: {e}")
            try:
                mark_job(job_id, "failed", error=f"post-deploy-monitor error: {e}")
            except Exception as mark_err:
                # Still best-effort (never re-raised), but no longer silent: a
                # job that could not be marked failed stays in its previous
                # state, and that must be visible in the logs.
                logger.error(
                    f"post-deploy-monitor job {job_id}: could not mark job failed: {mark_err}"
                )
        return None

    def _spawn(self, agent: str, repo: str, task_content: str = None,
               task_id: int = None, job_id: int = None) -> int:
        """Shared spawn implementation for launch() and launch_job().

        Resolves the agent config, the per-branch worktree and the
        model/effort/fallback flags, records an ``agent_runs`` row, starts the
        Claude CLI in the background with stdout/stderr redirected to the
        per-run log file, and hands the process over to the watchdog and monitor
        threads.

        When job_id is set, the jobs row is linked to the run (run_id,
        started_at, pid) so the monitor/watchdog can drive its status (ORCH-1).

        Args:
            agent: agent role; must be a key of AGENT_CONFIGS.
            repo: repository name (directory under settings.repos_dir).
            task_content: optional text written to the agent's task file.
            task_id: optional tasks.id; also selects the branch to work on.
            job_id: optional jobs.id to link to this run.

        Returns:
            The id of the new ``agent_runs`` row.

        Raises:
            ValueError: if ``agent`` is unknown.
            FileNotFoundError: if the repository directory does not exist.
        """
        # Function-scope import: only this method builds a shell command line.
        import shlex

        config = self.AGENT_CONFIGS.get(agent)
        if not config:
            raise ValueError(f"Unknown agent: {agent}")

        # Main clone lives at /repos/<repo>; the agent works in an isolated worktree
        # (ORCH-2 / S-4) so concurrent tasks never fight over a shared checkout.
        local_repo_path = os.path.join(settings.repos_dir, repo)
        if not os.path.isdir(local_repo_path):
            raise FileNotFoundError(f"Repo not found: {local_repo_path}")

        # Determine branch (needed before we touch the worktree / task file).
        _br_row = get_db().execute("SELECT branch FROM tasks WHERE id=?", (task_id,)).fetchone() if task_id else None
        agent_branch = _br_row[0] if _br_row else "main"

        # ORCH-41: resolve the Plane project uuid for this repo so per-project
        # model/effort overrides apply. Unknown repo -> None (env/default only).
        from ..projects import get_project_by_repo
        _proj = get_project_by_repo(repo)
        project_id = _proj.plane_project_id if _proj else None

        # Ensure the per-branch worktree exists and is on the right branch.
        work_path = ensure_worktree(repo, agent_branch)

        # Write task file if content provided (B-1: direct write; now into the worktree).
        if task_content:
            self._write_task_file(repo, agent_branch, config["task_file"], task_content)

        conn = get_db()
        # try/finally: the connection is released on EVERY exit path, including
        # a failed INSERT/UPDATE or a failed Popen, instead of only on success.
        try:
            # Record run in DB
            cursor = conn.execute(
                "INSERT INTO agent_runs (task_id, agent) VALUES (?, ?)",
                (task_id, agent),
            )
            run_id = cursor.lastrowid
            conn.commit()

            # ORCH-1: link this job to the agent_runs row and stamp started_at.
            if job_id is not None:
                conn.execute(
                    "UPDATE jobs SET run_id = ?, started_at = datetime('now') WHERE id = ?",
                    (run_id, job_id),
                )
                conn.commit()

            # Prepare output log path
            output_path = _run_log_path(run_id)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

            # Build the claude command
            task_file = config["task_file"]
            system_prompt = config["system_prompt"]
            allowed_tools = config["allowed_tools"]

            # ORCH-41: model + effort + optional fallback are resolved from config
            # (project-override > per-agent env > default), not hardcoded in AGENT_CONFIGS.
            # The model is validated on its stripped form, so strip it here too:
            # stray whitespace (a newline in particular) must not reach the shell.
            model = (resolve_agent_model(agent, project_id) or "").strip()
            effort = resolve_agent_effort(agent, project_id)
            # ORCH-087 (BR-EFF): stamp the REAL --effort value onto this agent_runs row
            # in the moment of launch. The CLI does not echo effort in its result JSON,
            # so this is the only reliable source for the tracker's "· model · effort"
            # line. Empty resolve (no --effort flag) -> NULL so the suffix is omitted.
            # Reuses the still-open conn; never blocks the launch.
            try:
                conn.execute(
                    "UPDATE agent_runs SET effort=? WHERE id=?",
                    (effort or None, run_id),
                )
                conn.commit()
            except Exception as e:
                logger.warning(f"effort stamp failed for run_id={run_id}: {e}")
            model_flag = f"--model {model} " if model else ""
            effort_flag = f"--effort {effort} " if effort else ""
            # ORCH-074 (G2): agent_fallback_model is read directly here, bypassing
            # resolve_agent_model, so the same validator must guard this point too —
            # otherwise a typo in ORCH_AGENT_FALLBACK_MODEL would slip into
            # --fallback-model (never-break violation). Empty value -> no flag.
            fb = settings.agent_fallback_model
            if fb and not is_valid_model(fb):
                logger.warning(
                    f"Invalid fallback model '{fb}'; dropping --fallback-model"
                )
                fb = ""
            # Same reasoning as for `model`: pass the stripped (validated) form.
            fb = fb.strip() if fb else ""
            fb_flag = f"--fallback-model {fb} " if fb else ""

            # No git fetch/checkout here: ensure_worktree() already put the worktree on
            # the right branch. The agent simply runs inside its isolated work_path.
            # Feature 4 (token usage): --output-format json makes claude emit a single
            # result JSON (with usage + total_cost_usd) at the end of stdout. The log
            # still captures it; _monitor_agent parses the trailing JSON after the run.
            #
            # Every path is passed through shlex.quote: the worktree path is derived
            # from the repo and branch names, and a space or shell metacharacter in
            # either would otherwise split or alter the command. Quoting leaves plain
            # paths unchanged. model/effort/fb need no quoting — they were validated
            # against a strict pattern / allowlist above.
            #
            # `exec` replaces the bash wrapper with the CLI process, so proc.pid is
            # the agent itself. Without it a SIGTERM/SIGKILL from the watchdog could
            # hit only the wrapper shell and leave the agent running as an orphan.
            cmd = (
                f'cd {shlex.quote(str(work_path))} && '
                f'exec {shlex.quote(self.CLAUDE_BIN)} --print '
                f'--output-format json '
                f'{model_flag}{effort_flag}{fb_flag}'
                f'"$(cat {shlex.quote(task_file)})" '
                f'--system-prompt "$(cat {shlex.quote(system_prompt)})" '
                f'--allowedTools {shlex.quote(allowed_tools)}'
            )

            logger.info(f"Launching agent '{agent}' for repo '{repo}', run_id={run_id}")

            # Launch as background process.
            # B-2 fix: redirect stdout/stderr straight to the log file at the OS level.
            # No PIPE in the orchestrator process -> no PIPE deadlock, no reader thread,
            # no zombies. log_fh is closed by _monitor_agent after proc.wait().
            log_fh = open(output_path, "w")
            try:
                proc = subprocess.Popen(
                    ["bash", "-c", cmd],
                    stdout=log_fh,
                    stderr=subprocess.STDOUT,
                    env={
                        **os.environ,
                        "HOME": "/home/slin",
                        "GIT_AUTHOR_NAME": "claude-bot",
                        "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local",
                        "GIT_COMMITTER_NAME": "claude-bot",
                        "GIT_COMMITTER_EMAIL": "claude-bot@mva154.local",
                    },
                )
            except Exception:
                # No process -> no monitor thread will ever close the handle;
                # close it here so a failed launch does not leak a descriptor.
                log_fh.close()
                raise

            # Update DB with output path
            conn.execute(
                "UPDATE agent_runs SET output_path = ? WHERE id = ?",
                (output_path, run_id),
            )
            # ORCH-065: stamp the agent process pid onto the job row so the job-reaper
            # can probe liveness (os.kill(pid, 0)). proc.pid only exists after Popen,
            # so this is a second UPDATE next to run_id/started_at (set above).
            if job_id is not None:
                conn.execute(
                    "UPDATE jobs SET pid = ? WHERE id = ?",
                    (proc.pid, job_id),
                )
            conn.commit()
        finally:
            conn.close()

        # Start timeout watchdog
        t = threading.Thread(
            target=self._watchdog,
            args=(proc.pid, run_id),
            kwargs={"job_id": job_id, "agent": agent},
            daemon=True,
        )
        t.start()

        # Start monitor thread (waits for completion, commits, pushes)
        # agent_branch already computed above
        m = threading.Thread(
            target=self._monitor_agent,
            args=(proc, run_id, agent, repo, agent_branch, output_path, log_fh),
            kwargs={"job_id": job_id},
            daemon=True,
        )
        m.start()

        logger.info(f"Agent '{agent}' launched, pid={proc.pid}, run_id={run_id}")
        notify_agent_started(run_id, agent, task_id)
        return run_id

    @staticmethod
    def _resolve_timeout(agent: str = None) -> int:
        """ORCH-7 (M-2): resolve the wall-clock timeout for an agent.

        A per-agent override from settings.agent_timeout_overrides_json (a JSON
        object like {"reviewer": 3600}) wins; otherwise the global default
        settings.agent_timeout_seconds is used. A malformed or unusable override
        is logged and ignored (falls back to the default), so a bad env never
        bricks runs.

        Args:
            agent: agent role to look up; None/empty skips the override lookup.

        Returns:
            Timeout in seconds: the positive per-agent override when one is
            configured and usable, else the global default.
        """
        default = settings.agent_timeout_seconds
        raw = (settings.agent_timeout_overrides_json or "").strip()
        if not (agent and raw):
            return default
        try:
            overrides = json.loads(raw)
            if isinstance(overrides, dict) and agent in overrides:
                seconds = int(overrides[agent])
                if seconds > 0:
                    return seconds
                # Zero would kill the agent immediately and a negative value
                # makes time.sleep() raise inside the watchdog thread, silently
                # disabling the timeout. Neither is a meaningful override.
                logger.warning(
                    f"Non-positive timeout override {seconds} for agent "
                    f"'{agent}', using default"
                )
        except (ValueError, TypeError, OverflowError) as e:
            # OverflowError: json accepts e.g. 1e400 (parsed as inf), and
            # int(inf) raises OverflowError, which is not a ValueError.
            logger.warning(f"Invalid agent_timeout_overrides_json, using default: {e}")
        return default

    def _watchdog(self, pid: int, run_id: int, timeout: int = None,
                  job_id: int = None, agent: str = None):
        """Kill agent if it exceeds its timeout.

        ORCH-1: on a timeout-kill the monitor's proc.wait() returns the kill exit
        code and drives the job retry/fail logic, so the watchdog itself only needs
        to terminate the process and record the agent_runs exit. job_id is accepted
        for symmetry.

        ORCH-7 (M-2): graceful shutdown. SIGTERM is sent first; the process gets
        up to settings.agent_kill_grace_seconds to exit on its own, and SIGKILL is
        sent only if it is still alive after the grace window.
        ProcessLookupError is tolerated at every step (the process may already be
        gone). The recorded exit_code is -9 regardless of which signal reaped it.

        Args:
            pid: process id of the agent process to supervise.
            run_id: agent_runs.id of the run, used for logging and bookkeeping.
            timeout: seconds to wait before terminating; resolved via
                _resolve_timeout(agent) when None.
            job_id: unused; accepted for symmetry with the monitor.
            agent: agent role, used only to resolve the timeout.
        """
        if timeout is None:
            timeout = self._resolve_timeout(agent)
        time.sleep(timeout)

        # PID-reuse guard. This thread sleeps for the whole timeout even when the
        # agent finished long ago; by now the OS may have handed `pid` to an
        # unrelated process. The monitor stamps agent_runs.finished_at as soon as
        # the process is reaped, so a stamped row means there is nothing left to
        # kill — signalling would hit a stranger and _record_kill would overwrite
        # the real exit code with -9. Best-effort: if the lookup itself fails we
        # fall through to the signal-based logic below.
        already_finished = False
        try:
            conn = get_db()
            try:
                row = conn.execute(
                    "SELECT finished_at FROM agent_runs WHERE id=?", (run_id,)
                ).fetchone()
            finally:
                conn.close()
            already_finished = row is not None and row[0] is not None
        except Exception as e:
            logger.warning(
                f"Agent run_id={run_id}: could not check run state before timeout kill: {e}"
            )
        if already_finished:
            logger.info(
                f"Agent run_id={run_id} finished before its {timeout}s timeout; "
                f"watchdog exiting without signalling pid={pid}"
            )
            return

        # Phase 1: SIGTERM (graceful). If the process is already gone, we're done.
        try:
            os.kill(pid, signal.SIGTERM)
            logger.warning(
                f"Agent run_id={run_id} exceeded {timeout}s timeout: sent SIGTERM "
                f"(pid={pid}), grace={settings.agent_kill_grace_seconds}s"
            )
        except ProcessLookupError:
            logger.info(f"Agent run_id={run_id} already exited before SIGTERM")
            return  # nothing to record: the monitor's proc.wait() owns the exit

        # Phase 2: poll for graceful exit within the grace window.
        grace = settings.agent_kill_grace_seconds
        poll_interval = 0.5
        waited = 0.0
        while waited < grace:
            time.sleep(poll_interval)
            waited += poll_interval
            try:
                os.kill(pid, 0)  # signal 0 = liveness probe, does not kill
            except ProcessLookupError:
                logger.info(
                    f"Agent run_id={run_id} exited gracefully after SIGTERM "
                    f"({waited:.1f}s); no SIGKILL needed"
                )
                self._record_kill(run_id)
                return

        # Phase 3: still alive -> hard SIGKILL.
        try:
            os.kill(pid, signal.SIGKILL)
            logger.warning(
                f"Agent run_id={run_id} did not exit within {grace}s grace: sent SIGKILL"
            )
        except ProcessLookupError:
            logger.info(f"Agent run_id={run_id} exited just before SIGKILL")
        self._record_kill(run_id)

    @staticmethod
    def _record_kill(run_id: int):
        """Stamp the agent_runs row as timeout-killed (exit_code=-9).

        ORCH-1: -9 is the existing kill-exit contract the monitor/retry logic keys
        off, so we keep it stable whether the reap came from SIGTERM or SIGKILL.

        Args:
            run_id: agent_runs.id of the run that was terminated.
        """
        conn = get_db()
        # try/finally so the connection is released even when the UPDATE or the
        # commit raises (e.g. a locked database); the error still propagates.
        try:
            conn.execute(
                "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=-9 WHERE id=?",
                (run_id,),
            )
            conn.commit()
        finally:
            conn.close()

    def _monitor_agent(self, proc, run_id, agent, repo, branch, output_path=None, log_fh=None, job_id=None):
        """Wait for agent to finish, commit+push results, update DB.

        B-2 fix: stdout already goes straight to the log file via Popen, so we just
        block on proc.wait() (guaranteed reap -> no zombie, real exit_code) and then
        close the log file handle. No PIPE, no select loop, no startup timeout here
        (the watchdog still enforces the overall AGENT_TIMEOUT by pid).

        After the process exits, the following steps run in this order:
          1. close ``log_fh`` (errors ignored);
          2. stamp ``finished_at`` / ``exit_code`` on the ``agent_runs`` row;
          3. send the "agent finished" notification;
          4. parse and record token usage from the run log (never fatal);
          5. stage, commit and push changes from the per-branch worktree, and
             ensure a PR exists when the developer's push succeeded;
          6. on a deployer failure: move the task back to "development", mark
             the issue blocked and notify;
          7. on any other agent's failure: send a Telegram warning;
          8. on success: post the status comment and try to advance the stage;
          9. when ``job_id`` is given: update the queue job via ``_finalize_job``.

        Args:
            proc: process handle; only ``proc.wait()`` is used here.
            run_id: id of the ``agent_runs`` row for this run.
            agent: agent name (e.g. "developer", "deployer"); drives which
                paths are staged and which failure branch is taken.
            repo: repository name used to resolve the worktree and the task row.
            branch: branch name used to resolve the worktree and the task row.
            output_path: path of the run log; used for usage parsing and passed
                on to ``_finalize_job``. May be None.
            log_fh: open log file handle to close once the process has exited.
            job_id: queue job id, or None for a legacy direct launch.
        """
        import time as _time
        # NOTE: the duration is measured from the moment this monitor starts
        # waiting, not from the moment the process was spawned.
        _start_ts = _time.time()

        exit_code = proc.wait()
        if log_fh is not None:
            try:
                log_fh.close()
            except Exception:
                # Closing the log handle is best-effort; a failure here must not
                # prevent the DB update and post-run steps below.
                pass
        _duration_s = int(_time.time() - _start_ts)
        logger.info(f"Agent run_id={run_id} ({agent}) finished with exit_code={exit_code}")

        # Update DB: record the finish time and the real exit code of the run.
        conn = get_db()
        conn.execute(
            "UPDATE agent_runs SET finished_at=datetime('now'), exit_code=? WHERE id=?",
            (exit_code, run_id),
        )
        conn.commit()

        # Get task_id for notification (None when the run row is not found).
        _row = conn.execute("SELECT task_id FROM agent_runs WHERE id=?", (run_id,)).fetchone()
        _task_id = _row[0] if _row else None
        conn.close()

        notify_agent_finished(run_id, agent, exit_code, task_id=_task_id, duration_s=_duration_s)

        # Feature 4: parse token usage / cost from the (json) run log and record
        # it on the agent_runs row. Never fatal — a garbled/missing JSON records
        # NULLs and logs a warning so a broken run can't crash the monitor.
        try:
            from ..usage import parse_usage_from_log, record_usage
            _usage = parse_usage_from_log(output_path) if output_path else None
            record_usage(run_id, _usage)
        except Exception as e:
            logger.warning(f"run_id={run_id}: usage accounting failed: {e}")
            # Keep _usage defined for the status comment posted further down.
            _usage = None

        # Commit and push any changes — in the per-branch worktree (ORCH-2 / S-4),
        # NOT in the shared /repos/<repo>. The worktree is already on `branch`
        # (ensure_worktree did the checkout), so no checkout is needed here.
        # This runs regardless of exit_code; any exception in the git sequence is
        # logged and does not stop the remaining post-run steps.
        repo_path = get_worktree_path(repo, branch)
        try:
            # Fixed bot identity for both author and committer of the auto-commit.
            git_env = {
                **os.environ,
                "HOME": "/home/slin",
                "GIT_AUTHOR_NAME": "claude-bot",
                "GIT_AUTHOR_EMAIL": "claude-bot@mva154.local",
                "GIT_COMMITTER_NAME": "claude-bot",
                "GIT_COMMITTER_EMAIL": "claude-bot@mva154.local",
            }
            # Non-empty porcelain output means the worktree has changes.
            result = subprocess.run(
                ["git", "-C", repo_path, "status", "--porcelain"],
                capture_output=True, text=True, timeout=10, env=git_env
            )
            if result.stdout.strip():
                # Add docs/ always
                subprocess.run(
                    ["git", "-C", repo_path, "add", "docs/"],
                    capture_output=True, text=True, timeout=10, env=git_env
                )
                # Add src/ and tests/ for developer
                # (changes outside these paths are never staged by this method).
                if agent == "developer":
                    subprocess.run(
                        ["git", "-C", repo_path, "add", "src/", "tests/"],
                        capture_output=True, text=True, timeout=10, env=git_env
                    )
                # Commit; a non-zero return code is only logged as a warning.
                commit_result = subprocess.run(
                    ["git", "-C", repo_path, "commit", "-m",
                     f"{agent}(ET): auto-commit from {agent} run_id={run_id}"],
                    capture_output=True, text=True, timeout=30, env=git_env
                )
                if commit_result.returncode == 0:
                    push_result = subprocess.run(
                        ["git", "-C", repo_path, "push", "origin", branch],
                        capture_output=True, text=True, timeout=60, env=git_env
                    )
                    if push_result.returncode == 0:
                        logger.info(f"Agent run_id={run_id}: committed and pushed to {branch}")
                        # Auto-create PR after developer pushes
                        if agent == "developer":
                            self._ensure_pr(repo, branch, run_id)
                    else:
                        logger.error(f"Agent run_id={run_id}: push failed: {push_result.stderr}")
                else:
                    logger.warning(f"Agent run_id={run_id}: commit failed: {commit_result.stderr}")
            else:
                logger.info(f"Agent run_id={run_id}: no changes to commit")
                # ORCH-061: on a self-deploy action stage (deploy-staging/deploy)
                # an empty diff is EXPECTED (action, not a src edit). Emit an
                # explicit observability line so an operator can tell this apart
                # from a code-stage no-op. Does NOT affect advancement (decided by
                # exit-code + gate verdict, never by a commit existing).
                try:
                    _t = get_task_by_repo_branch(repo, branch)
                    _stage = _t["stage"] if _t else None
                    _note = action_stage_no_changes_note(_stage, repo)
                    if _note:
                        logger.info(f"Agent run_id={run_id}: {_note}")
                except Exception as _e:
                    logger.debug(
                        f"Agent run_id={run_id}: action-stage no-changes note "
                        f"skipped: {_e}"
                    )
        except Exception as e:
            logger.error(f"Agent run_id={run_id}: post-run git failed: {e}")

        # Handle deployer failure (smoke/healthcheck failed) — Task 7
        # The task is sent back to the "development" stage, the issue is marked
        # blocked, and both the issue tracker and Telegram are notified.
        if exit_code != 0 and agent == "deployer":
            conn = get_db()
            task_row = conn.execute(
                "SELECT id, work_item_id FROM tasks WHERE repo=? AND branch=?",
                (repo, branch),
            ).fetchone()
            conn.close()
            if task_row:
                _tid, _wid = task_row
                update_task_stage(_tid, "development")
                notify_stage_change(_tid, "deploy", "development")
                plane_notify_stage(_wid, "deploy", "development")
                from ..plane_sync import set_issue_blocked
                set_issue_blocked(_wid)
                plane_add_comment(
                    _wid,
                    "\u274c Deploy FAILED (smoke/healthcheck). Rolled back. Developer \u043d\u0443\u0436\u0435\u043d \u0434\u043b\u044f \u0444\u0438\u043a\u0441\u0430.",
                    author="deployer",
                )
                from ..notifications import send_telegram, link_for
                send_telegram(f"\U0001f6a8 {link_for(_wid)}: Deploy failed! Rolled back. Needs fix.")

        # Notify on ANY non-zero exit of a non-deployer agent (this includes, but
        # is not limited to, kills such as -9 / 137). The `not in (None,)` check
        # only guards against a None exit code.
        if exit_code != 0 and exit_code not in (None,):
            conn = get_db()
            task_row = conn.execute(
                "SELECT id, work_item_id FROM tasks WHERE repo=? AND branch=?",
                (repo, branch),
            ).fetchone()
            conn.close()
            if task_row and agent != "deployer":  # deployer handled above
                _tid, _wid = task_row
                from ..notifications import send_telegram, link_for
                send_telegram(f"\u26a0\ufe0f {link_for(_wid, _tid)}: Agent {agent} failed (exit_code={exit_code}). Check logs: {_run_log_path(run_id)}")

        # Feature 4 + ORCH-016: post the unified per-agent status comment under
        # that agent's bot, threading the wall-clock duration we just measured
        # straight through (ADR-001 §6: explicit param wins over DB fallback).
        # The deployer finishing the task also posts the per-task usage summary.
        if exit_code == 0:
            try:
                self._post_usage_comments(
                    run_id, agent, repo, branch, _usage, duration_s=_duration_s
                )
            except Exception as e:
                logger.warning(f"run_id={run_id}: usage comment failed: {e}")

        # Auto-advance stage if agent finished successfully and QG passes
        if exit_code == 0:
            self._try_advance_stage(run_id, agent, repo, branch)

        # ORCH-1: drive the job-queue status for queue-launched jobs only.
        # (Legacy direct launch() has job_id=None and is unaffected.)
        if job_id is not None:
            self._finalize_job(job_id, agent, run_id, exit_code, output_path=output_path)

    def _backoff_seconds(self, transient_attempts: int, retry_after: int = None) -> int:
        """Compute the delay (seconds) before retrying a transient failure.

        The delay doubles with every transient attempt, starting from
        ``settings.backoff_base_seconds`` and never exceeding
        ``settings.backoff_max_seconds``. A positive ``retry_after`` hint
        (itself clamped to the same ceiling) raises the delay when it is the
        larger of the two; it never lowers it.

        Args:
            transient_attempts: number of transient attempts; negative values
                are treated as 0.
            retry_after: optional server-provided wait in seconds; ignored
                when None or not positive.

        Returns:
            The delay truncated to an int.
        """
        unit = settings.backoff_base_seconds
        ceiling = settings.backoff_max_seconds

        # Negative attempt counts behave like the very first attempt.
        exponent = transient_attempts if transient_attempts > 0 else 0
        delay = (2 ** exponent) * unit
        if delay > ceiling:
            delay = ceiling

        if retry_after is not None and retry_after > 0:
            # The server hint is honoured only up to the configured ceiling.
            requested = retry_after if retry_after <= ceiling else ceiling
            if requested > delay:
                delay = requested
        return int(delay)

    def _finalize_job(self, job_id: int, agent: str, run_id: int, exit_code, output_path=None):
        """ORCH-1: update the jobs row after the agent process finished.

        exit_code == 0  -> done (and resets the breaker streak via on_outcome).
        exit_code != 0  -> classify the failure from the run log tail (token-free):
          - TRANSIENT (429/overload/network): backoff-requeue with available_at in
            the future + a SEPARATE transient_attempts budget
            (settings.transient_max_attempts), honouring Retry-After. Reported to
            the breaker so it opens after N consecutive transient failures.
          - PERMANENT (code fault): ordinary attempts < max_attempts requeue,
            otherwise 'failed' + Telegram.

        If the classifier itself raises, the failure is treated as PERMANENT
        and the classifier error is logged (it used to be swallowed silently).

        Args:
            job_id: id of the queue job to finalize; a missing job is a no-op.
            agent: agent name, used in log messages and passed to the handlers.
            run_id: id of the agent run that just finished.
            exit_code: exit code of the agent process.
            output_path: run log path; when falsy, ``_run_log_path(run_id)``
                is used instead.

        Any exception raised while finalizing is logged and not propagated.
        """
        from ..db import get_job, mark_job
        from ..error_classifier import classify_log_file
        try:
            job = get_job(job_id)
            if not job:
                return
            if exit_code == 0:
                mark_job(job_id, "done", run_id=run_id)
                logger.info(f"Job {job_id} ({agent}) done (run_id={run_id})")
                self._record_outcome(transient=False, recovered=True)
                return

            # Classify the failure from the agent log tail (no token cost).
            # Defaults apply when classification is impossible.
            kind, retry_after = "permanent", None
            log_path = output_path or _run_log_path(run_id)
            try:
                kind, retry_after = classify_log_file(log_path)
            except Exception as e:
                # Still best-effort (fall back to "permanent"), but leave a
                # trace: otherwise a rate-limit failure could be handled as a
                # code fault with no indication of why.
                logger.warning(
                    f"Job {job_id}: could not classify failure from {log_path}: {e}; "
                    f"treating as permanent"
                )

            if kind == "transient":
                self._finalize_transient(job_id, agent, run_id, exit_code, job, retry_after)
            else:
                self._finalize_permanent(job_id, agent, run_id, exit_code, job)
        except Exception as e:
            logger.error(f"Job {job_id}: _finalize_job error: {e}")

    def _finalize_transient(self, job_id, agent, run_id, exit_code, job, retry_after):
        """Handle a transient (429/overload/net) failure of a queue job.

        The outcome is always reported to the breaker first. While the job's
        ``transient_attempts`` is below ``settings.transient_max_attempts`` the
        job is requeued with a backoff delay; once the budget is used up the
        job is marked 'failed' and a failure notification is sent.
        """
        from ..db import mark_job, mark_job_transient

        used = job.get("transient_attempts", 0)
        budget = settings.transient_max_attempts
        reason = (
            f"transient (429/overload) agent {agent} exit={exit_code} "
            f"(run_id={run_id}); retry_after={retry_after}"
        )
        self._record_outcome(transient=True, recovered=False)

        # Budget exhausted: give up on this job.
        if used >= budget:
            mark_job(job_id, "failed", run_id=run_id, error=reason)
            logger.error(
                f"Job {job_id} ({agent}) failed after {used} transient attempts"
            )
            self._notify_failed(
                job_id, agent, job, run_id,
                f"transient (rate-limit) after {used} attempts",
            )
            return

        # Budget remaining: requeue with a delay sized for the upcoming attempt.
        upcoming = used + 1
        delay = self._backoff_seconds(upcoming, retry_after)
        mark_job_transient(job_id, delay, error=reason)
        logger.warning(
            f"Job {job_id} ({agent}) TRANSIENT fail (exit={exit_code}), "
            f"backoff {delay}s, transient_attempt {upcoming}/{budget}"
        )

    def _finalize_permanent(self, job_id, agent, run_id, exit_code, job):
        """Handle a permanent (code-fault) failure of a queue job.

        The outcome is always reported to the breaker first. While the job's
        ``attempts`` is below its ``max_attempts`` (default 2) the job goes
        back to 'queued'; otherwise it is marked 'failed' and a failure
        notification is sent.
        """
        from ..db import mark_job

        tried = job.get("attempts", 0)
        limit = job.get("max_attempts", 2)
        reason = f"agent {agent} exit_code={exit_code} (run_id={run_id})"
        self._record_outcome(transient=False, recovered=False)

        # Attempts exhausted: final failure.
        if tried >= limit:
            mark_job(job_id, "failed", run_id=run_id, error=reason)
            logger.error(
                f"Job {job_id} ({agent}) failed permanently after "
                f"{tried} attempts (exit={exit_code})"
            )
            self._notify_failed(
                job_id, agent, job, run_id,
                f"{tried} attempts (exit={exit_code})",
            )
            return

        # Attempts remaining: put the job back on the queue.
        mark_job(job_id, "queued", run_id=run_id, error=reason)
        logger.warning(
            f"Job {job_id} ({agent}) failed (exit={exit_code}), "
            f"requeued (attempt {tried}/{limit})"
        )

    def _notify_failed(self, job_id, agent, job, run_id, why):
        """Send a best-effort Telegram alert that a queue job has failed.

        The message names the job, the agent, the job's repo, the reason
        ``why`` and the run log path. Any error while building or sending the
        message is ignored so that notification problems never propagate.
        """
        try:
            from ..notifications import send_telegram

            repo_name = job.get('repo')
            log_location = _run_log_path(run_id)
            message = (
                f"\U0001f6a8 Job {job_id} ({agent}, repo {repo_name}) "
                f"failed: {why}. Logs: {log_location}"
            )
            send_telegram(message)
        except Exception:
            pass

    def _record_outcome(self, transient: bool, recovered: bool):
        """Forward the run outcome to the circuit breaker (if a worker is wired).

        Decoupled via a settable callback (set by QueueWorker.start) so the launcher
        does not hard-import the worker (avoids a cycle) and tests can run the
        launcher standalone.
        """
        cb = getattr(self, "on_outcome", None)
        if cb:
            try:
                cb(transient=transient, recovered=recovered)
            except Exception:
                pass

    def _try_advance_stage(self, run_id: int, agent: str, repo: str, branch: str):
        """After agent finishes successfully, advance the stage via the unified engine.

        ORCH-4 / M-3: the 174-line body that used to live here moved into
        src/stage_engine.advance_stage(). This is now a thin wrapper: it looks up
        the task by (repo, branch) and delegates. `agent` is forwarded as
        finished_agent so the analyst/reviewer/tester/architect rollback branches
        still trigger exactly as before. The agent-selection bug (it used to call
        get_agent_for_stage(next_stage)) is fixed inside the engine.

        Args:
            run_id: id of the finished agent run (used only for logging).
            agent: name of the agent that just finished.
            repo: repository name used to look up the task.
            branch: branch name used to look up the task.

        Does nothing when no task matches (repo, branch). Never raises: any
        error is logged.
        """
        try:
            conn = get_db()
            try:
                task_row = conn.execute(
                    "SELECT id, stage, work_item_id FROM tasks WHERE repo=? AND branch=?",
                    (repo, branch),
                ).fetchone()
            finally:
                # Close even when the query raises, so a failed lookup does not
                # leave the connection open.
                conn.close()
            if not task_row:
                return

            task_id, current_stage, work_item_id = task_row
            from ..stage_engine import advance_stage
            advance_stage(
                task_id=task_id,
                current_stage=current_stage,
                repo=repo,
                work_item_id=work_item_id,
                branch=branch,
                finished_agent=agent,
            )
        except Exception as e:
            logger.error(f"Auto-advance failed for run_id={run_id}: {e}")


    def _post_usage_comments(self, run_id, agent, repo, branch, usage, duration_s=None):
        """Feature 4 + ORCH-016: post the unified per-agent status comment.

        - Always (on success, with a work_item_id): a per-agent finish comment
          via ``usage.build_status_comment(...)``, authored by the finishing
          agent's Plane bot. The comment carries:
            * single-line header (icon + role + per-stage description),
            * machine verdict line for reviewer / tester / deployer (when the
              relevant frontmatter is present in the worktree),
            * the agent's wall-clock duration (``duration_s`` is the measured
              value in _monitor_agent; DB fallback is unused on this path),
            * an HTML <ul> of artifact links scoped per agent,
            * a ``<sub>`` token/cost tail.
        - When the deployer finishes: also a per-task summary (SUM over
          agent_runs GROUP BY agent), authored by the deployer.

        The deployer's `stage=` is resolved from the task row so the helper can
        pick between 14-deploy-log.md (prod) and 15-staging-log.md (staging).

        Args:
            run_id: id of the finished agent run (not used by this method).
            agent: name of the finishing agent; also the comment author.
            repo: repository name used to look up the task.
            branch: branch name used to look up the task.
            usage: parsed usage data forwarded to the comment builder (may be None).
            duration_s: measured run duration in seconds, forwarded as-is.

        Returns silently when no task matches (repo, branch) or the task has no
        work_item_id. Errors are NOT caught here; the caller handles them.
        """
        from ..usage import build_status_comment, task_summary_comment
        from ..git_worktree import get_worktree_path
        conn = get_db()
        try:
            row = conn.execute(
                "SELECT id, work_item_id, stage FROM tasks WHERE repo=? AND branch=?",
                (repo, branch),
            ).fetchone()
        finally:
            # Close even when the query raises, so a failed lookup does not
            # leave the connection open.
            conn.close()
        if not row:
            return
        task_id, work_item_id, stage = row[0], row[1], row[2]
        if not work_item_id:
            return
        # Observability: every agent's finish comment links its artifact(s)
        # (reviewer->12-review, tester->13-test-report, deployer->14- or 15-,
        # architect->ADR, developer->PR/branch). For the developer we resolve the
        # open PR number so the link points straight at it.
        pr_number = None
        if agent == "developer":
            pr_number = self._open_pr_number(repo, branch)

        # Best-effort worktree path — drives AC-8 (skip missing artifacts) and
        # the verdict frontmatter read. Falls back to None on lookup error so
        # the comment still goes out without the verdict line / file probe.
        try:
            worktree_root = get_worktree_path(repo, branch)
        except Exception:
            worktree_root = None

        plane_add_comment(
            work_item_id,
            build_status_comment(
                agent,
                repo=repo,
                branch=branch,
                work_item_id=work_item_id,
                pr_number=pr_number,
                stage=stage,
                usage=usage,
                duration_s=duration_s,
                task_id=task_id,
                worktree_root=worktree_root,
            ),
            author=agent,
        )
        # The deployer additionally posts the per-task usage summary.
        if agent == "deployer":
            plane_add_comment(
                work_item_id, task_summary_comment(task_id), author="deployer"
            )

    def _open_pr_number(self, repo: str, branch: str):
        """Return the open PR number for `branch`, or None. Never raises.

        Queries the Gitea pulls endpoint for open PRs. The server-side ``head``
        query filter is NOT relied upon: each returned PR is also checked
        client-side, so an unrelated open PR is never reported as this
        branch's PR. An entry that carries no ``head.ref`` information is
        accepted as-is, which keeps the previous behaviour for such payloads.

        Args:
            repo: repository name under ``settings.gitea_owner``.
            branch: head branch whose open PR is wanted.

        Returns:
            The PR number of the first matching open PR, or None when there is
            none, the request fails, or the response is not HTTP 200.
        """
        try:
            import httpx
            owner = settings.gitea_owner
            headers = {"Authorization": f"token {settings.gitea_token}"}
            resp = httpx.get(
                f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}/pulls",
                params={"state": "open", "head": branch},
                headers=headers, timeout=5,
            )
            if resp.status_code != 200:
                return None
            for pr in resp.json() or []:
                head_ref = (pr.get("head") or {}).get("ref")
                # Skip PRs that demonstrably belong to another branch.
                if head_ref is None or head_ref == branch:
                    return pr.get("number")
        except Exception as e:
            # Best-effort lookup: report "no PR" but leave a trace for debugging.
            logger.debug(f"_open_pr_number({repo}, {branch}) failed: {e}")
        return None

    def _ensure_pr(self, repo: str, branch: str, run_id: int):
        """Make sure an open code-PR exists for ``branch``; return its number or None.

        ORCH-082 (ADR-001 Р-4): the work is delegated to
        ``merge_gate.ensure_open_pr``, the single idempotent PR-creation actor,
        so PR creation lives in ONE place and logs the same
        created/existed/failed outcomes (G3). The call trigger is unchanged:
        `_monitor_agent` invokes this only on the developer path after a
        successful push. The actor uses the same ``head==branch AND base==main``
        filter as ``merge_pr``, so the PR created here and the one merge-verify
        merges are the same code-PR. The actor is documented as never-raise;
        a ``failed`` status (or a non-numeric detail) yields None, preserving
        the "best-effort, return None on failure" contract.
        """
        from .. import merge_gate

        status, detail = merge_gate.ensure_open_pr(repo, branch)
        logger.info(f"_ensure_pr({branch}, run_id={run_id}) -> {status} ({detail})")

        if status not in ("created", "existed"):
            logger.error(f"Failed to ensure PR for {branch}: {detail}")
            return None

        # On success the detail is expected to hold the PR number.
        try:
            pr_number = int(detail)
        except (TypeError, ValueError):
            pr_number = None
        return pr_number

    def _write_task_file(self, repo: str, branch: str, task_file: str, content: str):
        """Write task file directly into the task's worktree.

        B-1 fix: no docker (direct open()). ORCH-2/S-4: the target is the per-branch
        worktree (/repos/_wt/<repo>/<branch>), not the shared /repos/<repo>, so the
        agent reads the task ZADANIE from its own isolated working copy.
        Raise on failure instead of silently swallowing errors.

        Args:
            repo: repository name used to resolve the worktree.
            branch: branch name used to resolve the worktree.
            task_file: path of the task file relative to the worktree root.
            content: text to write (encoded as UTF-8).

        Raises:
            RuntimeError: when the file cannot be written; the original
                OSError is attached as the cause.
        """
        work_path = get_worktree_path(repo, branch)  # /repos/_wt/<repo>/<branch>
        full_path = os.path.join(work_path, task_file)
        try:
            with open(full_path, "w", encoding="utf-8") as f:
                f.write(content)
        except OSError as e:
            logger.error(f"Failed to write task file {full_path}: {e}")
            # Chain explicitly so the underlying OS error stays visible as the cause.
            raise RuntimeError(f"Failed to write task file: {e}") from e
        # Report the real on-disk size: len(content) counts characters, which
        # differs from the byte count for non-ASCII text.
        size_bytes = len(content.encode("utf-8"))
        logger.info(f"Task file written: {full_path} ({size_bytes} bytes)")


# Module-level AgentLauncher instance, constructed once when this module is imported.
launcher = AgentLauncher()