feat(worktree): git worktree per task to isolate shared /repos (ORCH-2 / S-4)

- add src/git_worktree.py: ensure/remove/get_worktree_path
- config: worktrees_dir=/repos/_wt
- launcher: agent runs in per-branch worktree; task-file + commit/push in worktree; no shared checkout
- qg/checks: read artifacts + run make test from worktree (branch arg, backward-compatible)
- webhooks/plane: pass branch into QG dispatch; review fallback from worktree
- webhooks/gitea: keep read-only branch --contains in main clone (documented)
- tests: test_git_worktree.py (isolation) + update test_launcher write-task-file
- docs: ARCHITECTURE worktree section + BUGFIXES_2026-06-02_ORCH2

Preserves B-1/B-2/S-1/S-5 fixes (paths now point at worktree).
2026-06-02 21:12:06 +03:00
parent 66a37612fd
commit 1ebe8afc23
10 changed files with 474 additions and 89 deletions
--- a/src/agents/launcher.py
+++ b/src/agents/launcher.py
@@ -6,6 +6,7 @@ import signal
 from ..config import settings
 from ..db import get_db, get_task_by_repo_branch, update_task_stage
 from ..stages import get_next_stage, get_qg_for_stage, get_agent_for_stage
+from ..git_worktree import ensure_worktree, get_worktree_path
 from ..qg.checks import QG_CHECKS
 from ..notifications import notify_stage_change, notify_qg_failure, notify_agent_started, notify_agent_finished, notify_approve_requested
 from ..plane_sync import notify_stage_change as plane_notify_stage, add_comment as plane_add_comment
@@ -71,15 +72,22 @@ class AgentLauncher:
        if not config:
            raise ValueError(f"Unknown agent: {agent}")

-        # Container-local path (repos mounted at /repos)
+        # Main clone lives at /repos/<repo>; the agent works in an isolated worktree
+        # (ORCH-2 / S-4) so concurrent tasks never fight over a shared checkout.
        local_repo_path = os.path.join(settings.repos_dir, repo)
-
        if not os.path.isdir(local_repo_path):
            raise FileNotFoundError(f"Repo not found: {local_repo_path}")

-        # Write task file if content provided (B-1: direct write to mounted /repos, no docker)
+        # Determine branch (needed before we touch the worktree / task file).
+        _br_row = get_db().execute("SELECT branch FROM tasks WHERE id=?", (task_id,)).fetchone() if task_id else None
+        agent_branch = _br_row[0] if _br_row else "main"
+
+        # Ensure the per-branch worktree exists and is on the right branch.
+        work_path = ensure_worktree(repo, agent_branch)
+
+        # Write task file if content provided (B-1: direct write; now into the worktree).
        if task_content:
-            self._write_task_file(repo, config["task_file"], task_content)
+            self._write_task_file(repo, agent_branch, config["task_file"], task_content)

        # Record run in DB
        conn = get_db()
@@ -99,15 +107,13 @@ class AgentLauncher:
        system_prompt = config["system_prompt"]
        allowed_tools = config["allowed_tools"]

-        # Determine branch for checkout
-        _br_row = get_db().execute("SELECT branch FROM tasks WHERE id=?", (task_id,)).fetchone() if task_id else None
-        agent_branch = _br_row[0] if _br_row else "main"
-
        model = config.get("model", "")
        model_flag = f"--model {model} " if model else ""

+        # No git fetch/checkout here: ensure_worktree() already put the worktree on
+        # the right branch. The agent simply runs inside its isolated work_path.
        cmd = (
-            f'cd {local_repo_path} && git fetch origin 2>/dev/null; git checkout {agent_branch} 2>/dev/null || git checkout -b {agent_branch} origin/{agent_branch} 2>/dev/null; '
+            f'cd {work_path} && '
            f'{self.CLAUDE_BIN} --print '
            f'{model_flag}'
            f'"$(cat {task_file})" '
@@ -219,8 +225,10 @@ class AgentLauncher:

        notify_agent_finished(run_id, agent, exit_code, task_id=_task_id, duration_s=_duration_s)

-        # Commit and push any changes
-        repo_path = os.path.join(settings.repos_dir, repo)
+        # Commit and push any changes — in the per-branch worktree (ORCH-2 / S-4),
+        # NOT in the shared /repos/<repo>. The worktree is already on `branch`
+        # (ensure_worktree did the checkout), so no checkout is needed here.
+        repo_path = get_worktree_path(repo, branch)
        try:
            git_env = {
                **os.environ,
@@ -230,20 +238,6 @@ class AgentLauncher:
                "GIT_COMMITTER_NAME": "claude-bot",
                "GIT_COMMITTER_EMAIL": "claude-bot@mva154.local",
            }
-            # Checkout feature branch before committing
-            subprocess.run(
-                ["git", "-C", repo_path, "fetch", "origin"],
-                capture_output=True, text=True, timeout=30, env=git_env
-            )
-            checkout_result = subprocess.run(
-                ["git", "-C", repo_path, "checkout", branch],
-                capture_output=True, text=True, timeout=30, env=git_env
-            )
-            if checkout_result.returncode != 0:
-                subprocess.run(
-                    ["git", "-C", repo_path, "checkout", "-b", branch, f"origin/{branch}"],
-                    capture_output=True, text=True, timeout=30, env=git_env
-            )
            result = subprocess.run(
                ["git", "-C", repo_path, "status", "--porcelain"],
                capture_output=True, text=True, timeout=10, env=git_env
@@ -351,7 +345,7 @@ class AgentLauncher:
                    if agent == "analyst" and qg_name == "check_analysis_approved" and work_item_id:
                        files_check = QG_CHECKS.get("check_analysis_complete")
                        if files_check:
-                            files_ok, _ = files_check(repo, work_item_id)
+                            files_ok, _ = files_check(repo, work_item_id, branch)
                            if files_ok:
                                # Full artifacts ready -> In Review
                                from ..plane_sync import set_issue_in_review
@@ -364,10 +358,10 @@ class AgentLauncher:
                                notify_approve_requested(task_id)
                                logger.info(f"Task {task_id}: analyst finished, requested :approved: in Plane")
                            else:
-                                # Check if questions file exists
+                                # Check if questions file exists (in the task worktree)
                                import os as _os
                                questions_path = _os.path.join(
-                                    settings.repos_dir, repo,
+                                    get_worktree_path(repo, branch),
                                    f"docs/work-items/{work_item_id}/01-questions.md"
                                )
                                if _os.path.isfile(questions_path):
@@ -392,11 +386,14 @@ class AgentLauncher:
                                    )
                    return
                elif qg_name in ("check_ci_green", "check_tests_local"):
+                    # (repo, branch) signature — already worktree-aware.
                    passed, reason = check_fn(repo, branch)
                elif qg_name == "check_tests_passed":
-                    passed, reason = check_fn(repo, work_item_id or "")
+                    # Artifact check — pass branch so it reads from the worktree.
+                    passed, reason = check_fn(repo, work_item_id or "", branch)
                else:
-                    passed, reason = check_fn(repo, work_item_id or "")
+                    # Other artifact checks (check_architecture_done, etc.) — worktree-aware.
+                    passed, reason = check_fn(repo, work_item_id or "", branch)

                if not passed:
                    logger.info(f"Task {task_id}: QG '{qg_name}' not passed after {agent}: {reason}")
@@ -461,7 +458,7 @@ class AgentLauncher:
                    if agent == "architect" and qg_name == "check_architecture_done" and not passed:
                        import os as _os
                        conflict_path = _os.path.join(
-                            settings.repos_dir, repo,
+                            get_worktree_path(repo, branch),
                            f"docs/work-items/{work_item_id}/10-conflict.md"
                        )
                        if _os.path.isfile(conflict_path):
@@ -578,15 +575,16 @@ class AgentLauncher:
            logger.error(f"Auto-merge failed for {branch}: {e}")
            return False

-    def _write_task_file(self, repo: str, task_file: str, content: str):
-        """Write task file directly to the mounted repo volume (/repos).
+    def _write_task_file(self, repo: str, branch: str, task_file: str, content: str):
+        """Write task file directly into the task's worktree.

-        B-1 fix: no docker. The repos directory is mounted RW at settings.repos_dir
-        (/repos inside the container), so write straight to /repos/<repo>/<task_file>.
+        B-1 fix: no docker (direct open()). ORCH-2/S-4: the target is the per-branch
+        worktree (/repos/_wt/<repo>/<branch>), not the shared /repos/<repo>, so the
+        agent reads the task ZADANIE from its own isolated working copy.
        Raise on failure instead of silently swallowing errors.
        """
-        container_repo_path = os.path.join(settings.repos_dir, repo)  # /repos/<repo>
-        full_path = os.path.join(container_repo_path, task_file)
+        work_path = get_worktree_path(repo, branch)  # /repos/_wt/<repo>/<branch>
+        full_path = os.path.join(work_path, task_file)
        try:
            with open(full_path, "w", encoding="utf-8") as f:
                f.write(content)
--- a/src/config.py
+++ b/src/config.py
@@ -20,6 +20,7 @@ class Settings(BaseSettings):
    claude_bin: str = "/opt/claude-code/bin/claude.exe"
    repos_dir: str = "/repos"
    host_repos_dir: str = "/home/slin/repos"
+    worktrees_dir: str = "/repos/_wt"  # ORCH-2 / S-4: isolated worktree per task/branch

    # DB
    db_path: str = "/app/data/orchestrator.db"
--- a/src/git_worktree.py
+++ b/src/git_worktree.py
@@ -0,0 +1,107 @@
+"""Git worktree management — isolated working copy per task/branch (ORCH-2 / S-4).
+
+Background
+----------
+Previously every git operation (checkout/commit/push/test) ran in the single shared
+clone ``/repos/<repo>``. With two active tasks a ``git checkout`` of one branch would
+overwrite the working copy of the other -> races (see AUDIT S-4 / ET-009 "two collectors").
+
+Solution
+--------
+Each task (branch) gets an isolated git worktree::
+
+    /repos/<repo>                      <- main clone (fetch / worktree management)
+    /repos/_wt/<repo>/<safe-branch>    <- worktree for one task/branch (agent works here)
+
+A branch can only be checked out in ONE worktree at a time, which is exactly the
+property we want: one task = one branch = one worktree.
+"""
+import os
+import re
+import subprocess
+import logging
+from .config import settings
+
+logger = logging.getLogger("orchestrator.git_worktree")
+
+
+def _safe(branch: str) -> str:
+    """Turn a branch name into a single filesystem-safe path component.
+
+    Every character other than an ASCII letter, digit, ``.``, ``_`` or ``-`` is
+    replaced by ``_`` (``feature/x`` becomes ``feature_x``). The mapping is not
+    injective: ``feature/x`` and ``feature_x`` produce the same result.
+    """
+    unsafe_chars = re.compile(r"[^A-Za-z0-9._-]")
+    return unsafe_chars.sub("_", branch)
+
+
+def get_worktree_path(repo: str, branch: str) -> str:
+    """Return where the worktree for (repo, branch) lives; nothing is created.
+
+    The result is ``<settings.worktrees_dir>/<repo>/<sanitized branch name>``.
+    """
+    branch_dir = _safe(branch)
+    return os.path.join(settings.worktrees_dir, repo, branch_dir)
+
+
+def _main_repo(repo: str) -> str:
+    """Return the path of the shared main clone: ``<settings.repos_dir>/<repo>``."""
+    repos_root = settings.repos_dir
+    return os.path.join(repos_root, repo)
+
+
+def ensure_worktree(repo: str, branch: str) -> str:
+    """Create (or reuse) an isolated worktree for ``branch``. Returns its path.
+
+    Main clone stays at ``/repos/<repo>``. Worktree lives at
+    ``/repos/_wt/<repo>/<safe-branch>``.
+
+    - If the worktree already exists, it is fetched and switched to ``branch``.
+      When ``origin/<branch>`` exists and the local HEAD is an ancestor of it
+      (nothing unpushed), the worktree is hard-reset onto the remote branch,
+      which also discards uncommitted changes to tracked files. When the local
+      branch has commits that are not on origin, it is left untouched so that
+      unpushed work is never destroyed.
+    - If the branch exists (locally or on origin) it is checked out into a fresh
+      worktree; otherwise a new branch is created from ``origin/main``.
+
+    Raises:
+        FileNotFoundError: the main clone directory does not exist.
+        RuntimeError: the existing worktree cannot be switched to ``branch``,
+            or both ``git worktree add`` attempts fail.
+        subprocess.TimeoutExpired: a git command exceeded its timeout.
+    """
+    main_repo = _main_repo(repo)
+    wt = get_worktree_path(repo, branch)
+
+    if not os.path.isdir(main_repo):
+        raise FileNotFoundError(f"Main repo not found: {main_repo}")
+
+    # Always refresh refs in the main clone first. Best effort: a failed fetch
+    # (e.g. remote unreachable) must not block work on already-known refs.
+    fetched = subprocess.run(["git", "-C", main_repo, "fetch", "origin"],
+                             capture_output=True, text=True, timeout=60)
+    if fetched.returncode != 0:
+        logger.warning(f"git fetch failed in {main_repo}: {fetched.stderr.strip()[-200:]}")
+
+    # Reuse existing worktree (.git may be a dir or a file pointer for worktrees).
+    if os.path.isdir(os.path.join(wt, ".git")) or os.path.isfile(os.path.join(wt, ".git")):
+        subprocess.run(["git", "-C", wt, "fetch", "origin"], capture_output=True, timeout=60)
+        co = subprocess.run(["git", "-C", wt, "checkout", branch],
+                            capture_output=True, text=True, timeout=30)
+        if co.returncode != 0:
+            # Returning the path anyway would let the caller work on whatever
+            # branch the directory currently holds.
+            raise RuntimeError(
+                f"git checkout failed in worktree {wt} for {repo}:{branch}: "
+                f"{co.stderr.strip()}"
+            )
+        remote_ref = f"origin/{branch}"
+        # Align to remote only if the remote branch exists (avoid wiping local-only work).
+        rb = subprocess.run(
+            ["git", "-C", wt, "rev-parse", "--verify", "--quiet", remote_ref],
+            capture_output=True, timeout=30,
+        )
+        if rb.returncode == 0:
+            # Exit code 0 means HEAD is contained in the remote branch, so the
+            # reset below cannot drop any local commit.
+            behind = subprocess.run(
+                ["git", "-C", wt, "merge-base", "--is-ancestor", "HEAD", remote_ref],
+                capture_output=True, timeout=30,
+            )
+            if behind.returncode == 0:
+                subprocess.run(["git", "-C", wt, "reset", "--hard", remote_ref],
+                               capture_output=True, timeout=30)
+            else:
+                logger.warning(
+                    f"Worktree {wt}: local {branch} has commits not on {remote_ref}; "
+                    f"keeping local state"
+                )
+        logger.info(f"Worktree reused: {wt} (branch {branch})")
+        return wt
+
+    os.makedirs(os.path.dirname(wt), exist_ok=True)
+
+    # Try to attach an existing branch (local or remote-tracking) to the new worktree.
+    # NOTE(review): git refuses to add a worktree for a branch that is already
+    # checked out elsewhere (e.g. "main" in the main clone); in that case the
+    # fallback below fails too and RuntimeError is raised — confirm this is intended.
+    r = subprocess.run(["git", "-C", main_repo, "worktree", "add", wt, branch],
+                       capture_output=True, text=True, timeout=60)
+    if r.returncode != 0:
+        # Branch doesn't exist yet — create it from origin/main.
+        r2 = subprocess.run(
+            ["git", "-C", main_repo, "worktree", "add", "-b", branch, wt, "origin/main"],
+            capture_output=True, text=True, timeout=60,
+        )
+        if r2.returncode != 0:
+            raise RuntimeError(
+                f"git worktree add failed for {repo}:{branch}: "
+                f"{r.stderr.strip()} | {r2.stderr.strip()}"
+            )
+    logger.info(f"Worktree ready: {wt} (branch {branch})")
+    return wt
+
+
+def remove_worktree(repo: str, branch: str):
+    """Remove the worktree for (repo, branch) — optional cleanup when a task is done.
+
+    Best effort: a failing ``git worktree remove`` is logged as a warning and never
+    raised. Stale administrative entries are pruned in the main clone either way.
+    """
+    main_repo = _main_repo(repo)
+    wt = get_worktree_path(repo, branch)
+    removed = subprocess.run(["git", "-C", main_repo, "worktree", "remove", "--force", wt],
+                             capture_output=True, text=True, timeout=30)
+    # Prune dangling administrative entries.
+    subprocess.run(["git", "-C", main_repo, "worktree", "prune"],
+                   capture_output=True, timeout=30)
+    if removed.returncode == 0:
+        logger.info(f"Worktree removed: {wt}")
+    else:
+        # Do not claim success when git reported an error (e.g. no such worktree).
+        logger.warning(f"Worktree remove failed for {wt}: {removed.stderr.strip()[-200:]}")
--- a/src/qg/checks.py
+++ b/src/qg/checks.py
@@ -7,12 +7,28 @@ from ..config import settings

 logger = logging.getLogger("orchestrator.qg")

+from ..git_worktree import get_worktree_path, ensure_worktree
+
+
+def _repo_path(repo: str, branch: str | None = None) -> str:
+    """Pick the directory that agent artifacts should be read from.
+
+    ORCH-2 / S-4: with a branch whose worktree directory exists on disk, that
+    worktree is returned. Without a branch, or when the worktree directory is
+    missing, the shared ``<settings.repos_dir>/<repo>`` clone is returned, which
+    keeps callers that pass only two arguments working.
+    """
+    shared_clone = os.path.join(settings.repos_dir, repo)
+    if not branch:
+        return shared_clone
+    candidate = get_worktree_path(repo, branch)
+    return candidate if os.path.isdir(candidate) else shared_clone
+
 # Shared httpx client config
 GITEA_HEADERS = {"Authorization": f"token {settings.gitea_token}"}
 GITEA_BASE = f"{settings.gitea_url}/api/v1"


-def check_analysis_complete(repo: str, work_item_id: str) -> tuple[bool, str]:
+def check_analysis_complete(repo: str, work_item_id: str, branch: str | None = None) -> tuple[bool, str]:
    """
    Check if analysis artifacts exist in the repo branch.
    Required files:
@@ -28,7 +44,7 @@ def check_analysis_complete(repo: str, work_item_id: str) -> tuple[bool, str]:
        f"docs/work-items/{work_item_id}/04-test-plan.yaml",
    ]

-    repo_path = os.path.join(settings.repos_dir, repo)
+    repo_path = _repo_path(repo, branch)
    missing = []

    for f in required_files:
@@ -41,13 +57,13 @@ def check_analysis_complete(repo: str, work_item_id: str) -> tuple[bool, str]:
    return True, "All analysis artifacts present"


-def check_architecture_done(repo: str, work_item_id: str) -> tuple[bool, str]:
+def check_architecture_done(repo: str, work_item_id: str, branch: str | None = None) -> tuple[bool, str]:
    """
    Check if architecture artifacts exist.
    Required: docs/work-items/<work_item_id>/06-adr/ (at least 1 file)
    OR: docs/work-items/<work_item_id>/07-infra-requirements.md
    """
-    repo_path = os.path.join(settings.repos_dir, repo)
+    repo_path = _repo_path(repo, branch)

    adr_dir = os.path.join(repo_path, f"docs/work-items/{work_item_id}/06-adr")
    infra_file = os.path.join(repo_path, f"docs/work-items/{work_item_id}/07-infra-requirements.md")
@@ -119,12 +135,12 @@ def check_review_approved(repo: str, pr_number: int) -> tuple[bool, str]:
        return False, f"API error: {e}"


-def check_tests_passed(repo: str, work_item_id: str) -> tuple[bool, str]:
+def check_tests_passed(repo: str, work_item_id: str, branch: str | None = None) -> tuple[bool, str]:
    """
    Check if test report exists and contains PASS indicator.
    File: docs/work-items/<work_item_id>/13-test-report.md
    """
-    repo_path = os.path.join(settings.repos_dir, repo)
+    repo_path = _repo_path(repo, branch)
    report_path = os.path.join(repo_path, f"docs/work-items/{work_item_id}/13-test-report.md")

    if not os.path.isfile(report_path):
@@ -141,7 +157,7 @@ def check_tests_passed(repo: str, work_item_id: str) -> tuple[bool, str]:



-def check_analysis_approved(repo: str, work_item_id: str) -> tuple[bool, str]:
+def check_analysis_approved(repo: str, work_item_id: str, branch: str | None = None) -> tuple[bool, str]:
    """
    Check if analysis is complete AND approved by stakeholder.
    Requirements:
@@ -152,7 +168,7 @@ def check_analysis_approved(repo: str, work_item_id: str) -> tuple[bool, str]:
    so the approval check verifies file completeness as a safety gate.
    """
    # First check files
-    files_ok, files_reason = check_analysis_complete(repo, work_item_id)
+    files_ok, files_reason = check_analysis_complete(repo, work_item_id, branch)
    if not files_ok:
        return False, files_reason

@@ -187,7 +203,7 @@ def check_analysis_approved(repo: str, work_item_id: str) -> tuple[bool, str]:



-def check_reviewer_verdict(repo: str, work_item_id: str) -> tuple[bool, str]:
+def check_reviewer_verdict(repo: str, work_item_id: str, branch: str | None = None) -> tuple[bool, str]:
    """
    Check reviewer agent verdict from 12-review.md (S-5 fix).

@@ -198,7 +214,7 @@ def check_reviewer_verdict(repo: str, work_item_id: str) -> tuple[bool, str]:
      (False, ...) -> verdict: REQUEST_CHANGES, missing verdict, or no frontmatter
    """
    import yaml
-    repo_path = os.path.join(settings.repos_dir, repo)
+    repo_path = _repo_path(repo, branch)
    review_path = os.path.join(repo_path, f"docs/work-items/{work_item_id}/12-review.md")

    if not os.path.isfile(review_path):
@@ -229,26 +245,15 @@ def check_reviewer_verdict(repo: str, work_item_id: str) -> tuple[bool, str]:

 def check_tests_local(repo: str, branch: str) -> tuple[bool, str]:
    """
-    S-1 fix: run the project test suite locally in /repos/<repo> and judge by exit
-    code, instead of depending on Gitea CI (which is not configured -> always false).
+    S-1 fix: run the project test suite locally and judge by exit code, instead of
+    depending on Gitea CI (which is not configured -> always false).

-    Checks out `branch` in the shared /repos checkout and runs `make test`.
-    NOTE (known limitation): the shared /repos checkout means this is not safe for
-    concurrent active tasks. git-worktree-per-task is a separate task (S-4).
+    ORCH-2 / S-4: tests run inside the per-branch worktree (ensure_worktree), so this
+    is safe for concurrent active tasks — no shared /repos checkout race.
    """
    import subprocess
-    repo_path = os.path.join(settings.repos_dir, repo)
    try:
-        subprocess.run(
-            ["git", "-C", repo_path, "fetch", "origin"],
-            capture_output=True, timeout=30,
-        )
-        co = subprocess.run(
-            ["git", "-C", repo_path, "checkout", branch],
-            capture_output=True, text=True, timeout=30,
-        )
-        if co.returncode != 0:
-            return False, f"Cannot checkout branch '{branch}': {co.stderr.strip()[-200:]}"
+        repo_path = ensure_worktree(repo, branch)
        r = subprocess.run(
            ["make", "test"], cwd=repo_path,
            capture_output=True, text=True, timeout=600,
--- a/src/webhooks/gitea.py
+++ b/src/webhooks/gitea.py
@@ -146,7 +146,9 @@ async def handle_ci_status(payload: dict):
    if not branch:
        sha = payload.get("sha", "")
        repo_name = payload.get("repository", {}).get("name", settings.default_repo)
-        # Try to find task by checking git branch containing this SHA
+        # Try to find task by checking git branch containing this SHA.
+        # ORCH-2 / S-4: this is a READ-ONLY query of remote-tracking refs in the main
+        # clone (no checkout / no mutation), so it is safe to keep on /repos/<repo>.
        try:
            result = subprocess.run(
                ["git", "-C", os.path.join(settings.repos_dir, repo_name),
--- a/src/webhooks/plane.py
+++ b/src/webhooks/plane.py
@@ -304,7 +304,8 @@ async def _try_advance_stage(

        # Determine args based on QG function
        if qg_name in ("check_analysis_approved", "check_analysis_complete", "check_architecture_done", "check_tests_passed", "check_reviewer_verdict"):
-            passed, reason = qg_func(repo, work_item_id)
+            # ORCH-2 / S-4: pass branch so artifacts are read from the task worktree.
+            passed, reason = qg_func(repo, work_item_id, branch)
        elif qg_name in ("check_ci_green", "check_tests_local"):
            passed, reason = qg_func(repo, branch)
        elif qg_name == "check_review_approved":
@@ -327,8 +328,10 @@ async def _try_advance_stage(
                else:
                    # No open PR but review file exists — check file-based
                    import os
-                    _review_path = os.path.join(_s.repos_dir, repo, f"docs/work-items/{work_item_id}/12-review.md")
-                    _review_path2 = os.path.join(_s.repos_dir, repo, f"docs/work-items/{work_item_id}/09-review.md")
+                    from ..git_worktree import get_worktree_path as _gwp
+                    _wt = _gwp(repo, branch) if os.path.isdir(_gwp(repo, branch)) else os.path.join(_s.repos_dir, repo)
+                    _review_path = os.path.join(_wt, f"docs/work-items/{work_item_id}/12-review.md")
+                    _review_path2 = os.path.join(_wt, f"docs/work-items/{work_item_id}/09-review.md")
                    if os.path.isfile(_review_path) or os.path.isfile(_review_path2):
                        passed, reason = True, "Review file exists (file-based approval)"
                    else: