Follow-up ORCH-040: legacy root:root files in /repos broke worktree creation under uid 1000 with a raw "Permission denied" (agent never started, no diagnosis). Three additive, kill-switch-reversible layers; STAGE_TRANSITIONS / QG_CHECKS / check_* / machine-verdict keys / DB schema are byte-for-byte unchanged. - D1: ensure_worktree classifies the permission class and raises an actionable RuntimeError (cause + chown command + INFRA.md ref); non-permission errors keep the prior raw-stderr contract; kill-switch off -> contract 1:1 as before ORCH-057. - D2: new never-raise leaf src/fs_normalize.py — scan_ownership (TTL-cached, early-exit per root), applies()-first scope (empty CSV -> self-hosting only), opt-in normalize() that chowns ONLY when privileged (no-op under uid 1000). - D3: best-effort startup detect in main.lifespan (WARNING + Telegram on mismatch, never-fatal); read-only fs_ownership block in GET /queue; POST /fs-normalize/check. Claim is NOT blocked — the clear early outcome is delivered by D1 at launch. - Docs/config: .env.example flags + CHANGELOG (architecture README / adr-0031 / INFRA.md procedure already landed on the branch). - Tests: test_fs_normalize.py, test_git_worktree_perm.py, test_fs_normalize_startup.py, test_api_queue.py (TC-01..TC-12). Full suite green. Refs: ORCH-057 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
143 lines
6.3 KiB
Python
143 lines
6.3 KiB
Python
"""Git worktree management — isolated working copy per task/branch (ORCH-2 / S-4).
|
|
|
|
Background
----------
Previously every git operation (checkout/commit/push/test) ran in the single shared
clone ``/repos/<repo>``. With two active tasks a ``git checkout`` of one branch would
overwrite the working copy of the other -> races (see AUDIT S-4 / ET-009 "two collectors").

Solution
--------
Each task (branch) gets an isolated git worktree::

    /repos/<repo>                    <- main clone (fetch / worktree management)
    /repos/_wt/<repo>/<safe-branch>  <- worktree for one task/branch (agent works here)

A branch can only be checked out in ONE worktree at a time, which is exactly the
property we want: one task = one branch = one worktree.
|
|
"""
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import logging
|
|
from .config import settings
|
|
|
|
# Module-level logger; the name is spelled out explicitly rather than derived from __name__.
logger = logging.getLogger("orchestrator.git_worktree")
|
|
|
|
|
|
def _safe(branch: str) -> str:
    """Return ``branch`` rewritten so it can serve as a single path component.

    Every character outside ``A-Z``, ``a-z``, ``0-9``, ``.``, ``_`` and ``-`` is
    replaced by an underscore; all other characters are kept as they are.
    """
    unsafe_chars = re.compile(r"[^A-Za-z0-9._-]")
    sanitized = unsafe_chars.sub("_", branch)
    return sanitized
|
|
|
|
|
|
def get_worktree_path(repo: str, branch: str) -> str:
    """Compute the worktree directory for (repo, branch); nothing is created.

    The result is ``settings.worktrees_dir`` joined with ``repo`` and the
    sanitized form of ``branch`` produced by ``_safe``.
    """
    base_dir = settings.worktrees_dir
    branch_component = _safe(branch)
    return os.path.join(base_dir, repo, branch_component)
|
|
|
|
|
|
def _main_repo(repo: str) -> str:
    """Return the path of the main clone of ``repo`` under ``settings.repos_dir``."""
    repos_root = settings.repos_dir
    return os.path.join(repos_root, repo)
|
|
|
|
|
|
def _raise_if_permission(repo: str, branch: str, *, stderr: str | None = None,
                         exc: BaseException | None = None) -> None:
    """ORCH-057 D1: if a worktree failure is the legacy-ownership permission class,
    raise an actionable ``RuntimeError`` (cause + healing command + INFRA.md ref)
    instead of a raw git stderr (FR-1 / AC-2).

    Gated by ``fs_normalize_enabled`` — when the kill-switch is off the error
    contract is byte-for-byte as before ORCH-057 (this helper is a no-op, the caller
    re-raises the original). A non-permission error is also a no-op here, so the
    caller's existing message/semantics are preserved (no meaning substitution).

    Args:
        repo: repository name, passed through to the help-message builder.
        branch: branch name, passed through to the help-message builder.
        stderr: captured git stderr to classify (takes precedence as the raw text).
        exc: exception to classify; its ``str()`` is the raw text when ``stderr``
            is ``None``.

    Raises:
        RuntimeError: only the deliberate actionable error, when
            ``fs_normalize.is_permission_failure`` reports a permission failure.
            Any ``Exception`` raised while classifying or building the message
            (including ``RuntimeError`` subclasses such as ``RecursionError`` or
            ``NotImplementedError``) is logged and swallowed so it cannot mask the
            caller's original error.
    """
    permission_failure = False
    help_text = None
    try:
        if not settings.fs_normalize_enabled:
            return
        # Imported lazily, inside the guarded region, so an import problem is
        # treated like any other classification failure.
        from . import fs_normalize
        if fs_normalize.is_permission_failure(stderr=stderr, exc=exc):
            if stderr is not None:
                raw = stderr
            else:
                raw = str(exc) if exc is not None else ""
            help_text = fs_normalize.build_worktree_help(repo, branch, raw=raw)
            permission_failure = True
    except Exception as e:  # noqa: BLE001 - classification must never mask the real error
        logger.warning("worktree permission-classification skipped: %s", e)
        return

    # Raised outside the try block on purpose: this is the only error that may
    # leave this helper, and it can no longer be confused with an incidental
    # RuntimeError coming from the classification code above.
    if permission_failure:
        raise RuntimeError(help_text)
|
|
|
|
|
|
def ensure_worktree(repo: str, branch: str) -> str:
    """Create (or reuse) an isolated worktree for ``branch``. Returns its path.

    The main clone path comes from ``_main_repo(repo)`` and the worktree path from
    ``get_worktree_path(repo, branch)``.

    - If the worktree already exists, it is fetched and ``branch`` is checked out;
      when that checkout succeeds and ``origin/<branch>`` exists, the worktree is
      hard-reset to ``origin/<branch>``. If the checkout fails, the hard reset is
      skipped (it would otherwise rewrite whatever branch HEAD currently points
      at) and a warning is logged.
    - If the branch exists (locally or on origin) it is checked out into a fresh
      worktree; otherwise a new branch is created from ``origin/main``.

    Args:
        repo: repository name (directory name of the main clone).
        branch: branch the worktree is bound to.

    Returns:
        The worktree path.

    Raises:
        FileNotFoundError: the main clone directory does not exist.
        OSError: the parent directory of the worktree could not be created
            (re-raised as-is unless ``_raise_if_permission`` converts it).
        RuntimeError: both ``git worktree add`` attempts failed, or
            ``_raise_if_permission`` classified the failure as a permission problem.
        subprocess.TimeoutExpired: a git command exceeded its timeout.
    """
    main_repo = _main_repo(repo)
    wt = get_worktree_path(repo, branch)

    if not os.path.isdir(main_repo):
        raise FileNotFoundError(f"Main repo not found: {main_repo}")

    # Always refresh refs in the main clone first (the result is not checked).
    subprocess.run(["git", "-C", main_repo, "fetch", "origin"],
                   capture_output=True, timeout=60)

    # Reuse existing worktree (.git may be a dir or a file pointer for worktrees).
    git_marker = os.path.join(wt, ".git")
    if os.path.isdir(git_marker) or os.path.isfile(git_marker):
        subprocess.run(["git", "-C", wt, "fetch", "origin"], capture_output=True, timeout=60)
        checkout = subprocess.run(["git", "-C", wt, "checkout", branch],
                                  capture_output=True, timeout=30)
        if checkout.returncode != 0:
            # HEAD may not be on `branch`; a hard reset now would move the wrong
            # branch, so leave the worktree untouched and report the problem.
            logger.warning(
                "Worktree %s: checkout of %s failed, skipping hard reset: %s",
                wt, branch, (checkout.stderr or b"").decode(errors="replace").strip(),
            )
        else:
            # Align to remote only if the remote branch exists (avoid wiping local-only work).
            rb = subprocess.run(
                ["git", "-C", wt, "rev-parse", "--verify", "--quiet", f"origin/{branch}"],
                capture_output=True, timeout=30,
            )
            if rb.returncode == 0:
                subprocess.run(["git", "-C", wt, "reset", "--hard", f"origin/{branch}"],
                               capture_output=True, timeout=30)
        logger.info("Worktree reused: %s (branch %s)", wt, branch)
        return wt

    # ORCH-057 D1: creating the leading worktree directory next to a legacy
    # root-owned /repos/_wt fails with Permission denied under uid 1000 — turn that
    # into an actionable error (the kill-switch / non-permission path is unchanged).
    try:
        os.makedirs(os.path.dirname(wt), exist_ok=True)
    except OSError as e:
        _raise_if_permission(repo, branch, exc=e)
        raise

    # Try to attach an existing branch (local or remote-tracking) to the new worktree.
    r = subprocess.run(["git", "-C", main_repo, "worktree", "add", wt, branch],
                       capture_output=True, text=True, timeout=60)
    if r.returncode != 0:
        # Branch doesn't exist yet — create it from origin/main.
        r2 = subprocess.run(
            ["git", "-C", main_repo, "worktree", "add", "-b", branch, wt, "origin/main"],
            capture_output=True, text=True, timeout=60,
        )
        if r2.returncode != 0:
            combined = f"{r.stderr.strip()} | {r2.stderr.strip()}"
            # ORCH-057 D1: a permission-class git fatal -> actionable RuntimeError;
            # any other failure keeps the prior raw-stderr contract (AC-2).
            _raise_if_permission(repo, branch, stderr=combined)
            raise RuntimeError(
                f"git worktree add failed for {repo}:{branch}: {combined}"
            )
    logger.info("Worktree ready: %s (branch %s)", wt, branch)
    return wt
|
|
|
|
|
|
def remove_worktree(repo: str, branch: str):
    """Remove the worktree for (repo, branch) — optional cleanup when a task is done.

    Runs ``git worktree remove --force`` on the worktree path and then
    ``git worktree prune`` in the main clone. Neither exit status is inspected.
    """
    main_clone = _main_repo(repo)
    target = get_worktree_path(repo, branch)
    worktree_cmd = ["git", "-C", main_clone, "worktree"]
    # First drop the worktree itself, then prune dangling administrative entries.
    for subcommand in (["remove", "--force", target], ["prune"]):
        subprocess.run(worktree_cmd + subcommand, capture_output=True, timeout=30)
    logger.info(f"Worktree removed: {target}")
|