feat(fs): legacy root-owned ownership detect + actionable worktree error (ORCH-057)
Follow-up ORCH-040: legacy root:root files in /repos broke worktree creation under uid 1000 with a raw "Permission denied" (agent never started, no diagnosis).

Three additive, kill-switch-reversible layers; STAGE_TRANSITIONS / QG_CHECKS / check_* / machine-verdict keys / DB schema are byte-for-byte unchanged.

- D1: ensure_worktree classifies the permission class and raises an actionable RuntimeError (cause + chown command + INFRA.md ref); non-permission errors keep the prior raw-stderr contract; kill-switch off -> contract 1:1 as before ORCH-057.
- D2: new never-raise leaf src/fs_normalize.py — scan_ownership (TTL-cached, early-exit per root), applies()-first scope (empty CSV -> self-hosting only), opt-in normalize() that chowns ONLY when privileged (no-op under uid 1000).
- D3: best-effort startup detect in main.lifespan (WARNING + Telegram on mismatch, never-fatal); read-only fs_ownership block in GET /queue; POST /fs-normalize/check. Claim is NOT blocked — the clear early outcome is delivered by D1 at launch.
- Docs/config: .env.example flags + CHANGELOG (architecture README / adr-0031 / INFRA.md procedure already landed on the branch).
- Tests: test_fs_normalize.py, test_git_worktree_perm.py, test_fs_normalize_startup.py, test_api_queue.py (TC-01..TC-12). Full suite green.

Refs: ORCH-057
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
63
src/main.py
63
src/main.py
@@ -89,6 +89,44 @@ async def lifespan(app: FastAPI):
|
||||
except Exception as e:
|
||||
log.warning(f"Log rotation skipped: {e}")
|
||||
|
||||
# ORCH-057 (D3 / FR-3): best-effort legacy-ownership detect. Surfaces a
|
||||
# PROACTIVE operator signal (WARNING + Telegram) when /repos still holds
|
||||
# root-owned files after the uid migration, BEFORE a task fails on launch.
|
||||
# never-fatal (mirrors lease-reclaim / log-rotation above): a detect error must
|
||||
# not crash the start of the shared instance. The actual "clear, early" failure
|
||||
# is delivered by the actionable error in ensure_worktree (D1) — claim is NOT
|
||||
# blocked (ADR-001 D3). Honours ORCH_FS_NORMALIZE_ENABLED inside scan_ownership.
|
||||
try:
|
||||
from .fs_normalize import scan_ownership, healing_command, normalize
|
||||
from .config import settings as _fs_settings
|
||||
scan = scan_ownership()
|
||||
if scan.mismatch:
|
||||
log.warning(
|
||||
"FS-ownership mismatch: %d root(s) with files not owned by uid %s "
|
||||
"(%s; sample: %s). Heal: %s",
|
||||
len(scan.roots_mismatch), scan.target_uid,
|
||||
", ".join(scan.roots_mismatch), scan.sample_path, healing_command(),
|
||||
)
|
||||
try:
|
||||
from .notifications import send_telegram
|
||||
send_telegram(
|
||||
"⚠️ Orchestrator: обнаружены legacy root-owned файлы в "
|
||||
f"{', '.join(scan.roots_mismatch)} (uid != {scan.target_uid}). "
|
||||
f"Первый запуск задачи может упасть на создании worktree. "
|
||||
f"Лечение: {healing_command()}"
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
# D4 / FR-4: opt-in auto-chown ONLY when privileged (no-op under uid 1000).
|
||||
if getattr(_fs_settings, "fs_normalize_auto", False):
|
||||
try:
|
||||
res = normalize()
|
||||
log.warning("FS-ownership auto-normalize: %s", res.get("note"))
|
||||
except Exception as e: # noqa: BLE001
|
||||
log.warning("FS-ownership auto-normalize skipped: %s", e)
|
||||
except Exception as e:
|
||||
log.warning(f"FS-ownership detect skipped: {e}")
|
||||
|
||||
# Start the background job-queue worker (ORCH-1).
|
||||
from .queue_worker import worker
|
||||
worker.start()
|
||||
@@ -171,6 +209,7 @@ async def queue():
|
||||
from . import task_deps
|
||||
from . import serial_gate
|
||||
from . import coverage_gate
|
||||
from . import fs_normalize
|
||||
from . import labels
|
||||
from . import cancel
|
||||
from .disk_watchdog import disk_watchdog
|
||||
@@ -193,6 +232,10 @@ async def queue():
|
||||
# ORCH-027 (FR-7 / AC-9): coverage-gate observability (read-only) —
|
||||
# kill-switch, scope, policy/floor/epsilon, per-repo baselines. Additive block.
|
||||
"coverage": coverage_gate.snapshot(),
|
||||
# ORCH-057 (D6 / AC-4): legacy-ownership detect observability (read-only) —
|
||||
# kill-switch, scope, target_uid, mismatch + affected roots (TTL-cached scan).
|
||||
# Additive block; never-raise.
|
||||
"fs_ownership": fs_normalize.snapshot(),
|
||||
# ORCH-089 (D7): auto-mode-by-label observability (read-only) — kill-switch,
|
||||
# label names, scope. Additive block.
|
||||
"auto_labels": labels.snapshot(),
|
||||
@@ -262,6 +305,26 @@ async def serial_gate_unfreeze(repo: str = ""):
|
||||
return {"ok": True, "repo": repo, "cleared": cleared, "frozen": frozen}
|
||||
|
||||
|
||||
@app.post("/fs-normalize/check")
async def fs_normalize_check(normalize: bool = False):
    """ORCH-057 (D6 / AC-4): run a forced legacy-ownership detect and report it.

    The scan is requested with ``force=True`` (bypassing the TTL cache) and its
    snapshot is returned together with the healing command. Modeled on
    ``POST /serial-gate/unfreeze``.

    With ``normalize=true`` an opt-in ``chown`` is attempted as well — a no-op
    under uid 1000 (the prod-self case), effective only when the process is
    privileged (D4). The real fix remains the operator procedure described in
    docs/operations/INFRA.md, section «Миграция uid» ("uid migration").
    Read-only / never-raise otherwise.

    Returns a dict with ``ok``, ``scan`` and ``healing``; when ``normalize`` is
    set it also carries ``normalize`` and ``scan`` is taken from a second
    forced scan performed after the normalize attempt.
    """
    from . import fs_normalize as fs_mod

    initial_scan = fs_mod.scan_ownership(force=True)
    response = {
        "ok": True,
        "scan": initial_scan.to_dict(),
        "healing": fs_mod.healing_command(),
    }

    # Plain check: nothing else to do, report the fresh scan as-is.
    if not normalize:
        return response

    response["normalize"] = fs_mod.normalize()
    # Scan again so the reported snapshot reflects any change a privileged
    # run made.
    response["scan"] = fs_mod.scan_ownership(force=True).to_dict()
    return response
|
||||
|
||||
|
||||
@app.post("/coverage/baseline")
|
||||
async def coverage_set_baseline(repo: str = "", value: float | None = None):
|
||||
"""ORCH-027 (D8): manually set/override the per-repo coverage baseline.
|
||||
|
||||
Reference in New Issue
Block a user