fix(deploy): resilient-pull hygiene for dirty shared deploy-base (ORCH-112)
Self-deploy git pull blocked on a dirty shared main checkout (manual/abandoned WIP from a failed/cancelled task) — incident ORCH-111: "Your local changes to src/config.py would be overwritten by merge" wedged the prod deploy and required manual intervention (a group risk on self-hosting). The deploy hook (--deploy) now converges the deploy-base to a clean, current origin/main BEFORE the pull (git fetch + reset --hard origin/main + a SCOPED `git clean -fd`, NEVER -x), strictly preserving the rollback/log artefacts (.deploy-prev-image-* / deploy-hook.log via -e), gitignored .env/data/*.db/build (no -x), and sibling/.git state (out of clean scope). Gated by CHECKOUT_HYGIENE env injected by self_deploy.build_deploy_command only when the new pure never-raise leaf src/checkout_hygiene.py says applies(repo) (kill-switch + self-hosting scope). Convergence after failed/cancelled is this same deploy-time self-heal — cancel_task is NOT extended and no background janitor is introduced. Observability: the hook writes a `hygiene` sentinel, the Phase-C finalizer reads it and sends a best-effort Telegram alert. Additive, under kill-switch (ORCH_CHECKOUT_HYGIENE_ENABLED, default true; off -> bare `git pull origin main` 1:1 before ORCH-112), never-raise, self-hosting scope. STAGE_TRANSITIONS / QG_CHECKS / check_* / machine-verdict keys / DB schema / the hook exit-code contract (0/1/2, ORCH-036) are byte-for-byte untouched. Coverage: tests/test_deploy_checkout_hygiene.py (TC-01..TC-10; real-hook shell simulation in a temp git repo, no network/prod/ssh, + unit). TC-01 is the mandatory ORCH-111 regression (RED before the fix, GREEN after). Docs golden source updated in the same PR (CLAUDE.md, CHANGELOG.md, .env.example; INFRA.md / architecture/README.md / adr-0044 written at the architecture stage). Refs: ORCH-112 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
214
src/checkout_hygiene.py
Normal file
214
src/checkout_hygiene.py
Normal file
@@ -0,0 +1,214 @@
|
||||
"""ORCH-112 (ADR-001 / adr-0044): deploy-base checkout-hygiene leaf — pure policy.
|
||||
|
||||
Leaf module mirroring ``src/serial_gate.py`` / ``src/cancel.py`` / ``src/self_deploy.py``:
|
||||
pure, unit-testable, never-raise functions over ``config`` + the deploy-state sentinels.
|
||||
Module-level imports are limited to ``config`` (and stdlib); ``self_deploy``,
|
||||
``qg.checks.is_self_hosting_repo`` and ``notifications`` are imported LAZILY so this
|
||||
stays a leaf and an import cycle can never form.
|
||||
|
||||
What it answers / does (the MECHANISM — git fetch/reset/clean — lives in the host
|
||||
deploy hook ``scripts/orchestrator-deploy-hook.sh`` block "2a. Resilient pull"; this
|
||||
leaf only decides conditionality, builds the env gate, reads the report and alerts):
|
||||
|
||||
* ``applies(repo)`` — is resilient-pull hygiene REAL here?
|
||||
* ``hook_env(repo, work_item_id)`` — the ``CHECKOUT_HYGIENE=1 HYGIENE_REPORT=…``
|
||||
env prefix injected into the detached
|
||||
deploy-hook command ("" when not applies).
|
||||
* ``read_report(repo, work_item_id)`` — read the ``hygiene`` sentinel the hook wrote.
|
||||
* ``alert_dirty(repo, work_item_id, report)``— best-effort Telegram + structured log.
|
||||
* ``snapshot()`` — read-only block for ``GET /queue``.
|
||||
|
||||
never-raise contract (self-hosting safety): every public function degrades
|
||||
conservatively. ``applies`` -> False on error (hygiene inert == kill-switch off, the
|
||||
safe default that keeps the bare ``git pull`` 1:1 as before ORCH-112). ``hook_env`` ->
|
||||
"" on error (no env -> the hook's ``${CHECKOUT_HYGIENE:-0}`` guard stays 0). The report
|
||||
reader / alert swallow every error so a deploy is NEVER crashed by an observability
|
||||
hiccup (D5 / AC-8).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shlex
|
||||
|
||||
from .config import settings
|
||||
|
||||
logger = logging.getLogger("orchestrator.checkout_hygiene")
|
||||
|
||||
# Sentinel filename the hook writes (HYGIENE_REPORT points at it) and read_report
|
||||
# reads back. Lives in the SAME deploy-state dir as self_deploy's ``result`` (shared
|
||||
# mount visible to both host and container).
|
||||
REPORT_NAME = "hygiene"
|
||||
|
||||
# Repo tokens in the CSV scope must match this (mirrors serial_gate._REPO_TOKEN). The
|
||||
# CSV is operator config, not user input, but the guard is mandatory; an invalid token
|
||||
# is dropped.
|
||||
# Anchored allow-list: one or more ASCII letters, digits, ".", "_" or "-".
_REPO_TOKEN = re.compile(r"^[A-Za-z0-9._-]+$")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Conditionality (mirrors self_deploy_applies / serial_gate_applies)
|
||||
# ---------------------------------------------------------------------------
|
||||
def _scope_repos() -> set[str]:
    """Return the validated repo tokens listed in ``settings.checkout_hygiene_repos``.

    The setting is a comma-separated string. Each piece is whitespace-stripped;
    empty pieces are ignored and pieces that do not match ``_REPO_TOKEN`` are
    dropped with a warning. A blank or unreadable setting yields an empty set,
    which ``applies`` interprets as "self-hosting repo only".

    Never raises on a bad setting: any error while reading it gives ``set()``.
    """
    try:
        csv_value = (settings.checkout_hygiene_repos or "").strip()
    except Exception:  # noqa: BLE001 - never-raise: unreadable config == empty scope
        return set()

    accepted: set[str] = set()
    # A blank CSV splits into [""], which the emptiness guard below skips, so
    # no separate early return is needed for that case.
    for piece in csv_value.split(","):
        candidate = piece.strip()
        if not candidate:
            continue
        if _REPO_TOKEN.match(candidate) is None:
            logger.warning("checkout_hygiene: dropping invalid repo token %r from CSV", candidate)
            continue
        accepted.add(candidate)
    return accepted
|
||||
|
||||
|
||||
def applies(repo: str) -> bool:
    """Decide whether resilient-pull hygiene is active for ``repo`` (D3 / AC-6).

    Decision order:

    1. ``settings.checkout_hygiene_enabled`` falsy (or missing) -> ``False``
       (kill-switch; no CHECKOUT_HYGIENE env is injected).
    2. ``_scope_repos()`` non-empty -> ``True`` only if the stripped ``repo``
       is one of the listed tokens.
    3. Otherwise -> whatever ``qg.checks.is_self_hosting_repo(repo)`` returns.

    Never raises: any exception is logged as a warning and treated as
    ``False``, i.e. hygiene stays inert.
    """
    try:
        if getattr(settings, "checkout_hygiene_enabled", False):
            listed_repos = _scope_repos()
            if not listed_repos:
                # Imported lazily so this module has no qg dependency at load time.
                from .qg.checks import is_self_hosting_repo

                return is_self_hosting_repo(repo)
            return (repo or "").strip() in listed_repos
        return False
    except Exception as exc:  # noqa: BLE001 - never-raise
        logger.warning("checkout_hygiene.applies error for %s: %s", repo, exc)
        return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Env gate injected into the detached deploy-hook command (Phase B wiring)
|
||||
# ---------------------------------------------------------------------------
|
||||
def report_path_host(repo: str, work_item_id: str | None) -> str:
    """Return the host-side path of the ``hygiene`` sentinel for this deploy.

    The path is ``self_deploy.host_state_dir(repo, work_item_id)`` joined with
    ``REPORT_NAME``. This helper has no exception guard of its own; ``hook_env``
    calls it inside its never-raise ``try`` block.
    """
    # Lazy import: keeps self_deploy out of this module's import-time dependencies.
    from . import self_deploy

    state_dir = self_deploy.host_state_dir(repo, work_item_id)
    return os.path.join(state_dir, REPORT_NAME)
|
||||
|
||||
|
||||
def hook_env(repo: str, work_item_id: str | None) -> str:
    """Build the env-assignment prefix for the detached deploy-hook command.

    When ``applies(repo)`` is true the result is
    ``CHECKOUT_HYGIENE=1 HYGIENE_REPORT=<path>``, where ``<path>`` is the
    shell-quoted value of ``report_path_host(repo, work_item_id)``. In every
    other case the result is ``""`` so that no hygiene variables reach the hook.

    Never raises: any exception is logged as a warning and yields ``""``.
    """
    try:
        if applies(repo):
            quoted_report = shlex.quote(report_path_host(repo, work_item_id))
            return f"CHECKOUT_HYGIENE=1 HYGIENE_REPORT={quoted_report}"
        return ""
    except Exception as exc:  # noqa: BLE001 - never-raise -> no hygiene env
        logger.warning("checkout_hygiene.hook_env error for %s/%s: %s", repo, work_item_id, exc)
        return ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Report sentinel reader (Phase C observability)
|
||||
# ---------------------------------------------------------------------------
|
||||
def read_report(repo: str, work_item_id: str | None) -> dict | None:
    """Read the ``hygiene`` sentinel from the container view of the deploy-state dir.

    Expected sentinel body (written by the deploy hook when the base was dirty)::

        dirty=1
        <git status --porcelain lines...>

    Returns:
        ``{"dirty": True, "paths": [...]}`` when the file exists and contains a
        ``dirty=1`` line. ``paths`` holds every other non-blank line,
        whitespace-stripped, excluding any line that starts with ``dirty=``.
        ``None`` when the file is missing, has no ``dirty=1`` line, or cannot
        be read.

    Never raises: read errors other than "file not found" are logged as a
    warning and reported as ``None``.
    """
    try:
        # Lazy import: keeps self_deploy out of this module's import-time dependencies.
        from . import self_deploy

        sentinel = os.path.join(self_deploy.container_state_dir(repo, work_item_id), REPORT_NAME)
        # errors="replace": the body is raw `git status` output, so a path with
        # bytes that are not valid UTF-8 must not turn into a decode error that
        # discards the whole report (and with it the dirty-base alert).
        with open(sentinel, "r", encoding="utf-8", errors="replace") as f:
            raw = f.read()
    except FileNotFoundError:
        # No sentinel is the normal case, so it is not logged.
        return None
    except Exception as e:  # noqa: BLE001 - never-raise
        logger.warning("checkout_hygiene.read_report error for %s/%s: %s", repo, work_item_id, e)
        return None

    entries = [ln.strip() for ln in raw.splitlines()]
    if "dirty=1" not in entries:
        return None
    paths = [entry for entry in entries if entry and not entry.startswith("dirty=")]
    return {"dirty": True, "paths": paths}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Best-effort Telegram alert (Phase C observability) — D5 / AC-8
|
||||
# ---------------------------------------------------------------------------
|
||||
def alert_dirty(repo: str, work_item_id: str | None, report: dict | None) -> bool:
    """Log and send a best-effort Telegram message about a dirty deploy-base (D5 / AC-8).

    Does nothing and returns ``False`` when ``report`` is empty or its
    ``"dirty"`` entry is falsy. Otherwise it logs a warning with the path count
    and at most the first 20 paths, then calls ``notifications.send_telegram``.

    Returns:
        ``True`` when the ``send_telegram`` call completed without raising (its
        return value is not inspected); ``False`` otherwise.

    Never raises: any exception is logged as a warning and yields ``False``.
    """
    try:
        if not (report and report.get("dirty")):
            return False

        dirty_paths = report.get("paths") or []
        count = len(dirty_paths)
        # The structured log is emitted before the lazy import so that it is
        # recorded even if the notifications import or the send fails.
        logger.warning(
            "checkout_hygiene: dirty deploy-base converged to origin/main for %s/%s "
            "(%d path(s)): %s",
            repo,
            work_item_id,
            count,
            dirty_paths[:20],
        )

        from .notifications import link_for, send_telegram

        message = (
            f"\U0001f9f9 {link_for(work_item_id)}: грязная deploy-база сведена к "
            f"origin/main перед прод-деплоем ({count} путь(ей) сброшено)."
        )
        send_telegram(message)
        return True
    except Exception as exc:  # noqa: BLE001 - never-raise: alert is best-effort
        logger.warning("checkout_hygiene.alert_dirty error for %s/%s: %s", repo, work_item_id, exc)
        return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Observability snapshot for GET /queue (D3, optional)
|
||||
# ---------------------------------------------------------------------------
|
||||
def snapshot() -> dict:
    """Return a read-only checkout-hygiene summary for ``GET /queue``.

    Keys:
        ``enabled``: ``bool`` of ``settings.checkout_hygiene_enabled``
            (``False`` if missing or unreadable).
        ``repos``: the raw ``settings.checkout_hygiene_repos`` value
            (``""`` if missing, falsy or unreadable).
        ``scope``: ``"csv"`` when ``repos`` is non-blank, otherwise
            ``"self-hosting-only"``.

    Errors while reading either setting fall back to the defaults above.
    """
    summary = {"enabled": False, "repos": "", "scope": "self-hosting-only"}

    try:
        summary["enabled"] = bool(getattr(settings, "checkout_hygiene_enabled", False))
    except Exception:  # noqa: BLE001 - keep the pre-seeded default
        pass

    try:
        summary["repos"] = getattr(settings, "checkout_hygiene_repos", "") or ""
    except Exception:  # noqa: BLE001 - keep the pre-seeded default
        pass

    if (summary["repos"] or "").strip():
        summary["scope"] = "csv"
    return summary
|
||||
@@ -290,6 +290,25 @@ class Settings(BaseSettings):
|
||||
deploy_prod_compose_profile: str = ""
|
||||
deploy_prod_prev_image_file: str = ".deploy-prev-image-prod"
|
||||
|
||||
# ORCH-112: deploy-base checkout-hygiene (resilient-pull). The self-deploy hook's
|
||||
# bare `git pull origin main` in the shared main clone blocked on a dirty working
|
||||
# tree (manual/abandoned WIP left by a failed/cancelled task — incident ORCH-111
|
||||
# from ORCH-104). The fix converges the deploy-base to a clean, current origin/main
|
||||
# (git fetch + reset --hard + a SCOPED `git clean -fd`, NEVER `-x`) BEFORE the pull,
|
||||
# gated by the CHECKOUT_HYGIENE env injected by self_deploy.build_deploy_command.
|
||||
# Pure leaf: src/checkout_hygiene.py (never-raise). Not a Quality Gate / not a stage
|
||||
# — STAGE_TRANSITIONS / QG_CHECKS / check_* / machine-verdict / DB schema / the
|
||||
# hook's exit-code contract (0/1/2, ORCH-036) are byte-for-byte untouched.
|
||||
#
|
||||
# checkout_hygiene_enabled -> kill-switch (env ORCH_CHECKOUT_HYGIENE_ENABLED).
|
||||
# False -> the hook gets no CHECKOUT_HYGIENE env ->
|
||||
# bare `git pull origin main` 1:1 as before ORCH-112.
|
||||
# checkout_hygiene_repos -> CSV scope (env ORCH_CHECKOUT_HYGIENE_REPOS). Empty
|
||||
# -> only the self-hosting repo (orchestrator). Mirrors
|
||||
# self_deploy_repos (a self-hosting prod-deploy feature).
|
||||
checkout_hygiene_enabled: bool = True
|
||||
checkout_hygiene_repos: str = ""
|
||||
|
||||
# ORCH-058: staging-image provenance before the BUILD-ONCE retag to prod.
|
||||
# Closes the INV-FRESH gap (ADR-001): the BUILD-ONCE retag (ORCH-36) promotes
|
||||
# the staging image to prod WITHOUT a rebuild, assuming the staging image is
|
||||
|
||||
@@ -214,6 +214,7 @@ async def queue():
|
||||
from . import cancel
|
||||
from . import bug_fast_track
|
||||
from . import lessons
|
||||
from . import checkout_hygiene
|
||||
from .disk_watchdog import disk_watchdog
|
||||
from .build_cache_pruner import build_cache_pruner
|
||||
return {
|
||||
@@ -254,6 +255,9 @@ async def queue():
|
||||
# kill-switch, label, scope, bug-task counts + the structural savings metric
|
||||
# (architecture stages skipped). Additive block; never-raise.
|
||||
"bug_fast_track": bug_fast_track.snapshot(),
|
||||
# ORCH-112 (D3): deploy-base checkout-hygiene observability (read-only) —
|
||||
# kill-switch + scope. Additive block; never-raise.
|
||||
"checkout_hygiene": checkout_hygiene.snapshot(),
|
||||
# ORCH-098 (FR-4 / AC-4): lessons-journal observability (read-only) —
|
||||
# kill-switch + counts by type/status + last N lessons. Additive block;
|
||||
# never-raise (snapshot() returns {"enabled": ...} minimum on error).
|
||||
|
||||
@@ -239,7 +239,7 @@ def build_deploy_command(repo: str, work_item_id: str | None, branch: str) -> li
|
||||
``expected_revision`` returns ``""`` and the env is omitted, keeping the hook's
|
||||
backward-compatible "no provenance check" behaviour (AC-5 / AC-7).
|
||||
"""
|
||||
from . import image_freshness
|
||||
from . import checkout_hygiene, image_freshness
|
||||
|
||||
host_dir = host_state_dir(repo, work_item_id)
|
||||
result_sentinel = os.path.join(host_dir, RESULT)
|
||||
@@ -262,6 +262,12 @@ def build_deploy_command(repo: str, work_item_id: str | None, branch: str) -> li
|
||||
expected_rev = image_freshness.expected_revision(repo, branch)
|
||||
if expected_rev:
|
||||
env_assignments += f" EXPECTED_REVISION={shlex.quote(expected_rev)}"
|
||||
# ORCH-112: inject CHECKOUT_HYGIENE=1 HYGIENE_REPORT=<path> only when the leaf says
|
||||
# hygiene applies (kill-switch + self-hosting scope). Empty -> the hook's
|
||||
# ${CHECKOUT_HYGIENE:-0} guard stays 0 -> bare `git pull` 1:1 as before ORCH-112.
|
||||
hygiene_env = checkout_hygiene.hook_env(repo, work_item_id)
|
||||
if hygiene_env:
|
||||
env_assignments += f" {hygiene_env}"
|
||||
inner = (
|
||||
f"cd {shlex.quote(settings.deploy_host_repo_path)} && "
|
||||
f"{env_assignments} "
|
||||
|
||||
@@ -1957,6 +1957,17 @@ def run_deploy_finalizer(job: dict):
|
||||
logger.info(
|
||||
f"Task {task_id}: deploy finalized, hook exit={code} -> deploy_status={status}"
|
||||
)
|
||||
|
||||
# ORCH-112 (D5 / AC-8): if the host hook converged a DIRTY deploy-base to
|
||||
# origin/main before the pull, surface it (structured log + best-effort Telegram).
|
||||
# never-raise — observability must never crash the finalizer.
|
||||
try:
|
||||
from . import checkout_hygiene
|
||||
report = checkout_hygiene.read_report(repo, work_item_id)
|
||||
if report:
|
||||
checkout_hygiene.alert_dirty(repo, work_item_id, report)
|
||||
except Exception as e: # noqa: BLE001 - never break the finalizer
|
||||
logger.warning("Task %s: checkout-hygiene report read failed: %s", task_id, e)
|
||||
if status == "SUCCESS" and work_item_id:
|
||||
plane_add_comment(
|
||||
work_item_id,
|
||||
|
||||
Reference in New Issue
Block a user