"""ORCH-062: build-cache-pruner — periodic ``docker builder prune`` on the host. On 07.06.2026 the mva154 host disk silently grew to 100% and took down the WHOLE self-hosting pipeline of every project. The dominant consumer was the **docker build cache** (~11 GB accumulated by frequent rebuilds: ``docker compose up --build`` on prod deploy, the ``--profile staging`` rebuild, the build-once retag behind ``check_staging_image_fresh``). ORCH-063 added the disk-watchdog, which only **signals** (Telegram alert at >=85%) and explicitly deferred the cleanup to this task. **This module is that cleanup: the watchdog signals — the pruner cleans.** It is a background daemon thread modelled **1:1 on** ``src/disk_watchdog.py`` (``threading.Thread(daemon=True)`` + ``threading.Event`` for a clean stop, the ``start()`` / ``stop(timeout)`` / ``status()`` contract, a ``/queue`` snapshot, per-tick never-raise and a kill-switch ``ORCH_BUILD_CACHE_PRUNE_ENABLED``). Each tick runs **strictly** ``docker builder prune -f --filter until=`` (BuildKit GC) on the **host over ssh** — the prod container ships no docker CLI, only ``openssh-client`` (``Dockerfile:11``), so docker operations run over ssh on the host, the same channel ``image_freshness``/``self_deploy`` already use. Invariants (TRZ §5/§6 / ADR-001 D2/D6): * The command touches **only** the BuildKit build cache. There is NO ``docker image prune``, NO ``docker system prune``, no image/container removal of running services and no container stop/restart. The prod ``orchestrator`` container is NEVER restarted (self-hosting blast radius). ``-a/--all`` is only ever added **paired with** the ``until`` age filter — never a bare "nuke everything". * ``STAGE_TRANSITIONS`` / ``QG_CHECKS`` / ``check_*`` / ``_parse_*`` / ``src/stage_engine.py`` / the DB schema are UNCHANGED — the pruner is an operational daemon, not a Quality Gate (like ``reconciler`` / ``job_reaper`` / ``disk_watchdog``). No new migration (last-run / last-result is in-memory, best-effort, may reset on restart — safe: at worst one extra safe prune). * never-raise on two levels: per-command (non-zero rc / timeout / ``OSError`` / no ssh target / output-parse error -> logged and swallowed, the tick lives) and per-tick (outer ``try/except`` in ``_run``, like ``disk_watchdog._run``). The background loop and the pipeline never fall over. * No ssh target configured (``deploy_ssh_host`` empty) -> the tick is a no-op (logged, reflected in ``status().last_error``). This scopes the feature to the self-hosting prod (where ssh is configured) and makes the default safe in any environment without host access — parallel to how ``self_deploy`` / ``image_freshness`` degrade without a target. * Kill-switch ``build_cache_prune_enabled=False`` -> the daemon does not start (``main.lifespan`` guard + ``start()`` guard) and ``/queue`` returns ``{"enabled": false, ...}`` — behaviour 1:1 as before the task. See docs/work-items/ORCH-062/06-adr/ADR-001-build-cache-pruner.md and the cross-cutting docs/architecture/adr/adr-0025-build-cache-pruner.md. """ import logging import re import shlex import subprocess import threading import time from .config import settings from .notifications import send_telegram logger = logging.getLogger("orchestrator.build_cache_pruner") _BYTES_PER_GB = 1024 ** 3 # Multipliers for the "Total reclaimed space: " line emitted by # `docker builder prune`. Decimal units are base-1000 (docker's HumanSize), # the *i* binary units base-1024. Best-effort — only used for observability / # the optional notify threshold, never for a decision. _SIZE_UNITS = { "B": 1, "KB": 1000, "MB": 1000 ** 2, "GB": 1000 ** 3, "TB": 1000 ** 4, "KIB": 1024, "MIB": 1024 ** 2, "GIB": 1024 ** 3, "TIB": 1024 ** 4, } _RECLAIMED_RE = re.compile( r"Total reclaimed space:\s*([\d.]+)\s*([KMGT]?i?B)", re.IGNORECASE ) def decide_prune(prev_run_ts: float | None, now: float, interval_s: float) -> bool: """Pure decision (anti-frequency, NFR-4): should this tick prune? Returns ``True`` when no prune has run yet (``prev_run_ts is None``) or at least ``interval_s`` seconds have elapsed since the last attempt; ``False`` otherwise. Testable without a thread or a real timer (TC-01/TC-02). A non-positive / unusable ``interval_s`` falls open to ``True`` (prune) — the config validator already guards the value, this is belt-and-braces. """ if prev_run_ts is None: return True try: return (now - prev_run_ts) >= interval_s except TypeError: # pragma: no cover - defensive, inputs are numbers return True def _ssh_target() -> str | None: """ssh ``user@host`` for the host prune, or ``None`` when no host is configured (tests / non-self contexts). Mirrors ``image_freshness._ssh_target``. """ host = (settings.deploy_ssh_host or "").strip() if not host: return None user = (settings.deploy_ssh_user or "").strip() return f"{user}@{host}" if user else host def build_prune_command( ssh_target: str, until: str, prune_all: bool = False ) -> list[str]: """Build the ssh command that runs ``docker builder prune`` on the host. The remote is **strictly** ``docker builder prune -f`` (BuildKit GC), with the age filter ``--filter until=`` appended whenever ``until`` is set so the warm recent cache is kept (BR-2/AC-2), and ``-a`` added **only** when ``prune_all`` is set — always paired with the age filter (D2). It NEVER emits ``docker image prune`` / ``docker system prune`` / any image/container removal (BR-3/AC-3). The ``until`` value is ``shlex.quote``-d for the remote shell. """ remote = "docker builder prune -f" if prune_all: remote += " -a" if until: remote += " --filter until=" + shlex.quote(until) return ["ssh", "-o", "StrictHostKeyChecking=no", ssh_target, remote] def parse_reclaimed(output: str) -> int | None: """Best-effort parse of ``Total reclaimed space: `` -> bytes. Returns the reclaimed size in bytes, or ``None`` when the line is absent / unparseable (FR-4: observability is best-effort, never a decision). Never raises. """ try: m = _RECLAIMED_RE.search(output or "") if not m: return None value = float(m.group(1)) unit = m.group(2).upper() mult = _SIZE_UNITS.get(unit) if mult is None: return None return int(value * mult) except Exception as e: # noqa: BLE001 - parsing is best-effort logger.warning("build-cache-pruner: cannot parse reclaimed space: %s", e) return None class BuildCachePruner: """Background daemon running ``docker builder prune`` on the host on a period. Modelled on ``DiskWatchdog``: a ``threading.Thread(daemon=True)`` + a ``threading.Event`` for a clean stop. The only in-memory state is the best-effort ``last_run_ts`` / ``_last_reclaimed`` / ``_last_error`` — all reset on restart, which is safe (at worst one extra safe prune; D6). ``now_provider`` is injectable so the anti-frequency decision is testable deterministically without a real timer. """ def __init__(self, interval_s: float | None = None, now_provider=None): self.interval_s = ( interval_s if interval_s is not None else settings.build_cache_prune_interval_s ) self._now = now_provider or time.time self._stop = threading.Event() self._thread: threading.Thread | None = None # Best-effort in-memory state (no DB row, no migration). self.last_run_ts: float | None = None self._last_reclaimed: int | None = None self._last_reclaimed_human: str | None = None self._last_error: str | None = None # -- config helpers ---------------------------------------------------- @property def _until(self) -> str: return settings.build_cache_prune_until @property def _all(self) -> bool: return settings.build_cache_prune_all @property def _timeout_s(self) -> int: return settings.build_cache_prune_timeout_s @property def _notify_min_gb(self) -> float: return settings.build_cache_prune_notify_min_gb # -- tick -------------------------------------------------------------- def tick(self) -> None: """One pass: prune if the anti-frequency window has elapsed (never-raise). Runs the pure ``decide_prune`` against the injected clock; on a PRUNE decision it performs the host prune (``_prune``), which is itself never-raise. A SKIP decision leaves all state untouched. """ now = self._now() if not decide_prune(self.last_run_ts, now, self.interval_s): return self._prune(now) def _prune(self, now: float) -> None: """Run ``docker builder prune`` on the host over ssh. Never raises (AC-4). Records the attempt time (``last_run_ts``) up front so the anti-frequency window advances even when the command fails or there is no ssh target. Every failure mode — no target, timeout, non-zero rc, ``OSError`` — is logged, stored in ``_last_error`` and swallowed; the loop stays alive. """ self.last_run_ts = now target = _ssh_target() if not target: self._last_error = "no ssh host configured (deploy_ssh_host empty)" logger.info("build-cache-pruner: %s — tick is a no-op", self._last_error) return cmd = build_prune_command(target, self._until, self._all) try: r = subprocess.run( cmd, capture_output=True, text=True, timeout=self._timeout_s ) except subprocess.TimeoutExpired: self._last_error = f"timeout after {self._timeout_s}s" logger.warning("build-cache-pruner: prune %s", self._last_error) return except (subprocess.SubprocessError, OSError) as e: self._last_error = f"ssh/subprocess error: {e}" logger.warning("build-cache-pruner: %s", self._last_error) return if r.returncode != 0: self._last_error = ( f"rc={r.returncode}: {(r.stderr or '').strip()[:200]}" ) logger.warning("build-cache-pruner: prune %s", self._last_error) return # Success: parse the best-effort reclaimed size and clear the error. self._last_error = None reclaimed = parse_reclaimed(r.stdout or "") self._last_reclaimed = reclaimed self._last_reclaimed_human = self._format_reclaimed(reclaimed) logger.info( "build-cache-pruner: pruned host build cache (until=%s, all=%s), " "reclaimed=%s", self._until, self._all, self._last_reclaimed_human or "unknown", ) self._maybe_notify(reclaimed) @staticmethod def _format_reclaimed(reclaimed: int | None) -> str | None: """Human GB label for a reclaimed byte count (best-effort, never raises).""" if reclaimed is None: return None try: return f"{reclaimed / _BYTES_PER_GB:.2f} GB" except Exception: # noqa: BLE001 - observability only return None def _maybe_notify(self, reclaimed: int | None) -> None: """Telegram when reclaimed >= ``notify_min_gb`` (>0 to enable). Never raises.""" try: min_gb = self._notify_min_gb if not min_gb or min_gb <= 0 or reclaimed is None: return gb = reclaimed / _BYTES_PER_GB if gb < min_gb: return self._send( f"\U0001f9f9 build-cache-pruner: освобождено {gb:.2f} ГБ " f"docker build cache на хосте (until={self._until})." ) except Exception as e: # noqa: BLE001 - notify is best-effort logger.warning("build-cache-pruner: notify decision failed: %s", e) def _send(self, text: str) -> None: """Send a Telegram message (notifying). Never raises (best-effort).""" try: send_telegram(text) except Exception as e: # noqa: BLE001 - delivery is best-effort logger.warning("build-cache-pruner: telegram send failed: %s", e) # -- loop / lifecycle -------------------------------------------------- def _tick(self) -> None: try: self.tick() except Exception as e: # noqa: BLE001 - inner never-raise logger.error("build-cache-pruner: tick error: %s", e) def _run(self) -> None: logger.info( "BuildCachePruner started (interval=%ss, until=%s, all=%s, " "timeout=%ss, enabled=%s)", self.interval_s, self._until, self._all, self._timeout_s, settings.build_cache_prune_enabled, ) while not self._stop.is_set(): try: self._tick() except Exception as e: # noqa: BLE001 - outer never-raise logger.error("BuildCachePruner loop error: %s", e) self._stop.wait(self.interval_s) logger.info("BuildCachePruner stopped") def start(self) -> None: """Start the daemon thread (idempotent: a live thread is a no-op). Honours the kill-switch: ``build_cache_prune_enabled=False`` -> no-op (the daemon never starts; ``main.lifespan`` also guards, AC-5/TC-07). """ if not settings.build_cache_prune_enabled: return if self._thread and self._thread.is_alive(): return self._stop.clear() self._thread = threading.Thread( target=self._run, name="build-cache-pruner", daemon=True ) self._thread.start() def stop(self, timeout: float = 5.0) -> None: self._stop.set() if self._thread: self._thread.join(timeout=timeout) def status(self) -> dict: """Build-cache-pruner snapshot for /queue observability (FR-4/AC-7). Never raises — returns a minimal ``{"enabled": ...}`` on any error. """ try: return { "enabled": settings.build_cache_prune_enabled, "interval_s": self.interval_s, "until": self._until, "all": self._all, "last_run_ts": self.last_run_ts, "last_reclaimed_bytes": self._last_reclaimed, "last_reclaimed": self._last_reclaimed_human, "last_error": self._last_error, } except Exception as e: # noqa: BLE001 - observability must never raise logger.warning("build-cache-pruner: status() failed: %s", e) return {"enabled": settings.build_cache_prune_enabled} # Module-level singleton used by the FastAPI lifespan. build_cache_pruner = BuildCachePruner()