# orchestrator/src/build_cache_pruner.py

"""ORCH-062: build-cache-pruner — periodic ``docker builder prune`` on the host.

On 07.06.2026 the mva154 host disk silently grew to 100% and took down the WHOLE
self-hosting pipeline of every project. The dominant consumer was the **docker
build cache** (~11 GB accumulated by frequent rebuilds: ``docker compose up
--build`` on prod deploy, the ``--profile staging`` rebuild, the build-once retag
behind ``check_staging_image_fresh``). ORCH-063 added the disk-watchdog, which
only **signals** (Telegram alert at >=85%) and explicitly deferred the cleanup to
this task. **This module is that cleanup: the watchdog signals — the pruner
cleans.**

It is a background daemon thread modelled **1:1 on** ``src/disk_watchdog.py``
(``threading.Thread(daemon=True)`` + ``threading.Event`` for a clean stop, the
``start()`` / ``stop(timeout)`` / ``status()`` contract, a ``/queue`` snapshot,
per-tick never-raise and a kill-switch ``ORCH_BUILD_CACHE_PRUNE_ENABLED``). Each
tick runs **strictly** ``docker builder prune -f --filter until=<until>`` (BuildKit
GC) on the **host over ssh** — the prod container ships no docker CLI, only
``openssh-client`` (``Dockerfile:11``), so docker operations run over ssh on the
host, the same channel ``image_freshness``/``self_deploy`` already use.

Invariants (TRZ §5/§6 / ADR-001 D2/D6):
  * The command touches **only** the BuildKit build cache. There is NO
    ``docker image prune``, NO ``docker system prune``, no image/container removal
    of running services and no container stop/restart. The prod ``orchestrator``
    container is NEVER restarted (self-hosting blast radius). ``-a/--all`` is only
    ever added **paired with** the ``until`` age filter — never a bare
    "nuke everything".
  * ``STAGE_TRANSITIONS`` / ``QG_CHECKS`` / ``check_*`` / ``_parse_*`` /
    ``src/stage_engine.py`` / the DB schema are UNCHANGED — the pruner is an
    operational daemon, not a Quality Gate (like ``reconciler`` / ``job_reaper`` /
    ``disk_watchdog``). No new migration (last-run / last-result is in-memory,
    best-effort, may reset on restart — safe: at worst one extra safe prune).
  * never-raise on two levels: per-command (non-zero rc / timeout / ``OSError`` /
    no ssh target / output-parse error -> logged and swallowed, the tick lives)
    and per-tick (outer ``try/except`` in ``_run``, like ``disk_watchdog._run``).
    The background loop and the pipeline never fall over.
  * No ssh target configured (``deploy_ssh_host`` empty) -> the tick is a no-op
    (logged, reflected in ``status().last_error``). This scopes the feature to the
    self-hosting prod (where ssh is configured) and makes the default safe in any
    environment without host access — parallel to how ``self_deploy`` /
    ``image_freshness`` degrade without a target.
  * Kill-switch ``build_cache_prune_enabled=False`` -> the daemon does not start
    (``main.lifespan`` guard + ``start()`` guard) and ``/queue`` returns
    ``{"enabled": false, ...}`` — behaviour 1:1 as before the task.

See docs/work-items/ORCH-062/06-adr/ADR-001-build-cache-pruner.md and the
cross-cutting docs/architecture/adr/adr-0025-build-cache-pruner.md.
"""

import logging
import re
import shlex
import subprocess
import threading
import time

from .config import settings
from .notifications import send_telegram

logger = logging.getLogger("orchestrator.build_cache_pruner")

# Divisor used when a byte count is rendered as a "GB" figure or compared with
# the notify threshold. Note that this is the binary gigabyte (2**30 bytes).
_BYTES_PER_GB = 1024 ** 3

# Multipliers for the size token on the prune summary line. Decimal units are
# base-1000 (docker's HumanSize), the *i* binary units base-1024. Keys are
# upper-case: callers upper-case the matched unit before the lookup, which also
# covers docker's lower-case "kB". Best-effort — only used for observability /
# the optional notify threshold, never for a decision.
_SIZE_UNITS = {
    "B": 1,
    "KB": 1000, "MB": 1000 ** 2, "GB": 1000 ** 3, "TB": 1000 ** 4,
    "KIB": 1024, "MIB": 1024 ** 2, "GIB": 1024 ** 3, "TIB": 1024 ** 4,
}

# Summary line of the prune output. Two spellings are accepted:
#   * "Total reclaimed space: 1.2GB" — the classic `docker builder prune`;
#   * "Total:\t1.2GB"                — the buildx plugin, which the docker CLI
#     forwards `docker builder ...` to when buildx is the default builder.
# Group 1 is the number, group 2 the unit (case-insensitive).
_RECLAIMED_RE = re.compile(
    r"Total(?: reclaimed space)?:\s*([\d.]+)\s*([KMGT]?i?B)", re.IGNORECASE
)


def decide_prune(prev_run_ts: float | None, now: float, interval_s: float) -> bool:
    """Pure decision (anti-frequency, NFR-4): should this tick prune?

    Returns ``True`` when no prune has run yet (``prev_run_ts is None``) or at
    least ``interval_s`` seconds have elapsed since the last attempt; ``False``
    otherwise. Testable without a thread or a real timer (TC-01/TC-02). A
    non-positive / unusable ``interval_s`` falls open to ``True`` (prune) — the
    config validator already guards the value, this is belt-and-braces.
    """
    if prev_run_ts is None:
        return True
    try:
        return (now - prev_run_ts) >= interval_s
    except TypeError:  # pragma: no cover - defensive, inputs are numbers
        return True


def _ssh_target() -> str | None:
    """Return the ssh destination for the host prune, or ``None`` if unset.

    Reads ``settings.deploy_ssh_host`` and ``settings.deploy_ssh_user``; both
    are treated as empty when falsy and are stripped of surrounding whitespace.

    Returns:
        ``None`` when no host is configured (tests / non-self contexts),
        ``"user@host"`` when a user is configured, otherwise just ``"host"``.
        Mirrors ``image_freshness._ssh_target``.
    """
    host = (settings.deploy_ssh_host or "").strip()
    if host == "":
        return None
    user = (settings.deploy_ssh_user or "").strip()
    if user:
        return f"{user}@{host}"
    return host


def build_prune_command(
    ssh_target: str, until: str, prune_all: bool = False
) -> list[str]:
    """Build the ssh argv that runs ``docker builder prune`` on the host.

    The remote command is **strictly** ``docker builder prune -f`` (BuildKit
    GC). It NEVER contains ``docker image prune`` / ``docker system prune`` or
    any image/container removal (BR-3/AC-3).

    Args:
        ssh_target: ssh destination (``host`` or ``user@host``), passed to ssh
            as a single argv element.
        until: age filter value (e.g. ``"168h"``). Surrounding whitespace is
            stripped; when the result is non-empty, ``--filter until=<until>``
            is appended so the warm recent cache is kept (BR-2/AC-2). The value
            is ``shlex.quote``-d because ssh hands the command to the remote
            shell.
        prune_all: request ``-a``. It is honoured **only** together with a
            non-empty ``until`` (D2): with no age filter the flag is dropped, so
            this function can never produce a bare "remove everything"
            ``docker builder prune -f -a``.

    Returns:
        The argv list for ``subprocess.run``; its last element is the remote
        command string.
    """
    age_filter = (until or "").strip()
    remote = "docker builder prune -f"
    if age_filter:
        # ``-a`` is only ever emitted next to the age filter, never on its own.
        if prune_all:
            remote += " -a"
        remote += " --filter until=" + shlex.quote(age_filter)
    return ["ssh", "-o", "StrictHostKeyChecking=no", ssh_target, remote]


def parse_reclaimed(output: str) -> int | None:
    """Extract the reclaimed size, in bytes, from prune output (best-effort).

    Searches ``output`` with ``_RECLAIMED_RE`` and converts the matched
    ``<number><unit>`` pair via the ``_SIZE_UNITS`` multipliers; the unit is
    upper-cased before the lookup and the product is truncated to ``int``.

    Args:
        output: captured stdout of the prune command; a falsy value is treated
            as the empty string.

    Returns:
        The reclaimed size in bytes, or ``None`` when the summary line is
        absent, the unit is unknown or the number cannot be converted (FR-4:
        observability is best-effort, never a decision). Never raises — any
        exception is logged as a warning and turned into ``None``.
    """
    try:
        match = _RECLAIMED_RE.search(output or "")
        if match is None:
            return None
        number, unit = match.group(1), match.group(2)
        amount = float(number)
        factor = _SIZE_UNITS.get(unit.upper())
        return None if factor is None else int(amount * factor)
    except Exception as e:  # noqa: BLE001 - parsing is best-effort
        logger.warning("build-cache-pruner: cannot parse reclaimed space: %s", e)
        return None


class BuildCachePruner:
    """Background daemon running ``docker builder prune`` on the host on a period.

    Modelled on ``DiskWatchdog``: a ``threading.Thread(daemon=True)`` + a
    ``threading.Event`` for a clean stop. The only in-memory state is the
    best-effort ``last_run_ts`` / ``_last_reclaimed`` / ``_last_error`` — all reset
    on restart, which is safe (at worst one extra safe prune; D6).

    ``now_provider`` is injectable so the anti-frequency decision is testable
    deterministically without a real timer.
    """

    def __init__(self, interval_s: float | None = None, now_provider=None):
        self.interval_s = (
            interval_s
            if interval_s is not None
            else settings.build_cache_prune_interval_s
        )
        self._now = now_provider or time.time
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None
        # Best-effort in-memory state (no DB row, no migration).
        self.last_run_ts: float | None = None
        self._last_reclaimed: int | None = None
        self._last_reclaimed_human: str | None = None
        self._last_error: str | None = None

    # -- config helpers ----------------------------------------------------
    @property
    def _until(self) -> str:
        return settings.build_cache_prune_until

    @property
    def _all(self) -> bool:
        return settings.build_cache_prune_all

    @property
    def _timeout_s(self) -> int:
        return settings.build_cache_prune_timeout_s

    @property
    def _notify_min_gb(self) -> float:
        return settings.build_cache_prune_notify_min_gb

    # -- tick --------------------------------------------------------------
    def tick(self) -> None:
        """One pass: prune if the anti-frequency window has elapsed (never-raise).

        Runs the pure ``decide_prune`` against the injected clock; on a PRUNE
        decision it performs the host prune (``_prune``), which is itself
        never-raise. A SKIP decision leaves all state untouched.
        """
        now = self._now()
        if not decide_prune(self.last_run_ts, now, self.interval_s):
            return
        self._prune(now)

    def _prune(self, now: float) -> None:
        """Run ``docker builder prune`` on the host over ssh. Never raises (AC-4).

        Records the attempt time (``last_run_ts``) up front so the anti-frequency
        window advances even when the command fails or there is no ssh target.
        Every failure mode — no target, timeout, non-zero rc, ``OSError`` — is
        logged, stored in ``_last_error`` and swallowed; the loop stays alive.
        """
        self.last_run_ts = now
        target = _ssh_target()
        if not target:
            self._last_error = "no ssh host configured (deploy_ssh_host empty)"
            logger.info("build-cache-pruner: %s — tick is a no-op", self._last_error)
            return

        cmd = build_prune_command(target, self._until, self._all)
        try:
            r = subprocess.run(
                cmd, capture_output=True, text=True, timeout=self._timeout_s
            )
        except subprocess.TimeoutExpired:
            self._last_error = f"timeout after {self._timeout_s}s"
            logger.warning("build-cache-pruner: prune %s", self._last_error)
            return
        except (subprocess.SubprocessError, OSError) as e:
            self._last_error = f"ssh/subprocess error: {e}"
            logger.warning("build-cache-pruner: %s", self._last_error)
            return

        if r.returncode != 0:
            self._last_error = (
                f"rc={r.returncode}: {(r.stderr or '').strip()[:200]}"
            )
            logger.warning("build-cache-pruner: prune %s", self._last_error)
            return

        # Success: parse the best-effort reclaimed size and clear the error.
        self._last_error = None
        reclaimed = parse_reclaimed(r.stdout or "")
        self._last_reclaimed = reclaimed
        self._last_reclaimed_human = self._format_reclaimed(reclaimed)
        logger.info(
            "build-cache-pruner: pruned host build cache (until=%s, all=%s), "
            "reclaimed=%s",
            self._until, self._all, self._last_reclaimed_human or "unknown",
        )
        self._maybe_notify(reclaimed)

    @staticmethod
    def _format_reclaimed(reclaimed: int | None) -> str | None:
        """Human GB label for a reclaimed byte count (best-effort, never raises)."""
        if reclaimed is None:
            return None
        try:
            return f"{reclaimed / _BYTES_PER_GB:.2f} GB"
        except Exception:  # noqa: BLE001 - observability only
            return None

    def _maybe_notify(self, reclaimed: int | None) -> None:
        """Telegram when reclaimed >= ``notify_min_gb`` (>0 to enable). Never raises."""
        try:
            min_gb = self._notify_min_gb
            if not min_gb or min_gb <= 0 or reclaimed is None:
                return
            gb = reclaimed / _BYTES_PER_GB
            if gb < min_gb:
                return
            self._send(
                f"\U0001f9f9 build-cache-pruner: освобождено {gb:.2f} ГБ "
                f"docker build cache на хосте (until={self._until})."
            )
        except Exception as e:  # noqa: BLE001 - notify is best-effort
            logger.warning("build-cache-pruner: notify decision failed: %s", e)

    def _send(self, text: str) -> None:
        """Send a Telegram message (notifying). Never raises (best-effort)."""
        try:
            send_telegram(text)
        except Exception as e:  # noqa: BLE001 - delivery is best-effort
            logger.warning("build-cache-pruner: telegram send failed: %s", e)

    # -- loop / lifecycle --------------------------------------------------
    def _tick(self) -> None:
        try:
            self.tick()
        except Exception as e:  # noqa: BLE001 - inner never-raise
            logger.error("build-cache-pruner: tick error: %s", e)

    def _run(self) -> None:
        logger.info(
            "BuildCachePruner started (interval=%ss, until=%s, all=%s, "
            "timeout=%ss, enabled=%s)",
            self.interval_s, self._until, self._all, self._timeout_s,
            settings.build_cache_prune_enabled,
        )
        while not self._stop.is_set():
            try:
                self._tick()
            except Exception as e:  # noqa: BLE001 - outer never-raise
                logger.error("BuildCachePruner loop error: %s", e)
            self._stop.wait(self.interval_s)
        logger.info("BuildCachePruner stopped")

    def start(self) -> None:
        """Start the daemon thread (idempotent: a live thread is a no-op).

        Honours the kill-switch: ``build_cache_prune_enabled=False`` -> no-op (the
        daemon never starts; ``main.lifespan`` also guards, AC-5/TC-07).
        """
        if not settings.build_cache_prune_enabled:
            return
        if self._thread and self._thread.is_alive():
            return
        self._stop.clear()
        self._thread = threading.Thread(
            target=self._run, name="build-cache-pruner", daemon=True
        )
        self._thread.start()

    def stop(self, timeout: float = 5.0) -> None:
        self._stop.set()
        if self._thread:
            self._thread.join(timeout=timeout)

    def status(self) -> dict:
        """Build-cache-pruner snapshot for /queue observability (FR-4/AC-7).

        Never raises — returns a minimal ``{"enabled": ...}`` on any error.
        """
        try:
            return {
                "enabled": settings.build_cache_prune_enabled,
                "interval_s": self.interval_s,
                "until": self._until,
                "all": self._all,
                "last_run_ts": self.last_run_ts,
                "last_reclaimed_bytes": self._last_reclaimed,
                "last_reclaimed": self._last_reclaimed_human,
                "last_error": self._last_error,
            }
        except Exception as e:  # noqa: BLE001 - observability must never raise
            logger.warning("build-cache-pruner: status() failed: %s", e)
            return {"enabled": settings.build_cache_prune_enabled}


# Module-level singleton used by the FastAPI lifespan. It is constructed at
# import time with no arguments, so its interval comes from
# ``settings.build_cache_prune_interval_s``; no thread exists until ``start()``
# is called on it.
build_cache_pruner = BuildCachePruner()