diff --git a/.env.example b/.env.example index c08340f..9e4bf8d 100644 --- a/.env.example +++ b/.env.example @@ -286,6 +286,26 @@ ORCH_DISK_MONITOR_THRESHOLD_PCT=85 ORCH_DISK_MONITOR_REALERT_S=21600 ORCH_DISK_MONITOR_PATHS=/repos,/app/data +# ORCH-062: build-cache-pruner — the "second half" of the disk-watchdog +# (watchdog SIGNALS, pruner CLEANS). A daemon thread modelled on disk_watchdog +# that periodically runs STRICTLY `docker builder prune -f --filter until=` +# on the HOST over ssh (BuildKit GC). Touches ONLY the build cache: never +# images/containers of running services, never restarts the docker daemon or the +# prod container (self-hosting safety). State is in-memory (no DB migration). No +# ssh host configured -> the tick is a no-op. See docs/operations/INFRA.md. +# BUILD_CACHE_PRUNE_ENABLED -> kill-switch; false -> the daemon does not start (1:1 as before). +# BUILD_CACHE_PRUNE_INTERVAL_S -> tick period, seconds (order of hours; default ~6h). >0, else default. +# BUILD_CACHE_PRUNE_UNTIL -> retention age for the warm cache (`--filter until=`); ^\d+[smhdw]?$, else 24h. +# BUILD_CACHE_PRUNE_ALL -> add `-a` (ALWAYS paired with until); default false. +# BUILD_CACHE_PRUNE_TIMEOUT_S -> bound on the ssh command, seconds. >0, else default. +# BUILD_CACHE_PRUNE_NOTIFY_MIN_GB -> Telegram when reclaimed >= N GB; 0 -> silent. +ORCH_BUILD_CACHE_PRUNE_ENABLED=true +ORCH_BUILD_CACHE_PRUNE_INTERVAL_S=21600 +ORCH_BUILD_CACHE_PRUNE_UNTIL=24h +ORCH_BUILD_CACHE_PRUNE_ALL=false +ORCH_BUILD_CACHE_PRUNE_TIMEOUT_S=120 +ORCH_BUILD_CACHE_PRUNE_NOTIFY_MIN_GB=0 + # ORCH-022: security-gate (secret-scanning + dependency audit) on the # deploy-staging -> deploy edge, run FIRST among the edge sub-gates. Deterministic # (no LLM): gitleaks (offline secret-scan, pinned Go binary in the image) + pip-audit diff --git a/CHANGELOG.md b/CHANGELOG.md index b01e1a1..a0c77e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ Формат: [Keep a Changelog](https://keepachangelog.com/). 
Записи — на смысловой PR/задачу. ## [Unreleased] +- **Build-cache-pruner: авто-prune docker build cache на mva154** (ORCH-062, `feat`): новый фоновый daemon-поток `src/build_cache_pruner.py` (каркас `disk_watchdog`) — «вторая половина» disk-watchdog (ORCH-063): **watchdog сигналит — pruner убирает**. Устраняет корень инцидента 07.06.2026 (docker build cache ≈11 ГБ → диск mva154 100% → падение self-hosting-конвейера всех проектов) **автоматически, без оператора**. **Аддитивно, never-raise:** `STAGE_TRANSITIONS`/`QG_CHECKS`/`check_*`/`_parse_*`/`src/stage_engine.py`/схема БД — **не тронуты**, новой миграции нет (состояние last-run/last-result — in-memory, best-effort). + - **Периодическая уборка (FR-1/AC-1):** каждые `build_cache_prune_interval_s` (дефолт **21600с = 6ч**) тик выполняет **строго `docker builder prune -f --filter until=`** (BuildKit GC). Анти-частота — pure-функция `decide_prune(prev_run_ts, now, interval_s)` (юнит-тестируема без потока/таймера, время инъецируется). Дефолт `until=24h` удерживает тёплый недавний кэш (BR-2/AC-2); `-a/--all` (`build_cache_prune_all`, дефолт `False`) — **только в паре** с возрастным фильтром. + - **Self-hosting безопасность (FR-3/AC-3):** команда затрагивает **только** build cache — **нет** `docker image prune`/`docker system prune`, удаления образов/контейнеров запущенных сервисов, остановки/рестарта контейнеров; прод-контейнер `orchestrator` **никогда** не рестартится. Уборка исполняется **на хосте через ssh** (`deploy_ssh_user@deploy_ssh_host`, тот же канал, что `image_freshness`/`self_deploy` — в образе нет docker CLI). Нет ssh-таргета → тик no-op (наблюдаемо в `status().last_error`). + - **never-raise (FR-6/AC-4):** per-команда (ненулевой rc / `TimeoutExpired` / `OSError`/`FileNotFoundError` / недоступность ssh / parsing-ошибка → лог + проглот, тик жив) и per-tick (внешний `try/except` в `_run`, как `disk_watchdog`). Фоновый цикл и конвейер не падают. 
+ - **Конфигурируемость + kill-switch (FR-5/AC-5/AC-6):** флаги `build_cache_prune_enabled`/`_interval_s`/`_until`/`_all`/`_timeout_s`/`_notify_min_gb` (`src/config.py`, env `ORCH_BUILD_CACHE_PRUNE_*`) с defensive-валидацией (интервал/таймаут >0, `until` ~ `^\d+[smhdw]?$`, notify_min_gb ≥0 → невалидное к безопасному дефолту + warning, старт не падает). `build_cache_prune_enabled=false` → демон не стартует (старт/стоп в `main.lifespan` рядом с `disk_watchdog`, гард), `GET /queue` → `{"enabled": false}` — поведение 1:1 как до задачи. + - **Наблюдаемость (FR-4/AC-7):** аддитивный read-only блок `build_cache_prune` в `GET /queue` (`enabled`/`interval_s`/`until`/`all`/`last_run_ts`/`last_reclaimed`[+`_bytes`]/`last_error`); `status()` never-raise. Опц. Telegram при освобождении ≥ `notify_min_gb` ГБ (дефолт `0` = тихо). Тесты: `tests/test_build_cache_pruner.py` (TC-01..TC-12, 23 кейса, docker замокан — ни один тест не трогает реальный docker); полный регресс `tests/` зелёный (1319). Документация: `docs/operations/INFRA.md` (секция авто-prune + env-карта; снята формулировка ORCH-063 «освобождение build cache — ручная операция»), `docs/architecture/README.md`, `.env.example`. ADR: `docs/work-items/ORCH-062/06-adr/ADR-001-build-cache-pruner.md`, сквозной `docs/architecture/adr/adr-0025-build-cache-pruner.md`. Откат: `ORCH_BUILD_CACHE_PRUNE_ENABLED=false` (миграций нет). - **Disk-watchdog: мониторинг заполнения диска mva154 + Telegram-алерт при ≥85%** (ORCH-063, `feat`): новый фоновый daemon-поток `src/disk_watchdog.py` (каркас `reconciler`/`job_reaper`) — недостающий **проактивный** сигнал о заполнении хост-диска (07.06.2026 диск mva154 тихо дорос до 100% и положил весь self-hosting-конвейер всех проектов). **Аддитивно, never-raise:** `STAGE_TRANSITIONS`/`QG_CHECKS`/`check_*`/схема БД — **не тронуты**, новой миграции нет (состояние анти-спама — in-memory). 
- **Замер хост-ФС (FR-2/AC-8):** каждые `disk_monitor_interval_s` (дефолт 300с) меряет заполнение **смонтированных хост-bind-путей** (`/repos`, `/app/data`) через stdlib `shutil.disk_usage` — НЕ overlay `/` контейнера, НЕ субпроцесс `df`; дедуп путей по физическому устройству (`st_dev`) → один алерт на раздел. Недоступный путь → пропуск с warning, остальные пути меряются (per-path never-raise). - **Решение об алерте (FR-3/FR-4/AC-2..AC-4):** pure-функция `decide_action(used_pct, threshold, prev_state, now, realert_s)` (юнит-тестируема без потока/таймера, время инъецируется): алерт на пересечении порога (дефолт **85%**, граница `>=` включительно), cooldown-повтор `disk_monitor_realert_s` (~6ч, анти-спам — не на каждом тике), однократный recovery при возврате ниже порога. Алерт — `send_telegram` (notifying, не silent), best-effort. diff --git a/src/build_cache_pruner.py b/src/build_cache_pruner.py new file mode 100644 index 0000000..261e19c --- /dev/null +++ b/src/build_cache_pruner.py @@ -0,0 +1,351 @@ +"""ORCH-062: build-cache-pruner — periodic ``docker builder prune`` on the host. + +On 07.06.2026 the mva154 host disk silently grew to 100% and took down the WHOLE +self-hosting pipeline of every project. The dominant consumer was the **docker +build cache** (~11 GB accumulated by frequent rebuilds: ``docker compose up +--build`` on prod deploy, the ``--profile staging`` rebuild, the build-once retag +behind ``check_staging_image_fresh``). ORCH-063 added the disk-watchdog, which +only **signals** (Telegram alert at >=85%) and explicitly deferred the cleanup to +this task. **This module is that cleanup: the watchdog signals — the pruner +cleans.** + +It is a background daemon thread modelled **1:1 on** ``src/disk_watchdog.py`` +(``threading.Thread(daemon=True)`` + ``threading.Event`` for a clean stop, the +``start()`` / ``stop(timeout)`` / ``status()`` contract, a ``/queue`` snapshot, +per-tick never-raise and a kill-switch ``ORCH_BUILD_CACHE_PRUNE_ENABLED``). 
Each +tick runs **strictly** ``docker builder prune -f --filter until=`` (BuildKit +GC) on the **host over ssh** — the prod container ships no docker CLI, only +``openssh-client`` (``Dockerfile:11``), so docker operations run over ssh on the +host, the same channel ``image_freshness``/``self_deploy`` already use. + +Invariants (TRZ §5/§6 / ADR-001 D2/D6): + * The command touches **only** the BuildKit build cache. There is NO + ``docker image prune``, NO ``docker system prune``, no image/container removal + of running services and no container stop/restart. The prod ``orchestrator`` + container is NEVER restarted (self-hosting blast radius). ``-a/--all`` is only + ever added **paired with** the ``until`` age filter — never a bare + "nuke everything". + * ``STAGE_TRANSITIONS`` / ``QG_CHECKS`` / ``check_*`` / ``_parse_*`` / + ``src/stage_engine.py`` / the DB schema are UNCHANGED — the pruner is an + operational daemon, not a Quality Gate (like ``reconciler`` / ``job_reaper`` / + ``disk_watchdog``). No new migration (last-run / last-result is in-memory, + best-effort, may reset on restart — safe: at worst one extra safe prune). + * never-raise on two levels: per-command (non-zero rc / timeout / ``OSError`` / + no ssh target / output-parse error -> logged and swallowed, the tick lives) + and per-tick (outer ``try/except`` in ``_run``, like ``disk_watchdog._run``). + The background loop and the pipeline never fall over. + * No ssh target configured (``deploy_ssh_host`` empty) -> the tick is a no-op + (logged, reflected in ``status().last_error``). This scopes the feature to the + self-hosting prod (where ssh is configured) and makes the default safe in any + environment without host access — parallel to how ``self_deploy`` / + ``image_freshness`` degrade without a target. 
+ * Kill-switch ``build_cache_prune_enabled=False`` -> the daemon does not start + (``main.lifespan`` guard + ``start()`` guard) and ``/queue`` returns + ``{"enabled": false, ...}`` — behaviour 1:1 as before the task. + +See docs/work-items/ORCH-062/06-adr/ADR-001-build-cache-pruner.md and the +cross-cutting docs/architecture/adr/adr-0025-build-cache-pruner.md. +""" + +import logging +import re +import shlex +import subprocess +import threading +import time + +from .config import settings +from .notifications import send_telegram + +logger = logging.getLogger("orchestrator.build_cache_pruner") + +_BYTES_PER_GB = 1024 ** 3 + +# Multipliers for the "Total reclaimed space: " line emitted by +# `docker builder prune`. Decimal units are base-1000 (docker's HumanSize), +# the *i* binary units base-1024. Best-effort — only used for observability / +# the optional notify threshold, never for a decision. +_SIZE_UNITS = { + "B": 1, + "KB": 1000, "MB": 1000 ** 2, "GB": 1000 ** 3, "TB": 1000 ** 4, + "KIB": 1024, "MIB": 1024 ** 2, "GIB": 1024 ** 3, "TIB": 1024 ** 4, +} +_RECLAIMED_RE = re.compile( + r"Total reclaimed space:\s*([\d.]+)\s*([KMGT]?i?B)", re.IGNORECASE +) + + +def decide_prune(prev_run_ts: float | None, now: float, interval_s: float) -> bool: + """Pure decision (anti-frequency, NFR-4): should this tick prune? + + Returns ``True`` when no prune has run yet (``prev_run_ts is None``) or at + least ``interval_s`` seconds have elapsed since the last attempt; ``False`` + otherwise. Testable without a thread or a real timer (TC-01/TC-02). A + non-positive / unusable ``interval_s`` falls open to ``True`` (prune) — the + config validator already guards the value, this is belt-and-braces. 
def _until_as_go_duration(until: str) -> str:
    """Rewrite a ``<N>d`` / ``<N>w`` age as whole hours; pass anything else through.

    The config validator accepts day/week suffixes, but Go duration syntax
    (``time.ParseDuration``) only knows ``ns``/``us``/``ms``/``s``/``m``/``h``.
    Converting to hours keeps the configured meaning in a form a Go duration
    parser accepts.
    NOTE(review): assumes the host's docker parses ``until`` as a Go duration —
    confirm against the docker version deployed on the host.
    """
    hours_per_unit = {"d": 24, "w": 24 * 7}
    if until and until[-1] in hours_per_unit and until[:-1].isdecimal():
        return f"{int(until[:-1]) * hours_per_unit[until[-1]]}h"
    return until


def build_prune_command(
    ssh_target: str, until: str, prune_all: bool = False
) -> list[str]:
    """Build the ssh argv that runs ``docker builder prune`` on the host.

    Args:
        ssh_target: ``user@host`` (or bare ``host``) passed to ``ssh``.
        until: retention age for the ``--filter until=<age>`` filter. A ``d`` or
            ``w`` suffix is converted to hours (``7d`` -> ``168h``); an empty
            value means "no age filter".
        prune_all: request ``-a``. It is honoured **only** when an age filter is
            also emitted, so the command can never degrade to a bare
            ``docker builder prune -f -a`` that drops the whole cache.

    Returns:
        ``["ssh", "-o", "StrictHostKeyChecking=no", ssh_target, remote]`` where
        ``remote`` is always a ``docker builder prune -f ...`` invocation — no
        other docker subcommand is ever produced. The age value is
        ``shlex.quote``-d because ``remote`` is interpreted by the remote shell.
    """
    age = _until_as_go_duration(until)
    remote = "docker builder prune -f"
    # Enforce the "-a only together with the age filter" invariant here instead
    # of relying on the config validator always supplying a non-empty `until`.
    if prune_all and age:
        remote += " -a"
    if age:
        remote += " --filter until=" + shlex.quote(age)
    return ["ssh", "-o", "StrictHostKeyChecking=no", ssh_target, remote]
class BuildCachePruner:
    """Daemon thread that prunes the host's docker build cache on a period.

    Lifecycle is ``start()`` / ``stop(timeout)``; one ``threading.Event`` both
    signals shutdown and paces the loop. The only state is in memory
    (``last_run_ts``, ``_last_reclaimed``, ``_last_reclaimed_human``,
    ``_last_error``), so it starts empty after every process restart.

    ``now_provider`` injects the clock used by the anti-frequency check and
    defaults to ``time.time``.
    """

    def __init__(self, interval_s: float | None = None, now_provider=None):
        if interval_s is None:
            interval_s = settings.build_cache_prune_interval_s
        self.interval_s = interval_s
        self._now = now_provider if now_provider else time.time
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None
        # In-memory bookkeeping only; nothing here is persisted.
        self.last_run_ts: float | None = None
        self._last_reclaimed: int | None = None
        self._last_reclaimed_human: str | None = None
        self._last_error: str | None = None

    # -- live views onto the settings object (re-read on every access) ------
    @property
    def _until(self) -> str:
        return settings.build_cache_prune_until

    @property
    def _all(self) -> bool:
        return settings.build_cache_prune_all

    @property
    def _timeout_s(self) -> int:
        return settings.build_cache_prune_timeout_s

    @property
    def _notify_min_gb(self) -> float:
        return settings.build_cache_prune_notify_min_gb

    # -- one pass -------------------------------------------------------------
    def tick(self) -> None:
        """Prune once if ``decide_prune`` says the interval has elapsed.

        When the decision is "skip", no attribute is modified.
        """
        moment = self._now()
        if decide_prune(self.last_run_ts, moment, self.interval_s):
            self._prune(moment)

    def _warn_failure(self, template: str, error: str) -> None:
        """Remember ``error`` as the last error and log it at WARNING level."""
        self._last_error = error
        logger.warning(template, error)

    def _prune(self, now: float) -> None:
        """Run the prune command over ssh and record the outcome.

        ``last_run_ts`` is stamped before anything else, so a failed or no-op
        attempt still consumes the anti-frequency window. A missing ssh target,
        a timeout, a subprocess/OS error and a non-zero exit code are each
        stored in ``_last_error``, logged, and end the attempt without raising.
        """
        self.last_run_ts = now

        host = _ssh_target()
        if not host:
            self._last_error = "no ssh host configured (deploy_ssh_host empty)"
            logger.info("build-cache-pruner: %s — tick is a no-op", self._last_error)
            return

        argv = build_prune_command(host, self._until, self._all)
        try:
            done = subprocess.run(
                argv, capture_output=True, text=True, timeout=self._timeout_s
            )
        except subprocess.TimeoutExpired:
            self._warn_failure(
                "build-cache-pruner: prune %s", f"timeout after {self._timeout_s}s"
            )
            return
        except (subprocess.SubprocessError, OSError) as exc:
            self._warn_failure(
                "build-cache-pruner: %s", f"ssh/subprocess error: {exc}"
            )
            return

        if done.returncode != 0:
            stderr_head = (done.stderr or "").strip()[:200]
            self._warn_failure(
                "build-cache-pruner: prune %s",
                f"rc={done.returncode}: {stderr_head}",
            )
            return

        # Exit code 0: clear the error, then record the parsed size (may be None).
        self._last_error = None
        freed = parse_reclaimed(done.stdout or "")
        self._last_reclaimed = freed
        self._last_reclaimed_human = self._format_reclaimed(freed)
        logger.info(
            "build-cache-pruner: pruned host build cache (until=%s, all=%s), "
            "reclaimed=%s",
            self._until, self._all, self._last_reclaimed_human or "unknown",
        )
        self._maybe_notify(freed)

    @staticmethod
    def _format_reclaimed(reclaimed: int | None) -> str | None:
        """Render a byte count as ``"<x.xx> GB"``; ``None`` in, or on error, ``None`` out."""
        try:
            if reclaimed is None:
                return None
            return f"{reclaimed / _BYTES_PER_GB:.2f} GB"
        except Exception:  # noqa: BLE001 - observability only
            return None

    def _maybe_notify(self, reclaimed: int | None) -> None:
        """Send a Telegram message when the reclaimed size reaches the threshold.

        A falsy or non-positive ``_notify_min_gb`` or an unknown size sends
        nothing. Any exception is logged and swallowed.
        """
        try:
            threshold_gb = self._notify_min_gb
            if not threshold_gb or threshold_gb <= 0:
                return
            if reclaimed is None:
                return
            freed_gb = reclaimed / _BYTES_PER_GB
            if freed_gb < threshold_gb:
                return
            self._send(
                f"\U0001f9f9 build-cache-pruner: освобождено {freed_gb:.2f} ГБ "
                f"docker build cache на хосте (until={self._until})."
            )
        except Exception as exc:  # noqa: BLE001 - notify is best-effort
            logger.warning("build-cache-pruner: notify decision failed: %s", exc)

    def _send(self, text: str) -> None:
        """Hand ``text`` to ``send_telegram``; a delivery failure is only logged."""
        try:
            send_telegram(text)
        except Exception as exc:  # noqa: BLE001 - delivery is best-effort
            logger.warning("build-cache-pruner: telegram send failed: %s", exc)

    # -- loop / lifecycle -------------------------------------------------------
    def _tick(self) -> None:
        """``tick()`` wrapped so that any exception is logged instead of raised."""
        try:
            self.tick()
        except Exception as exc:  # noqa: BLE001 - inner never-raise
            logger.error("build-cache-pruner: tick error: %s", exc)

    def _run(self) -> None:
        """Thread body: tick, then wait ``interval_s`` or until stop is signalled."""
        logger.info(
            "BuildCachePruner started (interval=%ss, until=%s, all=%s, "
            "timeout=%ss, enabled=%s)",
            self.interval_s, self._until, self._all, self._timeout_s,
            settings.build_cache_prune_enabled,
        )
        while True:
            if self._stop.is_set():
                break
            try:
                self._tick()
            except Exception as exc:  # noqa: BLE001 - outer never-raise
                logger.error("BuildCachePruner loop error: %s", exc)
            # Event.wait doubles as an interruptible sleep between passes.
            self._stop.wait(self.interval_s)
        logger.info("BuildCachePruner stopped")

    def start(self) -> None:
        """Spawn the daemon thread.

        Does nothing when ``settings.build_cache_prune_enabled`` is false or
        when a previously started thread is still alive.
        """
        if not settings.build_cache_prune_enabled or (
            self._thread and self._thread.is_alive()
        ):
            return
        self._stop.clear()
        worker = threading.Thread(
            target=self._run, name="build-cache-pruner", daemon=True
        )
        self._thread = worker
        worker.start()

    def stop(self, timeout: float = 5.0) -> None:
        """Signal the loop to exit and join the thread for up to ``timeout`` seconds."""
        self._stop.set()
        worker = self._thread
        if worker:
            worker.join(timeout=timeout)

    def status(self) -> dict:
        """Return the snapshot dict for the ``/queue`` endpoint.

        If assembling the full snapshot fails, the failure is logged and only
        ``{"enabled": ...}`` is returned.
        """
        try:
            snapshot = {
                "enabled": settings.build_cache_prune_enabled,
                "interval_s": self.interval_s,
                "until": self._until,
                "all": self._all,
                "last_run_ts": self.last_run_ts,
                "last_reclaimed_bytes": self._last_reclaimed,
                "last_reclaimed": self._last_reclaimed_human,
                "last_error": self._last_error,
            }
        except Exception as exc:  # noqa: BLE001 - observability must never raise
            logger.warning("build-cache-pruner: status() failed: %s", exc)
            return {"enabled": settings.build_cache_prune_enabled}
        return snapshot
Touches ONLY the BuildKit build cache: never + # images/containers of running services, never restarts the docker daemon or + # the prod container (self-hosting safety). State (last run / result) is + # in-memory, best-effort — no DB migration. ADR-001 D1..D7. + # build_cache_prune_enabled -> kill-switch; False -> daemon does not + # start (1:1 as before), env *_ENABLED. + # build_cache_prune_interval_s -> tick period, seconds (order of hours). + # build_cache_prune_until -> retention age for warm cache + # (`docker builder prune --filter until=`). + # build_cache_prune_all -> add `-a` (ALWAYS paired with until). + # build_cache_prune_timeout_s -> bound on the ssh command, seconds. + # build_cache_prune_notify_min_gb -> Telegram when reclaimed >= N GB; 0 -> silent. + # Defensive validation (ADR-001 D4): a non-positive / non-numeric interval or + # timeout -> default + warning; an `until` not matching ^\d+[smhdw]?$ -> "24h"; + # a negative notify threshold -> 0. A bad env value NEVER crashes the start. + build_cache_prune_enabled: bool = True + build_cache_prune_interval_s: int = 21600 + build_cache_prune_until: str = "24h" + build_cache_prune_all: bool = False + build_cache_prune_timeout_s: int = 120 + build_cache_prune_notify_min_gb: float = 0.0 + + @field_validator( + "build_cache_prune_interval_s", "build_cache_prune_timeout_s", mode="before" + ) + @classmethod + def _bcp_positive_int(cls, v, info): + # Non-positive / non-numeric -> the field default (never crash the start). 
+ _defaults = { + "build_cache_prune_interval_s": 21600, + "build_cache_prune_timeout_s": 120, + } + fallback = _defaults.get(info.field_name, 1) + try: + if v is None or (isinstance(v, str) and v.strip() == ""): + return fallback + iv = int(v) + if iv <= 0: + logging.getLogger("orchestrator.config").warning( + "%s must be > 0, got %s; falling back to %s", + info.field_name, v, fallback, + ) + return fallback + return iv + except (TypeError, ValueError): + return fallback + + @field_validator("build_cache_prune_until", mode="before") + @classmethod + def _bcp_until(cls, v): + # A docker `until` filter: digits + optional unit (s/m/h/d/w). Anything + # else -> the safe default "24h" (keeps warm cache, BR-2). + try: + if v is None: + return "24h" + s = str(v).strip() + if s and re.match(r"^\d+[smhdw]?$", s): + return s + logging.getLogger("orchestrator.config").warning( + "build_cache_prune_until must match ^\\d+[smhdw]?$, got %r; using 24h", v + ) + return "24h" + except (TypeError, ValueError): + return "24h" + + @field_validator("build_cache_prune_notify_min_gb", mode="before") + @classmethod + def _bcp_notify_min_gb(cls, v): + # A non-negative GB threshold; negative / non-numeric -> 0 (silent). + try: + if v is None or (isinstance(v, str) and v.strip() == ""): + return 0.0 + fv = float(v) + return fv if fv >= 0 else 0.0 + except (TypeError, ValueError): + return 0.0 + # ORCH-071: merge-verify under-gate on the `deploy -> done` edge. For the # self-hosting repo the `deploy` stage runs the DETERMINISTIC self-deploy path # (Phase A/B/C), where the LLM `deployer` agent — historically the ONLY actor diff --git a/src/main.py b/src/main.py index 38811c8..48b484f 100644 --- a/src/main.py +++ b/src/main.py @@ -113,10 +113,20 @@ async def lifespan(app: FastAPI): from .disk_watchdog import disk_watchdog disk_watchdog.start() + # ORCH-062: start the build-cache-pruner LAST, right after the disk-watchdog + # (D7). 
It is the "second half" of the watchdog (watchdog signals, pruner + # cleans): a daemon thread that periodically runs `docker builder prune` on + # the host over ssh. Honours the kill-switch ORCH_BUILD_CACHE_PRUNE_ENABLED + # (start() is a no-op when disabled, so behaviour is 1:1 as before). + from .build_cache_pruner import build_cache_pruner + build_cache_pruner.start() + try: yield finally: - # ORCH-063: stop the disk-watchdog first (reverse of startup). + # ORCH-062: stop the build-cache-pruner first (reverse of startup, D7). + build_cache_pruner.stop() + # ORCH-063: stop the disk-watchdog next (reverse of startup). disk_watchdog.stop() # Graceful shutdown order mirrors startup in reverse: stop the reaper # first, then the reconciler (it must not enqueue new work while the @@ -162,6 +172,7 @@ async def queue(): from . import serial_gate from . import labels from .disk_watchdog import disk_watchdog + from .build_cache_pruner import build_cache_pruner return { "counts": job_status_counts(), "max_concurrency": worker.max_concurrency, @@ -184,6 +195,11 @@ async def queue(): # enabled, threshold, interval, last measurement per host-path. Additive # block; never-raise (status() returns {"enabled": ...} minimum on error). "disk_monitor": disk_watchdog.status(), + # ORCH-062 (FR-4 / AC-7): build-cache-pruner observability (read-only) — + # enabled, interval, retention (until), last run + best-effort reclaimed / + # last error. Additive block; never-raise (status() returns {"enabled": + # ...} minimum on error). + "build_cache_prune": build_cache_pruner.status(), "recent": recent_jobs(10), } diff --git a/tests/test_build_cache_pruner.py b/tests/test_build_cache_pruner.py new file mode 100644 index 0000000..92ce05b --- /dev/null +++ b/tests/test_build_cache_pruner.py @@ -0,0 +1,378 @@ +"""ORCH-062: build-cache-pruner tests (TC-01..TC-12). 
+ +The pruner never runs a real ``docker builder prune``: ``subprocess.run`` is +monkeypatched, ``send_telegram`` is captured, and the anti-frequency clock is +injected through ``now_provider`` so time-dependent decisions are tested without a +real timer (same convention as ``test_disk_watchdog.py``). No test touches the +real docker daemon or frees real disk. +""" +import os +import tempfile + +import pytest + +# Override env before importing app modules (same convention as test_disk_watchdog.py). +os.environ.setdefault("ORCH_DB_PATH", os.path.join(tempfile.gettempdir(), "test_orch_bcp.db")) +os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token") +os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token") + +import src.build_cache_pruner as bcp # noqa: E402 +from src.build_cache_pruner import ( # noqa: E402 + BuildCachePruner, + build_prune_command, + decide_prune, + parse_reclaimed, +) + + +# --------------------------------------------------------------------------- # +# Helpers +# --------------------------------------------------------------------------- # +class _Completed: + """Minimal stand-in for ``subprocess.CompletedProcess``.""" + + def __init__(self, returncode=0, stdout="", stderr=""): + self.returncode = returncode + self.stdout = stdout + self.stderr = stderr + + +@pytest.fixture +def ssh_configured(monkeypatch): + """Configure an ssh target so ``_ssh_target()`` is not None.""" + monkeypatch.setattr(bcp.settings, "deploy_ssh_host", "mva154", raising=False) + monkeypatch.setattr(bcp.settings, "deploy_ssh_user", "slin", raising=False) + + +@pytest.fixture +def prune_defaults(monkeypatch): + """Default prune policy (until=24h, all=False, timeout=120, silent).""" + monkeypatch.setattr(bcp.settings, "build_cache_prune_enabled", True, raising=False) + monkeypatch.setattr(bcp.settings, "build_cache_prune_until", "24h", raising=False) + monkeypatch.setattr(bcp.settings, "build_cache_prune_all", False, raising=False) + monkeypatch.setattr(bcp.settings, 
"build_cache_prune_timeout_s", 120, raising=False) + monkeypatch.setattr(bcp.settings, "build_cache_prune_notify_min_gb", 0.0, raising=False) + + +# --------------------------------------------------------------------------- # +# TC-01 / TC-02: pure anti-frequency decision +# --------------------------------------------------------------------------- # +def test_tc01_decide_prune_when_interval_elapsed(): + """TC-01: never pruned yet -> PRUNE; interval elapsed since last -> PRUNE.""" + assert decide_prune(None, now=1000.0, interval_s=21600) is True + assert decide_prune(1000.0, now=1000.0 + 21600, interval_s=21600) is True + assert decide_prune(1000.0, now=1000.0 + 30000, interval_s=21600) is True + + +def test_tc02_decide_skip_within_interval(): + """TC-02: interval not yet elapsed -> SKIP (anti-frequency, NFR-4).""" + assert decide_prune(1000.0, now=1000.0 + 10, interval_s=21600) is False + assert decide_prune(1000.0, now=1000.0 + 21599, interval_s=21600) is False + + +# --------------------------------------------------------------------------- # +# TC-03: safe command construction (retention filter, no image/system prune) +# --------------------------------------------------------------------------- # +def test_tc03_command_carries_until_and_is_builder_only(): + """TC-03: command is `docker builder prune` with until=, never + image/system prune (FR-2/FR-3/AC-2/AC-3).""" + cmd = build_prune_command("slin@mva154", "24h", prune_all=False) + assert cmd[0] == "ssh" + assert "slin@mva154" in cmd + remote = cmd[-1] + assert "docker builder prune" in remote + assert "--filter until=24h" in remote + # Strictly build cache — never images/system/containers. 
+ assert "image prune" not in remote + assert "system prune" not in remote + assert "-a" not in remote.split() # all-flag not set by default + + +def test_tc03_all_flag_only_paired_with_until(): + """TC-03: -a is added ONLY together with the age filter (D2/AC-2).""" + cmd = build_prune_command("slin@mva154", "24h", prune_all=True) + remote = cmd[-1] + assert "docker builder prune" in remote + assert "-a" in remote.split() + assert "--filter until=24h" in remote # never a bare nuke + + +# --------------------------------------------------------------------------- # +# TC-04: never-raise on subprocess exception / non-zero rc +# --------------------------------------------------------------------------- # +def test_tc04_subprocess_exception_does_not_raise(monkeypatch, ssh_configured, prune_defaults): + """TC-04: a raising subprocess is swallowed; the tick survives, error logged.""" + def _boom(*a, **k): + raise OSError("ssh exploded") + + monkeypatch.setattr(bcp.subprocess, "run", _boom) + pruner = BuildCachePruner(now_provider=lambda: 1000.0) + pruner.tick() # must not raise + assert pruner._last_error is not None + assert pruner.status()["last_error"] is not None + + +def test_tc04_nonzero_rc_recorded(monkeypatch, ssh_configured, prune_defaults): + """TC-04: a non-zero rc is recorded as an error, never raised.""" + monkeypatch.setattr( + bcp.subprocess, "run", + lambda *a, **k: _Completed(returncode=1, stderr="permission denied"), + ) + pruner = BuildCachePruner(now_provider=lambda: 1000.0) + pruner.tick() + assert "rc=1" in pruner._last_error + + +# --------------------------------------------------------------------------- # +# TC-05: never-raise on docker.sock / ssh unavailability +# --------------------------------------------------------------------------- # +def test_tc05_socket_unavailable_skips_tick(monkeypatch, ssh_configured, prune_defaults): + """TC-05: FileNotFoundError / PermissionError -> tick skipped, loop alive.""" + def _enoent(*a, **k): + raise 
FileNotFoundError("docker.sock missing") + + monkeypatch.setattr(bcp.subprocess, "run", _enoent) + pruner = BuildCachePruner(now_provider=lambda: 1000.0) + pruner.tick() # must not raise + assert pruner._last_error is not None + + +def test_tc05_no_ssh_target_is_noop(monkeypatch, prune_defaults): + """TC-05: no ssh host configured -> tick is a no-op (no subprocess call).""" + monkeypatch.setattr(bcp.settings, "deploy_ssh_host", "", raising=False) + called = {"n": 0} + monkeypatch.setattr(bcp.subprocess, "run", lambda *a, **k: called.__setitem__("n", called["n"] + 1)) + pruner = BuildCachePruner(now_provider=lambda: 1000.0) + pruner.tick() + assert called["n"] == 0 + assert "no ssh host" in pruner._last_error + + +# --------------------------------------------------------------------------- # +# TC-06: never-raise on timeout +# --------------------------------------------------------------------------- # +def test_tc06_timeout_swallowed(monkeypatch, ssh_configured, prune_defaults): + """TC-06: TimeoutExpired is swallowed; the background loop continues (FR-6/AC-4).""" + def _timeout(*a, **k): + raise bcp.subprocess.TimeoutExpired(cmd="ssh ... 
docker builder prune", timeout=120) + + monkeypatch.setattr(bcp.subprocess, "run", _timeout) + pruner = BuildCachePruner(now_provider=lambda: 1000.0) + pruner.tick() # must not raise + assert "timeout" in pruner._last_error + + +# --------------------------------------------------------------------------- # +# TC-07: kill-switch +# --------------------------------------------------------------------------- # +def test_tc07_killswitch_does_not_start(monkeypatch): + """TC-07: build_cache_prune_enabled=False -> start() is a no-op (no thread).""" + monkeypatch.setattr(bcp.settings, "build_cache_prune_enabled", False, raising=False) + pruner = BuildCachePruner() + pruner.start() + assert pruner._thread is None + + +def test_tc07_killswitch_status_block(monkeypatch): + """TC-07: status() reports enabled=False under the kill-switch.""" + monkeypatch.setattr(bcp.settings, "build_cache_prune_enabled", False, raising=False) + pruner = BuildCachePruner() + assert pruner.status()["enabled"] is False + + +# --------------------------------------------------------------------------- # +# TC-08: config validation -> safe defaults +# --------------------------------------------------------------------------- # +def test_tc08_invalid_interval_falls_back_to_default(): + """TC-08: a non-positive / non-numeric interval -> the safe default (no crash).""" + from src.config import Settings + s = Settings(build_cache_prune_interval_s=0, build_cache_prune_timeout_s=-5) + assert s.build_cache_prune_interval_s == 21600 + assert s.build_cache_prune_timeout_s == 120 + s2 = Settings(build_cache_prune_interval_s="not-a-number") + assert s2.build_cache_prune_interval_s == 21600 + + +def test_tc08_invalid_until_falls_back_to_24h(): + """TC-08: an `until` not matching ^\\d+[smhdw]?$ -> the safe default 24h.""" + from src.config import Settings + assert Settings(build_cache_prune_until="garbage").build_cache_prune_until == "24h" + assert Settings(build_cache_prune_until="").build_cache_prune_until 
== "24h" + # Valid values are preserved. + assert Settings(build_cache_prune_until="48h").build_cache_prune_until == "48h" + assert Settings(build_cache_prune_until="30m").build_cache_prune_until == "30m" + assert Settings(build_cache_prune_until="7d").build_cache_prune_until == "7d" + + +def test_tc08_negative_notify_min_gb_falls_back_to_zero(): + """TC-08: a negative notify threshold -> 0 (silent), never a crash.""" + from src.config import Settings + assert Settings(build_cache_prune_notify_min_gb=-3).build_cache_prune_notify_min_gb == 0.0 + assert Settings(build_cache_prune_notify_min_gb=2.5).build_cache_prune_notify_min_gb == 2.5 + + +# --------------------------------------------------------------------------- # +# TC-09: status() never-raise + best-effort last result +# --------------------------------------------------------------------------- # +def test_tc09_status_shape(monkeypatch, prune_defaults): + """TC-09: status() carries enabled/interval_s/until/last_run_ts + reclaimed.""" + monkeypatch.setattr(bcp.settings, "build_cache_prune_enabled", True, raising=False) + pruner = BuildCachePruner() + st = pruner.status() + for key in ( + "enabled", "interval_s", "until", "all", "last_run_ts", + "last_reclaimed", "last_reclaimed_bytes", "last_error", + ): + assert key in st + assert st["last_run_ts"] is None # no tick yet + + +def test_tc09_status_reflects_last_prune(monkeypatch, ssh_configured, prune_defaults): + """TC-09: after a successful tick status() carries last_run_ts + reclaimed.""" + monkeypatch.setattr( + bcp.subprocess, "run", + lambda *a, **k: _Completed(returncode=0, stdout="Total reclaimed space: 11.05GB"), + ) + pruner = BuildCachePruner(now_provider=lambda: 1234.0) + pruner.tick() + st = pruner.status() + assert st["last_run_ts"] == 1234.0 + assert st["last_error"] is None + assert st["last_reclaimed_bytes"] == int(11.05 * (1000 ** 3)) + assert "GB" in st["last_reclaimed"] + + +def test_parse_reclaimed_variants(): + """parse_reclaimed: 
decimal/binary units + absent line (best-effort, never raises).""" + assert parse_reclaimed("Total reclaimed space: 0B") == 0 + assert parse_reclaimed("Total reclaimed space: 500MB") == 500 * 1000 ** 2 + assert parse_reclaimed("Total reclaimed space: 1.5GiB") == int(1.5 * 1024 ** 3) + assert parse_reclaimed("no such line here") is None + assert parse_reclaimed("") is None + + +def test_notify_on_significant_reclaim(monkeypatch, ssh_configured, prune_defaults): + """Optional Telegram when reclaimed >= notify_min_gb; below threshold stays silent.""" + sends = [] + monkeypatch.setattr(bcp, "send_telegram", lambda text, **k: sends.append(text)) + monkeypatch.setattr(bcp.settings, "build_cache_prune_notify_min_gb", 1.0, raising=False) + monkeypatch.setattr( + bcp.subprocess, "run", + lambda *a, **k: _Completed(returncode=0, stdout="Total reclaimed space: 5.0GB"), + ) + pruner = BuildCachePruner(now_provider=lambda: 1.0) + pruner.tick() + assert len(sends) == 1 and "build-cache-pruner" in sends[0] + + # A small reclaim below the threshold stays silent. + sends.clear() + monkeypatch.setattr( + bcp.subprocess, "run", + lambda *a, **k: _Completed(returncode=0, stdout="Total reclaimed space: 100MB"), + ) + pruner2 = BuildCachePruner(now_provider=lambda: 1.0) + pruner2.tick() + assert sends == [] + + +# --------------------------------------------------------------------------- # +# TC-10: leaf isolation from the Quality Gate / stage machine +# --------------------------------------------------------------------------- # +def test_tc10_module_is_leaf_no_pipeline_imports(): + """TC-10: the pruner is a leaf — it does not import stage_engine/stages/qg. + + Inspects the actual import statements (via AST), not the docstring text — the + module legitimately *mentions* those names in prose explaining what it does NOT + touch. 
+ """ + import ast + import inspect + tree = ast.parse(inspect.getsource(bcp)) + imported = set() + for node in ast.walk(tree): + if isinstance(node, ast.Import): + imported.update(a.name for a in node.names) + elif isinstance(node, ast.ImportFrom): + base = ("." * (node.level or 0)) + (node.module or "") + imported.add(base) + imported.update(f"{base}.{a.name}" for a in node.names) + forbidden = ("stage_engine", "stages", "qg") + for imp in imported: + tail = imp.lstrip(".") + assert not any( + tail == f or tail.endswith("." + f) or tail.startswith(f + ".") + for f in forbidden + ), f"pruner must not import a pipeline module, found: {imp}" + + +def test_tc10_stage_transitions_and_qg_unchanged(): + """TC-10: STAGE_TRANSITIONS / QG_CHECKS carry no build-cache-prune element (AC-8).""" + from src.stages import STAGE_TRANSITIONS + from src.qg.checks import QG_CHECKS + blob = repr(STAGE_TRANSITIONS) + repr(list(QG_CHECKS.keys())) + assert "build_cache" not in blob + assert "builder prune" not in blob + + +# --------------------------------------------------------------------------- # +# TC-11: lifespan integration +# --------------------------------------------------------------------------- # +def test_tc11_lifespan_starts_and_stops(monkeypatch): + """TC-11: with the flag on the daemon starts in lifespan and stops cleanly, + docker mocked (FR-1/AC-1).""" + monkeypatch.setattr(bcp.settings, "build_cache_prune_enabled", True, raising=False) + # A very long interval so the loop sleeps immediately after the first tick; + # subprocess is mocked so no real docker call happens. 
+ monkeypatch.setattr(bcp.settings, "build_cache_prune_interval_s", 3600, raising=False) + monkeypatch.setattr(bcp.settings, "deploy_ssh_host", "", raising=False) # no-op tick + pruner = BuildCachePruner(interval_s=3600) + pruner.start() + assert pruner._thread is not None and pruner._thread.is_alive() + pruner.stop(timeout=5.0) + assert not pruner._thread.is_alive() + + +# --------------------------------------------------------------------------- # +# TC-12: GET /queue integration +# --------------------------------------------------------------------------- # +def test_tc12_queue_has_build_cache_block(monkeypatch): + """TC-12: GET /queue carries an additive build_cache_prune block; existing keys kept.""" + import asyncio + import src.db as db + from src.db import init_db + from src import main + + dbfile = os.path.join(tempfile.gettempdir(), "test_bcp_queue.db") + monkeypatch.setattr(db.settings, "db_path", dbfile, raising=False) + init_db() + + payload = asyncio.run(main.queue()) + + for key in ( + "counts", "max_concurrency", "poll_interval", "resilience", "reconcile", + "reaper", "post_deploy", "merge_verify", "task_deps", "serial_gate", + "auto_labels", "disk_monitor", "recent", + ): + assert key in payload, f"existing /queue key '{key}' must be preserved" + + assert "build_cache_prune" in payload + block = payload["build_cache_prune"] + assert "enabled" in block and "interval_s" in block and "until" in block + assert "last_run_ts" in block + + +def test_tc12_queue_disabled_block(monkeypatch): + """TC-12: with the kill-switch off, /queue reports build_cache_prune.enabled=false.""" + import asyncio + import src.db as db + from src.db import init_db + from src import main + from src import build_cache_pruner as bcpmod + + dbfile = os.path.join(tempfile.gettempdir(), "test_bcp_queue2.db") + monkeypatch.setattr(db.settings, "db_path", dbfile, raising=False) + monkeypatch.setattr(bcpmod.settings, "build_cache_prune_enabled", False, raising=False) + init_db() + + 
payload = asyncio.run(main.queue()) + assert payload["build_cache_prune"]["enabled"] is False