Add src/build_cache_pruner.py — a background daemon thread modelled 1:1 on src/disk_watchdog.py that periodically runs STRICTLY `docker builder prune -f --filter until=<until>` (BuildKit GC) on the HOST over ssh. It is the "second half" of the disk-watchdog (ORCH-063): the watchdog signals, the pruner cleans. Removes the root cause of the 07.06.2026 incident (build cache ~11GB -> disk 100% -> whole self-hosting pipeline down) automatically, without operator intervention. ADR-001 (Variant A): host-over-ssh, same channel as image_freshness/self_deploy (no docker CLI in the image). Touches ONLY the build cache — no image/system prune, no image/container removal, never restarts the docker daemon or the prod container (self-hosting safety). No ssh target -> tick is a no-op. - src/config.py: ORCH_BUILD_CACHE_PRUNE_* flags + defensive validators (interval/timeout >0, until ~ ^\d+[smhdw]?$, notify_min_gb >=0 -> safe default). - src/main.py: start last (after disk_watchdog) / stop first in lifespan; additive read-only build_cache_prune block in GET /queue. - never-raise on two levels (per-command + per-tick); kill-switch ORCH_BUILD_CACHE_PRUNE_ENABLED (false -> daemon does not start, 1:1 as before). - STAGE_TRANSITIONS / QG_CHECKS / check_* / _parse_* / DB schema UNCHANGED; last-run/last-result is in-memory (no migration). - tests/test_build_cache_pruner.py: TC-01..TC-12 (23 cases, docker fully mocked). - .env.example + CHANGELOG.md updated; INFRA.md / architecture docs already carry the component (architecture stage). Refs: ORCH-062 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
379 lines
16 KiB
Python
379 lines
16 KiB
Python
"""ORCH-062: build-cache-pruner tests (TC-01..TC-12).
|
|
|
|
The pruner never runs a real ``docker builder prune``: ``subprocess.run`` is
|
|
monkeypatched, ``send_telegram`` is captured, and the anti-frequency clock is
|
|
injected through ``now_provider`` so time-dependent decisions are tested without a
|
|
real timer (same convention as ``test_disk_watchdog.py``). No test touches the
|
|
real docker daemon or frees real disk.
|
|
"""
|
|
import os
|
|
import tempfile
|
|
|
|
import pytest
|
|
|
|
# Provide environment defaults before the app modules are imported below
# (same convention as test_disk_watchdog.py). ``setdefault`` leaves any value
# that is already present in the environment untouched.
os.environ.setdefault(
    "ORCH_DB_PATH",
    os.path.join(tempfile.gettempdir(), "test_orch_bcp.db"),
)
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
|
|
|
|
import src.build_cache_pruner as bcp # noqa: E402
|
|
from src.build_cache_pruner import ( # noqa: E402
|
|
BuildCachePruner,
|
|
build_prune_command,
|
|
decide_prune,
|
|
parse_reclaimed,
|
|
)
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Helpers
|
|
# --------------------------------------------------------------------------- #
|
|
class _Completed:
    """Lightweight fake of ``subprocess.CompletedProcess`` for stubbed runs.

    Stores only the three attributes passed to the constructor:
    ``returncode``, ``stdout`` and ``stderr``.
    """

    def __init__(self, returncode=0, stdout="", stderr=""):
        # Defaults describe a successful command that produced no output.
        self.returncode, self.stdout, self.stderr = returncode, stdout, stderr
|
|
|
|
|
|
@pytest.fixture
def ssh_configured(monkeypatch):
    """Patch the deploy ssh host/user settings so ``_ssh_target()`` is not None."""
    ssh_settings = {
        "deploy_ssh_host": "mva154",
        "deploy_ssh_user": "slin",
    }
    for attr, value in ssh_settings.items():
        # raising=False: patch the attribute even if settings does not define it.
        monkeypatch.setattr(bcp.settings, attr, value, raising=False)
|
|
|
|
|
|
@pytest.fixture
def prune_defaults(monkeypatch):
    """Patch the default prune policy onto the pruner module's settings.

    Values: enabled, ``until=24h``, ``all=False``, 120 s timeout and a notify
    threshold of 0.0 GB (silent).
    """
    policy = {
        "build_cache_prune_enabled": True,
        "build_cache_prune_until": "24h",
        "build_cache_prune_all": False,
        "build_cache_prune_timeout_s": 120,
        "build_cache_prune_notify_min_gb": 0.0,
    }
    for attr, value in policy.items():
        monkeypatch.setattr(bcp.settings, attr, value, raising=False)
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# TC-01 / TC-02: pure anti-frequency decision
|
|
# --------------------------------------------------------------------------- #
|
|
def test_tc01_decide_prune_when_interval_elapsed():
    """TC-01: ``decide_prune`` says PRUNE when nothing ran yet or the interval elapsed."""
    interval = 21600
    last_run = 1000.0

    # No previous prune recorded -> prune right away.
    assert decide_prune(None, now=last_run, interval_s=interval) is True

    # Exactly one interval later, and well past it.
    for elapsed in (21600, 30000):
        assert decide_prune(last_run, now=last_run + elapsed, interval_s=interval) is True
|
|
|
|
|
|
def test_tc02_decide_skip_within_interval():
    """TC-02: ``decide_prune`` says SKIP while the interval is still running (NFR-4)."""
    interval = 21600
    last_run = 1000.0

    # Shortly after the last run, and one second before the interval completes.
    for elapsed in (10, 21599):
        assert decide_prune(last_run, now=last_run + elapsed, interval_s=interval) is False
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# TC-03: safe command construction (retention filter, no image/system prune)
|
|
# --------------------------------------------------------------------------- #
|
|
def test_tc03_command_carries_until_and_is_builder_only():
    """TC-03: the built command is an ssh call running ``docker builder prune``.

    It must carry the ``until=<retention>`` filter and must never contain an
    image/system prune or the ``-a`` flag by default (FR-2/FR-3/AC-2/AC-3).
    """
    argv = build_prune_command("slin@mva154", "24h", prune_all=False)

    assert argv[0] == "ssh"
    assert "slin@mva154" in argv

    # The remote shell command is the last argv element.
    remote_cmd = argv[-1]
    for required in ("docker builder prune", "--filter until=24h"):
        assert required in remote_cmd

    # Strictly build cache: never images/system/containers.
    for banned in ("image prune", "system prune"):
        assert banned not in remote_cmd

    # The all-flag is not set by default.
    assert "-a" not in remote_cmd.split()
|
|
|
|
|
|
def test_tc03_all_flag_only_paired_with_until():
    """TC-03: with ``prune_all=True`` the ``-a`` flag appears together with the age filter (D2/AC-2)."""
    remote_cmd = build_prune_command("slin@mva154", "24h", prune_all=True)[-1]

    assert "docker builder prune" in remote_cmd
    assert "-a" in remote_cmd.split()
    # The retention filter is still present: never an unfiltered wipe.
    assert "--filter until=24h" in remote_cmd
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# TC-04: never-raise on subprocess exception / non-zero rc
|
|
# --------------------------------------------------------------------------- #
|
|
def test_tc04_subprocess_exception_does_not_raise(monkeypatch, ssh_configured, prune_defaults):
    """TC-04: an exception raised by ``subprocess.run`` stays inside ``tick()``.

    The tick must return normally and leave an error visible both on the
    instance and through ``status()``.
    """
    def _raise_oserror(*args, **kwargs):
        raise OSError("ssh exploded")

    monkeypatch.setattr(bcp.subprocess, "run", _raise_oserror)
    daemon = BuildCachePruner(now_provider=lambda: 1000.0)

    daemon.tick()  # must not raise

    assert daemon._last_error is not None
    snapshot = daemon.status()
    assert snapshot["last_error"] is not None
|
|
|
|
|
|
def test_tc04_nonzero_rc_recorded(monkeypatch, ssh_configured, prune_defaults):
    """TC-04: a command finishing with rc=1 is recorded as an error, not raised."""
    def _failed_run(*args, **kwargs):
        return _Completed(returncode=1, stderr="permission denied")

    monkeypatch.setattr(bcp.subprocess, "run", _failed_run)
    daemon = BuildCachePruner(now_provider=lambda: 1000.0)

    daemon.tick()

    assert "rc=1" in daemon._last_error
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# TC-05: never-raise on docker.sock / ssh unavailability
|
|
# --------------------------------------------------------------------------- #
|
|
def test_tc05_socket_unavailable_skips_tick(monkeypatch, ssh_configured, prune_defaults):
    """TC-05: a ``FileNotFoundError`` from the command does not escape ``tick()``."""
    def _missing_socket(*args, **kwargs):
        raise FileNotFoundError("docker.sock missing")

    monkeypatch.setattr(bcp.subprocess, "run", _missing_socket)
    daemon = BuildCachePruner(now_provider=lambda: 1000.0)

    daemon.tick()  # must not raise

    assert daemon._last_error is not None
|
|
|
|
|
|
def test_tc05_no_ssh_target_is_noop(monkeypatch, prune_defaults):
    """TC-05: with an empty ssh host the tick never calls ``subprocess.run``."""
    monkeypatch.setattr(bcp.settings, "deploy_ssh_host", "", raising=False)

    # Record every invocation of the stubbed subprocess.run.
    invocations = []
    monkeypatch.setattr(
        bcp.subprocess, "run",
        lambda *args, **kwargs: invocations.append((args, kwargs)),
    )
    daemon = BuildCachePruner(now_provider=lambda: 1000.0)

    daemon.tick()

    assert len(invocations) == 0
    assert "no ssh host" in daemon._last_error
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# TC-06: never-raise on timeout
|
|
# --------------------------------------------------------------------------- #
|
|
def test_tc06_timeout_swallowed(monkeypatch, ssh_configured, prune_defaults):
    """TC-06: ``TimeoutExpired`` from the command does not escape ``tick()`` (FR-6/AC-4)."""
    def _timed_out(*args, **kwargs):
        raise bcp.subprocess.TimeoutExpired(cmd="ssh ... docker builder prune", timeout=120)

    monkeypatch.setattr(bcp.subprocess, "run", _timed_out)
    daemon = BuildCachePruner(now_provider=lambda: 1000.0)

    daemon.tick()  # must not raise

    assert "timeout" in daemon._last_error
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# TC-07: kill-switch
|
|
# --------------------------------------------------------------------------- #
|
|
def test_tc07_killswitch_does_not_start(monkeypatch):
    """TC-07: with ``build_cache_prune_enabled=False`` ``start()`` creates no thread."""
    monkeypatch.setattr(bcp.settings, "build_cache_prune_enabled", False, raising=False)

    daemon = BuildCachePruner()
    daemon.start()

    assert daemon._thread is None
|
|
|
|
|
|
def test_tc07_killswitch_status_block(monkeypatch):
    """TC-07: under the kill-switch ``status()`` reports ``enabled`` as False."""
    monkeypatch.setattr(bcp.settings, "build_cache_prune_enabled", False, raising=False)

    snapshot = BuildCachePruner().status()

    assert snapshot["enabled"] is False
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# TC-08: config validation -> safe defaults
|
|
# --------------------------------------------------------------------------- #
|
|
def test_tc08_invalid_interval_falls_back_to_default():
    """TC-08: non-positive or non-numeric interval/timeout values become the defaults.

    Expected defaults: 21600 s for the interval, 120 s for the timeout.
    """
    from src.config import Settings

    non_positive = Settings(build_cache_prune_interval_s=0, build_cache_prune_timeout_s=-5)
    assert non_positive.build_cache_prune_interval_s == 21600
    assert non_positive.build_cache_prune_timeout_s == 120

    non_numeric = Settings(build_cache_prune_interval_s="not-a-number")
    assert non_numeric.build_cache_prune_interval_s == 21600
|
|
|
|
|
|
def test_tc08_invalid_until_falls_back_to_24h():
    """TC-08: an `until` not matching ^\\d+[smhdw]?$ becomes 24h; valid values are kept."""
    from src.config import Settings

    for rejected in ("garbage", ""):
        assert Settings(build_cache_prune_until=rejected).build_cache_prune_until == "24h"

    # Valid values are preserved.
    for accepted in ("48h", "30m", "7d"):
        assert Settings(build_cache_prune_until=accepted).build_cache_prune_until == accepted
|
|
|
|
|
|
def test_tc08_negative_notify_min_gb_falls_back_to_zero():
    """TC-08: a negative notify threshold becomes 0.0 (silent); a valid one is kept."""
    from src.config import Settings

    negative = Settings(build_cache_prune_notify_min_gb=-3)
    assert negative.build_cache_prune_notify_min_gb == 0.0

    valid = Settings(build_cache_prune_notify_min_gb=2.5)
    assert valid.build_cache_prune_notify_min_gb == 2.5
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# TC-09: status() never-raise + best-effort last result
|
|
# --------------------------------------------------------------------------- #
|
|
def test_tc09_status_shape(monkeypatch, prune_defaults):
    """TC-09: ``status()`` exposes every expected key before any tick has run."""
    monkeypatch.setattr(bcp.settings, "build_cache_prune_enabled", True, raising=False)

    snapshot = BuildCachePruner().status()

    expected_keys = (
        "enabled", "interval_s", "until", "all", "last_run_ts",
        "last_reclaimed", "last_reclaimed_bytes", "last_error",
    )
    for name in expected_keys:
        assert name in snapshot

    # No tick has run yet, so there is no last-run timestamp.
    assert snapshot["last_run_ts"] is None
|
|
|
|
|
|
def test_tc09_status_reflects_last_prune(monkeypatch, ssh_configured, prune_defaults):
    """TC-09: after a successful tick ``status()`` carries the run time and reclaimed size."""
    def _successful_run(*args, **kwargs):
        return _Completed(returncode=0, stdout="Total reclaimed space: 11.05GB")

    monkeypatch.setattr(bcp.subprocess, "run", _successful_run)
    daemon = BuildCachePruner(now_provider=lambda: 1234.0)

    daemon.tick()
    snapshot = daemon.status()

    # The timestamp comes from the injected clock.
    assert snapshot["last_run_ts"] == 1234.0
    assert snapshot["last_error"] is None
    # "GB" is expected to be parsed as a decimal unit (1000 ** 3 bytes).
    assert snapshot["last_reclaimed_bytes"] == int(11.05 * (1000 ** 3))
    assert "GB" in snapshot["last_reclaimed"]
|
|
|
|
|
|
def test_parse_reclaimed_variants():
    """``parse_reclaimed``: decimal and binary units parse; a missing line yields None."""
    sized_outputs = (
        ("Total reclaimed space: 0B", 0),
        ("Total reclaimed space: 500MB", 500 * 1000 ** 2),
        ("Total reclaimed space: 1.5GiB", int(1.5 * 1024 ** 3)),
    )
    for output, expected_bytes in sized_outputs:
        assert parse_reclaimed(output) == expected_bytes

    # Output without the "Total reclaimed space" line is reported as None.
    for output in ("no such line here", ""):
        assert parse_reclaimed(output) is None
|
|
|
|
|
|
def test_notify_on_significant_reclaim(monkeypatch, ssh_configured, prune_defaults):
    """A Telegram message is sent only when the reclaimed size reaches ``notify_min_gb``."""
    def _run_reporting(stdout):
        # Build a subprocess.run stub that succeeds with the given stdout.
        return lambda *args, **kwargs: _Completed(returncode=0, stdout=stdout)

    delivered = []
    monkeypatch.setattr(bcp, "send_telegram", lambda text, **kwargs: delivered.append(text))
    monkeypatch.setattr(bcp.settings, "build_cache_prune_notify_min_gb", 1.0, raising=False)

    # 5.0GB is above the 1.0 GB threshold: exactly one notification.
    monkeypatch.setattr(bcp.subprocess, "run", _run_reporting("Total reclaimed space: 5.0GB"))
    loud = BuildCachePruner(now_provider=lambda: 1.0)
    loud.tick()
    assert len(delivered) == 1
    assert "build-cache-pruner" in delivered[0]

    # A small reclaim below the threshold stays silent.
    delivered.clear()
    monkeypatch.setattr(bcp.subprocess, "run", _run_reporting("Total reclaimed space: 100MB"))
    quiet = BuildCachePruner(now_provider=lambda: 1.0)
    quiet.tick()
    assert delivered == []
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# TC-10: leaf isolation from the Quality Gate / stage machine
|
|
# --------------------------------------------------------------------------- #
|
|
def test_tc10_module_is_leaf_no_pipeline_imports():
    """TC-10: the pruner is a leaf — it does not import stage_engine/stages/qg.

    Inspects the actual import statements (via AST), not the docstring text — the
    module legitimately *mentions* those names in prose explaining what it does NOT
    touch.

    A forbidden name is matched against every dotted component of an import,
    so nested forms such as ``import src.qg.checks`` or
    ``from src.qg.checks import QG_CHECKS`` are rejected as well, not only
    imports that start or end with the forbidden name.
    """
    import ast
    import inspect

    tree = ast.parse(inspect.getsource(bcp))

    # Collect every imported dotted name: "pkg.mod" for ``import pkg.mod``,
    # and both "pkg" and "pkg.name" for ``from pkg import name``.
    imported = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            imported.update(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom):
            # Relative imports keep their leading dots; module is None for
            # ``from . import x``.
            base = ("." * (node.level or 0)) + (node.module or "")
            imported.add(base)
            imported.update(f"{base}.{alias.name}" for alias in node.names)

    forbidden = ("stage_engine", "stages", "qg")
    for imp in imported:
        # Strip the relative-import dots, then compare component by component.
        components = imp.lstrip(".").split(".")
        assert not any(
            part in forbidden for part in components
        ), f"pruner must not import a pipeline module, found: {imp}"
|
|
|
|
|
|
def test_tc10_stage_transitions_and_qg_unchanged():
    """TC-10: neither STAGE_TRANSITIONS nor the QG_CHECKS keys mention the pruner (AC-8)."""
    from src.qg.checks import QG_CHECKS
    from src.stages import STAGE_TRANSITIONS

    # Search the textual form of the transition table and of the check names.
    rendered = repr(STAGE_TRANSITIONS) + repr(list(QG_CHECKS.keys()))
    for marker in ("build_cache", "builder prune"):
        assert marker not in rendered
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# TC-11: lifespan integration
|
|
# --------------------------------------------------------------------------- #
|
|
def test_tc11_lifespan_starts_and_stops(monkeypatch):
    """TC-11: with the flag on the daemon thread starts and stops cleanly (FR-1/AC-1).

    The test drives ``start()`` / ``stop()`` directly, the same pair of calls
    the application lifespan is expected to make. No docker command runs: the
    ssh host is empty, so the tick is a no-op.
    """
    monkeypatch.setattr(bcp.settings, "build_cache_prune_enabled", True, raising=False)
    # A very long interval so the loop sleeps immediately after the first tick.
    monkeypatch.setattr(bcp.settings, "build_cache_prune_interval_s", 3600, raising=False)
    monkeypatch.setattr(bcp.settings, "deploy_ssh_host", "", raising=False)  # no-op tick

    pruner = BuildCachePruner(interval_s=3600)
    pruner.start()
    try:
        assert pruner._thread is not None and pruner._thread.is_alive()
    finally:
        # Stop the worker even when the liveness check fails, so a failing
        # run does not leave a background thread alive for later tests.
        pruner.stop(timeout=5.0)

    assert not pruner._thread.is_alive()
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# TC-12: GET /queue integration
|
|
# --------------------------------------------------------------------------- #
|
|
def test_tc12_queue_has_build_cache_block(monkeypatch):
    """TC-12: GET /queue gains a ``build_cache_prune`` block and keeps its existing keys."""
    import asyncio

    import src.db as db
    from src import main
    from src.db import init_db

    # Point the DB layer at a scratch database file before initialising it.
    scratch_db = os.path.join(tempfile.gettempdir(), "test_bcp_queue.db")
    monkeypatch.setattr(db.settings, "db_path", scratch_db, raising=False)
    init_db()

    response = asyncio.run(main.queue())

    preexisting_keys = (
        "counts", "max_concurrency", "poll_interval", "resilience", "reconcile",
        "reaper", "post_deploy", "merge_verify", "task_deps", "serial_gate",
        "auto_labels", "disk_monitor", "recent",
    )
    for key in preexisting_keys:
        assert key in response, f"existing /queue key '{key}' must be preserved"

    assert "build_cache_prune" in response
    prune_block = response["build_cache_prune"]
    for field in ("enabled", "interval_s", "until"):
        assert field in prune_block
    assert "last_run_ts" in prune_block
|
|
|
|
|
|
def test_tc12_queue_disabled_block(monkeypatch):
    """TC-12: with the kill-switch off, /queue reports ``build_cache_prune.enabled`` as False."""
    import asyncio

    import src.db as db
    from src import build_cache_pruner as bcpmod
    from src import main
    from src.db import init_db

    # Scratch database plus the kill-switch, both patched before init_db().
    scratch_db = os.path.join(tempfile.gettempdir(), "test_bcp_queue2.db")
    monkeypatch.setattr(db.settings, "db_path", scratch_db, raising=False)
    monkeypatch.setattr(bcpmod.settings, "build_cache_prune_enabled", False, raising=False)
    init_db()

    response = asyncio.run(main.queue())

    prune_block = response["build_cache_prune"]
    assert prune_block["enabled"] is False
|