feat(disk-watchdog): host-FS fill heartbeat + Telegram alert at >=85% (ORCH-063)

Adds src/disk_watchdog.py — a background daemon thread modelled on reconciler/job_reaper that measures host-FS fill via the mounted bind-paths (/repos, /app/data) with shutil.disk_usage and Telegram-alerts the operator at >= threshold (default 85%). The missing proactive signal: on 07.06.2026 the mva154 host disk silently hit 100% and stalled the whole self-hosting pipeline. - Pure decide_action(used_pct, threshold, prev, now, realert_s): alert on crossing up, cooldown re-alert, single recovery below threshold (unit-tested without a thread/timer; clock injected). - measure_paths: shutil.disk_usage per path, dedup by st_dev, per-path never-raise (a broken path never fails the tick). - Config flags ORCH_DISK_MONITOR_* with defensive validation (threshold 1..100, positive intervals -> default + warning). Kill-switch -> daemon does not start. - Additive disk_monitor block in GET /queue; start/stop in main.lifespan. - never-raise (per-path/per-tick/per-send); STAGE_TRANSITIONS/QG_CHECKS/check_*/ DB schema untouched, no migration (anti-spam state in-memory). Tests: tests/test_disk_watchdog.py (TC-01..TC-12, 18 cases); full suite green (1296). Docs: INFRA.md, .env.example, CHANGELOG.md (architecture/README.md + ADRs authored at architecture stage). Refs: ORCH-063 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 18:51:37 +03:00
parent 4d9251c698
commit 8759cb7df8
7 changed files with 817 additions and 0 deletions
--- a/src/config.py
+++ b/src/config.py
@@ -1,3 +1,5 @@
+import logging
+
 from pydantic import field_validator
 from pydantic_settings import BaseSettings

@@ -381,6 +383,68 @@ class Settings(BaseSettings):
    reaper_finalize_grace_s: int = 300
    lease_reclaim_enabled: bool = True

+    # ORCH-063: disk-watchdog — background heartbeat that measures host-FS fill via
+    # the mounted bind-paths and Telegram-alerts the operator at >= threshold. On
+    # 07.06.2026 the mva154 host disk silently hit 100% and stalled the WHOLE
+    # self-hosting pipeline; the watchdog is the missing proactive signal. Modelled
+    # on reconciler/job_reaper (daemon thread, start/stop in main.lifespan, /queue
+    # snapshot, never-raise). Anti-spam state is in-memory (no DB migration).
+    #   disk_monitor_enabled       -> kill-switch; False -> the daemon does not start
+    #                                 (zero regression), env ORCH_DISK_MONITOR_ENABLED.
+    #   disk_monitor_interval_s    -> heartbeat measurement period, seconds (order of
+    #                                 minutes; cheap shutil.disk_usage, no df subprocess).
+    #   disk_monitor_threshold_pct -> fill % that triggers the alert (Owner-fixed 85).
+    #   disk_monitor_realert_s     -> min interval between repeat alerts while still
+    #                                 above threshold (anti-spam cooldown, ~6h).
+    #   disk_monitor_paths         -> CSV of monitored HOST bind-paths (NOT overlay /);
+    #                                 empty -> the default set (/repos, /app/data).
+    # Defensive validation (ADR-001 D7): threshold out of 1..100 or a non-positive
+    # interval -> default + warning (the process never crashes on a bad env value).
+    disk_monitor_enabled: bool = True
+    disk_monitor_interval_s: int = 300
+    disk_monitor_threshold_pct: int = 85
+    disk_monitor_realert_s: int = 21600
+    disk_monitor_paths: str = "/repos,/app/data"
+
+    @field_validator(
+        "disk_monitor_interval_s", "disk_monitor_realert_s", mode="before"
+    )
+    @classmethod
+    def _disk_positive_int(cls, v, info):
+        # Non-positive / non-numeric interval -> the field default (never crash).
+        _defaults = {"disk_monitor_interval_s": 300, "disk_monitor_realert_s": 21600}
+        fallback = _defaults.get(info.field_name, 1)
+        try:
+            if v is None or (isinstance(v, str) and v.strip() == ""):
+                return fallback
+            iv = int(v)
+            if iv <= 0:
+                logging.getLogger("orchestrator.config").warning(
+                    "%s must be > 0, got %s; falling back to %s",
+                    info.field_name, v, fallback,
+                )
+                return fallback
+            return iv
+        except (TypeError, ValueError):
+            return fallback
+
+    @field_validator("disk_monitor_threshold_pct", mode="before")
+    @classmethod
+    def _disk_threshold_pct(cls, v):
+        # Threshold must be a percentage in 1..100; otherwise -> default 85.
+        try:
+            if v is None or (isinstance(v, str) and v.strip() == ""):
+                return 85
+            iv = int(v)
+            if 1 <= iv <= 100:
+                return iv
+            logging.getLogger("orchestrator.config").warning(
+                "disk_monitor_threshold_pct must be 1..100, got %s; using 85", v
+            )
+            return 85
+        except (TypeError, ValueError):
+            return 85
+
    # ORCH-071: merge-verify under-gate on the `deploy -> done` edge. For the
    # self-hosting repo the `deploy` stage runs the DETERMINISTIC self-deploy path
    # (Phase A/B/C), where the LLM `deployer` agent — historically the ONLY actor
--- a/src/disk_watchdog.py
+++ b/src/disk_watchdog.py
@@ -0,0 +1,358 @@
+"""ORCH-063: disk-watchdog — host-FS fill heartbeat + Telegram alert at >=85%.
+
+On 07.06.2026 the mva154 host disk silently grew to 100% and took down the WHOLE
+self-hosting pipeline of every project (one prod ``orchestrator`` instance serves
+all prod projects from a shared DB/queue). The system had no proactive signal —
+the operator only learned of the problem once the instance was already stuck.
+
+This module is a background daemon thread modelled 1:1 on ``reconciler``
+(ORCH-053) and ``job_reaper`` (ORCH-065): a ``threading.Thread(daemon=True)`` +
+``threading.Event`` for a clean stop, the ``start()`` / ``stop(timeout)`` /
+``status()`` contract, a ``/queue`` snapshot, per-tick never-raise and a
+kill-switch (``ORCH_DISK_MONITOR_ENABLED``). Each tick measures the fill of the
+mounted **host** bind-paths (``/repos``, ``/app/data``) via stdlib
+``shutil.disk_usage`` — NOT the container overlay ``/``, NOT a ``df`` subprocess —
+deduplicates paths by physical device (``st_dev``), and through a pure decision
+function from ``(used_pct, threshold, prev_state, now, realert_s)`` decides to
+alert (threshold crossed up), re-alert (cooldown elapsed), send recovery (back
+below threshold) or stay silent.
+
+Invariants (TRZ §10 / ADR-001):
+  * ``STAGE_TRANSITIONS`` / ``QG_CHECKS`` / ``check_*`` / the DB schema are
+    UNCHANGED — the watchdog is an operational daemon, not a Quality Gate (like
+    ``reconciler`` / ``job_reaper``). No new migration (anti-spam state is
+    in-memory, best-effort, may reset on restart — safe: an early signal, not an
+    SLA).
+  * never-raise on three levels: per-path (a broken path is skipped, the rest are
+    measured), per-tick (outer ``try/except`` in ``_run``), per-send
+    (``send_telegram`` wrapped).
+  * Self-hosting safety: the watchdog only READS fill and SENDS Telegram — it
+    never touches the disk/container, never restarts prod. Safe for enduro-trails
+    in the shared instance.
+  * Kill-switch ``disk_monitor_enabled=False`` -> the daemon does not start
+    (``main.lifespan`` guard) and ``/queue`` returns ``{"enabled": false}`` —
+    behaviour 1:1 as before.
+
+See docs/work-items/ORCH-063/06-adr/ADR-001-disk-watchdog.md and the cross-cutting
+docs/architecture/adr/adr-0024-disk-watchdog.md.
+"""
+
+import logging
+import os
+import shutil
+import socket
+import threading
+import time
+from dataclasses import dataclass
+from datetime import datetime, timezone
+
+from .config import settings
+from .notifications import send_telegram
+
+logger = logging.getLogger("orchestrator.disk_watchdog")
+
+_BYTES_PER_GB = 1024 ** 3
+
+# Decision actions returned by ``decide_action`` (D3).
+ACTION_NONE = "none"
+ACTION_ALERT = "alert"
+ACTION_REALERT = "realert"
+ACTION_RECOVERY = "recovery"
+
+
+@dataclass
+class PathAlertState:
+    """In-memory anti-spam state for one logical device/path (D3).
+
+    Best-effort: lives only in the daemon (no DB row, no migration). After a
+    process restart ``alerting`` resets to ``False`` -> a still-full disk re-alerts
+    once, which is safe (an early signal, not an SLA; TRZ §5/NFR-5).
+    """
+
+    alerting: bool = False
+    last_alert_at: float | None = None
+
+
+def _resolve_host() -> str:
+    """Best-effort host label for alert text (never raises).
+
+    The prod container runs ``network_mode: host`` so ``gethostname()`` resolves
+    to the real host (``mva154``). Any failure -> the neutral ``"host"``.
+    """
+    try:
+        name = socket.gethostname()
+        return name or "host"
+    except Exception:  # noqa: BLE001 - never break the tick
+        return "host"
+
+
+def parse_paths(raw: str) -> list[str]:
+    """Parse the ``disk_monitor_paths`` CSV into a clean path list.
+
+    Empty / blank -> the default host bind-paths (``/repos``, ``/app/data``,
+    TRZ §8). Never raises.
+    """
+    default = ["/repos", "/app/data"]
+    try:
+        if not raw or not raw.strip():
+            return default
+        paths = [p.strip() for p in raw.split(",") if p.strip()]
+        return paths or default
+    except Exception:  # noqa: BLE001 - never break the tick
+        return default
+
+
+def decide_action(
+    used_pct: float,
+    threshold: float,
+    prev: PathAlertState,
+    now: float,
+    realert_s: float,
+) -> str:
+    """Pure alert decision (D3) — testable without a thread or a real timer.
+
+    Returns one of ``ACTION_{NONE,ALERT,REALERT,RECOVERY}`` as a function of the
+    current fill, the threshold, the previous per-path state and the injected
+    clock:
+
+      * not alerting & ``used_pct >= threshold``      -> ALERT     (crossed up)
+      * alerting & still ``>= threshold`` & cooldown   -> REALERT   (re-alert)
+      * alerting & still ``>= threshold`` & in cooldown-> NONE      (anti-spam)
+      * alerting & ``used_pct < threshold``            -> RECOVERY  (crossed down)
+      * not alerting & ``used_pct < threshold``        -> NONE      (normal)
+
+    Threshold is inclusive: ``used_pct == threshold`` counts as exceeding
+    (``>=``, TC-05).
+    """
+    above = used_pct >= threshold
+    if not prev.alerting:
+        return ACTION_ALERT if above else ACTION_NONE
+    # prev.alerting is True
+    if not above:
+        return ACTION_RECOVERY
+    last = prev.last_alert_at
+    if last is None or (now - last) >= realert_s:
+        return ACTION_REALERT
+    return ACTION_NONE
+
+
+def _measure_one(path: str) -> dict | None:
+    """Measure one path via ``shutil.disk_usage`` (D1). Never raises.
+
+    Returns a measurement dict, or ``None`` if the path is missing / unreadable
+    (``FileNotFoundError`` / ``PermissionError`` / ``OSError``) -> the caller skips
+    THIS path and keeps measuring the others (FR-2, AC-6: one broken path never
+    fails the whole tick).
+    """
+    try:
+        usage = shutil.disk_usage(path)
+        total = int(usage.total)
+        used = int(usage.used)
+        free = int(usage.free)
+        used_pct = round(used / total * 100, 1) if total > 0 else 0.0
+        free_pct = round(free / total * 100, 1) if total > 0 else 0.0
+        return {
+            "path": path,
+            "total_bytes": total,
+            "used_bytes": used,
+            "free_bytes": free,
+            "used_pct": used_pct,
+            "free_pct": free_pct,
+            "free_gb": round(free / _BYTES_PER_GB, 1),
+        }
+    except Exception as e:  # noqa: BLE001 - skip this path, keep the tick alive
+        logger.warning("disk-watchdog: cannot measure path %s, skipping: %s", path, e)
+        return None
+
+
+def _dedup_key(path: str) -> object:
+    """Physical-device dedup key (D2): ``st_dev`` if resolvable, else the path.
+
+    Paths sharing a device (``/repos`` and ``/app/data`` on the same host
+    partition) collapse to one logical partition -> one alert, not two. Failure to
+    ``os.stat`` -> fail-open (the path is its own key, measured independently).
+    """
+    try:
+        return os.stat(path).st_dev
+    except Exception:  # noqa: BLE001 - fail-open, treat as a distinct device
+        return path
+
+
+def measure_paths(paths: list[str]) -> list[dict]:
+    """Measure every path, deduplicated by physical device (D1/D2). Never raises.
+
+    For each distinct ``st_dev`` the FIRST successfully-measured path is kept and
+    carries a stable ``dedup_key`` (so anti-spam state is per-device). A path that
+    fails to measure is skipped (AC-6).
+    """
+    out: list[dict] = []
+    seen: set[object] = set()
+    for path in paths:
+        key = _dedup_key(path)
+        if key in seen:
+            continue
+        m = _measure_one(path)
+        if m is None:
+            continue
+        seen.add(key)
+        m["dedup_key"] = key
+        out.append(m)
+    return out
+
+
+def format_alert_message(m: dict, threshold: float, host: str) -> str:
+    """Actionable Telegram alert text (FR-3/AC-2): host, path, used %, free, threshold."""
+    return (
+        f"\U0001f534 Диск {host}: {m['path']} заполнен на {m['used_pct']}% "
+        f"(порог {threshold}%). Свободно {m['free_gb']} ГБ ({m['free_pct']}%). "
+        f"Освободите место — риск остановки конвейера всех проектов."
+    )
+
+
+def format_recovery_message(m: dict, host: str) -> str:
+    """Single recovery message when fill returns below threshold (FR-4/AC-4)."""
+    return (
+        f"\U0001f7e2 Диск {host}: {m['path']} вернулся ниже порога — "
+        f"{m['used_pct']}% (свободно {m['free_gb']} ГБ)."
+    )
+
+
+class DiskWatchdog:
+    """Background daemon measuring host-FS fill and alerting on >= threshold.
+
+    Modelled on ``Reconciler`` / ``JobReaper``: a ``threading.Thread(daemon=True)``
+    + a ``threading.Event`` for a clean stop. The only in-memory state is the
+    best-effort anti-spam map (``_states``), the last-measurement snapshot
+    (``_last``) and ``last_run_ts`` — all reset on restart, which is safe (D3).
+
+    ``now_provider`` is injectable so the cooldown / recovery logic is testable
+    deterministically without a real timer (AC-3).
+    """
+
+    def __init__(self, interval_s: float | None = None, now_provider=None):
+        self.interval_s = (
+            interval_s if interval_s is not None else settings.disk_monitor_interval_s
+        )
+        self._now = now_provider or time.time
+        self._stop = threading.Event()
+        self._thread: threading.Thread | None = None
+        self._host = _resolve_host()
+        # Best-effort in-memory state, per dedup_key (device/path).
+        self._states: dict[object, PathAlertState] = {}
+        self._last: dict[object, dict] = {}
+        self.last_run_ts: float | None = None
+
+    # -- config helpers ----------------------------------------------------
+    @property
+    def _threshold(self) -> int:
+        return settings.disk_monitor_threshold_pct
+
+    @property
+    def _realert_s(self) -> int:
+        return settings.disk_monitor_realert_s
+
+    def _paths(self) -> list[str]:
+        return parse_paths(settings.disk_monitor_paths)
+
+    # -- tick --------------------------------------------------------------
+    def tick(self) -> None:
+        """One measurement pass over all monitored paths (never-raise per send).
+
+        Measures every (deduplicated) path, runs the pure ``decide_action`` per
+        device and dispatches the resulting alert / re-alert / recovery via
+        ``send_telegram`` (notifying). Telegram failures are logged and swallowed
+        (best-effort delivery, AC-6).
+        """
+        threshold = self._threshold
+        realert_s = self._realert_s
+        now = self._now()
+        for m in measure_paths(self._paths()):
+            key = m["dedup_key"]
+            prev = self._states.get(key) or PathAlertState()
+            action = decide_action(m["used_pct"], threshold, prev, now, realert_s)
+            if action in (ACTION_ALERT, ACTION_REALERT):
+                self._send(format_alert_message(m, threshold, self._host), notifying=True)
+                self._states[key] = PathAlertState(alerting=True, last_alert_at=now)
+            elif action == ACTION_RECOVERY:
+                self._send(format_recovery_message(m, self._host), notifying=True)
+                self._states[key] = PathAlertState(alerting=False, last_alert_at=None)
+            # ACTION_NONE: leave prev state untouched (anti-spam / normal).
+            # Record the snapshot for /queue observability.
+            cur = self._states.get(key) or prev
+            self._last[key] = {
+                "path": m["path"],
+                "used_pct": m["used_pct"],
+                "free_gb": m["free_gb"],
+                "free_pct": m["free_pct"],
+                "alerting": cur.alerting,
+                "last_alert_at": cur.last_alert_at,
+            }
+
+    def _send(self, text: str, notifying: bool) -> None:
+        """Send a Telegram alert (notifying, not silent). Never raises (AC-6)."""
+        try:
+            send_telegram(text, disable_notification=not notifying)
+        except Exception as e:  # noqa: BLE001 - delivery is best-effort
+            logger.warning("disk-watchdog: telegram send failed: %s", e)
+
+    # -- loop / lifecycle --------------------------------------------------
+    def _tick(self) -> None:
+        try:
+            self.tick()
+        finally:
+            self.last_run_ts = datetime.now(timezone.utc).timestamp()
+
+    def _run(self) -> None:
+        logger.info(
+            "DiskWatchdog started (interval=%ss, threshold=%s%%, realert=%ss, "
+            "paths=%s, enabled=%s)",
+            self.interval_s, self._threshold, self._realert_s,
+            self._paths(), settings.disk_monitor_enabled,
+        )
+        while not self._stop.is_set():
+            try:
+                self._tick()
+            except Exception as e:  # noqa: BLE001 - outer never-raise
+                logger.error("DiskWatchdog loop error: %s", e)
+            self._stop.wait(self.interval_s)
+        logger.info("DiskWatchdog stopped")
+
+    def start(self) -> None:
+        """Start the daemon thread (idempotent: a live thread is a no-op).
+
+        Honours the kill-switch: ``disk_monitor_enabled=False`` -> no-op (the
+        daemon never starts; ``main.lifespan`` also guards, AC-5/TC-09).
+        """
+        if not settings.disk_monitor_enabled:
+            return
+        if self._thread and self._thread.is_alive():
+            return
+        self._stop.clear()
+        self._thread = threading.Thread(
+            target=self._run, name="disk-watchdog", daemon=True
+        )
+        self._thread.start()
+
+    def stop(self, timeout: float = 5.0) -> None:
+        self._stop.set()
+        if self._thread:
+            self._thread.join(timeout=timeout)
+
+    def status(self) -> dict:
+        """Disk-monitor snapshot for /queue observability (FR-6/AC-7). Never raises."""
+        try:
+            return {
+                "enabled": settings.disk_monitor_enabled,
+                "threshold_pct": self._threshold,
+                "interval_s": self.interval_s,
+                "realert_s": self._realert_s,
+                "last_run_ts": self.last_run_ts,
+                "paths": list(self._last.values()),
+            }
+        except Exception as e:  # noqa: BLE001 - observability must never raise
+            logger.warning("disk-watchdog: status() failed: %s", e)
+            return {"enabled": settings.disk_monitor_enabled}
+
+
+# Module-level singleton used by the FastAPI lifespan.
+disk_watchdog = DiskWatchdog()
--- a/src/main.py
+++ b/src/main.py
@@ -105,9 +105,19 @@ async def lifespan(app: FastAPI):
    from .job_reaper import reaper
    reaper.start()

+    # ORCH-063: start the disk-watchdog LAST (after the reaper). It is independent
+    # of the queue/DB — it only reads host-FS fill and Telegram-alerts at >=
+    # threshold — so the order is not critical, but we follow the daemon
+    # convention. Honours the kill-switch ORCH_DISK_MONITOR_ENABLED (start() is a
+    # no-op when disabled, so behaviour is 1:1 as before).
+    from .disk_watchdog import disk_watchdog
+    disk_watchdog.start()
+
    try:
        yield
    finally:
+        # ORCH-063: stop the disk-watchdog first (reverse of startup).
+        disk_watchdog.stop()
        # Graceful shutdown order mirrors startup in reverse: stop the reaper
        # first, then the reconciler (it must not enqueue new work while the
        # worker is winding down), then the worker. Running agents keep going;
@@ -151,6 +161,7 @@ async def queue():
    from . import task_deps
    from . import serial_gate
    from . import labels
+    from .disk_watchdog import disk_watchdog
    return {
        "counts": job_status_counts(),
        "max_concurrency": worker.max_concurrency,
@@ -169,6 +180,10 @@ async def queue():
        # ORCH-089 (D7): auto-mode-by-label observability (read-only) — kill-switch,
        # label names, scope. Additive block.
        "auto_labels": labels.snapshot(),
+        # ORCH-063 (FR-6 / AC-7): disk-watchdog observability (read-only) —
+        # enabled, threshold, interval, last measurement per host-path. Additive
+        # block; never-raise (status() returns {"enabled": ...} minimum on error).
+        "disk_monitor": disk_watchdog.status(),
        "recent": recent_jobs(10),
    }