Adds src/disk_watchdog.py — a background daemon thread modelled on reconciler/job_reaper that measures host-FS fill via the mounted bind-paths (/repos, /app/data) with shutil.disk_usage and Telegram-alerts the operator at >= threshold (default 85%). The missing proactive signal: on 07.06.2026 the mva154 host disk silently hit 100% and stalled the whole self-hosting pipeline. - Pure decide_action(used_pct, threshold, prev, now, realert_s): alert on crossing up, cooldown re-alert, single recovery below threshold (unit-tested without a thread/timer; clock injected). - measure_paths: shutil.disk_usage per path, dedup by st_dev, per-path never-raise (a broken path never fails the tick). - Config flags ORCH_DISK_MONITOR_* with defensive validation (threshold 1..100, positive intervals -> default + warning). Kill-switch -> daemon does not start. - Additive disk_monitor block in GET /queue; start/stop in main.lifespan. - never-raise (per-path/per-tick/per-send); STAGE_TRANSITIONS/QG_CHECKS/check_*/ DB schema untouched, no migration (anti-spam state in-memory). Tests: tests/test_disk_watchdog.py (TC-01..TC-12, 18 cases); full suite green (1296). Docs: INFRA.md, .env.example, CHANGELOG.md (architecture/README.md + ADRs authored at architecture stage). Refs: ORCH-063 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
359 lines
14 KiB
Python
359 lines
14 KiB
Python
"""ORCH-063: disk-watchdog — host-FS fill heartbeat + Telegram alert at >=85%.
|
|
|
|
On 07.06.2026 the mva154 host disk silently grew to 100% and took down the WHOLE
self-hosting pipeline of every project (one prod ``orchestrator`` instance serves
all prod projects from a shared DB/queue). The system had no proactive signal —
the operator only learned of the problem once the instance was already stuck.

This module is a background daemon thread modelled 1:1 on ``reconciler``
(ORCH-053) and ``job_reaper`` (ORCH-065): a ``threading.Thread(daemon=True)`` +
``threading.Event`` for a clean stop, the ``start()`` / ``stop(timeout)`` /
``status()`` contract, a ``/queue`` snapshot, per-tick never-raise and a
kill-switch (``ORCH_DISK_MONITOR_ENABLED``). Each tick measures the fill of the
mounted **host** bind-paths (``/repos``, ``/app/data``) via stdlib
``shutil.disk_usage`` — NOT the container overlay ``/``, NOT a ``df`` subprocess —
deduplicates paths by physical device (``st_dev``), and through a pure decision
function from ``(used_pct, threshold, prev_state, now, realert_s)`` decides to
alert (threshold crossed up), re-alert (cooldown elapsed), send recovery (back
below threshold) or stay silent.

Invariants (TRZ §10 / ADR-001):

* ``STAGE_TRANSITIONS`` / ``QG_CHECKS`` / ``check_*`` / the DB schema are
  UNCHANGED — the watchdog is an operational daemon, not a Quality Gate (like
  ``reconciler`` / ``job_reaper``). No new migration (anti-spam state is
  in-memory, best-effort, may reset on restart — safe: an early signal, not an
  SLA).
* never-raise on three levels: per-path (a broken path is skipped, the rest are
  measured), per-tick (outer ``try/except`` in ``_run``), per-send
  (``send_telegram`` wrapped).
* Self-hosting safety: the watchdog only READS fill and SENDS Telegram — it
  never touches the disk/container, never restarts prod. Safe for enduro-trails
  in the shared instance.
* Kill-switch ``disk_monitor_enabled=False`` -> the daemon does not start
  (``main.lifespan`` guard) and ``/queue`` returns ``{"enabled": false}`` —
  behaviour 1:1 as before.

See docs/work-items/ORCH-063/06-adr/ADR-001-disk-watchdog.md and the cross-cutting
docs/architecture/adr/adr-0024-disk-watchdog.md.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import shutil
|
|
import socket
|
|
import threading
|
|
import time
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
|
|
from .config import settings
|
|
from .notifications import send_telegram
|
|
|
|
# Dedicated logger for every message emitted by this module.
logger = logging.getLogger("orchestrator.disk_watchdog")

# Divisor used to express byte counts in (binary) gigabytes: 2**30 == 1024**3.
_BYTES_PER_GB = 2 ** 30

# String tags returned by ``decide_action`` (D3): stay silent, first alert,
# repeated alert after the cooldown, and recovery below the threshold.
ACTION_NONE, ACTION_ALERT, ACTION_REALERT, ACTION_RECOVERY = (
    "none",
    "alert",
    "realert",
    "recovery",
)
|
|
|
|
|
|
@dataclass
|
|
class PathAlertState:
|
|
"""In-memory anti-spam state for one logical device/path (D3).
|
|
|
|
Best-effort: lives only in the daemon (no DB row, no migration). After a
|
|
process restart ``alerting`` resets to ``False`` -> a still-full disk re-alerts
|
|
once, which is safe (an early signal, not an SLA; TRZ §5/NFR-5).
|
|
"""
|
|
|
|
alerting: bool = False
|
|
last_alert_at: float | None = None
|
|
|
|
|
|
def _resolve_host() -> str:
    """Return a label identifying this host for use in alert text.

    Uses ``socket.gethostname()``; an empty result or any exception yields the
    neutral label ``"host"`` instead, so this helper never raises.
    (Per the original note, prod runs with host networking, so the name is
    expected to be the real host — not verifiable from this module.)
    """
    fallback = "host"
    try:
        hostname = socket.gethostname()
    except Exception:  # noqa: BLE001 - never break the tick
        return fallback
    return hostname if hostname else fallback
|
|
|
|
|
|
def parse_paths(raw: str) -> list[str]:
    """Turn the comma-separated ``disk_monitor_paths`` setting into a path list.

    Each comma-separated entry is stripped of surrounding whitespace and empty
    entries are dropped. When nothing usable remains (empty / blank input, only
    separators) or the input cannot be processed at all, the default host
    bind-paths ``/repos`` and ``/app/data`` (TRZ §8) are returned. Never raises.
    """
    fallback = ["/repos", "/app/data"]
    try:
        if not raw or not raw.strip():
            return fallback
        cleaned = []
        for chunk in raw.split(","):
            entry = chunk.strip()
            if entry:
                cleaned.append(entry)
        return cleaned if cleaned else fallback
    except Exception:  # noqa: BLE001 - never break the tick
        return fallback
|
|
|
|
|
|
def decide_action(
    used_pct: float,
    threshold: float,
    prev: PathAlertState,
    now: float,
    realert_s: float,
) -> str:
    """Pick the notification action for one device (D3) — a pure function.

    No thread, timer or I/O is involved: the clock value ``now`` is passed in,
    which keeps the decision deterministic under test.

    Decision table (the threshold is inclusive, ``used_pct == threshold``
    counts as exceeded, TC-05):

    * already alerting, fill dropped below threshold      -> ``ACTION_RECOVERY``
    * already alerting, still above, cooldown has elapsed -> ``ACTION_REALERT``
    * already alerting, still above, inside the cooldown  -> ``ACTION_NONE``
    * not alerting, fill at/above threshold               -> ``ACTION_ALERT``
    * not alerting, fill below threshold                  -> ``ACTION_NONE``

    A missing ``prev.last_alert_at`` while alerting is treated as "cooldown
    elapsed".
    """
    over_limit = used_pct >= threshold

    if prev.alerting:
        if not over_limit:
            return ACTION_RECOVERY
        previous_alert = prev.last_alert_at
        cooldown_over = previous_alert is None or (now - previous_alert) >= realert_s
        return ACTION_REALERT if cooldown_over else ACTION_NONE

    # Not alerting yet: only an upward crossing produces a message.
    if over_limit:
        return ACTION_ALERT
    return ACTION_NONE
|
|
|
|
|
|
def _measure_one(path: str) -> dict | None:
    """Take one fill measurement of ``path`` with ``shutil.disk_usage`` (D1).

    Returns a dict with ``path``, ``total_bytes``, ``used_bytes``,
    ``free_bytes``, ``used_pct``, ``free_pct`` (both rounded to one decimal)
    and ``free_gb`` (one decimal). A reported total of zero gives 0.0 for both
    percentages instead of dividing by zero.

    Any exception (missing path, permission problem, other OS error) is logged
    as a warning and turned into ``None`` so the caller can skip just this path
    and carry on with the others (FR-2, AC-6). Never raises.
    """
    try:
        stats = shutil.disk_usage(path)
        capacity = int(stats.total)
        occupied = int(stats.used)
        available = int(stats.free)

        if capacity > 0:
            used_pct = round(occupied / capacity * 100, 1)
            free_pct = round(available / capacity * 100, 1)
        else:
            # Degenerate filesystem report: avoid ZeroDivisionError.
            used_pct = 0.0
            free_pct = 0.0

        measurement = {
            "path": path,
            "total_bytes": capacity,
            "used_bytes": occupied,
            "free_bytes": available,
            "used_pct": used_pct,
            "free_pct": free_pct,
            "free_gb": round(available / _BYTES_PER_GB, 1),
        }
        return measurement
    except Exception as e:  # noqa: BLE001 - skip this path, keep the tick alive
        logger.warning("disk-watchdog: cannot measure path %s, skipping: %s", path, e)
        return None
|
|
|
|
|
|
def _dedup_key(path: str) -> object:
    """Return the key used to collapse paths that live on the same device (D2).

    The key is the ``st_dev`` reported by ``os.stat(path)``, so two paths with
    an equal device id compare equal. If ``os.stat`` fails for any reason the
    path string itself becomes the key (fail-open: the path is then treated as
    its own device). Never raises.
    """
    try:
        device_id = os.stat(path).st_dev
    except Exception:  # noqa: BLE001 - fail-open, treat as a distinct device
        return path
    return device_id
|
|
|
|
|
|
def measure_paths(paths: list[str]) -> list[dict]:
    """Measure ``paths`` in order, keeping one measurement per device (D1/D2).

    For every path the dedup key is resolved first; a path whose key has
    already produced a measurement is skipped. A path that cannot be measured
    is skipped too (AC-6) and does NOT reserve its key, so a later path with
    the same key still gets a chance. Each returned measurement carries its key
    under ``"dedup_key"``, which callers use to keep per-device state.
    """
    results: list[dict] = []
    devices_done: set[object] = set()

    for candidate in paths:
        device = _dedup_key(candidate)
        if device not in devices_done:
            reading = _measure_one(candidate)
            if reading is not None:
                devices_done.add(device)
                reading["dedup_key"] = device
                results.append(reading)

    return results
|
|
|
|
|
|
def format_alert_message(m: dict, threshold: float, host: str) -> str:
    """Build the Telegram alert text for a measurement at/above the threshold.

    The message (FR-3/AC-2) names the host and path, the used percentage, the
    configured threshold, and the free space in GB and percent, followed by a
    call to action. ``m`` must provide ``path``, ``used_pct``, ``free_gb`` and
    ``free_pct``.
    """
    headline = f"\U0001f534 Диск {host}: {m['path']} заполнен на {m['used_pct']}% "
    limits = f"(порог {threshold}%). Свободно {m['free_gb']} ГБ ({m['free_pct']}%). "
    advice = "Освободите место — риск остановки конвейера всех проектов."
    return headline + limits + advice
|
|
|
|
|
|
def format_recovery_message(m: dict, host: str) -> str:
    """Build the Telegram text sent when fill drops back below the threshold.

    The message (FR-4/AC-4) names the host and path and reports the current
    used percentage and free space in GB. ``m`` must provide ``path``,
    ``used_pct`` and ``free_gb``.
    """
    lead = f"\U0001f7e2 Диск {host}: {m['path']} вернулся ниже порога — "
    figures = f"{m['used_pct']}% (свободно {m['free_gb']} ГБ)."
    return lead + figures
|
|
|
|
|
|
class DiskWatchdog:
|
|
"""Background daemon measuring host-FS fill and alerting on >= threshold.
|
|
|
|
Modelled on ``Reconciler`` / ``JobReaper``: a ``threading.Thread(daemon=True)``
|
|
+ a ``threading.Event`` for a clean stop. The only in-memory state is the
|
|
best-effort anti-spam map (``_states``), the last-measurement snapshot
|
|
(``_last``) and ``last_run_ts`` — all reset on restart, which is safe (D3).
|
|
|
|
``now_provider`` is injectable so the cooldown / recovery logic is testable
|
|
deterministically without a real timer (AC-3).
|
|
"""
|
|
|
|
def __init__(self, interval_s: float | None = None, now_provider=None):
|
|
self.interval_s = (
|
|
interval_s if interval_s is not None else settings.disk_monitor_interval_s
|
|
)
|
|
self._now = now_provider or time.time
|
|
self._stop = threading.Event()
|
|
self._thread: threading.Thread | None = None
|
|
self._host = _resolve_host()
|
|
# Best-effort in-memory state, per dedup_key (device/path).
|
|
self._states: dict[object, PathAlertState] = {}
|
|
self._last: dict[object, dict] = {}
|
|
self.last_run_ts: float | None = None
|
|
|
|
# -- config helpers ----------------------------------------------------
|
|
@property
|
|
def _threshold(self) -> int:
|
|
return settings.disk_monitor_threshold_pct
|
|
|
|
@property
|
|
def _realert_s(self) -> int:
|
|
return settings.disk_monitor_realert_s
|
|
|
|
def _paths(self) -> list[str]:
|
|
return parse_paths(settings.disk_monitor_paths)
|
|
|
|
# -- tick --------------------------------------------------------------
|
|
def tick(self) -> None:
|
|
"""One measurement pass over all monitored paths (never-raise per send).
|
|
|
|
Measures every (deduplicated) path, runs the pure ``decide_action`` per
|
|
device and dispatches the resulting alert / re-alert / recovery via
|
|
``send_telegram`` (notifying). Telegram failures are logged and swallowed
|
|
(best-effort delivery, AC-6).
|
|
"""
|
|
threshold = self._threshold
|
|
realert_s = self._realert_s
|
|
now = self._now()
|
|
for m in measure_paths(self._paths()):
|
|
key = m["dedup_key"]
|
|
prev = self._states.get(key) or PathAlertState()
|
|
action = decide_action(m["used_pct"], threshold, prev, now, realert_s)
|
|
if action in (ACTION_ALERT, ACTION_REALERT):
|
|
self._send(format_alert_message(m, threshold, self._host), notifying=True)
|
|
self._states[key] = PathAlertState(alerting=True, last_alert_at=now)
|
|
elif action == ACTION_RECOVERY:
|
|
self._send(format_recovery_message(m, self._host), notifying=True)
|
|
self._states[key] = PathAlertState(alerting=False, last_alert_at=None)
|
|
# ACTION_NONE: leave prev state untouched (anti-spam / normal).
|
|
# Record the snapshot for /queue observability.
|
|
cur = self._states.get(key) or prev
|
|
self._last[key] = {
|
|
"path": m["path"],
|
|
"used_pct": m["used_pct"],
|
|
"free_gb": m["free_gb"],
|
|
"free_pct": m["free_pct"],
|
|
"alerting": cur.alerting,
|
|
"last_alert_at": cur.last_alert_at,
|
|
}
|
|
|
|
def _send(self, text: str, notifying: bool) -> None:
|
|
"""Send a Telegram alert (notifying, not silent). Never raises (AC-6)."""
|
|
try:
|
|
send_telegram(text, disable_notification=not notifying)
|
|
except Exception as e: # noqa: BLE001 - delivery is best-effort
|
|
logger.warning("disk-watchdog: telegram send failed: %s", e)
|
|
|
|
# -- loop / lifecycle --------------------------------------------------
|
|
def _tick(self) -> None:
|
|
try:
|
|
self.tick()
|
|
finally:
|
|
self.last_run_ts = datetime.now(timezone.utc).timestamp()
|
|
|
|
def _run(self) -> None:
|
|
logger.info(
|
|
"DiskWatchdog started (interval=%ss, threshold=%s%%, realert=%ss, "
|
|
"paths=%s, enabled=%s)",
|
|
self.interval_s, self._threshold, self._realert_s,
|
|
self._paths(), settings.disk_monitor_enabled,
|
|
)
|
|
while not self._stop.is_set():
|
|
try:
|
|
self._tick()
|
|
except Exception as e: # noqa: BLE001 - outer never-raise
|
|
logger.error("DiskWatchdog loop error: %s", e)
|
|
self._stop.wait(self.interval_s)
|
|
logger.info("DiskWatchdog stopped")
|
|
|
|
def start(self) -> None:
|
|
"""Start the daemon thread (idempotent: a live thread is a no-op).
|
|
|
|
Honours the kill-switch: ``disk_monitor_enabled=False`` -> no-op (the
|
|
daemon never starts; ``main.lifespan`` also guards, AC-5/TC-09).
|
|
"""
|
|
if not settings.disk_monitor_enabled:
|
|
return
|
|
if self._thread and self._thread.is_alive():
|
|
return
|
|
self._stop.clear()
|
|
self._thread = threading.Thread(
|
|
target=self._run, name="disk-watchdog", daemon=True
|
|
)
|
|
self._thread.start()
|
|
|
|
def stop(self, timeout: float = 5.0) -> None:
|
|
self._stop.set()
|
|
if self._thread:
|
|
self._thread.join(timeout=timeout)
|
|
|
|
def status(self) -> dict:
|
|
"""Disk-monitor snapshot for /queue observability (FR-6/AC-7). Never raises."""
|
|
try:
|
|
return {
|
|
"enabled": settings.disk_monitor_enabled,
|
|
"threshold_pct": self._threshold,
|
|
"interval_s": self.interval_s,
|
|
"realert_s": self._realert_s,
|
|
"last_run_ts": self.last_run_ts,
|
|
"paths": list(self._last.values()),
|
|
}
|
|
except Exception as e: # noqa: BLE001 - observability must never raise
|
|
logger.warning("disk-watchdog: status() failed: %s", e)
|
|
return {"enabled": settings.disk_monitor_enabled}
|
|
|
|
|
|
# Module-level singleton used by the FastAPI lifespan. It is constructed at
# import time with default arguments, so the tick interval comes from
# ``settings.disk_monitor_interval_s``; constructing it does not start the
# thread — ``start()`` must be called for that.
disk_watchdog = DiskWatchdog()
|