feat(disk-watchdog): host-FS fill heartbeat + Telegram alert at >=85% (ORCH-063)
Adds src/disk_watchdog.py — a background daemon thread modelled on reconciler/job_reaper that measures host-FS fill via the mounted bind-paths (/repos, /app/data) with shutil.disk_usage and Telegram-alerts the operator at >= threshold (default 85%). This is the missing proactive signal: on 07.06.2026 the mva154 host disk silently hit 100% and stalled the whole self-hosting pipeline.

- Pure decide_action(used_pct, threshold, prev, now, realert_s): alert on crossing up, cooldown re-alert, single recovery below threshold (unit-tested without a thread/timer; clock injected).
- measure_paths: shutil.disk_usage per path, dedup by st_dev, per-path never-raise (a broken path never fails the tick).
- Config flags ORCH_DISK_MONITOR_* with defensive validation (a threshold outside 1..100 or a non-positive interval -> default + warning). Kill-switch -> daemon does not start.
- Additive disk_monitor block in GET /queue; start/stop in main.lifespan.
- never-raise (per-path/per-tick/per-send); STAGE_TRANSITIONS/QG_CHECKS/check_*/DB schema untouched, no migration (anti-spam state in-memory).

Tests: tests/test_disk_watchdog.py (TC-01..TC-12, 18 cases); full suite green (1296).
Docs: INFRA.md, .env.example, CHANGELOG.md (architecture/README.md + ADRs authored at architecture stage).

Refs: ORCH-063
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
import logging
|
||||
|
||||
from pydantic import field_validator
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
@@ -381,6 +383,68 @@ class Settings(BaseSettings):
|
||||
reaper_finalize_grace_s: int = 300
|
||||
lease_reclaim_enabled: bool = True
|
||||
|
||||
# ORCH-063: disk-watchdog — background heartbeat that measures host-FS fill via
|
||||
# the mounted bind-paths and Telegram-alerts the operator at >= threshold. On
|
||||
# 07.06.2026 the mva154 host disk silently hit 100% and stalled the WHOLE
|
||||
# self-hosting pipeline; the watchdog is the missing proactive signal. Modelled
|
||||
# on reconciler/job_reaper (daemon thread, start/stop in main.lifespan, /queue
|
||||
# snapshot, never-raise). Anti-spam state is in-memory (no DB migration).
|
||||
# disk_monitor_enabled -> kill-switch; False -> the daemon does not start
|
||||
# (zero regression), env ORCH_DISK_MONITOR_ENABLED.
|
||||
# disk_monitor_interval_s -> heartbeat measurement period, seconds (order of
|
||||
# minutes; cheap shutil.disk_usage, no df subprocess).
|
||||
# disk_monitor_threshold_pct -> fill % that triggers the alert (Owner-fixed 85).
|
||||
# disk_monitor_realert_s -> min interval between repeat alerts while still
|
||||
# above threshold (anti-spam cooldown, ~6h).
|
||||
# disk_monitor_paths -> CSV of monitored HOST bind-paths (NOT overlay /);
|
||||
# empty -> the default set (/repos, /app/data).
|
||||
# Defensive validation (ADR-001 D7): threshold out of 1..100 or a non-positive
|
||||
# interval -> default + warning (the process never crashes on a bad env value).
|
||||
disk_monitor_enabled: bool = True
|
||||
disk_monitor_interval_s: int = 300
|
||||
disk_monitor_threshold_pct: int = 85
|
||||
disk_monitor_realert_s: int = 21600
|
||||
disk_monitor_paths: str = "/repos,/app/data"
|
||||
|
||||
@field_validator(
    "disk_monitor_interval_s", "disk_monitor_realert_s", mode="before"
)
@classmethod
def _disk_positive_int(cls, v, info):
    """Coerce a disk-monitor interval to a positive int, never raising.

    Runs in ``mode="before"``, i.e. on the raw input value.

    Args:
        v: Raw value for the field (any type).
        info: Validation info; ``info.field_name`` selects the fallback.

    Returns:
        ``int(v)`` when it is > 0. Otherwise the per-field default
        (300 for ``disk_monitor_interval_s``, 21600 for
        ``disk_monitor_realert_s``, 1 for any other field name).

    A ``None`` or blank value falls back silently; a non-numeric or
    non-positive value falls back with a warning on the
    ``orchestrator.config`` logger.
    """
    _defaults = {"disk_monitor_interval_s": 300, "disk_monitor_realert_s": 21600}
    fallback = _defaults.get(info.field_name, 1)

    # None / blank means "not configured": use the default without noise.
    if v is None or (isinstance(v, str) and v.strip() == ""):
        return fallback

    log = logging.getLogger("orchestrator.config")
    try:
        iv = int(v)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")). A bad value must never crash
        # settings loading, but the operator should still hear about it.
        log.warning(
            "%s must be a positive integer, got %r; falling back to %s",
            info.field_name, v, fallback,
        )
        return fallback

    if iv <= 0:
        log.warning(
            "%s must be > 0, got %s; falling back to %s",
            info.field_name, v, fallback,
        )
        return fallback
    return iv
|
||||
|
||||
@field_validator("disk_monitor_threshold_pct", mode="before")
@classmethod
def _disk_threshold_pct(cls, v):
    """Coerce the alert threshold to an int percentage in 1..100, never raising.

    Runs in ``mode="before"``, i.e. on the raw input value.

    Args:
        v: Raw value for ``disk_monitor_threshold_pct`` (any type).

    Returns:
        ``int(v)`` when it lies in 1..100, otherwise the default 85.

    A ``None`` or blank value falls back silently; a non-numeric or
    out-of-range value falls back with a warning on the
    ``orchestrator.config`` logger.
    """
    # None / blank means "not configured": use the default without noise.
    if v is None or (isinstance(v, str) and v.strip() == ""):
        return 85

    try:
        iv = int(v)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")). Treat any unparseable value
        # like an out-of-range one so it is reported below.
        iv = None

    if iv is not None and 1 <= iv <= 100:
        return iv

    logging.getLogger("orchestrator.config").warning(
        "disk_monitor_threshold_pct must be 1..100, got %s; using 85", v
    )
    return 85
|
||||
|
||||
# ORCH-071: merge-verify under-gate on the `deploy -> done` edge. For the
|
||||
# self-hosting repo the `deploy` stage runs the DETERMINISTIC self-deploy path
|
||||
# (Phase A/B/C), where the LLM `deployer` agent — historically the ONLY actor
|
||||
|
||||
Reference in New Issue
Block a user