Add the `watchdog/` package (thin Python-3.12 stdlib-only daemon) and the
`orchestrator-watchdog` compose service — the brain half of the domain-0
observability pair.

F1a (ORCH-099) exposes GET /metrics raw signal; F1b reads it, augments with
host / container / dependency probes, runs each signal through a generalised
pure decision function (decide(signal_active, prev, now, cooldown), a strict
superset of disk_watchdog.decide_action) with per-signal in-memory
dedup/throttle/recovery, and alerts over its OWN independent Telegram channel.

Key properties (ADR-001):
- Observer separated from observed: separate container; /metrics not answering
  is itself the master `orch_down` alarm (debounced K ticks — no flap on a
  hiccup).
- Strictly read-only: docker.sock GET-only + mounted :ro (double guard), host
  paths :ro, no DB/disk writes, no process control — self-hosting-safe.
- never-raise on three levels (per-source/per-tick/per-send) + WATCHDOG_ENABLED
  kill-switch (disabled -> inert idle-loop, not exit).
- Disk anti-duplicate (D6): disk_watchdog (ORCH-063) stays sole owner of the
  85% alert; sidecar carries orch_down + an opt-in 97% ceiling (default off).
- NO import from src/** (C-1); src/**, STAGE_TRANSITIONS, QG_CHECKS, check_*,
  DB schema — untouched.

env_file optional so a missing .env.watchdog never breaks `docker compose up`
for the prod orchestrator.

Tests: tests/watchdog/ (TC-01…TC-13) + full tests/ regression green (TC-14).
Docs: CHANGELOG, .env.example canon (WATCHDOG_*); architecture README +
adr-0033 authored at the architecture stage.

Refs: ORCH-100
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
57 lines
2.1 KiB
Python
57 lines
2.1 KiB
Python
"""TC-01…TC-04: the pure decision function (alert/throttle/realert/recovery).
|
|
|
|
Mirrors the disk_watchdog.decide_action tests — the generalised ``decide`` is a
strict superset (boolean ``signal_active`` instead of ``used_pct >= threshold``).
"""
|
|
from watchdog.decision import (
|
|
ACTION_ALERT,
|
|
ACTION_NONE,
|
|
ACTION_REALERT,
|
|
ACTION_RECOVERY,
|
|
AlertState,
|
|
decide,
|
|
)
|
|
|
|
# Cooldown handed to every decide() call in this module as ``cooldown_s``.
COOLDOWN = 30 * 60.0
|
|
|
|
|
|
def test_tc01_not_alerting_active_alerts():
    """TC-01: not alerting and the signal is active -> ALERT (one per crossing)."""
    idle_state = AlertState(alerting=False)
    action = decide(True, idle_state, now=100.0, cooldown_s=COOLDOWN)
    assert action == ACTION_ALERT
|
|
|
|
|
|
def test_tc01_not_alerting_inactive_is_none():
    """Not alerting and the signal is inactive -> NONE."""
    idle_state = AlertState(alerting=False)
    action = decide(False, idle_state, now=100.0, cooldown_s=COOLDOWN)
    assert action == ACTION_NONE
|
|
|
|
|
|
def test_tc02_alerting_active_in_cooldown_is_none():
    """TC-02: alerting, still active, cooldown NOT elapsed -> NONE (anti-spam)."""
    alerted_at = 1000.0
    firing_state = AlertState(alerting=True, last_alert_at=alerted_at)
    # Only 10.0 past the last alert, far short of COOLDOWN.
    action = decide(True, firing_state, now=alerted_at + 10.0, cooldown_s=COOLDOWN)
    assert action == ACTION_NONE
|
|
|
|
|
|
def test_tc03_alerting_active_cooldown_elapsed_realerts():
    """TC-03: alerting, still active, cooldown elapsed -> REALERT."""
    alerted_at = 1000.0
    firing_state = AlertState(alerting=True, last_alert_at=alerted_at)
    action = decide(True, firing_state, now=alerted_at + COOLDOWN, cooldown_s=COOLDOWN)
    assert action == ACTION_REALERT
|
|
|
|
|
|
def test_tc03_alerting_active_no_last_alert_realerts():
    """Defensive case: alerting with no last_alert_at -> cooldown treated as elapsed."""
    untimed_state = AlertState(alerting=True, last_alert_at=None)
    action = decide(True, untimed_state, now=5.0, cooldown_s=COOLDOWN)
    assert action == ACTION_REALERT
|
|
|
|
|
|
def test_tc04_alerting_recovers_when_inactive():
    """TC-04: alerting and the signal is back to normal -> RECOVERY."""
    firing_state = AlertState(alerting=True, last_alert_at=1000.0)
    action = decide(False, firing_state, now=1200.0, cooldown_s=COOLDOWN)
    assert action == ACTION_RECOVERY
|
|
|
|
|
|
def test_cooldown_boundary_is_inclusive():
    """Exactly at the cooldown boundary -> REALERT (>= semantics, like disk_watchdog)."""
    firing_state = AlertState(alerting=True, last_alert_at=0.0)
    # now - last_alert_at equals COOLDOWN exactly.
    action = decide(True, firing_state, now=COOLDOWN, cooldown_s=COOLDOWN)
    assert action == ACTION_REALERT
|