Add the `watchdog/` package (thin Python-3.12 stdlib-only daemon) and the `orchestrator-watchdog` compose service — the brain half of the domain-0 observability pair. F1a (ORCH-099) exposes GET /metrics raw signal; F1b reads it, augments with host / container / dependency probes, runs each signal through a generalised pure decision function (decide(signal_active, prev, now, cooldown), a strict superset of disk_watchdog.decide_action) with per-signal in-memory dedup/throttle/recovery, and alerts over its OWN independent Telegram channel. Key properties (ADR-001): - Observer separated from observed: separate container; /metrics not answering is itself the master `orch_down` alarm (debounced K ticks — no flap on a hiccup). - Strictly read-only: docker.sock GET-only + mounted :ro (double guard), host paths :ro, no DB/disk writes, no process control — self-hosting-safe. - never-raise on three levels (per-source/per-tick/per-send) + WATCHDOG_ENABLED kill-switch (disabled -> inert idle-loop, not exit). - Disk anti-duplicate (D6): disk_watchdog (ORCH-063) stays sole owner of the 85% alert; sidecar carries orch_down + an opt-in 97% ceiling (default off). - NO import from src/** (C-1); src/**, STAGE_TRANSITIONS, QG_CHECKS, check_*, DB schema — untouched. env_file optional so a missing .env.watchdog never breaks `docker compose up` for the prod orchestrator. Tests: tests/watchdog/ (TC-01…TC-13) + full tests/ regression green (TC-14). Docs: CHANGELOG, .env.example canon (WATCHDOG_*); architecture README + adr-0033 authored at the architecture stage. Refs: ORCH-100 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
68 lines
2.3 KiB
Python
68 lines
2.3 KiB
Python
"""TC-05: orchestrator-down detection.
|
|
|
|
A ``/metrics`` timeout / connection-refused / 5xx / unreadable body raises the
``orchestrator_down`` signal (key ``orch_down``), which leads to the ALERT
"орк не отвечает" ("the orchestrator is not responding") once the debounce
threshold of consecutive failures is reached (FR-3).
|
|
"""
|
|
from watchdog.collectors import orch as orch_mod
|
|
from watchdog.config import Config
|
|
from watchdog.signals import orch_down_signal
|
|
|
|
from .conftest import http_error, make_opener
|
|
|
|
|
|
def _cfg(**kw):
    """Return a ``Config`` built from a small test environment mapping.

    The mapping starts with ``WATCHDOG_ORCH_DOWN_TICKS`` set to ``"3"``; every
    keyword argument is layered on top, so a caller-supplied value wins over
    that default when the keys clash.
    """
    env = {"WATCHDOG_ORCH_DOWN_TICKS": "3"}
    env.update(kw)
    return Config.from_env(env)
|
|
|
|
|
|
def test_fetch_timeout_is_not_ok():
    """An opener built with a ``TimeoutError`` gives a failed, empty result."""
    result = orch_mod.fetch_metrics(
        "http://x/metrics",
        1.0,
        opener=make_opener(exc=TimeoutError("timed out")),
    )

    # A failed fetch carries no envelope but does carry a truthy error.
    assert result.ok is False
    assert result.envelope is None
    assert result.error
|
|
|
|
|
|
def test_fetch_connection_refused_is_not_ok():
    """An opener built with ``ConnectionRefusedError`` gives a failed result."""
    result = orch_mod.fetch_metrics(
        "http://x/metrics",
        1.0,
        opener=make_opener(exc=ConnectionRefusedError("refused")),
    )

    assert result.ok is False
|
|
|
|
|
|
def test_fetch_5xx_is_not_ok():
    """A 503 response is reported as failed and the error names the status."""
    result = orch_mod.fetch_metrics(
        "http://x/metrics",
        1.0,
        opener=make_opener(status=503, body=b"oops"),
    )

    assert result.ok is False
    # ``error`` may be None; fall back to "" so the membership test is safe.
    assert "503" in (result.error or "")
|
|
|
|
|
|
def test_fetch_httperror_5xx_is_not_ok():
    """An opener built with ``http_error(502)`` gives a failed result."""
    result = orch_mod.fetch_metrics(
        "http://x/metrics",
        1.0,
        opener=make_opener(exc=http_error(502)),
    )

    assert result.ok is False
|
|
|
|
|
|
def test_fetch_unreadable_body_is_not_ok():
    """A 200 response whose body is not valid JSON is still a failed fetch."""
    result = orch_mod.fetch_metrics(
        "http://x/metrics",
        1.0,
        opener=make_opener(status=200, body=b"not-json{{{"),
    )

    assert result.ok is False
|
|
|
|
|
|
def test_fetch_good_body_is_ok():
    """A 200 response with a JSON body succeeds and exposes the envelope."""
    result = orch_mod.fetch_metrics(
        "http://x/metrics",
        1.0,
        opener=make_opener(
            status=200,
            body=b'{"schema_version":1,"stages":[]}',
        ),
    )

    assert result.ok is True
    assert result.envelope["schema_version"] == 1
|
|
|
|
|
|
def test_orch_down_signal_debounce_then_alert():
    """The orch-down signal stays inactive for counts 1-2 and fires at 3."""
    config = _cfg()

    # Counts below the configured tick threshold must not raise the alarm, so
    # a transient hiccup does not make it flap.
    for failures in (1, 2):
        assert orch_down_signal(failures, config, "timeout").active is False

    # The third consecutive failure reaches the threshold: the alarm is active.
    alarm = orch_down_signal(3, config, "timeout")
    assert alarm.active is True
    assert alarm.key == "orch_down"
    assert "не отвечает" in alarm.detail
|