Add the `watchdog/` package (thin Python-3.12 stdlib-only daemon) and the `orchestrator-watchdog` compose service — the brain half of the domain-0 observability pair. F1a (ORCH-099) exposes GET /metrics raw signal; F1b reads it, augments with host / container / dependency probes, runs each signal through a generalised pure decision function (decide(signal_active, prev, now, cooldown), a strict superset of disk_watchdog.decide_action) with per-signal in-memory dedup/throttle/recovery, and alerts over its OWN independent Telegram channel. Key properties (ADR-001): - Observer separated from observed: separate container; /metrics not answering is itself the master `orch_down` alarm (debounced K ticks — no flap on a hiccup). - Strictly read-only: docker.sock GET-only + mounted :ro (double guard), host paths :ro, no DB/disk writes, no process control — self-hosting-safe. - never-raise on three levels (per-source/per-tick/per-send) + WATCHDOG_ENABLED kill-switch (disabled -> inert idle-loop, not exit). - Disk anti-duplicate (D6): disk_watchdog (ORCH-063) stays sole owner of the 85% alert; sidecar carries orch_down + an opt-in 97% ceiling (default off). - NO import from src/** (C-1); src/**, STAGE_TRANSITIONS, QG_CHECKS, check_*, DB schema — untouched. env_file optional so a missing .env.watchdog never breaks `docker compose up` for the prod orchestrator. Tests: tests/watchdog/ (TC-01…TC-13) + full tests/ regression green (TC-14). Docs: CHANGELOG, .env.example canon (WATCHDOG_*); architecture README + adr-0033 authored at the architecture stage. Refs: ORCH-100 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
119 lines
4.5 KiB
Python
119 lines
4.5 KiB
Python
"""Collector: orchestrator ``GET /metrics`` -> parsed envelope | orchestrator_down.
|
|
|
|
The orchestrator runs ``network_mode: host`` on port 8500, so from the
|
|
host-network sidecar ``/metrics`` is reachable at ``http://127.0.0.1:8500/metrics``
|
|
(configurable). The body is the F1a versioned envelope
|
|
``{schema_version, generated_at, clk_tck, stages[], queue, agents[], cost,
|
|
enabled}`` (adr-0030 D2). Parsing is DEFENSIVE (D9): unknown keys are ignored,
|
|
a missing optional is not an error, a ``schema_version`` higher than known is
|
|
logged (warning) but read as the compatible subset — never a crash.
|
|
|
|
A timeout / connection-refused / 5xx / unreadable body is itself the master
|
|
alarm signal ``orchestrator_down`` (FR-3), surfaced by ``FetchResult.ok ==
|
|
False`` — NOT an exception (never-raise per-source, D8).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import urllib.error
|
|
import urllib.request
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
|
|
from .. import KNOWN_SCHEMA_VERSION
|
|
|
|
logger = logging.getLogger("watchdog.collectors.orch")
|
|
|
|
|
|
@dataclass
class FetchResult:
    """Result of a single ``/metrics`` probe.

    A probe is healthy (``ok is True``) only when the orchestrator answered
    with a 2xx status and a body that parsed into a JSON object. Every other
    outcome — timeout, connection refused, 5xx, unreadable body — is reported
    as ``ok is False`` together with a human-readable ``error``; that state is
    what feeds the ``orchestrator_down`` signal.
    """

    # True only for a 2xx response whose body parsed into a JSON object.
    ok: bool
    # The parsed envelope on success; left as None on failure.
    envelope: dict | None = None
    # Short human-readable failure reason; left as None on success.
    error: str | None = None
|
|
|
|
|
|
def parse_envelope(body: str | bytes) -> dict:
    """Decode a ``/metrics`` response body into a dict — tolerant (D9, TC-11).

    ``bytes`` input is decoded as UTF-8 with undecodable bytes replaced, so a
    stray byte surfaces as a JSON error rather than a ``UnicodeDecodeError``.

    Returns:
        The decoded JSON object. Unknown or missing keys are not validated
        here; readers are expected to use ``.get(...)`` with defaults.

    Raises:
        ValueError: the body is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass) or its top-level value is not an object.
            This is the "unreadable body" case the caller maps to
            ``orchestrator_down``.
    """
    text = (
        body.decode("utf-8", errors="replace")
        if isinstance(body, bytes)
        else body
    )
    parsed = json.loads(text)
    if isinstance(parsed, dict):
        return parsed
    raise ValueError("metrics body is not a JSON object")
|
|
|
|
|
|
def check_schema_version(envelope: dict) -> None:
    """Log a warning if ``/metrics`` advertises a newer contract (D9).

    Reads ``schema_version`` from the envelope and, when it is an integer
    greater than ``KNOWN_SCHEMA_VERSION``, logs a warning. A missing or
    non-integer version is silently accepted. Nothing is returned, and any
    unexpected failure during the check is logged rather than propagated.
    """
    try:
        advertised = envelope.get("schema_version")
        if not isinstance(advertised, int):
            return
        if advertised > KNOWN_SCHEMA_VERSION:
            logger.warning(
                "watchdog: /metrics schema_version=%s > known=%s; "
                "reading the compatible subset",
                advertised,
                KNOWN_SCHEMA_VERSION,
            )
    except Exception as exc:  # noqa: BLE001 - tolerance must never crash
        logger.warning("watchdog: schema_version check error: %s", exc)
|
|
|
|
|
|
def fetch_metrics(
    url: str,
    timeout_s: float,
    *,
    opener=urllib.request.urlopen,
) -> FetchResult:
    """Probe ``GET <url>`` and return a :class:`FetchResult`. never-raise (D8).

    Args:
        url: Full URL of the orchestrator ``/metrics`` endpoint.
        timeout_s: Timeout in seconds, passed to ``opener`` as ``timeout=``.
        opener: Callable with the ``urllib.request.urlopen`` calling
            convention, returning a context manager that exposes ``status``
            (or ``getcode()``) and ``read()``. Injected so tests can drive
            timeout / refused / 5xx / good-body without the network.

    Returns:
        ``FetchResult(ok=True, envelope=...)`` only when a 2xx response
        carried a body that parses into a JSON object. Every other outcome —
        a non-2xx status, an ``HTTPError``, a timeout, a refused connection,
        an unreadable body — yields ``FetchResult(ok=False, error=...)``.
        Failures derived from ``Exception`` are converted, never propagated.
    """
    try:
        with opener(url, timeout=timeout_s) as resp:
            status = int(getattr(resp, "status", None) or resp.getcode())
            raw = resp.read()
        # Only 2xx can carry a usable envelope. The stock urlopen already
        # raises HTTPError for anything else, but an injected or custom opener
        # may hand back 1xx/3xx/4xx/5xx responses directly — none of them may
        # be reported as healthy.
        if not 200 <= status < 300:
            return FetchResult(ok=False, error=f"http {status}")
        env = parse_envelope(raw)
        check_schema_version(env)
        return FetchResult(ok=True, envelope=env)
    except urllib.error.HTTPError as e:
        # Raised by the default opener for non-2xx responses.
        return FetchResult(ok=False, error=f"http {getattr(e, 'code', '?')}")
    except Exception as e:  # noqa: BLE001 - timeout / refused / unreadable -> down
        # Some exceptions stringify to "", so fall back to the class name to
        # keep the alert text non-empty.
        return FetchResult(ok=False, error=str(e) or e.__class__.__name__)
|
|
|
|
|
|
def parse_generated_at(envelope: dict) -> float | None:
    """Turn the envelope's ``generated_at`` stamp into epoch seconds.

    The stamp must be a non-empty string in the exact form
    ``YYYY-MM-DDTHH:MM:SSZ``; it is interpreted as UTC.

    Returns:
        The timestamp as float epoch seconds, or ``None`` when the field is
        missing, empty, not a string, or does not match the expected format.
        A format mismatch (or any other unexpected error) is logged as a
        warning; this function never raises, so the caller can simply skip
        the CPU-fraction computation for that tick.
    """
    try:
        stamp = envelope.get("generated_at")
        if stamp and isinstance(stamp, str):
            parsed = datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%SZ")
            return parsed.replace(tzinfo=timezone.utc).timestamp()
        return None
    except Exception as exc:  # noqa: BLE001 - tolerant parsing
        logger.warning("watchdog: cannot parse generated_at: %s", exc)
        return None
|