Files
orchestrator/watchdog/collectors/orch.py
claude-bot 259b507906 feat(watchdog): sidecar-watchdog F1b — monitoring brain in a separate container (ORCH-100)
Add the `watchdog/` package (thin Python-3.12 stdlib-only daemon) and the
`orchestrator-watchdog` compose service — the brain half of the domain-0
observability pair. F1a (ORCH-099) exposes GET /metrics raw signal; F1b reads it,
augments with host / container / dependency probes, runs each signal through a
generalised pure decision function (decide(signal_active, prev, now, cooldown),
a strict superset of disk_watchdog.decide_action) with per-signal in-memory
dedup/throttle/recovery, and alerts over its OWN independent Telegram channel.

Key properties (ADR-001):
- Observer separated from observed: separate container; /metrics not answering is
  itself the master `orch_down` alarm (debounced K ticks — no flap on a hiccup).
- Strictly read-only: docker.sock GET-only + mounted :ro (double guard), host
  paths :ro, no DB/disk writes, no process control — self-hosting-safe.
- never-raise on three levels (per-source/per-tick/per-send) + WATCHDOG_ENABLED
  kill-switch (disabled -> inert idle-loop, not exit).
- Disk anti-duplicate (D6): disk_watchdog (ORCH-063) stays sole owner of the 85%
  alert; sidecar carries orch_down + an opt-in 97% ceiling (default off).
- NO import from src/** (C-1); src/**, STAGE_TRANSITIONS, QG_CHECKS, check_*, DB
  schema — untouched. env_file optional so a missing .env.watchdog never breaks
  `docker compose up` for the prod orchestrator.

Tests: tests/watchdog/ (TC-01…TC-13) + full tests/ regression green (TC-14).
Docs: CHANGELOG, .env.example canon (WATCHDOG_*); architecture README + adr-0033
authored at the architecture stage.

Refs: ORCH-100

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-10 09:36:02 +03:00

119 lines
4.5 KiB
Python

"""Collector: orchestrator ``GET /metrics`` -> parsed envelope | orchestrator_down.
The orchestrator runs ``network_mode: host`` on port 8500, so from the
host-network sidecar ``/metrics`` is reachable at ``http://127.0.0.1:8500/metrics``
(configurable). The body is the F1a versioned envelope
``{schema_version, generated_at, clk_tck, stages[], queue, agents[], cost,
enabled}`` (adr-0030 D2). Parsing is DEFENSIVE (D9): unknown keys are ignored,
a missing optional is not an error, a ``schema_version`` higher than known is
logged (warning) but read as the compatible subset — never a crash.
A timeout / connection-refused / 5xx / unreadable body is itself the master
alarm signal ``orchestrator_down`` (FR-3), surfaced by ``FetchResult.ok ==
False`` — NOT an exception (never-raise per-source, D8).
"""
from __future__ import annotations
import json
import logging
import urllib.error
import urllib.request
from dataclasses import dataclass
from datetime import datetime, timezone
from .. import KNOWN_SCHEMA_VERSION
# Module-level logger shared by every helper in this collector; the name is a
# fixed string rather than ``__name__``.
logger = logging.getLogger("watchdog.collectors.orch")
@dataclass
class FetchResult:
    """Result of a single ``/metrics`` probe.

    A probe succeeds (``ok is True``) only when a 2xx response carried a body
    that parses as a JSON object. Every other outcome (timeout, connection
    refused, 5xx, unreadable body) is reported as ``ok == False`` together
    with a human-readable ``error``; that failure is what feeds the
    ``orchestrator_down`` signal.
    """

    # True when the probe produced a usable envelope, False otherwise.
    ok: bool
    # Parsed ``/metrics`` JSON object; stays ``None`` when the probe failed.
    envelope: dict | None = None
    # Short human-readable failure reason; stays ``None`` on success.
    error: str | None = None
def parse_envelope(body: str | bytes) -> dict:
    """Decode a ``/metrics`` response body into a dict, tolerantly (D9, TC-11).

    ``bytes`` input is decoded as UTF-8 with undecodable bytes replaced, so a
    stray bad byte never aborts the decode step. The text is then parsed as
    JSON.

    Returns:
        The parsed JSON object. Unknown or missing keys are not validated
        here; downstream readers use ``.get(...)`` with defaults.

    Raises:
        ValueError: the text is not valid JSON, or the top-level JSON value is
            not an object. The caller maps this "unreadable body" case to
            ``orchestrator_down``.
    """
    text = body.decode("utf-8", errors="replace") if isinstance(body, bytes) else body
    parsed = json.loads(text)
    if isinstance(parsed, dict):
        return parsed
    raise ValueError("metrics body is not a JSON object")
def check_schema_version(envelope: dict) -> None:
    """Log a warning when ``schema_version`` exceeds the known contract (D9).

    Purely advisory: nothing is returned and nothing is raised. An integer
    ``schema_version`` greater than ``KNOWN_SCHEMA_VERSION`` produces one
    warning; a missing or non-integer value is silently accepted. Any error
    while inspecting the envelope is itself logged as a warning instead of
    propagating.
    """
    try:
        advertised = envelope.get("schema_version")
        is_newer = isinstance(advertised, int) and advertised > KNOWN_SCHEMA_VERSION
        if is_newer:
            logger.warning(
                "watchdog: /metrics schema_version=%s > known=%s; reading the "
                "compatible subset",
                advertised,
                KNOWN_SCHEMA_VERSION,
            )
    except Exception as exc:  # noqa: BLE001 - tolerance must never crash
        logger.warning("watchdog: schema_version check error: %s", exc)
def fetch_metrics(
    url: str,
    timeout_s: float,
    *,
    opener=urllib.request.urlopen,
) -> FetchResult:
    """Probe ``GET <url>`` and return a :class:`FetchResult`. never-raise (D8).

    Args:
        url: Full URL of the orchestrator ``/metrics`` endpoint.
        timeout_s: Timeout in seconds, passed to ``opener`` as ``timeout=``.
        opener: Callable with the ``urllib.request.urlopen`` calling
            convention, returning a context manager that exposes ``status``
            (or ``getcode()``) and ``read()``. It is injected so tests can
            drive timeout / refused / 5xx / good-body without the network.

    Returns:
        ``FetchResult(ok=True, envelope=...)`` only when the status is 2xx and
        the body parses as a JSON object. Any other status, an ``HTTPError``,
        a timeout, a refused connection, or an unreadable body yields
        ``FetchResult(ok=False, error=...)``; no exception derived from
        ``Exception`` escapes this function.
    """
    try:
        with opener(url, timeout=timeout_s) as resp:
            status = int(getattr(resp, "status", None) or resp.getcode())
            raw = resp.read()
        # Only a 2xx answer counts as a usable envelope. 4xx ("reachable but
        # refusing") and 5xx are down, and so is any other non-2xx status that
        # an opener hands back without raising (e.g. an unfollowed 3xx).
        if not 200 <= status < 300:
            return FetchResult(ok=False, error=f"http {status}")
        env = parse_envelope(raw)
        check_schema_version(env)
        return FetchResult(ok=True, envelope=env)
    except urllib.error.HTTPError as e:
        # HTTPError doubles as a response object; close it so the underlying
        # connection is released now rather than at garbage collection.
        try:
            e.close()
        except Exception:  # noqa: BLE001 - cleanup must never break the probe
            pass
        return FetchResult(ok=False, error=f"http {getattr(e, 'code', '?')}")
    except Exception as e:  # noqa: BLE001 - timeout / refused / unreadable -> down
        # Some exceptions stringify to "", so fall back to the class name to
        # keep the error text non-empty.
        return FetchResult(ok=False, error=str(e) or e.__class__.__name__)
def parse_generated_at(envelope: dict) -> float | None:
    """Convert the envelope ``generated_at`` ISO-8601 timestamp to epoch seconds.

    The canonical form ``YYYY-MM-DDTHH:MM:SSZ`` is tried first. When it does
    not match, the value is parsed as general ISO-8601 so that fractional
    seconds (``...02.500Z``) and explicit UTC offsets (``...+03:00``) are
    accepted as well. A timestamp with no timezone designator at all is still
    rejected, because its offset cannot be known.

    Returns:
        Epoch seconds as a float, or ``None`` when the timestamp is missing,
        not a string, or malformed. Never raises; a malformed value is logged
        as a warning and the caller then skips the CPU-fraction computation
        for that tick.
    """
    try:
        raw = envelope.get("generated_at")
        if not raw or not isinstance(raw, str):
            return None
        try:
            # Canonical second-precision "Z" form: the result is naive, so it
            # is pinned to UTC explicitly.
            dt = datetime.strptime(raw, "%Y-%m-%dT%H:%M:%SZ").replace(
                tzinfo=timezone.utc
            )
        except ValueError:
            # Older ``fromisoformat`` versions do not understand a trailing
            # "Z", so spell it as an explicit zero offset.
            iso = raw[:-1] + "+00:00" if raw.endswith(("Z", "z")) else raw
            dt = datetime.fromisoformat(iso)
            if dt.tzinfo is None:
                raise ValueError(f"generated_at has no timezone: {raw!r}")
        return dt.timestamp()
    except Exception as e:  # noqa: BLE001 - tolerant parsing
        logger.warning("watchdog: cannot parse generated_at: %s", e)
        return None