# orchestrator/watchdog/__main__.py

"""Sidecar entrypoint: the tick loop with kill-switch + per-tick never-raise (D8).

Run as ``python -m watchdog`` (the container ``ENTRYPOINT``). The loop:
  * honours ``WATCHDOG_ENABLED=false`` -> stays INERT (idle-loops with a log line,
    does NOT ``exit``, so ``restart: unless-stopped`` does not spin a restart loop);
  * wraps every tick in an outer ``try/except`` so a tick error logs and the daemon
    survives (per-tick never-raise);
  * logs start / each tick so the container logs prove the sidecar is alive and why
    an alert did (not) fire (NFR-7).
"""
from __future__ import annotations

import logging
import time

from .config import Config
from .core import Watchdog

# Module-wide logger with the explicit name "watchdog". Note that ``__name__``
# is "__main__" when this file is executed via ``python -m watchdog``, so a
# literal name keeps the log records labelled with the package name.
logger = logging.getLogger("watchdog")


def _setup_logging() -> None:
    """Configure root logging at INFO with a timestamp/level/logger-name prefix.

    Delegates to ``logging.basicConfig``, which leaves the root logger untouched
    if it already has handlers installed.
    """
    line_format = "%(asctime)s %(levelname)s %(name)s: %(message)s"
    logging.basicConfig(format=line_format, level=logging.INFO)


def run(cfg: Config | None = None, max_ticks: int | None = None) -> None:
    """Run the watchdog tick loop until ``max_ticks`` is reached (or forever).

    Args:
        cfg: Configuration to run with. When ``None`` it is loaded via
            ``Config.from_env()``.
        max_ticks: Upper bound on loop iterations, intended for tests.
            ``None`` means loop forever.

    Behaviour:
        * ``cfg.enabled`` false -> inert mode: no ``Watchdog`` is created; the
          function only sleeps ``cfg.interval_s`` per iteration so the process
          stays alive instead of exiting.
        * otherwise -> each ``Watchdog.tick()`` runs inside a broad ``except``
          so a failing tick is logged (with its traceback) and the loop goes on.
    """
    # Explicit None check: do not depend on the truthiness of a Config object.
    if cfg is None:
        cfg = Config.from_env()

    if not cfg.enabled:
        logger.info("watchdog: WATCHDOG_ENABLED=false -> inert (idle, no ticks)")
        # Idle, not exit: keep the container up so restart-policy does not flap.
        idle_rounds = 0
        while max_ticks is None or idle_rounds < max_ticks:
            time.sleep(cfg.interval_s)
            idle_rounds += 1
        return

    logger.info(
        "watchdog started (interval=%ss, metrics=%s, containers=%s, deps=%s, "
        "mem_pct=%s, disk_crit=%s)",
        cfg.interval_s,
        cfg.metrics_url,
        cfg.containers,
        list(cfg.deps),
        cfg.mem_pct,
        cfg.disk_crit_enabled,
    )
    dog = Watchdog(cfg)
    ticks = 0
    while max_ticks is None or ticks < max_ticks:
        try:
            dispatched = dog.tick()
            # Keep only entries whose action is not the literal "none"; the
            # second element contributes its ``key`` attribute when it has one.
            fired = [
                (action, getattr(detail, "key", None))
                for action, detail in dispatched
                if action != "none"
            ]
            logger.info("watchdog tick ok (fired=%s)", fired)
        except Exception as e:  # noqa: BLE001 - per-tick outer never-raise (D8)
            # logger.exception logs at ERROR level like logger.error did, but
            # also records the traceback, which is the only place a swallowed
            # tick failure can be diagnosed from.
            logger.exception("watchdog tick error: %s", e)
        ticks += 1
        if max_ticks is not None and ticks >= max_ticks:
            # Bounded run finished: return without the trailing sleep.
            break
        time.sleep(cfg.interval_s)


def main() -> None:
    """Process entrypoint: configure logging, then start the tick loop.

    ``run()`` is called with its defaults, so configuration is loaded from the
    environment and the loop is unbounded.
    """
    # Logging must be configured first so that the start-up line emitted by
    # ``run`` is not lost.
    _setup_logging()
    run()


# Start the daemon only when this file is executed as a script
# (e.g. ``python -m watchdog``), not when it is imported.
if __name__ == "__main__":
    main()