# orchestrator/watchdog/decision.py

"""Generalised pure alert-decision function + in-memory anti-spam state (D4).

``src/disk_watchdog.py::decide_action`` is hard-wired to ``used_pct >= threshold``.
F1b has many heterogeneous signals (booleans — "orch down", "container
unhealthy"; counters — "job-failed delta"; thresholds — "memory %", "agent hung N
min"), so the *comparison is lifted out* and this function works on an
already-computed boolean ``signal_active``. The set of outcomes, the cooldown /
recovery semantics and the in-memory best-effort state are a strict
generalisation of the disk variant (BRD §BR-9 names it the template).

``now`` and ``cooldown_s`` are injected so the cooldown / recovery logic is
testable deterministically without a real timer (TC-01…TC-04).
"""
from __future__ import annotations

from dataclasses import dataclass

# Decision outcomes — same vocabulary as ``disk_watchdog`` (1:1 semantics).
# ``decide`` returns exactly one of these four strings.
ACTION_NONE = "none"  # no action: signal quiet while not alerting, or active but still in cooldown
ACTION_ALERT = "alert"  # signal became active while the key was not alerting
ACTION_REALERT = "realert"  # signal still active and the cooldown has elapsed (or no alert time recorded)
ACTION_RECOVERY = "recovery"  # signal cleared while the key was alerting


@dataclass
class AlertState:
    """In-memory anti-spam state for one signal key (1:1 with ``PathAlertState``).

    Best-effort: lives only in the daemon (no DB row, no migration). After a
    process restart ``alerting`` resets to ``False`` -> a still-standing problem
    re-alerts once, which is safe (an early signal, not an SLA; FR-7).
    """

    alerting: bool = False
    last_alert_at: float | None = None


def decide(
    signal_active: bool,
    prev: AlertState,
    now: float,
    cooldown_s: float,
) -> str:
    """Choose the alert action for one signal key (pure; D4).

    The function only reads its arguments — it starts no thread, reads no real
    clock and does not modify ``prev`` — so every branch can be exercised
    deterministically by passing ``now`` and ``cooldown_s`` explicitly.

    Args:
        signal_active: Already-evaluated condition; true when the problem is
            currently present.
        prev: State recorded for this key before the current evaluation.
        now: Injected clock reading, in the same units as
            ``prev.last_alert_at``.
        cooldown_s: Minimum elapsed time between two alerts for a signal that
            stays active.

    Returns:
        One of the ``ACTION_*`` constants:

        ==============  =============  =================  ===============
        prev.alerting   signal_active  cooldown elapsed   result
        ==============  =============  =================  ===============
        no              yes            n/a                ACTION_ALERT
        no              no             n/a                ACTION_NONE
        yes             no             n/a                ACTION_RECOVERY
        yes             yes            yes                ACTION_REALERT
        yes             yes            no                 ACTION_NONE
        ==============  =============  =================  ===============
    """
    was_alerting = prev.alerting

    # Not alerting before: either the signal has just turned on, or all is quiet.
    if not was_alerting and signal_active:
        return ACTION_ALERT
    if not was_alerting:
        return ACTION_NONE

    # Alerting before, and the signal has cleared.
    if not signal_active:
        return ACTION_RECOVERY

    # Alerting before and still active: repeat only once the cooldown is over.
    # A missing timestamp is treated as "cooldown already over".
    previous_alert_at = prev.last_alert_at
    cooldown_elapsed = (
        previous_alert_at is None or (now - previous_alert_at) >= cooldown_s
    )
    return ACTION_REALERT if cooldown_elapsed else ACTION_NONE