"""ORCH-099 (FND/F1a): lightweight read-only ``/metrics`` raw-signal collector. A leaf module that builds a versioned JSON snapshot of the orchestrator's own raw state for the future observability sidecar (F1b, ``watchdog/``): active task stages, the job queue, agent-liveness, and cost/tokens. The orchestrator emits ONLY raw signal it alone knows — the sidecar is the stateful arbiter that computes thresholds / deltas / alerts (BRD §1, observer separated from observed). Design (ADR-001, by образцу ``serial_gate.snapshot()`` / ``cancel.snapshot()``): * pure, never-raise, no side effects — only reads existing tables (``tasks`` / ``jobs`` / ``agent_runs``) and the in-memory worker snapshot; * ``build_metrics()`` assembles the envelope section-by-section, each section in its own ``try/except`` with a safe default (``None`` / ``[]`` / ``{}``) so a failing source degrades one field, never the whole endpoint (FR-6, NFR-2); * strictly read-only — no INSERT/UPDATE/DELETE/CREATE/ALTER, no process control, no network. Self-hosting-safe on the shared prod instance. The endpoint ``GET /metrics`` (``src/main.py``) is a thin wrapper that returns ``build_metrics()`` as-is. """ from __future__ import annotations import logging import os from datetime import datetime, timezone logger = logging.getLogger("orchestrator.metrics") # Contract version for the sidecar (D2). Additive changes (new field/section) do # NOT bump it — the sidecar MUST ignore unknown keys and tolerate missing # optional ones. Bumped ONLY on a breaking change (rename/remove/retype an # existing field). SCHEMA_VERSION = 1 def _now_iso() -> str: """UTC ISO-8601 snapshot timestamp (``...Z``), the orchestrator's own clock. Same clock domain as the SQLite ``datetime('now')`` timestamps and the CPU tick reads, so the sidecar's ``(cpu_ticks, generated_at)`` deltas are immune to orchestrator↔sidecar clock skew (TR-3). Never raises. """ try: return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") except Exception as e: # noqa: BLE001 - never-raise logger.warning("metrics._now_iso error: %s", e) return "" def _clk_tck() -> int | None: """Process-global SC_CLK_TCK (ticks/second) — the basis for converting raw CPU ticks to seconds on the sidecar side. ``None`` on non-Linux / failure. """ try: return int(os.sysconf("SC_CLK_TCK")) except Exception as e: # noqa: BLE001 - never-raise (non-Linux / unsupported) logger.warning("metrics._clk_tck error: %s", e) return None def _read_cpu_ticks(pid: int | None) -> int | None: """Sum of utime+stime (CPU ticks) from ``/proc//stat`` — raw liveness signal. The orchestrator emits raw ticks and does NOT compute the delta — the sidecar is the stateless arbiter (it divides ``(ticks₂−ticks₁)/clk_tck`` by the ``generated_at`` delta to get a CPU fraction; a tiny fraction at a growing ``runtime_s`` ⇒ a "stuck" candidate). Parsing is robust to spaces in ``comm``: fields are read AFTER the closing ``") "`` of the process name (canonical proc-stat read). utime = field 14, stime = field 15 → indices 11 and 12 of the post-``)`` token list (fields 3.. shift by 3). never-raise (NFR-2, AC-6): ``pid is None`` / missing ``/proc/`` (process died or non-Linux) / any parse error → ``None`` (NOT an exception). The caller keeps every other field and the whole endpoint intact. """ if pid is None: return None try: with open(f"/proc/{int(pid)}/stat", "r") as f: data = f.read() rparen = data.rfind(") ") if rparen == -1: return None rest = data[rparen + 2:].split() # rest[0] = state (field 3); utime = field 14 -> rest[11], stime -> rest[12] return int(rest[11]) + int(rest[12]) except Exception: # noqa: BLE001 - dead pid / no /proc / non-Linux -> null return None def _build_stages() -> list: """Active (non-terminal) task stages (D3, FR-1). Source: ``db.get_active_tasks_for_reconcile()`` (``stage != 'done'`` + SQL ``age_s``), with an extra ``stage NOT IN ('done','cancelled')`` filter on the metrics side: that helper deliberately still returns ``cancelled`` tasks for the reconciler's skip-counter (ORCH-086), but terminal tasks are not raw observability signal (terminal set ``{done, cancelled}``, ORCH-090). The helper invariant belongs to ORCH-053/086 — we filter at the consumer, not the source. """ from . import db rows = db.get_active_tasks_for_reconcile() out = [] for t in rows: if t.get("stage") in ("done", "cancelled"): continue out.append({ "work_item": t.get("work_item_id"), "stage": t.get("stage"), "age_in_stage_s": t.get("age_s"), "repo": t.get("repo"), "task_id": t.get("id"), }) return out def _build_queue() -> dict: """Job-queue raw signal (D4, FR-2): counts, depth, retries, breaker, concurrency. Each sub-source is independently guarded: an uninitialised ``worker`` (e.g. in a test) degrades to ``breaker: null`` / ``max_concurrency: null`` — never a 500 (NFR-2). """ from . import db counts = None try: counts = db.job_status_counts() except Exception as e: # noqa: BLE001 logger.warning("metrics queue counts error: %s", e) retries = None try: retries = db.queue_retry_stats() except Exception as e: # noqa: BLE001 logger.warning("metrics queue retries error: %s", e) breaker = None max_concurrency = None poll_interval = None try: from .queue_worker import worker try: breaker = worker.breaker.snapshot() except Exception as e: # noqa: BLE001 logger.warning("metrics breaker snapshot error: %s", e) max_concurrency = getattr(worker, "max_concurrency", None) poll_interval = getattr(worker, "poll_interval", None) except Exception as e: # noqa: BLE001 - worker not initialised logger.warning("metrics worker access error: %s", e) depth = counts.get("queued") if isinstance(counts, dict) else None return { "counts": counts, "depth": depth, "retries": retries, "breaker": breaker, "max_concurrency": max_concurrency, "poll_interval": poll_interval, } def _build_agents() -> list: """Agent-liveness raw signal (D5/D6, FR-3). One entry per running job from ``db.get_running_agents()`` with process identity (``agent`` / ``run_id`` / ``job_id`` / ``pid``), ``runtime_s`` (= ``running_age_s``, anchored on ``jobs.started_at``, D6), ``model`` / ``effort``, and the raw ``cpu_ticks`` from ``/proc//stat``. ``pid is None`` / dead process → ``cpu_ticks: null`` for THAT agent; the rest stays intact (AC-6, TC-05). """ from . import db rows = db.get_running_agents() out = [] for j in rows: pid = j.get("pid") out.append({ "agent": j.get("agent"), "run_id": j.get("run_id"), "job_id": j.get("job_id"), "repo": j.get("repo"), "pid": pid, "runtime_s": j.get("running_age_s"), "model": j.get("model"), "effort": j.get("effort"), "cpu_ticks": _read_cpu_ticks(pid), }) return out def _build_cost() -> dict: """Cost / token raw signal (D7, FR-4). ``running`` — current per-running-job accruals from ``agent_runs`` (often ``null`` until the job finishes and the launcher parses the CLI JSON — ``null`` is honest raw, NOT zero, TR-5). ``aggregate`` — summed totals over all ``agent_runs`` (empty table → zeros, TC-06/TC-11). """ from . import db running = [] try: for j in db.get_running_agents(): running.append({ "run_id": j.get("run_id"), "job_id": j.get("job_id"), "agent": j.get("agent"), "cost_usd": j.get("cost_usd"), "input_tokens": j.get("input_tokens"), "output_tokens": j.get("output_tokens"), "cache_read_tokens": j.get("cache_read_tokens"), "cache_creation_tokens": j.get("cache_creation_tokens"), }) except Exception as e: # noqa: BLE001 logger.warning("metrics cost.running error: %s", e) running = [] aggregate = None try: aggregate = db.agent_cost_totals() except Exception as e: # noqa: BLE001 logger.warning("metrics cost.aggregate error: %s", e) return {"running": running, "aggregate": aggregate} def build_metrics() -> dict: """Assemble the ``/metrics`` envelope (FR-5). never-raise (FR-6, NFR-2, AC-4). Each section is collected in its own ``try/except`` with a safe default so a failing source degrades one section, not the whole response. Honours the ``metrics_endpoint_enabled`` kill-switch (D8): when off, returns a minimal parsable body ``{"schema_version", "enabled": false}`` (200, NOT 404) so the sidecar sees the off-switch explicitly. """ try: from .config import settings if not bool(getattr(settings, "metrics_endpoint_enabled", True)): return {"schema_version": SCHEMA_VERSION, "enabled": False} except Exception as e: # noqa: BLE001 - kill-switch read must never break /metrics logger.warning("metrics kill-switch read error: %s", e) out: dict = { "schema_version": SCHEMA_VERSION, "generated_at": _now_iso(), "clk_tck": _clk_tck(), } try: out["stages"] = _build_stages() except Exception as e: # noqa: BLE001 logger.warning("metrics stages section error: %s", e) out["stages"] = [] try: out["queue"] = _build_queue() except Exception as e: # noqa: BLE001 logger.warning("metrics queue section error: %s", e) out["queue"] = None try: out["agents"] = _build_agents() except Exception as e: # noqa: BLE001 logger.warning("metrics agents section error: %s", e) out["agents"] = [] try: out["cost"] = _build_cost() except Exception as e: # noqa: BLE001 logger.warning("metrics cost section error: %s", e) out["cost"] = None return out