feat(metrics): lightweight read-only GET /metrics raw-signal endpoint (ORCH-099)

FND/F1a: add a versioned read-only JSON endpoint GET /metrics that exposes the orchestrator's own raw state for the future observability sidecar F1b — active task stages, job queue, agent-liveness (pid/runtime/cpu_ticks), and cost/tokens. The orchestrator emits ONLY raw signal it alone knows; thresholds/alerts/history live in the separate sidecar (observer separated from observed, BRD §1). - src/metrics.py: new leaf collector build_metrics() (never-raise per section, serial_gate.snapshot() pattern); envelope schema_version/generated_at/clk_tck + stages/queue/agents/cost. _read_cpu_ticks(pid) reads utime+stime from /proc/<pid>/stat (null on None/dead/non-Linux pid — never raises). - src/main.py: thin @app.get("/metrics") wrapper (style of GET /queue). - src/db.py: read-only helpers get_running_agents() (dedicated SELECT, not an extension of the hot-path get_running_jobs()), agent_cost_totals(), queue_retry_stats(); job_status_counts() default dict gains the cancelled key. - src/config.py: metrics_endpoint_enabled kill-switch (default True), env ORCH_METRICS_ENABLED via explicit validation_alias so the documented switch actually controls the flag. - docs: README API table row + CHANGELOG entry (contract section already added by architect); .env.example ORCH_METRICS_ENABLED. Strictly read-only / never-raise: STAGE_TRANSITIONS / QG_CHECKS / check_* / machine-verdict keys / DB schema untouched; /health//status//queue byte-for-byte. Tests: tests/test_metrics.py (TC-01..TC-11) + env-alias tests in test_config.py. Full suite green (1482). Refs: ORCH-099 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-10 01:58:47 +03:00
parent 8988dca14d
commit d8793c9698
10 changed files with 739 additions and 5 deletions
--- a/src/config.py
+++ b/src/config.py
@@ -1,7 +1,7 @@
 import logging
 import re

-from pydantic import field_validator
+from pydantic import Field, field_validator
 from pydantic_settings import BaseSettings


@@ -819,6 +819,17 @@ class Settings(BaseSettings):
    # 200 (was hardcoded 80). Invalid/empty value -> default (graceful, no crash).
    qg0_title_max: int = 200

+    # ORCH-099 (D8): operator off-switch for the read-only GET /metrics endpoint.
+    # The env var is ORCH_METRICS_ENABLED (explicit validation_alias — the documented
+    # contract name, ADR-001 D8 / README — overriding the default ORCH_ + field-name
+    # mapping so the documented switch actually controls the flag). Default True ->
+    # the endpoint is available out of the box (zero regression vs BRD). False ->
+    # /metrics returns a minimal parsable body {"schema_version": 1, "enabled": false}
+    # (200, NOT 404) so the F1b sidecar sees the off-switch explicitly. The endpoint
+    # is inert / read-only anyway; the flag is a cheap self-hosting insurance on the
+    # shared prod instance.
+    metrics_endpoint_enabled: bool = Field(True, validation_alias="ORCH_METRICS_ENABLED")
+
    @field_validator("qg0_title_max", mode="before")
    @classmethod
    def _qg0_title_max_default(cls, v):
--- a/src/db.py
+++ b/src/db.py
@@ -1133,6 +1133,100 @@ def get_running_jobs() -> list[dict]:
    return [dict(r) for r in rows]


+def get_running_agents() -> list[dict]:
+    """ORCH-099 (D5): read-only liveness snapshot of every 'running' job for /metrics.
+
+    A dedicated read-only SELECT — deliberately NOT an extension of
+    ``get_running_jobs()`` (the job-reaper hot path, ORCH-065): widening that
+    query under observability needs would migrate a foreign component's invariant.
+    Each row carries the process identity + cost context the F1b sidecar needs:
+      * ``job_id`` / ``run_id`` / ``pid`` — process identity (pid may be NULL until
+        the launcher stamps it / after the process exits);
+      * ``agent`` / ``repo`` — role and project (the sidecar is multi-project);
+      * ``running_age_s`` — seconds since ``jobs.started_at`` (the same process
+        anchor the reaper uses for backstop-liveness, D6);
+      * ``model`` / ``effort`` — cost context (LEFT JOIN ``agent_runs``);
+      * the token / ``cost_usd`` columns — current per-run accruals, usually NULL
+        until the launcher parses the CLI result JSON on finish (honest raw, TR-5).
+
+    A LEFT JOIN on ``run_id`` keeps a job with no ``agent_runs`` row. Read-only;
+    never mutates.
+    """
+    conn = get_db()
+    try:
+        rows = conn.execute(
+            "SELECT j.id AS job_id, j.run_id AS run_id, j.pid AS pid, "
+            "j.agent AS agent, j.repo AS repo, j.started_at AS started_at, "
+            "CAST(strftime('%s','now') - strftime('%s', j.started_at) AS INTEGER) "
+            "  AS running_age_s, "
+            "r.model AS model, r.effort AS effort, r.cost_usd AS cost_usd, "
+            "r.input_tokens AS input_tokens, r.output_tokens AS output_tokens, "
+            "r.cache_read_tokens AS cache_read_tokens, "
+            "r.cache_creation_tokens AS cache_creation_tokens "
+            "FROM jobs j LEFT JOIN agent_runs r ON r.id = j.run_id "
+            "WHERE j.status='running'"
+        ).fetchall()
+    finally:
+        conn.close()
+    return [dict(r) for r in rows]
+
+
+def agent_cost_totals() -> dict:
+    """ORCH-099 (D7): read-only aggregate of cost / tokens over all agent_runs.
+
+    Pure ``SELECT COALESCE(SUM(...),0)`` — an empty ``agent_runs`` table yields
+    zeros, never an error (TC-06 / TC-11). Read-only; never mutates.
+    """
+    conn = get_db()
+    try:
+        row = conn.execute(
+            "SELECT "
+            "COALESCE(SUM(cost_usd),0) AS cost_usd, "
+            "COALESCE(SUM(input_tokens),0) AS input_tokens, "
+            "COALESCE(SUM(output_tokens),0) AS output_tokens, "
+            "COALESCE(SUM(cache_read_tokens),0) AS cache_read_tokens, "
+            "COALESCE(SUM(cache_creation_tokens),0) AS cache_creation_tokens "
+            "FROM agent_runs"
+        ).fetchone()
+    finally:
+        conn.close()
+    return dict(row) if row else {
+        "cost_usd": 0,
+        "input_tokens": 0,
+        "output_tokens": 0,
+        "cache_read_tokens": 0,
+        "cache_creation_tokens": 0,
+    }
+
+
+def queue_retry_stats() -> dict:
+    """ORCH-099 (D4): read-only retry raw over UNFINISHED jobs for /metrics.queue.
+
+    Aggregates ``attempts`` / ``transient_attempts`` and counts jobs currently in
+    backoff (``available_at > now``) across non-terminal jobs (status NOT IN
+    done/failed/cancelled). Read-only; never mutates.
+    """
+    conn = get_db()
+    try:
+        row = conn.execute(
+            "SELECT "
+            "COALESCE(SUM(attempts),0) AS total_attempts, "
+            "COALESCE(SUM(transient_attempts),0) AS total_transient_attempts, "
+            "COALESCE(MAX(attempts),0) AS max_attempts_seen, "
+            "COALESCE(SUM(CASE WHEN available_at IS NOT NULL "
+            "  AND available_at > datetime('now') THEN 1 ELSE 0 END),0) AS in_backoff "
+            "FROM jobs WHERE status NOT IN ('done','failed','cancelled')"
+        ).fetchone()
+    finally:
+        conn.close()
+    return dict(row) if row else {
+        "total_attempts": 0,
+        "total_transient_attempts": 0,
+        "max_attempts_seen": 0,
+        "in_backoff": 0,
+    }
+
+
 def reap_running_job(
    job_id: int,
    status: str,
@@ -1185,13 +1279,20 @@ def get_job(job_id: int) -> dict | None:


 def job_status_counts() -> dict:
-    """Return counts grouped by status (for /queue observability)."""
+    """Return counts grouped by status (for /queue and /metrics observability).
+
+    ORCH-099 (D4): the default dict carries the ``cancelled`` terminal key
+    (ORCH-090, terminal set ``{done, cancelled}``) so the key is always present
+    with a 0 default instead of materialising only when a cancelled job exists.
+    Purely additive — the GROUP BY query is unchanged and pre-existing keys keep
+    their meaning (no /queue contract break).
+    """
    conn = get_db()
    rows = conn.execute(
        "SELECT status, COUNT(*) AS n FROM jobs GROUP BY status"
    ).fetchall()
    conn.close()
-    counts = {"queued": 0, "running": 0, "done": 0, "failed": 0}
+    counts = {"queued": 0, "running": 0, "done": 0, "failed": 0, "cancelled": 0}
    for r in rows:
        counts[r["status"]] = r["n"]
    return counts
--- a/src/main.py
+++ b/src/main.py
@@ -213,6 +213,26 @@ async def queue():
    }


+@app.get("/metrics")
+async def metrics():
+    """ORCH-099 (FND/F1a): lightweight read-only raw-signal snapshot for the F1b sidecar.
+
+    A versioned JSON envelope (``schema_version`` / ``generated_at`` / ``clk_tck``)
+    with four raw-signal sections — ``stages`` (active task stages + age),
+    ``queue`` (counts / retries / breaker / concurrency), ``agents`` (agent-liveness:
+    pid / runtime / cpu_ticks), ``cost`` (per-run + aggregate tokens/cost). The
+    orchestrator emits ONLY raw signal it alone knows; the stateful arbiter
+    (thresholds / deltas / alerts) is the separate sidecar (BRD §1).
+
+    Thin wrapper over ``metrics.build_metrics()`` (in the style of GET /queue): the
+    collector is already strictly read-only and never-raise, so no extra error
+    handling is needed here. Same access level as /queue//status. The format is the
+    documented contract for the sidecar (docs/architecture/README.md).
+    """
+    from . import metrics as metrics_mod
+    return metrics_mod.build_metrics()
+
+
@app.post("/serial-gate/unfreeze")
 async def serial_gate_unfreeze(repo: str = ""):
    """ORCH-088 (FR-5, ADR-001 D4): manually clear a per-repo rollback-freeze.
--- a/src/metrics.py
+++ b/src/metrics.py
@@ -0,0 +1,276 @@
+"""ORCH-099 (FND/F1a): lightweight read-only ``/metrics`` raw-signal collector.
+
+A leaf module that builds a versioned JSON snapshot of the orchestrator's own
+raw state for the future observability sidecar (F1b, ``watchdog/``): active task
+stages, the job queue, agent-liveness, and cost/tokens. The orchestrator emits
+ONLY raw signal it alone knows — the sidecar is the stateful arbiter that
+computes thresholds / deltas / alerts (BRD §1, observer separated from observed).
+
+Design (ADR-001, by образцу ``serial_gate.snapshot()`` / ``cancel.snapshot()``):
+  * pure, never-raise, no side effects — only reads existing tables
+    (``tasks`` / ``jobs`` / ``agent_runs``) and the in-memory worker snapshot;
+  * ``build_metrics()`` assembles the envelope section-by-section, each section in
+    its own ``try/except`` with a safe default (``None`` / ``[]`` / ``{}``) so a
+    failing source degrades one field, never the whole endpoint (FR-6, NFR-2);
+  * strictly read-only — no INSERT/UPDATE/DELETE/CREATE/ALTER, no process control,
+    no network. Self-hosting-safe on the shared prod instance.
+
+The endpoint ``GET /metrics`` (``src/main.py``) is a thin wrapper that returns
+``build_metrics()`` as-is.
+"""
+from __future__ import annotations
+
+import logging
+import os
+from datetime import datetime, timezone
+
+logger = logging.getLogger("orchestrator.metrics")
+
+# Contract version for the sidecar (D2). Additive changes (new field/section) do
+# NOT bump it — the sidecar MUST ignore unknown keys and tolerate missing
+# optional ones. Bumped ONLY on a breaking change (rename/remove/retype an
+# existing field).
+SCHEMA_VERSION = 1
+
+
+def _now_iso() -> str:
+    """UTC ISO-8601 snapshot timestamp (``...Z``), the orchestrator's own clock.
+
+    Same clock domain as the SQLite ``datetime('now')`` timestamps and the CPU
+    tick reads, so the sidecar's ``(cpu_ticks, generated_at)`` deltas are immune
+    to orchestrator↔sidecar clock skew (TR-3). Never raises.
+    """
+    try:
+        return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    except Exception as e:  # noqa: BLE001 - never-raise
+        logger.warning("metrics._now_iso error: %s", e)
+        return ""
+
+
+def _clk_tck() -> int | None:
+    """Process-global SC_CLK_TCK (ticks/second) — the basis for converting raw CPU
+    ticks to seconds on the sidecar side. ``None`` on non-Linux / failure.
+    """
+    try:
+        return int(os.sysconf("SC_CLK_TCK"))
+    except Exception as e:  # noqa: BLE001 - never-raise (non-Linux / unsupported)
+        logger.warning("metrics._clk_tck error: %s", e)
+        return None
+
+
+def _read_cpu_ticks(pid: int | None) -> int | None:
+    """Sum of utime+stime (CPU ticks) from ``/proc/<pid>/stat`` — raw liveness signal.
+
+    The orchestrator emits raw ticks and does NOT compute the delta — the sidecar
+    is the stateless arbiter (it divides ``(ticks₂−ticks₁)/clk_tck`` by the
+    ``generated_at`` delta to get a CPU fraction; a tiny fraction at a growing
+    ``runtime_s`` ⇒ a "stuck" candidate). Parsing is robust to spaces in ``comm``:
+    fields are read AFTER the closing ``") "`` of the process name (canonical
+    proc-stat read). utime = field 14, stime = field 15 → indices 11 and 12 of the
+    post-``)`` token list (fields 3.. shift by 3).
+
+    never-raise (NFR-2, AC-6): ``pid is None`` / missing ``/proc/<pid>`` (process
+    died or non-Linux) / any parse error → ``None`` (NOT an exception). The caller
+    keeps every other field and the whole endpoint intact.
+    """
+    if pid is None:
+        return None
+    try:
+        with open(f"/proc/{int(pid)}/stat", "r") as f:
+            data = f.read()
+        rparen = data.rfind(") ")
+        if rparen == -1:
+            return None
+        rest = data[rparen + 2:].split()
+        # rest[0] = state (field 3); utime = field 14 -> rest[11], stime -> rest[12]
+        return int(rest[11]) + int(rest[12])
+    except Exception:  # noqa: BLE001 - dead pid / no /proc / non-Linux -> null
+        return None
+
+
+def _build_stages() -> list:
+    """Active (non-terminal) task stages (D3, FR-1).
+
+    Source: ``db.get_active_tasks_for_reconcile()`` (``stage != 'done'`` + SQL
+    ``age_s``), with an extra ``stage NOT IN ('done','cancelled')`` filter on the
+    metrics side: that helper deliberately still returns ``cancelled`` tasks for
+    the reconciler's skip-counter (ORCH-086), but terminal tasks are not raw
+    observability signal (terminal set ``{done, cancelled}``, ORCH-090). The helper
+    invariant belongs to ORCH-053/086 — we filter at the consumer, not the source.
+    """
+    from . import db
+
+    rows = db.get_active_tasks_for_reconcile()
+    out = []
+    for t in rows:
+        if t.get("stage") in ("done", "cancelled"):
+            continue
+        out.append({
+            "work_item": t.get("work_item_id"),
+            "stage": t.get("stage"),
+            "age_in_stage_s": t.get("age_s"),
+            "repo": t.get("repo"),
+            "task_id": t.get("id"),
+        })
+    return out
+
+
+def _build_queue() -> dict:
+    """Job-queue raw signal (D4, FR-2): counts, depth, retries, breaker, concurrency.
+
+    Each sub-source is independently guarded: an uninitialised ``worker`` (e.g. in
+    a test) degrades to ``breaker: null`` / ``max_concurrency: null`` — never a 500
+    (NFR-2).
+    """
+    from . import db
+
+    counts = None
+    try:
+        counts = db.job_status_counts()
+    except Exception as e:  # noqa: BLE001
+        logger.warning("metrics queue counts error: %s", e)
+
+    retries = None
+    try:
+        retries = db.queue_retry_stats()
+    except Exception as e:  # noqa: BLE001
+        logger.warning("metrics queue retries error: %s", e)
+
+    breaker = None
+    max_concurrency = None
+    poll_interval = None
+    try:
+        from .queue_worker import worker
+        try:
+            breaker = worker.breaker.snapshot()
+        except Exception as e:  # noqa: BLE001
+            logger.warning("metrics breaker snapshot error: %s", e)
+        max_concurrency = getattr(worker, "max_concurrency", None)
+        poll_interval = getattr(worker, "poll_interval", None)
+    except Exception as e:  # noqa: BLE001 - worker not initialised
+        logger.warning("metrics worker access error: %s", e)
+
+    depth = counts.get("queued") if isinstance(counts, dict) else None
+    return {
+        "counts": counts,
+        "depth": depth,
+        "retries": retries,
+        "breaker": breaker,
+        "max_concurrency": max_concurrency,
+        "poll_interval": poll_interval,
+    }
+
+
+def _build_agents() -> list:
+    """Agent-liveness raw signal (D5/D6, FR-3).
+
+    One entry per running job from ``db.get_running_agents()`` with process
+    identity (``agent`` / ``run_id`` / ``job_id`` / ``pid``), ``runtime_s``
+    (= ``running_age_s``, anchored on ``jobs.started_at``, D6), ``model`` /
+    ``effort``, and the raw ``cpu_ticks`` from ``/proc/<pid>/stat``. ``pid is
+    None`` / dead process → ``cpu_ticks: null`` for THAT agent; the rest stays
+    intact (AC-6, TC-05).
+    """
+    from . import db
+
+    rows = db.get_running_agents()
+    out = []
+    for j in rows:
+        pid = j.get("pid")
+        out.append({
+            "agent": j.get("agent"),
+            "run_id": j.get("run_id"),
+            "job_id": j.get("job_id"),
+            "repo": j.get("repo"),
+            "pid": pid,
+            "runtime_s": j.get("running_age_s"),
+            "model": j.get("model"),
+            "effort": j.get("effort"),
+            "cpu_ticks": _read_cpu_ticks(pid),
+        })
+    return out
+
+
+def _build_cost() -> dict:
+    """Cost / token raw signal (D7, FR-4).
+
+    ``running`` — current per-running-job accruals from ``agent_runs`` (often
+    ``null`` until the job finishes and the launcher parses the CLI JSON — ``null``
+    is honest raw, NOT zero, TR-5). ``aggregate`` — summed totals over all
+    ``agent_runs`` (empty table → zeros, TC-06/TC-11).
+    """
+    from . import db
+
+    running = []
+    try:
+        for j in db.get_running_agents():
+            running.append({
+                "run_id": j.get("run_id"),
+                "job_id": j.get("job_id"),
+                "agent": j.get("agent"),
+                "cost_usd": j.get("cost_usd"),
+                "input_tokens": j.get("input_tokens"),
+                "output_tokens": j.get("output_tokens"),
+                "cache_read_tokens": j.get("cache_read_tokens"),
+                "cache_creation_tokens": j.get("cache_creation_tokens"),
+            })
+    except Exception as e:  # noqa: BLE001
+        logger.warning("metrics cost.running error: %s", e)
+        running = []
+
+    aggregate = None
+    try:
+        aggregate = db.agent_cost_totals()
+    except Exception as e:  # noqa: BLE001
+        logger.warning("metrics cost.aggregate error: %s", e)
+
+    return {"running": running, "aggregate": aggregate}
+
+
+def build_metrics() -> dict:
+    """Assemble the ``/metrics`` envelope (FR-5). never-raise (FR-6, NFR-2, AC-4).
+
+    Each section is collected in its own ``try/except`` with a safe default so a
+    failing source degrades one section, not the whole response. Honours the
+    ``metrics_endpoint_enabled`` kill-switch (D8): when off, returns a minimal
+    parsable body ``{"schema_version", "enabled": false}`` (200, NOT 404) so the
+    sidecar sees the off-switch explicitly.
+    """
+    try:
+        from .config import settings
+        if not bool(getattr(settings, "metrics_endpoint_enabled", True)):
+            return {"schema_version": SCHEMA_VERSION, "enabled": False}
+    except Exception as e:  # noqa: BLE001 - kill-switch read must never break /metrics
+        logger.warning("metrics kill-switch read error: %s", e)
+
+    out: dict = {
+        "schema_version": SCHEMA_VERSION,
+        "generated_at": _now_iso(),
+        "clk_tck": _clk_tck(),
+    }
+
+    try:
+        out["stages"] = _build_stages()
+    except Exception as e:  # noqa: BLE001
+        logger.warning("metrics stages section error: %s", e)
+        out["stages"] = []
+
+    try:
+        out["queue"] = _build_queue()
+    except Exception as e:  # noqa: BLE001
+        logger.warning("metrics queue section error: %s", e)
+        out["queue"] = None
+
+    try:
+        out["agents"] = _build_agents()
+    except Exception as e:  # noqa: BLE001
+        logger.warning("metrics agents section error: %s", e)
+        out["agents"] = []
+
+    try:
+        out["cost"] = _build_cost()
+    except Exception as e:  # noqa: BLE001
+        logger.warning("metrics cost section error: %s", e)
+        out["cost"] = None
+
+    return out