# orchestrator/src/lessons.py

"""ORCH-098 (FND/F2): machine lessons-journal — a never-raise observer leaf.

Background
----------
The orchestrator runs an autonomous pipeline; when it deviates (a quality gate
rolls a task back, a merge is held, a transient burst exhausts the retry budget,
a post-deploy verdict comes back DEGRADED) the only trace today is free-text in
``memory/`` — not machine-readable, so nothing can count the patterns or
prioritise the fixes. ORCH-098 is step 1 («Фундамент», F2) of the
self-improvement epic: it formalises those deviations into a structured
``lessons`` table on which the future retrospective agent (E2), the RICE
prioritiser (E3) and Стрим will stand.

Design (ADR-001, modelled on ``serial_gate`` / ``coverage_gate`` / ``metrics``)
-------------------------------------------------------------------------------
This is a **leaf**: it imports only ``config`` + ``db`` (lazily). It NEVER imports
``stage_engine`` / ``merge_gate`` / ``launcher`` (anti-cycle) — those choke-points
call INTO this module, never the reverse.

Two contract invariants, both load-bearing on the shared self-hosting prod DB:

  * **kill-switch** (FR-6 / AC-7): ``lessons_enabled=False`` -> every public
    function is an immediate no-op (``record→None``, ``get→[]``, ``update→False``,
    ``snapshot→{"enabled": False}``) WITHOUT touching the DB; the auto-record
    injections become no-ops; pipeline behaviour is byte-for-byte the
    pre-ORCH-098 behaviour.
  * **never-raise** (NFR-1 / AC-6): with the switch on, every body runs under
    ``try/except Exception -> logger.warning + safe default``. A journal fault
    (a failing DB, a bad row) can NEVER propagate into the hot path that called it
    (a rollback / HOLD / retry must complete regardless).

**No repo scope (D2).** Unlike the gate leaves (``serial_gate`` / ``coverage_gate``
/ ``bug_fast_track`` carry a ``*_repos`` CSV because they *act* on a repo), the
journal is observer-only: writing a row never influences any repo's pipeline.
So it records lessons about ANY repo — including enduro-trails (a degraded enduro
deploy is a valuable self-learning signal; a repo scope would drop it). The
repo cut lives on the READ side (``get(repo=...)`` / ``snapshot``). enduro is not
affected (NFR-3): an observer row about enduro changes no enduro stage/gate.

Self-hosting safety (NFR-7): the journal only reads/writes its own table. It never
deploys, never restarts prod, never touches ``main``, spawns no process, opens no
socket.
"""
from __future__ import annotations

import logging

from .config import settings

logger = logging.getLogger("orchestrator.lessons")


# ---------------------------------------------------------------------------
# Slug conventions (NOT enum constraints — forward-compatible string slugs, D1).
# Exposed as constants so the choke-point injections and tests share one spelling.
# ---------------------------------------------------------------------------
class LessonType:
    """String slugs for the ``lesson_type`` column, as emitted by the auto-detectors (D3).

    These are plain strings, not an enum constraint, so new slugs can be added
    later without breaking existing rows.
    """

    # A quality gate rolled the task back to development.
    GATE_FAILURE = "gate_failure"
    # The merge was not verified, so the task is held on deploy.
    MERGE_HOLD = "merge_hold"
    # The transient retry budget was exhausted.
    TRANSIENT_RETRY = "transient_retry"
    # The post-deploy verdict was DEGRADED, leading to a repo freeze.
    DEPLOY_DEGRADED = "deploy_degraded"


class Attribution:
    """String slugs for the ``attribution`` column: who a lesson is about.

    The value is meant to be filled in later by a human or the retrospective
    agent; automatically recorded lessons leave it NULL or ``unknown``.
    """

    PLATFORM = "platform"
    PROJECT = "project"
    BOTH = "both"
    UNKNOWN = "unknown"


class Domain:
    """String slugs for the ``target_domain`` column: the improvement axis a
    lesson touches."""

    RELIABILITY = "reliability"
    QUALITY = "quality"
    ECONOMY = "economy"
    FEATURES = "features"
    SCALE = "scale"


class Status:
    """String slugs for the ``status`` column, describing a lesson's lifecycle.

    ``NEW`` equals the default ``status`` that ``record()`` writes.
    """

    NEW = "new"
    IN_PROGRESS = "in_progress"
    CLOSED = "closed"
    LINKED = "linked"


def _enabled() -> bool:
    """Report whether the journal kill-switch (``settings.lessons_enabled``) is on.

    Never raises: if reading or coercing the setting fails, the fault is logged
    as a warning and the journal is treated as switched off.
    """
    try:
        flag = settings.lessons_enabled
        # The coercion stays inside the ``try`` so a misbehaving value is
        # also reported as "off" rather than propagating.
        return bool(flag)
    except Exception as exc:  # noqa: BLE001 - never-raise contract
        logger.warning("lessons: kill-switch read error: %s", exc)
    return False


def record(lesson_type, *, work_item_id=None, task_id=None, stage=None, agent=None,
           repo=None, root_cause=None, suggestion=None, status="new", related_task=None,
           attribution=None, target_repo=None, target_domain=None, source="auto",
           detail=None) -> int | None:
    """Write a single lesson row and hand back the result of ``db.record_lesson``.

    Per this module's contract that result is the new lesson id; ``None`` is
    returned instead when the call is a no-op, is deduplicated, or fails.

    Behaviour:

    * Kill-switch off, or a falsy ``lesson_type`` -> ``None`` immediately, with
      no DB access (FR-6 / AC-7).
    * ``source="auto"`` is deduplicated (D4): if
      ``db.lessons_recent_dup_exists(work_item_id, lesson_type, stage, window)``
      reports a prior match within ``settings.lessons_dedup_window_s`` seconds
      (default 3600; a non-positive or empty value disables the check), the
      call returns ``None`` so retry-storms and repeated rollbacks do not flood
      the table.
    * Any other ``source`` (e.g. ``"manual"``) skips the dedup check entirely,
      so a deliberate write is always attempted.
    * never-raise (NFR-1 / AC-6): any exception raised by the DB layer or by
      this body is logged as a warning and turned into ``None``; the hot-path
      caller (rollback / HOLD / retry) is never interrupted.
    """
    if not _enabled() or not lesson_type:
        return None

    try:
        from . import db

        # 0 means "do not dedupe"; only auto-sourced records ever get a window.
        dedup_window = 0
        if source == "auto":
            try:
                dedup_window = int(
                    getattr(settings, "lessons_dedup_window_s", 3600) or 0
                )
            except (TypeError, ValueError):
                # An unparsable setting falls back to the one-hour default.
                dedup_window = 3600

        if dedup_window > 0 and db.lessons_recent_dup_exists(
            work_item_id, lesson_type, stage, dedup_window
        ):
            logger.debug(
                "lessons: deduped auto %s for %s/%s (within %ss window)",
                lesson_type, work_item_id, stage, dedup_window,
            )
            return None

        row = dict(
            lesson_type=lesson_type,
            work_item_id=work_item_id,
            task_id=task_id,
            stage=stage,
            agent=agent,
            repo=repo,
            root_cause=root_cause,
            suggestion=suggestion,
            status=status,
            related_task=related_task,
            attribution=attribution,
            target_repo=target_repo,
            target_domain=target_domain,
            source=source,
            detail=detail,
        )
        return db.record_lesson(**row)
    except Exception as exc:  # noqa: BLE001 - never-raise contract (NFR-1 / AC-6)
        logger.warning("lessons.record(%s) error: %s", lesson_type, exc)
        return None


def get(*, lesson_type=None, status=None, repo=None, work_item_id=None,
        limit=None) -> list[dict]:
    """Fetch lessons through ``db.get_lessons`` using the given filters.

    Per this module's contract the rows come back newest first. When ``limit``
    is ``None`` it is taken from ``settings.lessons_query_limit_default``
    (100 if that setting is absent).

    never-raise: returns ``[]`` when the kill-switch is off (no DB access) or
    when anything in the body raises (the fault is logged as a warning).
    """
    if not _enabled():
        return []
    try:
        # Resolve the limit before importing ``db`` so a settings fault is
        # reported ahead of any import fault.
        row_cap = (
            getattr(settings, "lessons_query_limit_default", 100)
            if limit is None
            else limit
        )
        from . import db

        filters = dict(
            lesson_type=lesson_type,
            status=status,
            repo=repo,
            work_item_id=work_item_id,
            limit=row_cap,
        )
        return db.get_lessons(**filters)
    except Exception as exc:  # noqa: BLE001 - never-raise contract
        logger.warning("lessons.get error: %s", exc)
        return []


def update(lesson_id, **fields) -> bool:
    """Change fields of an existing lesson by delegating to ``db.update_lesson``.

    Intended for re-classifying or re-statusing a lesson (status / attribution /
    target_* / related_task / root_cause / suggestion); per this module's
    contract the DB layer also stamps ``updated_at``. The keyword arguments are
    forwarded unchanged.

    never-raise: returns ``False`` when the kill-switch is off (no DB access)
    or when anything in the body raises (the fault is logged as a warning).
    """
    if _enabled():
        try:
            from . import db

            outcome = db.update_lesson(lesson_id, **fields)
            return outcome
        except Exception as exc:  # noqa: BLE001 - never-raise contract
            logger.warning("lessons.update(%s) error: %s", lesson_id, exc)
    return False


def snapshot() -> dict:
    """Build the light read-only summary used by the GET /queue ``lessons`` block.

    The result always carries an ``"enabled"`` key:

    * kill-switch off -> ``{"enabled": False}`` with no DB access;
    * success -> ``{"enabled": True}`` merged with ``db.lessons_snapshot()``;
    * any error -> the fault is logged as a warning and ``{"enabled": True}``
      alone is returned (never-raise).
    """
    if not _enabled():
        return {"enabled": False}

    summary = {"enabled": True}
    try:
        from . import db

        summary.update(db.lessons_snapshot())
    except Exception as exc:  # noqa: BLE001 - never-raise contract
        logger.warning("lessons.snapshot error: %s", exc)
        # Return a fresh dict: a failed ``update`` may have partially applied.
        return {"enabled": True}
    return summary