"""ORCH-098 (FND/F2): machine lessons-journal — a never-raise observer leaf. Background ---------- The orchestrator runs an autonomous pipeline; when it deviates (a quality gate rolls a task back, a merge is held, a transient burst exhausts the retry budget, a post-deploy verdict comes back DEGRADED) the only trace today is free-text in ``memory/`` — not machine-readable, so nothing can count the patterns or prioritise the fixes. ORCH-098 is step 1 («Фундамент», F2) of the self-improvement epic: it formalises those deviations into a structured ``lessons`` table on which the future retrospective agent (E2), the RICE prioritiser (E3) and Стрим will stand. Design (ADR-001, by образцу ``serial_gate`` / ``coverage_gate`` / ``metrics``) ------------------------------------------------------------------------------ This is a **leaf**: it imports only ``config`` + ``db`` (lazily). It NEVER imports ``stage_engine`` / ``merge_gate`` / ``launcher`` (anti-cycle) — those choke-points call INTO this module, never the reverse. Two contract invariants, both load-bearing on the shared self-hosting prod DB: * **kill-switch** (FR-6 / AC-7): ``lessons_enabled=False`` -> every public function is an immediate no-op (``record→None``, ``get→[]``, ``update→False``, ``snapshot→{}``) WITHOUT touching the DB; the auto-record injections become no-ops; pipeline behaviour is byte-for-byte the pre-ORCH-098 behaviour. * **never-raise** (NFR-1 / AC-6): with the switch on, every body runs under ``try/except Exception -> logger.warning + safe default``. A journal fault (a failing DB, a bad row) can NEVER propagate into the hot path that called it (a rollback / HOLD / retry must complete regardless). **No repo scope (D2).** Unlike the gate leaves (``serial_gate`` / ``coverage_gate`` / ``bug_fast_track`` carry a ``*_repos`` CSV because they *act* on a repo), the journal is observer-only: writing a row never influences any repo's pipeline. So it records lessons about ANY repo — including enduro-trails (a degraded enduro deploy is a valuable self-learning signal; a repo scope would drop it). The repo cut lives on the READ side (``get(repo=...)`` / ``snapshot``). enduro is not affected (NFR-3): an observer row about enduro changes no enduro stage/gate. Self-hosting safety (NFR-7): the journal only reads/writes its own table. It never deploys, never restarts prod, never touches ``main``, spawns no process, opens no socket. """ from __future__ import annotations import logging from .config import settings logger = logging.getLogger("orchestrator.lessons") # --------------------------------------------------------------------------- # Slug conventions (NOT enum constraints — forward-compatible string slugs, D1). # Exposed as constants so the choke-point injections and tests share one spelling. # --------------------------------------------------------------------------- class LessonType: """Canonical ``lesson_type`` slugs written by the auto-detectors (D3).""" GATE_FAILURE = "gate_failure" # QG rollback to development MERGE_HOLD = "merge_hold" # merge not verified -> task held on deploy TRANSIENT_RETRY = "transient_retry" # transient retry budget exhausted DEPLOY_DEGRADED = "deploy_degraded" # post-deploy DEGRADED -> repo freeze class Attribution: """``attribution`` slugs (who a lesson is about — filled in later by a human / the retrospective agent; auto-records leave it NULL or ``unknown``).""" PLATFORM = "platform" PROJECT = "project" BOTH = "both" UNKNOWN = "unknown" class Domain: """``target_domain`` slugs (which improvement axis a lesson touches).""" RELIABILITY = "reliability" QUALITY = "quality" ECONOMY = "economy" FEATURES = "features" SCALE = "scale" class Status: """``status`` lifecycle slugs.""" NEW = "new" IN_PROGRESS = "in_progress" CLOSED = "closed" LINKED = "linked" def _enabled() -> bool: """Read the kill-switch; never raises (a config read fault -> treated as off).""" try: return bool(settings.lessons_enabled) except Exception as e: # noqa: BLE001 - never-raise contract logger.warning("lessons: kill-switch read error: %s", e) return False def record(lesson_type, *, work_item_id=None, task_id=None, stage=None, agent=None, repo=None, root_cause=None, suggestion=None, status="new", related_task=None, attribution=None, target_repo=None, target_domain=None, source="auto", detail=None) -> int | None: """Record one lesson; return its new id, or ``None`` (no-op / error / deduped). * Kill-switch off -> immediate ``None`` WITHOUT a DB access (FR-6 / AC-7). * ``source="auto"`` records are DEDUPED (D4): a prior auto-lesson with the same ``(work_item_id, lesson_type, stage)`` within ``lessons_dedup_window_s`` -> ``None`` (so transient retry-storms / repeated rollbacks don't flood the table). ``source="manual"`` is NEVER deduped (the operator / Стрим can always write). * never-raise (NFR-1 / AC-6): any DB / internal error -> ``logger.warning`` + ``None``; the caller (a hot-path rollback / HOLD / retry) is untouched. """ if not _enabled(): return None if not lesson_type: return None try: from . import db if source == "auto": try: window = int(getattr(settings, "lessons_dedup_window_s", 3600) or 0) except (TypeError, ValueError): window = 3600 if window > 0 and db.lessons_recent_dup_exists( work_item_id, lesson_type, stage, window ): logger.debug( "lessons: deduped auto %s for %s/%s (within %ss window)", lesson_type, work_item_id, stage, window, ) return None return db.record_lesson( lesson_type=lesson_type, work_item_id=work_item_id, task_id=task_id, stage=stage, agent=agent, repo=repo, root_cause=root_cause, suggestion=suggestion, status=status, related_task=related_task, attribution=attribution, target_repo=target_repo, target_domain=target_domain, source=source, detail=detail, ) except Exception as e: # noqa: BLE001 - never-raise contract (NFR-1 / AC-6) logger.warning("lessons.record(%s) error: %s", lesson_type, e) return None def get(*, lesson_type=None, status=None, repo=None, work_item_id=None, limit=None) -> list[dict]: """Read-only fetch of lessons (newest first). never-raise -> ``[]`` on error / when the kill-switch is off.""" if not _enabled(): return [] try: if limit is None: limit = getattr(settings, "lessons_query_limit_default", 100) from . import db return db.get_lessons( lesson_type=lesson_type, status=status, repo=repo, work_item_id=work_item_id, limit=limit, ) except Exception as e: # noqa: BLE001 - never-raise contract logger.warning("lessons.get error: %s", e) return [] def update(lesson_id, **fields) -> bool: """Re-classify / re-status an existing lesson (status / attribution / target_* / related_task / root_cause / suggestion). Stamps ``updated_at``. never-raise -> ``False`` on error / kill-switch off.""" if not _enabled(): return False try: from . import db return db.update_lesson(lesson_id, **fields) except Exception as e: # noqa: BLE001 - never-raise contract logger.warning("lessons.update(%s) error: %s", lesson_id, e) return False def snapshot() -> dict: """Light read-only summary for the GET /queue ``lessons`` block. never-raise -> a minimal dict (``{"enabled": False}`` when off / ``{"enabled": True}`` on error).""" if not _enabled(): return {"enabled": False} try: from . import db out = {"enabled": True} out.update(db.lessons_snapshot()) return out except Exception as e: # noqa: BLE001 - never-raise contract logger.warning("lessons.snapshot error: %s", e) return {"enabled": True}