feat(lessons): machine lessons-journal — additive table + observer leaf (ORCH-098)

Step 1 ("Foundation", F2) of the self-improvement epic: formalise free-text "lessons" from memory/ into a machine-readable `lessons` table — the foundation for the future retrospective agent (E2), the RICE prioritiser (E3) and Стрим.

- src/lessons.py: pure never-raise observer leaf (record/get/update/snapshot), kill-switch only, NO repo scope (observer-only; records about any repo incl. enduro; repo cut on the read side). Slug-convention constants.
- src/db.py: additive idempotent `lessons` table in init_db() (+3 indexes); nullable attribution columns from the start (NFR-6, _ensure_column forward-safe); helpers record_lesson/get_lessons/update_lesson/lessons_snapshot/lessons_recent_dup_exists (auto-dedup window).
- 4 auto-detectors (best-effort, source="auto", deduped): gate_failure (_handle_qg_failure_rollbacks), merge_hold (_handle_merge_verify HOLD), transient_retry (launcher._finalize_transient budget-exhaustion), deploy_degraded (post-deploy DEGRADED -> set_repo_freeze).
- src/main.py: GET /lessons, POST /lessons, POST /lessons/{id} + read-only `lessons` block in GET /queue; off-switch -> {"enabled": false}.
- src/config.py: lessons_enabled / lessons_query_limit_default / lessons_dedup_window_s.
- tests/test_lessons.py: TC-01..TC-12 (unit + integration), all green.
- Docs: CLAUDE.md, docs/architecture/README.md (component + schema + API), CHANGELOG.

Invariant: the journal is an OBSERVER, not a Quality Gate — STAGE_TRANSITIONS / QG_CHECKS / check_* / machine-verdict / existing table schemas are byte-for-byte untouched; enduro not affected. never-raise on every public fn + injection.

Refs: ORCH-098
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-10 10:24:40 +03:00
parent 9f62df02eb
commit 7d21625d84
9 changed files with 985 additions and 3 deletions
--- a/src/agents/launcher.py
+++ b/src/agents/launcher.py
@@ -1016,6 +1016,20 @@ class AgentLauncher:
            )
            self._notify_failed(job_id, agent, job, run_id,
                                f"transient (rate-limit) after {tattempts} attempts")
+            # ORCH-098 (FR-3c / D3): auto-record a `transient_retry` lesson ONLY on budget
+            # EXHAUSTION (each backoff would be noise; "transients exhausted" is the signal).
+            # best-effort, never-raise; can't escape into the queue-worker path. NOTE(review):
+            # deduped with work_item_id/stage unset, so the dedup key is shared by ALL tasks.
+            try:
+                from ..lessons import record as record_lesson, LessonType
+                record_lesson(
+                    LessonType.TRANSIENT_RETRY,
+                    task_id=job.get("task_id"), repo=job.get("repo"), agent=agent,
+                    root_cause=f"transient retry budget exhausted ({tattempts}/{tmax})",
+                    detail=err, source="auto",
+                )
+            except Exception as e:  # noqa: BLE001 - never break the queue worker
+                logger.warning(f"Job {job_id}: lessons transient_retry record failed: {e}")

    def _finalize_permanent(self, job_id, agent, run_id, exit_code, job):
        """Permanent (code-fault) failure -> normal attempts<max requeue, then fail."""
--- a/src/config.py
+++ b/src/config.py
@@ -291,6 +291,27 @@ class Settings(BaseSettings):
    coverage_tool_fail_closed: bool = False
    coverage_run_timeout_s: int = 900

+    # ORCH-098 (FND/F2): machine lessons-journal — additive `lessons` table + leaf
+    # src/lessons.py (never-raise observer, modelled on serial_gate/coverage_gate/
+    # metrics). The journal is an OBSERVER, never a Quality Gate: writing a lesson
+    # never influences any repo's pipeline, so — UNLIKE the gate leaves — it has NO
+    # `*_repos` scope (it records lessons about ANY repo, incl. enduro-trails; the
+    # repo cut lives on the READ side, get(repo=...)). The only regulator is a single
+    # global kill-switch (ADR-001 D2). See ADR-001-lessons-journal.md / adr-0033.
+    #   lessons_enabled            -> SINGLE kill-switch (env ORCH_LESSONS_ENABLED).
+    #                                 False -> record/get/update/snapshot inert (no DB
+    #                                 access), endpoints return {"enabled": false},
+    #                                 auto-record injections no-op. Default True.
+    #   lessons_query_limit_default-> default LIMIT for GET /lessons / get() when the
+    #                                 caller passes none.
+    #   lessons_dedup_window_s     -> auto-record dedup window (s): a second auto lesson
+    #                                 with the same (work_item_id, lesson_type, stage)
+    #                                 inside this window is suppressed (D4). manual
+    #                                 records are never deduped. Default 3600 (1h).
+    lessons_enabled: bool = True
+    lessons_query_limit_default: int = 100
+    lessons_dedup_window_s: int = 3600
+
    # ORCH-057: legacy root-owned file ownership detect + actionable worktree error
    # (follow-up ORCH-040). Three additive, kill-switch-reversible layers: (1) an
    # actionable RuntimeError in git_worktree.ensure_worktree when a worktree fails
--- a/src/db.py
+++ b/src/db.py
@@ -220,10 +220,195 @@ def init_db():
            updated_at  TEXT NOT NULL DEFAULT (datetime('now'))
        );
    """)
+    # ORCH-098 (FR-1, ADR-001 D1): additive machine lessons-journal — a structured
+    # table of pipeline deviations (gate-fail / merge-hold / transient-retry /
+    # post-deploy-degraded), the foundation of the self-improvement epic (E2
+    # retrospective / E3 RICE prioritiser). Purely ADDITIVE (CREATE TABLE/INDEX IF NOT
+    # EXISTS, pattern repo_freeze/coverage_baseline) -> idempotent, restart-safe on
+    # the shared prod DB; existing tables untouched (NFR-3, enduro-trails not
+    # affected). The attribution columns (attribution/target_repo/target_domain) are
+    # NULLABLE and present FROM THE START (Слава 10.06, NFR-6) so the live shared DB
+    # never needs a schema rework — an auto-recorded `unknown` lesson is classified
+    # later via update. lesson_type / attribution / target_domain carry NO enum/CHECK
+    # constraint: the values are a forward-compatible slug convention (a new lesson
+    # type never needs a migration). See docs/work-items/ORCH-098/08-data-requirements.md.
+    conn.executescript("""
+        CREATE TABLE IF NOT EXISTS lessons (
+            id            INTEGER PRIMARY KEY AUTOINCREMENT,
+            created_at    TEXT NOT NULL DEFAULT (datetime('now')),
+            updated_at    TEXT,
+            lesson_type   TEXT NOT NULL,
+            work_item_id  TEXT,
+            task_id       INTEGER,
+            stage         TEXT,
+            agent         TEXT,
+            repo          TEXT,
+            root_cause    TEXT,
+            suggestion    TEXT,
+            status        TEXT NOT NULL DEFAULT 'new',
+            related_task  TEXT,
+            attribution   TEXT,
+            target_repo   TEXT,
+            target_domain TEXT,
+            source        TEXT,
+            detail        TEXT
+        );
+        CREATE INDEX IF NOT EXISTS idx_lessons_type_status ON lessons (lesson_type, status);
+        CREATE INDEX IF NOT EXISTS idx_lessons_repo        ON lessons (repo);
+        CREATE INDEX IF NOT EXISTS idx_lessons_wi_type     ON lessons (work_item_id, lesson_type);
+    """)
+    # Forward-safe: on an already-created `lessons` table the attribution columns are
+    # added idempotently (_ensure_column is a no-op once present) so an old prod DB
+    # picks them up without a data migration (NFR-6, AC-2).
+    _ensure_column(conn, "lessons", "attribution", "TEXT")
+    _ensure_column(conn, "lessons", "target_repo", "TEXT")
+    _ensure_column(conn, "lessons", "target_domain", "TEXT")
    conn.commit()
    conn.close()


+# ---------------------------------------------------------------------------
+# ORCH-098 (FR-1..FR-5, ADR-001 D1): lessons-journal DML helpers. Each opens its
+# own connection and closes it in `finally` (pattern coverage_baseline). The leaf
+# src/lessons.py wraps these in its never-raise contract — these may raise on a
+# real DB fault (the leaf swallows it).
+# ---------------------------------------------------------------------------
+# The insertable column set (all but id/created_at/updated_at), in INSERT order.
+# Single source of truth so record_lesson stays in lockstep with the schema.
+_LESSON_COLUMNS = (
+    "lesson_type", "work_item_id", "task_id", "stage", "agent", "repo",
+    "root_cause", "suggestion", "status", "related_task",
+    "attribution", "target_repo", "target_domain", "source", "detail",
+)
+# Fields an update() may set (never id/created_at/lesson_type/work_item_id/task_id/source).
+_LESSON_UPDATABLE = (
+    "status", "attribution", "target_repo", "target_domain", "related_task",
+    "root_cause", "suggestion", "stage", "agent", "repo", "detail",
+)
+
+
+def record_lesson(**fields) -> int:
+    """Insert one lessons row; return the new id. Raises on a DB fault or a missing lesson_type.
+
+    Only the known columns in ``_LESSON_COLUMNS`` are written; unknown keys are
+    ignored (forward-safe). ``created_at`` is stamped by the table default.
+    """
+    cols = [c for c in _LESSON_COLUMNS if c in fields]
+    if "lesson_type" not in cols:
+        raise ValueError("record_lesson requires lesson_type")
+    placeholders = ", ".join("?" for _ in cols)
+    sql = f"INSERT INTO lessons ({', '.join(cols)}) VALUES ({placeholders})"
+    conn = get_db()
+    try:
+        cur = conn.execute(sql, tuple(fields[c] for c in cols))
+        conn.commit()
+        return int(cur.lastrowid)
+    finally:
+        conn.close()
+
+
+def lessons_recent_dup_exists(work_item_id, lesson_type, stage, window_s: int) -> bool:
+    """ORCH-098 (D4): is there an auto-lesson with the same (work_item_id,
+    lesson_type, stage) within the last ``window_s`` seconds? One indexed lookup on
+    ``idx_lessons_wi_type``. Used to suppress duplicate auto-records on retries.
+    """
+    conn = get_db()
+    try:
+        row = conn.execute(
+            "SELECT 1 FROM lessons "
+            "WHERE work_item_id IS ? AND lesson_type = ? AND stage IS ? "
+            "AND source = 'auto' "
+            "AND created_at > datetime('now', ?) LIMIT 1",
+            (work_item_id, lesson_type, stage, f"-{int(window_s)} seconds"),
+        ).fetchone()
+    finally:
+        conn.close()
+    return row is not None
+
+
+def get_lessons(*, lesson_type=None, status=None, repo=None, work_item_id=None,
+                limit: int = 100) -> list[dict]:
+    """Read-only parametrised SELECT of lessons (ORDER BY id DESC LIMIT ?)."""
+    where = []
+    params: list = []
+    if lesson_type:
+        where.append("lesson_type = ?")
+        params.append(lesson_type)
+    if status:
+        where.append("status = ?")
+        params.append(status)
+    if repo:
+        where.append("repo = ?")
+        params.append(repo)
+    if work_item_id:
+        where.append("work_item_id = ?")
+        params.append(work_item_id)
+    sql = "SELECT * FROM lessons"
+    if where:
+        sql += " WHERE " + " AND ".join(where)
+    sql += " ORDER BY id DESC LIMIT ?"
+    try:
+        lim = int(limit)
+    except (TypeError, ValueError):
+        lim = 100
+    params.append(max(1, lim))
+    conn = get_db()
+    try:
+        rows = conn.execute(sql, tuple(params)).fetchall()
+    finally:
+        conn.close()
+    return [dict(r) for r in rows]
+
+
+def update_lesson(lesson_id: int, **fields) -> bool:
+    """Update mutable fields of a lesson + stamp updated_at. Returns True iff a row
+    changed. Unknown / non-updatable keys are ignored (forward-safe).
+    """
+    sets = [c for c in _LESSON_UPDATABLE if c in fields]
+    if not sets:
+        return False
+    assignments = ", ".join(f"{c} = ?" for c in sets)
+    sql = f"UPDATE lessons SET {assignments}, updated_at = datetime('now') WHERE id = ?"
+    conn = get_db()
+    try:
+        cur = conn.execute(sql, tuple(fields[c] for c in sets) + (int(lesson_id),))
+        conn.commit()
+        return (cur.rowcount or 0) > 0
+    finally:
+        conn.close()
+
+
+def lessons_snapshot(recent: int = 10) -> dict:
+    """Light GROUP BY summary (counts by type/status) + the last N lessons, for the
+    GET /queue observability block."""
+    conn = get_db()
+    try:
+        total = conn.execute("SELECT COUNT(*) FROM lessons").fetchone()[0]
+        by_type = {
+            r["lesson_type"]: r["n"]
+            for r in conn.execute(
+                "SELECT lesson_type, COUNT(*) AS n FROM lessons GROUP BY lesson_type"
+            ).fetchall()
+        }
+        by_status = {
+            r["status"]: r["n"]
+            for r in conn.execute(
+                "SELECT status, COUNT(*) AS n FROM lessons GROUP BY status"
+            ).fetchall()
+        }
+        rows = conn.execute(
+            "SELECT * FROM lessons ORDER BY id DESC LIMIT ?", (max(1, int(recent)),)
+        ).fetchall()
+    finally:
+        conn.close()
+    return {
+        "total": total,
+        "by_type": by_type,
+        "by_status": by_status,
+        "recent": [dict(r) for r in rows],
+    }
+
+
 def get_coverage_baseline(repo: str) -> float | None:
    """ORCH-027: read the per-repo coverage baseline (%, line coverage).

--- a/src/lessons.py
+++ b/src/lessons.py
@@ -0,0 +1,191 @@
+"""ORCH-098 (FND/F2): machine lessons-journal — a never-raise observer leaf.
+
+Background
+----------
+The orchestrator runs an autonomous pipeline; when it deviates (a quality gate
+rolls a task back, a merge is held, a transient burst exhausts the retry budget,
+a post-deploy verdict comes back DEGRADED) the only trace today is free-text in
+``memory/`` — not machine-readable, so nothing can count the patterns or
+prioritise the fixes. ORCH-098 is step 1 («Фундамент», F2) of the
+self-improvement epic: it formalises those deviations into a structured
+``lessons`` table on which the future retrospective agent (E2), the RICE
+prioritiser (E3) and Стрим will stand.
+
+Design (ADR-001, modelled on ``serial_gate`` / ``coverage_gate`` / ``metrics``)
+-------------------------------------------------------------------------------
+This is a **leaf**: it imports only ``config`` + ``db`` (lazily). It NEVER imports
+``stage_engine`` / ``merge_gate`` / ``launcher`` (anti-cycle) — those choke-points
+call INTO this module, never the reverse.
+
+Two contract invariants, both load-bearing on the shared self-hosting prod DB:
+
+  * **kill-switch** (FR-6 / AC-7): ``lessons_enabled=False`` -> every public
+    function is an immediate no-op (``record→None``, ``get→[]``, ``update→False``,
+    ``snapshot→{"enabled": False}``) WITHOUT touching the DB; the auto-record
+    injections become no-ops; pipeline behaviour is byte-for-byte pre-ORCH-098.
+  * **never-raise** (NFR-1 / AC-6): with the switch on, every body runs under
+    ``try/except Exception -> logger.warning + safe default``. A journal fault
+    (a failing DB, a bad row) can NEVER propagate into the hot path that called it
+    (a rollback / HOLD / retry must complete regardless).
+
+**No repo scope (D2).** Unlike the gate leaves (``serial_gate`` / ``coverage_gate``
+/ ``bug_fast_track`` carry a ``*_repos`` CSV because they *act* on a repo), the
+journal is observer-only: writing a row never influences any repo's pipeline.
+So it records lessons about ANY repo — including enduro-trails (a degraded enduro
+deploy is a valuable self-learning signal; a repo scope would drop it). The
+repo cut lives on the READ side (``get(repo=...)`` / ``snapshot``). enduro is not
+affected (NFR-3): an observer row about enduro changes no enduro stage/gate.
+
+Self-hosting safety (NFR-7): the journal only reads/writes its own table. It never
+deploys, never restarts prod, never touches ``main``, spawns no process, opens no
+socket.
+"""
+from __future__ import annotations
+
+import logging
+
+from .config import settings
+
+logger = logging.getLogger("orchestrator.lessons")
+
+
+# ---------------------------------------------------------------------------
+# Slug conventions (NOT enum constraints — forward-compatible string slugs, D1).
+# Exposed as constants so the choke-point injections and tests share one spelling.
+# ---------------------------------------------------------------------------
+class LessonType:
+    """Canonical ``lesson_type`` slugs written by the auto-detectors (D3)."""
+    GATE_FAILURE = "gate_failure"        # QG rollback to development
+    MERGE_HOLD = "merge_hold"            # merge not verified -> task held on deploy
+    TRANSIENT_RETRY = "transient_retry"  # transient retry budget exhausted
+    DEPLOY_DEGRADED = "deploy_degraded"  # post-deploy DEGRADED -> repo freeze
+
+
+class Attribution:
+    """``attribution`` slugs (who a lesson is about — filled in later by a human /
+    the retrospective agent; auto-records leave it NULL or ``unknown``)."""
+    PLATFORM = "platform"
+    PROJECT = "project"
+    BOTH = "both"
+    UNKNOWN = "unknown"
+
+
+class Domain:
+    """``target_domain`` slugs (which improvement axis a lesson touches)."""
+    RELIABILITY = "reliability"
+    QUALITY = "quality"
+    ECONOMY = "economy"
+    FEATURES = "features"
+    SCALE = "scale"
+
+
+class Status:
+    """``status`` lifecycle slugs."""
+    NEW = "new"
+    IN_PROGRESS = "in_progress"
+    CLOSED = "closed"
+    LINKED = "linked"
+
+
+def _enabled() -> bool:
+    """Read the kill-switch; never raises (a config read fault -> treated as off)."""
+    try:
+        return bool(settings.lessons_enabled)
+    except Exception as e:  # noqa: BLE001 - never-raise contract
+        logger.warning("lessons: kill-switch read error: %s", e)
+        return False
+
+
+def record(lesson_type, *, work_item_id=None, task_id=None, stage=None, agent=None,
+           repo=None, root_cause=None, suggestion=None, status="new", related_task=None,
+           attribution=None, target_repo=None, target_domain=None, source="auto",
+           detail=None) -> int | None:
+    """Record one lesson; return its new id, or ``None`` (no-op / error / deduped).
+
+    * Kill-switch off -> immediate ``None`` WITHOUT a DB access (FR-6 / AC-7).
+    * ``source="auto"`` records are DEDUPED (D4): a prior auto-lesson with the same
+      ``(work_item_id, lesson_type, stage)`` within ``lessons_dedup_window_s`` ->
+      ``None`` (so transient retry-storms / repeated rollbacks don't flood the
+      table). ``source="manual"`` is NEVER deduped (the operator / Стрим can always
+      write).
+    * never-raise (NFR-1 / AC-6): any DB / internal error -> ``logger.warning`` +
+      ``None``; the caller (a hot-path rollback / HOLD / retry) is untouched.
+    """
+    if not _enabled():
+        return None
+    if not lesson_type:
+        return None
+    try:
+        from . import db
+        if source == "auto":
+            try:
+                window = int(getattr(settings, "lessons_dedup_window_s", 3600) or 0)
+            except (TypeError, ValueError):
+                window = 3600
+            if window > 0 and db.lessons_recent_dup_exists(
+                work_item_id, lesson_type, stage, window
+            ):
+                logger.debug(
+                    "lessons: deduped auto %s for %s/%s (within %ss window)",
+                    lesson_type, work_item_id, stage, window,
+                )
+                return None
+        return db.record_lesson(
+            lesson_type=lesson_type, work_item_id=work_item_id, task_id=task_id,
+            stage=stage, agent=agent, repo=repo, root_cause=root_cause,
+            suggestion=suggestion, status=status, related_task=related_task,
+            attribution=attribution, target_repo=target_repo,
+            target_domain=target_domain, source=source, detail=detail,
+        )
+    except Exception as e:  # noqa: BLE001 - never-raise contract (NFR-1 / AC-6)
+        logger.warning("lessons.record(%s) error: %s", lesson_type, e)
+        return None
+
+
+def get(*, lesson_type=None, status=None, repo=None, work_item_id=None,
+        limit=None) -> list[dict]:
+    """Read-only fetch of lessons (newest first). never-raise -> ``[]`` on error /
+    when the kill-switch is off."""
+    if not _enabled():
+        return []
+    try:
+        if limit is None:
+            limit = getattr(settings, "lessons_query_limit_default", 100)
+        from . import db
+        return db.get_lessons(
+            lesson_type=lesson_type, status=status, repo=repo,
+            work_item_id=work_item_id, limit=limit,
+        )
+    except Exception as e:  # noqa: BLE001 - never-raise contract
+        logger.warning("lessons.get error: %s", e)
+        return []
+
+
+def update(lesson_id, **fields) -> bool:
+    """Re-classify / re-status an existing lesson (status / attribution / target_* /
+    related_task / root_cause / suggestion / stage / agent / repo / detail). Stamps
+    ``updated_at``. never-raise -> ``False`` on error / no match / kill-switch off."""
+    if not _enabled():
+        return False
+    try:
+        from . import db
+        return db.update_lesson(lesson_id, **fields)
+    except Exception as e:  # noqa: BLE001 - never-raise contract
+        logger.warning("lessons.update(%s) error: %s", lesson_id, e)
+        return False
+
+
+def snapshot() -> dict:
+    """Light read-only summary for the GET /queue ``lessons`` block. never-raise ->
+    a minimal dict (``{"enabled": False}`` when off / ``{"enabled": True}`` on
+    error)."""
+    if not _enabled():
+        return {"enabled": False}
+    try:
+        from . import db
+        out = {"enabled": True}
+        out.update(db.lessons_snapshot())
+        return out
+    except Exception as e:  # noqa: BLE001 - never-raise contract
+        logger.warning("lessons.snapshot error: %s", e)
+        return {"enabled": True}
--- a/src/main.py
+++ b/src/main.py
@@ -1,4 +1,4 @@
-from fastapi import FastAPI
+from fastapi import FastAPI, Request
 from contextlib import asynccontextmanager
 import logging
 from .db import init_db
@@ -213,6 +213,7 @@ async def queue():
    from . import labels
    from . import cancel
    from . import bug_fast_track
+    from . import lessons
    from .disk_watchdog import disk_watchdog
    from .build_cache_pruner import build_cache_pruner
    return {
@@ -248,6 +249,10 @@ async def queue():
        # kill-switch, label, scope, bug-task counts + the structural savings metric
        # (architecture stages skipped). Additive block; never-raise.
        "bug_fast_track": bug_fast_track.snapshot(),
+        # ORCH-098 (FR-4 / AC-4): lessons-journal observability (read-only) —
+        # kill-switch + counts by type/status + last N lessons. Additive block;
+        # never-raise (snapshot() returns {"enabled": ...} minimum on error).
+        "lessons": lessons.snapshot(),
        # ORCH-063 (FR-6 / AC-7): disk-watchdog observability (read-only) —
        # enabled, threshold, interval, last measurement per host-path. Additive
        # block; never-raise (status() returns {"enabled": ...} minimum on error).
@@ -390,3 +395,82 @@ async def bug_fast_track_escalate(work_item: str = ""):
        except Exception:
            pass
    return {"ok": True, "work_item": work_item, "track": "full", "was": prev_track}
+
+
+# ---------------------------------------------------------------------------
+# ORCH-098 (FR-4 / FR-5, ADR-001 D5): machine lessons-journal endpoints.
+# Read-only fetch + manual record + re-classify. All never-raise; with the
+# kill-switch off they return {"enabled": false} (style of /metrics, AC-7).
+# ---------------------------------------------------------------------------
+@app.get("/lessons")
+async def lessons_list(
+    type: str = "", status: str = "", repo: str = "", work_item: str = "",
+    limit: int | None = None,
+):
+    """ORCH-098: read-only lessons fetch with optional filters (type / status / repo
+    / work_item / limit). Always 200; reading never mutates. ``lessons_enabled=False``
+    -> ``{"enabled": false, "lessons": []}``."""
+    from . import lessons
+    from .config import settings
+    if not getattr(settings, "lessons_enabled", True):
+        return {"enabled": False, "lessons": []}
+    rows = lessons.get(
+        lesson_type=(type or None), status=(status or None), repo=(repo or None),
+        work_item_id=(work_item or None), limit=limit,
+    )
+    return {"enabled": True, "lessons": rows}
+
+
+@app.post("/lessons")
+async def lessons_create(request: Request):
+    """ORCH-098: manually record a lesson (``source="manual"``, never deduped). JSON
+    body: ``lesson_type`` (required) + optional context / analysis / attribution
+    fields. Returns ``{"id": <int | null>}`` (null when the write failed),
+    ``{"enabled": false}`` or ``{"ok": false, "error": ...}``."""
+    from . import lessons
+    from .config import settings
+    if not getattr(settings, "lessons_enabled", True):
+        return {"enabled": False}
+    try:
+        body = await request.json()
+    except Exception:  # noqa: BLE001 - malformed body
+        body = {}
+    if not isinstance(body, dict):
+        body = {}
+    lesson_type = body.get("lesson_type")
+    if not lesson_type:
+        return {"ok": False, "error": "missing 'lesson_type'"}
+    # Only forward known fields; source is forced to "manual" (operator/Стрим).
+    allowed = (
+        "work_item_id", "task_id", "stage", "agent", "repo", "root_cause",
+        "suggestion", "status", "related_task", "attribution", "target_repo",
+        "target_domain", "detail",
+    )
+    kwargs = {k: body[k] for k in allowed if k in body}
+    new_id = lessons.record(lesson_type, source="manual", **kwargs)
+    return {"id": new_id}
+
+
+@app.post("/lessons/{lesson_id}")
+async def lessons_update(lesson_id: int, request: Request):
+    """ORCH-098: re-classify / re-status an existing lesson (status / attribution /
+    target_* / related_task / root_cause / suggestion). Lets a human / the
+    retrospective agent classify an auto-recorded ``unknown``. Returns
+    ``{"ok": bool}`` or ``{"enabled": false}``."""
+    from . import lessons
+    from .config import settings
+    if not getattr(settings, "lessons_enabled", True):
+        return {"enabled": False}
+    try:
+        body = await request.json()
+    except Exception:  # noqa: BLE001 - malformed body
+        body = {}
+    if not isinstance(body, dict):
+        body = {}
+    allowed = (
+        "status", "attribution", "target_repo", "target_domain", "related_task",
+        "root_cause", "suggestion", "stage", "agent", "repo", "detail",
+    )
+    kwargs = {k: body[k] for k in allowed if k in body}
+    ok = lessons.update(lesson_id, **kwargs)
+    return {"ok": ok}
--- a/src/stage_engine.py
+++ b/src/stage_engine.py
@@ -927,6 +927,24 @@ def _handle_qg_failure_rollbacks(
            f"development ({reason})"
        )

+    # ORCH-098 (FR-3a / D3): machine lessons-journal — auto-record a `gate_failure`
+    # lesson whenever a quality gate rolled this task back to `development`
+    # (reviewer REQUEST_CHANGES / tester FAIL / staging FAILED / deploy FAILED — all
+    # four branches above set result.rolled_back_to="development"). One best-effort
+    # call covers every rollback branch; lessons.record is never-raise + deduped, and
+    # this guard ensures even an import fault can't escape into the hot rollback path.
+    if result.rolled_back_to == "development":
+        try:
+            from . import lessons
+            lessons.record(
+                lessons.LessonType.GATE_FAILURE,
+                work_item_id=work_item_id, task_id=task_id, stage=current_stage,
+                agent=agent, repo=repo, root_cause=reason, detail=qg_name,
+                source="auto",
+            )
+        except Exception as e:  # noqa: BLE001 - never break the rollback path
+            logger.warning(f"Task {task_id}: lessons gate_failure record failed: {e}")
+

 # ---------------------------------------------------------------------------
 # ORCH-043: merge-gate sub-gate on the deploy-staging -> deploy edge
@@ -1726,6 +1744,19 @@ def _handle_merge_verify(task_id, repo, work_item_id, branch, result: AdvanceRes
        result.alerted = True
        result.note = "merge-not-verified-hold"
        result.advanced = False
+        # ORCH-098 (FR-3b / D3): auto-record a `merge_hold` lesson — deploy succeeded
+        # but `main` never got the commit, so the task is held on `deploy` (not done).
+        # best-effort, never-raise, deduped; can't escape into the HOLD path.
+        try:
+            from . import lessons
+            lessons.record(
+                lessons.LessonType.MERGE_HOLD,
+                work_item_id=work_item_id, task_id=task_id, stage="deploy",
+                repo=repo, root_cause="merge-not-verified-hold", detail=merge_msg,
+                source="auto",
+            )
+        except Exception as e:  # noqa: BLE001 - never break the HOLD
+            logger.warning(f"Task {task_id}: lessons merge_hold record failed: {e}")
        return True
    except Exception as e:  # noqa: BLE001 - never-raise contract (INV-1/AC-7)
        # Any internal error -> treat as "not confirmed" -> HOLD + alert, never crash.
@@ -2009,6 +2040,24 @@ def run_post_deploy_monitor(job: dict):
    except Exception as e:  # noqa: BLE001 - never break the tick
        logger.warning(f"post-deploy: set_repo_freeze failed for {repo}: {e}")

+    # ORCH-098 (FR-3d / D3): auto-record a `deploy_degraded` lesson — "deploy OK /
+    # prod broken" (layer-3, ET-8). attribution is recorded as "unknown" (with
+    # target_domain pre-set to "reliability") so a human / the retrospective agent can
+    # classify it later (this is exactly the signal Слава required the attribution
+    # columns for). best-effort, never-raise; can't escape into the monitor tick.
+    try:
+        from . import lessons
+        reason = f"post-deploy DEGRADED ({checks_failed}/{checks_total})"
+        lessons.record(
+            lessons.LessonType.DEPLOY_DEGRADED,
+            work_item_id=work_item_id, repo=repo, stage="deploy",
+            root_cause=reason, attribution=lessons.Attribution.UNKNOWN,
+            target_repo=repo, target_domain=lessons.Domain.RELIABILITY,
+            source="auto",
+        )
+    except Exception as e:  # noqa: BLE001 - never break the tick
+        logger.warning(f"post-deploy: lessons deploy_degraded record failed for {repo}: {e}")
+
    post_deploy.write_post_deploy_log(
        repo, work_item_id, branch, post_deploy.DEGRADED, action_taken,
        settings.post_deploy_window_s, checks_total, checks_failed,