# orchestrator/tests/test_tracker_rollback_metrics.py

"""ORCH-091 — Group 2 (D2/D3): rollback reflection + stage-metric summation.

Covers TC-05..TC-08 from 04-test-plan.yaml. The render path is pure DB (no
network); a temp SQLite holds tasks + agent_runs.

  TC-05 / AC-4 — rollback deploy-staging->development: Разработка active (🔄),
                 Код ревью/Тестирование/Внедрение NOT shown ✅,
                 Анализ/Архитектура stay ✅.
  TC-06 / AC-5 — stage line sums ALL of an agent's runs (ORCH-069 developer
                 3 runs ≈ $3.98), not the last run.
  TC-07 / AC-5 — task totals (💰/🔢/⏱) converge with SUM(agent_runs).
  TC-08 / AC-7 — render_task_tracker never raises on broken/partial rows.
"""

import os
import tempfile

# Placeholder credentials, only applied when the variables are not already set.
# NOTE(review): presumably the `src` settings read ORCH_* variables at import
# time, which is why this block sits above the `src` imports — confirm.
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")

# Dedicated SQLite file for this test module, located in the system temp dir.
_test_db = os.path.join(tempfile.gettempdir(), "test_orchestrator_rollback_metrics.db")
# Unconditionally point ORCH_DB_PATH at that file (overrides any existing value).
os.environ["ORCH_DB_PATH"] = _test_db

import pytest  # noqa: E402

import src.db as db_module  # noqa: E402
from src.db import init_db, get_db  # noqa: E402
from src import notifications as N  # noqa: E402
from src.usage import fmt_cost, fmt_tokens, _input_total  # noqa: E402


@pytest.fixture(autouse=True)
def setup_db(monkeypatch):
    """Autouse fixture: fresh database file per test, removed again on teardown.

    Points ``db_module.settings.db_path`` at the module's temp SQLite file,
    deletes any leftover file, runs ``init_db()``, and switches the
    ``tracker_live_status`` setting off before handing control to the test.
    """
    def _drop_db_file():
        # Delete the temp database if a previous test (or run) left it behind.
        if os.path.exists(_test_db):
            os.unlink(_test_db)

    monkeypatch.setattr(db_module.settings, "db_path", _test_db, raising=False)
    _drop_db_file()
    init_db()
    # Render-only: keep the live overlay off (offline core under test).
    monkeypatch.setattr(N._get_settings(), "tracker_live_status", False, raising=False)
    yield
    _drop_db_file()


def _mk_task(stage="development", wid="ORCH-091", title="rollback/metrics",
             created=None, updated=None):
    """Insert one row into ``tasks`` and return its row id.

    Args:
        stage: Value stored in the ``stage`` column.
        wid: Value stored in the ``work_item_id`` column.
        title: Value stored in the ``title`` column.
        created: Optional override for ``created_at``; ``None`` keeps the
            value the INSERT produced.
        updated: Optional override for ``updated_at``; ``None`` keeps the
            value the INSERT produced.

    Returns:
        The ``lastrowid`` of the inserted task.
    """
    conn = get_db()
    # try/finally so a failing statement cannot leak the connection (a leaked
    # handle can keep the temp DB file open while the fixture tries to unlink it).
    try:
        cur = conn.execute(
            "INSERT INTO tasks (plane_id, work_item_id, repo, branch, stage, title) "
            "VALUES (?,?,?,?,?,?)",
            ("p1", wid, "orchestrator", "feature/ORCH-091-x", stage, title),
        )
        tid = cur.lastrowid
        if created or updated:
            # COALESCE leaves whichever timestamp was not supplied untouched.
            conn.execute(
                "UPDATE tasks SET created_at=COALESCE(?, created_at), "
                "updated_at=COALESCE(?, updated_at) WHERE id=?",
                (created, updated, tid),
            )
        conn.commit()
    finally:
        conn.close()
    return tid


def _mk_run(tid, agent, started, finished, *, model="tokenator/claude-opus-4-8",
            in_tok=10, out_tok=5, cache_read=0, cache_creation=0, cost=0.0,
            effort=None, exit_code=0):
    """Insert one row into ``agent_runs`` for task *tid*.

    Args:
        tid: ``task_id`` the run belongs to.
        agent: Agent name stored in the ``agent`` column.
        started: ``started_at`` value (the tests pass 'YYYY-MM-DD HH:MM:SS'
            strings or ``None``).
        finished: ``finished_at`` value; the tests pass ``None`` for an
            in-flight run.
        model: Stored in the ``model`` column.
        in_tok: Stored in ``input_tokens``.
        out_tok: Stored in ``output_tokens``.
        cache_read: Stored in ``cache_read_tokens``.
        cache_creation: Stored in ``cache_creation_tokens``.
        cost: Stored in ``cost_usd``.
        effort: Stored in the ``effort`` column.
        exit_code: Stored in the ``exit_code`` column.
    """
    conn = get_db()
    # try/finally so a failing INSERT/commit cannot leak the connection.
    try:
        conn.execute(
            "INSERT INTO agent_runs (task_id, agent, started_at, finished_at, "
            "exit_code, input_tokens, output_tokens, cache_read_tokens, "
            "cache_creation_tokens, cost_usd, model, effort) "
            "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
            (tid, agent, started, finished, exit_code, in_tok, out_tok, cache_read,
             cache_creation, cost, model, effort),
        )
        conn.commit()
    finally:
        conn.close()


def _stage_line(text, label):
    """The single '✅ <label> ...' line for a stage, or None."""
    for ln in text.splitlines():
        if ln.startswith(f"✅ {label}"):
            return ln
    return None


def _has_active(text, label):
    """True if the '🔄 <label> ...' active line is present."""
    return any(ln.startswith(f"🔄 {label}") for ln in text.splitlines())


# =========================================================================== #
# TC-05 / AC-4 — rollback reflection (deploy-staging -> development)
# =========================================================================== #
def test_tc05_rollback_suppresses_later_stage_checkmarks():
    """Rollback to stage='development' after the later stages already ran.

    Expects 'Разработка' to be the active (🔄) row, no ✅ row for
    'Код ревью' / 'Тестирование' / 'Внедрение', and the ✅ rows for
    'Анализ' / 'Архитектура' to remain.
    """
    tid = _mk_task(stage="development")

    # One complete, finished pass through every agent (insertion order kept).
    first_pass = (
        ("analyst", "2026-06-04 09:00:00", "2026-06-04 09:10:00", {}),
        ("architect", "2026-06-04 09:10:00", "2026-06-04 09:20:00", {}),
        ("developer", "2026-06-04 09:20:00", "2026-06-04 09:40:00", {"cost": 1.0}),
        ("reviewer", "2026-06-04 09:40:00", "2026-06-04 09:50:00", {}),
        ("tester", "2026-06-04 09:50:00", "2026-06-04 10:00:00", {}),
        ("deployer", "2026-06-04 10:00:00", "2026-06-04 10:10:00", {}),
    )
    for agent, started, finished, extra in first_pass:
        _mk_run(tid, agent, started, finished, **extra)
    # The rollback re-launched developer: an in-flight run (finished_at NULL).
    _mk_run(tid, "developer", "2026-06-04 10:20:00", None, exit_code=None, cost=0.0)

    text = N.render_task_tracker(tid)

    # Development is shown as active rather than done.
    assert _has_active(text, "Разработка"), text
    # Stages after the current one must not carry a ✅ line.
    for label in ("Код ревью", "Тестирование", "Внедрение"):
        assert _stage_line(text, label) is None, text
    # Stages before the current one keep their ✅ line.
    for label in ("Анализ", "Архитектура"):
        assert _stage_line(text, label) is not None, text


def test_tc05_forward_progress_keeps_earlier_checkmarks():
    """Regression guard: plain forward progress (no rollback) keeps earlier ✅ rows.

    With the task on stage='testing' and the tester run still in flight, the
    four earlier stages must each have a ✅ line and 'Тестирование' must be the
    active (🔄) row.
    """
    tid = _mk_task(stage="testing")
    _mk_run(tid, "analyst", "2026-06-04 09:00:00", "2026-06-04 09:10:00")
    _mk_run(tid, "architect", "2026-06-04 09:10:00", "2026-06-04 09:20:00")
    _mk_run(tid, "developer", "2026-06-04 09:20:00", "2026-06-04 09:40:00")
    _mk_run(tid, "reviewer", "2026-06-04 09:40:00", "2026-06-04 09:50:00")
    # tester in-flight on the testing stage.
    _mk_run(tid, "tester", "2026-06-04 09:50:00", None, exit_code=None)

    text = N.render_task_tracker(tid)
    # Each assert carries the rendered text so a failure shows what was
    # actually produced (same convention as the other tests in this module).
    assert _stage_line(text, "Анализ") is not None, text
    assert _stage_line(text, "Архитектура") is not None, text
    assert _stage_line(text, "Разработка") is not None, text
    assert _stage_line(text, "Код ревью") is not None, text
    assert _has_active(text, "Тестирование"), text


def test_tc05_deploy_staging_keeps_deployer_row():
    """On stage='deploy-staging' the 'Внедрение' ✅ row must still be rendered.

    Every agent, including the deployer, has a finished run; the test checks
    that neither 'Внедрение' nor 'Тестирование' loses its ✅ line.
    """
    tid = _mk_task(stage="deploy-staging")
    # Finished runs in pipeline order; the last one is the staging deploy.
    finished_runs = (
        ("analyst", "2026-06-04 09:00:00", "2026-06-04 09:10:00"),
        ("architect", "2026-06-04 09:10:00", "2026-06-04 09:20:00"),
        ("developer", "2026-06-04 09:20:00", "2026-06-04 09:40:00"),
        ("reviewer", "2026-06-04 09:40:00", "2026-06-04 09:50:00"),
        ("tester", "2026-06-04 09:50:00", "2026-06-04 10:00:00"),
        ("deployer", "2026-06-04 10:00:00", "2026-06-04 10:10:00"),
    )
    for agent, started, finished in finished_runs:
        _mk_run(tid, agent, started, finished)

    text = N.render_task_tracker(tid)
    # Внедрение must NOT be suppressed (preserved pre-ORCH-091 behaviour).
    deploy_row = _stage_line(text, "Внедрение")
    assert deploy_row is not None, text
    testing_row = _stage_line(text, "Тестирование")
    assert testing_row is not None, text


# =========================================================================== #
# TC-06 / AC-5 — stage-metric summation over retries (ORCH-069 fixture)
# =========================================================================== #
def test_tc06_stage_line_sums_all_developer_runs():
    """Three developer runs must be summed on the 'Разработка' line.

    The line is expected to show the total cost ($3.98), total output and
    input tokens, and total duration (30m) — not the figures of the last run.
    """
    tid = _mk_task(stage="review")  # past development -> ✅ shown
    _mk_run(tid, "analyst", "2026-06-04 09:00:00", "2026-06-04 09:10:00")
    _mk_run(tid, "architect", "2026-06-04 09:10:00", "2026-06-04 09:20:00")

    # Developer attempts: $1.50 + $2.00 + $0.48 = $3.98; 10 + 15 + 5 = 30 minutes.
    attempts = (
        ("2026-06-04 09:20:00", "2026-06-04 09:30:00",
         {"cost": 1.50, "in_tok": 100, "out_tok": 40, "cache_read": 10}),
        ("2026-06-04 09:30:00", "2026-06-04 09:45:00",
         {"cost": 2.00, "in_tok": 200, "out_tok": 60, "cache_creation": 20}),
        ("2026-06-04 09:45:00", "2026-06-04 09:50:00",
         {"cost": 0.48, "in_tok": 50, "out_tok": 10}),
    )
    for started, finished, usage in attempts:
        _mk_run(tid, "developer", started, finished, **usage)

    text = N.render_task_tracker(tid)
    line = _stage_line(text, "Разработка")
    assert line is not None, text

    # Cost: the sum, and explicitly not the last attempt's $0.48.
    assert fmt_cost(3.98) in line, line
    assert fmt_cost(0.48) not in line, line

    # Output tokens: 40 + 60 + 10 = 110.
    assert f"{fmt_tokens(110)}↑" in line, line

    # Input tokens: per-run _input_total over input/cache_read/cache_creation,
    # i.e. (100+10) + (200+20) + 50 = 380.
    per_run_inputs = (
        {"input_tokens": 100, "cache_read_tokens": 10, "cache_creation_tokens": 0},
        {"input_tokens": 200, "cache_read_tokens": 0, "cache_creation_tokens": 20},
        {"input_tokens": 50, "cache_read_tokens": 0, "cache_creation_tokens": 0},
    )
    exp_in = sum(_input_total(usage) for usage in per_run_inputs)
    assert f"{fmt_tokens(exp_in)}↓" in line, line

    # Duration: 10 + 15 + 5 = 30 minutes.
    assert " 30м " in line, line


# =========================================================================== #
# TC-07 / AC-5 — task totals converge with SUM(agent_runs)
# =========================================================================== #
def test_tc07_totals_converge_with_sum_agent_runs():
    """The 💰 totals line must match sums computed directly from ``agent_runs``.

    Inserts four runs (two of them developer retries), sums cost / input /
    output tokens from the table itself, and checks that the formatted sums
    appear on the rendered line starting with 💰.
    """
    tid = _mk_task(stage="review")
    _mk_run(tid, "analyst", "2026-06-04 09:00:00", "2026-06-04 09:10:00",
            cost=0.20, in_tok=30, out_tok=10)
    _mk_run(tid, "architect", "2026-06-04 09:10:00", "2026-06-04 09:20:00",
            cost=0.30, in_tok=40, out_tok=12)
    _mk_run(tid, "developer", "2026-06-04 09:20:00", "2026-06-04 09:30:00",
            cost=1.50, in_tok=100, out_tok=40, cache_read=10)
    _mk_run(tid, "developer", "2026-06-04 09:30:00", "2026-06-04 09:45:00",
            cost=2.00, in_tok=200, out_tok=60, cache_creation=20)

    # Authoritative SUM straight from the DB.
    conn = get_db()
    # try/finally so a failing query cannot leak the connection.
    try:
        rows = conn.execute(
            "SELECT input_tokens, output_tokens, cache_read_tokens, "
            "cache_creation_tokens, cost_usd FROM agent_runs WHERE task_id=?",
            (tid,),
        ).fetchall()
    finally:
        conn.close()
    # `or 0` treats NULL columns as zero.
    sum_cost = sum(float(r["cost_usd"] or 0) for r in rows)
    sum_out = sum(int(r["output_tokens"] or 0) for r in rows)
    sum_in = sum(_input_total({"input_tokens": r["input_tokens"],
                               "cache_read_tokens": r["cache_read_tokens"],
                               "cache_creation_tokens": r["cache_creation_tokens"]})
                 for r in rows)

    text = N.render_task_tracker(tid)
    # Fail with the rendered text (not a bare IndexError) if the line is absent.
    totals = next((ln for ln in text.splitlines() if ln.startswith("💰")), None)
    assert totals is not None, text
    assert fmt_cost(sum_cost) in totals, totals
    assert f"{fmt_tokens(sum_in)}↓" in totals, totals
    assert f"{fmt_tokens(sum_out)}↑" in totals, totals


def test_tc07_sum_of_stage_lines_equals_totals_on_done():
    """On a done task with a developer retry, stage line and totals both sum runs.

    Checks two things: the 'Разработка' line shows the summed developer cost
    ($3.50, not the last run's $2.00), and the 💰 totals line shows the cost of
    all four runs ($4.00). Other stage lines are not inspected here.
    """
    tid = _mk_task(stage="done")
    _mk_run(tid, "analyst", "2026-06-04 09:00:00", "2026-06-04 09:10:00", cost=0.20)
    _mk_run(tid, "developer", "2026-06-04 09:20:00", "2026-06-04 09:30:00", cost=1.50)
    _mk_run(tid, "developer", "2026-06-04 09:30:00", "2026-06-04 09:45:00", cost=2.00)
    _mk_run(tid, "deployer", "2026-06-04 10:00:00", "2026-06-04 10:10:00", cost=0.30)

    text = N.render_task_tracker(tid)
    # Explicit presence checks: a missing line should fail as an assertion
    # showing the rendered text, not as IndexError / TypeError.
    totals = next((ln for ln in text.splitlines() if ln.startswith("💰")), None)
    assert totals is not None, text
    dev_line = _stage_line(text, "Разработка")
    assert dev_line is not None, text
    # developer stage line = Σ $3.50 (not $2.00), totals = $4.00.
    assert fmt_cost(3.50) in dev_line, dev_line
    assert fmt_cost(4.00) in totals, totals


# =========================================================================== #
# TC-08 / AC-7 — render_task_tracker never raises on broken/partial rows
# =========================================================================== #
def test_tc08_render_survives_null_timestamps_and_runs():
    """A run row full of NULLs must not make ``render_task_tracker`` raise.

    Inserts a developer run whose timestamps, exit code, token counts, cost and
    model are all NULL, then checks that rendering returns a non-empty string.
    """
    tid = _mk_task(stage="development")
    # Raw INSERT (not _mk_run): the cache-token and effort columns are left out
    # of the statement, and every listed value except task_id/agent is NULL.
    conn = get_db()
    # try/finally so a failing INSERT/commit cannot leak the connection.
    try:
        conn.execute(
            "INSERT INTO agent_runs (task_id, agent, started_at, finished_at, "
            "exit_code, input_tokens, output_tokens, cost_usd, model) "
            "VALUES (?,?,?,?,?,?,?,?,?)",
            (tid, "developer", None, None, None, None, None, None, None),
        )
        conn.commit()
    finally:
        conn.close()
    text = N.render_task_tracker(tid)  # must not raise
    assert isinstance(text, str) and text


def test_tc08_render_survives_bogus_stage():
    """Rendering a task whose stage is unrecognised returns text and keeps ✅ rows."""
    tid = _mk_task(stage="__bogus__")
    _mk_run(tid, "developer", "2026-06-04 09:00:00", "2026-06-04 09:10:00")

    text = N.render_task_tracker(tid)
    assert isinstance(text, str) and text

    # The finished developer run must still produce its ✅ line, i.e. an unknown
    # stage causes no spurious suppression (pre-ORCH-091 behaviour).
    # NOTE(review): presumably the renderer positions an unknown stage after
    # every real stage (current_pos = len(order)) — confirm in notifications.
    dev_row = _stage_line(text, "Разработка")
    assert dev_row is not None, text