# orchestrator/tests/test_coverage_gate.py

"""ORCH-027 / TC-01..TC-15: the coverage-gate leaf module (src/coverage_gate.py).

These exercise the DETERMINISTIC core: the pure verdict / delta / frontmatter
helpers (no binaries needed), the ratchet baseline against a real tmp SQLite DB,
the conditionality / kill-switch / fail-open behaviour with the measurer mocked,
never-raise, and the gate's integration into advance_stage / GET /queue.

Contract under test (ADR-001 §7):
  * the verdict is a deterministic pure function of (measured, baseline, floor,
    policy, epsilon) — no LLM, all border / epsilon cases covered;
  * the ratchet baseline only moves UP and bootstraps on the first merge;
  * conditionality: empty scope -> self-hosting only; out-of-scope -> no-op N/A;
    kill-switch off -> inert;
  * a coverage-tool error degrades fail-open + WARNING by default, fail-closed only
    when configured;
  * the machine verdict lives ONLY in the YAML frontmatter (read-back == written);
  * never-raise: any internal error -> a (bool, reason) pair, no exception escapes;
  * self-hosting safety: the gate never deploys / restarts prod / pushes main.
"""
import os
import tempfile

# These variables are set BEFORE the src.* imports further down (which is why
# those imports carry `noqa: E402`). Presumably the project settings are read
# from the environment at import time — TODO confirm against src/config.py.
# ORCH_DB_PATH is overwritten unconditionally with a file in the system temp dir.
os.environ["ORCH_DB_PATH"] = os.path.join(tempfile.gettempdir(), "test_coverage_gate.db")
# Placeholder tokens; setdefault leaves any value already present in the
# environment untouched.
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")

import pytest  # noqa: E402

import src.db as db  # noqa: E402
from src import config as cfg  # noqa: E402
from src import coverage_gate as cg  # noqa: E402

# Shared identifiers passed to the gate helpers throughout this module:
# repository name, feature branch and work-item key.
_REPO = "orchestrator"
_BRANCH = "feature/ORCH-027-code-coverage"
_WI = "ORCH-027"


@pytest.fixture(autouse=True)
def fresh_db(tmp_path, monkeypatch):
    """Give every test its own SQLite file and a known gate configuration.

    Defaults applied: gate enabled, empty repo scope, floor 80.0, policy
    ``both``, epsilon 0.5, tool errors fail-open, 900 s run timeout.
    """
    monkeypatch.setattr(db.settings, "db_path", str(tmp_path / "cov.db"))
    gate_defaults = {
        "coverage_gate_enabled": True,
        "coverage_gate_repos": "",
        "coverage_min_percent": 80.0,
        "coverage_policy": "both",
        "coverage_epsilon": 0.5,
        "coverage_tool_fail_closed": False,
        "coverage_run_timeout_s": 900,
    }
    # raising=False: the attribute is set even if settings lacks it.
    for option, value in gate_defaults.items():
        monkeypatch.setattr(cfg.settings, option, value, raising=False)
    db.init_db()
    yield


# ===========================================================================
# TC-01 — policy=absolute
# ===========================================================================
def test_tc01_policy_absolute():
    """TC-01: under ``absolute`` only the floor (minus epsilon) decides."""
    scenarios = (
        # (measured, baseline, floor, policy, epsilon), expected verdict
        ((85.0, None, 80.0, "absolute", 0.0), True),   # above the floor
        ((80.0, None, 80.0, "absolute", 0.0), True),   # exactly on the floor (>=)
        ((78.0, None, 80.0, "absolute", 0.5), False),  # under floor - epsilon
        ((85.0, 99.0, 80.0, "absolute", 0.0), True),   # a high baseline is ignored
    )
    for args, expected in scenarios:
        passed, _reason = cg.compute_coverage_verdict(*args)
        assert passed is expected


# ===========================================================================
# TC-02 — policy=baseline (no-regression / ratchet)
# ===========================================================================
def test_tc02_policy_baseline():
    """TC-02: under ``baseline`` only the no-regression check decides."""
    scenarios = (
        # (measured, baseline, floor, policy, epsilon), expected verdict
        ((90.0, 85.0, 0.0, "baseline", 0.0), True),    # above the baseline
        ((85.0, 85.0, 0.0, "baseline", 0.0), True),    # exactly on the baseline
        ((83.0, 85.0, 0.0, "baseline", 0.5), False),   # under baseline - epsilon
        ((40.0, 30.0, 80.0, "baseline", 0.0), True),   # floor is ignored here
    )
    for args, expected in scenarios:
        passed, _reason = cg.compute_coverage_verdict(*args)
        assert passed is expected

    # Bootstrap: with no baseline recorded there is nothing to regress against.
    passed, why = cg.compute_coverage_verdict(10.0, None, 80.0, "baseline", 0.0)
    assert passed is True
    assert "bootstrap" in why.lower()


# ===========================================================================
# TC-03 — policy=both (PASS only if BOTH hold)
# ===========================================================================
def test_tc03_policy_both():
    """TC-03: ``both`` passes only when the floor AND the baseline check hold."""
    scenarios = (
        # (measured, baseline, floor, policy, epsilon), expected verdict
        ((90.0, 85.0, 80.0, "both", 0.0), True),    # both conditions satisfied
        ((82.0, 80.0, 85.0, "both", 0.0), False),   # >= baseline but below floor
        ((84.0, 90.0, 80.0, "both", 0.0), False),   # >= floor but below baseline
        ((85.0, None, 80.0, "both", 0.0), True),    # bootstrap: floor alone decides
        ((70.0, None, 80.0, "both", 0.0), False),   # bootstrap, below floor
    )
    for args, expected in scenarios:
        passed, _reason = cg.compute_coverage_verdict(*args)
        assert passed is expected


# ===========================================================================
# TC-04 — epsilon tolerance (anti-flap, NFR-4)
# ===========================================================================
def test_tc04_epsilon_tolerance():
    """TC-04: shortfalls within epsilon pass; a negative epsilon gives no slack."""
    scenarios = (
        # (measured, baseline, floor, policy, epsilon), expected verdict
        ((84.7, 85.0, 80.0, "both", 0.5), True),       # 0.3 under baseline, eps 0.5
        ((79.7, 80.0, 0.0, "absolute", 0.5), True),    # 0.3 under floor, eps 0.5
        ((84.4, 85.0, 80.0, "baseline", 0.5), False),  # 0.6 under baseline: too far
        ((84.9, 85.0, 0.0, "baseline", -5.0), False),  # negative eps clamped to 0
    )
    for args, expected in scenarios:
        passed, _reason = cg.compute_coverage_verdict(*args)
        assert passed is expected


# ===========================================================================
# TC-05 — ratchet baseline (up only; never lowers)
# ===========================================================================
def test_tc05_ratchet_up_only():
    """TC-05: the stored baseline is seeded, raised, kept on ties, never lowered."""

    def stored():
        return db.get_coverage_baseline(_REPO)

    # Nothing recorded yet; the first ratchet seeds the value.
    assert stored() is None
    seeded = db.ratchet_coverage_baseline(_REPO, 80.0, "sha1")
    assert seeded is True
    assert stored() == pytest.approx(80.0)

    # A higher measurement moves the baseline up.
    raised = db.ratchet_coverage_baseline(_REPO, 85.0, "sha2")
    assert raised is True
    assert stored() == pytest.approx(85.0)

    # An equal measurement leaves the value where it is (return value not pinned).
    db.ratchet_coverage_baseline(_REPO, 85.0, "sha3")
    assert stored() == pytest.approx(85.0)

    # A lower measurement is rejected and the baseline stays put.
    lowered = db.ratchet_coverage_baseline(_REPO, 70.0, "sha4")
    assert lowered is False
    assert stored() == pytest.approx(85.0)


def test_tc05_ratchet_per_repo_isolated():
    """TC-05: each repository keeps its own independent baseline."""
    per_repo = {_REPO: 85.0, "enduro-trails": 42.0}
    for repo, pct in per_repo.items():
        db.ratchet_coverage_baseline(repo, pct, "s")
    for repo, pct in per_repo.items():
        assert db.get_coverage_baseline(repo) == pytest.approx(pct)


# ===========================================================================
# TC-06 — bootstrap baseline (first init from main measurement)
# ===========================================================================
def test_tc06_bootstrap(monkeypatch, tmp_path):
    """TC-06: with no baseline stored, the merge ratchet seeds it from the report."""
    report_file = tmp_path / "18.md"
    # Frontmatter with an intentionally blank baseline (note the trailing space).
    frontmatter_lines = [
        "---",
        "coverage_status: PASS",
        "work_item: ORCH-027",
        "measured_coverage: 77.50",
        "baseline: ",
        "floor: 0.00",
        "policy: both",
        "epsilon: 0.50",
        "delta: 0.00",
        "---",
        "# body",
        "",
    ]
    report_file.write_text("\n".join(frontmatter_lines), encoding="utf-8")
    monkeypatch.setattr(cg, "_report_path", lambda *a, **k: str(report_file))

    assert db.get_coverage_baseline(_REPO) is None
    seeded = cg.ratchet_baseline_on_merge(_REPO, _WI, _BRANCH, "sha")
    assert seeded is True
    assert db.get_coverage_baseline(_REPO) == pytest.approx(77.5)


# ===========================================================================
# TC-07 — conditionality applies(repo) (empty scope -> self-hosting only)
# ===========================================================================
def test_tc07_applies_self_hosting_only(monkeypatch):
    """TC-07: an empty scope applies the gate to ``orchestrator`` only."""
    monkeypatch.setattr(cfg.settings, "coverage_gate_repos", "", raising=False)
    expectations = {"orchestrator": True, "enduro-trails": False}
    for repo, expected in expectations.items():
        assert cg.coverage_gate_applies(repo) is expected


def test_tc07_applies_csv_scope(monkeypatch):
    """TC-07: a CSV scope (with stray whitespace) selects exactly the listed repos."""
    monkeypatch.setattr(cfg.settings, "coverage_gate_repos", "foo, enduro-trails", raising=False)
    expectations = {"enduro-trails": True, "orchestrator": False}
    for repo, expected in expectations.items():
        assert cg.coverage_gate_applies(repo) is expected


def test_tc07_out_of_scope_noop_no_measure(monkeypatch):
    """TC-07: an out-of-scope repo passes as N/A without running the measurer."""
    calls = {"n": 0}

    def _measure_spy(*args, **kwargs):
        # Counts invocations; the returned percentage is irrelevant here.
        calls["n"] += 1
        return 99.0

    monkeypatch.setattr(cg, "measure_coverage", _measure_spy)
    passed, why = cg.check_coverage_gate("enduro-trails", "ET-1", "feature/x")
    assert passed is True
    assert "N/A" in why
    assert calls["n"] == 0


# ===========================================================================
# TC-08 — kill-switch off -> inert (1:1 as before ORCH-027)
# ===========================================================================
def test_tc08_kill_switch_off(monkeypatch):
    """TC-08: with the kill-switch off the gate passes and measures nothing."""
    monkeypatch.setattr(cfg.settings, "coverage_gate_enabled", False, raising=False)
    calls = {"n": 0}

    def _measure_spy(*args, **kwargs):
        # Counts invocations so the test can prove the measurer stayed idle.
        calls["n"] += 1
        return 10.0

    monkeypatch.setattr(cg, "measure_coverage", _measure_spy)
    passed, why = cg.check_coverage_gate(_REPO, _WI, _BRANCH)
    assert passed is True
    assert "disabled" in why
    assert calls["n"] == 0
    # Even the self-hosting repo is reported as out of scope while disabled.
    assert cg.coverage_gate_applies(_REPO) is False


# ===========================================================================
# TC-09 — fail-open by default on a tool error; fail-closed when configured
# ===========================================================================
def test_tc09_fail_open_default(monkeypatch, tmp_path):
    """TC-09: a measurer error (``None``) passes fail-open and is recorded as PASS."""
    report_file = tmp_path / "18.md"
    # Returning None simulates a coverage-tool failure.
    monkeypatch.setattr(cg, "measure_coverage", lambda *a, **k: None)
    monkeypatch.setattr(cg, "_report_path", lambda *a, **k: str(report_file))

    passed, why = cg.check_coverage_gate(_REPO, _WI, _BRANCH)

    assert passed is True
    assert "fail-open" in why.lower()
    # The written report carries the fail-open PASS verdict.
    written = report_file.read_text(encoding="utf-8")
    assert "coverage_status: PASS" in written


def test_tc09_fail_closed_when_configured(monkeypatch, tmp_path):
    """TC-09: with fail-closed configured, a measurer error is recorded as FAIL."""
    report_file = tmp_path / "18.md"
    monkeypatch.setattr(cfg.settings, "coverage_tool_fail_closed", True, raising=False)
    # Returning None simulates a coverage-tool failure.
    monkeypatch.setattr(cg, "measure_coverage", lambda *a, **k: None)
    monkeypatch.setattr(cg, "_report_path", lambda *a, **k: str(report_file))

    passed, why = cg.check_coverage_gate(_REPO, _WI, _BRANCH)

    assert passed is False
    assert "fail-closed" in why.lower()
    written = report_file.read_text(encoding="utf-8")
    assert "coverage_status: FAIL" in written


# ===========================================================================
# TC-10 — never-raise (broken inputs / internal error never escape)
# ===========================================================================
def test_tc10_verdict_never_raises_on_bad_inputs():
    """TC-10: a non-numeric measurement yields a FAIL pair instead of raising."""
    verdict = cg.compute_coverage_verdict("not-a-number", None, 80.0, "both", 0.5)
    passed, why = verdict
    assert passed is False
    assert "bad inputs" in why


def test_tc10_parse_coverage_percent_tolerant():
    """TC-10: a well-formed totals block parses; malformed payloads give ``None``."""
    well_formed = {"totals": {"percent_covered": 73.2}}
    assert cg.parse_coverage_percent(well_formed) == pytest.approx(73.2)

    # Empty dict, wrong type, and totals without the percentage key.
    for malformed in ({}, "garbage", {"totals": {}}):
        assert cg.parse_coverage_percent(malformed) is None


def test_tc10_check_never_raises(monkeypatch, tmp_path):
    """TC-10: an exception inside the measurer never escapes the gate.

    ``measure_coverage`` is replaced by a function that raises; the gate must
    swallow the error and still hand back a ``(bool, reason)`` pair whose
    reason mentions the fail-open error path.

    The report path is redirected into ``tmp_path`` (as the TC-09 tests do) so
    that, should the gate write a report on this path, nothing is created
    outside the per-test sandbox.
    """
    def _boom(*a, **k):
        raise RuntimeError("coverage exploded")

    monkeypatch.setattr(cg, "measure_coverage", _boom)
    monkeypatch.setattr(cg, "_report_path", lambda *a, **k: str(tmp_path / "18.md"))
    ok, reason = cg.check_coverage_gate(_REPO, _WI, _BRANCH)
    assert isinstance(ok, bool)
    assert "error (fail-open)" in reason


def test_tc10_ratchet_never_raises_on_missing_report(monkeypatch, tmp_path):
    """TC-10: the merge ratchet returns ``False`` when the report file is absent."""
    absent_report = tmp_path / "nope.md"
    monkeypatch.setattr(cg, "_report_path", lambda *a, **k: str(absent_report))
    ratcheted = cg.ratchet_baseline_on_merge(_REPO, _WI, _BRANCH, "sha")
    assert ratcheted is False


# ===========================================================================
# TC-11 — write/read report; single source of truth via frontmatter
# ===========================================================================
def test_tc11_report_roundtrip(tmp_path):
    """TC-11: what the renderer writes is what the frontmatter parsers read back."""
    pass_fields = dict(
        coverage_status="PASS",
        measured_coverage=88.25,
        baseline=85.0,
        floor=80.0,
        policy="both",
        epsilon=0.5,
        delta=3.25,
        reason="ok",
        measurement="pytest --cov=src: 88.25%",
        policy_detail="policy=both",
    )
    rendered = cg.render_coverage_report(_WI, pass_fields)

    # The machine-readable status key is present and parses as PASS.
    passed, verdict = cg.parse_coverage_status(rendered)
    assert passed is True
    assert "PASS" in verdict
    # The measured value is read back from the very same document.
    assert cg.read_measured_coverage(rendered) == pytest.approx(88.25)

    # Same fields with the status flipped: the FAIL token must win.
    fail_fields = {**pass_fields, "coverage_status": "FAIL"}
    rendered = cg.render_coverage_report(_WI, fail_fields)
    passed, verdict = cg.parse_coverage_status(rendered)
    assert passed is False
    assert "FAIL" in verdict


def test_tc11_parse_missing_frontmatter():
    """TC-11: text without frontmatter parses as FAIL and has no measured value."""
    passed, why = cg.parse_coverage_status("no frontmatter here")
    assert passed is False
    assert "coverage_status" in why

    measured = cg.read_measured_coverage("no frontmatter")
    assert measured is None


def test_tc11_bootstrap_report_blank_baseline():
    """TC-11: a ``None`` baseline renders as an empty field that still parses."""
    bootstrap_fields = {
        "coverage_status": "PASS",
        "measured_coverage": 50.0,
        "baseline": None,
        "floor": 0.0,
        "policy": "both",
        "epsilon": 0.5,
        "delta": 0.0,
    }
    rendered = cg.render_coverage_report(_WI, bootstrap_fields)

    # Either spelling of an empty value is acceptable (with or without a space).
    blank_markers = ("baseline: \n", "baseline:\n")
    assert any(marker in rendered for marker in blank_markers)

    passed, _verdict = cg.parse_coverage_status(rendered)
    assert passed is True


# ===========================================================================
# TC-12 — self-hosting safety: the leaf imports no engine, touches no prod
# ===========================================================================
def test_tc12_leaf_no_engine_import():
    """TC-12: the leaf imports no engine module and holds no deploy-command literal.

    The check walks the module's AST rather than grepping its text, so prose
    that merely mentions these words inside a longer string does not trip it:
    only a string constant EXACTLY equal to a forbidden token fails.
    """
    import ast
    import inspect

    module_ast = ast.parse(inspect.getsource(cg))

    imported: set[str] = set()
    for node in ast.walk(module_ast):
        if isinstance(node, ast.Import):
            imported.update(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom) and node.module:
            imported.add(node.module)

    assert all("stage_engine" not in mod for mod in imported), imported
    assert all(
        "launcher" not in mod and "self_deploy" not in mod for mod in imported
    ), imported

    # Every string constant in the module, compared by exact equality below.
    string_literals = {
        node.value
        for node in ast.walk(module_ast)
        if isinstance(node, ast.Constant) and isinstance(node.value, str)
    }
    for forbidden in ("compose", "--force-with-lease", "--force", "docker"):
        assert forbidden not in string_literals, f"coverage_gate leaf must not run {forbidden!r}"


def test_tc12_delta_signed():
    """TC-12: the delta is signed; with no reference values it is 0."""
    scenarios = (
        # (measured, baseline, floor), expected delta
        ((85.0, 80.0, 70.0), 5.0),    # above the larger of baseline / floor
        ((75.0, 80.0, 70.0), -5.0),   # below it -> negative
        ((50.0, None, None), 0.0),    # no reference at all
    )
    for args, expected in scenarios:
        assert cg.compute_delta(*args) == pytest.approx(expected)


# ===========================================================================
# TC-13 — gate integration into advance_stage (rollback on FAIL, retry++)
# ===========================================================================
def test_tc13_advance_rolls_back_on_fail(monkeypatch):
    """TC-13: a failing coverage check rolls back to development and re-queues.

    All engine side effects are stubbed; the test observes which check was
    invoked, that one job was enqueued, and that the merge lease was released.
    """
    from src import stage_engine as se

    seen = {}
    lease_releases = {"n": 0}
    enqueued = {"n": 0}

    def _failing_qg(name, repo, wi, branch):
        seen["qg"] = name
        return (False, "measured=70.00% policy=both: absolute FAIL")

    def _noop(*args, **kwargs):
        return None

    def _count_release(*args, **kwargs):
        lease_releases["n"] += 1

    def _count_enqueue(*args, **kwargs):
        enqueued["n"] += 1
        return 123

    monkeypatch.setattr(se, "_run_qg", _failing_qg)
    # Notification / tracker side effects are irrelevant here: silence them.
    for side_effect in (
        "update_task_stage",
        "notify_stage_change",
        "plane_notify_stage",
        "set_issue_in_progress",
        "notify_qg_failure",
        "plane_add_comment",
    ):
        monkeypatch.setattr(se, side_effect, _noop)
    monkeypatch.setattr(se, "_developer_retry_count", lambda *a, **k: 0)
    monkeypatch.setattr(se.merge_gate, "release_merge_lease", _count_release)
    monkeypatch.setattr(se, "enqueue_job", _count_enqueue)

    outcome = se.AdvanceResult()
    intervened = se._handle_coverage_gate(
        1, "deploy-staging", _REPO, _WI, _BRANCH, "deployer", outcome
    )

    assert intervened is True
    assert seen["qg"] == "check_coverage_gate"
    assert outcome.rolled_back_to == "development"
    assert outcome.enqueued_agent == "developer"
    assert enqueued["n"] == 1
    # The merge lease is released on the coverage rollback (ADR-001 D1/TR-2).
    assert lease_releases["n"] == 1


def test_tc13_advance_passes_through_on_ok(monkeypatch):
    """TC-13: a passing coverage check does not intervene or roll back."""
    from src import stage_engine as se

    def _passing_qg(*args, **kwargs):
        return (True, "coverage OK")

    monkeypatch.setattr(se, "_run_qg", _passing_qg)
    outcome = se.AdvanceResult()
    intervened = se._handle_coverage_gate(
        1, "deploy-staging", _REPO, _WI, _BRANCH, "deployer", outcome
    )
    assert intervened is False
    assert outcome.rolled_back_to is None


# ===========================================================================
# TC-14 — real measurement on a minimal fixture repo (pytest --cov in worktree)
# ===========================================================================
def test_tc14_real_measurement(tmp_path, monkeypatch):
    """TC-14: measure a tiny on-disk project for real and sanity-check the figure."""
    # Lay out a minimal project: one module with two functions, one of them tested.
    project = tmp_path / "fixture_repo"
    src_dir = project / "src"
    tests_dir = project / "tests"
    src_dir.mkdir(parents=True)
    tests_dir.mkdir()

    (src_dir / "__init__.py").write_text("", encoding="utf-8")
    module_source = "def covered():\n    return 1\n\n\ndef uncovered():\n    return 2\n"
    (src_dir / "mod.py").write_text(module_source, encoding="utf-8")
    test_source = "from src.mod import covered\n\n\ndef test_covered():\n    assert covered() == 1\n"
    (tests_dir / "test_mod.py").write_text(test_source, encoding="utf-8")

    # Make the measurer resolve its worktree to the fixture project.
    monkeypatch.setattr(cg, "ensure_worktree", lambda repo, branch: str(project))
    measured = cg.measure_coverage(_REPO, _BRANCH)

    assert measured is not None
    # mod.py has 4 statements and only uncovered()'s body is never run (~75%);
    # assert a generous band rather than an exact figure.
    assert 50.0 <= measured <= 90.0
    # The scratch JSON report must not be left behind.
    assert not (project / ".coverage-report.json").exists()


def test_tc14_measure_timeout_returns_none(monkeypatch, tmp_path):
    """TC-14: a subprocess timeout during measurement yields ``None``.

    ``subprocess.run`` (as seen through ``cg.subprocess``) is replaced by a
    stub that raises ``TimeoutExpired``; ``measure_coverage`` must report
    ``None`` rather than raise.

    The worktree resolves to the per-test ``tmp_path`` instead of a hard-coded
    ``/tmp``: that directory always exists, on every platform, and is not
    shared with other processes.
    """
    import subprocess

    monkeypatch.setattr(cg, "ensure_worktree", lambda r, b: str(tmp_path))

    def _timeout(*a, **k):
        raise subprocess.TimeoutExpired(cmd="pytest", timeout=1)

    monkeypatch.setattr(cg.subprocess, "run", _timeout)
    assert cg.measure_coverage(_REPO, _BRANCH) is None


# ===========================================================================
# TC-15 — observability (snapshot block) + registry compatibility unchanged
# ===========================================================================
def test_tc15_snapshot_shape(monkeypatch):
    """TC-15: the snapshot exposes the gate config and the per-repo baselines."""
    db.ratchet_coverage_baseline(_REPO, 81.0, "sha")
    report = cg.snapshot()

    # Configuration echoed from the fixture defaults.
    assert report["enabled"] is True
    assert report["policy"] == "both"
    assert report["floor"] == pytest.approx(80.0)

    # The baseline just stored shows up under its repository.
    assert "baselines" in report
    baselines = report["baselines"]
    assert _REPO in baselines
    assert baselines[_REPO]["coverage"] == pytest.approx(81.0)


def test_tc15_snapshot_never_raises(monkeypatch):
    """TC-15: a failing baseline lookup yields an empty ``baselines`` block."""

    def _broken_lookup():
        raise RuntimeError("boom")

    monkeypatch.setattr(db, "all_coverage_baselines", _broken_lookup)
    report = cg.snapshot()
    assert report["enabled"] is True
    assert report["baselines"] == {}


def test_tc15_registry_and_transitions_unchanged():
    """TC-15: the new check is registered without disturbing existing wiring."""
    from src.qg.checks import QG_CHECKS
    from src.stages import STAGE_TRANSITIONS

    # The coverage check itself is registered...
    assert "check_coverage_gate" in QG_CHECKS

    # ...and the existing verdict checks are still present under the same names.
    existing_checks = (
        "check_ci_green",
        "check_tests_passed",
        "check_security_gate",
        "check_staging_status",
        "check_staging_image_fresh",
        "check_branch_mergeable",
    )
    for name in existing_checks:
        assert name in QG_CHECKS

    # Coverage is an edge sub-gate: no stage transition spec may mention it.
    for spec in STAGE_TRANSITIONS.values():
        assert "check_coverage_gate" not in str(spec)