Files
orchestrator/tests/test_coverage_gate.py
claude-bot eadfd8419b feat(coverage): deterministic test-coverage gate on deploy-staging->deploy edge (ORCH-027)
Introduce a deterministic (no-LLM) coverage sub-gate that blocks coverage
degradation before a task branch merges into `main`. Existing gates judge only by
the FACT of passing (check_ci_green / check_tests_passed / merge-gate re-test), not
by completeness — so a batch autonomous run (ORCH-088) silently erodes coverage.

Pattern mirrors the security-gate (ORCH-022): leaf src/coverage_gate.py (never-raise)
+ thin check_coverage_gate in QG_CHECKS + _handle_coverage_gate splice in advance_stage,
run AFTER merge-gate (measured on the caught-up HEAD that lands in main) and BEFORE
image-freshness (fail before the expensive docker rebuild).

- measure_coverage: pytest --cov=src --cov-report=json in the per-branch worktree ->
  line coverage %; None on tool error -> fail-open + WARNING by default (FR-6).
- compute_coverage_verdict (pure): absolute | baseline | both + epsilon (NFR-4 anti-flap);
  baseline None -> bootstrap (absolute-only).
- coverage_baseline DB table (additive, CREATE TABLE IF NOT EXISTS) + ratchet-up in
  _handle_merge_verify (deploy->done): atomic compare-and-set under merge-lease, never
  decreases; bootstrap on first merge.
- Artefact 18-coverage-report.md (coverage_status: frontmatter, single source of truth);
  GET /queue `coverage` block; FAIL -> Telegram; optional POST /coverage/baseline override.
- Flags ORCH_COVERAGE_* (kill-switch + self-hosting-only scope) -> enduro untouched;
  STAGE_TRANSITIONS / existing check_* / verdict keys byte-for-byte unchanged (NFR-5/AC-8).
- pytest-cov==5.0.0 added to requirements.txt.

Tests: tests/test_coverage_gate.py (TC-01..TC-15). Frozen QG-registry anti-regress
tests + deploy-staging edge tests updated for the new sub-gate. Full suite green.

Docs: README / adr-0029 / PIPELINE_DOCS / 18-coverage-report.md template (architecture
stage) + CHANGELOG / CLAUDE.md / .env.example (this PR).

Refs: ORCH-027
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-10 01:26:24 +03:00

472 lines
21 KiB
Python

"""ORCH-027 / TC-01..TC-15: the coverage-gate leaf module (src/coverage_gate.py).
These exercise the DETERMINISTIC core: the pure verdict / delta / frontmatter
helpers (no binaries needed), the ratchet baseline against a real tmp SQLite DB,
the conditionality / kill-switch / fail-open behaviour with the measurer mocked,
never-raise, and the gate's integration into advance_stage / GET /queue.

Contract under test (ADR-001 §7):
  * the verdict is a deterministic pure function of (measured, baseline, floor,
    policy, epsilon) — no LLM, all border / epsilon cases covered;
  * the ratchet baseline only moves UP and bootstraps on the first merge;
  * conditionality: empty scope -> self-hosting only; out-of-scope -> no-op N/A;
    kill-switch off -> inert;
  * a coverage-tool error degrades fail-open + WARNING by default, fail-closed only
    when configured;
  * the machine verdict lives ONLY in the YAML frontmatter (read-back == written);
  * never-raise: any internal error -> a (bool, reason) pair, no exception escapes;
  * self-hosting safety: the gate never deploys / restarts prod / pushes main.
"""
import os
import tempfile

# Point the orchestrator at a throwaway DB file and dummy API tokens BEFORE any
# `src.*` module is imported.
# NOTE(review): presumably the settings object reads these variables at import
# time, which is why the imports below follow the assignments (and carry
# `noqa: E402`) — confirm against src/config.py.
os.environ["ORCH_DB_PATH"] = os.path.join(tempfile.gettempdir(), "test_coverage_gate.db")
# setdefault: tokens that are already present in the environment are left untouched.
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")

import pytest  # noqa: E402

import src.db as db  # noqa: E402
from src import config as cfg  # noqa: E402
from src import coverage_gate as cg  # noqa: E402

# Repo / branch / work-item identifiers shared by the tests below.
_REPO = "orchestrator"
_BRANCH = "feature/ORCH-027-code-coverage"
_WI = "ORCH-027"
@pytest.fixture(autouse=True)
def fresh_db(tmp_path, monkeypatch):
    """Give every test its own SQLite file and a known coverage-gate configuration.

    Defaults applied: gate enabled, empty repo scope, floor 80.0, policy "both",
    epsilon 0.5, tool errors fail-open, 900 s run timeout.
    """
    monkeypatch.setattr(db.settings, "db_path", str(tmp_path / "cov.db"))

    gate_defaults = {
        "coverage_gate_enabled": True,
        "coverage_gate_repos": "",
        "coverage_min_percent": 80.0,
        "coverage_policy": "both",
        "coverage_epsilon": 0.5,
        "coverage_tool_fail_closed": False,
        "coverage_run_timeout_s": 900,
    }
    # raising=False: set the option even if the settings object lacks the attribute.
    for option, value in gate_defaults.items():
        monkeypatch.setattr(cfg.settings, option, value, raising=False)

    db.init_db()
    yield
# ===========================================================================
# TC-01 — policy=absolute
# ===========================================================================
def test_tc01_policy_absolute():
    """TC-01: with policy="absolute" only the floor (minus epsilon) decides."""
    scenarios = [
        # (measured, baseline, floor, epsilon, expected verdict)
        (85.0, None, 80.0, 0.0, True),   # above the floor
        (80.0, None, 80.0, 0.0, True),   # exactly on the floor (>=)
        (78.0, None, 80.0, 0.5, False),  # below floor - epsilon
        (85.0, 99.0, 80.0, 0.0, True),   # a higher baseline is ignored
    ]
    for measured, baseline, floor, epsilon, expected in scenarios:
        passed, _reason = cg.compute_coverage_verdict(
            measured, baseline, floor, "absolute", epsilon
        )
        assert passed is expected
# ===========================================================================
# TC-02 — policy=baseline (no-regression / ratchet)
# ===========================================================================
def test_tc02_policy_baseline():
    """TC-02: with policy="baseline" only no-regression vs the baseline decides."""
    scenarios = [
        # (measured, baseline, floor, epsilon, expected verdict)
        (90.0, 85.0, 0.0, 0.0, True),    # above the baseline
        (85.0, 85.0, 0.0, 0.0, True),    # exactly on the baseline
        (83.0, 85.0, 0.0, 0.5, False),   # below baseline - epsilon
        (40.0, 30.0, 80.0, 0.0, True),   # under the floor, but the floor is ignored
    ]
    for measured, baseline, floor, epsilon, expected in scenarios:
        passed, _reason = cg.compute_coverage_verdict(
            measured, baseline, floor, "baseline", epsilon
        )
        assert passed is expected

    # Bootstrap: no stored baseline yet, so there is nothing to regress against.
    passed, why = cg.compute_coverage_verdict(10.0, None, 80.0, "baseline", 0.0)
    assert passed is True
    assert "bootstrap" in why.lower()
# ===========================================================================
# TC-03 — policy=both (PASS only if BOTH hold)
# ===========================================================================
def test_tc03_policy_both():
    """TC-03: policy="both" passes only when the floor AND the baseline hold."""
    scenarios = [
        # (measured, baseline, floor, expected verdict)
        (90.0, 85.0, 80.0, True),    # both conditions hold
        (82.0, 80.0, 85.0, False),   # >= baseline but under the floor
        (84.0, 90.0, 80.0, False),   # >= floor but under the baseline
        (85.0, None, 80.0, True),    # bootstrap: the floor alone decides (pass)
        (70.0, None, 80.0, False),   # bootstrap: the floor alone decides (fail)
    ]
    for measured, baseline, floor, expected in scenarios:
        passed, _reason = cg.compute_coverage_verdict(measured, baseline, floor, "both", 0.0)
        assert passed is expected
# ===========================================================================
# TC-04 — epsilon tolerance (anti-flap, NFR-4)
# ===========================================================================
def test_tc04_epsilon_tolerance():
    """TC-04: epsilon gives a small tolerance under the baseline and the floor (NFR-4).

    Arguments follow the order used throughout this module:
    (measured, baseline, floor, policy, epsilon).
    """
    # measured 0.3% under baseline, epsilon 0.5 -> still PASS (within noise)
    ok, _ = cg.compute_coverage_verdict(84.7, 85.0, 80.0, "both", 0.5)
    assert ok is True

    # measured 0.3% under floor, epsilon 0.5 -> still PASS.
    # The floor is the THIRD argument; the previous call passed 80.0 as the
    # baseline with floor=0.0, so the floor tolerance was never actually tested.
    ok, _ = cg.compute_coverage_verdict(79.7, None, 80.0, "absolute", 0.5)
    assert ok is True

    # just beyond epsilon -> FAIL
    ok, _ = cg.compute_coverage_verdict(84.4, 85.0, 80.0, "baseline", 0.5)
    assert ok is False

    # negative epsilon is clamped to 0 (no negative tolerance)
    ok, _ = cg.compute_coverage_verdict(84.9, 85.0, 0.0, "baseline", -5.0)
    assert ok is False
# ===========================================================================
# TC-05 — ratchet baseline (up only; never lowers)
# ===========================================================================
def test_tc05_ratchet_up_only():
    """TC-05: the stored baseline is seeded once, rises, and is never lowered."""

    def stored():
        return db.get_coverage_baseline(_REPO)

    # Nothing stored yet; the first ratchet seeds the baseline.
    assert stored() is None
    seeded = db.ratchet_coverage_baseline(_REPO, 80.0, "sha1")
    assert seeded is True
    assert stored() == pytest.approx(80.0)

    # A higher measurement raises it.
    raised = db.ratchet_coverage_baseline(_REPO, 85.0, "sha2")
    assert raised is True
    assert stored() == pytest.approx(85.0)

    # An equal measurement leaves the value where it was (return value not pinned).
    db.ratchet_coverage_baseline(_REPO, 85.0, "sha3")
    assert stored() == pytest.approx(85.0)

    # A lower measurement is rejected and the baseline stays put.
    lowered = db.ratchet_coverage_baseline(_REPO, 70.0, "sha4")
    assert lowered is False
    assert stored() == pytest.approx(85.0)
def test_tc05_ratchet_per_repo_isolated():
    """TC-05: each repo keeps its own baseline row."""
    seeds = ((_REPO, 85.0), ("enduro-trails", 42.0))
    for repo, coverage in seeds:
        db.ratchet_coverage_baseline(repo, coverage, "s")
    for repo, coverage in seeds:
        assert db.get_coverage_baseline(repo) == pytest.approx(coverage)
# ===========================================================================
# TC-06 — bootstrap baseline (first init from main measurement)
# ===========================================================================
def test_tc06_bootstrap(monkeypatch, tmp_path):
    """TC-06: with no baseline stored, the merge ratchet seeds it from the report."""
    artefact = tmp_path / "18.md"
    report = (
        "---\n"
        "coverage_status: PASS\n"
        "work_item: ORCH-027\n"
        "measured_coverage: 77.50\n"
        "baseline: \n"
        "floor: 0.00\n"
        "policy: both\n"
        "epsilon: 0.50\n"
        "delta: 0.00\n"
        "---\n"
        "# body\n"
    )
    artefact.write_text(report, encoding="utf-8")

    def _fixed_report_path(*args, **kwargs):
        return str(artefact)

    monkeypatch.setattr(cg, "_report_path", _fixed_report_path)

    assert db.get_coverage_baseline(_REPO) is None
    seeded = cg.ratchet_baseline_on_merge(_REPO, _WI, _BRANCH, "sha")
    assert seeded is True
    assert db.get_coverage_baseline(_REPO) == pytest.approx(77.5)
# ===========================================================================
# TC-07 — conditionality applies(repo) (empty scope -> self-hosting only)
# ===========================================================================
def test_tc07_applies_self_hosting_only(monkeypatch):
    """TC-07: an empty repo scope applies the gate to "orchestrator" only."""
    monkeypatch.setattr(cfg.settings, "coverage_gate_repos", "", raising=False)
    expectations = (("orchestrator", True), ("enduro-trails", False))
    for repo, expected in expectations:
        assert cg.coverage_gate_applies(repo) is expected
def test_tc07_applies_csv_scope(monkeypatch):
    """TC-07: an explicit CSV scope applies the gate to the listed repos only."""
    scope = "foo, enduro-trails"
    monkeypatch.setattr(cfg.settings, "coverage_gate_repos", scope, raising=False)
    expectations = (("enduro-trails", True), ("orchestrator", False))
    for repo, expected in expectations:
        assert cg.coverage_gate_applies(repo) is expected
def test_tc07_out_of_scope_noop_no_measure(monkeypatch):
    """TC-07: an out-of-scope repo passes as N/A without calling the measurer."""
    measure_calls = []

    def _spy_measure(*args, **kwargs):
        measure_calls.append((args, kwargs))
        return 99.0

    monkeypatch.setattr(cg, "measure_coverage", _spy_measure)

    verdict, why = cg.check_coverage_gate("enduro-trails", "ET-1", "feature/x")
    assert verdict is True
    assert "N/A" in why
    # The expensive measurement must not have been triggered.
    assert len(measure_calls) == 0
# ===========================================================================
# TC-08 — kill-switch off -> inert (1:1 as before ORCH-027)
# ===========================================================================
def test_tc08_kill_switch_off(monkeypatch):
    """TC-08: with the kill-switch off the gate passes as disabled and measures nothing."""
    monkeypatch.setattr(cfg.settings, "coverage_gate_enabled", False, raising=False)

    measure_calls = []

    def _spy_measure(*args, **kwargs):
        measure_calls.append((args, kwargs))
        return 10.0

    monkeypatch.setattr(cg, "measure_coverage", _spy_measure)

    verdict, why = cg.check_coverage_gate(_REPO, _WI, _BRANCH)
    assert verdict is True
    assert "disabled" in why
    assert len(measure_calls) == 0
    # The applicability helper also reports the gate as off.
    assert cg.coverage_gate_applies(_REPO) is False
# ===========================================================================
# TC-09 — fail-open by default on a tool error; fail-closed when configured
# ===========================================================================
def test_tc09_fail_open_default(monkeypatch, tmp_path):
    """TC-09: a measurer returning None passes fail-open and the report says PASS."""
    report_file = tmp_path / "18.md"

    def _tool_error(*args, **kwargs):
        return None

    def _fixed_report_path(*args, **kwargs):
        return str(report_file)

    monkeypatch.setattr(cg, "measure_coverage", _tool_error)
    monkeypatch.setattr(cg, "_report_path", _fixed_report_path)

    verdict, why = cg.check_coverage_gate(_REPO, _WI, _BRANCH)
    assert verdict is True
    assert "fail-open" in why.lower()

    # The written report carries the fail-open PASS.
    written = report_file.read_text(encoding="utf-8")
    assert "coverage_status: PASS" in written
def test_tc09_fail_closed_when_configured(monkeypatch, tmp_path):
    """TC-09: with fail-closed configured, a measurer returning None fails the gate."""
    report_file = tmp_path / "18.md"

    def _tool_error(*args, **kwargs):
        return None

    def _fixed_report_path(*args, **kwargs):
        return str(report_file)

    monkeypatch.setattr(cfg.settings, "coverage_tool_fail_closed", True, raising=False)
    monkeypatch.setattr(cg, "measure_coverage", _tool_error)
    monkeypatch.setattr(cg, "_report_path", _fixed_report_path)

    verdict, why = cg.check_coverage_gate(_REPO, _WI, _BRANCH)
    assert verdict is False
    assert "fail-closed" in why.lower()

    written = report_file.read_text(encoding="utf-8")
    assert "coverage_status: FAIL" in written
# ===========================================================================
# TC-10 — never-raise (broken inputs / internal error never escape)
# ===========================================================================
def test_tc10_verdict_never_raises_on_bad_inputs():
    """TC-10: a non-numeric measurement yields a FAIL pair instead of an exception."""
    outcome = cg.compute_coverage_verdict("not-a-number", None, 80.0, "both", 0.5)
    verdict, why = outcome
    assert verdict is False
    assert "bad inputs" in why
def test_tc10_parse_coverage_percent_tolerant():
    """TC-10: the percent parser reads totals.percent_covered, else returns None."""
    well_formed = {"totals": {"percent_covered": 73.2}}
    assert cg.parse_coverage_percent(well_formed) == pytest.approx(73.2)

    for malformed in ({}, "garbage", {"totals": {}}):
        assert cg.parse_coverage_percent(malformed) is None
def test_tc10_check_never_raises(monkeypatch):
    """TC-10: an exception inside the measurer comes back as a (bool, reason) pair."""

    def _exploding_measure(*args, **kwargs):
        raise RuntimeError("coverage exploded")

    monkeypatch.setattr(cg, "measure_coverage", _exploding_measure)

    verdict, why = cg.check_coverage_gate(_REPO, _WI, _BRANCH)
    # Only the type of the verdict is pinned; the reason must flag the fail-open error.
    assert isinstance(verdict, bool)
    assert "error (fail-open)" in why
def test_tc10_ratchet_never_raises_on_missing_report(monkeypatch, tmp_path):
    """TC-10: ratcheting with no report file on disk returns False instead of raising."""
    absent_report = tmp_path / "nope.md"

    def _missing_report_path(*args, **kwargs):
        return str(absent_report)

    monkeypatch.setattr(cg, "_report_path", _missing_report_path)
    ratcheted = cg.ratchet_baseline_on_merge(_REPO, _WI, _BRANCH, "sha")
    assert ratcheted is False
# ===========================================================================
# TC-11 — write/read report; single source of truth via frontmatter
# ===========================================================================
def test_tc11_report_roundtrip(tmp_path):
    """TC-11: a rendered report parses back to the status and coverage it was given."""
    fields = dict(
        coverage_status="PASS",
        measured_coverage=88.25,
        baseline=85.0,
        floor=80.0,
        policy="both",
        epsilon=0.5,
        delta=3.25,
        reason="ok",
        measurement="pytest --cov=src: 88.25%",
        policy_detail="policy=both",
    )

    # PASS: the machine key is present and parses back as a pass.
    rendered = cg.render_coverage_report(_WI, fields)
    passed, verdict_text = cg.parse_coverage_status(rendered)
    assert passed is True
    assert "PASS" in verdict_text
    # The measured value is read back from the very same rendered text.
    assert cg.read_measured_coverage(rendered) == pytest.approx(88.25)

    # FAIL: flipping only the status flips the parsed verdict.
    fields.update(coverage_status="FAIL")
    rendered = cg.render_coverage_report(_WI, fields)
    passed, verdict_text = cg.parse_coverage_status(rendered)
    assert passed is False
    assert "FAIL" in verdict_text
def test_tc11_parse_missing_frontmatter():
    """TC-11: text without frontmatter parses as a FAIL and has no measured value."""
    status = cg.parse_coverage_status("no frontmatter here")
    assert status[0] is False
    assert "coverage_status" in status[1]

    measured = cg.read_measured_coverage("no frontmatter")
    assert measured is None
def test_tc11_bootstrap_report_blank_baseline():
    """TC-11: a None baseline renders as an empty field and the report still parses."""
    bootstrap_fields = dict(
        coverage_status="PASS",
        measured_coverage=50.0,
        baseline=None,
        floor=0.0,
        policy="both",
        epsilon=0.5,
        delta=0.0,
    )
    rendered = cg.render_coverage_report(_WI, bootstrap_fields)

    # Either spelling of the blank field is accepted (with or without the space).
    blank_forms = ("baseline: \n", "baseline:\n")
    assert any(form in rendered for form in blank_forms)

    passed, _verdict_text = cg.parse_coverage_status(rendered)
    assert passed is True
# ===========================================================================
# TC-12 — self-hosting safety: the leaf imports no engine, touches no prod
# ===========================================================================
def test_tc12_leaf_no_engine_import():
    """TC-12: the leaf never imports the engine and holds no deploy-command literals.

    The check is AST-based (not a text search): every ``import`` /
    ``from ... import`` node in the leaf's source is collected, including those
    nested inside functions, and none may mention ``stage_engine``, ``launcher``
    or ``self_deploy``. String constants are then compared by exact equality
    against a few deploy / force-push command tokens.
    """
    import ast
    import inspect

    tree = ast.parse(inspect.getsource(cg))

    imported: set[str] = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            imported.update(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom):
            module = node.module or ""
            if module:
                imported.add(module)
            # `from src import stage_engine` and `from . import stage_engine` carry
            # the module in the alias list (node.module is "src" / None), so the
            # imported names must be recorded too or those forms slip past the guard.
            for alias in node.names:
                imported.add(f"{module}.{alias.name}" if module else alias.name)

    assert not any("stage_engine" in m for m in imported), imported
    assert not any(("launcher" in m or "self_deploy" in m) for m in imported), imported

    # No deploy / restart / force-push command tokens used as actual string literals.
    # Exact-match on whole constants, so prose mentioning these words is not flagged.
    consts = {
        n.value for n in ast.walk(tree)
        if isinstance(n, ast.Constant) and isinstance(n.value, str)
    }
    for forbidden in ("compose", "--force-with-lease", "--force", "docker"):
        assert forbidden not in consts, f"coverage_gate leaf must not run {forbidden!r}"
def test_tc12_delta_signed():
    """TC-12: compute_delta is signed, and 0.0 when both references are None."""
    expectations = [
        # (args to compute_delta, expected delta)
        ((85.0, 80.0, 70.0), 5.0),    # measured above max(80, 70)
        ((75.0, 80.0, 70.0), -5.0),   # measured below it -> negative delta
        ((50.0, None, None), 0.0),    # nothing to compare against
    ]
    for args, expected in expectations:
        assert cg.compute_delta(*args) == pytest.approx(expected)
# ===========================================================================
# TC-13 — gate integration into advance_stage (rollback on FAIL, retry++)
# ===========================================================================
def test_tc13_advance_rolls_back_on_fail(monkeypatch):
    """TC-13: a failing coverage check rolls back to development and re-queues the developer."""
    from src import stage_engine as se

    seen = {}

    def _failing_qg(name, repo, wi, branch):
        seen["qg"] = name
        return (False, "measured=70.00% policy=both: absolute FAIL")

    monkeypatch.setattr(se, "_run_qg", _failing_qg)

    # Silence every side-effecting collaborator of the stage engine.
    silenced = (
        "update_task_stage",
        "notify_stage_change",
        "plane_notify_stage",
        "set_issue_in_progress",
        "notify_qg_failure",
        "plane_add_comment",
    )
    for hook in silenced:
        monkeypatch.setattr(se, hook, lambda *a, **k: None)
    monkeypatch.setattr(se, "_developer_retry_count", lambda *a, **k: 0)

    lease_releases = []
    enqueued_jobs = []

    def _record_release(*args, **kwargs):
        lease_releases.append(args)

    def _record_enqueue(*args, **kwargs):
        enqueued_jobs.append(args)
        return 123

    monkeypatch.setattr(se.merge_gate, "release_merge_lease", _record_release)
    monkeypatch.setattr(se, "enqueue_job", _record_enqueue)

    outcome = se.AdvanceResult()
    intervened = se._handle_coverage_gate(
        1, "deploy-staging", _REPO, _WI, _BRANCH, "deployer", outcome
    )

    assert intervened is True
    assert seen["qg"] == "check_coverage_gate"
    assert outcome.rolled_back_to == "development"
    assert outcome.enqueued_agent == "developer"
    assert len(enqueued_jobs) == 1
    # merge lease released on the coverage rollback (ADR-001 D1/TR-2)
    assert len(lease_releases) == 1
def test_tc13_advance_passes_through_on_ok(monkeypatch):
    """TC-13: a passing coverage check does not intervene and sets no rollback."""
    from src import stage_engine as se

    def _passing_qg(*args, **kwargs):
        return (True, "coverage OK")

    monkeypatch.setattr(se, "_run_qg", _passing_qg)

    outcome = se.AdvanceResult()
    intervened = se._handle_coverage_gate(
        1, "deploy-staging", _REPO, _WI, _BRANCH, "deployer", outcome
    )
    assert intervened is False
    assert outcome.rolled_back_to is None
# ===========================================================================
# TC-14 — real measurement on a minimal fixture repo (pytest --cov in worktree)
# ===========================================================================
def test_tc14_real_measurement(tmp_path, monkeypatch):
    """TC-14: measure_coverage on a tiny on-disk project returns a plausible percent."""
    # Minimal project: one module with two functions, a test that calls only one.
    fixture_root = tmp_path / "fixture_repo"
    src_dir = fixture_root / "src"
    tests_dir = fixture_root / "tests"
    src_dir.mkdir(parents=True)
    tests_dir.mkdir()

    sources = {
        src_dir / "__init__.py": "",
        src_dir / "mod.py": (
            "def covered():\n return 1\n\n\ndef uncovered():\n return 2\n"
        ),
        tests_dir / "test_mod.py": (
            "from src.mod import covered\n\n\ndef test_covered():\n assert covered() == 1\n"
        ),
    }
    for path, text in sources.items():
        path.write_text(text, encoding="utf-8")

    # Make the measurer resolve its worktree to the fixture project.
    def _fixture_worktree(repo, branch):
        return str(fixture_root)

    monkeypatch.setattr(cg, "ensure_worktree", _fixture_worktree)

    percent = cg.measure_coverage(_REPO, _BRANCH)
    assert percent is not None
    # mod.py: 4 statements, uncovered() body (1) unrun -> ~75%; bounds-check only.
    assert 50.0 <= percent <= 90.0
    # The scratch JSON report must not be left behind in the worktree.
    leftover = fixture_root / ".coverage-report.json"
    assert not leftover.exists()
def test_tc14_measure_timeout_returns_none(monkeypatch):
    """TC-14: a timed-out coverage run makes measure_coverage return None."""
    import subprocess

    # Use the platform temp dir as the stand-in worktree instead of a hard-coded
    # POSIX "/tmp" (consistent with the ORCH_DB_PATH setup at the top of the file).
    monkeypatch.setattr(cg, "ensure_worktree", lambda r, b: tempfile.gettempdir())

    def _timeout(*a, **k):
        raise subprocess.TimeoutExpired(cmd="pytest", timeout=1)

    # NOTE(review): this assumes the measurer launches pytest via subprocess.run;
    # if it returns None before reaching that call the test passes vacuously — confirm.
    monkeypatch.setattr(cg.subprocess, "run", _timeout)
    assert cg.measure_coverage(_REPO, _BRANCH) is None
# ===========================================================================
# TC-15 — observability (snapshot block) + registry compatibility unchanged
# ===========================================================================
def test_tc15_snapshot_shape(monkeypatch):
    """TC-15: snapshot() exposes the gate config and the stored per-repo baselines."""
    db.ratchet_coverage_baseline(_REPO, 81.0, "sha")

    snap = cg.snapshot()

    # Config part of the block mirrors the fixture defaults.
    assert snap["enabled"] is True
    assert snap["policy"] == "both"
    assert snap["floor"] == pytest.approx(80.0)

    # Baselines part carries the value ratcheted above.
    assert "baselines" in snap
    baselines = snap["baselines"]
    assert _REPO in baselines
    assert baselines[_REPO]["coverage"] == pytest.approx(81.0)
def test_tc15_snapshot_never_raises(monkeypatch):
    """TC-15: a failing baseline lookup yields an empty baselines map, not an exception."""

    def _broken_lookup():
        raise RuntimeError("boom")

    monkeypatch.setattr(db, "all_coverage_baselines", _broken_lookup)

    snap = cg.snapshot()
    assert snap["enabled"] is True
    assert snap["baselines"] == {}
def test_tc15_registry_and_transitions_unchanged():
    """TC-15: the new check is registered, older checks remain, transitions omit it."""
    from src.qg.checks import QG_CHECKS
    from src.stages import STAGE_TRANSITIONS

    # The coverage check is registered...
    assert "check_coverage_gate" in QG_CHECKS

    # ...and the previously existing checks are still present under the same names.
    existing_checks = (
        "check_ci_green",
        "check_tests_passed",
        "check_security_gate",
        "check_staging_status",
        "check_staging_image_fresh",
        "check_branch_mergeable",
    )
    for check_name in existing_checks:
        assert check_name in QG_CHECKS

    # Coverage is an edge sub-gate: no transition spec may mention it.
    for spec in STAGE_TRANSITIONS.values():
        assert "check_coverage_gate" not in str(spec)