Files
orchestrator/tests/test_usage.py
Dev Agent 61e26a8930 fix(observability): merge-gate on deploy, full token input, Plane Done, artifact links
1. BUG 8 (second door): merge webhook no longer fake-completes a task at the
   deploy stage; done is gated by the deployer verdict (check_deploy_status).
   Other stages keep merge->done.
2. Token accounting: parse+persist cache_creation_input_tokens (new
   idempotent agent_runs column). usage_comment / task_summary now show the
   FULL input (input + cache_read + cache_creation) with a cached breakdown.
   cost_usd untouched.
3. deploy->done success now forces the Plane issue to terminal Done state.
4. All agents (architect/developer/reviewer/tester/deployer) attach artifact
   links to their finish comment via gitea_public_url.

Tests added for each fix; pytest 244 passed / 9 failed (off-limits HMAC group).
2026-06-04 11:17:58 +03:00

310 lines
10 KiB
Python

"""Feature 4: token / cost accounting tests.
Covers:
* parse_usage_from_text on a REAL claude --output-format json result blob
(captured live from CLI 2.1.142), including a leading text line.
* parse on garbage / missing JSON -> None (never raises).
* record_usage writes the columns; NULLs when usage is None.
* fmt_tokens / fmt_cost formatting.
* usage_comment string format.
* task_usage_summary / task_summary_comment aggregate over agent_runs.
DB is an isolated temp file; no network or subprocess.
"""
import os
import tempfile
os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")
_test_db = os.path.join(tempfile.gettempdir(), "test_orchestrator_usage.db")
os.environ["ORCH_DB_PATH"] = _test_db
import pytest # noqa: E402
from src import db as db_module # noqa: E402
from src.db import init_db, get_db # noqa: E402
from src import usage as U # noqa: E402
# Real claude --output-format json result object (captured from CLI 2.1.142).
REAL_RESULT_JSON = (
'{"type":"result","subtype":"success","is_error":false,"duration_ms":1795,'
'"num_turns":1,"result":"Hi!","session_id":"abc",'
'"total_cost_usd":0.0560175,'
'"usage":{"input_tokens":45231,"cache_creation_input_tokens":7418,'
'"cache_read_input_tokens":18500,"output_tokens":12100,'
'"service_tier":"standard"},'
'"modelUsage":{"claude-opus-4-7":{"inputTokens":6,"outputTokens":7}},'
'"permission_denials":[]}'
)
@pytest.fixture(autouse=True)
def setup_db(monkeypatch):
# get_db() reads settings.db_path live; pin it to our isolated DB.
monkeypatch.setattr(db_module.settings, "db_path", _test_db, raising=False)
if os.path.exists(_test_db):
os.unlink(_test_db)
init_db()
yield
if os.path.exists(_test_db):
os.unlink(_test_db)
# --------------------------------------------------------------------------- #
# parsing
# --------------------------------------------------------------------------- #
def test_parse_real_result_json():
u = U.parse_usage_from_text(REAL_RESULT_JSON)
assert u is not None
assert u["input_tokens"] == 45231
assert u["output_tokens"] == 12100
assert u["cache_read_tokens"] == 18500
# FIX 2: cache_creation slice must now be parsed (was dropped before).
assert u["cache_creation_tokens"] == 7418
assert abs(u["cost_usd"] - 0.0560175) < 1e-9
def test_parse_cache_creation_present():
u = U.parse_usage_from_text(REAL_RESULT_JSON)
assert u["cache_creation_tokens"] == 7418
def test_parse_cache_creation_missing_defaults_zero():
blob = (
'{"total_cost_usd":0.01,'
'"usage":{"input_tokens":10,"output_tokens":5,'
'"cache_read_input_tokens":100}}'
)
u = U.parse_usage_from_text(blob)
assert u["cache_creation_tokens"] == 0
assert u["cache_read_tokens"] == 100
def test_parse_with_leading_text():
"""The agent may print text before the trailing JSON; we still find it."""
text = "some agent stdout line\nanother line\n" + REAL_RESULT_JSON
u = U.parse_usage_from_text(text)
assert u is not None
assert u["input_tokens"] == 45231
assert u["output_tokens"] == 12100
def test_parse_garbage_returns_none():
assert U.parse_usage_from_text("not json at all { broken") is None
assert U.parse_usage_from_text("") is None
assert U.parse_usage_from_text(None) is None
def test_parse_json_without_usage_returns_none():
assert U.parse_usage_from_text('{"hello":"world"}') is None
def test_parse_from_log_missing_file_returns_none():
assert U.parse_usage_from_log("/no/such/file.log") is None
# --------------------------------------------------------------------------- #
# record_usage
# --------------------------------------------------------------------------- #
def _new_run(agent="developer", task_id=1):
conn = get_db()
cur = conn.execute("INSERT INTO agent_runs (task_id, agent) VALUES (?, ?)", (task_id, agent))
rid = cur.lastrowid
conn.commit()
conn.close()
return rid
def test_record_usage_writes_columns():
rid = _new_run()
u = U.parse_usage_from_text(REAL_RESULT_JSON)
U.record_usage(rid, u)
conn = get_db()
row = conn.execute(
"SELECT input_tokens, output_tokens, cache_read_tokens, "
"cache_creation_tokens, cost_usd "
"FROM agent_runs WHERE id=?", (rid,)
).fetchone()
conn.close()
assert row["input_tokens"] == 45231
assert row["output_tokens"] == 12100
assert row["cache_read_tokens"] == 18500
# FIX 2: cache_creation column is now persisted.
assert row["cache_creation_tokens"] == 7418
assert abs(row["cost_usd"] - 0.0560175) < 1e-9
def test_record_usage_none_writes_nulls():
rid = _new_run()
U.record_usage(rid, None) # must not raise
conn = get_db()
row = conn.execute("SELECT input_tokens, cost_usd FROM agent_runs WHERE id=?", (rid,)).fetchone()
conn.close()
assert row["input_tokens"] is None
assert row["cost_usd"] is None
# --------------------------------------------------------------------------- #
# formatting
# --------------------------------------------------------------------------- #
def test_fmt_tokens():
assert U.fmt_tokens(6) == "6"
assert U.fmt_tokens(1234) == "1.2k"
assert U.fmt_tokens(45231) == "45.2k"
assert U.fmt_tokens(2_500_000) == "2.5M"
assert U.fmt_tokens(None) == "0"
def test_fmt_cost():
assert U.fmt_cost(0.21) == "$0.21"
assert U.fmt_cost(0.0560175) == "$0.06"
assert U.fmt_cost(None) == "$0.00"
def test_usage_comment_format():
# No cache -> in_total == input_tokens, no cached breakdown shown.
u = {"input_tokens": 45231, "output_tokens": 12100, "cost_usd": 0.21}
c = U.usage_comment("developer", u)
assert "Developer" in c
assert "45.2k in" in c
assert "cached" not in c
assert "12.1k out" in c
assert "$0.21" in c
def test_usage_comment_shows_full_input_with_cached():
"""FIX 2: in = input + cache_read + cache_creation, with cached breakdown."""
u = {
"input_tokens": 81,
"cache_read_tokens": 8_400_000,
"cache_creation_tokens": 100_000,
"output_tokens": 45_800,
"cost_usd": 7.29,
}
c = U.usage_comment("developer", u)
# total in = 8_500_081 -> 8.5M ; cached = 8_500_000 -> 8.5M
assert "8.5M in (8.5M cached)" in c
assert "45.8k out" in c
assert "$7.29" in c
def test_usage_comment_no_cached_when_zero():
u = {"input_tokens": 1234, "cache_read_tokens": 0,
"cache_creation_tokens": 0, "output_tokens": 50, "cost_usd": 0.01}
c = U.usage_comment("developer", u)
assert "1.2k in" in c
assert "cached" not in c
# --------------------------------------------------------------------------- #
# FIX 4: per-agent artifact links in finish comments
# --------------------------------------------------------------------------- #
def _ctx():
return dict(repo="enduro-trails", branch="feature/ET-012-x",
work_item_id="ET-012")
def test_usage_comment_reviewer_links_review_doc():
c = U.usage_comment("reviewer", {"input_tokens": 5}, **_ctx())
assert "12-review.md" in c
assert "ET-012" in c
def test_usage_comment_tester_links_test_report():
c = U.usage_comment("tester", {"input_tokens": 5}, **_ctx())
assert "13-test-report.md" in c
def test_usage_comment_deployer_links_deploy_log():
c = U.usage_comment("deployer", {"input_tokens": 5}, **_ctx())
assert "14-deploy-log.md" in c
def test_usage_comment_developer_links_pr_and_branch():
c = U.usage_comment("developer", {"input_tokens": 5}, pr_number=7, **_ctx())
assert "pulls/7" in c
assert "feature/ET-012-x" in c
def test_usage_comment_architect_links_adr():
c = U.usage_comment("architect", {"input_tokens": 5}, **_ctx())
assert "06-adr" in c
def test_usage_comment_no_links_without_context():
"""Without repo/branch context, no links are appended (no crash)."""
c = U.usage_comment("reviewer", {"input_tokens": 5})
assert "12-review.md" not in c
assert "http" not in c
# --------------------------------------------------------------------------- #
# task summary
# --------------------------------------------------------------------------- #
def test_task_summary_aggregates_over_agents():
# two runs for the same task: developer + tester
for agent, ti, to, cost in [("developer", 1000, 200, 0.10), ("tester", 500, 100, 0.05)]:
rid = _new_run(agent=agent, task_id=42)
U.record_usage(rid, {"input_tokens": ti, "output_tokens": to,
"cache_read_tokens": 0, "cost_usd": cost})
s = U.task_usage_summary(42)
assert s["total_in"] == 1500
assert s["total_out"] == 300
assert abs(s["total_cost"] - 0.15) < 1e-9
agents = {a for a, *_ in s["per_agent"]}
assert agents == {"developer", "tester"}
comment = U.task_summary_comment(42)
assert "1.5k" in comment # total in
assert "$0.15" in comment # total cost
assert "Developer" in comment
assert "Tester" in comment
def test_task_summary_sums_all_three_input_components():
"""FIX 2: total_in = SUM(input + cache_read + cache_creation); total_cached too."""
rid = _new_run(agent="developer", task_id=77)
U.record_usage(rid, {
"input_tokens": 100,
"cache_read_tokens": 2000,
"cache_creation_tokens": 900,
"output_tokens": 50,
"cost_usd": 0.10,
})
rid2 = _new_run(agent="tester", task_id=77)
U.record_usage(rid2, {
"input_tokens": 10,
"cache_read_tokens": 500,
"cache_creation_tokens": 0,
"output_tokens": 5,
"cost_usd": 0.05,
})
s = U.task_usage_summary(77)
# total_in = (100+2000+900) + (10+500+0) = 3510
assert s["total_in"] == 3510
# total_cached = (2000+900) + (500+0) = 3400
assert s["total_cached"] == 3400
assert s["total_out"] == 55
comment = U.task_summary_comment(77)
assert "cached" in comment
def test_task_summary_handles_null_cache_creation():
"""Pre-existing rows (NULL cache_creation) must not break aggregation."""
rid = _new_run(agent="developer", task_id=88)
conn = get_db()
conn.execute(
"UPDATE agent_runs SET input_tokens=100, cache_read_tokens=200, "
"cache_creation_tokens=NULL, output_tokens=10, cost_usd=0.01 WHERE id=?",
(rid,),
)
conn.commit()
conn.close()
s = U.task_usage_summary(88) # must not raise
assert s["total_in"] == 300 # 100 + 200 + (NULL->0)
assert s["total_cached"] == 200