fix(observability): merge-gate on deploy, full token input, Plane Done, artifact links
1. BUG 8 (second door): merge webhook no longer fake-completes a task at the deploy stage; done is gated by the deployer verdict (check_deploy_status). Other stages keep merge->done. 2. Token accounting: parse+persist cache_creation_input_tokens (new idempotent agent_runs column). usage_comment / task_summary now show the FULL input (input + cache_read + cache_creation) with a cached breakdown. cost_usd untouched. 3. deploy->done success now forces the Plane issue to terminal Done state. 4. All agents (architect/developer/reviewer/tester/deployer) attach artifact links to their finish comment via gitea_public_url. Tests added for each fix; pytest 244 passed / 9 failed (off-limits HMAC group).
This commit is contained in:
@@ -69,6 +69,7 @@ def silence_side_effects(monkeypatch):
|
||||
"set_issue_needs_input",
|
||||
"set_issue_in_progress",
|
||||
"set_issue_blocked",
|
||||
"set_issue_done",
|
||||
):
|
||||
monkeypatch.setattr(stage_engine, name, MagicMock())
|
||||
|
||||
@@ -177,6 +178,40 @@ class TestHappyPathAgentSelection:
|
||||
assert res.enqueued_agent is None
|
||||
assert _jobs() == []
|
||||
|
||||
def test_deploy_success_syncs_plane_to_terminal_done(self, monkeypatch):
|
||||
"""FIX 3: a successful deploy->done forces the Plane issue to terminal Done.
|
||||
|
||||
Previously the task could stick on In Progress because the merge webhook
|
||||
completed it out-of-band. Now the engine drives set_issue_done() on the
|
||||
deploy->done success transition.
|
||||
"""
|
||||
monkeypatch.setattr(
|
||||
stage_engine, "QG_CHECKS",
|
||||
{k: _pass for k in stage_engine.QG_CHECKS},
|
||||
)
|
||||
task_id = _make_task("deploy", wi="ET-012")
|
||||
res = advance_stage(
|
||||
task_id, "deploy", "enduro-trails", "ET-012",
|
||||
"feature/ET-012-x", finished_agent="deployer",
|
||||
)
|
||||
assert res.advanced is True
|
||||
assert _stage(task_id) == "done"
|
||||
# The terminal Plane sync was invoked with the work item id.
|
||||
stage_engine.set_issue_done.assert_called_once_with("ET-012")
|
||||
|
||||
def test_non_terminal_advance_does_not_force_plane_done(self, monkeypatch):
|
||||
"""set_issue_done must only fire on the terminal deploy->done transition."""
|
||||
monkeypatch.setattr(
|
||||
stage_engine, "QG_CHECKS",
|
||||
{k: _pass for k in stage_engine.QG_CHECKS},
|
||||
)
|
||||
task_id = _make_task("review")
|
||||
advance_stage(
|
||||
task_id, "review", "enduro-trails", "ET-001",
|
||||
"feature/ET-001-x", finished_agent=None,
|
||||
)
|
||||
stage_engine.set_issue_done.assert_not_called()
|
||||
|
||||
def test_done_is_terminal(self):
|
||||
task_id = _make_task("done")
|
||||
res = advance_stage(task_id, "done", "enduro-trails", "ET-001",
|
||||
|
||||
@@ -62,9 +62,27 @@ def test_parse_real_result_json():
|
||||
assert u["input_tokens"] == 45231
|
||||
assert u["output_tokens"] == 12100
|
||||
assert u["cache_read_tokens"] == 18500
|
||||
# FIX 2: cache_creation slice must now be parsed (was dropped before).
|
||||
assert u["cache_creation_tokens"] == 7418
|
||||
assert abs(u["cost_usd"] - 0.0560175) < 1e-9
|
||||
|
||||
|
||||
def test_parse_cache_creation_present():
|
||||
u = U.parse_usage_from_text(REAL_RESULT_JSON)
|
||||
assert u["cache_creation_tokens"] == 7418
|
||||
|
||||
|
||||
def test_parse_cache_creation_missing_defaults_zero():
|
||||
blob = (
|
||||
'{"total_cost_usd":0.01,'
|
||||
'"usage":{"input_tokens":10,"output_tokens":5,'
|
||||
'"cache_read_input_tokens":100}}'
|
||||
)
|
||||
u = U.parse_usage_from_text(blob)
|
||||
assert u["cache_creation_tokens"] == 0
|
||||
assert u["cache_read_tokens"] == 100
|
||||
|
||||
|
||||
def test_parse_with_leading_text():
|
||||
"""The agent may print text before the trailing JSON; we still find it."""
|
||||
text = "some agent stdout line\nanother line\n" + REAL_RESULT_JSON
|
||||
@@ -106,13 +124,16 @@ def test_record_usage_writes_columns():
|
||||
U.record_usage(rid, u)
|
||||
conn = get_db()
|
||||
row = conn.execute(
|
||||
"SELECT input_tokens, output_tokens, cache_read_tokens, cost_usd "
|
||||
"SELECT input_tokens, output_tokens, cache_read_tokens, "
|
||||
"cache_creation_tokens, cost_usd "
|
||||
"FROM agent_runs WHERE id=?", (rid,)
|
||||
).fetchone()
|
||||
conn.close()
|
||||
assert row["input_tokens"] == 45231
|
||||
assert row["output_tokens"] == 12100
|
||||
assert row["cache_read_tokens"] == 18500
|
||||
# FIX 2: cache_creation column is now persisted.
|
||||
assert row["cache_creation_tokens"] == 7418
|
||||
assert abs(row["cost_usd"] - 0.0560175) < 1e-9
|
||||
|
||||
|
||||
@@ -144,14 +165,82 @@ def test_fmt_cost():
|
||||
|
||||
|
||||
def test_usage_comment_format():
|
||||
# No cache -> in_total == input_tokens, no cached breakdown shown.
|
||||
u = {"input_tokens": 45231, "output_tokens": 12100, "cost_usd": 0.21}
|
||||
c = U.usage_comment("developer", u)
|
||||
assert "Developer" in c
|
||||
assert "45.2k in" in c
|
||||
assert "cached" not in c
|
||||
assert "12.1k out" in c
|
||||
assert "$0.21" in c
|
||||
|
||||
|
||||
def test_usage_comment_shows_full_input_with_cached():
|
||||
"""FIX 2: in = input + cache_read + cache_creation, with cached breakdown."""
|
||||
u = {
|
||||
"input_tokens": 81,
|
||||
"cache_read_tokens": 8_400_000,
|
||||
"cache_creation_tokens": 100_000,
|
||||
"output_tokens": 45_800,
|
||||
"cost_usd": 7.29,
|
||||
}
|
||||
c = U.usage_comment("developer", u)
|
||||
# total in = 8_500_081 -> 8.5M ; cached = 8_500_000 -> 8.5M
|
||||
assert "8.5M in (8.5M cached)" in c
|
||||
assert "45.8k out" in c
|
||||
assert "$7.29" in c
|
||||
|
||||
|
||||
def test_usage_comment_no_cached_when_zero():
|
||||
u = {"input_tokens": 1234, "cache_read_tokens": 0,
|
||||
"cache_creation_tokens": 0, "output_tokens": 50, "cost_usd": 0.01}
|
||||
c = U.usage_comment("developer", u)
|
||||
assert "1.2k in" in c
|
||||
assert "cached" not in c
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# FIX 4: per-agent artifact links in finish comments
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _ctx():
|
||||
return dict(repo="enduro-trails", branch="feature/ET-012-x",
|
||||
work_item_id="ET-012")
|
||||
|
||||
|
||||
def test_usage_comment_reviewer_links_review_doc():
|
||||
c = U.usage_comment("reviewer", {"input_tokens": 5}, **_ctx())
|
||||
assert "12-review.md" in c
|
||||
assert "ET-012" in c
|
||||
|
||||
|
||||
def test_usage_comment_tester_links_test_report():
|
||||
c = U.usage_comment("tester", {"input_tokens": 5}, **_ctx())
|
||||
assert "13-test-report.md" in c
|
||||
|
||||
|
||||
def test_usage_comment_deployer_links_deploy_log():
|
||||
c = U.usage_comment("deployer", {"input_tokens": 5}, **_ctx())
|
||||
assert "14-deploy-log.md" in c
|
||||
|
||||
|
||||
def test_usage_comment_developer_links_pr_and_branch():
|
||||
c = U.usage_comment("developer", {"input_tokens": 5}, pr_number=7, **_ctx())
|
||||
assert "pulls/7" in c
|
||||
assert "feature/ET-012-x" in c
|
||||
|
||||
|
||||
def test_usage_comment_architect_links_adr():
|
||||
c = U.usage_comment("architect", {"input_tokens": 5}, **_ctx())
|
||||
assert "06-adr" in c
|
||||
|
||||
|
||||
def test_usage_comment_no_links_without_context():
|
||||
"""Without repo/branch context, no links are appended (no crash)."""
|
||||
c = U.usage_comment("reviewer", {"input_tokens": 5})
|
||||
assert "12-review.md" not in c
|
||||
assert "http" not in c
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# task summary
|
||||
# --------------------------------------------------------------------------- #
|
||||
@@ -174,3 +263,47 @@ def test_task_summary_aggregates_over_agents():
|
||||
assert "$0.15" in comment # total cost
|
||||
assert "Developer" in comment
|
||||
assert "Tester" in comment
|
||||
|
||||
|
||||
def test_task_summary_sums_all_three_input_components():
|
||||
"""FIX 2: total_in = SUM(input + cache_read + cache_creation); total_cached too."""
|
||||
rid = _new_run(agent="developer", task_id=77)
|
||||
U.record_usage(rid, {
|
||||
"input_tokens": 100,
|
||||
"cache_read_tokens": 2000,
|
||||
"cache_creation_tokens": 900,
|
||||
"output_tokens": 50,
|
||||
"cost_usd": 0.10,
|
||||
})
|
||||
rid2 = _new_run(agent="tester", task_id=77)
|
||||
U.record_usage(rid2, {
|
||||
"input_tokens": 10,
|
||||
"cache_read_tokens": 500,
|
||||
"cache_creation_tokens": 0,
|
||||
"output_tokens": 5,
|
||||
"cost_usd": 0.05,
|
||||
})
|
||||
s = U.task_usage_summary(77)
|
||||
# total_in = (100+2000+900) + (10+500+0) = 3510
|
||||
assert s["total_in"] == 3510
|
||||
# total_cached = (2000+900) + (500+0) = 3400
|
||||
assert s["total_cached"] == 3400
|
||||
assert s["total_out"] == 55
|
||||
comment = U.task_summary_comment(77)
|
||||
assert "cached" in comment
|
||||
|
||||
|
||||
def test_task_summary_handles_null_cache_creation():
|
||||
"""Pre-existing rows (NULL cache_creation) must not break aggregation."""
|
||||
rid = _new_run(agent="developer", task_id=88)
|
||||
conn = get_db()
|
||||
conn.execute(
|
||||
"UPDATE agent_runs SET input_tokens=100, cache_read_tokens=200, "
|
||||
"cache_creation_tokens=NULL, output_tokens=10, cost_usd=0.01 WHERE id=?",
|
||||
(rid,),
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
s = U.task_usage_summary(88) # must not raise
|
||||
assert s["total_in"] == 300 # 100 + 200 + (NULL->0)
|
||||
assert s["total_cached"] == 200
|
||||
|
||||
@@ -433,3 +433,67 @@ def test_ci_failure_development_escalates_at_limit(
|
||||
assert "after CI failure" in err_msg
|
||||
# Stage untouched.
|
||||
assert not mock_update_stage.called
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# BUG 8 (second door): a merged-PR webhook must NOT fake-complete a task that is
|
||||
# still in the deploy stage. On `deploy` done is gated by the deployer's verdict
|
||||
# (check_deploy_status via advance_stage), not by the merge event. For every
|
||||
# other stage the merge->done behaviour is preserved. Pure-logic tests: invoke
|
||||
# handle_pr() directly with mocked helpers (no HMAC barrier).
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _merged_pr_payload(branch="feature/ET-012-x"):
|
||||
return {
|
||||
"action": "closed",
|
||||
"pull_request": {
|
||||
"merged": True,
|
||||
"number": 7,
|
||||
"head": {"ref": branch},
|
||||
},
|
||||
"repository": {"name": "enduro-trails"},
|
||||
}
|
||||
|
||||
|
||||
@patch("src.webhooks.gitea.notify_stage_change")
|
||||
@patch("src.webhooks.gitea.update_task_stage")
|
||||
@patch("src.webhooks.gitea.get_task_by_repo_branch")
|
||||
@patch("src.webhooks.gitea.get_project_by_repo")
|
||||
def test_merge_on_deploy_stage_does_not_set_done(
|
||||
mock_proj, mock_task, mock_update_stage, mock_notify,
|
||||
):
|
||||
"""FIX 1: merge at deploy stage is ignored — done is gated by deployer verdict."""
|
||||
from src.webhooks.gitea import handle_pr
|
||||
|
||||
mock_proj.return_value = {"repo": "enduro-trails"}
|
||||
mock_task.return_value = {
|
||||
"id": 1, "stage": "deploy", "work_item_id": "ET-012",
|
||||
}
|
||||
|
||||
asyncio.run(handle_pr(_merged_pr_payload()))
|
||||
|
||||
# The merge-driven done path must NOT run on deploy.
|
||||
assert not mock_update_stage.called
|
||||
assert not mock_notify.called
|
||||
|
||||
|
||||
@patch("src.webhooks.gitea.notify_stage_change")
|
||||
@patch("src.webhooks.gitea.update_task_stage")
|
||||
@patch("src.webhooks.gitea.get_task_by_repo_branch")
|
||||
@patch("src.webhooks.gitea.get_project_by_repo")
|
||||
def test_merge_on_non_deploy_stage_sets_done(
|
||||
mock_proj, mock_task, mock_update_stage, mock_notify,
|
||||
):
|
||||
"""FIX 1: merge behaviour is preserved for non-deploy stages (e.g. review)."""
|
||||
from src.webhooks.gitea import handle_pr
|
||||
|
||||
mock_proj.return_value = {"repo": "enduro-trails"}
|
||||
mock_task.return_value = {
|
||||
"id": 2, "stage": "review", "work_item_id": "ET-013",
|
||||
}
|
||||
|
||||
asyncio.run(handle_pr(_merged_pr_payload(branch="feature/ET-013-x")))
|
||||
|
||||
# Non-deploy stages still get the merge-driven done.
|
||||
mock_update_stage.assert_called_once_with(2, "done")
|
||||
assert mock_notify.called
|
||||
|
||||
Reference in New Issue
Block a user