fix(observability): merge-gate on deploy, full token input, Plane Done, artifact links

1. BUG 8 (second door): merge webhook no longer fake-completes a task at the deploy stage; done is gated by the deployer verdict (check_deploy_status). Other stages keep merge->done. 2. Token accounting: parse+persist cache_creation_input_tokens (new idempotent agent_runs column). usage_comment / task_summary now show the FULL input (input + cache_read + cache_creation) with a cached breakdown. cost_usd untouched. 3. deploy->done success now forces the Plane issue to terminal Done state. 4. All agents (architect/developer/reviewer/tester/deployer) attach artifact links to their finish comment via gitea_public_url. Tests added for each fix; pytest 244 passed / 9 failed (off-limits HMAC group).
2026-06-04 11:17:58 +03:00
parent 2629dffe1b
commit 61e26a8930
9 changed files with 476 additions and 16 deletions
--- a/tests/test_stage_engine.py
+++ b/tests/test_stage_engine.py
@@ -69,6 +69,7 @@ def silence_side_effects(monkeypatch):
        "set_issue_needs_input",
        "set_issue_in_progress",
        "set_issue_blocked",
+        "set_issue_done",
    ):
        monkeypatch.setattr(stage_engine, name, MagicMock())

@@ -177,6 +178,40 @@ class TestHappyPathAgentSelection:
        assert res.enqueued_agent is None
        assert _jobs() == []

+    def test_deploy_success_syncs_plane_to_terminal_done(self, monkeypatch):
+        """FIX 3: a successful deploy->done forces the Plane issue to terminal Done.
+
+        Previously the task could stick on In Progress because the merge webhook
+        completed it out-of-band. Now the engine drives set_issue_done() on the
+        deploy->done success transition.
+        """
+        monkeypatch.setattr(
+            stage_engine, "QG_CHECKS",
+            {k: _pass for k in stage_engine.QG_CHECKS},
+        )
+        task_id = _make_task("deploy", wi="ET-012")
+        res = advance_stage(
+            task_id, "deploy", "enduro-trails", "ET-012",
+            "feature/ET-012-x", finished_agent="deployer",
+        )
+        assert res.advanced is True
+        assert _stage(task_id) == "done"
+        # The terminal Plane sync was invoked with the work item id.
+        stage_engine.set_issue_done.assert_called_once_with("ET-012")
+
+    def test_non_terminal_advance_does_not_force_plane_done(self, monkeypatch):
+        """set_issue_done must only fire on the terminal deploy->done transition."""
+        monkeypatch.setattr(
+            stage_engine, "QG_CHECKS",
+            {k: _pass for k in stage_engine.QG_CHECKS},
+        )
+        task_id = _make_task("review")
+        advance_stage(
+            task_id, "review", "enduro-trails", "ET-001",
+            "feature/ET-001-x", finished_agent=None,
+        )
+        stage_engine.set_issue_done.assert_not_called()
+
    def test_done_is_terminal(self):
        task_id = _make_task("done")
        res = advance_stage(task_id, "done", "enduro-trails", "ET-001",
--- a/tests/test_usage.py
+++ b/tests/test_usage.py
@@ -62,9 +62,27 @@ def test_parse_real_result_json():
    assert u["input_tokens"] == 45231
    assert u["output_tokens"] == 12100
    assert u["cache_read_tokens"] == 18500
+    # FIX 2: cache_creation slice must now be parsed (was dropped before).
+    assert u["cache_creation_tokens"] == 7418
    assert abs(u["cost_usd"] - 0.0560175) < 1e-9


+def test_parse_cache_creation_present():
+    u = U.parse_usage_from_text(REAL_RESULT_JSON)
+    assert u["cache_creation_tokens"] == 7418
+
+
+def test_parse_cache_creation_missing_defaults_zero():
+    blob = (
+        '{"total_cost_usd":0.01,'
+        '"usage":{"input_tokens":10,"output_tokens":5,'
+        '"cache_read_input_tokens":100}}'
+    )
+    u = U.parse_usage_from_text(blob)
+    assert u["cache_creation_tokens"] == 0
+    assert u["cache_read_tokens"] == 100
+
+
 def test_parse_with_leading_text():
    """The agent may print text before the trailing JSON; we still find it."""
    text = "some agent stdout line\nanother line\n" + REAL_RESULT_JSON
@@ -106,13 +124,16 @@ def test_record_usage_writes_columns():
    U.record_usage(rid, u)
    conn = get_db()
    row = conn.execute(
-        "SELECT input_tokens, output_tokens, cache_read_tokens, cost_usd "
+        "SELECT input_tokens, output_tokens, cache_read_tokens, "
+        "cache_creation_tokens, cost_usd "
        "FROM agent_runs WHERE id=?", (rid,)
    ).fetchone()
    conn.close()
    assert row["input_tokens"] == 45231
    assert row["output_tokens"] == 12100
    assert row["cache_read_tokens"] == 18500
+    # FIX 2: cache_creation column is now persisted.
+    assert row["cache_creation_tokens"] == 7418
    assert abs(row["cost_usd"] - 0.0560175) < 1e-9


@@ -144,14 +165,82 @@ def test_fmt_cost():


 def test_usage_comment_format():
+    # No cache -> in_total == input_tokens, no cached breakdown shown.
    u = {"input_tokens": 45231, "output_tokens": 12100, "cost_usd": 0.21}
    c = U.usage_comment("developer", u)
    assert "Developer" in c
    assert "45.2k in" in c
+    assert "cached" not in c
    assert "12.1k out" in c
    assert "$0.21" in c


+def test_usage_comment_shows_full_input_with_cached():
+    """FIX 2: in = input + cache_read + cache_creation, with cached breakdown."""
+    u = {
+        "input_tokens": 81,
+        "cache_read_tokens": 8_400_000,
+        "cache_creation_tokens": 100_000,
+        "output_tokens": 45_800,
+        "cost_usd": 7.29,
+    }
+    c = U.usage_comment("developer", u)
+    # total in = 8_500_081 -> 8.5M ; cached = 8_500_000 -> 8.5M
+    assert "8.5M in (8.5M cached)" in c
+    assert "45.8k out" in c
+    assert "$7.29" in c
+
+
+def test_usage_comment_no_cached_when_zero():
+    u = {"input_tokens": 1234, "cache_read_tokens": 0,
+         "cache_creation_tokens": 0, "output_tokens": 50, "cost_usd": 0.01}
+    c = U.usage_comment("developer", u)
+    assert "1.2k in" in c
+    assert "cached" not in c
+
+
+# --------------------------------------------------------------------------- #
+# FIX 4: per-agent artifact links in finish comments
+# --------------------------------------------------------------------------- #
+def _ctx():
+    return dict(repo="enduro-trails", branch="feature/ET-012-x",
+               work_item_id="ET-012")
+
+
+def test_usage_comment_reviewer_links_review_doc():
+    c = U.usage_comment("reviewer", {"input_tokens": 5}, **_ctx())
+    assert "12-review.md" in c
+    assert "ET-012" in c
+
+
+def test_usage_comment_tester_links_test_report():
+    c = U.usage_comment("tester", {"input_tokens": 5}, **_ctx())
+    assert "13-test-report.md" in c
+
+
+def test_usage_comment_deployer_links_deploy_log():
+    c = U.usage_comment("deployer", {"input_tokens": 5}, **_ctx())
+    assert "14-deploy-log.md" in c
+
+
+def test_usage_comment_developer_links_pr_and_branch():
+    c = U.usage_comment("developer", {"input_tokens": 5}, pr_number=7, **_ctx())
+    assert "pulls/7" in c
+    assert "feature/ET-012-x" in c
+
+
+def test_usage_comment_architect_links_adr():
+    c = U.usage_comment("architect", {"input_tokens": 5}, **_ctx())
+    assert "06-adr" in c
+
+
+def test_usage_comment_no_links_without_context():
+    """Without repo/branch context, no links are appended (no crash)."""
+    c = U.usage_comment("reviewer", {"input_tokens": 5})
+    assert "12-review.md" not in c
+    assert "http" not in c
+
+
 # --------------------------------------------------------------------------- #
 # task summary
 # --------------------------------------------------------------------------- #
@@ -174,3 +263,47 @@ def test_task_summary_aggregates_over_agents():
    assert "$0.15" in comment       # total cost
    assert "Developer" in comment
    assert "Tester" in comment
+
+
+def test_task_summary_sums_all_three_input_components():
+    """FIX 2: total_in = SUM(input + cache_read + cache_creation); total_cached too."""
+    rid = _new_run(agent="developer", task_id=77)
+    U.record_usage(rid, {
+        "input_tokens": 100,
+        "cache_read_tokens": 2000,
+        "cache_creation_tokens": 900,
+        "output_tokens": 50,
+        "cost_usd": 0.10,
+    })
+    rid2 = _new_run(agent="tester", task_id=77)
+    U.record_usage(rid2, {
+        "input_tokens": 10,
+        "cache_read_tokens": 500,
+        "cache_creation_tokens": 0,
+        "output_tokens": 5,
+        "cost_usd": 0.05,
+    })
+    s = U.task_usage_summary(77)
+    # total_in = (100+2000+900) + (10+500+0) = 3510
+    assert s["total_in"] == 3510
+    # total_cached = (2000+900) + (500+0) = 3400
+    assert s["total_cached"] == 3400
+    assert s["total_out"] == 55
+    comment = U.task_summary_comment(77)
+    assert "cached" in comment
+
+
+def test_task_summary_handles_null_cache_creation():
+    """Pre-existing rows (NULL cache_creation) must not break aggregation."""
+    rid = _new_run(agent="developer", task_id=88)
+    conn = get_db()
+    conn.execute(
+        "UPDATE agent_runs SET input_tokens=100, cache_read_tokens=200, "
+        "cache_creation_tokens=NULL, output_tokens=10, cost_usd=0.01 WHERE id=?",
+        (rid,),
+    )
+    conn.commit()
+    conn.close()
+    s = U.task_usage_summary(88)  # must not raise
+    assert s["total_in"] == 300   # 100 + 200 + (NULL->0)
+    assert s["total_cached"] == 200
--- a/tests/test_webhooks.py
+++ b/tests/test_webhooks.py
@@ -433,3 +433,67 @@ def test_ci_failure_development_escalates_at_limit(
    assert "after CI failure" in err_msg
    # Stage untouched.
    assert not mock_update_stage.called
+
+
+# ---------------------------------------------------------------------------
+# BUG 8 (second door): a merged-PR webhook must NOT fake-complete a task that is
+# still in the deploy stage. On `deploy` done is gated by the deployer's verdict
+# (check_deploy_status via advance_stage), not by the merge event. For every
+# other stage the merge->done behaviour is preserved. Pure-logic tests: invoke
+# handle_pr() directly with mocked helpers (no HMAC barrier).
+# ---------------------------------------------------------------------------
+
+def _merged_pr_payload(branch="feature/ET-012-x"):
+    return {
+        "action": "closed",
+        "pull_request": {
+            "merged": True,
+            "number": 7,
+            "head": {"ref": branch},
+        },
+        "repository": {"name": "enduro-trails"},
+    }
+
+
+@patch("src.webhooks.gitea.notify_stage_change")
+@patch("src.webhooks.gitea.update_task_stage")
+@patch("src.webhooks.gitea.get_task_by_repo_branch")
+@patch("src.webhooks.gitea.get_project_by_repo")
+def test_merge_on_deploy_stage_does_not_set_done(
+    mock_proj, mock_task, mock_update_stage, mock_notify,
+):
+    """FIX 1: merge at deploy stage is ignored — done is gated by deployer verdict."""
+    from src.webhooks.gitea import handle_pr
+
+    mock_proj.return_value = {"repo": "enduro-trails"}
+    mock_task.return_value = {
+        "id": 1, "stage": "deploy", "work_item_id": "ET-012",
+    }
+
+    asyncio.run(handle_pr(_merged_pr_payload()))
+
+    # The merge-driven done path must NOT run on deploy.
+    assert not mock_update_stage.called
+    assert not mock_notify.called
+
+
+@patch("src.webhooks.gitea.notify_stage_change")
+@patch("src.webhooks.gitea.update_task_stage")
+@patch("src.webhooks.gitea.get_task_by_repo_branch")
+@patch("src.webhooks.gitea.get_project_by_repo")
+def test_merge_on_non_deploy_stage_sets_done(
+    mock_proj, mock_task, mock_update_stage, mock_notify,
+):
+    """FIX 1: merge behaviour is preserved for non-deploy stages (e.g. review)."""
+    from src.webhooks.gitea import handle_pr
+
+    mock_proj.return_value = {"repo": "enduro-trails"}
+    mock_task.return_value = {
+        "id": 2, "stage": "review", "work_item_id": "ET-013",
+    }
+
+    asyncio.run(handle_pr(_merged_pr_payload(branch="feature/ET-013-x")))
+
+    # Non-deploy stages still get the merge-driven done.
+    mock_update_stage.assert_called_once_with(2, "done")
+    assert mock_notify.called