fix(observability): merge-gate on deploy, full token input, Plane Done, artifact links

1. BUG 8 (second door): merge webhook no longer fake-completes a task at the
   deploy stage; done is gated by the deployer verdict (check_deploy_status).
   Other stages keep merge->done.
2. Token accounting: parse+persist cache_creation_input_tokens (new
   idempotent agent_runs column). usage_comment / task_summary now show the
   FULL input (input + cache_read + cache_creation) with a cached breakdown.
   cost_usd untouched.
3. deploy->done success now forces the Plane issue to terminal Done state.
4. All agents (architect/developer/reviewer/tester/deployer) attach artifact
   links to their finish comment via gitea_public_url.

Tests added for each fix; pytest 244 passed / 9 failed (off-limits HMAC group).
This commit is contained in:
Dev Agent
2026-06-04 11:17:58 +03:00
parent 2629dffe1b
commit 61e26a8930
9 changed files with 476 additions and 16 deletions

View File

@@ -699,12 +699,49 @@ class AgentLauncher:
task_id, work_item_id = row[0], row[1]
if not work_item_id:
return
plane_add_comment(work_item_id, usage_comment(agent, usage), author=agent)
# Observability: every agent's finish comment links its artifact(s)
# (reviewer->12-review, tester->13-test-report, deployer->14-deploy-log,
# architect->ADR, developer->PR/branch). For the developer we resolve the
# open PR number so the link points straight at it.
pr_number = None
if agent == "developer":
pr_number = self._open_pr_number(repo, branch)
plane_add_comment(
work_item_id,
usage_comment(
agent,
usage,
repo=repo,
branch=branch,
work_item_id=work_item_id,
pr_number=pr_number,
),
author=agent,
)
if agent == "deployer":
plane_add_comment(
work_item_id, task_summary_comment(task_id), author="deployer"
)
def _open_pr_number(self, repo: str, branch: str):
"""Return the open PR number for `branch`, or None. Never raises."""
try:
import httpx
owner = settings.gitea_owner
headers = {"Authorization": f"token {settings.gitea_token}"}
resp = httpx.get(
f"{settings.gitea_url}/api/v1/repos/{owner}/{repo}/pulls",
params={"state": "open", "head": branch},
headers=headers, timeout=5,
)
if resp.status_code == 200:
prs = resp.json()
if prs:
return prs[0].get("number")
except Exception:
pass
return None
def _ensure_pr(self, repo: str, branch: str, run_id: int):
import httpx
owner = settings.gitea_owner

View File

@@ -83,6 +83,12 @@ def init_db():
_ensure_column(conn, "agent_runs", "input_tokens", "INTEGER")
_ensure_column(conn, "agent_runs", "output_tokens", "INTEGER")
_ensure_column(conn, "agent_runs", "cache_read_tokens", "INTEGER")
# Observability fix: also persist cache-CREATION input tokens. Claude CLI
# reports the real input split across input_tokens (fresh, ~tens) +
# cache_read_input_tokens (cache hit, millions) + cache_creation_input_tokens
# (writing new cache). Without this column the cache_creation slice is lost
# and the "X in" figure understates the true prompt size. Idempotent ALTER.
_ensure_column(conn, "agent_runs", "cache_creation_tokens", "INTEGER")
_ensure_column(conn, "agent_runs", "cost_usd", "REAL")
conn.commit()
conn.close()

View File

@@ -343,6 +343,17 @@ def set_issue_blocked(work_item_id: str, project_id: str = None):
_set_issue_state_direct(work_item_id, PLANE_STATES["blocked"], project_id)
def set_issue_done(work_item_id: str, project_id: str = None):
    """Move the Plane issue into the terminal Done state.

    Called on the deploy->done success path so that a completed task always
    ends up in Plane's terminal state. The state UUID is read from the
    existing PLANE_STATES['done'] entry; the mapping itself is not modified.
    """
    done_state = PLANE_STATES["done"]
    _set_issue_state_direct(work_item_id, done_state, project_id)
def set_issue_in_progress(work_item_id: str, project_id: str = None):
    """Mark the Plane issue as 'In Progress' (an agent is working on it)."""
    in_progress_state = PLANE_STATES["in_progress"]
    _set_issue_state_direct(work_item_id, in_progress_state, project_id)

View File

@@ -47,6 +47,7 @@ from .plane_sync import (
set_issue_needs_input,
set_issue_in_progress,
set_issue_blocked,
set_issue_done,
)
from .config import settings
@@ -247,6 +248,22 @@ def advance_stage(
f"(auto-advance after {agent})"
)
# --- Terminal sync: deploy -> done must reach Plane's Done -----------
# When the deployer's check_deploy_status passes we advance to the
# terminal 'done' stage. Previously a merged-PR webhook completed the
# task out-of-band and Plane stuck on In Progress. Now done flows through
# here, so explicitly drive the Plane issue into the terminal Done state
# (PLANE_STATES['done'] — mapping unchanged) in addition to the
# stage-change comment above.
if next_stage == "done" and work_item_id:
try:
set_issue_done(work_item_id)
logger.info(
f"Task {task_id}: deploy->done, Plane state forced to Done"
)
except Exception as e:
logger.error(f"Task {task_id}: failed to set Plane Done: {e}")
# --- Launch the next agent (ORCH-4 fix: current_stage, not next) -----
next_agent = get_agent_for_stage(current_stage)
if next_agent:

View File

@@ -31,7 +31,8 @@ def parse_usage_from_text(text: str) -> dict | None:
top-level '{' ... '}' that parses and carries usage/total_cost_usd.
Returns a normalised dict
{input_tokens, output_tokens, cache_read_tokens, cost_usd}
{input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens,
cost_usd}
(ints / float, missing fields -> 0 / 0.0), or None if no usable JSON found.
"""
if not text:
@@ -71,6 +72,12 @@ def parse_usage_from_text(text: str) -> dict | None:
"cache_read_tokens": _int(
usage.get("cache_read_input_tokens", usage.get("cache_read_tokens"))
),
# The cache-CREATION slice (writing new cache entries) is part of the
# REAL input and used to be dropped on the floor. Persist it so the
# "X in" figure reflects the full prompt size, not just fresh tokens.
"cache_creation_tokens": _int(
usage.get("cache_creation_input_tokens", usage.get("cache_creation_tokens"))
),
"cost_usd": _float(cost),
}
@@ -150,11 +157,12 @@ def record_usage(run_id: int, usage: dict | None):
try:
conn.execute(
"UPDATE agent_runs SET input_tokens=?, output_tokens=?, "
"cache_read_tokens=?, cost_usd=? WHERE id=?",
"cache_read_tokens=?, cache_creation_tokens=?, cost_usd=? WHERE id=?",
(
usage.get("input_tokens"),
usage.get("output_tokens"),
usage.get("cache_read_tokens"),
usage.get("cache_creation_tokens"),
usage.get("cost_usd"),
run_id,
),
@@ -197,19 +205,132 @@ AGENT_DISPLAY = {
}
def usage_comment(agent: str, usage: dict | None) -> str:
def _input_total(usage: dict) -> int:
"""FULL input = fresh input + cache-read + cache-creation tokens."""
def _i(k):
try:
return int(usage.get(k) or 0)
except (TypeError, ValueError):
return 0
return _i("input_tokens") + _i("cache_read_tokens") + _i("cache_creation_tokens")
def _cached_total(usage: dict) -> int:
"""Cached portion of the input = cache-read + cache-creation tokens."""
def _i(k):
try:
return int(usage.get(k) or 0)
except (TypeError, ValueError):
return 0
return _i("cache_read_tokens") + _i("cache_creation_tokens")
def fmt_in(usage: dict) -> str:
    """Format the input figure as the full total, plus a cached breakdown.

    Yields e.g. '8.5M in (8.4M cached)' when part of the input was cached and
    '45.2k in' when the cached portion is zero.
    """
    full = _input_total(usage)
    cached_part = _cached_total(usage)
    if cached_part <= 0:
        return f"{fmt_tokens(full)} in"
    return f"{fmt_tokens(full)} in ({fmt_tokens(cached_part)} cached)"
def usage_comment(
agent: str,
usage: dict | None,
repo: str | None = None,
branch: str | None = None,
work_item_id: str | None = None,
pr_number=None,
) -> str:
"""Build the per-agent finish comment, e.g.
'\U0001f4bb Developer \u0433\u043e\u0442\u043e\u0432 \u00b7 45.2k in / 12.1k out \u00b7 $0.21'.
'\U0001f4bb Developer \u0433\u043e\u0442\u043e\u0432 \u00b7 8.5M in (8.4M cached) / 45.8k out \u00b7 $7.29'.
When repo/branch/work_item_id are supplied, the agent's artifact link(s) are
appended (BUG: only analyst used to link its docs). Missing artifacts are
silently skipped — link building never raises.
"""
usage = usage or {}
name = AGENT_DISPLAY.get(agent, agent.capitalize())
icon = AGENT_ICON.get(agent, "\u2705")
return (
line = (
f"{icon} {name} \u0433\u043e\u0442\u043e\u0432 \u00b7 "
f"{fmt_tokens(usage.get('input_tokens'))} in / "
f"{fmt_in(usage)} / "
f"{fmt_tokens(usage.get('output_tokens'))} out \u00b7 "
f"{fmt_cost(usage.get('cost_usd'))}"
)
links = artifact_links(agent, repo, branch, work_item_id, pr_number)
if links:
line += "\n" + "\n".join(links)
return line
# Per-agent artifact file under docs/work-items/{wid}/, keyed by agent name.
# Each value is a (link label, file name) pair consumed by artifact_links().
# architect and developer are intentionally absent: artifact_links() handles
# them separately (ADR directory / branch and PR links).
AGENT_ARTIFACT = {
    "reviewer": ("Review", "12-review.md"),
    "tester": ("Test report", "13-test-report.md"),
    "deployer": ("Deploy log", "14-deploy-log.md"),
}
def artifact_links(
agent: str,
repo: str | None,
branch: str | None,
work_item_id: str | None,
pr_number=None,
) -> list[str]:
"""Markdown link(s) to the finishing agent's artifact(s) in Gitea.
Uses gitea_public_url (falls back to gitea_url) for clickable links, mirroring
the analyst doc links. Returns [] (never raises) when there is nothing to
link or the required context is missing. analyst is intentionally NOT handled
here — its richer doc list lives in stage_engine._build_analyst_ready_comment.
"""
try:
from .config import settings
owner = getattr(settings, "gitea_owner", "admin")
base = (
getattr(settings, "gitea_public_url", "") or getattr(settings, "gitea_url", "")
).rstrip("/")
if not base or not repo:
return []
links: list[str] = []
if agent == "developer":
if branch:
links.append(
f"\U0001f4c2 [Branch {branch}]({base}/{owner}/{repo}/src/branch/{branch})"
)
if pr_number:
links.append(
f"\U0001f517 [PR #{pr_number}]({base}/{owner}/{repo}/pulls/{pr_number})"
)
return links
if agent == "architect":
if branch and work_item_id:
adr_dir = (
f"{base}/{owner}/{repo}/src/branch/{branch}/"
f"docs/work-items/{work_item_id}/06-adr"
)
links.append(f"\U0001f4d0 [ADR]({adr_dir})")
return links
spec = AGENT_ARTIFACT.get(agent)
if spec and branch and work_item_id:
label, fname = spec
href = (
f"{base}/{owner}/{repo}/src/branch/{branch}/"
f"docs/work-items/{work_item_id}/{fname}"
)
links.append(f"\U0001f4c4 [{label}]({href})")
return links
except Exception:
return []
AGENT_ICON = {
@@ -225,13 +346,22 @@ AGENT_ICON = {
def task_usage_summary(task_id: int) -> dict:
"""Aggregate agent_runs usage for a task.
Returns {total_in, total_out, total_cost, per_agent: [(agent, in, out, cost), ...]}.
total_in counts the FULL input (input + cache_read + cache_creation), and
total_cached counts the cached portion (cache_read + cache_creation).
COALESCE(...,0) keeps pre-existing rows (NULL cache_creation) from breaking.
Returns {total_in, total_cached, total_out, total_cost,
per_agent: [(agent, in, cached, out, cost), ...]}.
"""
conn = get_db()
try:
rows = conn.execute(
"SELECT agent, "
"COALESCE(SUM(input_tokens),0), "
"COALESCE(SUM(input_tokens),0) "
" + COALESCE(SUM(cache_read_tokens),0) "
" + COALESCE(SUM(cache_creation_tokens),0), "
"COALESCE(SUM(cache_read_tokens),0) "
" + COALESCE(SUM(cache_creation_tokens),0), "
"COALESCE(SUM(output_tokens),0), "
"COALESCE(SUM(cost_usd),0.0) "
"FROM agent_runs WHERE task_id=? GROUP BY agent ORDER BY agent",
@@ -239,12 +369,14 @@ def task_usage_summary(task_id: int) -> dict:
).fetchall()
finally:
conn.close()
per_agent = [(r[0], int(r[1]), int(r[2]), float(r[3])) for r in rows]
per_agent = [(r[0], int(r[1]), int(r[2]), int(r[3]), float(r[4])) for r in rows]
total_in = sum(r[1] for r in per_agent)
total_out = sum(r[2] for r in per_agent)
total_cost = sum(r[3] for r in per_agent)
total_cached = sum(r[2] for r in per_agent)
total_out = sum(r[3] for r in per_agent)
total_cost = sum(r[4] for r in per_agent)
return {
"total_in": total_in,
"total_cached": total_cached,
"total_out": total_out,
"total_cost": total_cost,
"per_agent": per_agent,
@@ -254,15 +386,26 @@ def task_usage_summary(task_id: int) -> dict:
def task_summary_comment(task_id: int) -> str:
"""Build the Deployer end-of-task summary comment (Feature 4, variant B)."""
s = task_usage_summary(task_id)
cached = s.get("total_cached", 0)
head_in = (
f"{fmt_tokens(s['total_in'])} \u0432\u0445\u043e\u0434 ({fmt_tokens(cached)} cached)"
if cached > 0
else f"{fmt_tokens(s['total_in'])} \u0432\u0445\u043e\u0434"
)
lines = [
f"\U0001f4ca \u0418\u0442\u043e\u0433\u043e \u043f\u043e \u0437\u0430\u0434\u0430\u0447\u0435: "
f"{fmt_tokens(s['total_in'])} \u0442\u043e\u043a\u0435\u043d\u043e\u0432 \u0432\u0445\u043e\u0434 / "
f"{head_in} / "
f"{fmt_tokens(s['total_out'])} \u0432\u044b\u0445\u043e\u0434 \u00b7 "
f"{fmt_cost(s['total_cost'])}"
]
for agent, ti, to, cost in s["per_agent"]:
for agent, ti, tc, to, cost in s["per_agent"]:
name = AGENT_DISPLAY.get(agent, agent.capitalize())
in_str = (
f"{fmt_tokens(ti)} in ({fmt_tokens(tc)} cached)"
if tc > 0
else f"{fmt_tokens(ti)} in"
)
lines.append(
f"\u2022 {name}: {fmt_tokens(ti)} in / {fmt_tokens(to)} out \u00b7 {fmt_cost(cost)}"
f"\u2022 {name}: {in_str} / {fmt_tokens(to)} out \u00b7 {fmt_cost(cost)}"
)
return "\n".join(lines)

View File

@@ -334,6 +334,20 @@ async def handle_pr(payload: dict):
logger.error(f"Task {task_id}: max retries reached, needs manual intervention")
elif action == "closed" and pr.get("merged", False):
# BUG 8 (second door): at the deploy stage `done` is gated by the
# deployer's verdict (check_deploy_status via advance_stage), NOT by the
# fact that the PR was merged. The deployer merges the PR at the START of
# its run, so a merged webhook arrives ~30s later while the deployer is
# still working — blindly setting done here would fake-complete the task
# and discard a later deploy_status: FAILED verdict. advance_stage will
# drive deploy→done (and Plane→Done) when the deployer job finishes.
# For every OTHER stage the merge-driven done behaviour is preserved.
if current_stage == "deploy":
logger.info(
f"Task {task_id}: PR merged at deploy stage — done gated by "
f"deployer verdict (check_deploy_status), ignoring merge-driven done."
)
return
update_task_stage(task_id, "done")
notify_stage_change(task_id, current_stage, "done")
logger.info(f"Task {task_id}: PR merged, stage → done")