feat(cancel): STOP-status task cancellation + relaunch-hole close (ORCH-090)
Introduce the dedicated Plane STOP status as a single declarative task-cancel
mechanism: stop the active agent (graceful SIGTERM cascade), cancel all jobs
(terminal `cancelled`, never requeued), remove the worktree + delete the remote
feature branch (never main, never force-push), drive the task to the new
system-terminal state `cancelled` and tombstone the natural keys so a later
"To Analyse" re-creates it from scratch (docs artefacts preserved). STOP during a
critical merge/deploy window is deferred until the irreversible step finishes
honestly. Also closes the relaunch hole: handle_status_start relaunch is gated to
the `analysis` stage; the only pipeline-start entry point remains "To Analyse".
Cross-cutting (adr-0026): the "task terminal" predicate is widened {done} ->
{done, cancelled} in serial_gate / task_deps / stages sink + reaper/worker
requeue guards. STAGE_TRANSITIONS exit-gates / QG_CHECKS / check_* are unchanged
(`cancelled` is a sink, not a new edge). Additive, never-raise, restart-safe,
under kill-switch ORCH_STOP_STATUS_ENABLED (off -> zero regression).
New: src/cancel.py (leaf), src/gitea.py (delete_remote_branch), tasks columns
cancelled_at/cancel_requested_at, jobs status `cancelled`, GET /queue `stop` block.
Tests: tests/test_stop_status.py (TC-01..TC-14 + D7); full suite green (1345).
Docs updated in-PR (architecture README, CLAUDE.md, README.md, .env.example,
CHANGELOG). ADR-001 D4 refinement: plane_issue_id is tombstoned too (the lookup
ORs on it) — original UUID recoverable from the parseable suffix.
Refs: ORCH-090
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
208
src/db.py
208
src/db.py
@@ -59,7 +59,7 @@ def init_db():
|
||||
repo TEXT NOT NULL,
|
||||
task_id INTEGER, -- FK tasks.id (nullable)
|
||||
task_content TEXT, -- written to the agent task_file
|
||||
status TEXT NOT NULL DEFAULT 'queued', -- queued|running|done|failed
|
||||
status TEXT NOT NULL DEFAULT 'queued', -- queued|running|done|failed|cancelled (ORCH-090: cancelled is a terminal outcome, never requeued)
|
||||
attempts INTEGER NOT NULL DEFAULT 0,
|
||||
max_attempts INTEGER NOT NULL DEFAULT 2,
|
||||
run_id INTEGER, -- agent_runs.id once started
|
||||
@@ -129,6 +129,17 @@ def init_db():
|
||||
# tracker can show "твоё время" without recomputing from activity history.
|
||||
_ensure_column(conn, "tasks", "brd_review_started_at", "TEXT")
|
||||
_ensure_column(conn, "tasks", "brd_review_ended_at", "TEXT")
|
||||
# ORCH-090 (08-data-requirements.md): STOP-cancellation durable markers. Both are
|
||||
# additive, idempotent (_ensure_column is a no-op once present) -> safe on the live
|
||||
# shared prod DB (enduro untouched). The durable terminal itself is tasks.stage=
|
||||
# 'cancelled' (already understood by the reconciler terminal-skip); these columns
|
||||
# are audit/observability + the deferred-cancel signal.
|
||||
# cancelled_at -> timestamp the task was cancelled (NULL otherwise).
|
||||
# cancel_requested_at -> STOP arrived inside a critical merge/deploy window
|
||||
# (ADR-001 D7): cancellation is DEFERRED until the
|
||||
# irreversible step finishes honestly, then applied.
|
||||
_ensure_column(conn, "tasks", "cancelled_at", "TEXT")
|
||||
_ensure_column(conn, "tasks", "cancel_requested_at", "TEXT")
|
||||
# ORCH-026 (Level B): declarative task dependencies. job_deps stores the
|
||||
# directed edge "task_id (B) is blocked-by depends_on_task_id (A)". The
|
||||
# scheduler gate in claim_next_job keeps B queued until every A reaches
|
||||
@@ -231,6 +242,13 @@ def get_active_tasks_for_reconcile() -> list[dict]:
|
||||
``age_s`` = seconds since ``tasks.updated_at`` (computed in SQL against UTC
|
||||
'now', matching how ``update_task_stage`` stamps ``updated_at``). The
|
||||
reconciler applies the per-stage grace and active-job guard on top.
|
||||
|
||||
ORCH-090 (adr-0026): a ``cancelled`` task is DELIBERATELY still returned here
|
||||
and skipped by the reconciler's own terminal-skip (``stage in
|
||||
('done','cancelled')``, ORCH-086 D2) — narrowing the query to exclude
|
||||
``cancelled`` would lose the observability skip-counter increment that ORCH-086
|
||||
relies on. The terminal set is harmonised in the *scheduler* predicates
|
||||
(serial_gate / task_deps), not here.
|
||||
"""
|
||||
conn = get_db()
|
||||
try:
|
||||
@@ -605,7 +623,9 @@ def claim_next_job() -> dict | None:
|
||||
dep_gate = (
|
||||
"AND NOT EXISTS ("
|
||||
" SELECT 1 FROM job_deps d JOIN tasks t ON t.id = d.depends_on_task_id "
|
||||
" WHERE d.task_id = jobs.task_id AND t.stage != 'done'"
|
||||
# ORCH-090 (adr-0026): a cancelled predecessor is TERMINAL -> the
|
||||
# dependent must NOT wait on it forever. Terminal set = {done,cancelled}.
|
||||
" WHERE d.task_id = jobs.task_id AND t.stage NOT IN ('done','cancelled')"
|
||||
") "
|
||||
)
|
||||
# ORCH-088 (FR-1, ADR-001 D1): per-repo serial gate. An analyst-job of a NEW
|
||||
@@ -683,11 +703,11 @@ def mark_job(
|
||||
run_id: int | None = None,
|
||||
error: str | None = None,
|
||||
):
|
||||
"""Update a job's status (queued|running|done|failed).
|
||||
"""Update a job's status (queued|running|done|failed|cancelled).
|
||||
|
||||
- run_id (optional): link to the agent_runs row that executed this job.
|
||||
- error (optional): last error message (for failed/retry).
|
||||
- 'done'/'failed' also stamp finished_at.
|
||||
- 'done'/'failed'/'cancelled' (ORCH-090) also stamp finished_at.
|
||||
- 'queued' (requeue for retry) clears started_at/finished_at so the next
|
||||
claim treats it as fresh.
|
||||
"""
|
||||
@@ -700,7 +720,7 @@ def mark_job(
|
||||
if error is not None:
|
||||
sets.append("error = ?")
|
||||
params.append(error)
|
||||
if status in ("done", "failed"):
|
||||
if status in ("done", "failed", "cancelled"):
|
||||
sets.append("finished_at = datetime('now')")
|
||||
elif status == "queued":
|
||||
sets.append("started_at = NULL")
|
||||
@@ -728,6 +748,178 @@ def has_active_job_for_task(task_id: int) -> bool:
|
||||
return row is not None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ORCH-090: STOP-cancellation helpers (task + jobs terminal state)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_task(task_id: int) -> dict | None:
    """Look up one row of ``tasks`` by its primary key.

    Returns the row converted to a plain ``dict``, or ``None`` when no task
    has that ``id``. The connection obtained from ``get_db()`` is always
    closed before returning.
    """
    db = get_db()
    try:
        cursor = db.execute("SELECT * FROM tasks WHERE id = ?", (task_id,))
        record = cursor.fetchone()
    finally:
        db.close()
    if not record:
        return None
    return dict(record)
|
||||
|
||||
|
||||
def get_active_jobs_for_task(task_id: int) -> list[dict]:
    """ORCH-090: list the still-active jobs of a task, oldest first.

    "Active" means ``status`` is ``queued`` or ``running``. Every column of
    each matching ``jobs`` row is returned (``SELECT *``) as a plain dict,
    ordered by ``id``. Intended for the STOP flow, which needs the job rows
    to stop the running agent before the jobs are flipped to ``cancelled``.
    """
    query = (
        "SELECT * FROM jobs WHERE task_id = ? AND status IN ('queued','running') "
        "ORDER BY id"
    )
    db = get_db()
    try:
        records = db.execute(query, (task_id,)).fetchall()
    finally:
        db.close()
    return list(map(dict, records))
|
||||
|
||||
|
||||
def cancel_jobs_for_task(task_id: int, only_queued: bool = False) -> int:
    """ORCH-090 (ADR-001 D3): move a task's jobs to the terminal ``cancelled`` status.

    The UPDATE is guarded by the job's current status:

    * default          -> jobs that are ``queued`` or ``running``;
    * ``only_queued``  -> only ``queued`` jobs. This is the deferred-cancel
      path (D7) used inside a critical merge/deploy window, where the job
      that is still running must be left alone.

    Each cancelled job also gets ``finished_at`` stamped with the current
    time. Returns how many jobs were cancelled. Never raises: any error
    yields ``0``.
    """
    # The status list is one of two fixed literals, never caller-supplied
    # text, so splicing it into the statement is safe.
    if only_queued:
        status_list = "('queued')"
    else:
        status_list = "('queued','running')"
    statement = (
        "UPDATE jobs SET status='cancelled', finished_at=datetime('now') "
        "WHERE task_id = ? AND status IN " + status_list
    )
    try:
        db = get_db()
        try:
            cursor = db.execute(statement, (task_id,))
            db.commit()
            affected = cursor.rowcount
        finally:
            db.close()
    except Exception:
        return 0
    return affected if affected else 0
|
||||
|
||||
|
||||
def mark_task_cancelled(task_id: int) -> bool:
    """ORCH-090 (ADR-001 D4): durable terminal + natural-key tombstone for a task.

    Reads the task's natural keys, then applies everything in a single UPDATE:

    * ``stage='cancelled'`` (the durable terminal state);
    * ``cancelled_at`` is stamped with the current time the FIRST time only;
      a repeat STOP on an already-cancelled row keeps the original audit
      timestamp instead of overwriting it;
    * ``cancel_requested_at=NULL`` (clears any deferred-cancel flag);
    * ``plane_id`` / ``work_item_id`` / ``plane_issue_id`` are TOMBSTONED by
      appending the deterministic ``#cancelled-<id>`` suffix, so the row no
      longer matches lookups by the original keys and a later "To Analyse"
      can re-create the task from scratch. The row itself is NOT deleted
      (durable audit).

    ADR-001 D4 refinement (ORCH-090): the ADR proposed leaving
    ``plane_issue_id`` untouched, but the task lookups match on
    ``plane_id OR plane_issue_id``; leaving it matchable would keep the
    cancelled row findable and block the clean-slate re-create. It is
    therefore suffixed too. The suffix is deterministic and parseable, so the
    original value stays recoverable for audit.

    Idempotent: a key that already ends with the suffix is left unchanged, so
    a repeat STOP never double-suffixes. A NULL key is treated as an empty
    string and becomes just the suffix.

    Returns True iff the task row existed and was updated. Never raises: any
    error yields False.
    """
    suffix = f"#cancelled-{task_id}"

    def _tomb(value):
        # The tombstone is always appended at the end, so test for a suffix
        # rather than a substring occurring anywhere in the key.
        value = value or ""
        return value if value.endswith(suffix) else f"{value}{suffix}"

    try:
        conn = get_db()
        try:
            row = conn.execute(
                "SELECT plane_id, work_item_id, plane_issue_id FROM tasks WHERE id = ?",
                (task_id,),
            ).fetchone()
            if not row:
                return False
            cur = conn.execute(
                "UPDATE tasks SET stage='cancelled', "
                # COALESCE keeps the first cancellation time on a repeat STOP.
                "cancelled_at=COALESCE(cancelled_at, datetime('now')), "
                "cancel_requested_at=NULL, plane_id=?, work_item_id=?, plane_issue_id=?, "
                "updated_at=datetime('now') WHERE id = ?",
                (
                    _tomb(row["plane_id"]),
                    _tomb(row["work_item_id"]),
                    _tomb(row["plane_issue_id"]),
                    task_id,
                ),
            )
            conn.commit()
            # Report success only if the UPDATE actually touched the row (it
            # could have disappeared between the SELECT and the UPDATE).
            return cur.rowcount > 0
        finally:
            conn.close()
    except Exception:
        return False
|
||||
|
||||
|
||||
def set_task_cancel_requested(task_id: int) -> bool:
    """ORCH-090 (ADR-001 D7): mark a deferred cancellation (STOP in critical window).

    Stamps ``tasks.cancel_requested_at`` with the current time, but only when
    it is still NULL, so the first request time is preserved and repeat calls
    are idempotent. The flag is meant to be read once the irreversible
    merge/deploy step completes, at which point the full cancellation is
    applied.

    Returns:
        True when the task exists and carries the flag after the call
        (freshly stamped or already stamped earlier). False when no task has
        this id, or on any error (never raises).
    """
    try:
        conn = get_db()
        try:
            cur = conn.execute(
                "UPDATE tasks SET cancel_requested_at=datetime('now') "
                "WHERE id = ? AND cancel_requested_at IS NULL",
                (task_id,),
            )
            conn.commit()
            if cur.rowcount:
                return True
            # Nothing was stamped: either the flag was already set (an
            # idempotent success) or the task does not exist. Tell the two
            # apart so a missing task is not reported as a recorded request.
            existing = conn.execute(
                "SELECT 1 FROM tasks WHERE id = ?", (task_id,)
            ).fetchone()
            return existing is not None
        finally:
            conn.close()
    except Exception:
        return False
|
||||
|
||||
|
||||
def cancelled_tasks_snapshot(limit: int = 10) -> dict:
    """ORCH-090 (AC-10): read-only cancellation summary for GET /queue.

    Args:
        limit: maximum number of entries in ``recent``; bound directly to the
            SQL ``LIMIT``.

    Returns:
        ``{"count", "pending", "recent"}`` where

        * ``count``   - number of tasks with ``stage='cancelled'``;
        * ``pending`` - number of tasks with ``cancel_requested_at`` set whose
          stage is not yet ``cancelled`` (deferred, not-yet-applied cancels);
        * ``recent``  - up to ``limit`` cancelled tasks, newest first, each as
          ``{"work_item_id", "repo", "cancelled_at"}``.

        Never raises: on any error the minimal
        ``{"count": 0, "pending": 0, "recent": []}`` is returned.
    """
    try:
        conn = get_db()
        try:
            count = conn.execute(
                "SELECT COUNT(*) FROM tasks WHERE stage='cancelled'"
            ).fetchone()[0]
            pending = conn.execute(
                "SELECT COUNT(*) FROM tasks WHERE cancel_requested_at IS NOT NULL "
                "AND stage != 'cancelled'"
            ).fetchone()[0]
            # cancelled_at is stamped via datetime('now'), which has one-second
            # resolution; the id tiebreak keeps the order stable for tasks
            # cancelled within the same second.
            rows = conn.execute(
                "SELECT work_item_id, repo, cancelled_at FROM tasks "
                "WHERE stage='cancelled' ORDER BY cancelled_at DESC, id DESC LIMIT ?",
                (limit,),
            ).fetchall()
            recent = [
                {
                    "work_item_id": r["work_item_id"],
                    "repo": r["repo"],
                    "cancelled_at": r["cancelled_at"],
                }
                for r in rows
            ]
        finally:
            conn.close()
        return {"count": int(count), "pending": int(pending), "recent": recent}
    except Exception:
        return {"count": 0, "pending": 0, "recent": []}
|
||||
|
||||
|
||||
def count_running_jobs() -> int:
|
||||
"""Number of jobs currently in 'running' status (for max_concurrency)."""
|
||||
conn = get_db()
|
||||
@@ -815,7 +1007,7 @@ def reap_running_job(
|
||||
if error is not None:
|
||||
sets.append("error = ?")
|
||||
params.append(error)
|
||||
if status in ("done", "failed"):
|
||||
if status in ("done", "failed", "cancelled"): # ORCH-090: cancelled is terminal
|
||||
sets.append("finished_at = datetime('now')")
|
||||
elif status == "queued":
|
||||
sets.append("started_at = NULL")
|
||||
@@ -948,7 +1140,9 @@ def get_unfinished_dependencies(task_id: int) -> list[dict]:
|
||||
rows = conn.execute(
|
||||
"SELECT t.id AS id, t.work_item_id AS work_item_id, t.stage AS stage "
|
||||
"FROM job_deps d JOIN tasks t ON t.id = d.depends_on_task_id "
|
||||
"WHERE d.task_id = ? AND t.stage != 'done'",
|
||||
# ORCH-090 (adr-0026): {done,cancelled} are both terminal -> a
|
||||
# cancelled predecessor no longer blocks the dependent.
|
||||
"WHERE d.task_id = ? AND t.stage NOT IN ('done','cancelled')",
|
||||
(task_id,),
|
||||
).fetchall()
|
||||
finally:
|
||||
|
||||
Reference in New Issue
Block a user