fix(stage-engine): durable transition-ownership lease + expected-stage CAS (ORCH-114)
Close the root class of the ORCH-110/111/112/113 incident chain: side-effectful stage transitions had no single ownership. `advance_stage` is re-enterable and wrote the stage with a bare `UPDATE ... WHERE id=?` (no compare-and-swap), while >=5 actors (monitor / Plane-webhook / reconciler F-1 / job-reaper / deploy-finalizer) enter the same transition independently. A concurrent or post-restart re-entry therefore re-applied irreversible effects (merge_pr / coverage-ratchet / image-rebuild / prod-deploy initiation) and produced a contradictory rollback<->done (incident ORCH-111, job 1914 / PR #130). Two complementary layers, both additive, under one kill-switch, never-raise: 1. Durable transition-lease (new table `transition_lease`) — owner-exclusion on ENTRY to the side-effectful region: a second actor that sees a LIVE owner does not start the heavy sub-gates at all (prevention, not post-hoc repair). 2. Expected-stage CAS (`db.update_task_stage_cas`) — atomicity on the stage WRITE: a lost race aborts with NO side effect. Also closes the 6 paths that write the stage in bypass of advance_stage (gitea x5 + plane rollback). Owner liveness = owner_pid + owner_boot_id (NOT a heartbeat — a blocking 900s merge re-test cannot beat one; ADR-001 D3), making restart recovery free (a fresh boot_id renders every prior lease stale -> reclaimed by recover_on_startup). The lease has no own TTL: its hard age ceiling is the reaper Tier-3 backstop reaper_max_running_s, so the cross-cutting budget invariant ORCH-065/109/110/113 is untouched. Generalises ORCH-113 finalizer-liveness (process-local, Tier-2, deploy-staging) to a durable cross-path lease: the reaper consults it on all relevant paths (defer live, reclaim dead; Tier-3 ignores the marker -> bounded; a reap force-releases the lease); reconciler F-1 and the Plane webhook defer on an active lease; main.lifespan calls recover_on_startup() after requeue_running_jobs. finalizer_liveness.py is unchanged (it remains the kill-switch-off fallback). Scope self-hosting (transition_lease_repos="" -> orchestrator only; enduro untouched). Kill-switch ORCH_TRANSITION_LEASE_ENABLED=false -> CAS degenerates to the prior unconditional update_task_stage, lease inert, reaper -> ORCH-113 fallback (byte-for- byte pre-ORCH-114). STAGE_TRANSITIONS / QG_CHECKS / check_* / machine-verdict keys / existing table schemas — byte-for-byte (one additive table, no epoch column on tasks). Observability: read-only `transition_lease` block in GET /queue + a Telegram alert on forced/stale reclaim + optional POST /transition-lease/release?work_item=<id>. Coverage: tests/test_orch114_transition_ownership.py (TC-01 mandatory regression of the ORCH-111 class — red before fix, green after; TC-02..TC-14). Full suite green (2048 passed); the 4 webhook tests that spied on the removed gitea.update_task_stage were updated to spy on the new commit_stage_cas write path. ADR: docs/work-items/ORCH-114/06-adr/ADR-001-transition-ownership-lease-and-stage-cas.md Cross-cutting: docs/architecture/adr/adr-0045-transition-ownership-lease-and-stage-cas.md Refs: ORCH-114 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -396,15 +396,19 @@ def _mock_db_with_retry_count(count):
|
||||
@patch("src.webhooks.gitea.notify_error")
|
||||
@patch("src.webhooks.gitea.notify_qg_failure")
|
||||
@patch("src.webhooks.gitea.enqueue_job")
|
||||
@patch("src.webhooks.gitea.update_task_stage")
|
||||
@patch("src.webhooks.gitea.transition_lease.commit_stage_cas")
|
||||
@patch("src.webhooks.gitea.get_db")
|
||||
@patch("src.webhooks.gitea.get_task_by_repo_branch")
|
||||
@patch("src.webhooks.gitea.get_project_by_repo")
|
||||
def test_ci_failure_development_retries_developer_under_limit(
|
||||
mock_proj, mock_task, mock_get_db, mock_update_stage,
|
||||
mock_proj, mock_task, mock_get_db, mock_commit_cas,
|
||||
mock_enqueue, mock_qg, mock_err,
|
||||
):
|
||||
"""retry_count < MAX_DEV_RETRIES → relaunch developer, stage untouched."""
|
||||
"""retry_count < MAX_DEV_RETRIES → relaunch developer, stage untouched.
|
||||
|
||||
ORCH-114: the CI-failure path never writes the stage (no advance) -> the
|
||||
expected-stage CAS write helper is never invoked.
|
||||
"""
|
||||
from src.webhooks.gitea import handle_ci_status
|
||||
|
||||
mock_proj.return_value = {"repo": "enduro-trails"}
|
||||
@@ -423,19 +427,19 @@ def test_ci_failure_development_retries_developer_under_limit(
|
||||
assert mock_enqueue.call_args[0][0] == "developer"
|
||||
# No escalation.
|
||||
assert not mock_err.called
|
||||
# Stage stays on development — no update_task_stage in the CI-failure path.
|
||||
assert not mock_update_stage.called
|
||||
# Stage stays on development — no stage write in the CI-failure path.
|
||||
assert not mock_commit_cas.called
|
||||
|
||||
|
||||
@patch("src.webhooks.gitea.notify_error")
|
||||
@patch("src.webhooks.gitea.notify_qg_failure")
|
||||
@patch("src.webhooks.gitea.enqueue_job")
|
||||
@patch("src.webhooks.gitea.update_task_stage")
|
||||
@patch("src.webhooks.gitea.transition_lease.commit_stage_cas")
|
||||
@patch("src.webhooks.gitea.get_db")
|
||||
@patch("src.webhooks.gitea.get_task_by_repo_branch")
|
||||
@patch("src.webhooks.gitea.get_project_by_repo")
|
||||
def test_ci_failure_development_escalates_at_limit(
|
||||
mock_proj, mock_task, mock_get_db, mock_update_stage,
|
||||
mock_proj, mock_task, mock_get_db, mock_commit_cas,
|
||||
mock_enqueue, mock_qg, mock_err,
|
||||
):
|
||||
"""retry_count >= MAX_DEV_RETRIES → escalate via notify_error, no relaunch."""
|
||||
@@ -458,8 +462,8 @@ def test_ci_failure_development_escalates_at_limit(
|
||||
err_msg = " ".join(str(a) for a in mock_err.call_args[0])
|
||||
assert "Max developer retries" in err_msg
|
||||
assert "after CI failure" in err_msg
|
||||
# Stage untouched.
|
||||
assert not mock_update_stage.called
|
||||
# Stage untouched (no stage write).
|
||||
assert not mock_commit_cas.called
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -483,11 +487,11 @@ def _merged_pr_payload(branch="feature/ET-012-x"):
|
||||
|
||||
|
||||
@patch("src.webhooks.gitea.notify_stage_change")
|
||||
@patch("src.webhooks.gitea.update_task_stage")
|
||||
@patch("src.webhooks.gitea.transition_lease.commit_stage_cas")
|
||||
@patch("src.webhooks.gitea.get_task_by_repo_branch")
|
||||
@patch("src.webhooks.gitea.get_project_by_repo")
|
||||
def test_merge_on_deploy_stage_does_not_set_done(
|
||||
mock_proj, mock_task, mock_update_stage, mock_notify,
|
||||
mock_proj, mock_task, mock_commit_cas, mock_notify,
|
||||
):
|
||||
"""FIX 1: merge at deploy stage is ignored — done is gated by deployer verdict."""
|
||||
from src.webhooks.gitea import handle_pr
|
||||
@@ -499,28 +503,34 @@ def test_merge_on_deploy_stage_does_not_set_done(
|
||||
|
||||
asyncio.run(handle_pr(_merged_pr_payload()))
|
||||
|
||||
# The merge-driven done path must NOT run on deploy.
|
||||
assert not mock_update_stage.called
|
||||
# The merge-driven done path must NOT run on deploy (no stage write).
|
||||
assert not mock_commit_cas.called
|
||||
assert not mock_notify.called
|
||||
|
||||
|
||||
@patch("src.webhooks.gitea.notify_stage_change")
|
||||
@patch("src.webhooks.gitea.update_task_stage")
|
||||
@patch("src.webhooks.gitea.transition_lease.commit_stage_cas")
|
||||
@patch("src.webhooks.gitea.get_task_by_repo_branch")
|
||||
@patch("src.webhooks.gitea.get_project_by_repo")
|
||||
def test_merge_on_non_deploy_stage_sets_done(
|
||||
mock_proj, mock_task, mock_update_stage, mock_notify,
|
||||
mock_proj, mock_task, mock_commit_cas, mock_notify,
|
||||
):
|
||||
"""FIX 1: merge behaviour is preserved for non-deploy stages (e.g. review)."""
|
||||
"""FIX 1: merge behaviour is preserved for non-deploy stages (e.g. review).
|
||||
|
||||
ORCH-114: the merge-driven done write now goes through the expected-stage CAS
|
||||
helper (commit_stage_cas(task_id, current_stage, "done", repo)); on a won CAS the
|
||||
notify still fires.
|
||||
"""
|
||||
from src.webhooks.gitea import handle_pr
|
||||
|
||||
mock_proj.return_value = {"repo": "enduro-trails"}
|
||||
mock_task.return_value = {
|
||||
"id": 2, "stage": "review", "work_item_id": "ET-013",
|
||||
}
|
||||
mock_commit_cas.return_value = True
|
||||
|
||||
asyncio.run(handle_pr(_merged_pr_payload(branch="feature/ET-013-x")))
|
||||
|
||||
# Non-deploy stages still get the merge-driven done.
|
||||
mock_update_stage.assert_called_once_with(2, "done")
|
||||
# Non-deploy stages still get the merge-driven done (review -> done via CAS).
|
||||
mock_commit_cas.assert_called_once_with(2, "review", "done", "enduro-trails")
|
||||
assert mock_notify.called
|
||||
|
||||
Reference in New Issue
Block a user