fix(deploy): clear stale self-deploy markers on rollback; document env
Re-deploy after a FAILED prod deploy wedged the task on `deploy`: the sentinel markers (approve-requested/initiated/result) are keyed by the stable work_item_id, so after the БАГ-8 rollback (deploy -> development) and a developer fix, Phase B's idempotency-guard saw a STALE `initiated` and became a no-op — the detached hook never re-launched and the finalizer was never enqueued. Add self_deploy.clear_state (never-raise, idempotent) and call it on the check_deploy_status FAILED rollback and at the start of Phase A, so every fresh prod-deploy pass starts clean. Also document the new ORCH_SELF_DEPLOY_* / ORCH_DEPLOY_* descriptors in the canonical .env.example (CLAUDE.md rule #8, ТЗ §2.6), modelled on the ORCH-043 merge-gate block (placeholders only, secrets not committed). Contracts untouched: STAGE_TRANSITIONS, QG_CHECKS, _parse_deploy_status, БАГ-8, merge-gate. Refs: ORCH-036 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -29,6 +29,7 @@ container (reads markers) and the host (writes ``result``):
|
||||
import logging
|
||||
import os
|
||||
import shlex
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
from .config import settings
|
||||
@@ -160,6 +161,31 @@ def write_marker(repo: str, work_item_id: str | None, name: str, content: str =
|
||||
return False
|
||||
|
||||
|
||||
def clear_state(repo: str, work_item_id: str | None) -> bool:
    """Remove ALL deploy-state sentinels for this work item (best-effort).

    Sentinels are keyed by ``work_item_id`` (stable for the whole task
    lifetime), so a FAILED prod-deploy leaves ``approve-requested`` /
    ``initiated`` / ``result`` behind. Without cleanup, after the БАГ-8
    rollback (deploy -> development) and a fix, the task reaching ``deploy``
    again would hit Phase B's idempotency-guard: the STALE ``initiated`` makes
    it a no-op, the detached hook never re-launches and the task wedges on
    ``deploy`` forever (re-deploy-after-rollback contract broken; AC-4/AC-10).
    A stale ``result`` would likewise be mis-read by the new finalizer.

    Clearing the whole state dir restores a clean slate for the next pass.

    Args:
        repo: Repository identifier used to locate the state directory.
        work_item_id: Work item whose sentinels are removed; may be ``None``.

    Returns:
        ``True`` when the state directory was removed or did not exist
        (idempotent: a missing dir is success); ``False`` when removal failed.

    Never raises: any failure is logged as a warning and reported as ``False``.
    """
    try:
        # Resolve the path inside the ``try`` so a failure while building it
        # (not only while deleting it) is also covered by the never-raise
        # contract that unguarded callers rely on.
        state_dir = container_state_dir(repo, work_item_id)
        shutil.rmtree(state_dir)
    except FileNotFoundError:
        # Nothing to clear: already clean, which is the desired end state.
        return True
    except Exception as e:  # noqa: BLE001 - never-raise contract
        logger.warning("clear_state error for %s/%s: %s", repo, work_item_id, e)
        return False
    logger.info("clear_state: removed deploy-state dir %s", state_dir)
    return True
|
||||
|
||||
|
||||
def read_result(repo: str, work_item_id: str | None) -> tuple[bool, int | None]:
|
||||
"""Read the ``result`` sentinel (hook exit-code written by the host wrapper).
|
||||
|
||||
|
||||
@@ -622,6 +622,16 @@ def _handle_qg_failure_rollbacks(
|
||||
notify_stage_change(task_id, current_stage, "development")
|
||||
plane_notify_stage(work_item_id, current_stage, "development")
|
||||
result.rolled_back_to = "development"
|
||||
# ORCH-036: clear the deploy-state sentinels (approve-requested / initiated /
|
||||
# result) so the NEXT prod-deploy pass (after the developer fixes and the task
|
||||
# returns to `deploy`) is not wedged by Phase B's idempotency-guard reading a
|
||||
# STALE `initiated`, nor the finalizer mis-reading a STALE `result`. Markers are
|
||||
# keyed by work_item_id (stable across the rollback), so without this they
|
||||
# survive into the retry and break re-deploy-after-rollback (AC-4/AC-10).
|
||||
try:
|
||||
self_deploy.clear_state(repo, work_item_id)
|
||||
except Exception as e: # noqa: BLE001 - defensive (clear_state never-raises anyway)
|
||||
logger.warning(f"Task {task_id}: deploy-state clear on deploy-fail failed: {e}")
|
||||
# ORCH-043: deploy failed -> no merge will complete; release the lease so the
|
||||
# next task isn't blocked until the lease ages out (holder-aware no-op).
|
||||
try:
|
||||
@@ -821,6 +831,12 @@ def _handle_self_deploy_phase_a(
|
||||
|
||||
if work_item_id:
|
||||
set_issue_in_review(work_item_id)
|
||||
# ORCH-036: belt-and-suspenders — wipe any STALE deploy-state markers before
|
||||
# arming a fresh approve. A prior FAILED pass clears on rollback, but clearing
|
||||
# here too guarantees the entry to every new prod-deploy pass starts clean
|
||||
# (e.g. after a crash/manual intervention), so `initiated`/`result` from an
|
||||
# earlier attempt can never leak into this one.
|
||||
self_deploy.clear_state(repo, work_item_id)
|
||||
self_deploy.write_marker(
|
||||
repo, work_item_id, self_deploy.APPROVE_REQUESTED, content=str(time.time())
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user