fix(deploy): clear stale self-deploy markers on rollback; document env

Re-deploy after a FAILED prod deploy wedged the task on `deploy`: the sentinel markers (approve-requested/initiated/result) are keyed by the stable work_item_id, so after the БАГ-8 rollback (deploy -> development) and a developer fix, Phase B's idempotency-guard saw a STALE `initiated` and became a no-op — the detached hook never re-launched and the finalizer was never enqueued. Add self_deploy.clear_state (never-raise, idempotent) and call it on the check_deploy_status FAILED rollback and at the start of Phase A, so every fresh prod-deploy pass starts clean. Also document the new ORCH_SELF_DEPLOY_* / ORCH_DEPLOY_* descriptors in the canonical .env.example (CLAUDE.md rule #8, ТЗ §2.6), modelled on the ORCH-043 merge-gate block (placeholders only, secrets not committed). Contracts untouched: STAGE_TRANSITIONS, QG_CHECKS, _parse_deploy_status, БАГ-8, merge-gate. Refs: ORCH-036 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-06 20:42:38 +00:00
parent 9f43e6a0ae
commit d79defeadd
6 changed files with 138 additions and 0 deletions
--- a/tests/test_deploy_hook_mapping.py
+++ b/tests/test_deploy_hook_mapping.py
@@ -11,6 +11,7 @@ import os
 os.environ.setdefault("ORCH_PLANE_API_TOKEN", "test-token")
 os.environ.setdefault("ORCH_GITEA_TOKEN", "test-token")

+from src import self_deploy  # noqa: E402
 from src.self_deploy import map_exit_code_to_status, build_deploy_log  # noqa: E402


@@ -45,3 +46,21 @@ def test_deploy_log_frontmatter_carries_status():
    body_fail = build_deploy_log("ORCH-036", 2, "FAILED")
    assert "deploy_status: FAILED" in body_fail
    assert "hook_exit_code: 2" in body_fail
+
+
+def test_clear_state_removes_all_markers_and_is_idempotent(monkeypatch, tmp_path):
+    """clear_state wipes the whole work-item state dir (all sentinels) and treats a
+    missing dir as success, so a re-deploy after rollback starts from a clean slate."""
+    monkeypatch.setattr(self_deploy.settings, "repos_dir", str(tmp_path))
+    repo, wi = "orchestrator", "ORCH-036"
+    self_deploy.write_marker(repo, wi, self_deploy.APPROVE_REQUESTED, "t")
+    self_deploy.write_marker(repo, wi, self_deploy.INITIATED, "t")
+    self_deploy.write_marker(repo, wi, self_deploy.RESULT, "1")
+    assert self_deploy.has_marker(repo, wi, self_deploy.INITIATED) is True
+
+    assert self_deploy.clear_state(repo, wi) is True
+    assert self_deploy.has_marker(repo, wi, self_deploy.APPROVE_REQUESTED) is False
+    assert self_deploy.has_marker(repo, wi, self_deploy.INITIATED) is False
+    assert self_deploy.has_marker(repo, wi, self_deploy.RESULT) is False
+    # Idempotent: clearing an already-absent dir is still success (never raises).
+    assert self_deploy.clear_state(repo, wi) is True
--- a/tests/test_deploy_rollback.py
+++ b/tests/test_deploy_rollback.py
@@ -98,3 +98,44 @@ def test_tc10_failed_deploy_rolls_back_to_development(monkeypatch):
    assert stage_engine.set_issue_blocked.called
    assert stage_engine.send_telegram.called
    assert stage_engine.set_issue_done.called is False
+
+
+def test_tc11_re_deploy_after_rollback_not_wedged(monkeypatch):
+    """FAILED deploy -> rollback wipes stale markers so a later Phase B re-initiates.
+
+    Regression for the re-deploy-after-rollback contract (AC-4/AC-10): markers are
+    keyed by the (stable) work_item_id, so without cleanup the STALE `initiated` from
+    the first failed attempt would make Phase B's idempotency-guard a no-op on the
+    retry and wedge the task on `deploy` forever.
+    """
+    repo, wi, branch = "orchestrator", "ORCH-036", "feature/ORCH-036-x"
+    # First (failed) pass left BOTH the idempotency-guard and the verdict behind.
+    self_deploy.write_marker(repo, wi, self_deploy.INITIATED, "123")
+    self_deploy.write_marker(repo, wi, self_deploy.RESULT, "1")
+    monkeypatch.setattr(
+        stage_engine, "QG_CHECKS",
+        {**stage_engine.QG_CHECKS, "check_deploy_status": _fail("Deploy status: FAILED")},
+    )
+    task_id = _make_task("deploy")
+
+    stage_engine.run_deploy_finalizer(
+        {"task_id": task_id, "repo": repo, "id": 1, "agent": "deploy-finalizer"}
+    )
+
+    # Rollback fired AND the stale deploy-state sentinels were wiped.
+    assert _stage(task_id) == "development"
+    assert self_deploy.has_marker(repo, wi, self_deploy.INITIATED) is False
+    assert self_deploy.has_marker(repo, wi, self_deploy.RESULT) is False
+    assert self_deploy.read_result(repo, wi) == (False, None)
+
+    # Second pass: the task reaches `deploy` again and the human re-approves. Phase B
+    # must ACTUALLY initiate (no stale `initiated` -> not a no-op), proving the retry
+    # is no longer wedged.
+    init = MagicMock(return_value=(True, "ok"))
+    monkeypatch.setattr(stage_engine.self_deploy, "initiate_deploy", init)
+    result = stage_engine.AdvanceResult(from_stage="deploy")
+    stage_engine._handle_self_deploy_phase_b(task_id, repo, wi, branch, result)
+
+    assert init.called
+    assert result.note == "self-deploy-initiated"
+    assert self_deploy.has_marker(repo, wi, self_deploy.INITIATED) is True