fix(stage-engine): address ORCH-114 review — env/docs canon + in-region rollback CAS

Resolves the REQUEST_CHANGES findings on ORCH-114 (durable transition-ownership
lease + expected-stage CAS):

P1 — documentation = golden source:
- .env.example: add ORCH_TRANSITION_LEASE_ENABLED / ORCH_TRANSITION_LEASE_REPOS
  (canon of 100% start keys, ORCH-101), next to the other gate kill-switches.
- CLAUDE.md: add the ORCH-114 passport section (mechanism, invariant, flags,
  ADR links) so a future agent editing advance_stage/reaper/webhooks finds the
  ownership invariant in the first mandatory-read doc (ORCH-078 traceability index).

P2 — should-fix:
- docs/overview/ (system showcase, ORCH-011): add transition_lease to
  tech-data-model.md (helper tables), tech-observability.md (/queue blocks) and
  tech-architecture.md (components).
- ADR-001 D4 alignment: the four side-effectful-edge rollback handlers
  (_handle_merge_gate_rollback / _handle_security_gate / _handle_coverage_gate /
  _handle_image_freshness) now write `development` through the expected-stage CAS
  via a shared _rollback_stage_cas helper (defence against the rollback↔done
  contradiction, BR-6) instead of a bare unconditional update_task_stage. Under the
  held lease the sole owner always wins; a lost race aborts WITHOUT side effects.
  Kill-switch off / out-of-scope repo -> degenerates to the prior write -> 1:1.
- Test isolation: make tests/test_webhooks.py order-independent by pinning the
  proj-1 registry per-test (mirrors test_webhook_dedup.proj_registry); it had only
  passed by relying on import order. Drop the needless module-level ORCH_DB_PATH
  setdefault in test_orch114 (fresh_db already isolates db_path).

New regression tests (TC-11): in-region rollback writes route through CAS;
rollback CAS wins when at expected stage; rollback CAS-lost does NOT clobber `done`;
kill-switch-off rollback degenerates to the unconditional write.

ruff clean (src/stage_engine.py, src/transition_lease.py); full suite 2052 passed.

Refs: ORCH-114
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 18:16:49 +03:00
committed by deployer
parent 4a6b32e61d
commit c4a97a7a28
9 changed files with 202 additions and 8 deletions

View File

@@ -19,7 +19,11 @@ import tempfile
import pytest
os.environ.setdefault("ORCH_DB_PATH", os.path.join(tempfile.gettempdir(), "test_orch114.db"))
# NB: deliberately NO module-level os.environ["ORCH_DB_PATH"] setdefault — pinning the
# process-wide settings.db_path on first import is needless here (the autouse `fresh_db`
# fixture below isolates db_path per-test via monkeypatch). The cross-module settings
# singleton (e.g. ORCH_PROJECTS_JSON) is configured by whichever module imports `src`
# first; test_webhooks now pins its own registry per-test rather than relying on import
# order (ORCH-114 review P2).
# Fallback environment values for the test run. `setdefault` leaves any value
# that is already exported untouched, so a developer's own environment wins.
# NOTE(review): presumably these are read by the `src` settings on first import.
for _env_name, _env_default in (
    ("ORCH_REPOS_DIR", tempfile.gettempdir()),
    ("ORCH_GITEA_TOKEN", "test-token"),
    ("ORCH_PLANE_API_TOKEN", "test-token"),
):
    os.environ.setdefault(_env_name, _env_default)
@@ -525,6 +529,71 @@ def test_tc11_bypass_paths_use_cas_not_unconditional_write():
assert "commit_stage_cas(task_id, current_stage, next_stage" in asrc
def test_tc11_inregion_rollback_writes_use_cas(monkeypatch):
    """Source-level guard for the in-region rollback handlers (ADR-001 D4).

    Inspects the source text of the four side-effectful-edge rollback handlers
    (_handle_merge_gate_rollback / _handle_security_gate /
    _handle_coverage_gate / _handle_image_freshness) and checks that each one:

    * calls ``_rollback_stage_cas(task_id, current_stage, repo, result)``, and
    * contains no bare ``update_task_stage(task_id, "development")`` write.

    Finally it checks that ``_rollback_stage_cas`` itself calls
    ``commit_stage_cas``.  The launcher rollbacks in
    ``_handle_qg_failure_rollbacks`` are intentionally not inspected here
    (they are out of scope for this check; reportedly no lease is held there).
    """
    cas_call = "_rollback_stage_cas(task_id, current_stage, repo, result)"
    bare_write = 'update_task_stage(task_id, "development")'
    handlers = [
        se._handle_merge_gate_rollback,
        se._handle_security_gate,
        se._handle_coverage_gate,
        se._handle_image_freshness,
    ]

    for handler in handlers:
        handler_src = inspect.getsource(handler)
        assert cas_call in handler_src, (
            f"{handler.__name__} must route the rollback write through the CAS helper"
        )
        assert bare_write not in handler_src, (
            f"{handler.__name__} must not do a bare unconditional rollback write"
        )

    # The shared helper is the single place that talks to commit_stage_cas.
    helper_src = inspect.getsource(se._rollback_stage_cas)
    assert "commit_stage_cas(task_id, current_stage" in helper_src
def test_tc11_rollback_cas_wins_when_at_expected_stage(monkeypatch):
    """Mechanism ON and the task is still at the expected stage: the CAS wins.

    The helper must report success (``True``), the stored stage must become
    `development`, and the result must not carry the "rollback-cas-lost" note.
    """
    _enable(monkeypatch)
    expected_stage = "deploy-staging"
    task_id = _make_task(stage=expected_stage)
    outcome = se.AdvanceResult()

    won = se._rollback_stage_cas(task_id, expected_stage, _REPO, outcome)

    assert won is True
    assert _task_stage(task_id) == "development"
    assert outcome.note != "rollback-cas-lost"
def test_tc11_rollback_cas_lost_aborts_without_overwriting_done(monkeypatch):
    """BR-6 / ADR-001 D4: a stale rollback must not clobber a terminal `done`.

    Scenario: the task is created at `deploy-staging`, then a concurrent winner
    is simulated by writing `done` straight into the tasks table.  The rollback
    still passes `deploy-staging` as its expected stage, so the expected-stage
    CAS must lose: the helper returns ``False``, the stored stage stays `done`,
    and the result is tagged with the "rollback-cas-lost" note.
    """
    _enable(monkeypatch)
    tid = _make_task(stage="deploy-staging")

    # Simulate a concurrent winner having advanced the task to terminal `done`.
    conn = get_db()
    try:
        conn.execute("UPDATE tasks SET stage='done' WHERE id=?", (tid,))
        conn.commit()
    finally:
        # Close the handle even when the setup write fails, so a broken setup
        # does not leak an open connection into the tests that run afterwards.
        conn.close()

    result = se.AdvanceResult()
    # The rollback still believes current_stage is deploy-staging (its read-on-entry).
    assert se._rollback_stage_cas(tid, "deploy-staging", _REPO, result) is False
    assert _task_stage(tid) == "done"  # NOT clobbered back to development
    assert result.note == "rollback-cas-lost"
def test_tc11_rollback_cas_killswitch_off_unconditional(monkeypatch):
    """Kill-switch OFF: the helper falls back to the unconditional write (AC-9).

    The task is deliberately created at `done`, which does not match the
    `deploy-staging` stage handed to the helper.  With the mechanism disabled
    the helper must still return ``True`` and the stage must end up as
    `development`, i.e. no expected-stage comparison takes place.
    """
    _disable(monkeypatch)
    # Mismatched on purpose: with the CAS active this write would be rejected.
    task_id = _make_task(stage="done")
    outcome = se.AdvanceResult()

    wrote = se._rollback_stage_cas(task_id, "deploy-staging", _REPO, outcome)

    assert wrote is True
    assert _task_stage(task_id) == "development"
# ===========================================================================
# TC-12 — observability (AC-12)
# ===========================================================================

View File

@@ -25,6 +25,28 @@ os.environ["ORCH_PROJECTS_JSON"] = (
from fastapi.testclient import TestClient
from src.main import app
from src.db import init_db, get_db
from src import projects as projects_mod
@pytest.fixture(autouse=True)
def proj_registry():
    """Force the project registry to proj-1/enduro-trails around every test.

    Before the test runs, the fixture writes the single-project JSON into both
    ``os.environ["ORCH_PROJECTS_JSON"]`` and ``projects_mod.settings.projects_json``
    and calls ``projects_mod.reload_projects()``.  After the test it calls
    ``reload_projects()`` once more.

    Why: the registry is reported to be a process-wide singleton built on the
    first import of `src`, so this module's import-time ORCH_PROJECTS_JSON only
    takes effect when test_webhooks is the first importer.  Pinning it per-test
    makes the module independent of collection order, e.g. for a subset run
    such as `pytest test_orch114… test_webhooks` (mirrors
    test_webhook_dedup.proj_registry; ORCH-114 review P2).
    """
    pinned_json = (
        '[{"plane_project_id": "proj-1", "repo": "enduro-trails", '
        '"work_item_prefix": "ET", "name": "enduro-trails"}]'
    )
    os.environ["ORCH_PROJECTS_JSON"] = pinned_json
    projects_mod.settings.projects_json = pinned_json
    projects_mod.reload_projects()

    yield

    # Rebuild the registry after the test as well; presumably this discards any
    # in-memory registry changes the test made (the pinned settings stay as is).
    projects_mod.reload_projects()
@pytest.fixture(autouse=True)