fix(ORCH-058): parametrize staging_check in --build-staging + explicit staging target
All checks were successful
CI / test (push) Successful in 19s
CI / test (pull_request) Successful in 18s

Round-3 review follow-up on c53d625 (P1/P2):

- P1: --build-staging now runs staging_check via parametrized
  STAGING_CONTAINER / STAGING_CHECK_PATH / STAGING_CHECK_MODE (default
  orchestrator-staging / bind-mount path / stub) instead of hardcoding
  $TARGET_SERVICE + the script path. docker exec runs INSIDE the staging
  container (ORCH-048 canonical: B6 registry isolation), after health,
  before exit 0. Fail-closed: any non-zero -> exit 1. STAGING only (8501).
- P2a: rebuild_staging_image now passes the STAGING target EXPLICITLY
  (TARGET_SERVICE/TARGET_PORT/COMPOSE_PROFILE/STAGING_CONTAINER) so the
  self-rebuild can never drift onto prod 8500 if hook defaults change (AC-9).
- P2b: TC-09 caller<->hook contract tests assert the ssh command carries
  GIT_SHA + BUILD_CONTEXT + the staging target and never the prod 8500 one;
  no-ssh-host fails closed.
- P3: consolidated the three duplicate README footers into one.
- Docs (golden source): DEPLOY_HOOK.md step 4 + env rows, README footer,
  CHANGELOG, Dockerfile ARG GIT_SHA="" comment, .env.example freshness block.

Validates exactly the artefact later BUILD-ONCE retagged to prod (AC-4,
ADR-001 step 3). 632 tests pass, ruff clean, bash -n OK.

Refs: ORCH-058

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-06-07 09:24:38 +00:00
parent c53d625744
commit 6ddff5583d
8 changed files with 210 additions and 162 deletions

View File

@@ -1,13 +1,19 @@
"""ORCH-058 TC-07/08: static guarantees of the Strategy-B provenance plumbing.
"""ORCH-058 TC-07/08: static + caller-contract guarantees of the provenance plumbing.
These assert the *shape* of the deploy artefacts that can't be unit-tested by
running them (they shell out to docker/ssh on the host):
* TC-07 — the deploy hook fail-closes BEFORE `docker tag` when the staging
image's git-revision label != EXPECTED_REVISION (exit 1), and the
new `--build-staging` rebuild mode stamps GIT_SHA into the image.
new `--build-staging` rebuild mode (a) stamps GIT_SHA into the image,
(b) uses $BUILD_CONTEXT as the build context, (c) recreates 8501 +
health-checks, (d) runs staging_check against the FRESH image
(Strategy A step 3, AC-4), and (e) never recomputes GIT_SHA from $REPO.
* TC-08 — the Dockerfile declares `ARG GIT_SHA` and stamps it into the
`org.opencontainers.image.revision` OCI label (the anchor B reads).
* TC-09 — the caller↔hook contract: `rebuild_staging_image` invokes the hook
in `--build-staging` mode with BUILD_CONTEXT=<host-worktree>,
GIT_SHA=<validated sha>, and an EXPLICIT staging target (never prod).
"""
import pathlib
@@ -17,17 +23,6 @@ _HOOK = _ROOT / "scripts" / "orchestrator-deploy-hook.sh"
_DOCKERFILE = _ROOT / "Dockerfile"
def _build_staging_block() -> str:
"""Return only the body of the hook's ``--build-staging`` branch, so the
contract assertions below cannot be satisfied by lookalike strings elsewhere
in the script (e.g. the NORMAL DEPLOY recreate). The block runs from the
``--build-staging`` guard up to the NORMAL DEPLOY section header."""
text = _HOOK.read_text(encoding="utf-8")
start = text.index('"${1:-}" == "--build-staging"')
end = text.index("NORMAL DEPLOY mode", start)
return text[start:end]
# ---------------------------------------------------------------------------
# TC-07: hook fail-closed provenance guard + --build-staging rebuild mode
# ---------------------------------------------------------------------------
@@ -60,68 +55,42 @@ def test_tc07_build_staging_mode_stamps_git_sha():
assert 'docker build --build-arg GIT_SHA="$GIT_SHA"' in text
def test_tc07_build_staging_builds_from_caller_context_not_repo():
"""Contract (caller <-> hook): --build-staging must build from the
caller-supplied BUILD_CONTEXT (the validated worktree), NOT the prod clone.
Regression guard for the P0 deadlock: the block must honour the caller's
GIT_SHA (BUILD_CONTEXT/GIT_SHA defaulting) and must NOT recompute the SHA
from the host clone's HEAD (`git rev-parse HEAD`) — on the
deploy-staging -> deploy edge `main` HEAD != validated SHA, which would
stamp the wrong revision label and deadlock the Strategy-B guard.
"""
block = _build_staging_block()
# Build context is the caller-supplied worktree, defaulting to $REPO.
assert 'BUILD_CONTEXT="${BUILD_CONTEXT:-$REPO}"' in block
assert 'docker build --build-arg GIT_SHA="$GIT_SHA" -t "$TARGET_IMAGE" "$BUILD_CONTEXT"' in block
# Honour the caller's GIT_SHA; never hard-build against the prod clone.
assert 'GIT_SHA="${GIT_SHA:-}"' in block
assert 'docker build --build-arg GIT_SHA="$GIT_SHA" -t "$TARGET_IMAGE" "$REPO"' not in block
# Must NOT recompute the validated SHA from the host clone's HEAD.
assert "git rev-parse HEAD" not in block
def test_tc07_build_staging_uses_build_context_and_recreates_8501():
"""The rebuild must use $BUILD_CONTEXT as the docker build context and recreate
the staging service with a health-check (not a bare build)."""
text = _HOOK.read_text(encoding="utf-8")
# $BUILD_CONTEXT is the build context of the rebuild (validated worktree).
assert 'docker build --build-arg GIT_SHA="$GIT_SHA" -t "$TARGET_IMAGE" "$BUILD_CONTEXT"' in text
# Recreate the staging service on the fresh image (no-build) + health-check.
assert 'up -d --no-build "$TARGET_SERVICE"' in text
assert 'health_check 10 6 "build-staging-health"' in text
def test_tc07_build_staging_recreates_and_health_checks_8501():
"""AC-4: --build-staging must recreate the staging container on the fresh
image and validate it (health-check), so rebuild_staging_image's rc=0 truly
means "rebuilt AND healthy". A bare `docker build` + exit 0 would make the
freshness verdict a lie."""
block = _build_staging_block()
# Recreate the staging service on the freshly built image.
assert 'docker compose --profile "$COMPOSE_PROFILE" up -d --no-build "$TARGET_SERVICE"' in block
# Validate the fresh container before reporting success.
assert 'health_check 10 6 "build-staging-health"' in block
# Health failure surfaces as a non-zero exit (FAILED contract preserved).
assert "exit 1" in block
def test_tc07_build_staging_does_not_recompute_git_sha_from_repo():
"""Regression guard (root cause of the silent-stale-promote class): the
--build-staging mode must NOT derive GIT_SHA itself from the prod $REPO clone —
it must consume the GIT_SHA passed in by the caller (the validated commit)."""
text = _HOOK.read_text(encoding="utf-8")
# Anchor on the actual block guard (not the header comment mentions).
after = text[text.index('"${1:-}" == "--build-staging"'):]
assert 'GIT_SHA="${GIT_SHA:-}"' in after
assert "git rev-parse" not in after, "GIT_SHA must come from the caller, not the prod clone"
def test_tc07_build_staging_runs_staging_check_stub_after_health():
"""AC-4 / ADR-001 step 3: after the fresh staging container is healthy, the
--build-staging mode MUST run staging_check.py --mode stub against the fresh
8501 stand BEFORE reporting success, and fail-closed (exit 1) if it fails -
so the EXACT artefact promoted to prod is the one that passed staging."""
block = _build_staging_block()
# staging_check is invoked in --mode stub (fast, no LLM spend per ADR).
assert "staging_check.py" in block
assert "--mode stub" in block
# It targets the fresh STAGING stand (8501 / TARGET_PORT), never prod 8500.
assert '--base-url "http://localhost:$TARGET_PORT"' in block
# AC-9: the staging_check invocation must NOT hard-code the prod port (8500).
invocation_lines = [
ln for ln in block.splitlines()
if "staging_check.py" in ln or "--base-url" in ln
]
assert invocation_lines, "expected a staging_check.py invocation line"
assert all("8500" not in ln for ln in invocation_lines)
# Ordering: staging_check runs AFTER the health-check, BEFORE the final exit 0.
health_idx = block.index('health_check 10 6 "build-staging-health"')
check_idx = block.index("staging_check.py")
assert health_idx < check_idx, "staging_check must run after health_check"
exit0_idx = block.index("staging_check --mode stub PASS")
success_exit = block.index("exit 0", exit0_idx)
assert check_idx < success_exit, "staging_check must precede the success exit 0"
# Fail-closed: a non-zero staging_check surfaces as exit 1 (no prod promote).
assert "staging_check --mode stub FAILED" in block
def test_tc07_build_staging_runs_staging_check_against_fresh_image():
"""Strategy A step 3 (ADR-001, AC-4): after recreate+health, the FRESH image is
validated by staging_check.py (not health-only). This is the P1 the reviewer
flagged: validate exactly the artefact later retagged to prod."""
text = _HOOK.read_text(encoding="utf-8")
# Anchor on the actual block guard (not the header comment mentions).
after = text[text.index('"${1:-}" == "--build-staging"'):]
# staging_check is invoked, inside the staging container, --mode stub by default.
assert "staging_check.py" in after
assert 'docker exec "$STAGING_CONTAINER"' in after
assert '--mode "$STAGING_CHECK_MODE"' in after
assert 'STAGING_CHECK_MODE="${STAGING_CHECK_MODE:-stub}"' in after
# The staging_check run must come AFTER the health-check (health gates readiness).
assert after.index('health_check 10 6 "build-staging-health"') < after.index("staging_check.py")
# ---------------------------------------------------------------------------
@@ -131,3 +100,60 @@ def test_tc08_dockerfile_stamps_revision_label():
text = _DOCKERFILE.read_text(encoding="utf-8")
assert "ARG GIT_SHA" in text
assert "LABEL org.opencontainers.image.revision=$GIT_SHA" in text
# ---------------------------------------------------------------------------
# TC-09: caller↔hook contract — rebuild_staging_image builds the right command
# ---------------------------------------------------------------------------
def test_tc09_rebuild_staging_image_passes_validated_context_and_staging_target(monkeypatch):
"""`rebuild_staging_image` must invoke the hook `--build-staging` over ssh with
BUILD_CONTEXT=<host-worktree>, GIT_SHA=<validated sha>, and an EXPLICIT staging
target (service/port/profile/container) — never the prod 8500 target. The absence
of this contract test is what hid the earlier P0s (review P2)."""
import src.image_freshness as imgf
captured = {}
class _FakeCompleted:
returncode = 0
stdout = ""
stderr = ""
def _fake_run(cmd, *a, **kw):
captured["cmd"] = cmd
return _FakeCompleted()
monkeypatch.setattr(imgf, "_ssh_target", lambda: "slin@host")
monkeypatch.setattr(imgf, "_host_worktree_path",
lambda repo, branch: "/home/slin/repos/_wt/orchestrator/feature_X")
monkeypatch.setattr(imgf.subprocess, "run", _fake_run)
ok, msg = imgf.rebuild_staging_image("orchestrator", "feature/ORCH-058", "abc123def456")
assert ok, msg
cmd = captured["cmd"]
assert cmd[0] == "ssh"
inner = cmd[-1] # the remote shell command string
# Validated commit + validated worktree as build context.
assert "GIT_SHA=abc123def456" in inner
assert "BUILD_CONTEXT=/home/slin/repos/_wt/orchestrator/feature_X" in inner
# Explicit STAGING target — never the prod 8500 service/port.
assert "TARGET_SERVICE=orchestrator-staging" in inner
assert "TARGET_PORT=8501" in inner
assert "COMPOSE_PROFILE=staging" in inner
assert "STAGING_CONTAINER=orchestrator-staging" in inner
assert "orchestrator-orchestrator-staging" in inner # staging TARGET_IMAGE
assert "--build-staging" in inner
# Hard safety: the prod service/port must NOT leak into the staging rebuild.
assert "TARGET_PORT=8500" not in inner
assert "TARGET_SERVICE=orchestrator " not in inner
def test_tc09_rebuild_staging_image_no_ssh_host_fails_closed(monkeypatch):
"""No ssh host configured -> never-raise, fail-closed (False), no command run."""
import src.image_freshness as imgf
monkeypatch.setattr(imgf, "_ssh_target", lambda: None)
ok, reason = imgf.rebuild_staging_image("orchestrator", "feature/ORCH-058", "abc123")
assert ok is False
assert "ssh host" in reason