#!/bin/bash
# Deploy hook for orchestrator
# Supports --deploy (default), --rollback and --build-staging modes.
# Adds health-check loop + automatic rollback if new deploy is unhealthy.
#
# Parametrised via env vars (defaults are STAGING — never prod):
#   TARGET_SERVICE    - docker-compose service name (default: orchestrator-staging)
#   TARGET_PORT       - health check port (default: 8501)
#   TARGET_IMAGE      - image name for retag (default: orchestrator-orchestrator-staging)
#   COMPOSE_PROFILE   - docker compose profile (default: staging)
#   PREV_IMAGE_FILE   - path to prev-image snapshot (default: $REPO/.deploy-prev-image-staging)
#   SOURCE_IMAGE      - build-once source image (default: unset; ORCH-36)
#                       When set, the prevalidated (staging) image is retagged onto
#                       TARGET_IMAGE instead of rebuilding — guarantees prod runs the
#                       exact artefact that passed staging (no `docker build`).
#   EXPECTED_REVISION - expected git SHA of SOURCE_IMAGE (default: unset; ORCH-58)
#                       Strategy-B fail-closed provenance guard: when set, the
#                       SOURCE_IMAGE's org.opencontainers.image.revision label MUST
#                       equal this value before the BUILD-ONCE retag, else exit 1
#                       (a stale image is never promoted). Unset -> no check (legacy).
#   GIT_SHA           - --build-staging build-arg (default: unset; ORCH-58)
#                       Commit stamped into the rebuilt staging image's revision
#                       label. Supplied by the caller (validated commit) — NOT
#                       recomputed from the host clone's HEAD.
#   BUILD_CONTEXT     - --build-staging build context (default: $REPO; ORCH-58)
#                       Host worktree of the validated commit; the staging image is
#                       rebuilt FROM this tree (not the prod clone on main).
#   LOG               - log file path (default: /var/log/orchestrator/deploy-hook.log)
#   REPO              - host clone the hook runs from (default: /home/slin/repos/orchestrator)
#
# Usage:
#   ./orchestrator-deploy-hook.sh [--deploy]        # normal deploy (default)
#   ./orchestrator-deploy-hook.sh --rollback        # manual rollback
#   ./orchestrator-deploy-hook.sh --build-staging   # ORCH-58: rebuild staging image (8501)

set -euo pipefail

# Host clone of the repository. Overridable via the environment like every
# other setting of this hook; the default is the original fixed location.
REPO="${REPO:-/home/slin/repos/orchestrator}"

# ---- Defaults (STAGING — safe) ---------------------------------------------
# NOTE: `:-` replaces unset AND empty values, so none of these can end up
# empty; in particular an explicitly empty COMPOSE_PROFILE falls back to
# "staging".
TARGET_SERVICE="${TARGET_SERVICE:-orchestrator-staging}"
TARGET_PORT="${TARGET_PORT:-8501}"
TARGET_IMAGE="${TARGET_IMAGE:-orchestrator-orchestrator-staging}"
COMPOSE_PROFILE="${COMPOSE_PROFILE:-staging}"
PREV_IMAGE_FILE="${PREV_IMAGE_FILE:-$REPO/.deploy-prev-image-staging}"

# Build-once (ORCH-36): optional prevalidated source image to retag onto
# TARGET_IMAGE. Unset -> backward-compatible (no retag), exit-code contract intact.
SOURCE_IMAGE="${SOURCE_IMAGE:-}"

# Provenance guard (ORCH-58 Strategy-B): the OCI revision label the hook
# inspects on SOURCE_IMAGE, and the git revision it MUST match before retag
# onto prod. EXPECTED_REVISION unset -> backward-compatible (guard skipped).
REVISION_LABEL="org.opencontainers.image.revision"
EXPECTED_REVISION="${EXPECTED_REVISION:-}"

# ---- Log setup -------------------------------------------------------------
# Prefer /var/log/orchestrator and fall back to the repo directory when the
# default log file cannot be used there. `mkdir -p` succeeds on an existing
# directory even when the current user cannot write into it, so writability is
# checked explicitly. An explicit LOG from the environment always wins.
LOG_DIR=/var/log/orchestrator
_default_log="$LOG_DIR/deploy-hook.log"
if mkdir -p "$LOG_DIR" 2>/dev/null \
  && { [[ -w "$_default_log" ]] || [[ ! -e "$_default_log" && -w "$LOG_DIR" ]]; }; then
  LOG="${LOG:-$_default_log}"
else
  LOG="${LOG:-$REPO/deploy-hook.log}"
fi

# log MESSAGE...
# Writes one UTC-timestamped line to stdout and appends it to $LOG.
log() {
  printf '[%s] %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*" | tee -a "$LOG"
}

log "Deploy hook called: target=$TARGET_SERVICE port=$TARGET_PORT args=$*"

cd "$REPO"

# ============================================================================
# HEALTH CHECK helper
# Args: max_attempts sleep_sec label
# Returns 0 if healthy within attempts, 1 otherwise
#
# "Healthy" means: HTTP 200 AND the body contains "status":"ok".
# Body and status code are taken from ONE request per attempt, so both always
# describe the same response.
# ============================================================================
health_check() {
  local max_attempts="$1"
  local sleep_sec="$2"
  local label="${3:-health-check}"
  local url="http://localhost:$TARGET_PORT/health"
  local attempt=0
  local response http_code body

  while (( attempt < max_attempts )); do
    attempt=$(( attempt + 1 ))
    log "$label: attempt $attempt/$max_attempts - GET $url"

    # -w appends "\n<code>" after the body. On a failed transfer curl itself
    # reports the code as 000, so no extra fallback text may be appended here.
    response=$(curl -s -w '\n%{http_code}' --max-time 5 "$url" 2>/dev/null) || true

    http_code="${response##*$'\n'}"
    if [[ "$response" == *$'\n'* ]]; then
      body="${response%$'\n'*}"
    else
      body=""
    fi
    # Drop trailing newlines of the body so log lines stay single-line.
    while [[ "$body" == *$'\n' ]]; do
      body="${body%$'\n'}"
    done
    # Anything that is not a 3-digit code (e.g. curl missing) counts as 000.
    [[ "$http_code" =~ ^[0-9]{3}$ ]] || http_code="000"

    # Pattern match instead of `echo | grep -q`: no pipeline, so pipefail
    # cannot turn an early grep exit (SIGPIPE on echo) into a false negative.
    if [[ "$http_code" == "200" && "$body" == *'"status":"ok"'* ]]; then
      log "$label: OK (HTTP $http_code, body=$body)"
      return 0
    fi

    log "$label: not ready yet (HTTP $http_code, body=$body)"
    if (( attempt < max_attempts )); then
      sleep "$sleep_sec"
    fi
  done

  log "$label: FAILED after $max_attempts attempts"
  return 1
}

# ============================================================================
# ROLLBACK helper (also called for auto-rollback after bad deploy)
#
# Returns:
#   0 - previous image restored and healthy
#   1 - rollback skipped (no usable previous image recorded)
#   2 - rollback attempted but failed (retag/restart error or still unhealthy)
#
# Every caller invokes this inside `if` or `||`, where `set -e` is suppressed
# for the whole function body, so each docker step is checked explicitly.
# ============================================================================
do_rollback() {
  log "ROLLBACK: checking $PREV_IMAGE_FILE"
  if [[ ! -s "$PREV_IMAGE_FILE" ]]; then
    log "ROLLBACK: no previous image recorded - rollback skipped (exit 1)"
    return 1
  fi

  local prev_img
  prev_img=$(<"$PREV_IMAGE_FILE") || prev_img=""
  if [[ -z "$prev_img" ]]; then
    log "ROLLBACK: PREV_IMAGE_FILE is empty - rollback skipped (exit 1)"
    return 1
  fi

  if ! docker image inspect "$prev_img" >/dev/null 2>&1; then
    log "ROLLBACK: recorded image '$prev_img' not found locally - rollback skipped (exit 1)"
    return 1
  fi

  log "ROLLBACK: retagging $prev_img -> $TARGET_IMAGE"
  if ! docker tag "$prev_img" "$TARGET_IMAGE" >> "$LOG" 2>&1; then
    log "ROLLBACK: docker tag $prev_img -> $TARGET_IMAGE failed - rollback failed"
    return 2
  fi

  log "ROLLBACK: restarting $TARGET_SERVICE on previous image"
  local -a compose_cmd=(docker compose)
  if [[ -n "$COMPOSE_PROFILE" ]]; then
    compose_cmd+=(--profile "$COMPOSE_PROFILE")
  fi
  if ! "${compose_cmd[@]}" up -d --no-build "$TARGET_SERVICE" >> "$LOG" 2>&1; then
    log "ROLLBACK: docker compose up failed for $TARGET_SERVICE - rollback failed"
    return 2
  fi

  log "ROLLBACK: container restarted, running post-rollback health check (5x3s)"
  if health_check 5 3 "ROLLBACK-health"; then
    log "ROLLBACK: service is healthy on previous image ($prev_img)"
    return 0
  fi

  log "ROLLBACK: ROLLBACK ALSO FAILED - service still unhealthy after restoring $prev_img"
  return 2
}

# ============================================================================
# MANUAL --rollback mode
# Exits 0 when do_rollback succeeds, 1 for any rollback failure.
# ============================================================================
if [[ "${1:-}" == "--rollback" ]]; then
  log "Manual ROLLBACK requested"
  if do_rollback; then
    log "Manual ROLLBACK succeeded"
    exit 0
  fi
  log "Manual ROLLBACK failed"
  exit 1
fi

# ============================================================================
# --build-staging mode (ORCH-58, Strategy A): rebuild the STAGING image from the
# VALIDATED commit and recreate 8501, so the artefact we validate is the EXACT one
# later BUILD-ONCE retagged to prod (INV-FRESH). Builds/recreates STAGING ONLY
# (8501) — never prod (8500). Same exit-code contract (0 = healthy, !=0 = failed).
#
# Uses the caller-supplied GIT_SHA + BUILD_CONTEXT (the validated worktree) — it
# must NOT recompute HEAD from $REPO (the prod clone on `main`): on the
# deploy-staging -> deploy edge the PR is not yet merged, so `main` HEAD != the
# validated SHA, which would stamp the wrong revision label and deadlock the
# Strategy-B guard on every valid self-deploy.
# ============================================================================

# do_build_staging
# Builds TARGET_IMAGE from BUILD_CONTEXT (GIT_SHA passed as build-arg),
# recreates TARGET_SERVICE on it and health-checks the result.
# Globals:  BUILD_CONTEXT, GIT_SHA (defaulted here), REPO, TARGET_IMAGE,
#           TARGET_SERVICE, TARGET_PORT, COMPOSE_PROFILE, LOG (read)
# Returns:  0 when the service is healthy on the fresh image, 1 on any failure
#           (build, compose up, or health check). Every step is checked
#           explicitly and logged, so a compose failure no longer ends the
#           script without a log entry.
do_build_staging() {
  BUILD_CONTEXT="${BUILD_CONTEXT:-$REPO}"
  GIT_SHA="${GIT_SHA:-}"
  log "BUILD-STAGING: rebuilding $TARGET_IMAGE from $BUILD_CONTEXT (GIT_SHA=$GIT_SHA, service=$TARGET_SERVICE, port=$TARGET_PORT)"

  if ! docker build --build-arg GIT_SHA="$GIT_SHA" -t "$TARGET_IMAGE" "$BUILD_CONTEXT" >> "$LOG" 2>&1; then
    log "BUILD-STAGING: docker build failed - aborting (exit 1)"
    return 1
  fi

  log "BUILD-STAGING: recreating $TARGET_SERVICE (profile=$COMPOSE_PROFILE) on the fresh image"
  # The profile flag is only passed when a profile is configured.
  local -a compose_cmd=(docker compose)
  if [[ -n "$COMPOSE_PROFILE" ]]; then
    compose_cmd+=(--profile "$COMPOSE_PROFILE")
  fi
  if ! "${compose_cmd[@]}" up -d --no-build "$TARGET_SERVICE" >> "$LOG" 2>&1; then
    log "BUILD-STAGING: docker compose up failed for $TARGET_SERVICE - aborting (exit 1)"
    return 1
  fi

  log "BUILD-STAGING: running health-check on port $TARGET_PORT (10x6s)"
  if health_check 10 6 "build-staging-health"; then
    log "BUILD-STAGING: $TARGET_SERVICE healthy on the fresh image (exit 0)"
    return 0
  fi

  log "BUILD-STAGING: health FAILED after rebuild (exit 1)"
  return 1
}

if [[ "${1:-}" == "--build-staging" ]]; then
  if do_build_staging; then
    exit 0
  fi
  exit 1
fi

# ============================================================================
# NORMAL DEPLOY mode (--deploy or no argument)
# ============================================================================

# 1.
# Capture currently running image BEFORE restart (best-effort)

# Compose invocation used by the steps below; the profile flag is added only
# when COMPOSE_PROFILE is non-empty.
compose_cmd=(docker compose)
if [[ -n "$COMPOSE_PROFILE" ]]; then
  compose_cmd+=(--profile "$COMPOSE_PROFILE")
fi

# Failures here are deliberately swallowed: without a running container there
# is simply nothing to snapshot.
PREV_IMG=""
SVC_CID=$("${compose_cmd[@]}" ps -q "$TARGET_SERVICE" 2>/dev/null || true)
if [[ -n "$SVC_CID" ]]; then
  PREV_IMG=$(docker inspect --format '{{.Image}}' "$SVC_CID" 2>/dev/null || true)
fi

if [[ -n "$PREV_IMG" ]]; then
  printf '%s\n' "$PREV_IMG" > "$PREV_IMAGE_FILE"
  log "Saved previous image: $PREV_IMG -> $PREV_IMAGE_FILE"
else
  log "No previous image captured (first deploy or service not running?)"
fi

# 2. Pull latest code (keeps the host working tree current for future builds;
#    the DEPLOYED artefact is the retagged SOURCE_IMAGE below when build-once).
#    A failed pull is logged and reported as exit 1 (nothing has been retagged
#    or restarted at this point).
log "git pull origin main"
if ! git pull origin main >> "$LOG" 2>&1; then
  log "git pull origin main failed - aborting before any image/service change (exit 1)"
  exit 1
fi

# 2b. Build-once (ORCH-36): retag the prevalidated staging image onto TARGET_IMAGE
#     instead of rebuilding, so prod runs the exact artefact that passed staging.
#     Backward compatible: skipped when SOURCE_IMAGE is unset.
if [[ -n "$SOURCE_IMAGE" ]]; then
  if ! docker image inspect "$SOURCE_IMAGE" >/dev/null 2>&1; then
    log "BUILD-ONCE: SOURCE_IMAGE '$SOURCE_IMAGE' not found locally - aborting (exit 1)"
    exit 1
  fi

  # Fail-closed provenance guard: when EXPECTED_REVISION is set, the
  # source image MUST carry the matching git-revision OCI label, else
  # abort BEFORE the prod retag. Empty EXPECTED_REVISION -> guard
  # skipped (ORCH-36 backward-compat).
  if [[ -n "$EXPECTED_REVISION" ]]; then
    IMG_REV=$(docker image inspect --format '{{ index .Config.Labels "'"$REVISION_LABEL"'" }}' "$SOURCE_IMAGE" 2>/dev/null || true)
    # A Go template can render a missing map entry as the literal text
    # "<no value>"; treat that placeholder like an absent label so it is
    # never logged or compared as if it were a revision.
    if [[ "$IMG_REV" == "<no value>" ]]; then
      IMG_REV=""
    fi
    if [[ -z "$IMG_REV" || "$IMG_REV" != "$EXPECTED_REVISION" ]]; then
      log "PROVENANCE: SOURCE_IMAGE revision '$IMG_REV' != expected '$EXPECTED_REVISION' - aborting before retag (exit 1)"
      exit 1
    fi
    log "PROVENANCE: SOURCE_IMAGE revision matches expected ($EXPECTED_REVISION)"
  fi

  log "BUILD-ONCE: retagging $SOURCE_IMAGE -> $TARGET_IMAGE (no rebuild)"
  if ! docker tag "$SOURCE_IMAGE" "$TARGET_IMAGE" >> "$LOG" 2>&1; then
    log "BUILD-ONCE: docker tag $SOURCE_IMAGE -> $TARGET_IMAGE failed - aborting (exit 1)"
    exit 1
  fi
fi

# 3. Restart service
# 4. Health-check loop: 10 attempts x 6 seconds = up to 60s
# A failed `compose up` is handled like a failed health check: it falls
# through to the auto-rollback below rather than ending the script with the
# service in an unknown state.
log "Starting $TARGET_SERVICE (profile=$COMPOSE_PROFILE)"
if "${compose_cmd[@]}" up -d --no-build "$TARGET_SERVICE" >> "$LOG" 2>&1; then
  log "$TARGET_SERVICE restarted"

  log "Starting health-check: 10 attempts x 6s (max 60s)"
  if health_check 10 6 "deploy-health"; then
    log "Deploy SUCCESS: $TARGET_SERVICE healthy on port $TARGET_PORT"
    exit 0
  fi
  log "deploy FAILED: health not ok after 60s - initiating AUTO ROLLBACK"
else
  log "deploy FAILED: docker compose up failed for $TARGET_SERVICE - initiating AUTO ROLLBACK"
fi

# 5. Deploy failed -> AUTO ROLLBACK
# Exit codes: 1 = deploy failed (rolled back, or rollback skipped),
#             2 = deploy failed AND the rollback failed too.
rollback_rc=0
do_rollback || rollback_rc=$?

if [[ $rollback_rc -eq 0 ]]; then
  log "deploy FAILED, rolled back to previous image successfully - exit 1"
  exit 1
elif [[ $rollback_rc -eq 2 ]]; then
  log "deploy FAILED, ROLLBACK ALSO FAILED - service may be down - exit 2"
  exit 2
else
  log "deploy FAILED, rollback skipped (no previous image) - exit 1"
  exit 1
fi