feat(staging): add orchestrator deploy hook with health-check and auto-rollback (ORCH-34)
All checks were successful
CI / test (push) Successful in 13s
CI / test (pull_request) Successful in 9s

This commit is contained in:
Dev Agent
2026-06-05 09:26:12 +03:00
parent 93169f16e0
commit a6cbacb62c
2 changed files with 266 additions and 0 deletions

View File

@@ -0,0 +1,176 @@
#!/bin/bash
# Deploy hook for orchestrator
# Supports --deploy (default) and --rollback modes.
# Adds health-check loop + automatic rollback if new deploy is unhealthy.
#
# Parametrised via env vars (defaults are STAGING — never prod):
# TARGET_SERVICE - docker-compose service name (default: orchestrator-staging)
# TARGET_PORT - health check port (default: 8501)
# TARGET_IMAGE - image name for retag (default: orchestrator-orchestrator-staging)
# COMPOSE_PROFILE - docker compose profile (default: staging)
# PREV_IMAGE_FILE - path to prev-image snapshot (default: $REPO/.deploy-prev-image-staging)
# LOG - log file path (default: /var/log/orchestrator/deploy-hook.log)
#
# Usage:
# ./orchestrator-deploy-hook.sh [--deploy] # normal deploy (default)
# ./orchestrator-deploy-hook.sh --rollback # manual rollback
set -euo pipefail
REPO=/home/slin/repos/orchestrator
# ---- Defaults (STAGING — safe) ---------------------------------------------
TARGET_SERVICE="${TARGET_SERVICE:-orchestrator-staging}"
TARGET_PORT="${TARGET_PORT:-8501}"
TARGET_IMAGE="${TARGET_IMAGE:-orchestrator-orchestrator-staging}"
COMPOSE_PROFILE="${COMPOSE_PROFILE:-staging}"
PREV_IMAGE_FILE="${PREV_IMAGE_FILE:-$REPO/.deploy-prev-image-staging}"
# ---- Log setup -------------------------------------------------------------
LOG_DIR=/var/log/orchestrator
if mkdir -p "$LOG_DIR" 2>/dev/null; then
LOG="${LOG:-$LOG_DIR/deploy-hook.log}"
else
LOG="${LOG:-$REPO/deploy-hook.log}"
fi
log() {
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] $*" | tee -a "$LOG"
}
log "Deploy hook called: target=$TARGET_SERVICE port=$TARGET_PORT args=$*"
cd "$REPO"
# ============================================================================
# HEALTH CHECK helper
# Args: max_attempts sleep_sec label
# Returns 0 if healthy within attempts, 1 otherwise
# ============================================================================
health_check() {
local max_attempts="$1"
local sleep_sec="$2"
local label="${3:-health-check}"
local attempt=0
while [[ $attempt -lt $max_attempts ]]; do
attempt=$(( attempt + 1 ))
log "$label: attempt $attempt/$max_attempts - GET http://localhost:$TARGET_PORT/health"
local http_code body
body=$(curl -s --max-time 5 "http://localhost:$TARGET_PORT/health" 2>/dev/null || true)
http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://localhost:$TARGET_PORT/health" 2>/dev/null || echo "000")
if [[ "$http_code" == "200" ]] && echo "$body" | grep -q '"status":"ok"'; then
log "$label: OK (HTTP $http_code, body=$body)"
return 0
fi
log "$label: not ready yet (HTTP $http_code, body=$body)"
if [[ $attempt -lt $max_attempts ]]; then
sleep "$sleep_sec"
fi
done
log "$label: FAILED after $max_attempts attempts"
return 1
}
# ============================================================================
# ROLLBACK helper (also called for auto-rollback after bad deploy)
# ============================================================================
do_rollback() {
log "ROLLBACK: checking $PREV_IMAGE_FILE"
if [[ ! -s "$PREV_IMAGE_FILE" ]]; then
log "ROLLBACK: no previous image recorded - rollback skipped (exit 1)"
return 1
fi
local prev_img
prev_img=$(cat "$PREV_IMAGE_FILE")
if [[ -z "$prev_img" ]]; then
log "ROLLBACK: PREV_IMAGE_FILE is empty - rollback skipped (exit 1)"
return 1
fi
if ! docker image inspect "$prev_img" >/dev/null 2>&1; then
log "ROLLBACK: recorded image '$prev_img' not found locally - rollback skipped (exit 1)"
return 1
fi
log "ROLLBACK: retagging $prev_img -> $TARGET_IMAGE"
docker tag "$prev_img" "$TARGET_IMAGE" >> "$LOG" 2>&1
log "ROLLBACK: restarting $TARGET_SERVICE on previous image"
if [[ -n "$COMPOSE_PROFILE" ]]; then
docker compose --profile "$COMPOSE_PROFILE" up -d --no-build "$TARGET_SERVICE" >> "$LOG" 2>&1
else
docker compose up -d --no-build "$TARGET_SERVICE" >> "$LOG" 2>&1
fi
log "ROLLBACK: container restarted, running post-rollback health check (5x3s)"
if health_check 5 3 "ROLLBACK-health"; then
log "ROLLBACK: service is healthy on previous image ($prev_img)"
return 0
else
log "ROLLBACK: ROLLBACK ALSO FAILED - service still unhealthy after restoring $prev_img"
return 2
fi
}
# ============================================================================
# MANUAL --rollback mode
# ============================================================================
if [[ "${1:-}" == "--rollback" ]]; then
log "Manual ROLLBACK requested"
if do_rollback; then
log "Manual ROLLBACK succeeded"
exit 0
else
log "Manual ROLLBACK failed"
exit 1
fi
fi
# ============================================================================
# NORMAL DEPLOY mode (--deploy or no argument)
# ============================================================================
# 1. Capture currently running image BEFORE restart (best-effort)
PREV_IMG=""
SVC_CID=$(docker compose --profile "$COMPOSE_PROFILE" ps -q "$TARGET_SERVICE" 2>/dev/null || true)
if [[ -n "$SVC_CID" ]]; then
PREV_IMG=$(docker inspect --format '{{.Image}}' "$SVC_CID" 2>/dev/null || true)
fi
if [[ -n "$PREV_IMG" ]]; then
echo "$PREV_IMG" > "$PREV_IMAGE_FILE"
log "Saved previous image: $PREV_IMG -> $PREV_IMAGE_FILE"
else
log "No previous image captured (first deploy or service not running?)"
fi
# 2. Pull latest code
log "git pull origin main"
git pull origin main >> "$LOG" 2>&1
# 3. Restart service
log "Starting $TARGET_SERVICE (profile=$COMPOSE_PROFILE)"
if [[ -n "$COMPOSE_PROFILE" ]]; then
docker compose --profile "$COMPOSE_PROFILE" up -d --no-build "$TARGET_SERVICE" >> "$LOG" 2>&1
else
docker compose up -d --no-build "$TARGET_SERVICE" >> "$LOG" 2>&1
fi
log "$TARGET_SERVICE restarted"
# 4. Health-check loop: 10 attempts x 6 seconds = up to 60s
log "Starting health-check: 10 attempts x 6s (max 60s)"
if health_check 10 6 "deploy-health"; then
log "Deploy SUCCESS: $TARGET_SERVICE healthy on port $TARGET_PORT"
exit 0
fi
# 5. Health failed -> AUTO ROLLBACK
log "deploy FAILED: health not ok after 60s - initiating AUTO ROLLBACK"
rollback_rc=0
do_rollback || rollback_rc=$?
if [[ $rollback_rc -eq 0 ]]; then
log "deploy FAILED, rolled back to previous image successfully - exit 1"
exit 1
elif [[ $rollback_rc -eq 2 ]]; then
log "deploy FAILED, ROLLBACK ALSO FAILED - service may be down - exit 2"
exit 2
else
log "deploy FAILED, rollback skipped (no previous image) - exit 1"
exit 1
fi