feat(deploy): Saturn-side dispatcher wrapper

Replaces the per-node staged framework with a single entry point that
runs from SATURN: preflight (branch/clean-tree/push/SSH), gate (pytest +
docker build per service), execute (deploy-control-plane.sh --ssh or remote
deploy-node.sh), verify (docker ps), and one-line report.

Exit codes: 0=ok 1=preflight 2=gate 3=execute 4=verify 5=sudo-handoff.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Oskar Kapala 2026-06-03 16:06:36 +02:00
parent 00fc36df3a
commit db592fbc28

View file

@ -1,270 +1,321 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# deploy.sh - Staged deployment framework for homelab nodes. # scripts/deploy/deploy.sh — Saturn-side deploy dispatcher
# Usage: deploy.sh <target> [--dry-run] [--no-gate]
# target ∈ {control-plane, vps, piha, solaria, chelsty-infra}
# Exit codes: 0=ok 1=preflight 2=gate 3=execute 4=verify 5=handoff(sudo)
set -o pipefail set -uo pipefail
# --- Configuration --- REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
export RUNTIME_PATH="/opt/homelab" SSH_USER="${SSH_USER:-oskar}"
export STATE_DIR="${RUNTIME_PATH}/state/deploy" START_TIME=$(date +%s)
export LOG_DIR="${RUNTIME_PATH}/logs/deploy" TARGET=""
export REPO_PATH="${HOME}/homelab-codex-ws" DRY_RUN=false
export TIMESTAMP=$(date +%Y%m%d_%H%M%S) NO_GATE=false
export LOG_FILE="${LOG_DIR}/deploy_${TIMESTAMP}.log"
# --- Initialization --- usage() {
mkdir -p "$STATE_DIR" "$LOG_DIR" cat >&2 <<'EOF'
Usage: deploy.sh <target> [--dry-run] [--no-gate]
# Redirection for logging Targets:
exec > >(tee -a "$LOG_FILE") 2>&1 control-plane observer/supervisor/executor/operator-ui on VPS
vps all VPS GitOps services
piha PIHA services
solaria SOLARIA compute services
chelsty-infra CHELSTY edge node (LTE, longer SSH timeout)
# --- Load Libraries --- Flags:
LIB_PATH="${REPO_PATH}/scripts/lib" --dry-run run preflight + gate only; stop before deploy
source "${LIB_PATH}/log.sh" --no-gate skip pytest + docker build (emergency only; logged as WARNING)
source "${LIB_PATH}/state.sh"
source "${LIB_PATH}/inventory.sh"
source "${LIB_PATH}/compose.sh"
source "${LIB_PATH}/diagnostics.sh"
# --- CLI Parsing --- Exit codes: 0=ok 1=preflight 2=gate 3=execute 4=verify 5=handoff(sudo)
TARGET_HOST=$(hostname) EOF
TARGET_SERVICE="" exit 1
RESUME=false }
REQUESTED_STAGE=""
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
--host) control-plane|vps|piha|solaria|chelsty-infra)
TARGET_HOST="$2" TARGET="$1"; shift ;;
shift 2 --dry-run)
;; DRY_RUN=true; shift ;;
--service) --no-gate)
TARGET_SERVICE="$2" NO_GATE=true; shift ;;
shift 2 -h|--help)
;; usage ;;
--resume)
RESUME=true
shift
;;
--stage)
REQUESTED_STAGE="$2"
shift 2
;;
*) *)
if [[ "$1" =~ ^(prepare|validate|deploy|verify|diagnose|complete)$ ]]; then echo "Unknown argument: $1" >&2
REQUESTED_STAGE="$1" usage ;;
fi
shift
;;
esac esac
done done
# --- Stages --- [[ -z "$TARGET" ]] && { echo "Error: target is required." >&2; usage; }
stage_prepare() { case "$TARGET" in
local host=$1 control-plane) SSH_HOST="vps" ;;
if is_stage_complete "prepare" && [[ "$RESUME" == "true" ]]; then *) SSH_HOST="$TARGET" ;;
log "INFO" "Skipping PREPARE (already complete)" esac
case "$TARGET" in
chelsty-*) SSH_TIMEOUT=30 ;;
*) SSH_TIMEOUT=5 ;;
esac
# ── PREFLIGHT ────────────────────────────────────────────────────────────────
# Validate the local checkout and the SSH path before any deploy work starts.
# Checks, in order: current branch is master; no unstaged changes; no staged
# changes; origin/master can be fetched; HEAD has no commits missing from
# origin/master; the target host accepts a non-interactive SSH connection.
# Note: untracked files are not detected by the two `git diff` checks.
# Globals:  REPO_ROOT, SSH_USER, SSH_HOST, SSH_TIMEOUT (read)
# Outputs:  "[ok] ..." progress on stdout, "ERROR: ..." on stderr
# Exits:    1 on the first failing check; returns normally when all pass
preflight() {
  echo "=== PREFLIGHT ==="

  local branch
  branch=$(git -C "$REPO_ROOT" rev-parse --abbrev-ref HEAD)
  if [[ "$branch" != "master" ]]; then
    echo "ERROR: On branch '${branch}', not master. Switch to master and push first." >&2
    exit 1
  fi
  echo "[ok] branch: master"

  if ! git -C "$REPO_ROOT" diff --quiet; then
    echo "ERROR: Unstaged changes in working tree. Commit or stash before deploying." >&2
    exit 1
  fi
  if ! git -C "$REPO_ROOT" diff --cached --quiet; then
    echo "ERROR: Staged but uncommitted changes. Commit before deploying." >&2
    exit 1
  fi
  echo "[ok] working tree clean"

  # A failed fetch leaves origin/master stale, which would make the unpushed
  # check below compare against an outdated ref and pass or fail spuriously.
  if ! git -C "$REPO_ROOT" fetch origin master --quiet; then
    echo "ERROR: git fetch origin master failed; cannot verify that HEAD is pushed." >&2
    exit 1
  fi

  # Treat a failing `git log` as an error: its empty output would otherwise
  # be indistinguishable from "nothing unpushed".
  local unpushed
  if ! unpushed=$(git -C "$REPO_ROOT" log origin/master..HEAD --oneline); then
    echo "ERROR: Could not compare HEAD against origin/master." >&2
    exit 1
  fi
  if [[ -n "$unpushed" ]]; then
    echo "ERROR: Unpushed commits on master:" >&2
    echo "$unpushed" >&2
    echo "Push first: git push origin master" >&2
    exit 1
  fi
  echo "[ok] no unpushed commits"

  echo "Checking SSH: ${SSH_USER}@${SSH_HOST} (ConnectTimeout=${SSH_TIMEOUT}s)..."
  # BatchMode=yes makes ssh fail instead of prompting; ssh's own diagnostics
  # are discarded because the summary line below is what gets reported.
  if ! ssh -o "ConnectTimeout=${SSH_TIMEOUT}" -o BatchMode=yes \
    "${SSH_USER}@${SSH_HOST}" true 2>/dev/null; then
    echo "ERROR: Cannot reach ${SSH_HOST} via SSH (timeout ${SSH_TIMEOUT}s)." >&2
    exit 1
  fi
  echo "[ok] ${SSH_HOST} reachable"
}
# ── GATE ─────────────────────────────────────────────────────────────────────
gate() {
if [[ "$NO_GATE" == "true" ]]; then
echo "=== GATE: SKIPPED ==="
echo "WARNING: --no-gate active — pytest + docker build bypassed (emergency mode)." >&2
return 0 return 0
fi fi
log "INFO" "Stage: PREPARE ($host)" echo "=== GATE ==="
set_stage "prepare"
emit_event "deployment_started" "info" "deploy.sh" "all" "${TIMESTAMP}" "{\"stage\": \"prepare\"}" local services=()
cd "$REPO_PATH" || exit 1 if [[ "$TARGET" == "control-plane" ]]; then
log "INFO" "Pulling latest changes..." services=("control-plane")
if ! git pull; then
log "WARN" "Git pull failed, proceeding with local state (offline mode or network flap)"
fi
# Ensure runtime directories exist
mkdir -p "${RUNTIME_PATH}/config" "${RUNTIME_PATH}/data" "${RUNTIME_PATH}/state" "${RUNTIME_PATH}/logs"
struct_log "prepare" "$host" "all" "success" "repo_updated"
mark_stage_complete "prepare"
}
# VALIDATE stage: confirm every entry of SERVICES has a directory under
# ${REPO_PATH}/services before anything is deployed.
# Arguments: $1 - host name used in log/struct_log output
# Globals:   RESUME, SERVICES, REPO_PATH (read); service (loop variable, global)
# Returns:   0 when skipped on resume; 1 on the first missing service
#            directory; otherwise the status of mark_stage_complete
stage_validate() {
  local host="$1"

  # On resume, a stage already recorded as complete is not repeated.
  if is_stage_complete "validate" && [[ "$RESUME" == "true" ]]; then
    log "INFO" "Skipping VALIDATE (already complete)"
    return 0
  fi

  log "INFO" "Stage: VALIDATE ($host)"
  set_stage "validate"

  for service in "${SERVICES[@]}"; do
    log "INFO" "Validating $service..."
    if [[ -d "${REPO_PATH}/services/$service" ]]; then
      continue
    fi
    # First missing definition aborts the stage.
    log "ERROR" "Service definition not found: $service"
    struct_log "validate" "$host" "$service" "fail" "not_found"
    return 1
  done

  struct_log "validate" "$host" "all" "success" "validated"
  mark_stage_complete "validate"
}
# DEPLOY stage: bring up each entry of SERVICES in order via run_compose_up.
# When resuming, services listed before the saved "last service" marker are
# skipped and deployment restarts at the marker itself.
# Arguments: $1 - host name used in log/struct_log output
# Globals:   RESUME, SERVICES (read)
# Returns:   0 when skipped on resume; 1 when a service fails to come up;
#            otherwise the status of mark_stage_complete
stage_deploy() {
  local host=$1
  if is_stage_complete "deploy" && [[ "$RESUME" == "true" ]]; then
    log "INFO" "Skipping DEPLOY (already complete)"
    return 0
  fi

  log "INFO" "Stage: DEPLOY ($host)"
  set_stage "deploy"

  # Declared separately from the assignment so `local` does not mask the
  # helper's exit status.
  local last_s
  last_s=$(get_last_service)
  local skip=false
  local service

  if [[ "$RESUME" == "true" && -n "$last_s" ]]; then
    # Only skip ahead when the marker is still part of the service list.
    # If it is not, skipping would run off the end of the list and mark the
    # stage complete without deploying anything.
    for service in "${SERVICES[@]}"; do
      if [[ "$service" == "$last_s" ]]; then
        skip=true
        break
      fi
    done
    if [[ "$skip" != "true" ]]; then
      log "WARN" "Resume marker '$last_s' not found in service list; deploying all services"
    fi
  fi

  for service in "${SERVICES[@]}"; do
    if [[ "$skip" == "true" ]]; then
      if [[ "$service" == "$last_s" ]]; then
        skip=false
        log "INFO" "Resuming from $service..."
      else
        log "INFO" "Skipping $service (already processed)"
        continue
      fi
    fi

    log "INFO" "Deploying $service..."
    # Record progress before the attempt so a failure resumes at this service.
    set_last_service "$service"
    if ! run_compose_up "$service"; then
      struct_log "deploy" "$host" "$service" "fail" "docker_compose_failed"
      collect_diagnostics "$host" "$service"
      return 1
    fi
    struct_log "deploy" "$host" "$service" "success" "deployed"
  done

  set_last_service ""
  mark_stage_complete "deploy"
}
# VERIFY stage: check each entry of SERVICES after deployment. A service
# with services/<name>/healthcheck.sh is verified by running that script;
# any other service is verified by looking for its name in the output of
# `docker ps` filtered to running containers.
# Arguments: $1 - host name used in log/struct_log output
# Globals:   RESUME, SERVICES, REPO_PATH (read); service (loop variable, global)
# Returns:   0 when skipped on resume; 1 on the first failing service;
#            otherwise the status of mark_stage_complete
stage_verify() {
  local host="$1"

  if is_stage_complete "verify" && [[ "$RESUME" == "true" ]]; then
    log "INFO" "Skipping VERIFY (already complete)"
    return 0
  fi

  log "INFO" "Stage: VERIFY ($host)"
  set_stage "verify"

  local health_script failure_msg failure_code
  for service in "${SERVICES[@]}"; do
    log "INFO" "Verifying $service..."
    health_script="${REPO_PATH}/services/${service}/healthcheck.sh"
    failure_msg=""
    failure_code=""

    if [[ -f "$health_script" ]]; then
      # A dedicated healthcheck takes precedence over the generic check.
      if ! bash "$health_script"; then
        failure_msg="Healthcheck failed for $service"
        failure_code="healthcheck_failed"
      fi
    elif ! docker ps --filter "name=$service" --filter "status=running" | grep -q "$service"; then
      # Generic check: the container must show up among running containers.
      failure_msg="Container $service is not running"
      failure_code="container_not_running"
    fi

    # Both failure kinds share the same reporting and diagnostics path.
    if [[ -n "$failure_code" ]]; then
      log "ERROR" "$failure_msg"
      struct_log "verify" "$host" "$service" "fail" "$failure_code"
      collect_diagnostics "$host" "$service"
      return 1
    fi

    struct_log "verify" "$host" "$service" "success" "verified"
  done

  mark_stage_complete "verify"
}
# COMPLETE stage: log the end of the run for the given host, record the
# "complete" stage, emit a success entry, then clear the deployment state.
# Arguments: $1 - host name used in log/struct_log output
# Returns:   the status of clear_deployment_state (last command)
stage_complete() {
local host=$1
log "INFO" "Stage: COMPLETE ($host)"
set_stage "complete"
struct_log "complete" "$host" "all" "success" "deployment_finished"
# Helper defined outside this view; presumably resets the saved stage and
# progress markers so the next run starts fresh — confirm in the state library.
clear_deployment_state
}
# --- Execution Logic ---
# Run the deployment stages in their fixed order (prepare, validate, deploy,
# verify, complete), beginning at the requested stage and continuing through
# to the end. Each stage function receives TARGET_HOST.
# Arguments: $1 - name of the stage to start from
# Globals:   TARGET_HOST (read)
# Returns:   0 when every stage from the start point succeeds; 1 when the
#            stage name is not recognised or any stage fails
run_deployment() {
  local start_stage=$1
  local -a stage_order=(prepare validate deploy verify complete)
  local stage_idx
  local first_idx=-1

  # Locate the entry point in the ordered stage list.
  for stage_idx in "${!stage_order[@]}"; do
    if [[ "${stage_order[stage_idx]}" == "$start_stage" ]]; then
      first_idx=$stage_idx
      break
    fi
  done

  if (( first_idx < 0 )); then
    log "ERROR" "Invalid stage: $start_stage"
    return 1
  fi

  # Execute the entry stage and every later one; stop at the first failure.
  for (( stage_idx = first_idx; stage_idx < ${#stage_order[@]}; stage_idx++ )); do
    "stage_${stage_order[stage_idx]}" "$TARGET_HOST" || return 1
  done
  return 0
}
# --- Main ---
log "INFO" "--- Homelab Deployment Started (Host: $TARGET_HOST, Service: ${TARGET_SERVICE:-all}) ---"
if ! load_inventory "$TARGET_HOST" "$TARGET_SERVICE"; then
log "ERROR" "Failed to load inventory"
exit 1
fi
EXIT_STATUS=0
if [[ "$RESUME" == "true" ]]; then
CURRENT=$(get_stage)
log "INFO" "Resuming from state: $CURRENT"
case "$CURRENT" in
prepare|validate|deploy|verify)
run_deployment "$CURRENT" || EXIT_STATUS=1
;;
complete|none)
log "INFO" "No interrupted deployment found. Starting from scratch..."
run_deployment "prepare" || EXIT_STATUS=1
;;
*)
log "INFO" "Unknown state. Starting from prepare..."
run_deployment "prepare" || EXIT_STATUS=1
;;
esac
elif [[ -n "$REQUESTED_STAGE" ]]; then
if [[ "$REQUESTED_STAGE" == "diagnose" ]]; then
collect_diagnostics "$TARGET_HOST" "$TARGET_SERVICE"
else else
run_deployment "$REQUESTED_STAGE" || EXIT_STATUS=1 local svc_yaml="${REPO_ROOT}/hosts/${TARGET}/services.yaml"
if [[ ! -f "$svc_yaml" ]]; then
echo "ERROR: ${svc_yaml} not found." >&2
exit 2
fi
local svc_list
svc_list=$(python3 -c "
import yaml
with open('${svc_yaml}') as f:
data = yaml.safe_load(f)
svcs = data.get('services', {})
if isinstance(svcs, dict):
print('\n'.join(svcs.keys()))
elif isinstance(svcs, list):
print('\n'.join(svcs))
")
while IFS= read -r svc; do
[[ -z "$svc" ]] && continue
if [[ -f "${REPO_ROOT}/services/${svc}/Dockerfile" ]]; then
services+=("$svc")
fi
done <<< "$svc_list"
fi fi
else
# New deployment - clear previous state if [[ ${#services[@]} -eq 0 ]]; then
clear_deployment_state echo "[info] No services with local Dockerfile found for ${TARGET} — gate trivially passes."
run_deployment "prepare" || EXIT_STATUS=1 return 0
fi
echo "Services under gate: ${services[*]}"
local gate_failed=false
for svc in "${services[@]}"; do
local svc_dir="${REPO_ROOT}/services/${svc}"
if [[ -d "${svc_dir}/tests" ]]; then
echo "--- pytest: ${svc} ---"
if ! python3 -m pytest "${svc_dir}/tests" -q; then
echo "GATE FAIL: pytest failed for ${svc}" >&2
gate_failed=true
fi
fi
echo "--- docker build: ${svc} ---"
if ! docker build --quiet "${svc_dir}" >/dev/null; then
echo "GATE FAIL: docker build failed for ${svc}" >&2
gate_failed=true
fi
done
if [[ "$gate_failed" == "true" ]]; then
exit 2
fi
echo "[ok] gate passed"
}
# ── EXECUTE ──────────────────────────────────────────────────────────────────
# Run the actual deployment for TARGET and classify the result.
# control-plane runs the local deploy-control-plane.sh with --ssh; every
# other target runs `git pull` and deploy-node.sh on the remote host via ssh.
# The combined stdout/stderr of the command is captured, echoed, and scanned
# for signs that sudo wanted a password.
# Globals:  TARGET, REPO_ROOT, SSH_USER, SSH_HOST, SSH_TIMEOUT (read)
# Outputs:  command output and progress on stdout, errors on stderr
# Exits:    5 when the deploy needed an interactive sudo password,
#           3 when the deploy command failed for any other reason;
#           returns normally on success
execute() {
  echo "=== EXECUTE ==="

  local cmd_output
  local cmd_exit=0

  if [[ "$TARGET" == "control-plane" ]]; then
    echo "Running deploy-control-plane.sh --ssh..."
    cmd_output=$("${REPO_ROOT}/scripts/deploy/deploy-control-plane.sh" --ssh 2>&1) \
      || cmd_exit=$?
  else
    echo "SSHing to ${SSH_HOST}: git pull + deploy-node.sh..."
    cmd_output=$(ssh -o "ConnectTimeout=${SSH_TIMEOUT}" -o BatchMode=yes \
      "${SSH_USER}@${SSH_HOST}" \
      'cd ~/homelab-codex-ws && git pull && ./scripts/deploy/deploy-node.sh' 2>&1) \
      || cmd_exit=$?
  fi

  printf '%s\n' "$cmd_output"

  # Match inside the shell instead of `echo ... | grep -q`: under pipefail the
  # pipeline reports failure when grep exits early on large output and the
  # writer gets SIGPIPE, which would hide the sudo prompt.
  # The first marker is sudo's password prompt; the other two are the
  # refusals sudo prints when it cannot prompt (no TTY, or `sudo -n`).
  local sudo_blocked=false
  local marker
  for marker in \
    "[sudo] password" \
    "sudo: a terminal is required" \
    "sudo: a password is required"; do
    if [[ "$cmd_output" == *"$marker"* ]]; then
      sudo_blocked=true
      break
    fi
  done

  if [[ "$sudo_blocked" == "true" ]]; then
    echo "" >&2
    echo "ERROR (exit 5): Deploy hit an interactive sudo prompt." >&2
    echo "Run manually:" >&2
    if [[ "$TARGET" == "control-plane" ]]; then
      echo "  ssh -t ${SSH_USER}@${SSH_HOST} 'cd ~/homelab-codex-ws && git pull origin master && cd services/control-plane && bash deploy-local.sh'" >&2
    else
      echo "  ssh -t ${SSH_USER}@${SSH_HOST} 'cd ~/homelab-codex-ws && git pull && ./scripts/deploy/deploy-node.sh'" >&2
    fi
    exit 5
  fi

  if [[ $cmd_exit -ne 0 ]]; then
    echo "ERROR: Deploy command exited ${cmd_exit}." >&2
    exit 3
  fi

  echo "[ok] execute completed"
}
# ── VERIFY ───────────────────────────────────────────────────────────────────
# Post-deploy check: list running containers on the target with `docker ps`
# (name and status, tab-separated) and fail if any listed container is not
# "Up", is flagged "(unhealthy)", or - for the control-plane target - one of
# the four control-plane components is missing from the container names.
# Globals:  TARGET, SSH_USER, SSH_HOST, SSH_TIMEOUT (read)
# Outputs:  the docker ps listing on stdout, errors on stderr
# Exits:    4 when docker ps cannot be run or any check fails;
#           returns normally when everything listed is healthy
verify() {
  echo "=== VERIFY ==="

  local ps_output
  local ps_exit=0
  # Only stdout is captured. Merging stderr into the listing would make
  # unrelated ssh diagnostics (host-key warnings, banners) parse as container
  # rows without an "Up" status and fail verification spuriously; ssh's
  # stderr goes straight to the caller's stderr instead.
  ps_output=$(ssh -o "ConnectTimeout=${SSH_TIMEOUT}" -o BatchMode=yes \
    "${SSH_USER}@${SSH_HOST}" \
    'docker ps --format "{{.Names}}\t{{.Status}}"') \
    || ps_exit=$?

  if [[ $ps_exit -ne 0 ]]; then
    echo "ERROR: docker ps failed on ${SSH_HOST}:" >&2
    if [[ -n "$ps_output" ]]; then
      echo "$ps_output" >&2
    fi
    exit 4
  fi

  echo "$ps_output"

  local failed=false
  local names="" not_up="" unhealthy=""
  local name status

  # Split each row into name and status once, then classify it.
  while IFS=$'\t' read -r name status; do
    [[ -z "$name" ]] && continue
    names+="${name}"$'\n'
    if [[ "$status" != Up* ]]; then
      not_up+="${name}"$'\t'"${status}"$'\n'
    fi
    if [[ "$status" == *"(unhealthy)"* ]]; then
      unhealthy+="${name}"$'\t'"${status}"$'\n'
    fi
  done <<< "$ps_output"

  if [[ -n "$not_up" ]]; then
    echo "ERROR: Containers not in Up state:" >&2
    printf '%s' "$not_up" >&2
    failed=true
  fi

  if [[ -n "$unhealthy" ]]; then
    echo "ERROR: Unhealthy containers:" >&2
    printf '%s' "$unhealthy" >&2
    failed=true
  fi

  if [[ "$TARGET" == "control-plane" ]]; then
    local cp_svc
    for cp_svc in supervisor observer executor operator-ui; do
      # Substring match against container names only, so status text can
      # never satisfy the presence check.
      if [[ "$names" != *"$cp_svc"* ]]; then
        echo "ERROR: control-plane component absent from docker ps: ${cp_svc}" >&2
        failed=true
      fi
    done
  fi

  if [[ "$failed" == "true" ]]; then
    echo "" >&2
    echo "Full docker ps output above." >&2
    exit 4
  fi

  echo "[ok] all containers healthy"
}
# ── REPORT ───────────────────────────────────────────────────────────────────
# Print the one-line summary that ends a successful run: a blank line, then
# outcome, target, short commit hash of the local HEAD, gate status, verify
# status and elapsed seconds since START_TIME.
# Arguments: $1 - "dry-run" for a dry-run summary; anything else (default
#                 "deploy") for a full-deploy summary
# Globals:   START_TIME, REPO_ROOT, NO_GATE, TARGET (read)
# Outputs:   the summary on stdout
report() {
  local mode="${1:-deploy}"

  local now elapsed commit_hash
  now=$(date +%s)
  elapsed=$(( now - START_TIME ))
  commit_hash=$(git -C "$REPO_ROOT" rev-parse --short HEAD)

  # The gate is reported as skipped only when --no-gate was given.
  local gate_state="ok"
  if [[ "$NO_GATE" == "true" ]]; then
    gate_state="skip"
  fi

  # A dry run stops before execute/verify, so verify is reported as skipped.
  local headline verify_state
  case "$mode" in
    dry-run)
      headline="DRY RUN OK"
      verify_state="skip(dry-run)"
      ;;
    *)
      headline="DEPLOY OK"
      verify_state="green"
      ;;
  esac

  echo ""
  echo "${headline} | target=${TARGET} | commit=${commit_hash} | gate=${gate_state} | verify=${verify_state} | ${elapsed}s"
}
# ── MAIN ─────────────────────────────────────────────────────────────────────
preflight
gate
if [[ "$DRY_RUN" == "true" ]]; then
report dry-run
exit 0
fi fi
if [[ $EXIT_STATUS -eq 0 ]]; then execute
print_summary "$TARGET_HOST" "SUCCESS" verify
log "INFO" "--- Homelab Deployment Finished Successfully ---" report
else
print_summary "$TARGET_HOST" "FAILED"
log "ERROR" "--- Homelab Deployment Failed ---"
exit 1
fi