#!/usr/bin/env bash
# scripts/deploy/deploy.sh — Saturn-side deploy dispatcher
#
# Single entry point, run from SATURN, that replaces the per-node staged
# framework. Phases, in order:
#   preflight  branch / clean tree / in sync with origin / SSH reachability
#   gate       pytest + docker build per service (skippable with --no-gate)
#   execute    deploy-control-plane.sh --ssh, or remote git pull + deploy-node.sh
#   verify     docker ps on the target host
#   report     one-line summary
#
# Usage: deploy.sh TARGET [--dry-run] [--no-gate]
#   TARGET ∈ {control-plane, vps, piha, solaria, chelsty-infra}
# Environment: SSH_USER (default: oskar)
# Exit codes: 0=ok 1=preflight 2=gate 3=execute 4=verify 5=handoff(sudo)

# errexit is not enabled: every phase checks command status explicitly and
# exits with the code documented above.
set -uo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" || {
  echo "ERROR: Cannot resolve repository root." >&2
  exit 1
}
readonly REPO_ROOT
SSH_USER="${SSH_USER:-oskar}"
START_TIME=$(date +%s)
TARGET=""
DRY_RUN=false
NO_GATE=false
SSH_HOST=""
SSH_TIMEOUT=5

#######################################
# Print the help text and exit.
# Arguments: $1 - exit status (default 1). Status 0 prints to stdout,
#                 anything else prints to stderr.
#######################################
usage() {
  local status="${1:-1}"
  local fd=2
  if [[ "$status" -eq 0 ]]; then
    fd=1
  fi
  cat >&"$fd" <<'EOF'
Usage: deploy.sh TARGET [--dry-run] [--no-gate]

Targets:
  control-plane   observer/supervisor/executor/operator-ui on VPS
  vps             all VPS GitOps services
  piha            PIHA services
  solaria         SOLARIA compute services
  chelsty-infra   CHELSTY edge node (LTE, longer SSH timeout)

Flags:
  --dry-run   run preflight + gate only; stop before deploy
  --no-gate   skip pytest + docker build (emergency only; logged as WARNING)
  -h, --help  show this help

Exit codes: 0=ok 1=preflight 2=gate 3=execute 4=verify 5=handoff(sudo)
EOF
  exit "$status"
}

#######################################
# Parse the command line and derive the SSH host and connect timeout.
# Globals:   TARGET, DRY_RUN, NO_GATE, SSH_HOST, SSH_TIMEOUT (written)
# Arguments: the script's positional parameters
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      control-plane|vps|piha|solaria|chelsty-infra)
        # A second target used to overwrite the first silently; refuse instead
        # so a typo cannot redirect the deploy to another node.
        if [[ -n "$TARGET" ]]; then
          echo "Error: more than one target given ('${TARGET}' and '$1')." >&2
          usage
        fi
        TARGET="$1"
        ;;
      --dry-run)
        DRY_RUN=true
        ;;
      --no-gate)
        NO_GATE=true
        ;;
      -h|--help)
        usage 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage
        ;;
    esac
    shift
  done

  if [[ -z "$TARGET" ]]; then
    echo "Error: target is required." >&2
    usage
  fi

  # control-plane is reached through the "vps" SSH host; every other target
  # name doubles as its SSH host name.
  case "$TARGET" in
    control-plane) SSH_HOST="vps" ;;
    *)             SSH_HOST="$TARGET" ;;
  esac

  # chelsty-* targets get a longer connect timeout than the rest.
  case "$TARGET" in
    chelsty-*) SSH_TIMEOUT=30 ;;
    *)         SSH_TIMEOUT=5 ;;
  esac
}

# ── PREFLIGHT ────────────────────────────────────────────────────────────────

#######################################
# Refuse to deploy unless the local checkout is master, clean, identical to
# origin/master, and the target host answers over SSH.
# Globals: REPO_ROOT, SSH_USER, SSH_HOST, SSH_TIMEOUT (read)
# Exits:   1 on any failed check
#######################################
preflight() {
  echo "=== PREFLIGHT ==="

  local branch
  if ! branch=$(git -C "$REPO_ROOT" rev-parse --abbrev-ref HEAD); then
    echo "ERROR: Cannot determine the current branch of ${REPO_ROOT}." >&2
    exit 1
  fi
  if [[ "$branch" != "master" ]]; then
    echo "ERROR: On branch '${branch}', not master. Switch to master and push first." >&2
    exit 1
  fi
  echo "[ok] branch: master"

  if ! git -C "$REPO_ROOT" diff --quiet; then
    echo "ERROR: Unstaged changes in working tree. Commit or stash before deploying." >&2
    exit 1
  fi
  if ! git -C "$REPO_ROOT" diff --cached --quiet; then
    echo "ERROR: Staged but uncommitted changes. Commit before deploying." >&2
    exit 1
  fi
  echo "[ok] working tree clean"

  # A failed fetch leaves origin/master stale, which would make the
  # comparisons below meaningless — treat it as a preflight failure.
  if ! git -C "$REPO_ROOT" fetch origin master --quiet; then
    echo "ERROR: 'git fetch origin master' failed; cannot compare with the remote." >&2
    exit 1
  fi

  local unpushed
  if ! unpushed=$(git -C "$REPO_ROOT" log origin/master..HEAD --oneline); then
    echo "ERROR: Cannot compare HEAD with origin/master." >&2
    exit 1
  fi
  if [[ -n "$unpushed" ]]; then
    echo "ERROR: Unpushed commits on master:" >&2
    echo "$unpushed" >&2
    echo "Push first: git push origin master" >&2
    exit 1
  fi
  echo "[ok] no unpushed commits"

  # With nothing unpushed, a differing HEAD means the local checkout is behind.
  # The remote side runs `git pull`, so it would deploy commits that the local
  # gate never tested and that the final report would not name.
  local local_head remote_head
  if ! local_head=$(git -C "$REPO_ROOT" rev-parse HEAD) \
    || ! remote_head=$(git -C "$REPO_ROOT" rev-parse origin/master); then
    echo "ERROR: Cannot resolve HEAD / origin/master." >&2
    exit 1
  fi
  if [[ "$local_head" != "$remote_head" ]]; then
    echo "ERROR: Local master is behind origin/master." >&2
    echo "Pull first: git pull origin master" >&2
    exit 1
  fi
  echo "[ok] in sync with origin/master"

  echo "Checking SSH: ${SSH_USER}@${SSH_HOST} (ConnectTimeout=${SSH_TIMEOUT}s)..."
  # ssh's own diagnostics are dropped here; the message below replaces them.
  if ! ssh -o "ConnectTimeout=${SSH_TIMEOUT}" -o BatchMode=yes \
    "${SSH_USER}@${SSH_HOST}" true 2>/dev/null; then
    echo "ERROR: Cannot reach ${SSH_HOST} via SSH (timeout ${SSH_TIMEOUT}s)." >&2
    exit 1
  fi
  echo "[ok] ${SSH_HOST} reachable"
}

# ── GATE ─────────────────────────────────────────────────────────────────────

#######################################
# Print the service names listed under the top-level `services` key of a
# services.yaml file, one per line. `services` may be a mapping (keys are
# printed) or a list of strings.
# Arguments: $1 - path to services.yaml
# Returns:   non-zero when python3/PyYAML is unavailable or the file cannot
#            be parsed
#######################################
list_services() {
  # The path travels as argv, not spliced into the Python source, so quotes or
  # backslashes in it cannot break (or inject into) the program.
  python3 - "$1" <<'PY'
import sys

import yaml

with open(sys.argv[1]) as f:
    data = yaml.safe_load(f) or {}
svcs = data.get('services', {})
if isinstance(svcs, dict):
    print('\n'.join(str(k) for k in svcs.keys()))
elif isinstance(svcs, list):
    print('\n'.join(svcs))
PY
}

#######################################
# Run pytest (when services/NAME/tests exists) and `docker build` for every
# gated service. For control-plane that is the single "control-plane" service;
# otherwise every service in hosts/TARGET/services.yaml that has a local
# Dockerfile.
# Globals: NO_GATE, TARGET, REPO_ROOT (read)
# Exits:   2 when the service list cannot be read or any check fails
#######################################
gate() {
  if [[ "$NO_GATE" == "true" ]]; then
    echo "=== GATE: SKIPPED ==="
    echo "WARNING: --no-gate active — pytest + docker build bypassed (emergency mode)." >&2
    return 0
  fi

  echo "=== GATE ==="

  local -a services=()
  local svc

  if [[ "$TARGET" == "control-plane" ]]; then
    services=("control-plane")
  else
    local svc_yaml="${REPO_ROOT}/hosts/${TARGET}/services.yaml"
    if [[ ! -f "$svc_yaml" ]]; then
      echo "ERROR: ${svc_yaml} not found." >&2
      exit 2
    fi

    # A parser failure must fail the gate; otherwise it would leave an empty
    # list and the gate would "trivially pass".
    local svc_list
    if ! svc_list=$(list_services "$svc_yaml"); then
      echo "ERROR: Could not read the service list from ${svc_yaml}." >&2
      exit 2
    fi

    while IFS= read -r svc; do
      [[ -z "$svc" ]] && continue
      if [[ -f "${REPO_ROOT}/services/${svc}/Dockerfile" ]]; then
        services+=("$svc")
      fi
    done <<< "$svc_list"
  fi

  if [[ ${#services[@]} -eq 0 ]]; then
    echo "[info] No services with local Dockerfile found for ${TARGET} — gate trivially passes."
    return 0
  fi

  echo "Services under gate: ${services[*]}"
  local gate_failed=false
  local svc_dir

  # Keep going after a failure so one run reports every broken service.
  for svc in "${services[@]}"; do
    svc_dir="${REPO_ROOT}/services/${svc}"

    if [[ -d "${svc_dir}/tests" ]]; then
      echo "--- pytest: ${svc} ---"
      if ! python3 -m pytest "${svc_dir}/tests" -q; then
        echo "GATE FAIL: pytest failed for ${svc}" >&2
        gate_failed=true
      fi
    fi

    echo "--- docker build: ${svc} ---"
    if ! docker build --quiet "${svc_dir}" >/dev/null; then
      echo "GATE FAIL: docker build failed for ${svc}" >&2
      gate_failed=true
    fi
  done

  if [[ "$gate_failed" == "true" ]]; then
    exit 2
  fi
  echo "[ok] gate passed"
}

# ── EXECUTE ──────────────────────────────────────────────────────────────────

#######################################
# Run the deploy: deploy-control-plane.sh --ssh for control-plane, otherwise
# `git pull` + deploy-node.sh on the target over SSH. Output is streamed live
# and also captured so it can be scanned for sudo password diagnostics.
# Globals: TARGET, REPO_ROOT, SSH_USER, SSH_HOST, SSH_TIMEOUT (read)
# Exits:   5 when the deploy needed an interactive sudo password,
#          3 on any other failure
#######################################
execute() {
  echo "=== EXECUTE ==="

  local -a deploy_cmd
  if [[ "$TARGET" == "control-plane" ]]; then
    echo "Running deploy-control-plane.sh --ssh..."
    deploy_cmd=("${REPO_ROOT}/scripts/deploy/deploy-control-plane.sh" --ssh)
  else
    echo "SSHing to ${SSH_HOST}: git pull + deploy-node.sh..."
    deploy_cmd=(
      ssh -o "ConnectTimeout=${SSH_TIMEOUT}" -o BatchMode=yes
      "${SSH_USER}@${SSH_HOST}"
      'cd ~/homelab-codex-ws && git pull && ./scripts/deploy/deploy-node.sh'
    )
  fi

  local log_file
  if ! log_file=$(mktemp); then
    echo "ERROR: mktemp failed." >&2
    exit 3
  fi

  # Stream through tee so a long deploy shows progress while it runs; the
  # status of the deploy command itself is PIPESTATUS[0], not tee's.
  local cmd_exit cmd_output
  "${deploy_cmd[@]}" 2>&1 | tee -- "$log_file"
  cmd_exit=${PIPESTATUS[0]}
  cmd_output=$(<"$log_file")
  rm -f -- "$log_file"

  # "[sudo] password" only shows up when sudo has a terminal. Without one
  # (plain ssh, BatchMode) sudo prints one of the other diagnostics instead.
  local sudo_re='\[sudo\] password|sudo: a terminal is required'
  sudo_re+='|sudo: no tty present|sudo: a password is required'
  if grep -qE -- "$sudo_re" <<< "$cmd_output"; then
    echo "" >&2
    echo "ERROR (exit 5): Deploy hit an interactive sudo prompt." >&2
    echo "Run manually:" >&2
    if [[ "$TARGET" == "control-plane" ]]; then
      echo "  ssh -t ${SSH_USER}@${SSH_HOST} 'cd ~/homelab-codex-ws && git pull origin master && cd services/control-plane && bash deploy-local.sh'" >&2
    else
      echo "  ssh -t ${SSH_USER}@${SSH_HOST} 'cd ~/homelab-codex-ws && git pull && ./scripts/deploy/deploy-node.sh'" >&2
    fi
    exit 5
  fi

  if [[ $cmd_exit -ne 0 ]]; then
    echo "ERROR: Deploy command exited ${cmd_exit}." >&2
    exit 3
  fi

  echo "[ok] execute completed"
}

# ── VERIFY ───────────────────────────────────────────────────────────────────

#######################################
# Inspect `docker ps` on the target: every listed container must be Up and not
# (unhealthy); for control-plane all four components must be listed.
# Globals: TARGET, SSH_USER, SSH_HOST, SSH_TIMEOUT (read)
# Exits:   4 when docker ps fails or any check fails
#######################################
verify() {
  echo "=== VERIFY ==="

  # stderr is left attached to the terminal instead of being captured, so ssh
  # or docker warnings are never parsed as container rows.
  local ps_output
  local ps_exit=0
  ps_output=$(ssh -o "ConnectTimeout=${SSH_TIMEOUT}" -o BatchMode=yes \
    "${SSH_USER}@${SSH_HOST}" \
    'docker ps --format "{{.Names}}\t{{.Status}}"') \
    || ps_exit=$?

  if [[ $ps_exit -ne 0 ]]; then
    echo "ERROR: docker ps failed on ${SSH_HOST}:" >&2
    echo "$ps_output" >&2
    exit 4
  fi

  echo "$ps_output"

  if [[ -z "$ps_output" ]]; then
    echo "WARNING: docker ps lists no containers on ${SSH_HOST}." >&2
  fi

  local failed=false

  # NOTE(review): plain `docker ps` omits exited containers, so this catches
  # states such as "Restarting" but not a container that has already stopped.
  # Consider `docker ps -a` if stopped services must fail verification.
  local not_up
  not_up=$(grep -v -e '^$' -e $'\tUp' <<< "$ps_output" || true)
  if [[ -n "$not_up" ]]; then
    echo "ERROR: Containers not in Up state:" >&2
    echo "$not_up" >&2
    failed=true
  fi

  local unhealthy
  unhealthy=$(grep -F -- '(unhealthy)' <<< "$ps_output" || true)
  if [[ -n "$unhealthy" ]]; then
    echo "ERROR: Unhealthy containers:" >&2
    echo "$unhealthy" >&2
    failed=true
  fi

  if [[ "$TARGET" == "control-plane" ]]; then
    # Match component names against the name column only, never the status text.
    local names cp_svc
    names=$(awk -F'\t' '{ print $1 }' <<< "$ps_output")
    for cp_svc in supervisor observer executor operator-ui; do
      if ! grep -qF -- "$cp_svc" <<< "$names"; then
        echo "ERROR: control-plane component absent from docker ps: ${cp_svc}" >&2
        failed=true
      fi
    done
  fi

  if [[ "$failed" == "true" ]]; then
    echo "" >&2
    echo "Full docker ps output above." >&2
    exit 4
  fi

  echo "[ok] all containers healthy"
}

# ── REPORT ───────────────────────────────────────────────────────────────────

#######################################
# Print the one-line summary.
# Globals:   START_TIME, REPO_ROOT, TARGET, NO_GATE (read)
# Arguments: $1 - "dry-run" for a dry run; anything else (default "deploy")
#                 reports a completed deploy
#######################################
report() {
  local mode="${1:-deploy}"
  local end_time
  end_time=$(date +%s)
  local elapsed=$(( end_time - START_TIME ))
  local commit_hash
  commit_hash=$(git -C "$REPO_ROOT" rev-parse --short HEAD)

  local gate_s="ok"
  if [[ "$NO_GATE" == "true" ]]; then
    gate_s="skip"
  fi

  local headline="DEPLOY OK"
  local verify_s="green"
  if [[ "$mode" == "dry-run" ]]; then
    headline="DRY RUN OK"
    verify_s="skip(dry-run)"
  fi

  echo ""
  echo "${headline} | target=${TARGET} | commit=${commit_hash} | gate=${gate_s} | verify=${verify_s} | ${elapsed}s"
}

# ── MAIN ─────────────────────────────────────────────────────────────────────

#######################################
# Run the phases in order; --dry-run stops after the gate.
# Arguments: the script's positional parameters
#######################################
main() {
  parse_args "$@"

  preflight
  gate

  if [[ "$DRY_RUN" == "true" ]]; then
    report dry-run
    exit 0
  fi

  execute
  verify
  report
}

main "$@"