#!/usr/bin/env bash
# deploy.sh - Staged deployment framework for homelab nodes.
#
# Usage:
#   deploy.sh [--host HOST] [--service SERVICE] [--resume] [--stage STAGE | STAGE]
#
# STAGE is one of: prepare, validate, deploy, verify, diagnose, complete.
# A run starts at the requested stage (default: prepare) and continues through
# every later stage; "diagnose" only collects diagnostics. With --resume the
# start stage is read from ${STATE_DIR}/current_stage instead.
#
# Exit status: 0 on success, non-zero when a stage fails, 2 on a usage error.

set -o pipefail

# --- Configuration ---
RUNTIME_PATH="/opt/homelab"
STATE_DIR="${RUNTIME_PATH}/state/deploy"
LOG_DIR="${RUNTIME_PATH}/logs/deploy"
REPO_PATH="${HOME}/homelab-codex-ws"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="${LOG_DIR}/deploy_${TIMESTAMP}.log"

# --- Initialization ---
if ! mkdir -p "$STATE_DIR" "$LOG_DIR"; then
  printf 'ERROR: cannot create %s or %s\n' "$STATE_DIR" "$LOG_DIR" >&2
  exit 1
fi

# Redirection for logging: mirror stdout and stderr into the per-run log file.
exec > >(tee -a "$LOG_FILE") 2>&1

# --- Helpers ---

# Print a timestamped log line.
# Arguments: $1 - level (INFO, WARN, ERROR, STRUCT); remaining - message words.
log() {
  local level=$1
  shift
  local message=$*
  printf '[%s] [%s] %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$level" "$message"
}

# Structured log for machine reading.
# Arguments: $1 - stage, $2 - host, $3 - service, $4 - result, $5 - info.
struct_log() {
  local stage=$1
  local host=$2
  local service=$3
  local result=$4
  local info=$5
  log "STRUCT" "stage=$stage host=$host service=$service result=$result info=\"$info\""
}

# Persist the name of the stage that is currently running.
set_state() {
  printf '%s\n' "$1" >"${STATE_DIR}/current_stage"
}

# Print the last persisted stage, or "none" when nothing was recorded.
get_state() {
  if [[ -f "${STATE_DIR}/current_stage" ]]; then
    cat "${STATE_DIR}/current_stage"
  else
    echo "none"
  fi
}

# Persist the service being deployed (an empty string clears the marker).
set_last_service() {
  printf '%s\n' "$1" >"${STATE_DIR}/last_service"
}

# Print the persisted service marker, or an empty line when none exists.
get_last_service() {
  if [[ -f "${STATE_DIR}/last_service" ]]; then
    cat "${STATE_DIR}/last_service"
  else
    echo ""
  fi
}

# Print the service names found in a plain-text inventory, one per line.
# Names are whitespace-separated; text after '#' is ignored. Words are split
# with `read -a`, so glob characters in the file are never expanded.
# Arguments: $1 - path to services.txt
list_services_txt() {
  local file=$1
  local line
  local -a words
  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line%%#*}
    line=${line%$'\r'}
    read -r -a words <<<"$line"
    if ((${#words[@]} > 0)); then
      printf '%s\n' "${words[@]}"
    fi
  done <"$file"
}

# Print the keys directly below "services:" in a YAML inventory, one per line.
# The indentation of the first key under "services:" defines the entry level,
# so deeper (nested) keys are ignored, and the scan stops at the first line
# that is indented no deeper than "services:" itself.
# Arguments: $1 - path to services.yaml
list_services_yaml() {
  awk '
    function indent_of(s) { match(s, /^ */); return RLENGTH }
    /^[ \t]*(#.*)?$/ { next }
    !inside && /^ *services:/ { inside = 1; base = indent_of($0); next }
    inside {
      width = indent_of($0)
      if (width <= base) { inside = 0; next }
      if (!entry) entry = width
      key = substr($0, width + 1)
      if (width == entry && key ~ /^[a-z0-9_-]+:/) {
        sub(/:.*$/, "", key)
        print key
      }
    }
  ' "$1"
}

# --- CLI Parsing ---
TARGET_HOST=$(hostname)
TARGET_SERVICE=""
RESUME=false
REQUESTED_STAGE=""

while (($# > 0)); do
  case "$1" in
    --host | --service | --stage)
      # Without this guard `shift 2` fails on a trailing option, nothing is
      # shifted, and the loop never terminates.
      if (($# < 2)); then
        log "ERROR" "Option $1 requires a value"
        exit 2
      fi
      case "$1" in
        --host) TARGET_HOST=$2 ;;
        --service) TARGET_SERVICE=$2 ;;
        --stage) REQUESTED_STAGE=$2 ;;
      esac
      shift 2
      ;;
    --resume)
      RESUME=true
      shift
      ;;
    *)
      if [[ "$1" =~ ^(prepare|validate|deploy|verify|diagnose|complete)$ ]]; then
        REQUESTED_STAGE="$1"
      else
        log "WARN" "Ignoring unrecognized argument: $1"
      fi
      shift
      ;;
  esac
done

# --- Inventory Loading ---

# Populate the global SERVICES array for TARGET_HOST.
# Precedence: --service, then hosts/<host>/services.txt, then services.yaml.
# Exits with status 1 when the host directory does not exist.
load_inventory() {
  local host_dir="${REPO_PATH}/hosts/${TARGET_HOST}"

  log "INFO" "Loading inventory for host: $TARGET_HOST"
  if [[ ! -d "$host_dir" ]]; then
    log "ERROR" "Host directory not found: $host_dir"
    exit 1
  fi

  SERVICES=()
  if [[ -n "$TARGET_SERVICE" ]]; then
    SERVICES=("$TARGET_SERVICE")
  elif [[ -f "${host_dir}/services.txt" ]]; then
    mapfile -t SERVICES < <(list_services_txt "${host_dir}/services.txt")
  elif [[ -f "${host_dir}/services.yaml" ]]; then
    mapfile -t SERVICES < <(list_services_yaml "${host_dir}/services.yaml")
  else
    log "WARN" "No services found for $TARGET_HOST"
  fi

  log "INFO" "Services to process: ${SERVICES[*]}"
}

# --- Stages ---

# Update the repository checkout and create the runtime directory layout.
# A failed `git pull` is tolerated so that offline deployments still work.
stage_prepare() {
  local host=$1
  log "INFO" "Stage: PREPARE ($host)"
  set_state "prepare"

  if ! cd "$REPO_PATH"; then
    log "ERROR" "Cannot enter repository: $REPO_PATH"
    exit 1
  fi

  log "INFO" "Pulling latest changes..."
  if ! git pull; then
    log "WARN" "Git pull failed, proceeding with local state (offline mode or network flap)"
  fi

  mkdir -p "${RUNTIME_PATH}/config" "${RUNTIME_PATH}/data" \
    "${RUNTIME_PATH}/state" "${RUNTIME_PATH}/logs"
  struct_log "prepare" "$host" "all" "success" "repo_updated"
}

# Check that every service in SERVICES has a definition directory.
# Returns 1 on the first missing definition.
stage_validate() {
  local host=$1
  local service
  log "INFO" "Stage: VALIDATE ($host)"
  set_state "validate"

  for service in "${SERVICES[@]}"; do
    log "INFO" "Validating $service..."
    if [[ ! -d "${REPO_PATH}/services/$service" ]]; then
      log "ERROR" "Service definition not found: $service"
      struct_log "validate" "$host" "$service" "fail" "not_found"
      return 1
    fi
  done

  struct_log "validate" "$host" "all" "success" "validated"
}

# Bring up every service with `docker compose up -d`.
# With --resume, services listed before the persisted marker are skipped and
# deployment restarts at the marker (the service that was in progress).
# Returns 1 on the first failed service, after collecting diagnostics.
stage_deploy() {
  local host=$1
  log "INFO" "Stage: DEPLOY ($host)"
  set_state "deploy"

  local last_s
  last_s=$(get_last_service)
  local skip=false
  local service

  if [[ "$RESUME" == "true" && -n "$last_s" ]]; then
    # Only skip ahead when the marker is in the current inventory; otherwise
    # every service would be skipped and the run reported as a success.
    for service in "${SERVICES[@]}"; do
      if [[ "$service" == "$last_s" ]]; then
        skip=true
        break
      fi
    done
    if [[ "$skip" != "true" ]]; then
      log "WARN" "Resume marker '$last_s' is not in the service list; deploying all services"
    fi
  fi

  for service in "${SERVICES[@]}"; do
    if [[ "$skip" == "true" ]]; then
      if [[ "$service" == "$last_s" ]]; then
        skip=false
        log "INFO" "Resuming from $service..."
      else
        log "INFO" "Skipping $service (already processed)"
        continue
      fi
    fi

    log "INFO" "Deploying $service..."
    # Written before the deploy so that a failure leaves the marker on the
    # service that has to be retried.
    set_last_service "$service"

    local svc_dir="${REPO_PATH}/services/$service"
    local runtime_config_dir="${RUNTIME_PATH}/config/$service"
    mkdir -p "$runtime_config_dir"

    local compose_args=("-f" "${svc_dir}/docker-compose.yml")
    if [[ -f "${runtime_config_dir}/docker-compose.override.yml" ]]; then
      log "INFO" "Using override for $service"
      compose_args+=("-f" "${runtime_config_dir}/docker-compose.override.yml")
    fi

    # Determine .env: the runtime copy takes precedence over the repo copy.
    local env_file=""
    if [[ -f "${runtime_config_dir}/.env" ]]; then
      env_file="${runtime_config_dir}/.env"
    elif [[ -f "${svc_dir}/.env" ]]; then
      env_file="${svc_dir}/.env"
    fi

    local run_cmd=("docker" "compose")
    run_cmd+=("${compose_args[@]}")
    if [[ -n "$env_file" ]]; then
      run_cmd+=("--env-file" "$env_file")
    fi
    run_cmd+=("up" "-d" "--remove-orphans")

    log "INFO" "Running: ${run_cmd[*]}"
    if ! "${run_cmd[@]}"; then
      log "ERROR" "Failed to deploy $service"
      struct_log "deploy" "$host" "$service" "fail" "docker_compose_failed"
      stage_diagnose "$host" "$service"
      return 1
    fi

    struct_log "deploy" "$host" "$service" "success" "deployed"
  done

  set_last_service ""
}

# Verify every service: run its healthcheck.sh when present, otherwise require
# a running container whose name matches the service.
# Returns 1 on the first failed service, after collecting diagnostics.
stage_verify() {
  local host=$1
  local service
  log "INFO" "Stage: VERIFY ($host)"
  set_state "verify"

  for service in "${SERVICES[@]}"; do
    log "INFO" "Verifying $service..."
    local health_script="${REPO_PATH}/services/${service}/healthcheck.sh"

    if [[ -f "$health_script" ]]; then
      if ! bash "$health_script"; then
        log "ERROR" "Healthcheck failed for $service"
        struct_log "verify" "$host" "$service" "fail" "healthcheck_failed"
        stage_diagnose "$host" "$service"
        return 1
      fi
    else
      # Capture the matching container IDs instead of piping into `grep -q`:
      # under pipefail an early grep exit can make the pipeline look failed.
      local running
      running=$(docker ps -q --filter "name=$service" --filter "status=running")
      if [[ -z "$running" ]]; then
        log "ERROR" "Container $service is not running"
        struct_log "verify" "$host" "$service" "fail" "container_not_running"
        stage_diagnose "$host" "$service"
        return 1
      fi
    fi

    struct_log "verify" "$host" "$service" "success" "verified"
  done
}

# Print container status and, for a named service, recent compose logs.
# Arguments: $1 - host, $2 - service (optional; empty means all containers).
stage_diagnose() {
  local host=$1
  local service=$2
  log "INFO" "Stage: DIAGNOSE ($host - ${service:-all})"

  echo "--- DIAGNOSTICS FOR ${service:-all} ---"
  docker ps --filter "name=${service:-}"

  if [[ -n "$service" ]]; then
    local svc_dir="${REPO_PATH}/services/$service"
    if [[ -d "$svc_dir" ]]; then
      # Subshell: keeps the caller's working directory and makes a failed cd
      # abort only the diagnostics, not the whole script.
      (
        cd "$svc_dir" || exit 1
        docker compose ps
        docker compose logs --tail=50
      )
    fi
  fi
  echo "--- END DIAGNOSTICS ---"

  struct_log "diagnose" "$host" "${service:-all}" "done" "diagnostics_collected"
}

# Record that the deployment finished.
stage_complete() {
  local host=$1
  log "INFO" "Stage: COMPLETE ($host)"
  set_state "complete"
  struct_log "complete" "$host" "all" "success" "deployment_finished"
}

# --- Execution Logic ---

# Run the pipeline from the given stage through "complete".
# Each arm falls through (;&) into the next stage; the first failing stage
# stops the pipeline with status 1.
# Arguments: $1 - start stage (prepare, validate, deploy, verify, complete).
run_deployment() {
  local start_stage=$1

  case "$start_stage" in
    prepare)
      stage_prepare "$TARGET_HOST" || return 1
      ;&
    validate)
      stage_validate "$TARGET_HOST" || return 1
      ;&
    deploy)
      stage_deploy "$TARGET_HOST" || return 1
      ;&
    verify)
      stage_verify "$TARGET_HOST" || return 1
      ;&
    complete)
      stage_complete "$TARGET_HOST" || return 1
      ;;
    *)
      log "ERROR" "Invalid stage: $start_stage"
      return 1
      ;;
  esac
}

# --- Main ---
log "INFO" "--- Homelab Deployment Started (Host: $TARGET_HOST, Service: ${TARGET_SERVICE:-all}) ---"
load_inventory

# Status of the selected action; propagated as the script's exit status.
rc=0

if [[ "$RESUME" == "true" ]]; then
  CURRENT=$(get_state)
  log "INFO" "Resuming from state: $CURRENT"
  case "$CURRENT" in
    prepare | validate | deploy | verify)
      run_deployment "$CURRENT" || rc=$?
      ;;
    complete)
      log "INFO" "Last deployment was complete. Nothing to resume."
      ;;
    *)
      log "INFO" "No valid state to resume. Starting from prepare..."
      run_deployment "prepare" || rc=$?
      ;;
  esac
elif [[ -n "$REQUESTED_STAGE" ]]; then
  if [[ "$REQUESTED_STAGE" == "diagnose" ]]; then
    stage_diagnose "$TARGET_HOST" "$TARGET_SERVICE" || rc=$?
  else
    run_deployment "$REQUESTED_STAGE" || rc=$?
  fi
else
  run_deployment "prepare" || rc=$?
fi

if ((rc != 0)); then
  log "ERROR" "--- Homelab Deployment Failed (exit code $rc) ---"
else
  log "INFO" "--- Homelab Deployment Finished ---"
fi

exit "$rc"