From b524a3886a440f5856e4fdcf983ac6fce9299b33 Mon Sep 17 00:00:00 2001 From: Oskar Kapala Date: Mon, 11 May 2026 21:20:13 +0200 Subject: [PATCH] Harden deployment runtime framework --- docs/deployment.md | 24 +++- docs/lifecycle.md | 8 +- scripts/deploy/deploy.sh | 218 +++++++++++++------------------------ scripts/lib/compose.sh | 45 ++++++++ scripts/lib/diagnostics.sh | 53 +++++++++ scripts/lib/inventory.sh | 45 ++++++++ scripts/lib/log.sh | 23 ++++ scripts/lib/state.sh | 51 +++++++++ 8 files changed, 318 insertions(+), 149 deletions(-) create mode 100644 scripts/lib/compose.sh create mode 100644 scripts/lib/diagnostics.sh create mode 100644 scripts/lib/inventory.sh create mode 100644 scripts/lib/log.sh create mode 100644 scripts/lib/state.sh diff --git a/docs/deployment.md b/docs/deployment.md index 9d8bcc4..a13e972 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -12,7 +12,17 @@ This document describes the GitOps-lite deployment process for the homelab. ## Staged Deployment Framework -The homelab uses a staged deployment framework located at `scripts/deploy/deploy.sh`. This script is designed to be resumable, stage-aware, and observable. +The homelab uses a modularized staged deployment framework located at `scripts/deploy/deploy.sh`. This script is designed to be resumable, stage-aware, and observable, with core logic split into maintainable libraries in `scripts/lib/`. + +### Runtime Architecture + +The runtime consists of: +- `deploy.sh`: Orchestration entrypoint. +- `lib/log.sh`: Logging and structured output. +- `lib/state.sh`: Deployment state tracking and stage persistence. +- `lib/inventory.sh`: Reliable host and service discovery (Python-based YAML parsing). +- `lib/compose.sh`: Docker Compose operations. +- `lib/diagnostics.sh`: Post-failure analysis and summary generation. 
### Deployment Stages @@ -32,8 +42,16 @@ The homelab uses a staged deployment framework located at `scripts/deploy/deploy If a deployment is interrupted (e.g., due to LTE disconnect on CHELSTY): 1. Rerun the script with the `--resume` flag: `scripts/deploy/deploy.sh --resume`. -2. The script reads the last incomplete stage and continues from there. -3. In the `deploy` stage, it specifically resumes from the first service that was not successfully completed. +2. The script identifies the last incomplete stage using deterministic markers (`/opt/homelab/state/deploy/stage_<stage>_complete`) and continues from the exact failure point. +3. In the `deploy` stage, it specifically resumes from the first service that was not successfully completed, skipping those already up. +4. Repeated runs are safe and idempotent; completed stages are not re-executed unless the resume flag is omitted (which clears state for a fresh run). + +### Diagnostics and Troubleshooting + +The runtime is designed to fail predictably and provide immediate feedback: +- **Automatic Diagnostics**: If any stage fails, `collect_diagnostics` is triggered to capture system state and container logs into `/opt/homelab/logs/deploy/diagnostics_<timestamp>.txt`. +- **Deployment Summary**: Every run concludes with a concise summary showing the host status, last stage reached, and log locations. +- **Offline Resilience**: The `prepare` stage handles `git pull` failures gracefully, allowing deployment from local cache during network instability. ### Operational Semantics diff --git a/docs/lifecycle.md b/docs/lifecycle.md index 16460ef..06c7493 100644 --- a/docs/lifecycle.md +++ b/docs/lifecycle.md @@ -13,11 +13,11 @@ This document defines the lifecycle of a service in the homelab and the procedur - Ensure `/opt/homelab/config/` exists and contains required secrets/configs. - Setup environment variables from `env.example` into `/opt/homelab/config//.env`. 3. 
**Deployment**: - - `scripts/deploy/deploy.sh prepare` - - `scripts/deploy/deploy.sh deploy` + - `scripts/deploy/deploy.sh` (Starts fresh) + - `scripts/deploy/deploy.sh --resume` (Continues after interruption) 4. **Verification**: - - `scripts/deploy/deploy.sh verify` - - Healthchecks are automated within the verify stage. + - Automatic as part of the `deploy.sh` pipeline (`verify` stage). + - Manual: `scripts/deploy/deploy.sh --stage verify`. 5. **Maintenance**: - Periodic updates via `docker compose pull`. - Log monitoring via `docker compose logs -f`. diff --git a/scripts/deploy/deploy.sh b/scripts/deploy/deploy.sh index 0ca462f..5769517 100755 --- a/scripts/deploy/deploy.sh +++ b/scripts/deploy/deploy.sh @@ -4,12 +4,12 @@ set -o pipefail # --- Configuration --- -RUNTIME_PATH="/opt/homelab" -STATE_DIR="${RUNTIME_PATH}/state/deploy" -LOG_DIR="${RUNTIME_PATH}/logs/deploy" -REPO_PATH="${HOME}/homelab-codex-ws" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -LOG_FILE="${LOG_DIR}/deploy_${TIMESTAMP}.log" +export RUNTIME_PATH="/opt/homelab" +export STATE_DIR="${RUNTIME_PATH}/state/deploy" +export LOG_DIR="${RUNTIME_PATH}/logs/deploy" +export REPO_PATH="${HOME}/homelab-codex-ws" +export TIMESTAMP=$(date +%Y%m%d_%H%M%S) +export LOG_FILE="${LOG_DIR}/deploy_${TIMESTAMP}.log" # --- Initialization --- mkdir -p "$STATE_DIR" "$LOG_DIR" @@ -17,52 +17,15 @@ mkdir -p "$STATE_DIR" "$LOG_DIR" # Redirection for logging exec > >(tee -a "$LOG_FILE") 2>&1 -# --- Helpers --- - -log() { - local level=$1 - shift - local message=$* - echo "[$(date +'%Y-%m-%d %H:%M:%S')] [$level] $message" -} - -# Structured log for machine reading -# timestamp, stage, host, service, command_result, retry_info -struct_log() { - local stage=$1 - local host=$2 - local service=$3 - local result=$4 - local info=$5 - log "STRUCT" "stage=$stage host=$host service=$service result=$result info=\"$info\"" -} - -set_state() { - echo "$1" > "${STATE_DIR}/current_stage" -} - -get_state() { - if [ -f "${STATE_DIR}/current_stage" ]; 
then - cat "${STATE_DIR}/current_stage" - else - echo "none" - fi -} - -set_last_service() { - echo "$1" > "${STATE_DIR}/last_service" -} - -get_last_service() { - if [ -f "${STATE_DIR}/last_service" ]; then - cat "${STATE_DIR}/last_service" - else - echo "" - fi -} +# --- Load Libraries --- +LIB_PATH="${REPO_PATH}/scripts/lib" +source "${LIB_PATH}/log.sh" +source "${LIB_PATH}/state.sh" +source "${LIB_PATH}/inventory.sh" +source "${LIB_PATH}/compose.sh" +source "${LIB_PATH}/diagnostics.sh" # --- CLI Parsing --- - TARGET_HOST=$(hostname) TARGET_SERVICE="" RESUME=false @@ -95,37 +58,17 @@ while [[ $# -gt 0 ]]; do esac done -# --- Inventory Loading --- - -load_inventory() { - log "INFO" "Loading inventory for host: $TARGET_HOST" - - if [[ ! -d "${REPO_PATH}/hosts/${TARGET_HOST}" ]]; then - log "ERROR" "Host directory not found: ${REPO_PATH}/hosts/${TARGET_HOST}" - exit 1 - fi - - if [[ -n "$TARGET_SERVICE" ]]; then - SERVICES=("$TARGET_SERVICE") - else - if [[ -f "${REPO_PATH}/hosts/${TARGET_HOST}/services.txt" ]]; then - SERVICES=($(cat "${REPO_PATH}/hosts/${TARGET_HOST}/services.txt")) - elif [[ -f "${REPO_PATH}/hosts/${TARGET_HOST}/services.yaml" ]]; then - SERVICES=($(grep -A 100 "services:" "${REPO_PATH}/hosts/${TARGET_HOST}/services.yaml" | grep "^ [a-z0-9_-]\+:" | sed 's/ \(.*\):/\1/')) - else - log "WARN" "No services found for $TARGET_HOST" - SERVICES=() - fi - fi - log "INFO" "Services to process: ${SERVICES[*]}" -} - # --- Stages --- stage_prepare() { local host=$1 + if is_stage_complete "prepare" && [[ "$RESUME" == "true" ]]; then + log "INFO" "Skipping PREPARE (already complete)" + return 0 + fi + log "INFO" "Stage: PREPARE ($host)" - set_state "prepare" + set_stage "prepare" cd "$REPO_PATH" || exit 1 log "INFO" "Pulling latest changes..." 
@@ -133,15 +76,22 @@ stage_prepare() { log "WARN" "Git pull failed, proceeding with local state (offline mode or network flap)" fi + # Ensure runtime directories exist mkdir -p "${RUNTIME_PATH}/config" "${RUNTIME_PATH}/data" "${RUNTIME_PATH}/state" "${RUNTIME_PATH}/logs" struct_log "prepare" "$host" "all" "success" "repo_updated" + mark_stage_complete "prepare" } stage_validate() { local host=$1 + if is_stage_complete "validate" && [[ "$RESUME" == "true" ]]; then + log "INFO" "Skipping VALIDATE (already complete)" + return 0 + fi + log "INFO" "Stage: VALIDATE ($host)" - set_state "validate" + set_stage "validate" for service in "${SERVICES[@]}"; do log "INFO" "Validating $service..." @@ -153,12 +103,18 @@ stage_validate() { done struct_log "validate" "$host" "all" "success" "validated" + mark_stage_complete "validate" } stage_deploy() { local host=$1 + if is_stage_complete "deploy" && [[ "$RESUME" == "true" ]]; then + log "INFO" "Skipping DEPLOY (already complete)" + return 0 + fi + log "INFO" "Stage: DEPLOY ($host)" - set_state "deploy" + set_stage "deploy" local last_s=$(get_last_service) local skip=false @@ -180,48 +136,28 @@ stage_deploy() { log "INFO" "Deploying $service..." 
set_last_service "$service" - local svc_dir="${REPO_PATH}/services/$service" - local runtime_config_dir="${RUNTIME_PATH}/config/$service" - mkdir -p "$runtime_config_dir" - - local compose_args=("-f" "${svc_dir}/docker-compose.yml") - if [[ -f "${runtime_config_dir}/docker-compose.override.yml" ]]; then - log "INFO" "Using override for $service" - compose_args+=("-f" "${runtime_config_dir}/docker-compose.override.yml") - fi - - # Determine .env - local env_file="" - if [[ -f "${runtime_config_dir}/.env" ]]; then - env_file="${runtime_config_dir}/.env" - elif [[ -f "${svc_dir}/.env" ]]; then - env_file="${svc_dir}/.env" - fi - - local run_cmd=("docker" "compose") - run_cmd+=("${compose_args[@]}") - if [[ -n "$env_file" ]]; then - run_cmd+=("--env-file" "$env_file") - fi - run_cmd+=("up" "-d" "--remove-orphans") - - log "INFO" "Running: ${run_cmd[*]}" - if ! "${run_cmd[@]}"; then - log "ERROR" "Failed to deploy $service" + if ! run_compose_up "$service"; then struct_log "deploy" "$host" "$service" "fail" "docker_compose_failed" - stage_diagnose "$host" "$service" + collect_diagnostics "$host" "$service" return 1 fi struct_log "deploy" "$host" "$service" "success" "deployed" done + set_last_service "" + mark_stage_complete "deploy" } stage_verify() { local host=$1 + if is_stage_complete "verify" && [[ "$RESUME" == "true" ]]; then + log "INFO" "Skipping VERIFY (already complete)" + return 0 + fi + log "INFO" "Stage: VERIFY ($host)" - set_state "verify" + set_stage "verify" for service in "${SERVICES[@]}"; do log "INFO" "Verifying $service..." @@ -230,46 +166,29 @@ stage_verify() { if ! bash "$health_script"; then log "ERROR" "Healthcheck failed for $service" struct_log "verify" "$host" "$service" "fail" "healthcheck_failed" - stage_diagnose "$host" "$service" + collect_diagnostics "$host" "$service" return 1 fi else + # Generic check if container is running if ! 
docker ps --filter "name=$service" --filter "status=running" | grep -q "$service"; then log "ERROR" "Container $service is not running" struct_log "verify" "$host" "$service" "fail" "container_not_running" - stage_diagnose "$host" "$service" + collect_diagnostics "$host" "$service" return 1 fi fi struct_log "verify" "$host" "$service" "success" "verified" done -} - -stage_diagnose() { - local host=$1 - local service=$2 - log "INFO" "Stage: DIAGNOSE ($host - ${service:-all})" - - echo "--- DIAGNOSTICS FOR ${service:-all} ---" - docker ps --filter "name=${service:-}" - - if [[ -n "$service" ]]; then - local svc_dir="${REPO_PATH}/services/$service" - if [[ -d "$svc_dir" ]]; then - cd "$svc_dir" || exit 1 - docker compose ps - docker compose logs --tail=50 - fi - fi - echo "--- END DIAGNOSTICS ---" - struct_log "diagnose" "$host" "${service:-all}" "done" "diagnostics_collected" + mark_stage_complete "verify" } stage_complete() { local host=$1 log "INFO" "Stage: COMPLETE ($host)" - set_state "complete" + set_stage "complete" struct_log "complete" "$host" "all" "success" "deployment_finished" + clear_deployment_state } # --- Execution Logic --- @@ -277,6 +196,7 @@ stage_complete() { run_deployment() { local start_stage=$1 + # Sequential execution from start_stage case "$start_stage" in prepare) stage_prepare "$TARGET_HOST" || return 1 @@ -304,31 +224,45 @@ run_deployment() { log "INFO" "--- Homelab Deployment Started (Host: $TARGET_HOST, Service: ${TARGET_SERVICE:-all}) ---" -load_inventory +if ! load_inventory "$TARGET_HOST" "$TARGET_SERVICE"; then + log "ERROR" "Failed to load inventory" + exit 1 +fi +EXIT_STATUS=0 if [[ "$RESUME" == "true" ]]; then - CURRENT=$(get_state) + CURRENT=$(get_stage) log "INFO" "Resuming from state: $CURRENT" case "$CURRENT" in prepare|validate|deploy|verify) - run_deployment "$CURRENT" + run_deployment "$CURRENT" || EXIT_STATUS=1 ;; - complete) - log "INFO" "Last deployment was complete. Nothing to resume." 
+ complete|none) + log "INFO" "No interrupted deployment found. Starting from scratch..." + run_deployment "prepare" || EXIT_STATUS=1 ;; *) - log "INFO" "No valid state to resume. Starting from prepare..." - run_deployment "prepare" + log "INFO" "Unknown state. Starting from prepare..." + run_deployment "prepare" || EXIT_STATUS=1 ;; esac elif [[ -n "$REQUESTED_STAGE" ]]; then if [[ "$REQUESTED_STAGE" == "diagnose" ]]; then - stage_diagnose "$TARGET_HOST" "$TARGET_SERVICE" + collect_diagnostics "$TARGET_HOST" "$TARGET_SERVICE" else - run_deployment "$REQUESTED_STAGE" + run_deployment "$REQUESTED_STAGE" || EXIT_STATUS=1 fi else - run_deployment "prepare" + # New deployment - clear previous state + clear_deployment_state + run_deployment "prepare" || EXIT_STATUS=1 fi -log "INFO" "--- Homelab Deployment Finished ---" +if [[ $EXIT_STATUS -eq 0 ]]; then + print_summary "$TARGET_HOST" "SUCCESS" + log "INFO" "--- Homelab Deployment Finished Successfully ---" +else + print_summary "$TARGET_HOST" "FAILED" + log "ERROR" "--- Homelab Deployment Failed ---" + exit 1 +fi diff --git a/scripts/lib/compose.sh b/scripts/lib/compose.sh new file mode 100644 index 0000000..8e3c2d8 --- /dev/null +++ b/scripts/lib/compose.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# compose.sh - Docker Compose operations + +run_compose_up() { + local service=$1 + local svc_dir="${REPO_PATH}/services/$service" + local runtime_config_dir="${RUNTIME_PATH}/config/$service" + + if [[ ! 
-d "$svc_dir" ]]; then + log "ERROR" "Service directory not found: $svc_dir" + return 1 + fi + + mkdir -p "$runtime_config_dir" + + local compose_args=("-f" "${svc_dir}/docker-compose.yml") + if [[ -f "${runtime_config_dir}/docker-compose.override.yml" ]]; then + log "INFO" "Using override for $service" + compose_args+=("-f" "${runtime_config_dir}/docker-compose.override.yml") + fi + + # Determine .env + local env_file="" + if [[ -f "${runtime_config_dir}/.env" ]]; then + env_file="${runtime_config_dir}/.env" + elif [[ -f "${svc_dir}/.env" ]]; then + env_file="${svc_dir}/.env" + fi + + local run_cmd=("docker" "compose") + run_cmd+=("${compose_args[@]}") + if [[ -n "$env_file" ]]; then + run_cmd+=("--env-file" "$env_file") + fi + run_cmd+=("up" "-d" "--remove-orphans") + + log "INFO" "Running: ${run_cmd[*]}" + if ! "${run_cmd[@]}"; then + log "ERROR" "Docker compose failed for $service" + return 1 + fi + return 0 +} + +export -f run_compose_up diff --git a/scripts/lib/diagnostics.sh b/scripts/lib/diagnostics.sh new file mode 100644 index 0000000..7547840 --- /dev/null +++ b/scripts/lib/diagnostics.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# diagnostics.sh - Deployment failure diagnostics + +collect_diagnostics() { + local host=$1 + local service=$2 + log "INFO" "Stage: DIAGNOSE ($host - ${service:-all})" + + local diag_file="${LOG_DIR}/diagnostics_${TIMESTAMP}.txt" + { + echo "--- DIAGNOSTICS FOR ${service:-all} (Host: $host, Time: $(date)) ---" + echo "Uptime: $(uptime)" + echo "Memory: $(free -h)" + echo "Disk: $(df -h /)" + echo "--- Docker Status ---" + docker ps --filter "name=${service:-}" + + if [[ -n "$service" ]]; then + local svc_dir="${REPO_PATH}/services/$service" + if [[ -d "$svc_dir" ]]; then + echo "--- $service Logs ---" + ( cd "$svc_dir" && docker compose logs --tail=50 ) # subshell: the brace group runs in the caller's shell, so a bare cd would change its working directory + fi + fi + echo "--- END DIAGNOSTICS ---" + } > "$diag_file" 2>&1 + + # Also output to console for immediate visibility + cat "$diag_file" + log "INFO" "Diagnostics stored in 
$diag_file" +} + +print_summary() { + local host=$1 + local status=$2 + local last_stage=$(get_stage) + local last_service=$(get_last_service) + + echo "" + echo "==========================================" + echo " DEPLOYMENT SUMMARY" + echo "==========================================" + echo "Host: $host" + echo "Status: $status" + echo "Last Stage: $last_stage" + [[ -n "$last_service" ]] && echo "Last Service: $last_service" + echo "Log File: $LOG_FILE" + echo "==========================================" + echo "" +} + +export -f collect_diagnostics +export -f print_summary diff --git a/scripts/lib/inventory.sh b/scripts/lib/inventory.sh new file mode 100644 index 0000000..3c7fb8f --- /dev/null +++ b/scripts/lib/inventory.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# inventory.sh - Host and service discovery + +# Populate the global SERVICES array for a host. +# Arguments: $1 - host name (a directory under ${REPO_PATH}/hosts); $2 - optional service name that overrides the inventory files. +# Returns: 1 when the host directory is missing or services.yaml cannot be parsed. +load_inventory() { + local host=$1 + local service_override=$2 + + log "INFO" "Loading inventory for host: $host" + + if [[ ! -d "${REPO_PATH}/hosts/${host}" ]]; then + log "ERROR" "Host directory not found: ${REPO_PATH}/hosts/${host}" + return 1 + fi + + if [[ -n "$service_override" ]]; then + SERVICES=("$service_override") + else + if [[ -f "${REPO_PATH}/hosts/${host}/services.txt" ]]; then + # Read services from text file, ignoring comments and empty lines + mapfile -t SERVICES < <(grep -v '^\s*#' "${REPO_PATH}/hosts/${host}/services.txt" | grep -v '^\s*$') + elif [[ -f "${REPO_PATH}/hosts/${host}/services.yaml" ]]; then + # Use python for reliable YAML parsing; the path is passed as argv instead of being spliced into the Python source + local yaml_services + if ! yaml_services=$(python3 -c " +import yaml, sys +try: + with open(sys.argv[1], 'r') as f: + data = yaml.safe_load(f) + if data and 'services' in data: + if isinstance(data['services'], dict): + print(' '.join(data['services'].keys())) + elif isinstance(data['services'], list): + print(' '.join(data['services'])) +except Exception as e: + print(f'Error parsing YAML: {e}', file=sys.stderr) + sys.exit(1) +" "${REPO_PATH}/hosts/${host}/services.yaml"); then + log "ERROR" "Failed to parse ${REPO_PATH}/hosts/${host}/services.yaml" + return 1 + fi + read -r -a SERVICES <<< "$yaml_services" + else + log "WARN" "No services found for $host" + SERVICES=() 
+ fi + fi + log "INFO" "Services to process: ${SERVICES[*]}" +} + +export -f load_inventory diff --git a/scripts/lib/log.sh b/scripts/lib/log.sh new file mode 100644 index 0000000..af9008e --- /dev/null +++ b/scripts/lib/log.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# log.sh - Logging utilities for homelab deployment + +log() { + local level=$1 + shift + local message=$* + echo "[$(date +'%Y-%m-%d %H:%M:%S')] [$level] $message" +} + +# Structured log for machine reading +# timestamp, stage, host, service, command_result, info +struct_log() { + local stage=$1 + local host=$2 + local service=$3 + local result=$4 + local info=$5 + log "STRUCT" "stage=$stage host=$host service=$service result=$result info=\"$info\"" +} + +export -f log +export -f struct_log diff --git a/scripts/lib/state.sh b/scripts/lib/state.sh new file mode 100644 index 0000000..1b6f2e2 --- /dev/null +++ b/scripts/lib/state.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# state.sh - Deployment state management + +set_stage() { + local stage=$1 + echo "$stage" > "${STATE_DIR}/current_stage" +} + +get_stage() { + if [[ -f "${STATE_DIR}/current_stage" ]]; then + cat "${STATE_DIR}/current_stage" + else + echo "none" + fi +} + +mark_stage_complete() { + local stage=$1 + touch "${STATE_DIR}/stage_${stage}_complete" +} + +is_stage_complete() { + local stage=$1 + [[ -f "${STATE_DIR}/stage_${stage}_complete" ]] +} + +clear_deployment_state() { + rm -f "${STATE_DIR:?}"/stage_*_complete # :? aborts instead of globbing /stage_*_complete when STATE_DIR is unset or empty + rm -f "${STATE_DIR}/current_stage" + rm -f "${STATE_DIR}/last_service" +} + +set_last_service() { + echo "$1" > "${STATE_DIR}/last_service" +} + +get_last_service() { + if [[ -f "${STATE_DIR}/last_service" ]]; then + cat "${STATE_DIR}/last_service" + else + echo "" + fi +} + +export -f set_stage +export -f get_stage +export -f mark_stage_complete +export -f is_stage_complete +export -f clear_deployment_state +export -f set_last_service +export -f get_last_service