diff --git a/docs/deployment.md b/docs/deployment.md index 164fb9f..9d8bcc4 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -16,17 +16,24 @@ The homelab uses a staged deployment framework located at `scripts/deploy/deploy ### Deployment Stages -1. **prepare**: Pulls the latest changes from Git, validates inventory, and prepares the local environment. -2. **deploy**: Executes `docker compose` commands for all assigned services. -3. **verify**: Checks the health and connectivity of deployed services. -4. **diagnose**: Performs deep checks and resource analysis if something goes wrong. -5. **rollback**: Reverts to a previous known-good state. -6. **resume**: Automatically continues from the last successful stage. +1. **prepare**: Pulls the latest changes from Git, validates inventory, and prepares the local environment. It is tolerant of network failures to support intermittently connected nodes like CHELSTY. +2. **validate**: Ensures a service definition directory exists under `services/` for every assigned service. +3. **deploy**: Executes `docker compose` commands for all assigned services. Supports `.env` files and `docker-compose.override.yml` under `/opt/homelab/config/<service>/`. +4. **verify**: Executes service-specific `healthcheck.sh` scripts or checks container status. +5. **diagnose**: Automatically triggered when `deploy` or `verify` fails; collects container status and logs for troubleshooting. +6. **complete**: Finalizes the deployment and marks the state as finished. ### State Tracking and Logging -- **State**: Local node state is tracked in `/opt/homelab/state/deploy/current_stage`. -- **Logs**: Detailed execution logs are stored in `/opt/homelab/logs/deploy/deploy_.log`. +- **State**: Local node state is tracked in `/opt/homelab/state/deploy/current_stage`. The service most recently started in the `deploy` stage is tracked in `last_service` (cleared once every service has been deployed) to support granular resumption. +- **Logs**: Detailed execution logs are stored in `/opt/homelab/logs/deploy/deploy_.log`. 
Structured log entries prefixed with `[STRUCT]` provide machine-parseable event data. + +### Resume Semantics + +If a deployment is interrupted (e.g., due to LTE disconnect on CHELSTY): +1. Rerun the script with the `--resume` flag: `scripts/deploy/deploy.sh --resume`. +2. The script reads the last incomplete stage and continues from there. +3. In the `deploy` stage, it resumes from the service recorded in `last_service`, re-running that service because it may not have finished, and then continues with the remaining services. ### Operational Semantics @@ -38,9 +45,9 @@ Deployment is **hybrid**: ### Recovery Workflow If a deployment fails: -1. Run `deploy.sh diagnose` to identify the issue. +1. Run `deploy.sh --stage diagnose` to identify the issue. 2. Use the `recover-node` AI prompt to analyze logs and get recommendations. -3. Either fix the issue and run `deploy.sh resume`, or use `deploy.sh rollback`. +3. Fix the issue (e.g., update a secret in `.env`) and run `deploy.sh --resume`. ## Onboarding New Nodes diff --git a/docs/lifecycle.md b/docs/lifecycle.md index 0ef8128..16460ef 100644 --- a/docs/lifecycle.md +++ b/docs/lifecycle.md @@ -13,11 +13,11 @@ This document defines the lifecycle of a service in the homelab and the procedur - Ensure `/opt/homelab/config/` exists and contains required secrets/configs. - Setup environment variables from `env.example` into `/opt/homelab/config/<service>/.env`. 3. **Deployment**: - - `docker compose pull` - - `docker compose up -d` + - `scripts/deploy/deploy.sh prepare` (starts at `prepare` and continues through every later stage up to `complete`) + - `scripts/deploy/deploy.sh deploy` (starts at `deploy`, skipping `prepare` and `validate`) 4. **Verification**: - - Run `healthcheck.sh`. - - Verify ports are reachable according to `service.yaml`. + - `scripts/deploy/deploy.sh verify` (re-runs verification only, then marks the deployment complete) + - Healthchecks are automated within the verify stage. 5. **Maintenance**: - Periodic updates via `docker compose pull`. - Log monitoring via `docker compose logs -f`. 
diff --git a/scripts/deploy/deploy.sh b/scripts/deploy/deploy.sh
index da462ef..0ca462f 100755
--- a/scripts/deploy/deploy.sh
+++ b/scripts/deploy/deploy.sh
@@ -1,8 +1,7 @@
 #!/usr/bin/env bash
 # deploy.sh - Staged deployment framework for homelab nodes.
-# Usage: ./deploy.sh [stage]
 
-set -e
+set -o pipefail
 
 # --- Configuration ---
 RUNTIME_PATH="/opt/homelab"
@@ -18,13 +17,28 @@ mkdir -p "$STATE_DIR" "$LOG_DIR"
 # Redirection for logging
 exec > >(tee -a "$LOG_FILE") 2>&1
 
+# --- Helpers ---
+
 log() {
-  echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1"
+  local level=$1
+  shift
+  local message=$*
+  echo "[$(date +'%Y-%m-%d %H:%M:%S')] [$level] $message"
+}
+
+# Structured log for machine reading
+# timestamp, stage, host, service, command_result, retry_info
+struct_log() {
+  local stage=$1
+  local host=$2
+  local service=$3
+  local result=$4
+  local info=$5
+  log "STRUCT" "stage=$stage host=$host service=$service result=$result info=\"$info\""
 }
 
 set_state() {
   echo "$1" > "${STATE_DIR}/current_stage"
-  log "State set to: $1"
 }
 
 get_state() {
@@ -35,76 +49,364 @@ get_state() {
   fi
 }
 
+# Record the service the deploy stage is about to process.
+set_last_service() {
+  echo "$1" > "${STATE_DIR}/last_service"
+}
+
+# Print the recorded service name, or an empty line when none is recorded.
+get_last_service() {
+  if [ -f "${STATE_DIR}/last_service" ]; then
+    cat "${STATE_DIR}/last_service"
+  else
+    echo ""
+  fi
+}
+
+# --- CLI Parsing ---
+
+TARGET_HOST=$(hostname)
+TARGET_SERVICE=""
+RESUME=false
+REQUESTED_STAGE=""
+
+# Abort unless the option in $1 is followed by a non-empty value in $2.
+# Without this guard a trailing "--host" makes "shift 2" fail without
+# shifting, so the parsing loop below would never terminate.
+require_value() {
+  if [[ $# -lt 2 || -z "$2" ]]; then
+    log "ERROR" "Option $1 requires a value"
+    exit 2
+  fi
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --host)
+      require_value "$@"
+      TARGET_HOST="$2"
+      shift 2
+      ;;
+    --service)
+      require_value "$@"
+      TARGET_SERVICE="$2"
+      shift 2
+      ;;
+    --resume)
+      RESUME=true
+      shift
+      ;;
+    --stage)
+      require_value "$@"
+      REQUESTED_STAGE="$2"
+      shift 2
+      ;;
+    prepare|validate|deploy|verify|diagnose|complete)
+      REQUESTED_STAGE="$1"
+      shift
+      ;;
+    *)
+      # Fail loudly: silently ignoring e.g. the removed "rollback" command
+      # would otherwise fall through to a full deployment.
+      log "ERROR" "Unknown argument: $1"
+      log "ERROR" "Usage: $0 [--host NAME] [--service NAME] [--resume] [--stage STAGE | STAGE]"
+      exit 2
+      ;;
+  esac
+done
+
+# --- Inventory Loading ---
+
+# Populate the global SERVICES array for TARGET_HOST.
+# Exits 1 when the host directory does not exist.
+load_inventory() {
+  local host_dir="${REPO_PATH}/hosts/${TARGET_HOST}"
+  local words
+
+  log "INFO" "Loading inventory for host: $TARGET_HOST"
+
+  if [[ ! -d "$host_dir" ]]; then
+    log "ERROR" "Host directory not found: $host_dir"
+    exit 1
+  fi
+
+  SERVICES=()
+  if [[ -n "$TARGET_SERVICE" ]]; then
+    SERVICES=("$TARGET_SERVICE")
+  elif [[ -f "${host_dir}/services.txt" ]]; then
+    # Whitespace-separated names; read -a splits without glob expansion,
+    # unlike an unquoted $(cat ...).
+    while read -r -a words || [[ ${#words[@]} -gt 0 ]]; do
+      SERVICES+=("${words[@]}")
+    done < "${host_dir}/services.txt"
+  elif [[ -f "${host_dir}/services.yaml" ]]; then
+    # Collect the keys nested directly under "services:". The indentation
+    # of the first key defines the level, so deeper keys are ignored.
+    mapfile -t SERVICES < <(
+      awk '
+        /^[ \t]*services:/ { found = 1; next }
+        found && /^[^ \t#]/ { found = 0 }
+        found && /^[ \t]+[a-z0-9_-]+:/ {
+          indent = match($0, /[^ \t]/)
+          if (!level) level = indent
+          if (indent == level) {
+            sub(/^[ \t]+/, "")
+            sub(/:.*$/, "")
+            print
+          }
+        }
+      ' "${host_dir}/services.yaml"
+    )
+  else
+    log "WARN" "No services found for $TARGET_HOST"
+  fi
+  log "INFO" "Services to process: ${SERVICES[*]}"
+}
+
 # --- Stages ---
 
+# Stage 1: refresh the checkout (best effort) and create the runtime directories.
 stage_prepare() {
-  log "Stage: PREPARE"
+  local host=$1
+  local repo_status="repo_updated"
+  log "INFO" "Stage: PREPARE ($host)"
   set_state "prepare"
-  # Skeleton: Pull latest changes, check dependencies, validate inventory
-  log "Checking repository at $REPO_PATH..."
-  cd "$REPO_PATH" && git pull
-  log "Preparation complete."
+
+  cd "$REPO_PATH" || exit 1
+  log "INFO" "Pulling latest changes..."
+  if ! git pull; then
+    log "WARN" "Git pull failed, proceeding with local state (offline mode or network flap)"
+    # Report the stale checkout so the structured log does not claim an update.
+    repo_status="repo_not_updated"
+  fi
+
+  mkdir -p "${RUNTIME_PATH}/config" "${RUNTIME_PATH}/data" "${RUNTIME_PATH}/state" "${RUNTIME_PATH}/logs"
+
+  struct_log "prepare" "$host" "all" "success" "$repo_status"
+}
+
+# Stage 2: require a services/<name> directory for every entry in SERVICES.
+stage_validate() {
+  local host=$1
+  local service
+  log "INFO" "Stage: VALIDATE ($host)"
+  set_state "validate"
+
+  for service in "${SERVICES[@]}"; do
+    log "INFO" "Validating $service..."
+    if [[ ! -d "${REPO_PATH}/services/$service" ]]; then
+      log "ERROR" "Service definition not found: $service"
+      struct_log "validate" "$host" "$service" "fail" "not_found"
+      return 1
+    fi
+  done
+
+  struct_log "validate" "$host" "all" "success" "validated"
 }
 
+# Stage 3: run "docker compose up -d" per service, honouring --resume.
 stage_deploy() {
-  log "Stage: DEPLOY"
+  local host=$1
+  local service candidate last_s
+  local skip=false
+  log "INFO" "Stage: DEPLOY ($host)"
   set_state "deploy"
-  # Skeleton: Iterate through services and run docker compose
-  log "Deploying services defined for $(hostname)..."
-  # Implementation detail: loop through services/ and run compose
-  log "Deployment complete."
+
+  last_s=$(get_last_service)
+  if [[ "$RESUME" == "true" && -n "$last_s" ]]; then
+    # Only skip ahead when the recorded service is still in the list;
+    # otherwise every service would be skipped and nothing deployed.
+    for candidate in "${SERVICES[@]}"; do
+      if [[ "$candidate" == "$last_s" ]]; then
+        skip=true
+        break
+      fi
+    done
+    if [[ "$skip" != "true" ]]; then
+      log "WARN" "Recorded service '$last_s' is not in the service list; deploying all services"
+    fi
+  fi
+
+  for service in "${SERVICES[@]}"; do
+    if [[ "$skip" == "true" ]]; then
+      if [[ "$service" == "$last_s" ]]; then
+        skip=false
+        log "INFO" "Resuming from $service..."
+      else
+        log "INFO" "Skipping $service (already processed)"
+        continue
+      fi
+    fi
+
+    log "INFO" "Deploying $service..."
+    set_last_service "$service"
+
+    local svc_dir="${REPO_PATH}/services/$service"
+    local runtime_config_dir="${RUNTIME_PATH}/config/$service"
+    mkdir -p "$runtime_config_dir"
+
+    local compose_args=("-f" "${svc_dir}/docker-compose.yml")
+    if [[ -f "${runtime_config_dir}/docker-compose.override.yml" ]]; then
+      log "INFO" "Using override for $service"
+      compose_args+=("-f" "${runtime_config_dir}/docker-compose.override.yml")
+    fi
+
+    # Determine .env
+    local env_file=""
+    if [[ -f "${runtime_config_dir}/.env" ]]; then
+      env_file="${runtime_config_dir}/.env"
+    elif [[ -f "${svc_dir}/.env" ]]; then
+      env_file="${svc_dir}/.env"
+    fi
+
+    local run_cmd=("docker" "compose")
+    run_cmd+=("${compose_args[@]}")
+    if [[ -n "$env_file" ]]; then
+      run_cmd+=("--env-file" "$env_file")
+    fi
+    run_cmd+=("up" "-d" "--remove-orphans")
+
+    log "INFO" "Running: ${run_cmd[*]}"
+    if ! "${run_cmd[@]}"; then
+      log "ERROR" "Failed to deploy $service"
+      struct_log "deploy" "$host" "$service" "fail" "docker_compose_failed"
+      stage_diagnose "$host" "$service"
+      return 1
+    fi
+
+    struct_log "deploy" "$host" "$service" "success" "deployed"
+  done
+  set_last_service ""
 }
 
+# Stage 4: run each service's healthcheck.sh, else require a running container.
 stage_verify() {
-  log "Stage: VERIFY"
+  local host=$1
+  local service health_script
+  log "INFO" "Stage: VERIFY ($host)"
   set_state "verify"
-  # Skeleton: Check container status, healthchecks, connectivity
-  log "Verifying service health..."
-  docker ps
-  log "Verification complete."
+
+  for service in "${SERVICES[@]}"; do
+    log "INFO" "Verifying $service..."
+    health_script="${REPO_PATH}/services/${service}/healthcheck.sh"
+    if [[ -f "$health_script" ]]; then
+      if ! bash "$health_script"; then
+        log "ERROR" "Healthcheck failed for $service"
+        struct_log "verify" "$host" "$service" "fail" "healthcheck_failed"
+        stage_diagnose "$host" "$service"
+        return 1
+      fi
+    else
+      # Test docker's own output instead of piping into "grep -q": under
+      # pipefail an early grep exit can SIGPIPE docker ps and fail the check.
+      if [[ -z "$(docker ps -q --filter "name=$service" --filter "status=running")" ]]; then
+        log "ERROR" "Container $service is not running"
+        struct_log "verify" "$host" "$service" "fail" "container_not_running"
+        stage_diagnose "$host" "$service"
+        return 1
+      fi
+    fi
+    struct_log "verify" "$host" "$service" "success" "verified"
+  done
 }
 
+# Print "docker ps" filtered by $2 and, for a named service, its compose status and last 50 log lines.
 stage_diagnose() {
-  log "Stage: DIAGNOSE"
-  # Skeleton: Check logs, resource usage, networking
-  log "Running diagnostics..."
-  docker stats --no-stream
-  log "Diagnostics complete."
+  local host=$1
+  local service=$2
+  log "INFO" "Stage: DIAGNOSE ($host - ${service:-all})"
+
+  echo "--- DIAGNOSTICS FOR ${service:-all} ---"
+  docker ps --filter "name=${service:-}"
+
+  if [[ -n "$service" ]]; then
+    local svc_dir="${REPO_PATH}/services/$service"
+    if [[ -d "$svc_dir" ]]; then
+      # Subshell: keep the caller's working directory, and a failed cd
+      # only skips the compose output instead of exiting the whole script.
+      (
+        cd "$svc_dir" || exit 1
+        docker compose ps
+        docker compose logs --tail=50
+      )
+    fi
+  fi
+  echo "--- END DIAGNOSTICS ---"
+  struct_log "diagnose" "$host" "${service:-all}" "done" "diagnostics_collected"
 }
 
-stage_rollback() {
-  log "Stage: ROLLBACK"
-  # Skeleton: Revert to previous git commit or previous state
-  log "Rolling back changes..."
-  log "Rollback complete."
+# Final stage: mark the deployment state as complete.
+stage_complete() {
+  local host=$1
+  log "INFO" "Stage: COMPLETE ($host)"
+  set_state "complete"
+  struct_log "complete" "$host" "all" "success" "deployment_finished"
 }
 
-stage_resume() {
-  log "Stage: RESUME"
-  CURRENT=$(get_state)
-  log "Resuming from state: $CURRENT"
-  case "$CURRENT" in
-    "prepare") stage_deploy ;;
-    "deploy") stage_verify ;;
-    "verify") log "Last deployment was verified. Nothing to resume." ;;
-    *) log "Unknown state or nothing to resume. Starting from prepare..."; stage_prepare ;;
+# --- Execution Logic ---
+
+# Run the pipeline from stage $1 through complete; returns 1 on the first failure.
+run_deployment() {
+  local start_stage=$1
+
+  case "$start_stage" in
+    prepare)
+      stage_prepare "$TARGET_HOST" || return 1
+      ;&
+    validate)
+      stage_validate "$TARGET_HOST" || return 1
+      ;&
+    deploy)
+      stage_deploy "$TARGET_HOST" || return 1
+      ;&
+    verify)
+      stage_verify "$TARGET_HOST" || return 1
+      ;&
+    complete)
+      stage_complete "$TARGET_HOST" || return 1
+      ;;
+    *)
+      log "ERROR" "Invalid stage: $start_stage"
+      return 1
+      ;;
   esac
 }
 
 # --- Main ---
 
-COMMAND=${1:-resume}
+log "INFO" "--- Homelab Deployment Started (Host: $TARGET_HOST, Service: ${TARGET_SERVICE:-all}) ---"
 
-log "--- Homelab Deployment Started (Command: $COMMAND) ---"
+load_inventory
 
-case "$COMMAND" in
-  prepare) stage_prepare ;;
-  deploy) stage_deploy ;;
-  verify) stage_verify ;;
-  diagnose) stage_diagnose ;;
-  rollback) stage_rollback ;;
-  resume) stage_resume ;;
-  *) echo "Usage: $0 {prepare|deploy|verify|diagnose|rollback|resume}"; exit 1 ;;
-esac
+# Track the outcome so failures reach the exit status (there is no set -e).
+rc=0
+if [[ "$RESUME" == "true" ]]; then
+  CURRENT=$(get_state)
+  log "INFO" "Resuming from state: $CURRENT"
+  case "$CURRENT" in
+    prepare|validate|deploy|verify)
+      run_deployment "$CURRENT" || rc=$?
+      ;;
+    complete)
+      log "INFO" "Last deployment was complete. Nothing to resume."
+      ;;
+    *)
+      log "INFO" "No valid state to resume. Starting from prepare..."
+      run_deployment "prepare" || rc=$?
+      ;;
+  esac
+elif [[ -n "$REQUESTED_STAGE" ]]; then
+  if [[ "$REQUESTED_STAGE" == "diagnose" ]]; then
+    stage_diagnose "$TARGET_HOST" "$TARGET_SERVICE"
+  else
+    run_deployment "$REQUESTED_STAGE" || rc=$?
+  fi
+else
+  run_deployment "prepare" || rc=$?
+fi
 
-log "--- Homelab Deployment Finished ---"
+if [[ "$rc" -ne 0 ]]; then
+  log "ERROR" "--- Homelab Deployment Failed (exit code: $rc) ---"
+  exit "$rc"
+fi
+log "INFO" "--- Homelab Deployment Finished ---"