Compare commits

...

2 commits

8 changed files with 318 additions and 149 deletions

View file

@ -12,7 +12,17 @@ This document describes the GitOps-lite deployment process for the homelab.
## Staged Deployment Framework ## Staged Deployment Framework
The homelab uses a staged deployment framework located at `scripts/deploy/deploy.sh`. This script is designed to be resumable, stage-aware, and observable. The homelab uses a modularized staged deployment framework located at `scripts/deploy/deploy.sh`. This script is designed to be resumable, stage-aware, and observable, with core logic split into maintainable libraries in `scripts/lib/`.
### Runtime Architecture
The runtime consists of:
- `deploy.sh`: Orchestration entrypoint.
- `lib/log.sh`: Logging and structured output.
- `lib/state.sh`: Deployment state tracking and stage persistence.
- `lib/inventory.sh`: Reliable host and service discovery (Python-based YAML parsing).
- `lib/compose.sh`: Docker Compose operations.
- `lib/diagnostics.sh`: Post-failure analysis and summary generation.
### Deployment Stages ### Deployment Stages
@ -32,8 +42,16 @@ The homelab uses a staged deployment framework located at `scripts/deploy/deploy
If a deployment is interrupted (e.g., due to LTE disconnect on CHELSTY): If a deployment is interrupted (e.g., due to LTE disconnect on CHELSTY):
1. Rerun the script with the `--resume` flag: `scripts/deploy/deploy.sh --resume`. 1. Rerun the script with the `--resume` flag: `scripts/deploy/deploy.sh --resume`.
2. The script reads the last incomplete stage and continues from there. 2. The script identifies the last incomplete stage using deterministic markers (`/opt/homelab/state/deploy/stage_<name>_complete`) and continues from the exact failure point.
3. In the `deploy` stage, it specifically resumes from the first service that was not successfully completed. 3. In the `deploy` stage, it specifically resumes from the first service that was not successfully completed, skipping those already up.
4. Repeated `--resume` runs are safe and idempotent: stages already marked complete are skipped. Running without `--resume` (and without `--stage`) clears the saved state and starts a fresh deployment.
### Diagnostics and Troubleshooting
The runtime is designed to fail predictably and provide immediate feedback:
- **Automatic Diagnostics**: If any stage fails, `collect_diagnostics` is triggered to capture system state and container logs into `/opt/homelab/logs/deploy/diagnostics_<timestamp>.txt`.
- **Deployment Summary**: Every run concludes with a concise summary showing the host status, last stage reached, and log locations.
- **Offline Resilience**: The `prepare` stage handles `git pull` failures gracefully, allowing deployment from local cache during network instability.
### Operational Semantics ### Operational Semantics

View file

@ -13,11 +13,11 @@ This document defines the lifecycle of a service in the homelab and the procedur
- Ensure `/opt/homelab/config/<service>` exists and contains required secrets/configs. - Ensure `/opt/homelab/config/<service>` exists and contains required secrets/configs.
- Setup environment variables from `env.example` into `/opt/homelab/config/<service>/.env`. - Setup environment variables from `env.example` into `/opt/homelab/config/<service>/.env`.
3. **Deployment**: 3. **Deployment**:
- `scripts/deploy/deploy.sh prepare` - `scripts/deploy/deploy.sh` (Starts fresh)
- `scripts/deploy/deploy.sh deploy` - `scripts/deploy/deploy.sh --resume` (Continues after interruption)
4. **Verification**: 4. **Verification**:
- `scripts/deploy/deploy.sh verify` - Automatic as part of the `deploy.sh` pipeline (`verify` stage).
- Healthchecks are automated within the verify stage. - Manual: `scripts/deploy/deploy.sh --stage verify`.
5. **Maintenance**: 5. **Maintenance**:
- Periodic updates via `docker compose pull`. - Periodic updates via `docker compose pull`.
- Log monitoring via `docker compose logs -f`. - Log monitoring via `docker compose logs -f`.

View file

@ -4,12 +4,12 @@
set -o pipefail set -o pipefail
# --- Configuration --- # --- Configuration ---
RUNTIME_PATH="/opt/homelab" export RUNTIME_PATH="/opt/homelab"
STATE_DIR="${RUNTIME_PATH}/state/deploy" export STATE_DIR="${RUNTIME_PATH}/state/deploy"
LOG_DIR="${RUNTIME_PATH}/logs/deploy" export LOG_DIR="${RUNTIME_PATH}/logs/deploy"
REPO_PATH="${HOME}/homelab-codex-ws" export REPO_PATH="${HOME}/homelab-codex-ws"
TIMESTAMP=$(date +%Y%m%d_%H%M%S) export TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="${LOG_DIR}/deploy_${TIMESTAMP}.log" export LOG_FILE="${LOG_DIR}/deploy_${TIMESTAMP}.log"
# --- Initialization --- # --- Initialization ---
mkdir -p "$STATE_DIR" "$LOG_DIR" mkdir -p "$STATE_DIR" "$LOG_DIR"
@ -17,52 +17,15 @@ mkdir -p "$STATE_DIR" "$LOG_DIR"
# Redirection for logging # Redirection for logging
exec > >(tee -a "$LOG_FILE") 2>&1 exec > >(tee -a "$LOG_FILE") 2>&1
# --- Helpers --- # --- Load Libraries ---
LIB_PATH="${REPO_PATH}/scripts/lib"
log() { source "${LIB_PATH}/log.sh"
local level=$1 source "${LIB_PATH}/state.sh"
shift source "${LIB_PATH}/inventory.sh"
local message=$* source "${LIB_PATH}/compose.sh"
echo "[$(date +'%Y-%m-%d %H:%M:%S')] [$level] $message" source "${LIB_PATH}/diagnostics.sh"
}
# Structured log for machine reading
# timestamp, stage, host, service, command_result, retry_info
struct_log() {
local stage=$1
local host=$2
local service=$3
local result=$4
local info=$5
log "STRUCT" "stage=$stage host=$host service=$service result=$result info=\"$info\""
}
set_state() {
echo "$1" > "${STATE_DIR}/current_stage"
}
get_state() {
if [ -f "${STATE_DIR}/current_stage" ]; then
cat "${STATE_DIR}/current_stage"
else
echo "none"
fi
}
set_last_service() {
echo "$1" > "${STATE_DIR}/last_service"
}
get_last_service() {
if [ -f "${STATE_DIR}/last_service" ]; then
cat "${STATE_DIR}/last_service"
else
echo ""
fi
}
# --- CLI Parsing --- # --- CLI Parsing ---
TARGET_HOST=$(hostname) TARGET_HOST=$(hostname)
TARGET_SERVICE="" TARGET_SERVICE=""
RESUME=false RESUME=false
@ -95,37 +58,17 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
# --- Inventory Loading ---
load_inventory() {
log "INFO" "Loading inventory for host: $TARGET_HOST"
if [[ ! -d "${REPO_PATH}/hosts/${TARGET_HOST}" ]]; then
log "ERROR" "Host directory not found: ${REPO_PATH}/hosts/${TARGET_HOST}"
exit 1
fi
if [[ -n "$TARGET_SERVICE" ]]; then
SERVICES=("$TARGET_SERVICE")
else
if [[ -f "${REPO_PATH}/hosts/${TARGET_HOST}/services.txt" ]]; then
SERVICES=($(cat "${REPO_PATH}/hosts/${TARGET_HOST}/services.txt"))
elif [[ -f "${REPO_PATH}/hosts/${TARGET_HOST}/services.yaml" ]]; then
SERVICES=($(grep -A 100 "services:" "${REPO_PATH}/hosts/${TARGET_HOST}/services.yaml" | grep "^ [a-z0-9_-]\+:" | sed 's/ \(.*\):/\1/'))
else
log "WARN" "No services found for $TARGET_HOST"
SERVICES=()
fi
fi
log "INFO" "Services to process: ${SERVICES[*]}"
}
# --- Stages --- # --- Stages ---
stage_prepare() { stage_prepare() {
local host=$1 local host=$1
if is_stage_complete "prepare" && [[ "$RESUME" == "true" ]]; then
log "INFO" "Skipping PREPARE (already complete)"
return 0
fi
log "INFO" "Stage: PREPARE ($host)" log "INFO" "Stage: PREPARE ($host)"
set_state "prepare" set_stage "prepare"
cd "$REPO_PATH" || exit 1 cd "$REPO_PATH" || exit 1
log "INFO" "Pulling latest changes..." log "INFO" "Pulling latest changes..."
@ -133,15 +76,22 @@ stage_prepare() {
log "WARN" "Git pull failed, proceeding with local state (offline mode or network flap)" log "WARN" "Git pull failed, proceeding with local state (offline mode or network flap)"
fi fi
# Ensure runtime directories exist
mkdir -p "${RUNTIME_PATH}/config" "${RUNTIME_PATH}/data" "${RUNTIME_PATH}/state" "${RUNTIME_PATH}/logs" mkdir -p "${RUNTIME_PATH}/config" "${RUNTIME_PATH}/data" "${RUNTIME_PATH}/state" "${RUNTIME_PATH}/logs"
struct_log "prepare" "$host" "all" "success" "repo_updated" struct_log "prepare" "$host" "all" "success" "repo_updated"
mark_stage_complete "prepare"
} }
stage_validate() { stage_validate() {
local host=$1 local host=$1
if is_stage_complete "validate" && [[ "$RESUME" == "true" ]]; then
log "INFO" "Skipping VALIDATE (already complete)"
return 0
fi
log "INFO" "Stage: VALIDATE ($host)" log "INFO" "Stage: VALIDATE ($host)"
set_state "validate" set_stage "validate"
for service in "${SERVICES[@]}"; do for service in "${SERVICES[@]}"; do
log "INFO" "Validating $service..." log "INFO" "Validating $service..."
@ -153,12 +103,18 @@ stage_validate() {
done done
struct_log "validate" "$host" "all" "success" "validated" struct_log "validate" "$host" "all" "success" "validated"
mark_stage_complete "validate"
} }
stage_deploy() { stage_deploy() {
local host=$1 local host=$1
if is_stage_complete "deploy" && [[ "$RESUME" == "true" ]]; then
log "INFO" "Skipping DEPLOY (already complete)"
return 0
fi
log "INFO" "Stage: DEPLOY ($host)" log "INFO" "Stage: DEPLOY ($host)"
set_state "deploy" set_stage "deploy"
local last_s=$(get_last_service) local last_s=$(get_last_service)
local skip=false local skip=false
@ -180,48 +136,28 @@ stage_deploy() {
log "INFO" "Deploying $service..." log "INFO" "Deploying $service..."
set_last_service "$service" set_last_service "$service"
local svc_dir="${REPO_PATH}/services/$service" if ! run_compose_up "$service"; then
local runtime_config_dir="${RUNTIME_PATH}/config/$service"
mkdir -p "$runtime_config_dir"
local compose_args=("-f" "${svc_dir}/docker-compose.yml")
if [[ -f "${runtime_config_dir}/docker-compose.override.yml" ]]; then
log "INFO" "Using override for $service"
compose_args+=("-f" "${runtime_config_dir}/docker-compose.override.yml")
fi
# Determine .env
local env_file=""
if [[ -f "${runtime_config_dir}/.env" ]]; then
env_file="${runtime_config_dir}/.env"
elif [[ -f "${svc_dir}/.env" ]]; then
env_file="${svc_dir}/.env"
fi
local run_cmd=("docker" "compose")
run_cmd+=("${compose_args[@]}")
if [[ -n "$env_file" ]]; then
run_cmd+=("--env-file" "$env_file")
fi
run_cmd+=("up" "-d" "--remove-orphans")
log "INFO" "Running: ${run_cmd[*]}"
if ! "${run_cmd[@]}"; then
log "ERROR" "Failed to deploy $service"
struct_log "deploy" "$host" "$service" "fail" "docker_compose_failed" struct_log "deploy" "$host" "$service" "fail" "docker_compose_failed"
stage_diagnose "$host" "$service" collect_diagnostics "$host" "$service"
return 1 return 1
fi fi
struct_log "deploy" "$host" "$service" "success" "deployed" struct_log "deploy" "$host" "$service" "success" "deployed"
done done
set_last_service "" set_last_service ""
mark_stage_complete "deploy"
} }
stage_verify() { stage_verify() {
local host=$1 local host=$1
if is_stage_complete "verify" && [[ "$RESUME" == "true" ]]; then
log "INFO" "Skipping VERIFY (already complete)"
return 0
fi
log "INFO" "Stage: VERIFY ($host)" log "INFO" "Stage: VERIFY ($host)"
set_state "verify" set_stage "verify"
for service in "${SERVICES[@]}"; do for service in "${SERVICES[@]}"; do
log "INFO" "Verifying $service..." log "INFO" "Verifying $service..."
@ -230,46 +166,29 @@ stage_verify() {
if ! bash "$health_script"; then if ! bash "$health_script"; then
log "ERROR" "Healthcheck failed for $service" log "ERROR" "Healthcheck failed for $service"
struct_log "verify" "$host" "$service" "fail" "healthcheck_failed" struct_log "verify" "$host" "$service" "fail" "healthcheck_failed"
stage_diagnose "$host" "$service" collect_diagnostics "$host" "$service"
return 1 return 1
fi fi
else else
# Generic check if container is running
if ! docker ps --filter "name=$service" --filter "status=running" | grep -q "$service"; then if ! docker ps --filter "name=$service" --filter "status=running" | grep -q "$service"; then
log "ERROR" "Container $service is not running" log "ERROR" "Container $service is not running"
struct_log "verify" "$host" "$service" "fail" "container_not_running" struct_log "verify" "$host" "$service" "fail" "container_not_running"
stage_diagnose "$host" "$service" collect_diagnostics "$host" "$service"
return 1 return 1
fi fi
fi fi
struct_log "verify" "$host" "$service" "success" "verified" struct_log "verify" "$host" "$service" "success" "verified"
done done
} mark_stage_complete "verify"
stage_diagnose() {
local host=$1
local service=$2
log "INFO" "Stage: DIAGNOSE ($host - ${service:-all})"
echo "--- DIAGNOSTICS FOR ${service:-all} ---"
docker ps --filter "name=${service:-}"
if [[ -n "$service" ]]; then
local svc_dir="${REPO_PATH}/services/$service"
if [[ -d "$svc_dir" ]]; then
cd "$svc_dir" || exit 1
docker compose ps
docker compose logs --tail=50
fi
fi
echo "--- END DIAGNOSTICS ---"
struct_log "diagnose" "$host" "${service:-all}" "done" "diagnostics_collected"
} }
stage_complete() { stage_complete() {
local host=$1 local host=$1
log "INFO" "Stage: COMPLETE ($host)" log "INFO" "Stage: COMPLETE ($host)"
set_state "complete" set_stage "complete"
struct_log "complete" "$host" "all" "success" "deployment_finished" struct_log "complete" "$host" "all" "success" "deployment_finished"
clear_deployment_state
} }
# --- Execution Logic --- # --- Execution Logic ---
@ -277,6 +196,7 @@ stage_complete() {
run_deployment() { run_deployment() {
local start_stage=$1 local start_stage=$1
# Sequential execution from start_stage
case "$start_stage" in case "$start_stage" in
prepare) prepare)
stage_prepare "$TARGET_HOST" || return 1 stage_prepare "$TARGET_HOST" || return 1
@ -304,31 +224,45 @@ run_deployment() {
log "INFO" "--- Homelab Deployment Started (Host: $TARGET_HOST, Service: ${TARGET_SERVICE:-all}) ---" log "INFO" "--- Homelab Deployment Started (Host: $TARGET_HOST, Service: ${TARGET_SERVICE:-all}) ---"
load_inventory if ! load_inventory "$TARGET_HOST" "$TARGET_SERVICE"; then
log "ERROR" "Failed to load inventory"
exit 1
fi
EXIT_STATUS=0
if [[ "$RESUME" == "true" ]]; then if [[ "$RESUME" == "true" ]]; then
CURRENT=$(get_state) CURRENT=$(get_stage)
log "INFO" "Resuming from state: $CURRENT" log "INFO" "Resuming from state: $CURRENT"
case "$CURRENT" in case "$CURRENT" in
prepare|validate|deploy|verify) prepare|validate|deploy|verify)
run_deployment "$CURRENT" run_deployment "$CURRENT" || EXIT_STATUS=1
;; ;;
complete) complete|none)
log "INFO" "Last deployment was complete. Nothing to resume." log "INFO" "No interrupted deployment found. Starting from scratch..."
run_deployment "prepare" || EXIT_STATUS=1
;; ;;
*) *)
log "INFO" "No valid state to resume. Starting from prepare..." log "INFO" "Unknown state. Starting from prepare..."
run_deployment "prepare" run_deployment "prepare" || EXIT_STATUS=1
;; ;;
esac esac
elif [[ -n "$REQUESTED_STAGE" ]]; then elif [[ -n "$REQUESTED_STAGE" ]]; then
if [[ "$REQUESTED_STAGE" == "diagnose" ]]; then if [[ "$REQUESTED_STAGE" == "diagnose" ]]; then
stage_diagnose "$TARGET_HOST" "$TARGET_SERVICE" collect_diagnostics "$TARGET_HOST" "$TARGET_SERVICE"
else else
run_deployment "$REQUESTED_STAGE" run_deployment "$REQUESTED_STAGE" || EXIT_STATUS=1
fi fi
else else
run_deployment "prepare" # New deployment - clear previous state
clear_deployment_state
run_deployment "prepare" || EXIT_STATUS=1
fi fi
log "INFO" "--- Homelab Deployment Finished ---" if [[ $EXIT_STATUS -eq 0 ]]; then
print_summary "$TARGET_HOST" "SUCCESS"
log "INFO" "--- Homelab Deployment Finished Successfully ---"
else
print_summary "$TARGET_HOST" "FAILED"
log "ERROR" "--- Homelab Deployment Failed ---"
exit 1
fi

45
scripts/lib/compose.sh Normal file
View file

@ -0,0 +1,45 @@
#!/usr/bin/env bash
# compose.sh - Docker Compose operations
# Bring one service up with `docker compose up -d --remove-orphans`.
#
# Globals:
#   REPO_PATH    (read) compose file is ${REPO_PATH}/services/<service>/docker-compose.yml
#   RUNTIME_PATH (read) per-service runtime config lives in ${RUNTIME_PATH}/config/<service>
# Arguments:
#   $1 - service name (must be non-empty)
# Returns:
#   0 when docker compose succeeded; 1 when the name is empty, the service
#   directory is missing, or docker compose failed.
run_compose_up() {
  local service=${1:-}

  # An empty name would make svc_dir resolve to the services/ root itself,
  # which exists, so the directory check below would not catch it.
  if [[ -z "$service" ]]; then
    log "ERROR" "No service name given"
    return 1
  fi

  local svc_dir="${REPO_PATH}/services/$service"
  local runtime_config_dir="${RUNTIME_PATH}/config/$service"

  if [[ ! -d "$svc_dir" ]]; then
    log "ERROR" "Service directory not found: $svc_dir"
    return 1
  fi

  mkdir -p "$runtime_config_dir"

  # Base compose file first, then the optional host-local override.
  local compose_args=("-f" "${svc_dir}/docker-compose.yml")
  if [[ -f "${runtime_config_dir}/docker-compose.override.yml" ]]; then
    log "INFO" "Using override for $service"
    compose_args+=("-f" "${runtime_config_dir}/docker-compose.override.yml")
  fi

  # Determine .env: the runtime config copy wins over the one in the repo.
  local env_file=""
  if [[ -f "${runtime_config_dir}/.env" ]]; then
    env_file="${runtime_config_dir}/.env"
  elif [[ -f "${svc_dir}/.env" ]]; then
    env_file="${svc_dir}/.env"
  fi

  # Build the command as an array so paths with spaces stay intact.
  local run_cmd=("docker" "compose")
  run_cmd+=("${compose_args[@]}")
  if [[ -n "$env_file" ]]; then
    run_cmd+=("--env-file" "$env_file")
  fi
  run_cmd+=("up" "-d" "--remove-orphans")

  log "INFO" "Running: ${run_cmd[*]}"
  if ! "${run_cmd[@]}"; then
    log "ERROR" "Docker compose failed for $service"
    return 1
  fi
  return 0
}
export -f run_compose_up

View file

@ -0,0 +1,53 @@
#!/usr/bin/env bash
# diagnostics.sh - Deployment failure diagnostics
# Capture host and container state after a failure.
#
# Writes a report to ${LOG_DIR}/diagnostics_${TIMESTAMP}.txt (overwriting any
# report already written under the same TIMESTAMP) and then echoes the report
# to stdout.
#
# Globals:
#   LOG_DIR, TIMESTAMP (read) - location and name of the report file
#   REPO_PATH          (read) - used to find services/<service> for compose logs
# Arguments:
#   $1 - host name (used for labelling only)
#   $2 - service name; empty means "all" and skips the per-service logs
collect_diagnostics() {
  local host=$1
  local service=$2
  log "INFO" "Stage: DIAGNOSE ($host - ${service:-all})"

  local diag_file="${LOG_DIR}/diagnostics_${TIMESTAMP}.txt"
  local svc_dir="${REPO_PATH}/services/${service}"

  {
    echo "--- DIAGNOSTICS FOR ${service:-all} (Host: $host, Time: $(date)) ---"
    echo "Uptime: $(uptime)"
    echo "Memory: $(free -h)"
    echo "Disk: $(df -h /)"

    echo "--- Docker Status ---"
    docker ps --filter "name=${service:-}"

    if [[ -n "$service" && -d "$svc_dir" ]]; then
      echo "--- $service Logs ---"
      # Run in a subshell: a brace group executes in the current shell, so a
      # bare `cd` here would change the caller's working directory.
      (cd "$svc_dir" && docker compose logs --tail=50)
    fi
    echo "--- END DIAGNOSTICS ---"
  } > "$diag_file" 2>&1

  # Also output to console for immediate visibility
  cat "$diag_file"
  log "INFO" "Diagnostics stored in $diag_file"
}
# Print the end-of-run summary banner to stdout.
#
# Globals:
#   LOG_FILE (read) - path shown on the "Log File" line
# Arguments:
#   $1 - host name
#   $2 - overall status text (e.g. SUCCESS / FAILED)
# The "Last Stage" and "Last Service" values come from get_stage and
# get_last_service; the service line is omitted when that value is empty.
print_summary() {
  local host=$1
  local status=$2
  local stage_reached service_reached
  stage_reached=$(get_stage)
  service_reached=$(get_last_service)
  local rule="=========================================="

  printf '\n'
  printf '%s\n' "$rule" " DEPLOYMENT SUMMARY" "$rule"
  printf 'Host: %s\n' "$host"
  printf 'Status: %s\n' "$status"
  printf 'Last Stage: %s\n' "$stage_reached"
  if [[ -n "$service_reached" ]]; then
    printf 'Last Service: %s\n' "$service_reached"
  fi
  printf 'Log File: %s\n' "$LOG_FILE"
  printf '%s\n' "$rule"
  printf '\n'
}
# Make the diagnostics helpers available to child bash processes.
export -f collect_diagnostics print_summary

45
scripts/lib/inventory.sh Normal file
View file

@ -0,0 +1,45 @@
#!/usr/bin/env bash
# inventory.sh - Host and service discovery
# Populate the global SERVICES array for a host.
#
# Resolution order:
#   1. an explicit service override ($2)
#   2. ${REPO_PATH}/hosts/<host>/services.txt  (one name per line; blank lines
#      and lines whose first non-blank character is '#' are ignored)
#   3. ${REPO_PATH}/hosts/<host>/services.yaml (top-level `services:` given as
#      a mapping or a list, parsed with python3 + PyYAML)
#   4. otherwise SERVICES is left empty and a warning is logged
#
# Globals:
#   REPO_PATH (read), SERVICES (written)
# Arguments:
#   $1 - host name
#   $2 - optional single service to deploy instead of the inventory
# Returns:
#   1 if the host directory is missing or services.yaml cannot be parsed,
#   0 otherwise.
load_inventory() {
  local host=$1
  local service_override=$2
  local host_dir="${REPO_PATH}/hosts/${host}"

  log "INFO" "Loading inventory for host: $host"

  if [[ ! -d "$host_dir" ]]; then
    log "ERROR" "Host directory not found: $host_dir"
    return 1
  fi

  SERVICES=()
  if [[ -n "$service_override" ]]; then
    SERVICES=("$service_override")
  elif [[ -f "${host_dir}/services.txt" ]]; then
    # Read services from text file, ignoring comments and empty lines
    mapfile -t SERVICES < <(grep -Ev '^[[:space:]]*(#|$)' "${host_dir}/services.txt")
  elif [[ -f "${host_dir}/services.yaml" ]]; then
    # Use python for reliable YAML parsing. The file path is passed as an
    # argument rather than spliced into the Python source, and the exit
    # status is checked so a parse/import failure is not mistaken for an
    # empty inventory.
    local yaml_out
    if ! yaml_out=$(python3 -c '
import sys
import yaml

try:
    with open(sys.argv[1], "r") as f:
        data = yaml.safe_load(f)
    if data and "services" in data:
        services = data["services"]
        if isinstance(services, dict):
            names = list(services.keys())
        elif isinstance(services, list):
            names = services
        else:
            names = []
        if names:
            print("\n".join(names))
except Exception as e:
    print(f"Error parsing YAML: {e}", file=sys.stderr)
    sys.exit(1)
' "${host_dir}/services.yaml"); then
      log "ERROR" "Failed to parse ${host_dir}/services.yaml"
      return 1
    fi
    # One name per line; skip mapfile on empty output so SERVICES does not
    # end up holding a single empty element.
    if [[ -n "$yaml_out" ]]; then
      mapfile -t SERVICES <<< "$yaml_out"
    fi
  else
    log "WARN" "No services found for $host"
  fi

  log "INFO" "Services to process: ${SERVICES[*]}"
}
export -f load_inventory

23
scripts/lib/log.sh Normal file
View file

@ -0,0 +1,23 @@
#!/usr/bin/env bash
# log.sh - Logging utilities for homelab deployment
# Emit one timestamped log line on stdout:
#   [YYYY-MM-DD HH:MM:SS] [LEVEL] message...
# Arguments:
#   $1  - level tag (INFO, WARN, ERROR, ...)
#   $2+ - message words, joined with single spaces
log() {
  local level=$1
  shift
  local stamp
  stamp=$(date +'%Y-%m-%d %H:%M:%S')
  printf '[%s] [%s] %s\n' "$stamp" "$level" "$*"
}
# Machine-readable log record, emitted through log() at level STRUCT as
#   stage=<..> host=<..> service=<..> result=<..> info="<..>"
# Arguments (positional):
#   $1 stage, $2 host, $3 service, $4 command result, $5 free-form info
struct_log() {
  local record
  # Exactly five arguments are passed so the format is applied once.
  printf -v record 'stage=%s host=%s service=%s result=%s info="%s"' \
    "$1" "$2" "$3" "$4" "$5"
  log "STRUCT" "$record"
}
# Make the logging helpers available to child bash processes.
export -f log struct_log

51
scripts/lib/state.sh Normal file
View file

@ -0,0 +1,51 @@
#!/usr/bin/env bash
# state.sh - Deployment state management
# Record the stage currently being executed.
# Arguments: $1 - stage name
# Writes:    ${STATE_DIR}/current_stage (overwritten)
set_stage() {
  local stage_file="${STATE_DIR}/current_stage"
  echo "$1" > "$stage_file"
}
# Print the recorded stage, or "none" when no stage file exists.
# Reads: ${STATE_DIR}/current_stage
get_stage() {
  local stage_file="${STATE_DIR}/current_stage"
  if [[ ! -f "$stage_file" ]]; then
    echo "none"
    return
  fi
  cat "$stage_file"
}
# Drop a completion marker for a stage.
# Arguments: $1 - stage name
# Creates:   ${STATE_DIR}/stage_<name>_complete
mark_stage_complete() {
  local marker="${STATE_DIR}/stage_${1}_complete"
  touch "$marker"
}
# Succeed (status 0) when the completion marker for a stage exists as a
# regular file, fail (status 1) otherwise.
# Arguments: $1 - stage name
is_stage_complete() {
  local marker="${STATE_DIR}/stage_${1}_complete"
  if [[ -f "$marker" ]]; then
    return 0
  fi
  return 1
}
# Forget all persisted deployment progress: every stage completion marker,
# the current-stage file and the last-service file under ${STATE_DIR}.
# Missing files are not an error.
clear_deployment_state() {
  local stale
  local targets=(
    "${STATE_DIR}"/stage_*_complete
    "${STATE_DIR}/current_stage"
    "${STATE_DIR}/last_service"
  )
  for stale in "${targets[@]}"; do
    rm -f "$stale"
  done
}
# Remember the service currently being deployed (an empty value is written
# as an empty line).
# Arguments: $1 - service name
# Writes:    ${STATE_DIR}/last_service (overwritten)
set_last_service() {
  local service_file="${STATE_DIR}/last_service"
  echo "$1" > "$service_file"
}
# Print the remembered service name; prints an empty line when no
# last-service file exists.
# Reads: ${STATE_DIR}/last_service
get_last_service() {
  local service_file="${STATE_DIR}/last_service"
  if [[ ! -f "$service_file" ]]; then
    echo ""
    return
  fi
  cat "$service_file"
}
# Make the state helpers available to child bash processes.
export -f \
  set_stage \
  get_stage \
  mark_stage_complete \
  is_stage_complete \
  clear_deployment_state \
  set_last_service \
  get_last_service