#!/usr/bin/env bash
# health-monitor.sh - Homelab node health monitor and safe disk cleanup
#
# Designed to run standalone on the host (cron or direct) or to be called by
# the node-agent Python daemon. All cleanup decisions follow the conservative
# policy agreed in the design review:
#
#   lte_node  (chelsty-infra, chelsty-ha) : NO cleanup at all
#   sd_card   (piha, saturn)              : dangling images + stopped containers,
#                                           rate-limited to once per 24 h
#   ai_node   (solaria)                   : dangling images + stopped containers
#                                           + build cache (NEVER -a)
#   standard  (vps)                       : dangling images + stopped containers
#                                           + build cache
#
# VPS additionally rotates control-plane filesystem artefacts:
#   actions/completed + failed  > 7 days
#   logs/deploy                 > 30 days
#   events/**                   > 3 days AND past observer checkpoint
#
# NEVER TOUCHED (any node): /opt/homelab/data/, config/, state/,
#   actions/pending|approved|running, Frigate recordings, Ollama models,
#   Zigbee2MQTT data, Mosquitto data, HA database/config.
#
# Environment overrides:
#   RUNTIME_PATH  root of the runtime tree (default: /opt/homelab)
#   NODE_NAME     logical node name        (default: this host's name)

set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
RUNTIME_PATH="${RUNTIME_PATH:-/opt/homelab}"
EVENTS_DIR="${RUNTIME_PATH}/events"
STATE_DIR="${RUNTIME_PATH}/state"
LOGS_DIR="${RUNTIME_PATH}/logs"
ACTIONS_DIR="${RUNTIME_PATH}/actions"

# Prefer bash's own HOSTNAME variable; fall back to hostname(1) only when it
# is empty, so a missing hostname binary cannot abort the script under set -e.
NODE_NAME="${NODE_NAME:-${HOSTNAME:-$(hostname)}}"

# Both stamps are taken once so every event of a run shares the same time.
TIMESTAMP=$(date +%s)                  # epoch seconds
DATE=$(date -u +%Y-%m-%dT%H:%M:%SZ)    # ISO-8601, UTC
readonly TIMESTAMP DATE

# Thresholds (percent used)
readonly DISK_WARN_PCT=75
readonly DISK_CRIT_PCT=85
readonly MEM_WARN_PCT=85
readonly MEM_CRIT_PCT=95

# Rate-limit file for SD-card nodes (max one Docker cleanup per 24 h)
CLEANUP_LOCK="${STATE_DIR}/last-docker-cleanup"
readonly CLEANUP_INTERVAL=86400  # seconds

# Node classifications (space-separated host names)
readonly LTE_NODES="chelsty-infra chelsty-ha"
readonly SD_CARD_NODES="piha saturn"
readonly AI_NODES="solaria"

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Print an informational message to stdout, prefixed with the UTC time.
# Arguments: message words (joined with spaces).
log() { printf '%s [INFO] %s\n' "$(date -u +%H:%M:%S)" "$*"; }
# Print a warning to stderr, prefixed with the UTC time.
warn() { printf '%s [WARN] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2; }

# Print an error to stderr, prefixed with the UTC time.
err() { printf '%s [ERROR] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2; }

# Succeed when the first argument equals any of the remaining arguments.
contains() {
  local word="$1" w
  shift
  for w in "$@"; do
    [[ "$w" == "$word" ]] && return 0
  done
  return 1
}

# Succeed when the argument is a non-empty string of decimal digits.
_is_uint() {
  [[ "${1:-}" =~ ^[0-9]+$ ]]
}

# Print the cleanup-policy class of NODE_NAME:
# lte_node, sd_card, ai_node or standard.
get_node_type() {
  # The node lists are space-separated strings; word splitting is intended.
  # shellcheck disable=SC2086
  if contains "$NODE_NAME" $LTE_NODES; then
    echo "lte_node"
    return
  fi
  # shellcheck disable=SC2086
  if contains "$NODE_NAME" $SD_CARD_NODES; then
    echo "sd_card"
    return
  fi
  # shellcheck disable=SC2086
  if contains "$NODE_NAME" $AI_NODES; then
    echo "ai_node"
    return
  fi
  echo "standard"
}

# ---------------------------------------------------------------------------
# Event emission
# ---------------------------------------------------------------------------

# Escape backslashes and double quotes so a string can sit inside a JSON
# string literal. Control characters are not handled; no caller passes any.
_json_escape() {
  local s="$1"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  printf '%s' "$s"
}

# Write one event file to ${EVENTS_DIR}/${NODE_NAME}/<id>.json.
# Arguments:
#   $1 type      event type, e.g. disk_pressure
#   $2 severity  info | medium | high
#   $3 service   affected service/container name, may be empty
#   $4 message   human-readable description
#   $5 payload   optional raw JSON value (default: {})
#
# NOTE(review): in the copy of this file under review, the text between the
# `cat > ... <` redirection below and the first `df` call of check_disk was
# missing. The JSON document written here is a reconstruction: the key names
# are assumptions (the "timestamp" value uses DATE, which is otherwise unused
# in the file). Confirm the schema against version control and the observer
# before merging.
emit_event() {
  local type="$1" severity="$2" service="${3:-}" message="$4"
  # "${5:-{}}" must not be used here: the expansion ends at the first "}",
  # so a supplied payload would get a stray "}" appended and break the JSON.
  local payload="${5:-}"
  [[ -n "${payload}" ]] || payload='{}'

  local id="evt-${NODE_NAME}-${TIMESTAMP}-${type}"
  # Several events of one type can be emitted in a single run (one per
  # container). Without the service in the id they would share a file name
  # and overwrite each other.
  # NOTE(review): this changes the id of service-scoped events; confirm
  # that nothing downstream parses the id.
  if [[ -n "${service}" ]]; then
    id+="-${service//[^A-Za-z0-9_.-]/_}"
  fi

  local dir="${EVENTS_DIR}/${NODE_NAME}"
  mkdir -p "$dir"

  local j_id j_type j_sev j_node j_service j_message
  j_id=$(_json_escape "${id}")
  j_type=$(_json_escape "${type}")
  j_sev=$(_json_escape "${severity}")
  j_node=$(_json_escape "${NODE_NAME}")
  j_service=$(_json_escape "${service}")
  j_message=$(_json_escape "${message}")

  cat > "${dir}/${id}.json" <<EOF
{
  "id": "${j_id}",
  "type": "${j_type}",
  "severity": "${j_sev}",
  "node": "${j_node}",
  "service": "${j_service}",
  "message": "${j_message}",
  "timestamp": "${DATE}",
  "payload": ${payload}
}
EOF
}

# ---------------------------------------------------------------------------
# Resource checks
#
# Each check prints the measured percentage on stdout (callers capture it)
# and sends diagnostics to stderr. On any measurement failure it prints
# nothing and returns 1, so the caller's `|| echo 0` fallback applies.
# ---------------------------------------------------------------------------

# Check filesystem usage and emit a disk_pressure event above the thresholds.
# Arguments: $1 - mount point or path to inspect.
# NOTE(review): the original preamble of this function was lost (see the
# note on emit_event); the default mount point "/" is an assumption — confirm.
check_disk() {
  local mount="${1:-/}"
  local total_kb avail_kb usage_pct avail_mb total_mb

  # One df call instead of three. -P keeps each filesystem on a single line
  # even when the device name is long; -k fixes the unit to KiB.
  read -r total_kb avail_kb usage_pct < <(
    df -Pk -- "${mount}" 2>/dev/null \
      | awk 'NR==2 {gsub(/%/,"",$5); print $2, $4, $5}'
  ) || return 1
  if ! _is_uint "${total_kb}" || ! _is_uint "${avail_kb}" \
    || ! _is_uint "${usage_pct}"; then
    return 1
  fi
  avail_mb=$(( avail_kb / 1024 ))
  total_mb=$(( total_kb / 1024 ))

  local payload="{\"usage_pct\": ${usage_pct}, \"avail_mb\": ${avail_mb}, \"total_mb\": ${total_mb}, \"mount\": \"${mount}\"}"

  if [[ "${usage_pct}" -ge "${DISK_CRIT_PCT}" ]]; then
    warn "Disk CRITICAL: ${usage_pct}% used (${avail_mb} MB free)"
    emit_event "disk_pressure" "high" "" \
      "Disk usage critical: ${usage_pct}% on ${mount} (${avail_mb} MB free)" \
      "${payload}"
  elif [[ "${usage_pct}" -ge "${DISK_WARN_PCT}" ]]; then
    warn "Disk elevated: ${usage_pct}% used"
    emit_event "disk_pressure" "medium" "" \
      "Disk usage elevated: ${usage_pct}% on ${mount} (${avail_mb} MB free)" \
      "${payload}"
  fi

  printf '%s\n' "${usage_pct}"
}

# Check memory usage from /proc/meminfo (MemTotal vs MemAvailable) and emit
# a high_memory event above the thresholds.
check_memory() {
  local total avail pct avail_mb total_mb
  total=$(awk '/^MemTotal/ {print $2}' /proc/meminfo) || return 1
  avail=$(awk '/^MemAvailable/ {print $2}' /proc/meminfo) || return 1
  # A missing field would otherwise be a fatal arithmetic syntax error;
  # a zero total would be a division by zero.
  if ! _is_uint "${total}" || ! _is_uint "${avail}" \
    || [[ "${total}" -eq 0 ]]; then
    return 1
  fi

  pct=$(( (total - avail) * 100 / total ))
  avail_mb=$(( avail / 1024 ))
  total_mb=$(( total / 1024 ))

  local payload="{\"usage_pct\": ${pct}, \"avail_mb\": ${avail_mb}, \"total_mb\": ${total_mb}}"

  if [[ "${pct}" -ge "${MEM_CRIT_PCT}" ]]; then
    warn "Memory CRITICAL: ${pct}% used"
    emit_event "high_memory" "high" "" \
      "Memory usage critical: ${pct}% (${avail_mb} MB available)" \
      "${payload}"
  elif [[ "${pct}" -ge "${MEM_WARN_PCT}" ]]; then
    warn "Memory elevated: ${pct}%"
    emit_event "high_memory" "medium" "" \
      "Memory usage elevated: ${pct}% (${avail_mb} MB available)" \
      "${payload}"
  fi

  printf '%s\n' "${pct}"
}

# Print "<idle> <total>" from the aggregate "cpu " line of /proc/stat, where
# total is the sum of every numeric column on that line.
_cpu_sample() {
  awk '/^cpu / {idle=$5; total=0; for(i=2;i<=NF;i++) total+=$i; print idle, total}' /proc/stat
}

# Measure CPU usage over a one-second window (two /proc/stat samples) and
# emit a high_cpu event at 90% or more.
check_cpu() {
  local idle1 total1 idle2 total2 d_idle d_total pct
  read -r idle1 total1 < <(_cpu_sample) || return 1
  sleep 1
  read -r idle2 total2 < <(_cpu_sample) || return 1
  if ! _is_uint "${idle1}" || ! _is_uint "${total1}" \
    || ! _is_uint "${idle2}" || ! _is_uint "${total2}"; then
    return 1
  fi

  d_idle=$(( idle2 - idle1 ))
  d_total=$(( total2 - total1 ))
  # No elapsed ticks between the samples: report 0 instead of dividing by 0.
  pct=$(( d_total > 0 ? 100 - d_idle * 100 / d_total : 0 ))

  if [[ "${pct}" -ge 90 ]]; then
    warn "CPU elevated: ${pct}%"
    emit_event "high_cpu" "medium" "" \
      "CPU usage elevated: ${pct}%" \
      "{\"usage_pct\": ${pct}}"
  fi

  printf '%s\n' "${pct}"
}

# Emit an event for every exited Compose container and for every running
# container whose health check reports unhealthy. No-op without docker.
check_containers() {
  # Must return 0: a bare `return` would propagate the failed `command -v`
  # status and abort the whole script under set -e on docker-less nodes.
  command -v docker &>/dev/null || return 0

  local cname

  # Exited containers that belong to a Compose project.
  # NOTE(review): the filter matches on the Compose label only; the restart
  # policy named in the message below is not actually inspected.
  while IFS= read -r cname; do
    [[ -z "$cname" ]] && continue
    warn "Container exited (should be running): ${cname}"
    emit_event "containers_not_running" "high" "${cname}" \
      "Container '${cname}' has exited unexpectedly (restart=unless-stopped)" \
      "{\"container\": \"${cname}\"}"
  done < <(docker ps -a \
    --filter "status=exited" \
    --filter "label=com.docker.compose.project" \
    --format "{{.Names}}" 2>/dev/null || true)

  # Containers that are running but whose health check is failing.
  while IFS= read -r cname; do
    [[ -z "$cname" ]] && continue
    warn "Container unhealthy: ${cname}"
    emit_event "healthcheck_failed" "high" "${cname}" \
      "Container '${cname}' is running but health check is failing" \
      "{\"container\": \"${cname}\"}"
  done < <(docker ps \
    --filter "health=unhealthy" \
    --format "{{.Names}}" 2>/dev/null || true)

  return 0
}

# ---------------------------------------------------------------------------
# Safe Docker cleanup (per policy)
# ---------------------------------------------------------------------------

# Succeed when at least CLEANUP_INTERVAL seconds have passed since the epoch
# stamp stored in CLEANUP_LOCK, or when no usable stamp exists.
_sd_card_rate_ok() {
  [[ -f "${CLEANUP_LOCK}" ]] || return 0

  local last_ts elapsed
  last_ts=$(cat "${CLEANUP_LOCK}" 2>/dev/null || echo 0)
  # An empty or corrupt stamp must not become a fatal arithmetic error;
  # treat it as "never cleaned".
  _is_uint "${last_ts}" || last_ts=0

  elapsed=$(( TIMESTAMP - last_ts ))
  if [[ "${elapsed}" -lt "${CLEANUP_INTERVAL}" ]]; then
    log "Docker cleanup skipped: last run ${elapsed}s ago (limit ${CLEANUP_INTERVAL}s)"
    return 1
  fi
  return 0
}

# Record the current run's timestamp as the time of the last Docker cleanup.
_mark_cleanup_done() {
  # Best effort: a failed write only means the next run may clean up again.
  printf '%s\n' "${TIMESTAMP}" > "${CLEANUP_LOCK}" \
    || warn "Could not update ${CLEANUP_LOCK}"
}

# Run the Docker cleanup allowed for this node's class. Always returns 0 so
# that a skipped or failed cleanup never prevents the final heartbeat.
run_safe_cleanup() {
  command -v docker &>/dev/null || return 0

  local node_type
  node_type=$(get_node_type)

  case "${node_type}" in
    lte_node)
      # NO cleanup on LTE nodes. Any docker operation risks triggering
      # a pull over a metered/intermittent connection.
      log "Skipping Docker cleanup: LTE node (${NODE_NAME})"
      ;;

    sd_card)
      # Dangling images + stopped containers only.
      # Rate-limited to once per 24 hours to protect SD card write endurance.
      # `return 0`, not bare `return`: being rate-limited is the normal case
      # and must not abort the script under set -e.
      _sd_card_rate_ok || return 0
      log "Running rate-limited Docker cleanup (SD card node)"
      docker image prune -f >/dev/null 2>&1 || true
      docker container prune -f >/dev/null 2>&1 || true
      _mark_cleanup_done
      ;;

    ai_node)
      # Dangling images + stopped containers + build cache.
      # NEVER docker image prune -a (would remove Ollama runtime images,
      # requiring a multi-hour re-pull of model weights).
      log "Running AI-node Docker cleanup (dangling images + containers + build cache)"
      docker image prune -f >/dev/null 2>&1 || true
      docker container prune -f >/dev/null 2>&1 || true
      docker builder prune -f >/dev/null 2>&1 || true
      ;;

    standard)
      # VPS and other standard nodes: full safe cleanup.
      log "Running standard Docker cleanup"
      docker image prune -f >/dev/null 2>&1 || true
      docker container prune -f >/dev/null 2>&1 || true
      docker builder prune -f >/dev/null 2>&1 || true
      ;;
  esac

  return 0
}

# ---------------------------------------------------------------------------
# VPS-specific: control-plane filesystem rotation
# ---------------------------------------------------------------------------

# Delete aged control-plane artefacts: completed/failed actions (> 7 days),
# deploy logs (> 30 days) and events (> 3 days AND sorting before the
# observer checkpoint).
cleanup_control_plane_fs() {
  log "Running control-plane filesystem rotation"

  # Completed / failed actions older than 7 days
  local status dir
  for status in completed failed; do
    dir="${ACTIONS_DIR}/${status}"
    [[ -d "${dir}" ]] || continue
    if find "${dir}" -type f -name "*.json" -mtime +7 -delete 2>/dev/null; then
      log "Cleaned ${status} actions older than 7 days"
    fi
  done

  # Deploy logs older than 30 days
  local deploy_logs="${LOGS_DIR}/deploy"
  if [[ -d "${deploy_logs}" ]]; then
    if find "${deploy_logs}" -type f -name "*.log" -mtime +30 -delete 2>/dev/null; then
      log "Cleaned deploy logs older than 30 days"
    fi
  fi

  # Event files older than 3 days AND already past the observer checkpoint.
  # The dual condition ensures we never delete an event the observer hasn't seen.
  local checkpoint="${STATE_DIR}/observer_checkpoint.json"
  if [[ ! -f "${checkpoint}" ]] || ! command -v python3 &>/dev/null; then
    return 0
  fi

  # The path is passed as an argument instead of being pasted into the Python
  # source. Anything but a string value (for example JSON null, which would
  # print as "None" and sort after every absolute path) yields an empty
  # result, so cleanup is skipped rather than deleting everything.
  local last_processed
  last_processed=$(python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as fh:
        value = json.load(fh).get("last_processed_file", "")
    print(value if isinstance(value, str) else "")
except Exception:
    print("")
' "${checkpoint}" 2>/dev/null || echo "")

  if [[ -z "${last_processed}" ]]; then
    log "No observer checkpoint set; skipping event file cleanup"
    return 0
  fi

  # NOTE(review): the comparison is a string comparison of the full path
  # printed by find against the checkpoint value, in the current locale's
  # collation order. This presumably requires the checkpoint to hold a full
  # path and the observer to process files in the same order — confirm.
  local f
  while IFS= read -r -d '' f; do
    # Only delete files that sort before the checkpoint path
    # (i.e., the observer has already processed them).
    if [[ "$f" < "${last_processed}" ]]; then
      rm -f -- "$f"
      log "Cleaned old event: ${f##*/}"
    fi
  done < <(find "${EVENTS_DIR}" -type f -name "*.json" -mtime +3 -print0)

  return 0
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

# Run all checks, the policy-driven cleanup, and emit the final heartbeat.
main() {
  mkdir -p "${EVENTS_DIR}/${NODE_NAME}" "${STATE_DIR}"

  log "Health check starting on ${NODE_NAME} (type=$(get_node_type))"

  # Each check prints its percentage on stdout; fall back to 0 when a
  # measurement is unavailable so the heartbeat payload stays valid JSON.
  local disk_pct mem_pct cpu_pct
  disk_pct=$(check_disk || echo 0)
  mem_pct=$(check_memory || echo 0)
  cpu_pct=$(check_cpu || echo 0)

  check_containers
  run_safe_cleanup

  # VPS: also rotate control-plane filesystem artefacts
  if [[ "${NODE_NAME}" == "vps" ]]; then
    cleanup_control_plane_fs
  fi

  # Emit a node_health heartbeat so the observer can update node status
  # and the supervisor can see up-to-date resource metrics.
  emit_event "node_health" "info" "" \
    "Health check completed on ${NODE_NAME}" \
    "{\"disk_pct\": ${disk_pct}, \"mem_pct\": ${mem_pct}, \"cpu_pct\": ${cpu_pct}}"

  log "Health check complete (disk=${disk_pct}% mem=${mem_pct}% cpu=${cpu_pct}%)"
}

main "$@"