homelab-codex-ws/scripts/monitor/health-monitor.sh

#!/usr/bin/env bash
# health-monitor.sh - Homelab node health monitor and safe disk cleanup
#
# Designed to run standalone on the host (cron or direct) or to be called by
# the node-agent Python daemon. All cleanup decisions follow the conservative
# policy agreed in the design review:
#
#  lte_node  (chelsty-infra, chelsty-ha) : NO cleanup at all
#  sd_card   (piha, saturn)              : dangling images + stopped containers,
#                                          rate-limited to once per 24 h
#  ai_node   (solaria)                   : dangling images + stopped containers
#                                          + build cache (NEVER -a)
#  standard  (vps)                       : dangling images + stopped containers
#                                          + build cache
#
# VPS additionally rotates control-plane filesystem artefacts:
#   actions/completed + failed  > 7 days
#   logs/deploy                 > 30 days
#   events/**                   > 3 days AND past observer checkpoint
#
# NEVER TOUCHED (any node): /opt/homelab/data/, config/, state/,
#   actions/pending|approved|running, Frigate recordings, Ollama models,
#   Zigbee2MQTT data, Mosquitto data, HA database/config.

set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Root of the runtime tree; an environment override wins over the default.
: "${RUNTIME_PATH:=/opt/homelab}"
EVENTS_DIR="${RUNTIME_PATH}/events"
STATE_DIR="${RUNTIME_PATH}/state"
LOGS_DIR="${RUNTIME_PATH}/logs"
ACTIONS_DIR="${RUNTIME_PATH}/actions"

# Node identity: environment override first, otherwise the system hostname.
if [[ -z "${NODE_NAME:-}" ]]; then
    NODE_NAME=$(hostname)
fi

# Captured once per run so every event of this run shares the same stamp.
TIMESTAMP="$(date '+%s')"
DATE="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"

# Alert thresholds, in percent used.
DISK_WARN_PCT=75
DISK_CRIT_PCT=85
MEM_WARN_PCT=85
MEM_CRIT_PCT=95

# SD-card nodes record the epoch of their last Docker cleanup in this file and
# refuse to run another one until CLEANUP_INTERVAL seconds have elapsed.
CLEANUP_LOCK="${STATE_DIR}/last-docker-cleanup"
CLEANUP_INTERVAL=86400   # 24 h

# Space-separated hostname lists used to classify the current node.
LTE_NODES="chelsty-infra chelsty-ha"
SD_CARD_NODES="piha saturn"
AI_NODES="solaria"

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Print an informational line to stdout, prefixed with the UTC wall-clock time.
log() {
    printf '%s [INFO]  %s\n' "$(date -u +%H:%M:%S)" "$*"
}
# Print a warning line to stderr, prefixed with the UTC wall-clock time.
warn() {
    printf '%s [WARN]  %s\n' "$(date -u +%H:%M:%S)" "$*" >&2
}
# Print an error line to stderr, prefixed with the UTC wall-clock time.
err() {
    printf '%s [ERROR] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2
}

# contains WORD [CANDIDATE...]
# Succeeds (status 0) when WORD is exactly equal to one of the CANDIDATE
# arguments, fails (status 1) otherwise. The comparison is a literal string
# match, not a glob.
contains() {
    local word="$1"; shift
    # Keep the loop variable local so a caller's (or global) $w is not clobbered.
    local w
    for w in "$@"; do
        [[ "$w" == "$word" ]] && return 0
    done
    return 1
}

# Print the policy class of the current node on stdout: "lte_node", "sd_card"
# or "ai_node" when NODE_NAME appears in the matching list, else "standard".
# The lists are checked in that order and the first match wins.
get_node_type() {
    local kind="standard"

    # The node lists are space-separated strings, so they are deliberately
    # left unquoted to be split into one argument per hostname.
    # shellcheck disable=SC2086
    if contains "$NODE_NAME" $LTE_NODES; then
        kind="lte_node"
    elif contains "$NODE_NAME" $SD_CARD_NODES; then
        kind="sd_card"
    elif contains "$NODE_NAME" $AI_NODES; then
        kind="ai_node"
    fi

    echo "${kind}"
}

# ---------------------------------------------------------------------------
# Event emission
# ---------------------------------------------------------------------------

# _json_escape STRING
# Print STRING with the characters that would break a JSON string literal
# (double quote, backslash, newline, carriage return, tab) backslash-escaped.
_json_escape() {
    local raw="$1" out="" ch i
    for (( i = 0; i < ${#raw}; i++ )); do
        ch="${raw:i:1}"
        case "${ch}" in
            '"')   out+='\"' ;;
            '\')   out+='\\' ;;
            $'\n') out+='\n' ;;
            $'\r') out+='\r' ;;
            $'\t') out+='\t' ;;
            *)     out+="${ch}" ;;
        esac
    done
    printf '%s' "${out}"
}

# emit_event TYPE SEVERITY SERVICE MESSAGE [PAYLOAD_JSON]
# Write one event as a JSON file under ${EVENTS_DIR}/${NODE_NAME}/.
#
#   TYPE, SEVERITY  copied verbatim into the "type"/"severity" fields
#   SERVICE         may be empty
#   MESSAGE         free text; JSON-escaped here
#   PAYLOAD_JSON    a ready-made JSON value inserted verbatim; defaults to {}
#
# Reads the globals EVENTS_DIR, NODE_NAME, TIMESTAMP and DATE.
emit_event() {
    local type="$1" severity="$2" service="${3:-}" message="$4"
    # NOTE: do not write "${5:-{}}" here. Bash ends the expansion at the first
    # '}', so that form yields "$5}" whenever a payload is supplied and every
    # such event would carry a stray closing brace (invalid JSON).
    local payload="${5:-}"
    [[ -n "${payload}" ]] || payload='{}'

    local dir="${EVENTS_DIR}/${NODE_NAME}"
    local base_id="evt-${NODE_NAME}-${TIMESTAMP}-${type}"
    local id="${base_id}" seq=1
    mkdir -p "${dir}"

    # TIMESTAMP is fixed for the whole run, so two events of the same type
    # (e.g. two exited containers) would map to the same file and the later
    # one would overwrite the earlier. Keep the plain id for the first event
    # and add a numeric suffix for subsequent ones.
    while [[ -e "${dir}/${id}.json" ]]; do
        seq=$(( seq + 1 ))
        id="${base_id}-${seq}"
    done

    service=$(_json_escape "${service}")
    message=$(_json_escape "${message}")

    cat > "${dir}/${id}.json" <<EOF
{
  "id": "${id}",
  "timestamp": ${TIMESTAMP},
  "date": "${DATE}",
  "type": "${type}",
  "severity": "${severity}",
  "node": "${NODE_NAME}",
  "service": "${service}",
  "message": "${message}",
  "payload": ${payload}
}
EOF
}

# ---------------------------------------------------------------------------
# Health checks
# ---------------------------------------------------------------------------

# Check disk usage of the filesystem holding RUNTIME_PATH.
#
# Emits a "disk_pressure" event (severity "high" at or above DISK_CRIT_PCT,
# "medium" at or above DISK_WARN_PCT) and prints the integer usage percentage
# on stdout. Returns 1 and prints nothing when the usage cannot be determined,
# so callers can substitute a fallback value.
check_disk() {
    # Use /opt/homelab as the check target — it lives on the host filesystem
    # and this path is correct both when running natively and in a container
    # that mounts /opt/homelab from the host.
    local mount="${RUNTIME_PATH}"
    local usage_pct avail_mb total_mb df_fields payload

    # -P keeps each filesystem on a single line (plain df wraps long device
    # names, which would make line 2 contain only the device); -k pins the
    # unit to 1 KiB blocks so the /1024 conversion below really yields MB.
    df_fields=$(df -Pk "${mount}" 2>/dev/null \
        | awk 'NR==2 {gsub(/%/,"",$5); printf "%s %d %d\n", $5, $4/1024, $2/1024}') || return 1
    read -r usage_pct avail_mb total_mb <<<"${df_fields}" || return 1

    # Refuse to report a value that is not a plain integer: it would end up
    # verbatim in JSON payloads.
    if [[ ! "${usage_pct}" =~ ^[0-9]+$ ]]; then
        warn "Could not determine disk usage for ${mount}"
        return 1
    fi

    payload="{\"usage_pct\": ${usage_pct}, \"avail_mb\": ${avail_mb}, \"total_mb\": ${total_mb}, \"mount\": \"${mount}\"}"

    if [[ "${usage_pct}" -ge "${DISK_CRIT_PCT}" ]]; then
        warn "Disk CRITICAL: ${usage_pct}% used (${avail_mb} MB free)"
        emit_event "disk_pressure" "high" "" \
            "Disk usage critical: ${usage_pct}% on ${mount} (${avail_mb} MB free)" \
            "${payload}"
    elif [[ "${usage_pct}" -ge "${DISK_WARN_PCT}" ]]; then
        warn "Disk elevated: ${usage_pct}% used"
        emit_event "disk_pressure" "medium" "" \
            "Disk usage elevated: ${usage_pct}% on ${mount} (${avail_mb} MB free)" \
            "${payload}"
    fi
    echo "${usage_pct}"
}

# Check memory usage from /proc/meminfo.
#
# Usage is computed as (MemTotal - MemAvailable) / MemTotal. Emits a
# "high_memory" event (severity "high" at or above MEM_CRIT_PCT, "medium" at
# or above MEM_WARN_PCT) and prints the integer usage percentage on stdout.
# Returns 1 and prints nothing when either field is missing or MemTotal is 0.
check_memory() {
    local total avail pct avail_mb payload
    total=$(awk '/^MemTotal/ {print $2}' /proc/meminfo)
    avail=$(awk '/^MemAvailable/ {print $2}' /proc/meminfo)

    # An empty value evaluates to 0 in shell arithmetic: a missing
    # MemAvailable line would be reported as 100% used, and a missing or zero
    # MemTotal would divide by zero. Bail out instead of raising a false alarm.
    if [[ ! "${total}" =~ ^[0-9]+$ || ! "${avail}" =~ ^[0-9]+$ ]] || (( total == 0 )); then
        warn "Could not determine memory usage from /proc/meminfo"
        return 1
    fi

    pct=$(( (total - avail) * 100 / total ))
    avail_mb=$(( avail / 1024 ))
    payload="{\"usage_pct\": ${pct}, \"avail_mb\": ${avail_mb}, \"total_mb\": $((total/1024))}"

    if [[ "${pct}" -ge "${MEM_CRIT_PCT}" ]]; then
        warn "Memory CRITICAL: ${pct}% used"
        emit_event "high_memory" "high" "" \
            "Memory usage critical: ${pct}% (${avail_mb} MB available)" \
            "${payload}"
    elif [[ "${pct}" -ge "${MEM_WARN_PCT}" ]]; then
        warn "Memory elevated: ${pct}%"
        emit_event "high_memory" "medium" "" \
            "Memory usage elevated: ${pct}% (${avail_mb} MB available)" \
            "${payload}"
    fi
    echo "${pct}"
}

# Measure CPU usage over a one-second window.
#
# Takes two samples of the aggregate "cpu" line of /proc/stat, one second
# apart, and derives the busy percentage from the change in the idle counter
# (field 5) relative to the change in the sum of all counters. Emits a
# "high_cpu" event at 90% or more and prints the percentage on stdout.
check_cpu() {
    # Prints "<idle> <total>" for the aggregate cpu line.
    local sampler='/^cpu / {idle=$5; total=0; for(i=2;i<=NF;i++) total+=$i; print idle, total}'
    local idle_before total_before idle_after total_after pct

    read -r idle_before total_before < <(awk "${sampler}" /proc/stat)
    sleep 1
    read -r idle_after total_after < <(awk "${sampler}" /proc/stat)

    local idle_delta=$(( idle_after - idle_before ))
    local total_delta=$(( total_after - total_before ))

    # No counter movement between the samples is reported as 0% rather than
    # dividing by zero.
    if (( total_delta > 0 )); then
        pct=$(( 100 - idle_delta * 100 / total_delta ))
    else
        pct=0
    fi

    if (( pct >= 90 )); then
        warn "CPU elevated: ${pct}%"
        emit_event "high_cpu" "medium" "" \
            "CPU usage elevated: ${pct}%" \
            "{\"usage_pct\": ${pct}}"
    fi
    echo "${pct}"
}

# Report Docker containers that look broken.
#
# Emits one "containers_not_running" event per exited container that carries
# a compose-project label, and one "healthcheck_failed" event per running
# container whose health status is "unhealthy". Does nothing, successfully,
# when the docker CLI is not installed.
check_containers() {
    # Must be an explicit "return 0": a bare "return" would propagate the
    # non-zero status of "command -v", and under "set -e" a non-zero status
    # from a plain top-level call aborts the whole script.
    command -v docker &>/dev/null || return 0

    # Exited containers that belong to a compose project.
    # NOTE(review): the filters below do not inspect the restart policy, so
    # the "(restart=unless-stopped)" wording in the message is an assumption
    # about how compose services are configured — confirm, or filter on
    # .HostConfig.RestartPolicy if one-shot compose jobs exist.
    local cname
    while IFS= read -r cname; do
        [[ -z "$cname" ]] && continue
        warn "Container exited (should be running): ${cname}"
        emit_event "containers_not_running" "high" "${cname}" \
            "Container '${cname}' has exited unexpectedly (restart=unless-stopped)" \
            "{\"container\": \"${cname}\"}"
    done < <(docker ps -a \
        --filter "status=exited" \
        --filter "label=com.docker.compose.project" \
        --format "{{.Names}}" 2>/dev/null || true)

    # Containers that are running but their health check is failing
    while IFS= read -r cname; do
        [[ -z "$cname" ]] && continue
        warn "Container unhealthy: ${cname}"
        emit_event "healthcheck_failed" "high" "${cname}" \
            "Container '${cname}' is running but health check is failing" \
            "{\"container\": \"${cname}\"}"
    done < <(docker ps \
        --filter "health=unhealthy" \
        --format "{{.Names}}" 2>/dev/null || true)

    return 0
}

# ---------------------------------------------------------------------------
# Safe Docker cleanup (per policy)
# ---------------------------------------------------------------------------

# Rate limiter for Docker cleanup on SD-card nodes.
#
# Returns 0 when a cleanup may run now and 1 when the previous one, whose
# epoch is stored in CLEANUP_LOCK, happened less than CLEANUP_INTERVAL seconds
# before TIMESTAMP. A missing, empty or malformed lock file, or one holding a
# time in the future, is treated as "no valid record" and allows the cleanup.
_sd_card_rate_ok() {
    [[ -f "${CLEANUP_LOCK}" ]] || return 0

    local last_ts="" elapsed
    # read (rather than cat) trims surrounding whitespace; "|| true" covers an
    # unreadable file and a file without a trailing newline.
    read -r last_ts < "${CLEANUP_LOCK}" 2>/dev/null || true

    # The value is fed to shell arithmetic below, so only accept plain digits;
    # anything else would be evaluated as an expression.
    if [[ ! "${last_ts}" =~ ^[0-9]+$ ]]; then
        [[ -z "${last_ts}" ]] || warn "Ignoring malformed timestamp in ${CLEANUP_LOCK}"
        return 0
    fi

    # 10# forces base 10 so a value with leading zeros is not read as octal.
    elapsed=$(( TIMESTAMP - 10#${last_ts} ))

    # A negative age means the clock was stepped backwards since the last
    # run; without this guard the cleanup would stay blocked until the clock
    # caught up with the recorded time.
    if (( elapsed < 0 )); then
        warn "Last Docker cleanup is recorded in the future; ignoring ${CLEANUP_LOCK}"
        return 0
    fi

    if (( elapsed < CLEANUP_INTERVAL )); then
        log "Docker cleanup skipped: last run ${elapsed}s ago (limit ${CLEANUP_INTERVAL}s)"
        return 1
    fi
    return 0
}

# Record this run's TIMESTAMP in CLEANUP_LOCK, replacing any previous content.
_mark_cleanup_done() {
    printf '%s\n' "${TIMESTAMP}" > "${CLEANUP_LOCK}"
}

# Run the Docker cleanup allowed for this node's class (see get_node_type).
#
#   lte_node : nothing
#   sd_card  : dangling images + stopped containers, subject to the
#              _sd_card_rate_ok rate limit
#   ai_node  : dangling images + stopped containers + build cache
#   standard : dangling images + stopped containers + build cache
#
# Individual prune failures are ignored (best effort). Skipping — because
# docker is absent or the rate limit applies — is a normal outcome and
# returns 0.
run_safe_cleanup() {
    # Explicit "return 0" on both skip paths: a bare "return" would hand the
    # caller the non-zero status of the failed test, and under "set -e" a
    # non-zero status from a plain top-level call aborts the whole script.
    command -v docker &>/dev/null || return 0
    local node_type
    node_type=$(get_node_type)

    case "${node_type}" in
        lte_node)
            # NO cleanup on LTE nodes. Any docker operation risks triggering
            # a pull over a metered/intermittent connection.
            log "Skipping Docker cleanup: LTE node (${NODE_NAME})"
            ;;

        sd_card)
            # Dangling images + stopped containers only.
            # Rate-limited to once per 24 hours to protect SD card write endurance.
            _sd_card_rate_ok || return 0
            log "Running rate-limited Docker cleanup (SD card node)"
            docker image prune -f     >/dev/null 2>&1 || true
            docker container prune -f >/dev/null 2>&1 || true
            _mark_cleanup_done
            ;;

        ai_node)
            # Dangling images + stopped containers + build cache.
            # NEVER docker image prune -a (would remove Ollama runtime images,
            # requiring a multi-hour re-pull of model weights).
            log "Running AI-node Docker cleanup (dangling images + containers + build cache)"
            docker image prune -f     >/dev/null 2>&1 || true
            docker container prune -f >/dev/null 2>&1 || true
            docker builder prune -f   >/dev/null 2>&1 || true
            ;;

        standard)
            # VPS and other standard nodes: full safe cleanup.
            log "Running standard Docker cleanup"
            docker image prune -f     >/dev/null 2>&1 || true
            docker container prune -f >/dev/null 2>&1 || true
            docker builder prune -f   >/dev/null 2>&1 || true
            ;;

        *)
            # Unknown class: stay on the safe side and touch nothing.
            warn "Skipping Docker cleanup: unknown node type '${node_type}'"
            ;;
    esac
}

# ---------------------------------------------------------------------------
# VPS-specific: control-plane filesystem rotation
# ---------------------------------------------------------------------------

# Rotate control-plane artefacts under ACTIONS_DIR, LOGS_DIR and EVENTS_DIR.
#
#   actions/completed, actions/failed : *.json matched by "find -mtime +7"
#   logs/deploy                       : *.log  matched by "find -mtime +30"
#   events/**                         : *.json matched by "find -mtime +3" AND
#                                       whose path sorts before the observer
#                                       checkpoint's last_processed_file
#
# Event rotation is skipped when the checkpoint file or python3 is missing,
# or when the checkpoint holds no usable string value.
# Note: find's "-mtime +N" discards fractional days, so a file has to be at
# least N+1 full days old to match.
cleanup_control_plane_fs() {
    log "Running control-plane filesystem rotation"

    # Completed / failed actions older than 7 days
    local status dir
    for status in completed failed; do
        dir="${ACTIONS_DIR}/${status}"
        [[ -d "${dir}" ]] || continue
        if find "${dir}" -type f -name "*.json" -mtime +7 -delete 2>/dev/null; then
            log "Cleaned ${status} actions older than 7 days"
        fi
    done

    # Deploy logs older than 30 days
    local deploy_logs="${LOGS_DIR}/deploy"
    if [[ -d "${deploy_logs}" ]]; then
        if find "${deploy_logs}" -type f -name "*.log" -mtime +30 -delete 2>/dev/null; then
            log "Cleaned deploy logs older than 30 days"
        fi
    fi

    # Event files older than 3 days AND already past the observer checkpoint.
    # The dual condition ensures we never delete an event the observer hasn't seen.
    local checkpoint="${STATE_DIR}/observer_checkpoint.json"
    if [[ -f "${checkpoint}" ]] && command -v python3 &>/dev/null; then
        local last_processed
        # The path is passed as an argument instead of being spliced into the
        # Python source, so quotes or backslashes in it cannot break (or
        # inject into) the program. Only a string value is accepted: a JSON
        # null would otherwise be printed as "None" and used as the bound.
        last_processed=$(python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as fh:
        value = json.load(fh).get("last_processed_file")
    print(value if isinstance(value, str) else "")
except Exception:
    print("")
' "${checkpoint}" 2>/dev/null || echo "")

        if [[ -n "${last_processed}" ]]; then
            # NOTE(review): the comparison is between the full path returned
            # by find and the checkpoint value as a plain string; this assumes
            # the observer stores a path of the same form under EVENTS_DIR and
            # processes files in that sort order — confirm against the observer.
            local f
            while IFS= read -r -d '' f; do
                # Only delete files that sort before the checkpoint path
                # (i.e., the observer has already processed them).
                if [[ "$f" < "${last_processed}" ]]; then
                    rm -f -- "$f"
                    log "Cleaned old event: ${f##*/}"
                fi
            done < <(find "${EVENTS_DIR}" -type f -name "*.json" -mtime +3 -print0 2>/dev/null)
        else
            log "No observer checkpoint set; skipping event file cleanup"
        fi
    fi
    return 0
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

mkdir -p "${EVENTS_DIR}/${NODE_NAME}" "${STATE_DIR}"

log "Health check starting on ${NODE_NAME} (type=$(get_node_type))"

# Each check prints its percentage on stdout; fall back to 0 when it fails.
disk_pct=$(check_disk || echo 0)
mem_pct=$(check_memory || echo 0)
cpu_pct=$(check_cpu || echo 0)

# The three values are spliced unquoted into the heartbeat JSON below, so
# anything that is not a plain integer (e.g. an empty string) is replaced by 0.
[[ "${disk_pct}" =~ ^[0-9]+$ ]] || disk_pct=0
[[ "${mem_pct}"  =~ ^[0-9]+$ ]] || mem_pct=0
[[ "${cpu_pct}"  =~ ^[0-9]+$ ]] || cpu_pct=0

# The steps below are housekeeping. Under "set -e" a non-zero return from any
# of them would end the script before the heartbeat is written, so their
# status is handled explicitly instead.
check_containers || warn "check_containers returned non-zero; continuing"

run_safe_cleanup || warn "run_safe_cleanup returned non-zero; continuing"

# VPS: also rotate control-plane filesystem artefacts
if [[ "${NODE_NAME}" == "vps" ]]; then
    cleanup_control_plane_fs || warn "cleanup_control_plane_fs returned non-zero; continuing"
fi

# Emit a node_health heartbeat so the observer can update node status
# and the supervisor can see up-to-date resource metrics.
emit_event "node_health" "info" "" \
    "Health check completed on ${NODE_NAME}" \
    "{\"disk_pct\": ${disk_pct}, \"mem_pct\": ${mem_pct}, \"cpu_pct\": ${cpu_pct}}"

log "Health check complete (disk=${disk_pct}% mem=${mem_pct}% cpu=${cpu_pct}%)"