339 lines
13 KiB
Bash
339 lines
13 KiB
Bash
|
|
#!/usr/bin/env bash
|
||
|
|
# health-monitor.sh - Homelab node health monitor and safe disk cleanup
#
# Designed to run standalone on the host (cron or direct) or to be called by
# the node-agent Python daemon. All cleanup decisions follow the conservative
# policy agreed in the design review:
#
#   lte_node (chelsty-infra, chelsty-ha) : NO cleanup at all
#   sd_card  (piha, saturn)              : dangling images + stopped containers,
#                                          rate-limited to once per 24 h
#   ai_node  (solaria)                   : dangling images + stopped containers
#                                          + build cache (NEVER -a)
#   standard (vps)                       : dangling images + stopped containers
#                                          + build cache
#
# VPS additionally rotates control-plane filesystem artefacts:
#   actions/completed + failed  > 7 days
#   logs/deploy                 > 30 days
#   events/**                   > 3 days AND past observer checkpoint
#
# NEVER TOUCHED (any node): /opt/homelab/data/, config/, state/,
#   actions/pending|approved|running, Frigate recordings, Ollama models,
#   Zigbee2MQTT data, Mosquitto data, HA database/config.
|
||
|
|
|
||
|
|
# Abort on unhandled errors, on unset variables, and on failures anywhere in
# a pipeline.
set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Root of the runtime tree; may be overridden from the environment.
: "${RUNTIME_PATH:=/opt/homelab}"
ACTIONS_DIR="$RUNTIME_PATH/actions"
LOGS_DIR="$RUNTIME_PATH/logs"
STATE_DIR="$RUNTIME_PATH/state"
EVENTS_DIR="$RUNTIME_PATH/events"

# Node identity (environment override, otherwise the host name) and the
# timestamps shared by everything emitted during this run.
NODE_NAME="${NODE_NAME:-$(hostname)}"
TIMESTAMP="$(date +%s)"
DATE="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

# Alert thresholds, in percent used.
DISK_WARN_PCT=75 DISK_CRIT_PCT=85
MEM_WARN_PCT=85 MEM_CRIT_PCT=95

# SD-card nodes run at most one Docker cleanup per interval; the stamp file
# holds the epoch time of the last cleanup.
CLEANUP_INTERVAL=$(( 24 * 60 * 60 ))  # seconds
CLEANUP_LOCK="$STATE_DIR/last-docker-cleanup"

# Space-separated host names per node class; any other host is "standard".
AI_NODES="solaria"
SD_CARD_NODES="piha saturn"
LTE_NODES="chelsty-infra chelsty-ha"
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Helpers
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# Timestamped loggers: "<HH:MM:SS in UTC> [LEVEL] <arguments joined by spaces>".
# log writes to stdout; warn and err write to stderr.
log() {
  printf '%s [INFO] %s\n' "$(date -u +%H:%M:%S)" "$*"
}

warn() {
  printf '%s [WARN] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2
}

err() {
  printf '%s [ERROR] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2
}
|
||
|
|
|
||
|
|
# contains WORD [CANDIDATE...]
# Returns 0 when WORD is exactly equal to one of the CANDIDATE arguments and
# 1 otherwise (including when no candidates are given). The comparison is
# literal: glob characters in WORD are not treated as patterns.
contains() {
  local word="$1"; shift
  # Declared local so the loop variable does not leak into the global scope.
  local w
  for w in "$@"; do
    [[ "$w" == "$word" ]] && return 0
  done
  return 1
}
|
||
|
|
|
||
|
|
# Print this node's cleanup-policy class on stdout: "lte_node", "sd_card" or
# "ai_node" when NODE_NAME appears in the matching *_NODES list (checked in
# that order), otherwise "standard".
get_node_type() {
  local node_class="standard"

  # The *_NODES lists are space-separated strings and are left unquoted on
  # purpose so that every host name becomes its own argument to contains.
  # shellcheck disable=SC2086
  if contains "$NODE_NAME" $LTE_NODES; then
    node_class="lte_node"
  elif contains "$NODE_NAME" $SD_CARD_NODES; then
    node_class="sd_card"
  elif contains "$NODE_NAME" $AI_NODES; then
    node_class="ai_node"
  fi

  echo "${node_class}"
}
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Event emission
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# emit_event TYPE SEVERITY SERVICE MESSAGE [PAYLOAD_JSON]
# Write one event as a JSON file under ${EVENTS_DIR}/${NODE_NAME}/.
#   TYPE, SEVERITY - short identifiers, stored as JSON strings
#   SERVICE        - affected service/container name; may be empty
#   MESSAGE        - human-readable description
#   PAYLOAD_JSON   - raw JSON value embedded verbatim; defaults to {}
# Reads the globals NODE_NAME, TIMESTAMP, DATE and EVENTS_DIR.
# Returns the status of the final file write.
emit_event() {
  local type="$1" severity="$2" service="${3:-}" message="$4"
  # Deliberately not "${5:-{}}": bash closes the expansion at the first "}",
  # so that spelling appends a stray "}" to every caller-supplied payload.
  local payload="${5:-}"
  [[ -n "${payload}" ]] || payload='{}'

  local dir="${EVENTS_DIR}/${NODE_NAME}"
  local base_id="evt-${NODE_NAME}-${TIMESTAMP}-${type}"
  local id="${base_id}" seq=1
  mkdir -p "$dir"

  # Several events of one type can be emitted during a single run (one per
  # failing container, for example). Add a numeric suffix instead of
  # overwriting the earlier event file.
  while [[ -e "${dir}/${id}.json" ]]; do
    seq=$(( seq + 1 ))
    id="${base_id}-${seq}"
  done

  # Escape backslashes, double quotes and newlines so the string fields stay
  # valid JSON whatever text the caller passes.
  local bs='\' dq='"' nl=$'\n'
  local field value
  for field in type severity service message; do
    value=${!field}
    value=${value//"$bs"/"$bs$bs"}
    value=${value//"$dq"/"$bs$dq"}
    value=${value//"$nl"/"${bs}n"}
    printf -v "$field" '%s' "$value"
  done

  {
    printf '{\n'
    printf '  "id": "%s",\n' "${id}"
    printf '  "timestamp": %s,\n' "${TIMESTAMP}"
    printf '  "date": "%s",\n' "${DATE}"
    printf '  "type": "%s",\n' "${type}"
    printf '  "severity": "%s",\n' "${severity}"
    printf '  "node": "%s",\n' "${NODE_NAME}"
    printf '  "service": "%s",\n' "${service}"
    printf '  "message": "%s",\n' "${message}"
    printf '  "payload": %s\n' "${payload}"
    printf '}\n'
  } > "${dir}/${id}.json"
}
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Health checks
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# check_disk
# Measure usage of the filesystem holding RUNTIME_PATH and emit a
# "disk_pressure" event when it reaches DISK_WARN_PCT (severity medium) or
# DISK_CRIT_PCT (severity high).
# Outputs: the used percentage as a bare integer on stdout.
# Returns: non-zero, printing nothing, when df fails or its output cannot be
#          parsed, so callers can substitute a default.
check_disk() {
  # RUNTIME_PATH is used as the check target: it lives on the host filesystem
  # and is the same path both when running natively and in a container that
  # mounts it from the host.
  local mount="${RUNTIME_PATH}"
  local df_line usage_pct avail_mb total_mb

  # One df call keeps the three figures consistent with each other.
  # -P keeps every filesystem on a single line (long device names otherwise
  # wrap and shift the columns); -k pins the block size to 1 KiB.
  df_line=$(df -Pk "${mount}" 2>/dev/null \
    | awk 'NR==2 {gsub(/%/,"",$5); print $5, int($4/1024), int($2/1024)}') || return
  read -r usage_pct avail_mb total_mb <<<"${df_line}" || true

  # Refuse to report anything that is not a plain integer; an empty value
  # would otherwise end up inside the heartbeat JSON.
  if [[ ! "${usage_pct}" =~ ^[0-9]+$ ]]; then
    return 1
  fi

  local severity="" level=""
  if [[ "${usage_pct}" -ge "${DISK_CRIT_PCT}" ]]; then
    severity="high"; level="critical"
    warn "Disk CRITICAL: ${usage_pct}% used (${avail_mb} MB free)"
  elif [[ "${usage_pct}" -ge "${DISK_WARN_PCT}" ]]; then
    severity="medium"; level="elevated"
    warn "Disk elevated: ${usage_pct}% used"
  fi

  if [[ -n "${severity}" ]]; then
    emit_event "disk_pressure" "${severity}" "" \
      "Disk usage ${level}: ${usage_pct}% on ${mount} (${avail_mb} MB free)" \
      "{\"usage_pct\": ${usage_pct}, \"avail_mb\": ${avail_mb}, \"total_mb\": ${total_mb}, \"mount\": \"${mount}\"}"
  fi
  echo "${usage_pct}"
}
|
||
|
|
|
||
|
|
# check_memory [MEMINFO_FILE]
# Compute the percentage of memory in use from a meminfo-format file and emit
# a "high_memory" event at MEM_WARN_PCT (medium) or MEM_CRIT_PCT (high).
# Arguments: $1 - file to read; defaults to /proc/meminfo.
# Outputs:   the used percentage as a bare integer on stdout.
# Returns:   non-zero, printing nothing, when MemTotal is missing or zero.
check_memory() {
  local meminfo="${1:-/proc/meminfo}"
  local total avail pct avail_mb total_mb

  # Single pass over the file. When the MemAvailable field is absent, fall
  # back to the MemFree + Buffers + Cached estimate instead of failing.
  read -r total avail < <(awk '
    /^MemTotal:/     { t = $2 }
    /^MemAvailable:/ { a = $2; have = 1 }
    /^MemFree:/      { f = $2 }
    /^Buffers:/      { b = $2 }
    /^Cached:/       { c = $2 }
    END { if (!have) a = f + b + c; printf "%d %d\n", t, a }
  ' "${meminfo}" 2>/dev/null) || true

  # A missing or zero total would make the percentage a division by zero.
  if [[ ! "${total}" =~ ^[0-9]+$ ]] || (( total == 0 )); then
    return 1
  fi
  [[ "${avail}" =~ ^[0-9]+$ ]] || avail=0
  # The fallback estimate must never exceed the total.
  (( avail <= total )) || avail=${total}

  pct=$(( (total - avail) * 100 / total ))
  avail_mb=$(( avail / 1024 ))
  total_mb=$(( total / 1024 ))

  local severity="" level=""
  if [[ "${pct}" -ge "${MEM_CRIT_PCT}" ]]; then
    severity="high"; level="critical"
    warn "Memory CRITICAL: ${pct}% used"
  elif [[ "${pct}" -ge "${MEM_WARN_PCT}" ]]; then
    severity="medium"; level="elevated"
    warn "Memory elevated: ${pct}%"
  fi

  if [[ -n "${severity}" ]]; then
    emit_event "high_memory" "${severity}" "" \
      "Memory usage ${level}: ${pct}% (${avail_mb} MB available)" \
      "{\"usage_pct\": ${pct}, \"avail_mb\": ${avail_mb}, \"total_mb\": ${total_mb}}"
  fi
  echo "${pct}"
}
|
||
|
|
|
||
|
|
# check_cpu
# Estimate CPU usage from two samples of the aggregate "cpu" line of
# /proc/stat taken one second apart, and emit a "high_cpu" event (severity
# medium) when the result is 90 or more.
# Outputs: the usage percentage as an integer on stdout (0 when the total
#          counter did not advance between the samples).
check_cpu() {
  # For the aggregate line: print field 5 (taken as the idle counter) followed
  # by the sum of every numeric field.
  local sampler='/^cpu / {idle=$5; total=0; for(i=2;i<=NF;i++) total+=$i; print idle, total}'
  local idle_a total_a idle_b total_b

  read -r idle_a total_a < <(awk "${sampler}" /proc/stat)
  sleep 1
  read -r idle_b total_b < <(awk "${sampler}" /proc/stat)

  local idle_delta=$(( idle_b - idle_a ))
  local total_delta=$(( total_b - total_a ))

  local pct
  if (( total_delta > 0 )); then
    pct=$(( 100 - idle_delta * 100 / total_delta ))
  else
    pct=0
  fi

  if [[ "${pct}" -ge 90 ]]; then
    warn "CPU elevated: ${pct}%"
    emit_event "high_cpu" "medium" "" \
      "CPU usage elevated: ${pct}%" \
      "{\"usage_pct\": ${pct}}"
  fi
  echo "${pct}"
}
|
||
|
|
|
||
|
|
# check_containers
# Report problem containers as events:
#   - "containers_not_running" (high) for every exited container that carries
#     a com.docker.compose.project label
#   - "healthcheck_failed" (high) for every running container whose health
#     status is "unhealthy"
# Does nothing, successfully, when the docker CLI is not installed.
check_containers() {
  # Explicit status: a bare "return" would hand back the failed lookup's
  # status 1 and abort a caller running under "set -e".
  command -v docker &>/dev/null || return 0

  local cname

  # Exited compose-managed containers.
  # NOTE(review): the filter selects on the compose label only; the restart
  # policy quoted in the event message is not actually inspected, so one-shot
  # compose services that exited normally are reported too - confirm intent.
  while IFS= read -r cname; do
    [[ -n "$cname" ]] || continue
    warn "Container exited (should be running): ${cname}"
    emit_event "containers_not_running" "high" "${cname}" \
      "Container '${cname}' has exited unexpectedly (restart=unless-stopped)" \
      "{\"container\": \"${cname}\"}"
  done < <(docker ps -a \
    --filter "status=exited" \
    --filter "label=com.docker.compose.project" \
    --format "{{.Names}}" 2>/dev/null || true)

  # Containers that are running but whose health check is failing.
  while IFS= read -r cname; do
    [[ -n "$cname" ]] || continue
    warn "Container unhealthy: ${cname}"
    emit_event "healthcheck_failed" "high" "${cname}" \
      "Container '${cname}' is running but health check is failing" \
      "{\"container\": \"${cname}\"}"
  done < <(docker ps \
    --filter "health=unhealthy" \
    --format "{{.Names}}" 2>/dev/null || true)
}
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Safe Docker cleanup (per policy)
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# _sd_card_rate_ok
# Decide whether the rate-limited Docker cleanup may run now.
# Reads the epoch timestamp stored in CLEANUP_LOCK and compares it with
# TIMESTAMP.
# Returns: 0 when cleanup is allowed - no stamp file, an unreadable or
#            malformed stamp, or at least CLEANUP_INTERVAL seconds elapsed
#          1 when the last cleanup was less than CLEANUP_INTERVAL seconds ago
_sd_card_rate_ok() {
  [[ -f "${CLEANUP_LOCK}" ]] || return 0

  local last_ts="" elapsed
  # Builtin read trims surrounding whitespace; a failed read leaves the
  # value empty and is handled by the validation below.
  read -r last_ts < "${CLEANUP_LOCK}" 2>/dev/null || true

  # An empty or corrupt stamp must not reach the arithmetic below, where it
  # would be a syntax error (or an unbound-variable error under "set -u").
  if [[ ! "${last_ts}" =~ ^[0-9]+$ ]]; then
    log "Docker cleanup stamp is empty or malformed; allowing cleanup"
    return 0
  fi

  # 10# forces base 10 so a stamp with leading zeros is not read as octal.
  elapsed=$(( TIMESTAMP - 10#${last_ts} ))
  if [[ "${elapsed}" -lt "${CLEANUP_INTERVAL}" ]]; then
    log "Docker cleanup skipped: last run ${elapsed}s ago (limit ${CLEANUP_INTERVAL}s)"
    return 1
  fi
  return 0
}
|
||
|
|
|
||
|
|
# _mark_cleanup_done
# Record this run's TIMESTAMP in the CLEANUP_LOCK stamp file, replacing any
# previous content.
_mark_cleanup_done() {
  printf '%s\n' "${TIMESTAMP}" > "${CLEANUP_LOCK}"
}
|
||
|
|
|
||
|
|
# run_safe_cleanup
# Apply the per-node-class Docker cleanup policy:
#   lte_node - nothing
#   sd_card  - dangling images + stopped containers, rate-limited through
#              _sd_card_rate_ok / _mark_cleanup_done
#   ai_node  - dangling images + stopped containers + build cache
#   standard - dangling images + stopped containers + build cache
# Every prune is best-effort; a failing docker command is ignored.
# Returns 0 when there is nothing to do (no docker CLI, or rate-limited), so
# that a skipped cleanup never aborts a caller running under "set -e".
run_safe_cleanup() {
  # Explicit status: a bare "return" would hand back the failed lookup's 1.
  command -v docker &>/dev/null || return 0

  local node_type
  node_type=$(get_node_type)

  case "${node_type}" in
    lte_node)
      # NO cleanup on LTE nodes. Any docker operation risks triggering
      # a pull over a metered/intermittent connection.
      log "Skipping Docker cleanup: LTE node (${NODE_NAME})"
      ;;

    sd_card)
      # Dangling images + stopped containers only.
      # Rate-limited to once per 24 hours to protect SD card write endurance.
      # Being rate-limited is the normal case, not an error, so report
      # success instead of leaking the helper's status 1.
      if ! _sd_card_rate_ok; then
        return 0
      fi
      log "Running rate-limited Docker cleanup (SD card node)"
      docker image prune -f >/dev/null 2>&1 || true
      docker container prune -f >/dev/null 2>&1 || true
      _mark_cleanup_done
      ;;

    ai_node)
      # Dangling images + stopped containers + build cache.
      # NEVER docker image prune -a (would remove Ollama runtime images,
      # requiring a multi-hour re-pull of model weights).
      log "Running AI-node Docker cleanup (dangling images + containers + build cache)"
      docker image prune -f >/dev/null 2>&1 || true
      docker container prune -f >/dev/null 2>&1 || true
      docker builder prune -f >/dev/null 2>&1 || true
      ;;

    standard)
      # VPS and other standard nodes: full safe cleanup.
      log "Running standard Docker cleanup"
      docker image prune -f >/dev/null 2>&1 || true
      docker container prune -f >/dev/null 2>&1 || true
      docker builder prune -f >/dev/null 2>&1 || true
      ;;
  esac
}
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# VPS-specific: control-plane filesystem rotation
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# cleanup_control_plane_fs
# Rotate control-plane artefacts under the runtime tree:
#   - ${ACTIONS_DIR}/completed and /failed : *.json older than 7 days
#   - ${LOGS_DIR}/deploy                   : *.log older than 30 days
#   - ${EVENTS_DIR}                        : *.json older than 3 days whose
#     path sorts before the "last_processed_file" value stored in
#     ${STATE_DIR}/observer_checkpoint.json
# Event rotation is skipped when the checkpoint file or python3 is missing,
# or when the checkpoint holds no usable value.
cleanup_control_plane_fs() {
  log "Running control-plane filesystem rotation"

  # Completed / failed actions older than 7 days.
  local status dir
  for status in completed failed; do
    dir="${ACTIONS_DIR}/${status}"
    [[ -d "${dir}" ]] || continue
    if find "${dir}" -type f -name "*.json" -mtime +7 -delete 2>/dev/null; then
      log "Cleaned ${status} actions older than 7 days"
    fi
  done

  # Deploy logs older than 30 days.
  local deploy_logs="${LOGS_DIR}/deploy"
  if [[ -d "${deploy_logs}" ]]; then
    if find "${deploy_logs}" -type f -name "*.log" -mtime +30 -delete 2>/dev/null; then
      log "Cleaned deploy logs older than 30 days"
    fi
  fi

  # Event files older than 3 days AND already past the observer checkpoint.
  # The dual condition ensures we never delete an event the observer hasn't seen.
  local checkpoint="${STATE_DIR}/observer_checkpoint.json"
  if [[ -f "${checkpoint}" ]] && command -v python3 &>/dev/null; then
    local last_processed
    # The path is passed as an argument rather than pasted into the Python
    # source, so a quote or backslash in it cannot break the parser.
    # Anything other than a string value counts as "no checkpoint".
    last_processed=$(python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as fh:
        value = json.load(fh).get("last_processed_file", "")
    print(value if isinstance(value, str) else "")
except Exception:
    print("")
' "${checkpoint}" 2>/dev/null || echo "")

    if [[ -n "${last_processed}" ]]; then
      # NOTE(review): this is a plain string comparison of the full path that
      # find prints against the checkpoint value; it assumes the observer
      # stores a comparable path and processes files in that sort order -
      # confirm against the observer.
      local f
      while IFS= read -r -d '' f; do
        if [[ "$f" < "${last_processed}" ]]; then
          rm -f -- "$f"
          log "Cleaned old event: ${f##*/}"
        fi
      done < <(find "${EVENTS_DIR}" -type f -name "*.json" -mtime +3 -print0 2>/dev/null)
    else
      log "No observer checkpoint set; skipping event file cleanup"
    fi
  fi
}
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Main
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
# Main sequence: sample the resource metrics, report container problems, run
# the policy-driven cleanup, then emit the node_health heartbeat.
mkdir -p "${EVENTS_DIR}/${NODE_NAME}" "${STATE_DIR}"

log "Health check starting on ${NODE_NAME} (type=$(get_node_type))"

# Each check prints its percentage on stdout; a failing check reports 0.
disk_pct=$(check_disk || echo 0)
mem_pct=$(check_memory || echo 0)
cpu_pct=$(check_cpu || echo 0)

# The values are embedded verbatim in the heartbeat JSON below, so anything
# that is not a plain integer is replaced by 0.
[[ "${disk_pct}" =~ ^[0-9]+$ ]] || disk_pct=0
[[ "${mem_pct}" =~ ^[0-9]+$ ]] || mem_pct=0
[[ "${cpu_pct}" =~ ^[0-9]+$ ]] || cpu_pct=0

# Container checks and cleanup are best-effort: under "set -e" an unguarded
# non-zero status here would end the run before the heartbeat is written.
check_containers || warn "Container check did not complete cleanly"

run_safe_cleanup || warn "Docker cleanup did not complete cleanly"

# VPS: also rotate control-plane filesystem artefacts
if [[ "${NODE_NAME}" == "vps" ]]; then
  cleanup_control_plane_fs || warn "Control-plane rotation did not complete cleanly"
fi

# Emit a node_health heartbeat so the observer can update node status
# and the supervisor can see up-to-date resource metrics.
emit_event "node_health" "info" "" \
  "Health check completed on ${NODE_NAME}" \
  "{\"disk_pct\": ${disk_pct}, \"mem_pct\": ${mem_pct}, \"cpu_pct\": ${cpu_pct}}"

log "Health check complete (disk=${disk_pct}% mem=${mem_pct}% cpu=${cpu_pct}%)"
|