homelab-codex-ws/scripts/monitor/health-monitor.sh

#!/usr/bin/env bash
# health-monitor.sh - Homelab node health monitor and safe disk cleanup
#
# Designed to run standalone on the host (cron or direct) or to be called by
# the node-agent Python daemon. All cleanup decisions follow the conservative
# policy agreed in the design review:
#
#  lte_node  (chelsty-infra, chelsty-ha) : NO cleanup at all
#  sd_card   (piha, saturn)              : dangling images + stopped containers,
#                                          rate-limited to once per 24 h
#  ai_node   (solaria)                   : dangling images + stopped containers
#                                          + build cache (NEVER -a)
#  standard  (vps)                       : dangling images + stopped containers
#                                          + build cache
#
# VPS additionally rotates control-plane filesystem artefacts:
#   actions/completed + failed  > 7 days
#   logs/deploy                 > 30 days
#   events/**                   > 3 days AND past observer checkpoint
#
# NEVER TOUCHED (any node): /opt/homelab/data/, config/, state/,
#   actions/pending|approved|running, Frigate recordings, Ollama models,
#   Zigbee2MQTT data, Mosquitto data, HA database/config.

# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Root of the runtime tree; may be overridden through the environment.
: "${RUNTIME_PATH:=/opt/homelab}"
EVENTS_DIR="${RUNTIME_PATH}/events"
STATE_DIR="${RUNTIME_PATH}/state"
LOGS_DIR="${RUNTIME_PATH}/logs"
ACTIONS_DIR="${RUNTIME_PATH}/actions"

# Node identity: the environment wins, otherwise fall back to the hostname.
if [[ -z "${NODE_NAME:-}" ]]; then
    NODE_NAME=$(hostname)
fi

# One timestamp per run, shared by every event emitted during this run
# (epoch seconds plus an ISO-8601 UTC rendering).
TIMESTAMP=$(date +%s)
DATE=$(date -u +%Y-%m-%dT%H:%M:%SZ)

# Alert thresholds, in percent used.
DISK_WARN_PCT=75
DISK_CRIT_PCT=85
MEM_WARN_PCT=85
MEM_CRIT_PCT=95

# SD-card nodes: stamp file and minimum spacing (seconds) between two
# Docker cleanups, i.e. at most one cleanup per 24 h.
CLEANUP_LOCK="${STATE_DIR}/last-docker-cleanup"
CLEANUP_INTERVAL=86400

# Node classifications (space-separated host names).
LTE_NODES="chelsty-infra chelsty-ha"
SD_CARD_NODES="piha saturn"
AI_NODES="solaria"

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Timestamped (UTC, HH:MM:SS) log helpers. All arguments are joined into a
# single message. log writes to stdout; warn and err write to stderr.
log() {
    printf '%s [INFO]  %s\n' "$(date -u +%H:%M:%S)" "$*"
}

warn() {
    printf '%s [WARN]  %s\n' "$(date -u +%H:%M:%S)" "$*" >&2
}

err() {
    printf '%s [ERROR] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2
}

# contains WORD [CANDIDATE...]
# Returns 0 when WORD is exactly equal to one of the candidates, 1 otherwise
# (including when no candidates are given).
contains() {
    local needle="$1"
    shift
    # The loop variable is declared local so the helper does not clobber a
    # caller's variable of the same name.
    local candidate
    for candidate in "$@"; do
        if [[ "$candidate" == "$needle" ]]; then
            return 0
        fi
    done
    return 1
}

# Print the policy class of NODE_NAME: lte_node, sd_card, ai_node, or
# "standard" when the name appears in none of the classification lists.
# Lists are checked in that order; the first match wins.
get_node_type() {
    local kind members
    for kind in lte_node sd_card ai_node; do
        case "${kind}" in
            lte_node) members=$LTE_NODES ;;
            sd_card)  members=$SD_CARD_NODES ;;
            ai_node)  members=$AI_NODES ;;
        esac
        # The list is deliberately left unquoted so it splits into one
        # argument per host name.
        # shellcheck disable=SC2086
        if contains "$NODE_NAME" $members; then
            echo "${kind}"
            return
        fi
    done
    echo "standard"
}

# ---------------------------------------------------------------------------
# Event emission
# ---------------------------------------------------------------------------

# Escape a string so it can be embedded inside a JSON double-quoted literal.
_json_escape() {
    local s="$1"
    s=${s//\\/\\\\}
    s=${s//\"/\\\"}
    s=${s//$'\n'/\\n}
    s=${s//$'\r'/\\r}
    s=${s//$'\t'/\\t}
    printf '%s' "$s"
}

# emit_event TYPE SEVERITY SERVICE MESSAGE [PAYLOAD_JSON]
#
# Writes one event file to ${EVENTS_DIR}/${NODE_NAME}/<id>.json.
#   TYPE, SEVERITY  - event classification strings
#   SERVICE         - affected service/container name, may be empty
#   MESSAGE         - human-readable description
#   PAYLOAD_JSON    - raw JSON value inserted verbatim; defaults to {}
# Globals read: NODE_NAME, TIMESTAMP, DATE, EVENTS_DIR.
emit_event() {
    local type="$1" severity="$2" service="${3:-}" message="$4"
    # Do NOT write "${5:-{}}": the expansion ends at the first '}', so a
    # stray '}' would be appended to every payload that IS supplied.
    local payload="${5:-}"
    if [[ -z "${payload}" ]]; then
        payload='{}'
    fi

    local id="evt-${NODE_NAME}-${TIMESTAMP}-${type}"
    # Events of the same type for different services within one run would
    # otherwise share an id and overwrite each other's file.
    if [[ -n "${service}" ]]; then
        id+="-${service//[^A-Za-z0-9_.-]/_}"
    fi

    local dir="${EVENTS_DIR}/${NODE_NAME}"
    mkdir -p "$dir"
    cat > "${dir}/${id}.json" <<EOF
{
  "id": "$(_json_escape "${id}")",
  "timestamp": ${TIMESTAMP},
  "date": "$(_json_escape "${DATE}")",
  "type": "$(_json_escape "${type}")",
  "severity": "$(_json_escape "${severity}")",
  "node": "$(_json_escape "${NODE_NAME}")",
  "service": "$(_json_escape "${service}")",
  "message": "$(_json_escape "${message}")",
  "payload": ${payload}
}
EOF
}

# ---------------------------------------------------------------------------
# Health checks
# ---------------------------------------------------------------------------

# Check disk usage of the filesystem holding RUNTIME_PATH.
#
# Emits a disk_pressure event ("high" at or above DISK_CRIT_PCT, "medium" at
# or above DISK_WARN_PCT) and prints the integer usage percentage on stdout.
# Returns non-zero and prints nothing when df fails or its output cannot be
# parsed, so callers can substitute a default.
check_disk() {
    # Use /opt/homelab as the check target — it lives on the host filesystem
    # and this path is correct both when running natively and in a container
    # that mounts /opt/homelab from the host.
    local mount="${RUNTIME_PATH}"
    local df_out usage_pct avail_mb total_mb payload

    # -P keeps each filesystem on a single line; -k pins 1024-byte blocks so
    # the MB conversion below does not depend on the environment.
    df_out=$(df -Pk "${mount}" 2>/dev/null) || return

    # Line 2 columns: filesystem, blocks, used, available, capacity, mount.
    read -r total_mb avail_mb usage_pct < <(
        awk 'NR==2 {gsub(/%/,"",$5); printf "%d %d %s\n", $2/1024, $4/1024, $5}' <<<"${df_out}"
    ) || true

    # An empty or non-numeric value must not reach the JSON built by callers.
    if ! [[ "${usage_pct:-}" =~ ^[0-9]+$ ]]; then
        warn "Disk check skipped: could not parse df output for ${mount}"
        return 1
    fi

    payload="{\"usage_pct\": ${usage_pct}, \"avail_mb\": ${avail_mb}, \"total_mb\": ${total_mb}, \"mount\": \"${mount}\"}"

    if [[ "${usage_pct}" -ge "${DISK_CRIT_PCT}" ]]; then
        warn "Disk CRITICAL: ${usage_pct}% used (${avail_mb} MB free)"
        emit_event "disk_pressure" "high" "" \
            "Disk usage critical: ${usage_pct}% on ${mount} (${avail_mb} MB free)" \
            "${payload}"
    elif [[ "${usage_pct}" -ge "${DISK_WARN_PCT}" ]]; then
        warn "Disk elevated: ${usage_pct}% used"
        emit_event "disk_pressure" "medium" "" \
            "Disk usage elevated: ${usage_pct}% on ${mount} (${avail_mb} MB free)" \
            "${payload}"
    fi
    echo "${usage_pct}"
}

# Check memory usage from /proc/meminfo (MemTotal vs. MemAvailable).
#
# Emits a high_memory event ("high" at or above MEM_CRIT_PCT, "medium" at or
# above MEM_WARN_PCT) and prints the integer usage percentage on stdout.
# Returns non-zero and prints nothing when the values cannot be read.
check_memory() {
    local total avail pct avail_mb payload
    total=$(awk '/^MemTotal/ {print $2}' /proc/meminfo) || return
    avail=$(awk '/^MemAvailable/ {print $2}' /proc/meminfo) || return

    # Inside $(( )) an empty variable evaluates to 0: a missing MemAvailable
    # would be reported as 100% used, and a missing MemTotal would divide by
    # zero. Refuse to compute anything from incomplete data.
    if ! [[ "${total}" =~ ^[0-9]+$ && "${avail}" =~ ^[0-9]+$ ]] || (( total == 0 )); then
        warn "Memory check skipped: MemTotal/MemAvailable not readable from /proc/meminfo"
        return 1
    fi

    # /proc/meminfo values are in kB.
    pct=$(( (total - avail) * 100 / total ))
    avail_mb=$(( avail / 1024 ))
    payload="{\"usage_pct\": ${pct}, \"avail_mb\": ${avail_mb}, \"total_mb\": $((total/1024))}"

    if [[ "${pct}" -ge "${MEM_CRIT_PCT}" ]]; then
        warn "Memory CRITICAL: ${pct}% used"
        emit_event "high_memory" "high" "" \
            "Memory usage critical: ${pct}% (${avail_mb} MB available)" \
            "${payload}"
    elif [[ "${pct}" -ge "${MEM_WARN_PCT}" ]]; then
        warn "Memory elevated: ${pct}%"
        emit_event "high_memory" "medium" "" \
            "Memory usage elevated: ${pct}% (${avail_mb} MB available)" \
            "${payload}"
    fi
    echo "${pct}"
}

# Measure CPU usage over a one-second window and print it as an integer
# percentage. A high_cpu event (severity medium) is emitted at 90% or more.
check_cpu() {
    # awk program: from the aggregate "cpu " line of /proc/stat print the
    # idle counter (5th field) followed by the sum of all counters.
    local sampler='/^cpu / {idle=$5; total=0; for(i=2;i<=NF;i++) total+=$i; print idle, total}'
    local idle_before total_before idle_after total_after

    read -r idle_before total_before < <(awk "${sampler}" /proc/stat)
    sleep 1
    read -r idle_after total_after < <(awk "${sampler}" /proc/stat)

    local idle_delta=$(( idle_after - idle_before ))
    local total_delta=$(( total_after - total_before ))

    # No elapsed ticks means nothing to divide by: report 0.
    local busy_pct=0
    if (( total_delta > 0 )); then
        busy_pct=$(( 100 - idle_delta * 100 / total_delta ))
    fi

    if (( busy_pct >= 90 )); then
        warn "CPU elevated: ${busy_pct}%"
        emit_event "high_cpu" "medium" "" \
            "CPU usage elevated: ${busy_pct}%" \
            "{\"usage_pct\": ${busy_pct}}"
    fi
    echo "${busy_pct}"
}

# Report problem containers as events:
#   containers_not_running - exited containers carrying a compose project label
#   healthcheck_failed     - running containers whose health status is unhealthy
# A no-op (status 0) on hosts without the docker CLI. Failures of `docker ps`
# itself are tolerated and simply yield no events.
check_containers() {
    # Explicit "return 0": a bare "return" would hand back the failing status
    # of "command -v", which aborts the whole script under "set -e" when this
    # function is called as a plain statement.
    command -v docker &>/dev/null || return 0

    local cname

    # Exited compose-managed containers.
    # NOTE(review): the filter selects on status and compose label only; the
    # restart policy named in the message is not actually verified here.
    while IFS= read -r cname; do
        [[ -z "$cname" ]] && continue
        warn "Container exited (should be running): ${cname}"
        emit_event "containers_not_running" "high" "${cname}" \
            "Container '${cname}' has exited unexpectedly (restart=unless-stopped)" \
            "{\"container\": \"${cname}\"}"
    done < <(docker ps -a \
        --filter "status=exited" \
        --filter "label=com.docker.compose.project" \
        --format "{{.Names}}" 2>/dev/null || true)

    # Containers that are running but their health check is failing.
    while IFS= read -r cname; do
        [[ -z "$cname" ]] && continue
        warn "Container unhealthy: ${cname}"
        emit_event "healthcheck_failed" "high" "${cname}" \
            "Container '${cname}' is running but health check is failing" \
            "{\"container\": \"${cname}\"}"
    done < <(docker ps \
        --filter "health=unhealthy" \
        --format "{{.Names}}" 2>/dev/null || true)

    return 0
}

# ---------------------------------------------------------------------------
# Safe Docker cleanup (per policy)
# ---------------------------------------------------------------------------

# Rate limiter for SD-card nodes.
# Returns 0 when a Docker cleanup may run now, 1 when the previous cleanup
# (epoch seconds stored in CLEANUP_LOCK) happened less than CLEANUP_INTERVAL
# seconds before TIMESTAMP. A missing, empty, corrupt, or future-dated stamp
# counts as "never ran".
_sd_card_rate_ok() {
    [[ -f "${CLEANUP_LOCK}" ]] || return 0

    local last_ts elapsed
    last_ts=$(cat "${CLEANUP_LOCK}" 2>/dev/null || echo 0)

    # Never feed unvalidated file content to $(( )): non-numeric text is a
    # syntax error there at best and evaluated as an expression at worst.
    if ! [[ "${last_ts}" =~ ^[0-9]+$ ]]; then
        warn "Ignoring unreadable cleanup stamp in ${CLEANUP_LOCK}"
        return 0
    fi

    # 10# forces base 10 so a stamp with leading zeros is not read as octal.
    elapsed=$(( TIMESTAMP - 10#${last_ts} ))

    # A stamp from the future (clock stepped backwards) cannot be trusted
    # and must not block cleanup until the clock catches up.
    if (( elapsed < 0 )); then
        warn "Cleanup stamp in ${CLEANUP_LOCK} is in the future; ignoring it"
        return 0
    fi

    if [[ "${elapsed}" -lt "${CLEANUP_INTERVAL}" ]]; then
        log "Docker cleanup skipped: last run ${elapsed}s ago (limit ${CLEANUP_INTERVAL}s)"
        return 1
    fi
    return 0
}

# Record this run's TIMESTAMP in CLEANUP_LOCK, replacing any earlier stamp.
_mark_cleanup_done() {
    printf '%s\n' "${TIMESTAMP}" > "${CLEANUP_LOCK}"
}

# Apply the per-node-type Docker cleanup policy.
#   lte_node : nothing
#   sd_card  : dangling images + stopped containers, rate-limited
#   ai_node  : dangling images + stopped containers + build cache
#   standard : dangling images + stopped containers + build cache
# Every prune is best-effort (output discarded, failure ignored). Always
# returns 0 so a skipped or failed cleanup never aborts the health run.
run_safe_cleanup() {
    # Explicit "return 0": a bare "return" would propagate the failing test's
    # status and terminate the script under "set -e".
    command -v docker &>/dev/null || return 0
    local node_type
    node_type=$(get_node_type)

    case "${node_type}" in
        lte_node)
            # NO cleanup on LTE nodes. Any docker operation risks triggering
            # a pull over a metered/intermittent connection.
            log "Skipping Docker cleanup: LTE node (${NODE_NAME})"
            ;;

        sd_card)
            # Dangling images + stopped containers only.
            # Rate-limited to once per 24 hours to protect SD card write endurance.
            # Being rate-limited is the normal case, not an error: return 0.
            _sd_card_rate_ok || return 0
            log "Running rate-limited Docker cleanup (SD card node)"
            docker image prune -f     >/dev/null 2>&1 || true
            docker container prune -f >/dev/null 2>&1 || true
            # Failing to write the stamp must not kill the run either.
            _mark_cleanup_done || warn "Could not record Docker cleanup time"
            ;;

        ai_node)
            # Dangling images + stopped containers + build cache.
            # NEVER docker image prune -a (would remove Ollama runtime images,
            # requiring a multi-hour re-pull of model weights).
            log "Running AI-node Docker cleanup (dangling images + containers + build cache)"
            docker image prune -f     >/dev/null 2>&1 || true
            docker container prune -f >/dev/null 2>&1 || true
            docker builder prune -f   >/dev/null 2>&1 || true
            ;;

        standard)
            # VPS and other standard nodes: full safe cleanup.
            log "Running standard Docker cleanup"
            docker image prune -f     >/dev/null 2>&1 || true
            docker container prune -f >/dev/null 2>&1 || true
            docker builder prune -f   >/dev/null 2>&1 || true
            ;;
    esac
    return 0
}

# ---------------------------------------------------------------------------
# VPS-specific: control-plane filesystem rotation
# ---------------------------------------------------------------------------

# Rotate control-plane filesystem artefacts:
#   ${ACTIONS_DIR}/completed, ${ACTIONS_DIR}/failed : *.json older than 7 days
#   ${LOGS_DIR}/deploy                              : *.log older than 30 days
#   ${EVENTS_DIR}/**                                : *.json older than 3 days
#                                                     AND sorting before the
#                                                     observer checkpoint
# Event cleanup is skipped entirely when the checkpoint file or python3 is
# missing, or when the checkpoint holds no last_processed_file value.
cleanup_control_plane_fs() {
    log "Running control-plane filesystem rotation"

    # Completed / failed actions older than 7 days.
    local status dir
    for status in completed failed; do
        dir="${ACTIONS_DIR}/${status}"
        [[ -d "${dir}" ]] || continue
        if find "${dir}" -type f -name "*.json" -mtime +7 -delete 2>/dev/null; then
            log "Cleaned ${status} actions older than 7 days"
        fi
    done

    # Deploy logs older than 30 days.
    local deploy_logs="${LOGS_DIR}/deploy"
    if [[ -d "${deploy_logs}" ]]; then
        if find "${deploy_logs}" -type f -name "*.log" -mtime +30 -delete 2>/dev/null; then
            log "Cleaned deploy logs older than 30 days"
        fi
    fi

    # Event files older than 3 days AND already past the observer checkpoint.
    # The dual condition ensures we never delete an event the observer hasn't seen.
    local checkpoint="${STATE_DIR}/observer_checkpoint.json"
    if [[ -f "${checkpoint}" ]] && command -v python3 &>/dev/null; then
        local last_processed
        # The path is passed as an argument instead of being spliced into the
        # Python source, so quotes or backslashes in it cannot break or alter
        # the program. A null/missing key yields an empty string.
        last_processed=$(python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as fh:
        data = json.load(fh)
    print(data.get("last_processed_file", "") or "")
except Exception:
    print("")
' "${checkpoint}" 2>/dev/null || echo "")

        if [[ -n "${last_processed}" ]]; then
            # NOTE(review): this string comparison assumes the checkpoint
            # stores a path in the same form find prints here (rooted at
            # EVENTS_DIR) — confirm against the observer.
            local f
            while IFS= read -r -d '' f; do
                # Only delete files that sort before the checkpoint path
                # (i.e., the observer has already processed them).
                if [[ "$f" < "${last_processed}" ]]; then
                    rm -f -- "$f"
                    log "Cleaned old event: ${f##*/}"
                fi
            done < <(find "${EVENTS_DIR}" -type f -name "*.json" -mtime +3 -print0 2>/dev/null)
        else
            log "No observer checkpoint set; skipping event file cleanup"
        fi
    fi
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

mkdir -p "${EVENTS_DIR}/${NODE_NAME}" "${STATE_DIR}"

log "Health check starting on ${NODE_NAME} (type=$(get_node_type))"

# Each check prints its percentage on stdout; a failed check yields 0.
disk_pct=$(check_disk || echo 0)
mem_pct=$(check_memory || echo 0)
cpu_pct=$(check_cpu || echo 0)

# The metrics are spliced unquoted into the heartbeat JSON below, so anything
# that is not a plain integer is replaced by 0 rather than corrupting it.
[[ "${disk_pct}" =~ ^[0-9]+$ ]] || disk_pct=0
[[ "${mem_pct}" =~ ^[0-9]+$ ]] || mem_pct=0
[[ "${cpu_pct}" =~ ^[0-9]+$ ]] || cpu_pct=0

# Container checks and cleanup are best-effort: under "set -e" a non-zero
# status from either would otherwise end the run before the heartbeat.
check_containers || warn "Container check returned non-zero (docker unavailable or check failed); continuing"

run_safe_cleanup || warn "Docker cleanup returned non-zero (skipped or failed); continuing"

# VPS: also rotate control-plane filesystem artefacts
if [[ "${NODE_NAME}" == "vps" ]]; then
    cleanup_control_plane_fs || warn "Control-plane filesystem rotation failed; continuing"
fi

# Emit a node_health heartbeat so the observer can update node status
# and the supervisor can see up-to-date resource metrics.
emit_event "node_health" "info" "" \
    "Health check completed on ${NODE_NAME}" \
    "{\"disk_pct\": ${disk_pct}, \"mem_pct\": ${mem_pct}, \"cpu_pct\": ${cpu_pct}}"

log "Health check complete (disk=${disk_pct}% mem=${mem_pct}% cpu=${cpu_pct}%)"
feat(node-agent): implement health monitor and safe cleanup policy

scripts/monitor/health-monitor.sh (new):
- Standalone bash health monitor: disk/RAM/CPU checks + docker container health
- Per-node-type cleanup policy enforced:
    lte_node (chelsty-infra, chelsty-ha): NO cleanup, no docker ops
    sd_card (piha, saturn): dangling images + containers, rate-limited once/24h
    ai_node (solaria): dangling + containers + build cache, NEVER -a
    standard (vps): dangling + containers + build cache + CP filesystem rotation
- VPS filesystem rotation: completed/failed actions >7d, deploy logs >30d, events >3d AND past observer checkpoint
- Emits structured JSON events (node_health, disk_pressure, high_memory, high_cpu, containers_not_running, healthcheck_failed)

services/node-agent/ (new):
- Python daemon (node_agent.py): same policy as bash script, Docker SDK for container checks and cleanup, /proc for system metrics
- Optional event shipping to VPS via rsync+SSH (VPS_EVENTS_HOST env var)
- Dockerfile: python:3.11-slim + openssh-client + rsync + docker>=6.0
- docker-compose.yml: mounts docker socket, /opt/homelab, repo read-only

observer.py:
- Handle node_health: update node status + disk/mem/cpu metrics, clear disk_pressure
- Handle disk_pressure: record severity on node, clear when healthy
- Handle high_memory / high_cpu: record pressure level for correlation

supervisor.py:
- Add NO_DISK_CLEANUP_NODES = {chelsty-infra, chelsty-ha}
- reconcile() step 3: generate disk_cleanup actions for nodes with high disk pressure
- _generate_disk_cleanup_recommendation(): stable ID disk-cleanup-{node}, checks all active states, risk=guarded (operator approval required)

executor.py:
- Handle disk_cleanup action type via _execute_disk_cleanup()
- Commands come from action payload; safety gate rejects any command touching /opt/homelab/data/, /opt/homelab/config/, /opt/homelab/state/, or rm -rf /

hosts/*/services.yaml:
- Rename stability-agent -> node-agent on piha, vps, solaria, chelsty-infra
- Add node-agent to chelsty-ha (previously missing)
- Add cleanup policy notes to LTE node comments

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 13:15:06 +02:00
			`#!/usr/bin/env bash`
			`# health-monitor.sh - Homelab node health monitor and safe disk cleanup`
			`#`
			`# Designed to run standalone on the host (cron or direct) or to be called by`
			`# the node-agent Python daemon. All cleanup decisions follow the conservative`
			`# policy agreed in the design review:`
			`#`
			`# lte_node (chelsty-infra, chelsty-ha) : NO cleanup at all`
			`# sd_card (piha, saturn) : dangling images + stopped containers,`
			`# rate-limited to once per 24 h`
			`# ai_node (solaria) : dangling images + stopped containers`
			`# + build cache (NEVER -a)`
			`# standard (vps) : dangling images + stopped containers`
			`# + build cache`
			`#`
			`# VPS additionally rotates control-plane filesystem artefacts:`
			`# actions/completed + failed > 7 days`
			`# logs/deploy > 30 days`
			`# events/** > 3 days AND past observer checkpoint`
			`#`
			`# NEVER TOUCHED (any node): /opt/homelab/data/, config/, state/,`
			`# actions/pending\|approved\|running, Frigate recordings, Ollama models,`
			`# Zigbee2MQTT data, Mosquitto data, HA database/config.`

			`set -euo pipefail`

			`# ---------------------------------------------------------------------------`
			`# Configuration`
			`# ---------------------------------------------------------------------------`
			`RUNTIME_PATH="${RUNTIME_PATH:-/opt/homelab}"`
			`EVENTS_DIR="${RUNTIME_PATH}/events"`
			`STATE_DIR="${RUNTIME_PATH}/state"`
			`LOGS_DIR="${RUNTIME_PATH}/logs"`
			`ACTIONS_DIR="${RUNTIME_PATH}/actions"`

			`NODE_NAME="${NODE_NAME:-$(hostname)}"`
			`TIMESTAMP=$(date +%s)`
			`DATE=$(date -u +%Y-%m-%dT%H:%M:%SZ)`

			`# Thresholds`
			`DISK_WARN_PCT=75`
			`DISK_CRIT_PCT=85`
			`MEM_WARN_PCT=85`
			`MEM_CRIT_PCT=95`

			`# Rate-limit file for SD-card nodes (max one Docker cleanup per 24 h)`
			`CLEANUP_LOCK="${STATE_DIR}/last-docker-cleanup"`
			`CLEANUP_INTERVAL=86400 # seconds`

			`# Node classifications`
			`LTE_NODES="chelsty-infra chelsty-ha"`
			`SD_CARD_NODES="piha saturn"`
			`AI_NODES="solaria"`

			`# ---------------------------------------------------------------------------`
			`# Helpers`
			`# ---------------------------------------------------------------------------`

			`log() { echo "$(date -u +%H:%M:%S) [INFO] $*"; }`
			`warn() { echo "$(date -u +%H:%M:%S) [WARN] $*" >&2; }`
			`err() { echo "$(date -u +%H:%M:%S) [ERROR] $*" >&2; }`

			`contains() {`
			`local word="$1"; shift`
			`for w in "$@"; do [[ "$w" == "$word" ]] && return 0; done`
			`return 1`
			`}`

			`get_node_type() {`
			`# shellcheck disable=SC2086`
			`if contains "$NODE_NAME" $LTE_NODES; then echo "lte_node"; return; fi`
			`if contains "$NODE_NAME" $SD_CARD_NODES; then echo "sd_card"; return; fi`
			`if contains "$NODE_NAME" $AI_NODES; then echo "ai_node"; return; fi`
			`echo "standard"`
			`}`

			`# ---------------------------------------------------------------------------`
			`# Event emission`
			`# ---------------------------------------------------------------------------`

			`emit_event() {`
			`local type="$1" severity="$2" service="${3:-}" message="$4" payload="${5:-{}}"`
			`local id="evt-${NODE_NAME}-${TIMESTAMP}-${type}"`
			`local dir="${EVENTS_DIR}/${NODE_NAME}"`
			`mkdir -p "$dir"`
			`cat > "${dir}/${id}.json" <<EOF`
			`{`
			`"id": "${id}",`
			`"timestamp": ${TIMESTAMP},`
			`"date": "${DATE}",`
			`"type": "${type}",`
			`"severity": "${severity}",`
			`"node": "${NODE_NAME}",`
			`"service": "${service}",`
			`"message": "${message}",`
			`"payload": ${payload}`
			`}`
			`EOF`
			`}`

			`# ---------------------------------------------------------------------------`
			`# Health checks`
			`# ---------------------------------------------------------------------------`

			`check_disk() {`
			`# Use /opt/homelab as the check target — it lives on the host filesystem`
			`# and this path is correct both when running natively and in a container`
			`# that mounts /opt/homelab from the host.`
			`local mount="${RUNTIME_PATH}"`
			`local usage_pct avail_mb total_mb`
			`usage_pct=$(df "${mount}" 2>/dev/null \| awk 'NR==2 {gsub(/%/,"",$5); print $5}') \|\| return`
			`avail_mb=$(df "${mount}" 2>/dev/null \| awk 'NR==2 {printf "%d", $4/1024}') \|\| return`
			`total_mb=$(df "${mount}" 2>/dev/null \| awk 'NR==2 {printf "%d", $2/1024}') \|\| return`

			`if [[ "${usage_pct}" -ge "${DISK_CRIT_PCT}" ]]; then`
			`warn "Disk CRITICAL: ${usage_pct}% used (${avail_mb} MB free)"`
			`emit_event "disk_pressure" "high" "" \`
			`"Disk usage critical: ${usage_pct}% on ${mount} (${avail_mb} MB free)" \`
			`"{\"usage_pct\": ${usage_pct}, \"avail_mb\": ${avail_mb}, \"total_mb\": ${total_mb}, \"mount\": \"${mount}\"}"`
			`elif [[ "${usage_pct}" -ge "${DISK_WARN_PCT}" ]]; then`
			`warn "Disk elevated: ${usage_pct}% used"`
			`emit_event "disk_pressure" "medium" "" \`
			`"Disk usage elevated: ${usage_pct}% on ${mount} (${avail_mb} MB free)" \`
			`"{\"usage_pct\": ${usage_pct}, \"avail_mb\": ${avail_mb}, \"total_mb\": ${total_mb}, \"mount\": \"${mount}\"}"`
			`fi`
			`echo "${usage_pct}"`
			`}`

			`check_memory() {`
			`local total avail pct avail_mb`
			`total=$(awk '/^MemTotal/ {print $2}' /proc/meminfo)`
			`avail=$(awk '/^MemAvailable/ {print $2}' /proc/meminfo)`
			`pct=$(( (total - avail) * 100 / total ))`
			`avail_mb=$(( avail / 1024 ))`

			`if [[ "${pct}" -ge "${MEM_CRIT_PCT}" ]]; then`
			`warn "Memory CRITICAL: ${pct}% used"`
			`emit_event "high_memory" "high" "" \`
			`"Memory usage critical: ${pct}% (${avail_mb} MB available)" \`
			`"{\"usage_pct\": ${pct}, \"avail_mb\": ${avail_mb}, \"total_mb\": $((total/1024))}"`
			`elif [[ "${pct}" -ge "${MEM_WARN_PCT}" ]]; then`
			`warn "Memory elevated: ${pct}%"`
			`emit_event "high_memory" "medium" "" \`
			`"Memory usage elevated: ${pct}% (${avail_mb} MB available)" \`
			`"{\"usage_pct\": ${pct}, \"avail_mb\": ${avail_mb}, \"total_mb\": $((total/1024))}"`
			`fi`
			`echo "${pct}"`
			`}`

			`check_cpu() {`
			`# Two-sample /proc/stat delta for accurate instantaneous CPU usage.`
			`local idle1 total1 idle2 total2 pct`
			`read -r idle1 total1 < <(awk '/^cpu / {idle=$5; total=0; for(i=2;i<=NF;i++) total+=$i; print idle, total}' /proc/stat)`
			`sleep 1`
			`read -r idle2 total2 < <(awk '/^cpu / {idle=$5; total=0; for(i=2;i<=NF;i++) total+=$i; print idle, total}' /proc/stat)`

			`local d_idle=$(( idle2 - idle1 ))`
			`local d_total=$(( total2 - total1 ))`
			`pct=$(( d_total > 0 ? 100 - d_idle * 100 / d_total : 0 ))`

			`if [[ "${pct}" -ge 90 ]]; then`
			`warn "CPU elevated: ${pct}%"`
			`emit_event "high_cpu" "medium" "" \`
			`"CPU usage elevated: ${pct}%" \`
			`"{\"usage_pct\": ${pct}}"`
			`fi`
			`echo "${pct}"`
			`}`

			`check_containers() {`
			`command -v docker &>/dev/null \|\| return`

			`# Containers that have exited but carry a restart policy meaning they should be up`
			`local cname`
			`while IFS= read -r cname; do`
			`[[ -z "$cname" ]] && continue`
			`warn "Container exited (should be running): ${cname}"`
			`emit_event "containers_not_running" "high" "${cname}" \`
			`"Container '${cname}' has exited unexpectedly (restart=unless-stopped)" \`
			`"{\"container\": \"${cname}\"}"`
			`done < <(docker ps -a \`
			`--filter "status=exited" \`
			`--filter "label=com.docker.compose.project" \`
			`--format "{{.Names}}" 2>/dev/null \|\| true)`

			`# Containers that are running but their health check is failing`
			`while IFS= read -r cname; do`
			`[[ -z "$cname" ]] && continue`
			`warn "Container unhealthy: ${cname}"`
			`emit_event "healthcheck_failed" "high" "${cname}" \`
			`"Container '${cname}' is running but health check is failing" \`
			`"{\"container\": \"${cname}\"}"`
			`done < <(docker ps \`
			`--filter "health=unhealthy" \`
			`--format "{{.Names}}" 2>/dev/null \|\| true)`
			`}`

			`# ---------------------------------------------------------------------------`
			`# Safe Docker cleanup (per policy)`
			`# ---------------------------------------------------------------------------`

			`_sd_card_rate_ok() {`
			`if [[ -f "${CLEANUP_LOCK}" ]]; then`
			`local last_ts elapsed`
			`last_ts=$(cat "${CLEANUP_LOCK}" 2>/dev/null \|\| echo 0)`
			`elapsed=$(( TIMESTAMP - last_ts ))`
			`if [[ "${elapsed}" -lt "${CLEANUP_INTERVAL}" ]]; then`
			`log "Docker cleanup skipped: last run ${elapsed}s ago (limit ${CLEANUP_INTERVAL}s)"`
			`return 1`
			`fi`
			`fi`
			`return 0`
			`}`

			`_mark_cleanup_done() {`
			`echo "${TIMESTAMP}" > "${CLEANUP_LOCK}"`
			`}`

			`run_safe_cleanup() {`
			`command -v docker &>/dev/null \|\| return`
			`local node_type`
			`node_type=$(get_node_type)`

			`case "${node_type}" in`
			`lte_node)`
			`# NO cleanup on LTE nodes. Any docker operation risks triggering`
			`# a pull over a metered/intermittent connection.`
			`log "Skipping Docker cleanup: LTE node (${NODE_NAME})"`
			`;;`

			`sd_card)`
			`# Dangling images + stopped containers only.`
			`# Rate-limited to once per 24 hours to protect SD card write endurance.`
			`_sd_card_rate_ok \|\| return`
			`log "Running rate-limited Docker cleanup (SD card node)"`
			`docker image prune -f >/dev/null 2>&1 \|\| true`
			`docker container prune -f >/dev/null 2>&1 \|\| true`
			`_mark_cleanup_done`
			`;;`

			`ai_node)`
			`# Dangling images + stopped containers + build cache.`
			`# NEVER docker image prune -a (would remove Ollama runtime images,`
			`# requiring a multi-hour re-pull of model weights).`
			`log "Running AI-node Docker cleanup (dangling images + containers + build cache)"`
			`docker image prune -f >/dev/null 2>&1 \|\| true`
			`docker container prune -f >/dev/null 2>&1 \|\| true`
			`docker builder prune -f >/dev/null 2>&1 \|\| true`
			`;;`

			`standard)`
			`# VPS and other standard nodes: full safe cleanup.`
			`log "Running standard Docker cleanup"`
			`docker image prune -f >/dev/null 2>&1 \|\| true`
			`docker container prune -f >/dev/null 2>&1 \|\| true`
			`docker builder prune -f >/dev/null 2>&1 \|\| true`
			`;;`
			`esac`
			`}`

			`# ---------------------------------------------------------------------------`
			`# VPS-specific: control-plane filesystem rotation`
			`# ---------------------------------------------------------------------------`

			`cleanup_control_plane_fs() {`
			`log "Running control-plane filesystem rotation"`

			`# Completed / failed actions older than 7 days`
			`for status in completed failed; do`
			`local dir="${ACTIONS_DIR}/${status}"`
			`[[ -d "${dir}" ]] \|\| continue`
			`find "${dir}" -name "*.json" -mtime +7 -delete 2>/dev/null && \`
			`log "Cleaned ${status} actions older than 7 days" \|\| true`
			`done`

			`# Deploy logs older than 30 days`
			`local deploy_logs="${LOGS_DIR}/deploy"`
			`if [[ -d "${deploy_logs}" ]]; then`
			`find "${deploy_logs}" -name "*.log" -mtime +30 -delete 2>/dev/null && \`
			`log "Cleaned deploy logs older than 30 days" \|\| true`
			`fi`

			`# Event files older than 3 days AND already past the observer checkpoint.`
			`# The dual condition ensures we never delete an event the observer hasn't seen.`
			`local checkpoint="${STATE_DIR}/observer_checkpoint.json"`
			`if [[ -f "${checkpoint}" ]] && command -v python3 &>/dev/null; then`
			`local last_processed`
			`last_processed=$(python3 -c "`
			`import json, sys`
			`try:`
			`d = json.load(open('${checkpoint}'))`
			`print(d.get('last_processed_file', ''))`
			`except Exception:`
			`print('')`
			`" 2>/dev/null \|\| echo "")`

			`if [[ -n "${last_processed}" ]]; then`
			`find "${EVENTS_DIR}" -name "*.json" -mtime +3 \| while IFS= read -r f; do`
			`# Only delete files that sort before the checkpoint path`
			`# (i.e., the observer has already processed them).`
			`if [[ "$f" < "${last_processed}" ]]; then`
			`rm -f "$f"`
			`log "Cleaned old event: $(basename "$f")"`
			`fi`
			`done`
			`else`
			`log "No observer checkpoint set; skipping event file cleanup"`
			`fi`
			`fi`
			`}`

			`# ---------------------------------------------------------------------------`
			`# Main`
			`# ---------------------------------------------------------------------------`

			`mkdir -p "${EVENTS_DIR}/${NODE_NAME}" "${STATE_DIR}"`

			`log "Health check starting on ${NODE_NAME} (type=$(get_node_type))"`

			`disk_pct=$(check_disk \|\| echo 0)`
			`mem_pct=$(check_memory \|\| echo 0)`
			`cpu_pct=$(check_cpu \|\| echo 0)`
			`check_containers`

			`run_safe_cleanup`

			`# VPS: also rotate control-plane filesystem artefacts`
			`if [[ "${NODE_NAME}" == "vps" ]]; then`
			`cleanup_control_plane_fs`
			`fi`

			`# Emit a node_health heartbeat so the observer can update node status`
			`# and the supervisor can see up-to-date resource metrics.`
			`emit_event "node_health" "info" "" \`
			`"Health check completed on ${NODE_NAME}" \`
			`"{\"disk_pct\": ${disk_pct}, \"mem_pct\": ${mem_pct}, \"cpu_pct\": ${cpu_pct}}"`

			`log "Health check complete (disk=${disk_pct}% mem=${mem_pct}% cpu=${cpu_pct}%)"`