homelab-codex-ws/scripts/onboard/steps/30-node-agent.sh

137 lines
6.7 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
# scripts/onboard/steps/30-node-agent.sh — deploy node-agent to remote node
#
# Push-based deploy (git_control=false on LUSTRO): rsync services/node-agent/
# and the host override to /opt/homelab/deploy/node-agent/ on the remote, then
# docker compose build + up via SSH. Mirrors the PIHA pattern but pushes files
# instead of git-pulling them on the node.
#
# Stages:
# 1. push — rsync base compose+src, copy override to remote deploy dir
# 2. up — docker compose up -d --build (guarded: skip if already running)
# 3. verify — container running + at least one event file present in /opt/homelab/events/<node>/
#
# Dry-run: probes run unconditionally; rsync/rrun mutations honour DRY_RUN.
set -euo pipefail

# DRY_RUN falls back to 0 when it is unset or empty.
: "${DRY_RUN:=0}"

# Both variables are mandatory; the expansion aborts the script with the
# message shown when either one is unset or empty.
: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
: "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}"

STEP_NAME="30-node-agent"

# When the step is launched standalone the `log` function does not exist yet,
# so pull in common.sh ourselves (the orchestrator sources it before calling
# steps, in which case this is skipped).
if ! declare -F log >/dev/null 2>&1; then
  # shellcheck source=../lib/common.sh
  source "${REPO_ROOT}/scripts/onboard/lib/common.sh"
fi
# ── parse node.yaml ───────────────────────────────────────────────────────────
# Read the two keys this step needs; an empty value for either is fatal.
SSH_USER=$(yaml_get "$NODE_YAML" "ssh_user")
TS_HOSTNAME=$(yaml_get "$NODE_YAML" "tailscale.hostname")
[[ -n "$SSH_USER" ]] || die "ssh_user not set in $NODE_YAML"
[[ -n "$TS_HOSTNAME" ]] || die "tailscale.hostname not set in $NODE_YAML"

# Values already present in the environment win; otherwise fall back to the
# ones read from node.yaml. Exported before remote.sh is sourced
# (presumably remote.sh reads them — verify against lib/remote.sh).
: "${ONBOARD_SSH_USER:=${SSH_USER}}"
: "${ONBOARD_SSH_HOST:=${TS_HOSTNAME}}"
export ONBOARD_SSH_USER ONBOARD_SSH_HOST

# shellcheck source=../lib/remote.sh
source "${REPO_ROOT}/scripts/onboard/lib/remote.sh"

# Remote layout: deploy directory and the two compose files inside it.
REMOTE_DEPLOY_DIR="/opt/homelab/deploy/node-agent"
COMPOSE_BASE="${REMOTE_DEPLOY_DIR}/docker-compose.yml"
COMPOSE_OVERRIDE="${REMOTE_DEPLOY_DIR}/docker-compose.override.yml"

# Local sources: shared service directory and the per-host compose override.
LOCAL_SVC_DIR="${REPO_ROOT}/services/node-agent"
LOCAL_OVERRIDE="${REPO_ROOT}/hosts/${TS_HOSTNAME}/runtime/node-agent/docker-compose.override.yml"
# ── rprobe: read-only remote probe — always runs, even in dry-run ─────────────
# Runs the given command on ONBOARD_SSH_USER@ONBOARD_SSH_HOST over ssh and
# returns ssh's exit status. DRY_RUN is never consulted here, so callers must
# only pass commands that do not mutate the remote.
# Globals:   _SSH_OPTS (read; presumably defined by lib/remote.sh),
#            ONBOARD_SSH_USER, ONBOARD_SSH_HOST (read)
# Arguments: the remote command, forwarded to ssh after `--`
rprobe() {
  local -r _dest="${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}"
  ssh "${_SSH_OPTS[@]}" "${_dest}" -- "$@"
}
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 1 — push compose files to remote
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 1/3 push compose → ${ONBOARD_SSH_HOST}:${REMOTE_DEPLOY_DIR}"

# Guard by EFFECT: is node-agent already running?
# _running is consumed again by Stage 2. A failed probe (including an ssh
# failure, whose stderr is discarded) is treated as "not running".
_running=0
if rprobe "docker ps --filter name=^node-agent\$ --filter status=running --format '{{.Names}}' 2>/dev/null | grep -q node-agent" 2>/dev/null; then
  _running=1
  log "node-agent container already running — skip push+build+up"
fi

if [[ "$_running" -eq 0 ]]; then
  # Validate every local input BEFORE touching the remote, so a broken
  # checkout fails fast (and is also caught in dry-run). Stage 2 passes
  # ${COMPOSE_BASE} to docker compose, and that file only reaches the remote
  # through the rsync of ${LOCAL_SVC_DIR}/ below.
  [[ -f "${LOCAL_SVC_DIR}/docker-compose.yml" ]] \
    || die "Base compose not found: ${LOCAL_SVC_DIR}/docker-compose.yml"
  [[ -f "$LOCAL_OVERRIDE" ]] \
    || die "Override not found: $LOCAL_OVERRIDE"

  # Ensure remote deploy dir exists (rsync does not create intermediate dirs)
  # pi owns /opt/homelab, so no sudo needed
  rrun mkdir -p "${REMOTE_DEPLOY_DIR}"

  # Push base compose + Dockerfile + src/ (rsync_dir handles DRY_RUN)
  rsync_dir "${LOCAL_SVC_DIR}/" "${REMOTE_DEPLOY_DIR}/"

  # Push host-specific override (rcopy handles DRY_RUN)
  rcopy "${LOCAL_OVERRIDE}" "${REMOTE_DEPLOY_DIR}/docker-compose.override.yml"
fi
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 2 — docker compose build + up
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 2/3 docker compose up node-agent"

# _running was set by the Stage 1 probe: 1 means the container is already up.
if [[ "$_running" -ne 1 ]]; then
  # Image is built on the remote itself (arm64 native) and the service is
  # then started. --build rebuilds if the context changed and is idempotent
  # when the image is current.
  _compose_up=(
    docker compose
    -f "${COMPOSE_BASE}"
    -f "${COMPOSE_OVERRIDE}"
    up -d --build node-agent
  )
  rrun "${_compose_up[@]}"
else
  log "node-agent already running — skip"
fi
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 3 — verify
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 3/3 verify"

if [[ "${DRY_RUN:-0}" == 1 ]]; then
  log "dry-run: skipping verify (mutations may not have run)"
else
  # Verify: container running (docker ps — not command -v).
  # A missing container is fatal; the follow-up status table is best-effort.
  if rprobe "docker ps --filter name=^node-agent\$ --filter status=running --format '{{.Names}}' 2>/dev/null | grep -q node-agent" 2>/dev/null; then
    log "Verify OK: node-agent container running"
    rprobe "docker ps --filter name=node-agent --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}'" || true
  else
    die "node-agent container is NOT running — check: docker logs node-agent on ${TS_HOSTNAME}"
  fi

  # Verify: event files appear in /opt/homelab/events/<node>/ (confirms agent writes)
  # First cycle runs at start then sleeps CHECK_INTERVAL; allow 90s.
  # NOTE(review): the probe succeeds on ANY *.json file, including ones left
  # by an earlier run — it does not check that the event is recent.
  log "Waiting for first event (up to 90 s, CHECK_INTERVAL=60)..."
  _event_ok=0
  _waited=0
  # Probe at 0, 10, ..., 90 s. The sleep only happens when another probe will
  # follow, so there is no wasted trailing sleep, and the progress line
  # reports the time actually waited so far (excluding ssh round-trips).
  while true; do
    if rprobe "ls /opt/homelab/events/${TS_HOSTNAME}/*.json 2>/dev/null | head -1 | grep -q .json" 2>/dev/null; then
      _event_ok=1
      break
    fi
    if (( _waited >= 90 )); then
      break
    fi
    log " ... ${_waited} s elapsed, waiting..."
    sleep 10
    _waited=$(( _waited + 10 ))
  done

  # A missing event is only a warning: the agent may simply not have
  # completed its first cycle yet.
  if [[ "$_event_ok" -eq 1 ]]; then
    log "Verify OK: events present in /opt/homelab/events/${TS_HOSTNAME}/"
    rprobe "ls -lth /opt/homelab/events/${TS_HOSTNAME}/ | head -5" || true
  else
    warn "No events yet in /opt/homelab/events/${TS_HOSTNAME}/ after 90 s — agent may still be initialising (CHECK_INTERVAL=60)"
    warn "Re-run verify manually: docker logs node-agent on ${TS_HOSTNAME}"
  fi
fi

log "[$STEP_NAME] done"