rsync fails with "No such file or directory" when intermediate dirs don't exist. /opt/homelab/deploy/ is not created by 20-base.sh. Add rrun mkdir -p before rsync_dir; pi owns /opt/homelab so no sudo. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
137 lines
6.7 KiB
Bash
137 lines
6.7 KiB
Bash
#!/usr/bin/env bash
#
# scripts/onboard/steps/30-node-agent.sh
#
# Onboarding step 30: deploy node-agent to a remote node.
#
# This is a push-based deploy (git_control=false on LUSTRO). The local
# services/node-agent/ tree and the host override are rsynced to
# /opt/homelab/deploy/node-agent/ on the remote, after which docker compose
# builds and starts the service over SSH. It mirrors the PIHA pattern, except
# that files are pushed from here rather than git-pulled on the node.
#
# Stages:
#   1. push   — rsync base compose + src, copy the override to the remote dir
#   2. up     — docker compose up -d --build (skipped when already running)
#   3. verify — container running + event file in /opt/homelab/events/<node>/
#
# Dry-run: read-only probes always execute; rsync/rrun mutations honour DRY_RUN.
#
# Required environment: REPO_ROOT, NODE_YAML (both normally set by onboard.sh).
# Optional environment: DRY_RUN (defaults to 0 when unset or empty).

set -euo pipefail

STEP_NAME='30-node-agent'

# Abort early with a hint when invoked without the orchestrator's environment.
: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
: "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}"
DRY_RUN="${DRY_RUN:-0}"

# A standalone run has no `log` function yet, so pull in common.sh in that
# case (the orchestrator sources it before calling steps).
# shellcheck source=../lib/common.sh
declare -f log >/dev/null 2>&1 \
  || source "${REPO_ROOT}/scripts/onboard/lib/common.sh"
# ── node.yaml settings and derived paths ──────────────────────────────────────
# Both values are read first and validated afterwards.
SSH_USER=$(yaml_get "$NODE_YAML" "ssh_user")
TS_HOSTNAME=$(yaml_get "$NODE_YAML" "tailscale.hostname")

if [[ -z "$SSH_USER" ]]; then
  die "ssh_user not set in $NODE_YAML"
fi
if [[ -z "$TS_HOSTNAME" ]]; then
  die "tailscale.hostname not set in $NODE_YAML"
fi

# Non-empty values already present in the environment win over node.yaml.
: "${ONBOARD_SSH_USER:=${SSH_USER}}"
: "${ONBOARD_SSH_HOST:=${TS_HOSTNAME}}"
export ONBOARD_SSH_USER ONBOARD_SSH_HOST

# Sourced only after the ONBOARD_SSH_* variables have been exported.
# shellcheck source=../lib/remote.sh
source "${REPO_ROOT}/scripts/onboard/lib/remote.sh"

# Remote layout.
REMOTE_DEPLOY_DIR='/opt/homelab/deploy/node-agent'
COMPOSE_BASE="${REMOTE_DEPLOY_DIR}/docker-compose.yml"
COMPOSE_OVERRIDE="${REMOTE_DEPLOY_DIR}/docker-compose.override.yml"

# Local sources that get pushed to the remote.
LOCAL_SVC_DIR="${REPO_ROOT}/services/node-agent"
LOCAL_OVERRIDE="${REPO_ROOT}/hosts/${TS_HOSTNAME}/runtime/node-agent/docker-compose.override.yml"
# ── rprobe: read-only remote probe — always runs, even in dry-run ─────────────
#
# Executes a command on the target node by calling ssh directly; the function
# itself never consults DRY_RUN.
#
# Globals:   _SSH_OPTS (read; array of ssh options — presumably defined by
#            lib/remote.sh, confirm there), ONBOARD_SSH_USER (read),
#            ONBOARD_SSH_HOST (read)
# Arguments: "$@" — the remote command, passed to ssh after a `--` separator
# Outputs:   whatever ssh / the remote command writes to stdout and stderr
# Returns:   the exit status of ssh
rprobe() {
  # ${arr[@]+"${arr[@]}"} expands to nothing when _SSH_OPTS is empty or unset,
  # instead of tripping `set -u` ("unbound variable") on bash older than 4.4.
  ssh ${_SSH_OPTS[@]+"${_SSH_OPTS[@]}"} \
    "${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}" -- "$@"
}
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 1 — push compose files to remote
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 1/3 push compose → ${ONBOARD_SSH_HOST}:${REMOTE_DEPLOY_DIR}"

# The guard looks at the EFFECT: a running container named exactly node-agent.
# _running records the outcome (1 = already running, 0 = not) and is read again
# by the "up" stage.
if rprobe "docker ps --filter name=^node-agent\$ --filter status=running --format '{{.Names}}' 2>/dev/null | grep -q node-agent" 2>/dev/null; then
  _running=1
  log "node-agent container already running — skip push+build+up"
else
  _running=0

  # Without the host-specific override there is nothing sensible to deploy.
  if [[ ! -f "$LOCAL_OVERRIDE" ]]; then
    die "Override not found: $LOCAL_OVERRIDE"
  fi

  # Create the remote deploy dir up front: rsync does not create intermediate
  # directories. No sudo is used because pi owns /opt/homelab.
  rrun mkdir -p "${REMOTE_DEPLOY_DIR}"

  # Base compose file, Dockerfile and src/ (rsync_dir handles DRY_RUN).
  rsync_dir "${LOCAL_SVC_DIR}/" "${REMOTE_DEPLOY_DIR}/"

  # Host-specific override (rcopy handles DRY_RUN).
  rcopy "${LOCAL_OVERRIDE}" "${REMOTE_DEPLOY_DIR}/docker-compose.override.yml"
fi
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 2 — docker compose build + up
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 2/3 docker compose up node-agent"

# _running comes from the stage-1 guard (1 = container was already running).
if [[ "$_running" -ne 1 ]]; then
  # The image is built on the remote itself (arm64 native) and the service is
  # then started. --build rebuilds when the context changed and is idempotent
  # when the image is current.
  rrun docker compose -f "${COMPOSE_BASE}" -f "${COMPOSE_OVERRIDE}" \
    up -d --build node-agent
else
  log "node-agent already running — skip"
fi
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 3 — verify
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 3/3 verify"

if [[ "${DRY_RUN:-0}" == 1 ]]; then
  log "dry-run: skipping verify (mutations may not have run)"
else
  # Check 1: the container must actually be running (docker ps — not command -v).
  # A failure here is fatal.
  if rprobe "docker ps --filter name=^node-agent\$ --filter status=running --format '{{.Names}}' 2>/dev/null | grep -q node-agent" 2>/dev/null; then
    log "Verify OK: node-agent container running"
    # Informational listing only; a failure here must not abort the step.
    rprobe "docker ps --filter name=node-agent --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}'" || true
  else
    die "node-agent container is NOT running — check: docker logs node-agent on ${TS_HOSTNAME}"
  fi

  # Check 2: event files exist in /opt/homelab/events/<node>/ (confirms the
  # agent writes). The first cycle runs at start, then the agent sleeps
  # CHECK_INTERVAL; allow 90 s.
  # NOTE(review): the probe matches any *.json file in the directory; it does
  # not look at timestamps, so files left by an earlier run also satisfy it.
  log "Waiting for first event (up to 90 s, CHECK_INTERVAL=60)..."
  _event_ok=0
  _event_timeout=90 # total wait budget in seconds
  _event_poll=10    # pause between probes in seconds
  _waited=0         # seconds slept so far

  # Probe at t = 0, 10, …, 90 s. The sleep happens only BETWEEN probes, so the
  # last probe lands on the 90 s mark and no time is wasted sleeping after it.
  while true; do
    if rprobe "ls /opt/homelab/events/${TS_HOSTNAME}/*.json 2>/dev/null | head -1 | grep -q .json" 2>/dev/null; then
      _event_ok=1
      break
    fi
    if (( _waited >= _event_timeout )); then
      break
    fi
    log " ... ${_waited} s elapsed, waiting..."
    sleep "$_event_poll"
    _waited=$(( _waited + _event_poll ))
  done

  if [[ "$_event_ok" -eq 1 ]]; then
    log "Verify OK: events present in /opt/homelab/events/${TS_HOSTNAME}/"
    # Informational listing only; a failure here must not abort the step.
    rprobe "ls -lth /opt/homelab/events/${TS_HOSTNAME}/ | head -5" || true
  else
    # Missing events are reported as warnings, not as a fatal error.
    warn "No events yet in /opt/homelab/events/${TS_HOSTNAME}/ after 90 s — agent may still be initialising (CHECK_INTERVAL=60)"
    warn "Re-run verify manually: docker logs node-agent on ${TS_HOSTNAME}"
  fi
fi

log "[$STEP_NAME] done"