feat(onboard): add 30-node-agent.sh + lustro node-agent override

Push-based deploy step for LUSTRO (git_control=false): rsync
services/node-agent/ and the host override to /opt/homelab/deploy/node-agent/
on the remote, then docker compose up --build via SSH.

Guard by effect: skip push+build+up if node-agent container already running
(docker ps filter, not command -v). Verify: container running + events appear
in /opt/homelab/events/lustro/ within 90 s (confirms agent write path).

Override (hosts/lustro/runtime/node-agent/docker-compose.override.yml):
- group_add: ["991"]  (docker GID on LUSTRO; Compose appends it to the base file's 999, which is harmless)
- mem_limit: 256m  (MagicMirror ~1.9 GiB; agent must be bounded)
- /home/pi/.ssh:/root/.ssh:ro  (not /home/oskar/.ssh — pi user)
- /opt/homelab/deploy/node-agent:/repo:ro  (no repo checkout on push-based node)
- NODE_NAME=lustro, NODE_TYPE=sd_card, VPS_EVENTS_HOST=100.95.58.48

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Oskar Kapala 2026-06-09 14:24:39 +02:00
parent 415479454a
commit f6342749e6
2 changed files with 151 additions and 0 deletions

View file

@ -0,0 +1,19 @@
# Host-specific Compose override for the node-agent service on LUSTRO.
# The onboarding step passes this file as the second -f argument, after the
# base docker-compose.yml, so the values below are merged on top of the base.
services:
node-agent:
# Docker GID on LUSTRO is 991 (not the Debian default 999).
# Compose concatenates group_add lists; 991 is what gives socket access here.
# (The 999 entry inherited from the base file stays in the list and is harmless.)
group_add:
- "991"
mem_limit: 256m # RPi4 4 GiB; MagicMirror consumes ~1.9 GiB — agent must be bounded
# Node identity and event-shipping target. The exact meaning of each variable
# is defined by the node-agent service itself, which is not visible here.
environment:
- NODE_NAME=lustro
- NODE_TYPE=sd_card
- VPS_EVENTS_HOST=100.95.58.48
- VPS_EVENTS_USER=oskar
- VPS_EVENTS_PATH=/opt/homelab/events
# Presumably seconds; the onboarding verify step waits up to 90 s for the
# first event on that assumption — TODO confirm against the agent.
- CHECK_INTERVAL=60
volumes:
# pi's SSH key for rsync event shipping to VPS (push-based node, no repo checkout)
# Mounted read-only; the login user on this node is pi, not oskar.
- /home/pi/.ssh:/root/.ssh:ro
# Override ../.. from the base compose to the pushed deploy dir (no repo on node)
- /opt/homelab/deploy/node-agent:/repo:ro

View file

@ -0,0 +1,132 @@
#!/usr/bin/env bash
# scripts/onboard/steps/30-node-agent.sh — deploy node-agent to remote node
#
# Deploys by pushing files rather than pulling them (git_control=false on
# LUSTRO): services/node-agent/ and the host override are rsynced to
# /opt/homelab/deploy/node-agent/ on the remote, after which docker compose
# builds and starts the service over SSH. Same idea as the PIHA pattern, except
# the node never runs git itself.
#
# Stages:
#   1. push   — rsync base compose+src, copy override to remote deploy dir
#   2. up     — docker compose up -d --build (guarded: skip if already running)
#   3. verify — container running + event file in /opt/homelab/events/<node>/
#
# Dry-run: probes run unconditionally; rsync/rrun mutations honour DRY_RUN.
#
# Required environment: REPO_ROOT, NODE_YAML (both provided by onboard.sh).
# Optional environment: DRY_RUN (default 0), ONBOARD_SSH_USER, ONBOARD_SSH_HOST.
set -euo pipefail

STEP_NAME="30-node-agent"

# Abort early with a clear message when the orchestrator-provided variables
# are missing.
: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
: "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}"
DRY_RUN="${DRY_RUN:-0}"

# The orchestrator sources common.sh before invoking steps; when this step is
# run standalone the log helper is absent, so load the library ourselves.
if ! declare -F log >/dev/null 2>&1; then
  # shellcheck source=../lib/common.sh
  source "${REPO_ROOT}/scripts/onboard/lib/common.sh"
fi

# ── parse node.yaml ───────────────────────────────────────────────────────────
SSH_USER=$(yaml_get "$NODE_YAML" "ssh_user")
TS_HOSTNAME=$(yaml_get "$NODE_YAML" "tailscale.hostname")

if [[ -z "$SSH_USER" ]]; then
  die "ssh_user not set in $NODE_YAML"
fi
if [[ -z "$TS_HOSTNAME" ]]; then
  die "tailscale.hostname not set in $NODE_YAML"
fi

# Values already present in the environment win over those from node.yaml.
ONBOARD_SSH_USER="${ONBOARD_SSH_USER:-${SSH_USER}}"
ONBOARD_SSH_HOST="${ONBOARD_SSH_HOST:-${TS_HOSTNAME}}"
export ONBOARD_SSH_USER ONBOARD_SSH_HOST

# shellcheck source=../lib/remote.sh
source "${REPO_ROOT}/scripts/onboard/lib/remote.sh"

# Remote layout: base compose file and host override live side by side.
REMOTE_DEPLOY_DIR="/opt/homelab/deploy/node-agent"
COMPOSE_BASE="${REMOTE_DEPLOY_DIR}/docker-compose.yml"
COMPOSE_OVERRIDE="${REMOTE_DEPLOY_DIR}/docker-compose.override.yml"

# Local sources pushed to the remote deploy dir.
LOCAL_SVC_DIR="${REPO_ROOT}/services/node-agent"
LOCAL_OVERRIDE="${REPO_ROOT}/hosts/${TS_HOSTNAME}/runtime/node-agent/docker-compose.override.yml"
# ── rprobe: read-only remote probe — always runs, even in dry-run ─────────────
# Runs the given command words on the onboarding target over SSH.
# Globals:   _SSH_OPTS, ONBOARD_SSH_USER, ONBOARD_SSH_HOST (all read)
# Arguments: the remote command and its arguments
# Returns:   the exit status of ssh
rprobe() {
  local target="${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}"
  ssh "${_SSH_OPTS[@]}" "${target}" -- "$@"
}
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 1 — push compose files to remote
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 1/3 push compose → ${ONBOARD_SSH_HOST}:${REMOTE_DEPLOY_DIR}"

# Guard by EFFECT: ask the remote docker daemon whether a container named
# exactly node-agent is in the running state. A failed probe (including an SSH
# failure, whose stderr is silenced) is treated as "not running".
_agent_probe="docker ps --filter name=^node-agent\$ --filter status=running --format '{{.Names}}' 2>/dev/null | grep -q node-agent"
_running=0
if rprobe "$_agent_probe" 2>/dev/null; then
  _running=1
fi

case "$_running" in
  1)
    log "node-agent container already running — skip push+build+up"
    ;;
  *)
    # The host override is mandatory; refuse to push a base-only deployment.
    if [[ ! -f "$LOCAL_OVERRIDE" ]]; then
      die "Override not found: $LOCAL_OVERRIDE"
    fi
    # Base compose + Dockerfile + src/ (rsync_dir handles DRY_RUN)
    rsync_dir "${LOCAL_SVC_DIR}/" "${REMOTE_DEPLOY_DIR}/"
    # Host-specific override (rcopy handles DRY_RUN)
    rcopy "${LOCAL_OVERRIDE}" "${REMOTE_DEPLOY_DIR}/docker-compose.override.yml"
    ;;
esac
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 2 — docker compose build + up
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 2/3 docker compose up node-agent"

if [[ "$_running" -ne 1 ]]; then
  # The image is built on the remote itself (arm64 native) and the service is
  # started in the same invocation. --build rebuilds if the context changed and
  # is idempotent when the image is current.
  _compose_up=(
    docker compose
    -f "${COMPOSE_BASE}"
    -f "${COMPOSE_OVERRIDE}"
    up -d --build node-agent
  )
  rrun "${_compose_up[@]}"
else
  log "node-agent already running — skip"
fi
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 3 — verify
# ═══════════════════════════════════════════════════════════════════════════════
# Two checks, both via read-only remote probes:
#   a) a container named exactly node-agent is running (fatal if not);
#   b) at least one *.json event file exists in /opt/homelab/events/<node>/
#      (warning only if none shows up within the wait window).
# NOTE: check (b) matches ANY existing .json file, including ones left by an
# earlier run — it does not compare timestamps.
step "[$STEP_NAME] 3/3 verify"
if [[ "${DRY_RUN:-0}" == 1 ]]; then
  log "dry-run: skipping verify (mutations may not have run)"
else
  # Verify: container running (docker ps — not command -v)
  if rprobe "docker ps --filter name=^node-agent\$ --filter status=running --format '{{.Names}}' 2>/dev/null | grep -q node-agent" 2>/dev/null; then
    log "Verify OK: node-agent container running"
    # Informational listing only; a failure here must not abort the step.
    rprobe "docker ps --filter name=node-agent --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}'" || true
  else
    die "node-agent container is NOT running — check: docker logs node-agent on ${TS_HOSTNAME}"
  fi

  # Verify: events appear in /opt/homelab/events/<node>/ (confirms agent writes)
  # First cycle runs at start then sleeps CHECK_INTERVAL; allow 90s.
  log "Waiting for first event (up to 90 s, CHECK_INTERVAL=60)..."
  _event_ok=0
  _verify_timeout=90   # total wait budget, seconds
  _verify_interval=10  # pause between probes, seconds
  _waited=0
  # Probe at 0, 10, ..., 90 s: the final probe happens AFTER the last sleep, so
  # the full budget is actually used and the reported elapsed time is accurate.
  while :; do
    if rprobe "ls /opt/homelab/events/${TS_HOSTNAME}/*.json 2>/dev/null | head -1 | grep -q .json" 2>/dev/null; then
      _event_ok=1
      break
    fi
    if (( _waited >= _verify_timeout )); then
      break
    fi
    log " ... ${_waited} s elapsed, waiting..."
    sleep "${_verify_interval}"
    _waited=$(( _waited + _verify_interval ))
  done

  if [[ "$_event_ok" -eq 1 ]]; then
    log "Verify OK: events present in /opt/homelab/events/${TS_HOSTNAME}/"
    # Show the most recent entries for the operator; best-effort.
    rprobe "ls -lth /opt/homelab/events/${TS_HOSTNAME}/ | head -5" || true
  else
    # Missing events are a warning, not a failure.
    warn "No events yet in /opt/homelab/events/${TS_HOSTNAME}/ after 90 s — agent may still be initialising (CHECK_INTERVAL=60)"
    warn "Re-run verify manually: docker logs node-agent on ${TS_HOSTNAME}"
  fi
fi
log "[$STEP_NAME] done"