#!/usr/bin/env bash
# scripts/onboard/steps/30-node-agent.sh — deploy node-agent to remote node
#
# Push-based deploy (git_control=false on LUSTRO): rsync services/node-agent/
# and the host override to /opt/homelab/deploy/node-agent/ on the remote, then
# docker compose build + up via SSH. Mirrors the PIHA pattern but pushes files
# instead of git-pulling them on the node.
#
# Stages:
#   1. push   — rsync base compose+src, copy override to remote deploy dir
#   2. up     — docker compose up -d --build (guarded: skip if already running)
#   3. verify — container running + event file present in
#               /opt/homelab/events/${TS_HOSTNAME}/
#
# Environment:
#   REPO_ROOT         (required) repository checkout root
#   NODE_YAML         (required) path to the node description file
#   DRY_RUN           (optional, default 0) 1 = do not mutate the remote
#   ONBOARD_SSH_USER  (optional) overrides ssh_user from NODE_YAML
#   ONBOARD_SSH_HOST  (optional) overrides tailscale.hostname from NODE_YAML
#
# Dry-run: probes run unconditionally; rsync/rrun mutations honour DRY_RUN.

set -euo pipefail

STEP_NAME="30-node-agent"

: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
: "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}"
: "${DRY_RUN:=0}"

# Source common.sh when run standalone (orchestrator sources it before calling steps)
if ! declare -f log >/dev/null 2>&1; then
  # shellcheck source=../lib/common.sh
  source "${REPO_ROOT}/scripts/onboard/lib/common.sh"
fi

# ── parse node.yaml ───────────────────────────────────────────────────────────
SSH_USER=$(yaml_get "$NODE_YAML" "ssh_user")
TS_HOSTNAME=$(yaml_get "$NODE_YAML" "tailscale.hostname")

# Written as "test || die" so a passing check leaves a zero status behind.
[[ -n "$SSH_USER" ]] || die "ssh_user not set in $NODE_YAML"
[[ -n "$TS_HOSTNAME" ]] || die "tailscale.hostname not set in $NODE_YAML"

# Values already present in the environment win over node.yaml.
export ONBOARD_SSH_USER="${ONBOARD_SSH_USER:-${SSH_USER}}"
export ONBOARD_SSH_HOST="${ONBOARD_SSH_HOST:-${TS_HOSTNAME}}"

# shellcheck source=../lib/remote.sh
source "${REPO_ROOT}/scripts/onboard/lib/remote.sh"

# NOTE: deliberately not `readonly` — the step may be sourced more than once
# in the same shell, and re-assigning a readonly variable would abort it.
REMOTE_DEPLOY_DIR="/opt/homelab/deploy/node-agent"
COMPOSE_BASE="${REMOTE_DEPLOY_DIR}/docker-compose.yml"
COMPOSE_OVERRIDE="${REMOTE_DEPLOY_DIR}/docker-compose.override.yml"
LOCAL_SVC_DIR="${REPO_ROOT}/services/node-agent"
LOCAL_OVERRIDE="${REPO_ROOT}/hosts/${TS_HOSTNAME}/runtime/node-agent/docker-compose.override.yml"

# ── rprobe: read-only remote probe — always runs, even in dry-run ─────────────
# Arguments: the command (and its arguments) to execute on the remote host.
# Returns:   the exit status reported by ssh.
# Globals:   ONBOARD_SSH_USER, ONBOARD_SSH_HOST (read); _SSH_OPTS (read —
#            presumably defined by lib/remote.sh; confirm there).
rprobe() {
  ssh "${_SSH_OPTS[@]}" "${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}" -- "$@"
}

# ── _node_agent_running: succeed iff docker reports node-agent as running ─────
# Read-only probe shared by the stage-1 guard and the stage-3 verification.
# stderr is discarded, so an ssh failure looks the same as "not running".
_node_agent_running() {
  rprobe "docker ps --filter name=^node-agent\$ --filter status=running --format '{{.Names}}' 2>/dev/null | grep -q node-agent" 2>/dev/null
}

# ═══════════════════════════════════════════════════════════════════════════════
# Stage 1 — push compose files to remote
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 1/3 push compose → ${ONBOARD_SSH_HOST}:${REMOTE_DEPLOY_DIR}"

# Guard by EFFECT: is node-agent already running?
_running=0
if _node_agent_running; then
  _running=1
  log "node-agent container already running — skip push+build+up"
fi

if [[ "$_running" -eq 0 ]]; then
  # Fail early (also in dry-run) when the local inputs are missing.
  [[ -d "$LOCAL_SVC_DIR" ]] \
    || die "Service directory not found: $LOCAL_SVC_DIR"
  [[ -f "$LOCAL_OVERRIDE" ]] \
    || die "Override not found: $LOCAL_OVERRIDE"

  # Push base compose + Dockerfile + src/ (rsync_dir handles DRY_RUN)
  rsync_dir "${LOCAL_SVC_DIR}/" "${REMOTE_DEPLOY_DIR}/"

  # Push host-specific override (rcopy handles DRY_RUN)
  rcopy "${LOCAL_OVERRIDE}" "${REMOTE_DEPLOY_DIR}/docker-compose.override.yml"
fi

# ═══════════════════════════════════════════════════════════════════════════════
# Stage 2 — docker compose build + up
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 2/3 docker compose up node-agent"

if [[ "$_running" -eq 1 ]]; then
  log "node-agent already running — skip"
else
  # Build image on remote (arm64 native); then start the service.
  # --build rebuilds if context changed; idempotent if image is current.
  rrun docker compose \
    -f "${COMPOSE_BASE}" \
    -f "${COMPOSE_OVERRIDE}" \
    up -d --build node-agent
fi

# ═══════════════════════════════════════════════════════════════════════════════
# Stage 3 — verify
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 3/3 verify"

if [[ "${DRY_RUN:-0}" == 1 ]]; then
  log "dry-run: skipping verify (mutations may not have run)"
else
  # Verify: container running (docker ps — not command -v)
  _node_agent_running \
    || die "node-agent container is NOT running — check: docker logs node-agent on ${TS_HOSTNAME}"
  log "Verify OK: node-agent container running"
  # Informational listing only; a failure here must not abort the step.
  rprobe "docker ps --filter name=node-agent --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}'" || true

  # Verify: event files exist in /opt/homelab/events/${TS_HOSTNAME}/ (confirms
  # the agent writes). First cycle runs at start then sleeps CHECK_INTERVAL;
  # allow 90s.
  # NOTE(review): this only checks that a *.json file is present, not that it
  # is recent — events left over from an earlier deployment also satisfy it.
  _wait_max_s=90
  _wait_step_s=10
  _elapsed_s=0
  _event_ok=0

  log "Waiting for first event (up to ${_wait_max_s} s, CHECK_INTERVAL=60)..."
  # Probe immediately, then once per interval up to and including the
  # deadline; never sleep after the final probe.
  while :; do
    if rprobe "ls /opt/homelab/events/${TS_HOSTNAME}/*.json 2>/dev/null | head -1 | grep -q .json" 2>/dev/null; then
      _event_ok=1
      break
    fi
    if (( _elapsed_s >= _wait_max_s )); then
      break
    fi
    sleep "$_wait_step_s"
    _elapsed_s=$(( _elapsed_s + _wait_step_s ))
    log " ... ${_elapsed_s} s elapsed, waiting..."
  done

  if [[ "$_event_ok" -eq 1 ]]; then
    log "Verify OK: events present in /opt/homelab/events/${TS_HOSTNAME}/"
    rprobe "ls -lth /opt/homelab/events/${TS_HOSTNAME}/ | head -5" || true
  else
    # Missing events are a warning, not a failure: the container is up.
    warn "No events yet in /opt/homelab/events/${TS_HOSTNAME}/ after ${_wait_max_s} s — agent may still be initialising (CHECK_INTERVAL=60)"
    warn "Re-run verify manually: docker logs node-agent on ${TS_HOSTNAME}"
  fi
fi

log "[$STEP_NAME] done"