From adb84079ab1f1a2e7517b53ad4162e546f18139f Mon Sep 17 00:00:00 2001 From: Oskar Kapala Date: Mon, 8 Jun 2026 14:23:21 +0200 Subject: [PATCH] feat(onboard): add node onboarding scaffold (bash, idempotent) - scripts/onboard/onboard.sh: orchestrator with --node/--step/--from/--dry-run flags, deploy_autonomy + git_control gates, lexicographic step ordering - scripts/onboard/lib/common.sh: log/warn/die/step helpers, yaml_get (yq+grep/sed fallback), ensure_line, git() wrapper enforcing --no-pager - scripts/onboard/lib/remote.sh: rrun/rcopy/rsync_dir/rcheck SSH wrappers, dry-run aware - scripts/onboard/steps/00-preflight.sh: read-only fact collection (arch, RAM, disk, docker, tailscale, MagicMirror runtime, swap), human report + machine YAML snippet - scripts/onboard/steps/10-50: stub files with TODO headers, no mutations - hosts/lustro/node.yaml: LUSTRO edge node draft (KEN, role=edge, deploy_autonomy=true, git_control=false); hardware fields marked TODO for preflight population Co-Authored-By: Claude Sonnet 4.6 --- hosts/lustro/node.yaml | 26 +++ scripts/onboard/lib/common.sh | 64 ++++++ scripts/onboard/lib/remote.sh | 51 +++++ scripts/onboard/onboard.sh | 187 ++++++++++++++++++ scripts/onboard/steps/00-preflight.sh | 144 ++++++++++++++ scripts/onboard/steps/10-bootstrap-runtime.sh | 14 ++ scripts/onboard/steps/20-install-docker.sh | 16 ++ scripts/onboard/steps/30-install-tailscale.sh | 16 ++ scripts/onboard/steps/40-deploy-node-agent.sh | 16 ++ scripts/onboard/steps/50-verify.sh | 16 ++ 10 files changed, 550 insertions(+) create mode 100644 hosts/lustro/node.yaml create mode 100644 scripts/onboard/lib/common.sh create mode 100644 scripts/onboard/lib/remote.sh create mode 100755 scripts/onboard/onboard.sh create mode 100755 scripts/onboard/steps/00-preflight.sh create mode 100755 scripts/onboard/steps/10-bootstrap-runtime.sh create mode 100755 scripts/onboard/steps/20-install-docker.sh create mode 100755 scripts/onboard/steps/30-install-tailscale.sh create mode 
100755 scripts/onboard/steps/40-deploy-node-agent.sh
 create mode 100755 scripts/onboard/steps/50-verify.sh

diff --git a/hosts/lustro/node.yaml b/hosts/lustro/node.yaml
new file mode 100644
index 0000000..72cfaac
--- /dev/null
+++ b/hosts/lustro/node.yaml
@@ -0,0 +1,26 @@
+# hosts/lustro/node.yaml — LUSTRO edge node manifest
+# Run scripts/onboard/onboard.sh --node lustro --step 00-preflight
+# to auto-populate the TODO fields below.
+
+name: LUSTRO
+role: edge
+location: KEN
+
+ssh_user: oskar
+
+tailscale:
+  hostname: lustro
+  # ip: TODO — fill after tailscale join (step 30)
+
+deploy_autonomy: true # onboard.sh may run mutating steps autonomously
+git_control: false # node does NOT pull from Forgejo directly; push-based
+
+hardware:
+  arch: TODO # populated by 00-preflight (e.g. aarch64, x86_64)
+  ram_mb: TODO # populated by 00-preflight
+  swap: TODO # populated by 00-preflight (none / )
+  mm_runtime: TODO # populated by 00-preflight (systemd / pm2 / process / none)
+
+services:
+  node-agent:
+    runtime: TODO # populated by 40-deploy-node-agent (image tag, config hash)
diff --git a/scripts/onboard/lib/common.sh b/scripts/onboard/lib/common.sh
new file mode 100644
index 0000000..95142f3
--- /dev/null
+++ b/scripts/onboard/lib/common.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# scripts/onboard/lib/common.sh — shared helpers for the onboarding tool
+
+set -euo pipefail
+
+# ── colour codes (disabled when not a tty) ──────────────────────────────────
+if [[ -t 1 ]]; then
+  _C_RESET='\033[0m'
+  _C_GREEN='\033[0;32m'
+  _C_YELLOW='\033[1;33m'
+  _C_RED='\033[0;31m'
+  _C_CYAN='\033[0;36m'
+  _C_BOLD='\033[1m'
+else
+  _C_RESET='' _C_GREEN='' _C_YELLOW='' _C_RED='' _C_CYAN='' _C_BOLD=''
+fi
+
+# ── logging (log/step/dryrun → stdout; warn/die → stderr; die exits 1) ───────
+log() { echo -e "${_C_GREEN}[onboard]${_C_RESET} $(date +'%H:%M:%S') ${*}"; }
+warn() { echo -e "${_C_YELLOW}[WARN]${_C_RESET} $(date +'%H:%M:%S') ${*}" >&2; }
+die() { echo -e "${_C_RED}[ERROR]${_C_RESET} $(date +'%H:%M:%S') ${*}" >&2; exit 1; }
+step() { echo -e "${_C_CYAN}${_C_BOLD}==> ${*}${_C_RESET}"; }
+dryrun() { echo -e "${_C_YELLOW}[dry-run]${_C_RESET} ${*}"; }
+
+# ── command detection ─────────────────────────────────────────────────────────
+have_cmd() { command -v "$1" >/dev/null 2>&1; }
+
+# ── file helpers ──────────────────────────────────────────────────────────────
+# ensure_line FILE LINE — append LINE to FILE unless an identical whole line is already present (idempotent)
+ensure_line() {
+  local file="$1" line="$2"
+  [[ -f "$file" ]] || touch "$file"
+  grep -qxF -- "$line" "$file" || printf '%s\n' "$line" >> "$file" # "--": LINE may start with "-"
+}
+
+# ── node.yaml parsing ─────────────────────────────────────────────────────────
+# require_node_yaml NODE — sets and exports NODE_YAML (lower-cased NODE dir); exits if not found
+require_node_yaml() {
+  local node="$1"
+  NODE_YAML="${REPO_ROOT}/hosts/${node,,}/node.yaml"
+  [[ -f "$NODE_YAML" ]] || die "node.yaml not found: $NODE_YAML"
+  export NODE_YAML
+}
+
+# yaml_get FILE KEY — print a scalar value from a YAML file (prints nothing if the key is absent)
+# Uses yq when available; falls back to grep/sed for simple key: value pairs.
+# Dot-separated paths (e.g. tailscale.hostname) are resolved only in yq mode; the
+# fallback matches the last path component only and strips trailing " # comments".
+yaml_get() {
+  local file="$1" key="$2"
+  if have_cmd yq; then
+    yq -r ".${key} // empty" "$file" 2>/dev/null
+  else
+    # fallback: first line matching "<leaf>:"; "|| true" keeps a missing key from tripping set -e/pipefail in callers
+    local leaf="${key##*.}"
+    grep -E "^[[:space:]]*${leaf}:" "$file" | head -1 | sed -E 's/^[^:]*:[[:space:]]*//; s/[[:space:]]+#.*$//; s/[[:space:]]+$//' | tr -d "\"'" || true
+  fi
+}
+
+# ── git wrapper ────────────────────────────────────────────────────────────────
+# All git calls from onboarding scripts must go through this so --no-pager is
+# always set and there is no interactive output.
+git() { command git --no-pager "$@"; }
+export -f git
diff --git a/scripts/onboard/lib/remote.sh b/scripts/onboard/lib/remote.sh
new file mode 100644
index 0000000..92b4ecf
--- /dev/null
+++ b/scripts/onboard/lib/remote.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# scripts/onboard/lib/remote.sh — SSH helpers for remote node operations
+# Requires: ONBOARD_SSH_USER, ONBOARD_SSH_HOST set by the caller; dryrun() from lib/common.sh.
+# Inherits: DRY_RUN (boolean string "true"/"false"; defaults to "false")
+
+set -euo pipefail
+
+: "${ONBOARD_SSH_USER:?remote.sh: ONBOARD_SSH_USER is not set}"
+: "${ONBOARD_SSH_HOST:?remote.sh: ONBOARD_SSH_HOST is not set}"
+: "${DRY_RUN:=false}"
+
+_SSH_OPTS=(
+  -o StrictHostKeyChecking=accept-new
+  -o ConnectTimeout=10
+  -o BatchMode=yes
+)
+
+# rrun CMD [ARGS…] — run a command on the remote node via SSH (dry-run: only prints the command)
+rrun() {
+  if [[ "$DRY_RUN" == "true" ]]; then
+    dryrun "ssh ${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST} -- $*"
+    return 0
+  fi
+  ssh "${_SSH_OPTS[@]}" "${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}" -- "$@"
+}
+
+# rcopy LOCAL_PATH REMOTE_PATH — copy a file to the remote node via scp (dry-run: only prints the command)
+rcopy() {
+  local src="$1" dst="$2"
+  if [[ "$DRY_RUN" == "true" ]]; then
+    dryrun "scp $src ${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}:$dst"
+    return 0
+  fi
+  scp "${_SSH_OPTS[@]}" "$src" "${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}:$dst"
+}
+
+# rsync_dir LOCAL_DIR REMOTE_DIR [EXTRA_RSYNC_ARGS…] — rsync -az over SSH; extra args go before the paths
+rsync_dir() {
+  local src="$1" dst="$2"
+  shift 2
+  if [[ "$DRY_RUN" == "true" ]]; then
+    dryrun "rsync -az ${*:+$* }$src ${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}:$dst"
+    return 0
+  fi
+  rsync -az -e "ssh ${_SSH_OPTS[*]}" "$@" "$src" "${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}:$dst"
+}
+
+# rcheck — verify SSH connectivity (not gated by DRY_RUN); returns 0 if reachable. ssh keeps the FIRST value of an option, so the 5 s timeout must precede _SSH_OPTS.
+rcheck() {
+  ssh -o ConnectTimeout=5 "${_SSH_OPTS[@]}" "${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}" -- true 2>/dev/null
+}
diff --git a/scripts/onboard/onboard.sh b/scripts/onboard/onboard.sh
new file mode 100755
index 0000000..40fc363
--- /dev/null
+++ b/scripts/onboard/onboard.sh
@@ -0,0
+1,187 @@
+#!/usr/bin/env bash
+# scripts/onboard/onboard.sh — node onboarding orchestrator
+#
+# Usage:
+#   onboard.sh --node <name> [--step <step>] [--from <step>] [--dry-run]
+#
+# Flags:
+#   --node <name>   node name matching hosts/<name>/node.yaml (required)
+#   --step <step>   run only this step (e.g. 00-preflight)
+#   --from <step>   start from this step, run all subsequent steps
+#   --dry-run       print what would be done without mutating anything
+#
+# Steps run in lexicographic order from scripts/onboard/steps/.
+# Steps that require deploy_autonomy=true are skipped (with a warning) when
+# that flag is false in node.yaml. Steps that require git_control=true are
+# similarly gated.
+
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+STEPS_DIR="${REPO_ROOT}/scripts/onboard/steps"
+LIB_DIR="${REPO_ROOT}/scripts/onboard/lib"
+
+# ── load helpers ──────────────────────────────────────────────────────────────
+# shellcheck source=lib/common.sh
+source "${LIB_DIR}/common.sh"
+
+# ── defaults ──────────────────────────────────────────────────────────────────
+NODE_NAME=""
+ONLY_STEP=""
+FROM_STEP=""
+DRY_RUN="false"
+export DRY_RUN REPO_ROOT
+
+# ── argument parsing (usage prints help to stderr and exits 1) ────────────────
+usage() {
+  cat >&2 <<'EOF'
+Usage: onboard.sh --node <name> [--step <step>] [--from <step>] [--dry-run]
+
+  --node <name>   node name matching hosts/<name>/node.yaml (required)
+  --step <step>   run only this single step (e.g. 00-preflight)
+  --from <step>   start from this step, continue to end
+  --dry-run       no mutations; show what would run
+
+Examples:
+  onboard.sh --node lustro
+  onboard.sh --node lustro --step 00-preflight
+  onboard.sh --node lustro --from 20-install-docker
+  onboard.sh --node lustro --dry-run
+EOF
+  exit 1
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --node) NODE_NAME="${2:?--node requires a value}"; shift 2 ;;
+    --step) ONLY_STEP="${2:?--step requires a value}"; shift 2 ;;
+    --from) FROM_STEP="${2:?--from requires a value}"; shift 2 ;;
+    --dry-run) DRY_RUN="true"; shift ;;
+    -h|--help) usage ;;
+    *) die "Unknown argument: $1" ;;
+  esac
+done
+
+[[ -z "$NODE_NAME" ]] && { warn "--node is required"; usage; }
+
+export NODE_NAME
+
+# ── load node.yaml ────────────────────────────────────────────────────────────
+require_node_yaml "$NODE_NAME"
+
+log "Loading manifest: $NODE_YAML"
+
+DEPLOY_AUTONOMY=$(yaml_get "$NODE_YAML" "deploy_autonomy") || true # a failed lookup must not abort under set -e; default applied below
+GIT_CONTROL=$(yaml_get "$NODE_YAML" "git_control") || true # same: default applied below
+SSH_USER=$(yaml_get "$NODE_YAML" "ssh_user") || true # emptiness is reported by die below
+TS_HOSTNAME=$(yaml_get "$NODE_YAML" "tailscale.hostname") || true # emptiness is reported by die below
+
+DEPLOY_AUTONOMY="${DEPLOY_AUTONOMY:-false}"
+GIT_CONTROL="${GIT_CONTROL:-false}"
+
+[[ -z "$SSH_USER" ]] && die "ssh_user not set in $NODE_YAML"
+[[ -z "$TS_HOSTNAME" ]] && die "tailscale.hostname not set in $NODE_YAML"
+
+export ONBOARD_SSH_USER="$SSH_USER"
+export ONBOARD_SSH_HOST="$TS_HOSTNAME"
+
+log "Node: ${NODE_NAME} | host: ${TS_HOSTNAME} | user: ${SSH_USER}"
+log "deploy_autonomy=${DEPLOY_AUTONOMY} git_control=${GIT_CONTROL} dry_run=${DRY_RUN}"
+
+# ── collect steps ─────────────────────────────────────────────────────────────
+# Steps are NN-name.sh files in lexicographic order.
+mapfile -t ALL_STEPS < <(find "$STEPS_DIR" -maxdepth 1 -name '[0-9][0-9]-*.sh' | sort)
+
+if [[ ${#ALL_STEPS[@]} -eq 0 ]]; then
+  die "No steps found in $STEPS_DIR"
+fi
+
+# Determine which steps to run based on flags.
+declare -a STEPS_TO_RUN=()
+
+for step_path in "${ALL_STEPS[@]}"; do
+  step_file=$(basename "$step_path" .sh)
+
+  if [[ -n "$ONLY_STEP" ]]; then
+    # Exact match on the step name without its .sh suffix (e.g. "00-preflight" selects "00-preflight.sh")
+    [[ "$step_file" == "$ONLY_STEP" ]] || continue
+  elif [[ -n "$FROM_STEP" ]]; then
+    # Skip steps that sort before FROM_STEP (string comparison; FROM_STEP itself is kept)
+    [[ "$step_file" < "$FROM_STEP" && "$step_file" != "$FROM_STEP" ]] && continue
+  fi
+
+  STEPS_TO_RUN+=("$step_path")
+done
+
+if [[ ${#STEPS_TO_RUN[@]} -eq 0 ]]; then
+  die "No matching steps found (--step='${ONLY_STEP}' --from='${FROM_STEP}')"
+fi
+
+log "Steps to run (${#STEPS_TO_RUN[@]}):"
+for s in "${STEPS_TO_RUN[@]}"; do
+  printf " %s\n" "$(basename "$s")"
+done
+echo ""
+
+# ── step execution loop ───────────────────────────────────────────────────────
+# Steps numbered 10 and above are "mutating" and require deploy_autonomy=true.
+# Steps whose name contains "git", "repo" or "clone" require git_control=true.
+# Step 00-preflight is always allowed (read-only).
+
+_step_needs_autonomy() {
+  local num="${1%%[^0-9]*}" # leading digits of the step name/number
+  (( 10#${num:-0} >= 10 )) # force base 10 so "08"/"09" are not rejected as invalid octal
+}
+
+_step_needs_git_control() {
+  local name="$1"
+  [[ "$name" == *"git"* || "$name" == *"repo"* || "$name" == *"clone"* ]]
+}
+
+FAILED_STEPS=()
+
+for step_path in "${STEPS_TO_RUN[@]}"; do
+  step_file=$(basename "$step_path" .sh)
+  step_num="${step_file%%[^0-9]*}"
+
+  # autonomy gate: mutating steps are skipped unless the manifest allows them
+  if _step_needs_autonomy "$step_num" && [[ "$DEPLOY_AUTONOMY" != "true" ]]; then
+    warn "Skipping $step_file — deploy_autonomy=false in $NODE_YAML"
+    warn "Run this step manually or set deploy_autonomy: true"
+    continue
+  fi
+
+  # git_control gate: git/repo/clone steps are skipped unless the manifest allows them
+  if _step_needs_git_control "$step_file" && [[ "$GIT_CONTROL" != "true" ]]; then
+    warn "Skipping $step_file — git_control=false in $NODE_YAML"
+    continue
+  fi
+
+  step "Running: $step_file"
+
+  if [[ "$DRY_RUN" == "true" ]]; then
+    dryrun "Would execute: $step_path"
+    continue
+  fi
+
+  if bash -c 'source "$1"; source "$2"' "$step_file" "${LIB_DIR}/common.sh" "$step_path"; then # a child bash inherits no un-exported functions, so load common.sh before the step
+    log "$step_file — OK"
+  else
+    rc=$?
+    warn "$step_file — FAILED (exit $rc)"
+    FAILED_STEPS+=("$step_file")
+  fi
+
+  echo ""
+done
+
+# ── summary (a failed step does not stop later steps; failures are reported here) ──
+if [[ ${#FAILED_STEPS[@]} -gt 0 ]]; then
+  die "Onboarding finished with failures: ${FAILED_STEPS[*]}"
+fi
+
+if [[ "$DRY_RUN" == "true" ]]; then
+  log "Dry-run complete — no changes made."
+else
+  log "All steps completed successfully for node ${NODE_NAME}."
+fi
diff --git a/scripts/onboard/steps/00-preflight.sh b/scripts/onboard/steps/00-preflight.sh
new file mode 100755
index 0000000..6d70764
--- /dev/null
+++ b/scripts/onboard/steps/00-preflight.sh
@@ -0,0 +1,144 @@
+#!/usr/bin/env bash
+# scripts/onboard/steps/00-preflight.sh — READ-ONLY remote node discovery
+#
+# Collects facts from the remote node and prints:
+# 1. A human-readable report block
+# 2. A machine-readable YAML snippet ready to paste into hosts/<node>/node.yaml
+#
+# NO mutations are performed on the remote host.
+# Depends on: lib/common.sh (sourced by orchestrator), lib/remote.sh (sourced here) + +set -euo pipefail + +STEP_NAME="00-preflight" + +# remote.sh is sourced here so individual steps can also be run standalone +# (when REPO_ROOT is in the environment). +: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}" +# shellcheck source=../lib/remote.sh +source "${REPO_ROOT}/scripts/onboard/lib/remote.sh" + +step "[$STEP_NAME] Collecting facts from ${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST} (read-only)" + +# ── gather all facts in a single SSH session ────────────────────────────────── +raw=$(rrun bash -s <<'REMOTE' +set -euo pipefail + +# arch / bitness +arch=$(uname -m) +bits=$(getconf LONG_BIT) + +# RAM (kB → MB) +mem_kb=$(grep MemTotal /proc/meminfo | awk '{print $2}') +mem_mb=$(( mem_kb / 1024 )) + +# disk root +disk_root=$(df -h / | awk 'NR==2{print $2" total, "$3" used, "$4" free ("$5" used)"}') + +# docker +docker_present=false +docker_info="" +if command -v docker >/dev/null 2>&1; then + docker_present=true + docker_info=$(docker info --format '{{.ServerVersion}}' 2>/dev/null || echo "unknown") +fi + +# tailscale +tailscale_present=false +tailscale_status="" +if command -v tailscale >/dev/null 2>&1; then + tailscale_present=true + tailscale_status=$(tailscale status --json 2>/dev/null | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('BackendState','unknown'))" 2>/dev/null || tailscale status 2>/dev/null | head -1 || echo "unknown") +fi + +# Magic Mirror runtime detection +mm_runtime="none" +if systemctl is-active --quiet MagicMirror 2>/dev/null || systemctl is-active --quiet magicmirror 2>/dev/null; then + mm_runtime="systemd" +elif command -v pm2 >/dev/null 2>&1 && pm2 list 2>/dev/null | grep -qi "MagicMirror"; then + mm_runtime="pm2" +elif pgrep -fa "MagicMirror" >/dev/null 2>&1; then + mm_runtime="process" +fi + +# swap +swap_current="none" +if command -v swapon >/dev/null 2>&1; then + swap_lines=$(swapon --show --noheadings 2>/dev/null || 
true) + if [[ -n "$swap_lines" ]]; then + swap_current="$swap_lines" + fi +fi +if command -v zramctl >/dev/null 2>&1; then + zram_lines=$(zramctl --noheadings 2>/dev/null || true) + [[ -n "$zram_lines" ]] && swap_current="${swap_current:+$swap_current; }zram: $zram_lines" +fi + +# hostname / os +hostname=$(hostname -f 2>/dev/null || hostname) +os_pretty=$(grep PRETTY_NAME /etc/os-release 2>/dev/null | cut -d= -f2 | tr -d '"' || echo "unknown") + +cat < +# TODO: verify node appears in tailscale status within timeout +# TODO: gate on deploy_autonomy=true in node.yaml + +set -euo pipefail +: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}" +source "${REPO_ROOT}/scripts/onboard/lib/remote.sh" + +STEP_NAME="30-install-tailscale" +step "[$STEP_NAME] TODO — not yet implemented" diff --git a/scripts/onboard/steps/40-deploy-node-agent.sh b/scripts/onboard/steps/40-deploy-node-agent.sh new file mode 100755 index 0000000..54406be --- /dev/null +++ b/scripts/onboard/steps/40-deploy-node-agent.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# scripts/onboard/steps/40-deploy-node-agent.sh — deploy node-agent to remote node +# +# TODO: rsync services/node-agent/ and hosts//runtime/node-agent/ to remote +# TODO: populate /opt/homelab/config/node-agent/.env from env.example + operator-provided secrets +# TODO: docker compose up -d (or docker-compose for CHELSTY nodes using v1) +# TODO: wait for healthcheck to pass +# TODO: emit deployment_completed event via scripts/lib/events.sh +# TODO: gate on git_control flag — if false, skip and print manual instructions + +set -euo pipefail +: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}" +source "${REPO_ROOT}/scripts/onboard/lib/remote.sh" + +STEP_NAME="40-deploy-node-agent" +step "[$STEP_NAME] TODO — not yet implemented" diff --git a/scripts/onboard/steps/50-verify.sh b/scripts/onboard/steps/50-verify.sh new file mode 100755 index 0000000..28d5c5b --- /dev/null +++ b/scripts/onboard/steps/50-verify.sh @@ -0,0 +1,16 @@ 
+#!/usr/bin/env bash
+# scripts/onboard/steps/50-verify.sh — end-to-end verification of onboarded node
+#
+# TODO: rcheck SSH reachability
+# TODO: rrun docker ps — confirm node-agent container is running
+# TODO: check /opt/homelab/state/heartbeat timestamp is recent (< 5 min)
+# TODO: verify node appears in Observer world state (/opt/homelab/world/nodes.json on control node)
+# TODO: run services//healthcheck.sh for each enabled service
+# TODO: print pass/fail summary table; exit 1 if any check failed
+
+set -euo pipefail
+: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
+source "${REPO_ROOT}/scripts/onboard/lib/common.sh" # step() lives here; it is not inherited when this file runs as its own bash process
+source "${REPO_ROOT}/scripts/onboard/lib/remote.sh"
+STEP_NAME="50-verify"
+step "[$STEP_NAME] TODO — not yet implemented"