#!/usr/bin/env bash
# scripts/onboard/onboard.sh — node onboarding orchestrator
#
# Usage:
#   onboard.sh --node <name> [--step <step>] [--from <step>] [--dry-run]
#
# Flags:
#   --node <name>   node name matching hosts/<name>/node.yaml (required)
#   --step <step>   run only this step (e.g. 00-preflight)
#   --from <step>   start from this step, run all subsequent steps
#   --dry-run       print what would be done without mutating anything
#
# Steps run in lexicographic order from scripts/onboard/steps/.
# Steps that require deploy_autonomy=true are skipped (with a warning) when
# that flag is false in node.yaml. Steps that require git_control=true are
# similarly gated.
#
# Helpers used below (die, warn, log, step, yaml_get, require_node_yaml) are
# not defined in this file; they are expected to come from lib/common.sh.

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
STEPS_DIR="${REPO_ROOT}/scripts/onboard/steps"
LIB_DIR="${REPO_ROOT}/scripts/onboard/lib"

# ── load helpers ──────────────────────────────────────────────────────────────
# shellcheck source=lib/common.sh
source "${LIB_DIR}/common.sh"

# ── defaults ──────────────────────────────────────────────────────────────────
NODE_NAME=""
ONLY_STEP=""
FROM_STEP=""
DRY_RUN=0

# Exported so that step scripts (run as child bash processes) can read them.
# DRY_RUN keeps its export attribute when reassigned during argument parsing.
export DRY_RUN REPO_ROOT

# ── argument parsing ──────────────────────────────────────────────────────────

#######################################
# Print the usage text to stderr and exit.
# Arguments: $1 - exit status (optional, default 1)
# Outputs:   usage text on stderr
#######################################
usage() {
  cat >&2 <<'EOF'
Usage: onboard.sh --node <name> [--step <step>] [--from <step>] [--dry-run]

  --node <name>    node name matching hosts/<name>/node.yaml (required)
  --step <step>    run only this single step (e.g. 00-preflight)
  --from <step>    start from this step, continue to end
  --dry-run        no mutations; show what would run

Examples:
  onboard.sh --node lustro
  onboard.sh --node lustro --step 00-preflight
  onboard.sh --node lustro --from 20-install-docker
  onboard.sh --node lustro --dry-run
EOF
  exit "${1:-1}"
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --node)
      NODE_NAME="${2:?--node requires a value}"
      shift 2
      ;;
    --step)
      ONLY_STEP="${2:?--step requires a value}"
      shift 2
      ;;
    --from)
      FROM_STEP="${2:?--from requires a value}"
      shift 2
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h | --help)
      # An explicit help request is not an error.
      usage 0
      ;;
    *)
      die "Unknown argument: $1"
      ;;
  esac
done

if [[ -z "$NODE_NAME" ]]; then
  warn "--node is required"
  usage
fi
export NODE_NAME

# ── load node.yaml ────────────────────────────────────────────────────────────
# NOTE(review): NODE_YAML is not assigned in this file; presumably
# require_node_yaml sets it — confirm against lib/common.sh.
require_node_yaml "$NODE_NAME"
log "Loading manifest: $NODE_YAML"

DEPLOY_AUTONOMY=$(yaml_get "$NODE_YAML" "deploy_autonomy")
GIT_CONTROL=$(yaml_get "$NODE_YAML" "git_control")
SSH_USER=$(yaml_get "$NODE_YAML" "ssh_user")
TS_HOSTNAME=$(yaml_get "$NODE_YAML" "tailscale.hostname")

# Both gates default to "false" when the lookup yields an empty value.
DEPLOY_AUTONOMY="${DEPLOY_AUTONOMY:-false}"
GIT_CONTROL="${GIT_CONTROL:-false}"

if [[ -z "$SSH_USER" ]]; then
  die "ssh_user not set in $NODE_YAML"
fi
if [[ -z "$TS_HOSTNAME" ]]; then
  die "tailscale.hostname not set in $NODE_YAML"
fi

export ONBOARD_SSH_USER="$SSH_USER"
export ONBOARD_SSH_HOST="$TS_HOSTNAME"

log "Node: ${NODE_NAME} | host: ${TS_HOSTNAME} | user: ${SSH_USER}"
log "deploy_autonomy=${DEPLOY_AUTONOMY} git_control=${GIT_CONTROL} dry_run=${DRY_RUN}"

# ── collect steps ─────────────────────────────────────────────────────────────
# Steps are NN-name.sh files in lexicographic order.
mapfile -t ALL_STEPS < <(find "$STEPS_DIR" -maxdepth 1 -name '[0-9][0-9]-*.sh' | sort)

if [[ ${#ALL_STEPS[@]} -eq 0 ]]; then
  die "No steps found in $STEPS_DIR"
fi

# Determine which steps to run based on flags. --step takes precedence over
# --from when both are given.
declare -a STEPS_TO_RUN=()
for step_path in "${ALL_STEPS[@]}"; do
  step_file=$(basename "$step_path" .sh)
  if [[ -n "$ONLY_STEP" ]]; then
    # Exact match on the step name without its .sh suffix
    # (e.g. "00-preflight" selects "00-preflight.sh").
    if [[ "$step_file" != "$ONLY_STEP" ]]; then
      continue
    fi
  elif [[ -n "$FROM_STEP" ]]; then
    # Skip steps that sort strictly before FROM_STEP. FROM_STEP need not name
    # an existing step: selection starts at the first step that does not sort
    # before it.
    if [[ "$step_file" < "$FROM_STEP" ]]; then
      continue
    fi
  fi
  STEPS_TO_RUN+=("$step_path")
done

if [[ ${#STEPS_TO_RUN[@]} -eq 0 ]]; then
  die "No matching steps found (--step='${ONLY_STEP}' --from='${FROM_STEP}')"
fi

log "Steps to run (${#STEPS_TO_RUN[@]}):"
for s in "${STEPS_TO_RUN[@]}"; do
  printf "  %s\n" "$(basename "$s")"
done
echo ""

# ── step execution loop ───────────────────────────────────────────────────────
# Steps whose numeric prefix is 10 or higher are treated as "mutating" and
# require deploy_autonomy=true; lower-numbered steps (e.g. 00-preflight) are
# always allowed.
# git_control gating is by name only: any step whose name contains "git",
# "repo" or "clone" requires git_control=true, whatever its number.

#######################################
# Decide whether a step is gated on deploy_autonomy.
# Arguments: $1 - step name or its numeric prefix (e.g. "20-install-docker")
# Returns:   0 if the leading digits, read as decimal, are >= 10;
#            non-zero otherwise (including when there are no leading digits)
#######################################
_step_needs_autonomy() {
  local num="${1%%[^0-9]*}" # leading digits
  if [[ -z "$num" ]]; then
    return 1
  fi
  # Force base 10: a bare leading zero would make bash parse "08"/"09" as
  # invalid octal constants.
  ((10#$num >= 10))
}

#######################################
# Decide whether a step is gated on git_control.
# Arguments: $1 - step name
# Returns:   0 if the name contains "git", "repo" or "clone"; 1 otherwise
#######################################
_step_needs_git_control() {
  local name="$1"
  [[ "$name" == *"git"* || "$name" == *"repo"* || "$name" == *"clone"* ]]
}

FAILED_STEPS=()

for step_path in "${STEPS_TO_RUN[@]}"; do
  step_file=$(basename "$step_path" .sh)
  step_num="${step_file%%[^0-9]*}"

  # autonomy gate
  if _step_needs_autonomy "$step_num" && [[ "$DEPLOY_AUTONOMY" != "true" ]]; then
    warn "Skipping $step_file — deploy_autonomy=false in $NODE_YAML"
    warn "Run this step manually or set deploy_autonomy: true"
    continue
  fi

  # git_control gate
  if _step_needs_git_control "$step_file" && [[ "$GIT_CONTROL" != "true" ]]; then
    warn "Skipping $step_file — git_control=false in $NODE_YAML"
    continue
  fi

  step "Running: $step_file"

  # A failing step is recorded and the loop carries on with the next step;
  # failures are reported together in the summary below.
  rc=0
  bash "$step_path" || rc=$?
  if [[ "$rc" -eq 0 ]]; then
    log "$step_file — OK"
  else
    warn "$step_file — FAILED (exit $rc)"
    FAILED_STEPS+=("$step_file")
  fi
  echo ""
done

# ── summary ───────────────────────────────────────────────────────────────────
if [[ ${#FAILED_STEPS[@]} -gt 0 ]]; then
  die "Onboarding finished with failures: ${FAILED_STEPS[*]}"
fi

if [[ "${DRY_RUN:-0}" == 1 ]]; then
  log "Dry-run complete — no mutations performed."
else
  log "All steps completed successfully for node ${NODE_NAME}."
fi