homelab-codex-ws/scripts/onboard/onboard.sh

#!/usr/bin/env bash
# scripts/onboard/onboard.sh — node onboarding orchestrator
#
# Usage:
#   onboard.sh --node <name> [--step <name>] [--from <step>] [--dry-run]
#
# Flags:
#   --node   <name>   node name matching hosts/<name>/node.yaml  (required)
#   --step   <name>   run only this step (e.g. 00-preflight)
#   --from   <step>   start from this step, run all subsequent steps
#   --dry-run         print what would be done without mutating anything
#
# Steps run in lexicographic order from scripts/onboard/steps/.
# Steps that require deploy_autonomy=true are skipped (with a warning) when
# that flag is false in node.yaml.  Steps that require git_control=true are
# similarly gated.

# Strict mode: stop on failing commands, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Repository root = two directories above the directory holding this script.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
STEPS_DIR="${REPO_ROOT}/scripts/onboard/steps"
LIB_DIR="${REPO_ROOT}/scripts/onboard/lib"

# ── load helpers ──────────────────────────────────────────────────────────────
# The helper commands used below (die, warn, log, ...) are not defined in this
# file; presumably they come from this library.
# shellcheck source=lib/common.sh
source "${LIB_DIR}/common.sh"

# ── defaults ──────────────────────────────────────────────────────────────────
# Filled in by the command-line parser further down.
NODE_NAME=""
ONLY_STEP=""
FROM_STEP=""

# Exported so the step scripts, which run as child bash processes, inherit them.
export DRY_RUN="false"
export REPO_ROOT

# ── argument parsing ──────────────────────────────────────────────────────────
# Print the command-line help on stderr and terminate the script with status 1.
# Takes no arguments and never returns to the caller.
usage() {
    # One printf argument per output line; '' produces the blank separator lines.
    printf '%s\n' \
        'Usage: onboard.sh --node <name> [--step <name>] [--from <step>] [--dry-run]' \
        '' \
        '  --node   <name>   node name matching hosts/<name>/node.yaml   (required)' \
        '  --step   <name>   run only this single step (e.g. 00-preflight)' \
        '  --from   <step>   start from this step, continue to end' \
        '  --dry-run         no mutations; show what would run' \
        '' \
        'Examples:' \
        '  onboard.sh --node lustro' \
        '  onboard.sh --node lustro --step 00-preflight' \
        '  onboard.sh --node lustro --from 20-install-docker' \
        '  onboard.sh --node lustro --dry-run' \
        >&2
    exit 1
}

# Walk the command line. Flags that take a value consume two words; the
# ${2:?…} expansion aborts the script with the given message when that value
# is missing or empty.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --node)    NODE_NAME="${2:?--node requires a value}";  shift 2 ;;
        --step)    ONLY_STEP="${2:?--step requires a value}";  shift 2 ;;
        --from)    FROM_STEP="${2:?--from requires a value}";  shift 2 ;;
        --dry-run) DRY_RUN="true";                            shift   ;;
        # Explicitly requested help is not an error. usage() always exits 1 and
        # writes to stderr, so run it in a subshell, route its text to stdout
        # and finish with status 0.
        -h|--help) (usage) 2>&1 || true; exit 0 ;;
        *) die "Unknown argument: $1" ;;
    esac
done

# A node name is mandatory; without it show the usage text and fail.
[[ -z "$NODE_NAME" ]] && { warn "--node is required"; usage; }

# Step selection lets --step take precedence over --from; tell the operator
# instead of silently dropping --from.
if [[ -n "$ONLY_STEP" && -n "$FROM_STEP" ]]; then
    warn "--step and --from both given; --from '${FROM_STEP}' is ignored"
fi

# Exported so child step scripts inherit the node name.
export NODE_NAME

# ── load node.yaml ────────────────────────────────────────────────────────────
# Validate the node manifest. NODE_YAML is never assigned in this script, so
# require_node_yaml is presumably what sets it — confirm in lib/common.sh.
require_node_yaml "$NODE_NAME"

log "Loading manifest: $NODE_YAML"

# Read the four manifest keys the orchestrator needs.
DEPLOY_AUTONOMY="$(yaml_get "$NODE_YAML" "deploy_autonomy")"
GIT_CONTROL="$(yaml_get "$NODE_YAML" "git_control")"
SSH_USER="$(yaml_get "$NODE_YAML" "ssh_user")"
TS_HOSTNAME="$(yaml_get "$NODE_YAML" "tailscale.hostname")"

# Both gate flags fall back to "false" when the lookup produced an empty value.
: "${DEPLOY_AUTONOMY:=false}"
: "${GIT_CONTROL:=false}"

# The SSH user and the Tailscale hostname have no fallback: abort when empty.
[[ -n "$SSH_USER"    ]] || die "ssh_user not set in $NODE_YAML"
[[ -n "$TS_HOSTNAME" ]] || die "tailscale.hostname not set in $NODE_YAML"

# Connection details are handed to the step scripts through the environment.
ONBOARD_SSH_USER="$SSH_USER"
ONBOARD_SSH_HOST="$TS_HOSTNAME"
export ONBOARD_SSH_USER ONBOARD_SSH_HOST

log "Node: ${NODE_NAME} | host: ${TS_HOSTNAME} | user: ${SSH_USER}"
log "deploy_autonomy=${DEPLOY_AUTONOMY}  git_control=${GIT_CONTROL}  dry_run=${DRY_RUN}"

# ── collect steps ─────────────────────────────────────────────────────────────
# Step scripts are the NN-name.sh entries directly inside STEPS_DIR, in the
# order produced by sort(1).
mapfile -t ALL_STEPS < <(find "$STEPS_DIR" -maxdepth 1 -name '[0-9][0-9]-*.sh' | sort)

(( ${#ALL_STEPS[@]} > 0 )) || die "No steps found in $STEPS_DIR"

# Narrow the full list down according to --step / --from.
declare -a STEPS_TO_RUN=()

for step_path in "${ALL_STEPS[@]}"; do
    # Step name = file name without its directory and without the .sh suffix.
    step_file="${step_path##*/}"
    step_file="${step_file%.sh}"

    if [[ -n "$ONLY_STEP" ]]; then
        # --step: exact name match (e.g. "00-preflight" selects 00-preflight.sh).
        # When --step is set, --from is not consulted at all.
        [[ "$step_file" == "$ONLY_STEP" ]] || continue
    elif [[ -n "$FROM_STEP" && "$step_file" < "$FROM_STEP" ]]; then
        # --from: drop every step whose name sorts strictly before FROM_STEP.
        continue
    fi

    STEPS_TO_RUN+=("$step_path")
done

(( ${#STEPS_TO_RUN[@]} > 0 )) \
    || die "No matching steps found (--step='${ONLY_STEP}' --from='${FROM_STEP}')"

# Show the operator the execution plan before anything runs.
log "Steps to run (${#STEPS_TO_RUN[@]}):"
for s in "${STEPS_TO_RUN[@]}"; do
    printf '    %s\n' "${s##*/}"
done
printf '\n'

# ── step execution loop ───────────────────────────────────────────────────────
# Steps numbered 10 or higher are treated as "mutating" and require deploy_autonomy=true.
# Steps whose name contains "git", "repo" or "clone" (whatever their number)
# additionally require git_control=true.
# Steps numbered below 10, such as 00-preflight (read-only), bypass the autonomy gate.

# Decide whether a step is subject to the deploy_autonomy gate.
# Arguments: $1 - step number or step name; only its leading digits are used.
# Returns:   0 when the leading number is 10 or greater, 1 otherwise
#            (including when $1 has no leading digits).
_step_needs_autonomy() {
    local num="${1-}"
    num="${num%%[^0-9]*}"   # leading digits

    # No leading digits: nothing to compare, so the step is not gated.
    [[ -n "$num" ]] || return 1

    # Force base 10. A bare leading zero makes bash read the value as octal,
    # which turns "08"/"09" into arithmetic errors and "010" into 8.
    (( 10#$num >= 10 ))
}

# Decide whether a step is subject to the git_control gate.
# Arguments: $1 - step name
# Returns:   0 when the name contains "git", "repo" or "clone" (case-sensitive),
#            1 otherwise.
_step_needs_git_control() {
    case "$1" in
        *git* | *repo* | *clone*) return 0 ;;
        *) return 1 ;;
    esac
}

# Step names (without .sh) whose script exited non-zero.
FAILED_STEPS=()
# Step names that were not executed because a node.yaml gate was closed.
SKIPPED_STEPS=()

for step_path in "${STEPS_TO_RUN[@]}"; do
    step_file=$(basename "$step_path" .sh)
    step_num="${step_file%%[^0-9]*}"   # leading digits of the step name

    # autonomy gate
    if _step_needs_autonomy "$step_num" && [[ "$DEPLOY_AUTONOMY" != "true" ]]; then
        warn "Skipping $step_file — deploy_autonomy=false in $NODE_YAML"
        warn "Run this step manually or set deploy_autonomy: true"
        SKIPPED_STEPS+=("$step_file")
        continue
    fi

    # git_control gate
    if _step_needs_git_control "$step_file" && [[ "$GIT_CONTROL" != "true" ]]; then
        warn "Skipping $step_file — git_control=false in $NODE_YAML"
        SKIPPED_STEPS+=("$step_file")
        continue
    fi

    step "Running: $step_file"

    # Dry-run: announce the script but never execute it.
    if [[ "$DRY_RUN" == "true" ]]; then
        dryrun "Would execute: $step_path"
        continue
    fi

    # Each step runs in its own bash process. A failing step is recorded and
    # the loop moves on, so all failures are reported together in the summary.
    if bash "$step_path"; then
        log "$step_file — OK"
    else
        rc=$?
        warn "$step_file — FAILED (exit $rc)"
        FAILED_STEPS+=("$step_file")
    fi

    echo ""
done

# ── summary ───────────────────────────────────────────────────────────────────
# Gated steps are reported explicitly so that a run which skipped work is never
# presented as a complete onboarding.
if [[ ${#SKIPPED_STEPS[@]} -gt 0 ]]; then
    warn "Skipped ${#SKIPPED_STEPS[@]} gated step(s): ${SKIPPED_STEPS[*]}"
fi

# Any failed step makes the whole run fail.
if [[ ${#FAILED_STEPS[@]} -gt 0 ]]; then
    die "Onboarding finished with failures: ${FAILED_STEPS[*]}"
fi

if [[ "$DRY_RUN" == "true" ]]; then
    log "Dry-run complete — no changes made."
elif [[ ${#SKIPPED_STEPS[@]} -gt 0 ]]; then
    log "Onboarding incomplete for node ${NODE_NAME}: ${#SKIPPED_STEPS[@]} gated step(s) were not run."
else
    log "All steps completed successfully for node ${NODE_NAME}."
fi