DRY_RUN now uses 1/0 instead of "true"/"false" across all onboard scripts.
common.sh: add run() — wraps mutations; prints "[dry-run] would: ..." when
DRY_RUN=1. Exported via `export -f run` so child bash processes inherit it.
onboard.sh: remove the `--dry-run → dryrun "Would execute" → continue` bypass.
Steps now always execute; DRY_RUN=1 is exported so each step's own run()
calls handle simulation. The orchestrator no longer needs to know step internals.
remote.sh: update DRY_RUN checks to [ "${DRY_RUN:-0}" = 1 ] for consistency.
00-access.sh: remove all if/else DRY_RUN blocks; replace with:
- Mutations (ssh-copy-id, curl install, tailscale up) wrapped in run()
- Probes (SSH BatchMode test, command -v, _ts_state) run unconditionally
so dry-run reports real current state ("key present → skip" vs "would: ...")
- Stage 3 verify always runs; an SSH failure is fatal (die) in live mode but
only a warning in dry-run (Tailscale not yet joined is expected on a fresh node)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
183 lines
6.7 KiB
Bash
Executable file
183 lines
6.7 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# scripts/onboard/onboard.sh — node onboarding orchestrator
|
|
#
|
|
# Usage:
|
|
# onboard.sh --node <name> [--step <name>] [--from <step>] [--dry-run]
|
|
#
|
|
# Flags:
|
|
# --node <name> node name matching hosts/<name>/node.yaml (required)
|
|
# --step <name> run only this step (e.g. 00-preflight)
|
|
# --from <step> start from this step, run all subsequent steps
|
|
# --dry-run print what would be done without mutating anything
|
|
#
|
|
# Steps run in lexicographic order from scripts/onboard/steps/.
|
|
# Steps that require deploy_autonomy=true are skipped (with a warning) when
|
|
# that flag is false in node.yaml. Steps that require git_control=true are
|
|
# similarly gated.
|
|
|
|
# Strict mode: abort on command errors, unset variables, and failed pipeline
# stages.
set -o errexit -o nounset -o pipefail

# The repository root sits two directories above this script's own directory.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
STEPS_DIR="${REPO_ROOT}/scripts/onboard/steps"
LIB_DIR="${REPO_ROOT}/scripts/onboard/lib"

# ── load helpers ──────────────────────────────────────────────────────────────
# shellcheck source=lib/common.sh
. "${LIB_DIR}/common.sh"

# ── defaults ──────────────────────────────────────────────────────────────────
# Placeholders that the command-line flags may overwrite.
NODE_NAME="" ONLY_STEP="" FROM_STEP=""

# DRY_RUN (0 = live, 1 = simulate) and REPO_ROOT are exported so that step
# scripts started as child processes can read them.
export DRY_RUN=0
export REPO_ROOT
|
|
|
|
# ── argument parsing ──────────────────────────────────────────────────────────
|
|
# Print the command-line help to stderr and terminate the script.
# Arguments: none
# Outputs:   help text on stderr
# Returns:   never returns; exits with status 1
usage() {
  local -a help_lines=(
    'Usage: onboard.sh --node <name> [--step <name>] [--from <step>] [--dry-run]'
    ''
    '--node <name> node name matching hosts/<name>/node.yaml (required)'
    '--step <name> run only this single step (e.g. 00-preflight)'
    '--from <step> start from this step, continue to end'
    '--dry-run no mutations; show what would run'
    ''
    'Examples:'
    'onboard.sh --node lustro'
    'onboard.sh --node lustro --step 00-preflight'
    'onboard.sh --node lustro --from 20-install-docker'
    'onboard.sh --node lustro --dry-run'
  )

  printf '%s\n' "${help_lines[@]}" >&2
  exit 1
}
|
|
|
|
# Consume flags from left to right. Flags that take a value abort with the
# given message when that value is missing or empty.
while (( $# > 0 )); do
  case "$1" in
    --node)
      NODE_NAME="${2:?--node requires a value}"
      shift 2
      ;;
    --step)
      ONLY_STEP="${2:?--step requires a value}"
      shift 2
      ;;
    --from)
      FROM_STEP="${2:?--from requires a value}"
      shift 2
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h | --help)
      usage
      ;;
    *)
      die "Unknown argument: $1"
      ;;
  esac
done

# --node is mandatory: warn, then show the help text (usage exits the script).
if [[ -z "$NODE_NAME" ]]; then
  warn "--node is required"
  usage
fi

export NODE_NAME
|
|
|
|
# ── load node.yaml ────────────────────────────────────────────────────────────
|
|
# NOTE(review): NODE_YAML is not assigned in this script; presumably
# require_node_yaml sets it — confirm in lib/common.sh.
require_node_yaml "$NODE_NAME"

log "Loading manifest: $NODE_YAML"

# Read the manifest fields the orchestrator needs.
DEPLOY_AUTONOMY="$(yaml_get "$NODE_YAML" "deploy_autonomy")"
GIT_CONTROL="$(yaml_get "$NODE_YAML" "git_control")"
SSH_USER="$(yaml_get "$NODE_YAML" "ssh_user")"
TS_HOSTNAME="$(yaml_get "$NODE_YAML" "tailscale.hostname")"

# Both gate flags fall back to "false" when the lookup yields an empty value.
: "${DEPLOY_AUTONOMY:=false}" "${GIT_CONTROL:=false}"

# The connection details have no fallback; an empty value is fatal.
if [[ -z "$SSH_USER" ]]; then
  die "ssh_user not set in $NODE_YAML"
fi
if [[ -z "$TS_HOSTNAME" ]]; then
  die "tailscale.hostname not set in $NODE_YAML"
fi

# Exported for the step scripts, which run as child processes.
export ONBOARD_SSH_USER="$SSH_USER" ONBOARD_SSH_HOST="$TS_HOSTNAME"

log "Node: ${NODE_NAME} | host: ${TS_HOSTNAME} | user: ${SSH_USER}"
log "deploy_autonomy=${DEPLOY_AUTONOMY} git_control=${GIT_CONTROL} dry_run=${DRY_RUN}"
|
|
|
|
# ── collect steps ─────────────────────────────────────────────────────────────
|
|
# Steps are NN-name.sh files in lexicographic order.
|
|
# Gather every NN-name.sh file directly inside STEPS_DIR, sorted.
mapfile -t ALL_STEPS < <(
  find "$STEPS_DIR" -maxdepth 1 -name '[0-9][0-9]-*.sh' | sort
)

if (( ${#ALL_STEPS[@]} == 0 )); then
  die "No steps found in $STEPS_DIR"
fi

# Narrow the full list according to --step / --from. --step wins when both
# flags are given, because the --from branch is only reached when ONLY_STEP
# is empty.
declare -a STEPS_TO_RUN=()

for step_path in "${ALL_STEPS[@]}"; do
  step_file=$(basename "$step_path" .sh)

  if [[ -n "$ONLY_STEP" ]]; then
    # Keep only the step whose name (without .sh) equals ONLY_STEP exactly.
    if [[ "$step_file" != "$ONLY_STEP" ]]; then
      continue
    fi
  elif [[ -n "$FROM_STEP" ]]; then
    # Drop every step that sorts strictly before FROM_STEP.
    if [[ "$step_file" < "$FROM_STEP" ]]; then
      continue
    fi
  fi

  STEPS_TO_RUN+=("$step_path")
done

if (( ${#STEPS_TO_RUN[@]} == 0 )); then
  die "No matching steps found (--step='${ONLY_STEP}' --from='${FROM_STEP}')"
fi

# Show the plan before anything runs.
log "Steps to run (${#STEPS_TO_RUN[@]}):"
for selected in "${STEPS_TO_RUN[@]}"; do
  printf " %s\n" "${selected##*/}"
done
printf '\n'
|
|
|
# ── step execution loop ───────────────────────────────────────────────────────
|
|
# Steps numbered 10 or higher are treated as "mutating" and require
# deploy_autonomy=true.
# Steps whose name contains "git", "repo", or "clone" (git/repo sync) require
# git_control=true, whatever their number.
# Steps numbered below 10, such as 00-preflight (read-only), skip the autonomy
# gate.
|
|
|
|
# Succeed when a step's numeric prefix is 10 or higher.
# Arguments: $1 - step name or number; only its leading digits are considered
# Returns:   0 if the leading number is >= 10; 1 otherwise, including when $1
#            does not start with a digit
_step_needs_autonomy() {
  local num="${1%%[^0-9]*}" # leading digits

  # No numeric prefix at all: nothing to compare.
  [[ -n "$num" ]] || return 1

  # 10# forces base 10, so zero-padded prefixes such as "08" or "09" are not
  # rejected as invalid octal and "010" is read as ten rather than eight.
  (( 10#$num >= 10 ))
}
|
|
|
|
# Tell whether a step's name marks it as subject to the git_control gate.
# Arguments: $1 - step name
# Returns:   0 if the name contains "git", "repo", or "clone"; 1 otherwise
_step_needs_git_control() {
  case "$1" in
    *git* | *repo* | *clone*) return 0 ;;
    *) return 1 ;;
  esac
}
|
|
|
|
# Names of the steps that exited non-zero; read by the summary afterwards.
FAILED_STEPS=()

for step_path in "${STEPS_TO_RUN[@]}"; do
  step_file="${step_path##*/}"
  step_file="${step_file%.sh}"
  step_num="${step_file%%[^0-9]*}"

  # autonomy gate
  if [[ "$DEPLOY_AUTONOMY" != "true" ]] && _step_needs_autonomy "$step_num"; then
    warn "Skipping $step_file — deploy_autonomy=false in $NODE_YAML"
    warn "Run this step manually or set deploy_autonomy: true"
    continue
  fi

  # git_control gate
  if [[ "$GIT_CONTROL" != "true" ]] && _step_needs_git_control "$step_file"; then
    warn "Skipping $step_file — git_control=false in $NODE_YAML"
    continue
  fi

  step "Running: $step_file"

  # Run the step in its own bash process. A failure is recorded and the loop
  # moves on to the next step instead of aborting the run.
  rc=0
  bash "$step_path" || rc=$?

  if (( rc != 0 )); then
    warn "$step_file — FAILED (exit $rc)"
    FAILED_STEPS+=("$step_file")
  else
    log "$step_file — OK"
  fi

  printf '\n'
done
|
|
|
|
# ── summary ───────────────────────────────────────────────────────────────────
|
|
# Report any recorded step failures through die.
if (( ${#FAILED_STEPS[@]} > 0 )); then
  die "Onboarding finished with failures: ${FAILED_STEPS[*]}"
fi

# Closing message depends on whether this was a dry-run (DRY_RUN equal to 1).
case "${DRY_RUN:-0}" in
  1) log "Dry-run complete — no mutations performed." ;;
  *) log "All steps completed successfully for node ${NODE_NAME}." ;;
esac
|