homelab-codex-ws/scripts/onboard/onboard.sh
Oskar Kapala 931fd46e62 fix(onboard): propagate dry-run into steps via run() helper
DRY_RUN now uses 1/0 instead of "true"/"false" across all onboard scripts.

common.sh: add run() — wraps mutations; prints "[dry-run] would: ..." when
  DRY_RUN=1. Exported via `export -f run` so child bash processes inherit it.

onboard.sh: remove the `--dry-run → dryrun "Would execute" → continue` bypass.
  Steps now always execute; DRY_RUN=1 is exported so each step's own run()
  calls handle simulation. The orchestrator no longer needs to know step internals.

remote.sh: update DRY_RUN checks to [ "${DRY_RUN:-0}" = 1 ] for consistency.

00-access.sh: remove all if/else DRY_RUN blocks; replace with:
  - Mutations (ssh-copy-id, curl install, tailscale up) wrapped in run()
  - Probes (SSH BatchMode test, command -v, _ts_state) run unconditionally
    so dry-run reports real current state ("key present → skip" vs "would: ...")
  - Stage 3 verify runs always; SSH failure is die in live mode, warn in
    dry-run (Tailscale not yet joined is expected on a fresh node)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-08 15:01:09 +02:00

183 lines
6.7 KiB
Bash
Executable file

#!/usr/bin/env bash
# scripts/onboard/onboard.sh — node onboarding orchestrator
#
# Usage:
# onboard.sh --node <name> [--step <name>] [--from <step>] [--dry-run]
#
# Flags:
# --node <name> node name matching hosts/<name>/node.yaml (required)
# --step <name> run only this step (e.g. 00-preflight)
# --from <step> start from this step, run all subsequent steps
# --dry-run print what would be done without mutating anything
#
# Steps run in lexicographic order from scripts/onboard/steps/.
# Steps that require deploy_autonomy=true are skipped (with a warning) when
# that flag is false in node.yaml. Steps that require git_control=true are
# similarly gated.
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# The repository root is two directories above the directory holding this script.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
STEPS_DIR="$REPO_ROOT/scripts/onboard/steps"
LIB_DIR="$REPO_ROOT/scripts/onboard/lib"

# ── load helpers ──────────────────────────────────────────────────────────────
# shellcheck source=lib/common.sh
. "$LIB_DIR/common.sh"

# ── defaults ──────────────────────────────────────────────────────────────────
# Empty means "flag not given"; DRY_RUN is 0 (live) or 1 (simulate).
NODE_NAME=""
ONLY_STEP=""
FROM_STEP=""
DRY_RUN=0

# Exported so the step scripts launched as child processes can read them.
export DRY_RUN
export REPO_ROOT
# ── argument parsing ──────────────────────────────────────────────────────────
# usage — write the command-line help to stderr and terminate the script.
#
# Arguments: none
# Outputs:   the help text on stderr (nothing on stdout)
# Returns:   never returns; exits with status 1
usage() {
  printf '%s\n' \
    'Usage: onboard.sh --node <name> [--step <name>] [--from <step>] [--dry-run]' \
    '--node <name> node name matching hosts/<name>/node.yaml (required)' \
    '--step <name> run only this single step (e.g. 00-preflight)' \
    '--from <step> start from this step, continue to end' \
    '--dry-run no mutations; show what would run' \
    'Examples:' \
    'onboard.sh --node lustro' \
    'onboard.sh --node lustro --step 00-preflight' \
    'onboard.sh --node lustro --from 20-install-docker' \
    'onboard.sh --node lustro --dry-run' \
    >&2
  exit 1
}
# Consume the command line left to right. Value-taking flags abort with a
# message when their value is missing or empty (via the ${2:?...} expansion).
while (( $# > 0 )); do
  case "$1" in
    --node)
      NODE_NAME="${2:?--node requires a value}"
      shift 2
      ;;
    --step)
      ONLY_STEP="${2:?--step requires a value}"
      shift 2
      ;;
    --from)
      FROM_STEP="${2:?--from requires a value}"
      shift 2
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    --help | -h)
      usage
      ;;
    *)
      die "Unknown argument: $1"
      ;;
  esac
done

# --node is the only mandatory flag; without it, print the help and stop.
if [[ -z "$NODE_NAME" ]]; then
  warn "--node is required"
  usage
fi
export NODE_NAME
# ── load node.yaml ────────────────────────────────────────────────────────────
# NODE_YAML is not assigned in this script; presumably require_node_yaml
# (from common.sh) sets it — confirm there.
require_node_yaml "$NODE_NAME"
log "Loading manifest: $NODE_YAML"

# Read the manifest fields this orchestrator needs.
DEPLOY_AUTONOMY=$(yaml_get "$NODE_YAML" "deploy_autonomy")
GIT_CONTROL=$(yaml_get "$NODE_YAML" "git_control")
SSH_USER=$(yaml_get "$NODE_YAML" "ssh_user")
TS_HOSTNAME=$(yaml_get "$NODE_YAML" "tailscale.hostname")

# Both gates default to "false" when the manifest leaves them empty.
: "${DEPLOY_AUTONOMY:=false}"
: "${GIT_CONTROL:=false}"

# The SSH user and Tailscale hostname are mandatory.
[[ -n "$SSH_USER" ]] || die "ssh_user not set in $NODE_YAML"
[[ -n "$TS_HOSTNAME" ]] || die "tailscale.hostname not set in $NODE_YAML"

# Hand the connection details to the step scripts through the environment.
ONBOARD_SSH_USER="$SSH_USER"
ONBOARD_SSH_HOST="$TS_HOSTNAME"
export ONBOARD_SSH_USER ONBOARD_SSH_HOST

log "Node: ${NODE_NAME} | host: ${TS_HOSTNAME} | user: ${SSH_USER}"
log "deploy_autonomy=${DEPLOY_AUTONOMY} git_control=${GIT_CONTROL} dry_run=${DRY_RUN}"
# ── collect steps ─────────────────────────────────────────────────────────────
# Every NN-name.sh entry directly inside STEPS_DIR is a step; `sort` fixes the
# execution order.
ALL_STEPS=()
while IFS= read -r step_candidate; do
  ALL_STEPS+=("$step_candidate")
done < <(find "$STEPS_DIR" -maxdepth 1 -name '[0-9][0-9]-*.sh' | sort)

# An empty list means the steps directory is missing or holds no step scripts.
(( ${#ALL_STEPS[@]} > 0 )) || die "No steps found in $STEPS_DIR"
# Narrow the discovered steps down to the ones requested on the command line.
# --step takes precedence over --from when both are given.
STEPS_TO_RUN=()
for step_path in "${ALL_STEPS[@]}"; do
  # Step name = file name without directory and without the .sh suffix.
  step_file="${step_path##*/}"
  step_file="${step_file%.sh}"

  if [[ -n "$ONLY_STEP" ]]; then
    # --step: keep only the step whose name equals the requested one exactly.
    if [[ "$step_file" != "$ONLY_STEP" ]]; then
      continue
    fi
  elif [[ -n "$FROM_STEP" && "$step_file" < "$FROM_STEP" ]]; then
    # --from: drop every step that sorts strictly before the requested one.
    continue
  fi

  STEPS_TO_RUN+=("$step_path")
done

(( ${#STEPS_TO_RUN[@]} > 0 )) \
  || die "No matching steps found (--step='${ONLY_STEP}' --from='${FROM_STEP}')"
# Announce the plan: a count followed by one script file name per line.
log "Steps to run (${#STEPS_TO_RUN[@]}):"
for planned_step in "${STEPS_TO_RUN[@]}"; do
  printf ' %s\n' "$(basename "$planned_step")"
done
echo
# ── step execution loop ───────────────────────────────────────────────────────
# Steps numbered 10 or higher are treated as "mutating" and require deploy_autonomy=true.
# Steps whose name contains "git", "repo" or "clone" additionally require git_control=true.
# Steps numbered below 10 (e.g. the read-only 00-preflight) always pass the autonomy gate.
# _step_needs_autonomy <step> — test whether a step's numeric prefix is 10 or higher.
#
# Arguments: $1 - step name or bare step number (e.g. "20-install-docker" or "20")
# Outputs:   nothing
# Returns:   0 when the leading digits, read as a decimal number, are >= 10;
#            1 otherwise, including when $1 has no leading digits
_step_needs_autonomy() {
  local num="${1%%[^0-9]*}" # leading digits
  # No numeric prefix: there is nothing to compare, so the step is not gated.
  [[ -n "$num" ]] || return 1
  # Force base 10. A bare leading zero makes bash read the value as octal, so
  # "08"/"09" would be rejected as invalid and "010" would be read as 8.
  (( 10#$num >= 10 ))
}
# _step_needs_git_control <step-name> — test whether a step is gated by git_control.
#
# The check is a plain substring match, so any name containing "git", "repo"
# or "clone" anywhere (e.g. "repository") counts.
#
# Arguments: $1 - step name (e.g. "30-git-sync")
# Outputs:   nothing
# Returns:   0 when the name contains one of the keywords, 1 otherwise
_step_needs_git_control() {
  case "$1" in
    *git* | *repo* | *clone*)
      return 0
      ;;
    *)
      return 1
      ;;
  esac
}
# Run the selected steps in order. A failing step is recorded and the loop
# moves on to the next one; the failures are collected in FAILED_STEPS.
FAILED_STEPS=()
for step_path in "${STEPS_TO_RUN[@]}"; do
  # Step name without directory and .sh suffix, plus its leading digits.
  step_file="${step_path##*/}"
  step_file="${step_file%.sh}"
  step_num="${step_file%%[^0-9]*}"

  # autonomy gate: skip the step unless the manifest grants deploy_autonomy
  if _step_needs_autonomy "$step_num"; then
    if [[ "$DEPLOY_AUTONOMY" != "true" ]]; then
      warn "Skipping $step_file — deploy_autonomy=false in $NODE_YAML"
      warn "Run this step manually or set deploy_autonomy: true"
      continue
    fi
  fi

  # git_control gate: skip git/repo/clone steps unless the manifest allows them
  if _step_needs_git_control "$step_file"; then
    if [[ "$GIT_CONTROL" != "true" ]]; then
      warn "Skipping $step_file — git_control=false in $NODE_YAML"
      continue
    fi
  fi

  step "Running: $step_file"

  # Each step runs in its own bash process; capture its status without
  # letting errexit abort the orchestrator.
  rc=0
  bash "$step_path" || rc=$?

  if (( rc == 0 )); then
    log "$step_file — OK"
  else
    warn "$step_file — FAILED (exit $rc)"
    FAILED_STEPS+=("$step_file")
  fi
  echo
done
# ── summary ───────────────────────────────────────────────────────────────────
# Any recorded failure is reported through die, listing the failed steps.
if (( ${#FAILED_STEPS[@]} > 0 )); then
  die "Onboarding finished with failures: ${FAILED_STEPS[*]}"
fi

# Otherwise close with a message that matches the mode the run was started in.
case "${DRY_RUN:-0}" in
  1)
    log "Dry-run complete — no mutations performed."
    ;;
  *)
    log "All steps completed successfully for node ${NODE_NAME}."
    ;;
esac