diff --git a/scripts/onboard/lib/common.sh b/scripts/onboard/lib/common.sh index 95142f3..fa51cbb 100644 --- a/scripts/onboard/lib/common.sh +++ b/scripts/onboard/lib/common.sh @@ -25,6 +25,19 @@ dryrun() { echo -e "${_C_YELLOW}[dry-run]${_C_RESET} ${*}"; } # ── command detection ───────────────────────────────────────────────────────── have_cmd() { command -v "$1" >/dev/null 2>&1; } +# ── dry-run execution wrapper ───────────────────────────────────────────────── +# run CMD [ARGS…] — executes CMD in live mode; when DRY_RUN=1 it only echoes "[dry-run] would: CMD ARGS…" and does not execute CMD. +# Wrap MUTATIONS with this. Read-only probes (SSH BatchMode tests, status +# queries, command -v checks) must run unconditionally — never wrap them. NOTE(review): only the exact value 1 enables dry-run; any other value (e.g. the former DRY_RUN=true) executes CMD. +run() { + if [ "${DRY_RUN:-0}" = 1 ]; then + echo "[dry-run] would: $*" + else + "$@" + fi +} +export -f run + # ── file helpers ────────────────────────────────────────────────────────────── # ensure_line FILE LINE — appends LINE to FILE if it is not already present (idempotent) ensure_line() { diff --git a/scripts/onboard/lib/remote.sh b/scripts/onboard/lib/remote.sh index 92b4ecf..5ea76ef 100644 --- a/scripts/onboard/lib/remote.sh +++ b/scripts/onboard/lib/remote.sh @@ -7,7 +7,7 @@ set -euo pipefail : "${ONBOARD_SSH_USER:?remote.sh: ONBOARD_SSH_USER is not set}" : "${ONBOARD_SSH_HOST:?remote.sh: ONBOARD_SSH_HOST is not set}" -: "${DRY_RUN:=false}" +: "${DRY_RUN:=0}" _SSH_OPTS=( -o StrictHostKeyChecking=accept-new @@ -17,7 +17,7 @@ _SSH_OPTS=( # rrun CMD [ARGS…] — run a command on the remote node via SSH rrun() { - if [[ "$DRY_RUN" == "true" ]]; then + if [ "${DRY_RUN:-0}" = 1 ]; then dryrun "ssh ${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST} -- $*" return 0 fi @@ -27,7 +27,7 @@ rrun() { # rcopy LOCAL_PATH REMOTE_PATH — copy a file to the remote node via scp rcopy() { local src="$1" dst="$2" - if [[ "$DRY_RUN" == "true" ]]; then + if [ "${DRY_RUN:-0}" = 1 ]; then dryrun "scp $src ${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}:$dst" return 0 fi @@ -38,7 +38,7 @@ rcopy() { rsync_dir() { local 
src="$1" dst="$2" shift 2 - if [[ "$DRY_RUN" == "true" ]]; then + if [ "${DRY_RUN:-0}" = 1 ]; then dryrun "rsync -az $src ${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}:$dst" return 0 fi diff --git a/scripts/onboard/onboard.sh b/scripts/onboard/onboard.sh index 40fc363..c6f95ec 100755 --- a/scripts/onboard/onboard.sh +++ b/scripts/onboard/onboard.sh @@ -29,7 +29,7 @@ source "${LIB_DIR}/common.sh" NODE_NAME="" ONLY_STEP="" FROM_STEP="" -DRY_RUN="false" +DRY_RUN=0 export DRY_RUN REPO_ROOT # ── argument parsing ────────────────────────────────────────────────────────── @@ -56,7 +56,7 @@ while [[ $# -gt 0 ]]; do --node) NODE_NAME="${2:?--node requires a value}"; shift 2 ;; --step) ONLY_STEP="${2:?--step requires a value}"; shift 2 ;; --from) FROM_STEP="${2:?--from requires a value}"; shift 2 ;; - --dry-run) DRY_RUN="true"; shift ;; + --dry-run) DRY_RUN=1; shift ;; -h|--help) usage ;; *) die "Unknown argument: $1" ;; esac @@ -159,11 +159,6 @@ for step_path in "${STEPS_TO_RUN[@]}"; do step "Running: $step_file" - if [[ "$DRY_RUN" == "true" ]]; then - dryrun "Would execute: $step_path" - continue - fi - if bash "$step_path"; then log "$step_file — OK" else @@ -180,8 +175,8 @@ if [[ ${#FAILED_STEPS[@]} -gt 0 ]]; then die "Onboarding finished with failures: ${FAILED_STEPS[*]}" fi -if [[ "$DRY_RUN" == "true" ]]; then - log "Dry-run complete — no changes made." +if [ "${DRY_RUN:-0}" = 1 ]; then + log "Dry-run complete — no mutations performed." else log "All steps completed successfully for node ${NODE_NAME}." fi diff --git a/scripts/onboard/steps/00-access.sh b/scripts/onboard/steps/00-access.sh index 08347c1..abf2c24 100755 --- a/scripts/onboard/steps/00-access.sh +++ b/scripts/onboard/steps/00-access.sh @@ -6,9 +6,13 @@ # 2. ensure_tailscale — install Tailscale and join network (interactive auth URL) # 3. verify — confirm SSH over Tailscale, assert arch=aarch64 # -# Does NOT configure NOPASSWD, /opt/homelab, or any host mutation beyond Tailscale. -# Reads: first_contact (e.g. 
pi@pimirror2.local) and tailscale.hostname from node.yaml. -# pi user on Raspberry Pi OS has passwordless sudo by default — required for `tailscale up`. +# Dry-run convention (DRY_RUN=1): +# - Read-only probes (SSH BatchMode test, tailscale status, command -v) run ALWAYS +# so the plan reflects real current state ("key present → skip" vs "would: install") +# - Mutations (ssh-copy-id, curl installer, tailscale up) are wrapped with run(); the post-install key check is wrapped too, because it can only pass after ssh-copy-id really ran +# +# Does NOT configure NOPASSWD or /opt/homelab — those are later steps. +# pi user on Raspberry Pi OS has passwordless sudo — required for `tailscale up`. set -euo pipefail @@ -16,7 +20,7 @@ STEP_NAME="00-access" : "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}" : "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}" -: "${DRY_RUN:=false}" +: "${DRY_RUN:=0}" # Source common.sh when run standalone (orchestrator sources it before calling steps) if ! declare -f log >/dev/null 2>&1; then @@ -33,8 +37,8 @@ TS_HOSTNAME=$(yaml_get "$NODE_YAML" "tailscale.hostname") FC_USER="${FIRST_CONTACT%%@*}" -# remote.sh binds to ONBOARD_SSH_USER / ONBOARD_SSH_HOST — set from orchestrator -# (post-Tailscale target); fall back to first_contact coordinates for standalone use. +# ONBOARD_SSH_USER/HOST set by orchestrator to post-Tailscale coordinates; +# standalone fallback: user taken from first_contact, host taken from tailscale.hostname. 
export ONBOARD_SSH_USER="${ONBOARD_SSH_USER:-${FC_USER}}" export ONBOARD_SSH_HOST="${ONBOARD_SSH_HOST:-${TS_HOSTNAME}}" @@ -42,41 +46,42 @@ export ONBOARD_SSH_HOST="${ONBOARD_SSH_HOST:-${TS_HOSTNAME}}" source "${REPO_ROOT}/scripts/onboard/lib/remote.sh" # ── SSH option arrays ───────────────────────────────────────────────────────── -# Interactive — no BatchMode; used for ssh-copy-id (may need password) +# No BatchMode — used for ssh-copy-id where a password prompt may appear _FC_SSH_NOKEY=(-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10) -# Post-key — BatchMode; used once the key is installed +# BatchMode — used for all probes and post-key-install operations _FC_SSH=(-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes) +# ── tailscale state probe helper ────────────────────────────────────────────── +# Not wrapped in run(), so it executes in dry-run too; prints the remote BackendState on stdout, or "unknown" when SSH or the JSON parse fails (stderr is discarded). +_ts_state() { + ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" \ + 'tailscale status --json 2>/dev/null | python3 -c \ + "import sys,json; print(json.load(sys.stdin).get(\"BackendState\",\"unknown\"))" \ + 2>/dev/null || echo "unknown"' 2>/dev/null || echo "unknown" +} + # ═══════════════════════════════════════════════════════════════════════════════ # Stage 1 — ensure_ssh_key # ═══════════════════════════════════════════════════════════════════════════════ step "[$STEP_NAME] 1/3 ensure_ssh_key → ${FIRST_CONTACT}" -if [[ "$DRY_RUN" == "true" ]]; then - dryrun "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 ${FIRST_CONTACT} true" - dryrun "# if key not present:" - dryrun "ssh-copy-id -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -i ~/.ssh/id_*.pub ${FIRST_CONTACT}" +# Probe: test key-based auth — always runs so dry-run reports real current state +if ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" true 2>/dev/null; then + log "SSH key already accepted by ${FIRST_CONTACT} — skip" else - if ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" true 2>/dev/null; 
then - log "SSH key already accepted by ${FIRST_CONTACT} — skip" - else - # Find the first available public key - pubkeys=( "$HOME"/.ssh/id_*.pub ) - [[ -f "${pubkeys[0]}" ]] || die "No public key found at ~/.ssh/id_*.pub on SATURN" + pubkeys=( "$HOME"/.ssh/id_*.pub ) + [[ -f "${pubkeys[0]}" ]] || die "No public key found at ~/.ssh/id_*.pub on SATURN" - log "Installing public key ${pubkeys[0]} on ${FIRST_CONTACT}" - log "(password prompt for ${FIRST_CONTACT} expected)" - ssh-copy-id \ - -o StrictHostKeyChecking=accept-new \ - -o ConnectTimeout=10 \ - -i "${pubkeys[0]}" \ - "$FIRST_CONTACT" - - log "Verifying key-based access..." - ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" true \ - || die "Key-based SSH still fails after ssh-copy-id — check ~/.ssh/authorized_keys on ${FIRST_CONTACT}" - log "Key installed and verified" - fi + log "Key not yet installed on ${FIRST_CONTACT} (password prompt expected)" + # Mutation: install public key + run ssh-copy-id \ + "${_FC_SSH_NOKEY[@]}" \ + -i "${pubkeys[0]}" \ + "$FIRST_CONTACT" + # Probe: verify key was installed (run() is a no-op in dry-run so this + # prints "would:" and never reaches die — avoids a false-failure after a skipped ssh-copy-id; in live mode a failed check aborts with a diagnostic) + run ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" true || die "Key-based SSH still fails after ssh-copy-id — check ~/.ssh/authorized_keys on ${FIRST_CONTACT}" + [ "${DRY_RUN:-0}" = 1 ] || log "Key installed and verified" fi # ═══════════════════════════════════════════════════════════════════════════════ @@ -84,46 +89,36 @@ fi # ═══════════════════════════════════════════════════════════════════════════════ step "[$STEP_NAME] 2/3 ensure_tailscale on ${FIRST_CONTACT} → hostname=${TS_HOSTNAME}" -_ts_state() { - # Returns BackendState string or "unknown". Uses python3 (available on RPi OS). 
- ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" \ - 'tailscale status --json 2>/dev/null | python3 -c \ - "import sys,json; print(json.load(sys.stdin).get(\"BackendState\",\"unknown\"))" \ - 2>/dev/null || echo "unknown"' -} - -if [[ "$DRY_RUN" == "true" ]]; then - dryrun "ssh ${FIRST_CONTACT} 'command -v tailscale' # check if installed" - dryrun "# if missing: ssh ${FIRST_CONTACT} 'curl -fsSL https://tailscale.com/install.sh | sh'" - dryrun "ssh ${FIRST_CONTACT} 'tailscale status --json' # check BackendState" - dryrun "# if not Running: ssh ${FIRST_CONTACT} 'sudo tailscale up --hostname=${TS_HOSTNAME}'" +# Probe: check if tailscale binary present — always runs. +# SSH auth failure (key not yet installed in dry-run) falls through to the +# "not found" branch, which is correct for a fresh node. NOTE(review): any other SSH failure (timeout, host unreachable) lands in that branch too. +if ! ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" 'command -v tailscale' >/dev/null 2>&1; then + log "Tailscale not found on ${FIRST_CONTACT}" + # Mutation: install tailscale via the remote install script (only printed, not executed, in dry-run) + run ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" \ + 'curl -fsSL https://tailscale.com/install.sh | sh' else - # 2a — install if missing - if ! ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" 'command -v tailscale' >/dev/null 2>&1; then - log "Tailscale not found — installing on ${FIRST_CONTACT}..." - ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" 'curl -fsSL https://tailscale.com/install.sh | sh' - log "Tailscale installed" - else - log "Tailscale already installed on ${FIRST_CONTACT}" - fi + log "Tailscale already installed on ${FIRST_CONTACT}" +fi - # 2b — join if not active - ts_state=$(_ts_state) - if [[ "$ts_state" == "Running" ]]; then - log "Tailscale already active (BackendState=Running) — skip" - else - warn "Tailscale BackendState=${ts_state} — running tailscale up" - echo "" - echo -e "${_C_BOLD}┌─────────────────────────────────────────────────────────────┐" - echo -e "│ ACTION REQUIRED: open the URL below in your browser to │" - echo -e "│ authorize ${TS_HOSTNAME} in your Tailscale account. 
│" - echo -e "└─────────────────────────────────────────────────────────────┘${_C_RESET}" - echo "" - # pi user has passwordless sudo on Raspberry Pi OS; tailscale up blocks - # until the user authenticates via the URL it prints to stdout. - ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" "sudo tailscale up --hostname=${TS_HOSTNAME}" - echo "" +# Probe: check backend state — always runs +ts_state=$(_ts_state) +if [[ "$ts_state" == "Running" ]]; then + log "Tailscale already active (BackendState=Running) — skip" +else + warn "Tailscale BackendState=${ts_state} — joining network..." + echo "" + echo -e "${_C_BOLD}┌─────────────────────────────────────────────────────────────┐" + echo -e "│ ACTION REQUIRED: open the URL below in your browser to │" + echo -e "│ authorize ${TS_HOSTNAME} in your Tailscale account. │" + echo -e "└─────────────────────────────────────────────────────────────┘${_C_RESET}" + echo "" + # Mutation: tailscale up — blocks until user authenticates via printed URL. NOTE(review): the ACTION REQUIRED banner above is unconditional, so dry-run prints it even though no URL follows. + run ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" "sudo tailscale up --hostname=${TS_HOSTNAME}" + echo "" + # Post-join state check — only meaningful after the mutation actually ran + if [ "${DRY_RUN:-0}" != 1 ]; then ts_state2=$(_ts_state) [[ "$ts_state2" == "Running" ]] \ || die "Tailscale still not active after tailscale up (BackendState=${ts_state2})" @@ -136,18 +131,18 @@ fi # ═══════════════════════════════════════════════════════════════════════════════ step "[$STEP_NAME] 3/3 verify SSH over Tailscale → ${ONBOARD_SSH_USER}@${TS_HOSTNAME}" -if [[ "$DRY_RUN" == "true" ]]; then - dryrun "ssh -o BatchMode=yes ${ONBOARD_SSH_USER}@${TS_HOSTNAME} 'echo ok && uname -m'" - dryrun "# expected output: ok / aarch64" -else - out=$(ssh "${_FC_SSH[@]}" "${ONBOARD_SSH_USER}@${TS_HOSTNAME}" 'echo ok && uname -m' 2>&1) \ - || die "Verification SSH to ${TS_HOSTNAME} failed:\n ${out}" - - echo "$out" | grep -q '^ok' || die "Verification output missing 'ok' line: ${out}" +# Probe: always runs — on a node already joined 
this works even in dry-run. +# On a fresh node in dry-run mode Tailscale isn't set up yet, so SSH will fail; +# that is reported as a warning (not a fatal error) to keep dry-run informative; outside dry-run the same failure is fatal. +if out=$(ssh "${_FC_SSH[@]}" "${ONBOARD_SSH_USER}@${TS_HOSTNAME}" \ + 'echo ok && uname -m' 2>&1); then + echo "$out" | grep -q '^ok' || warn "Unexpected verify output: ${out}" arch=$(echo "$out" | grep -v '^ok' | head -1 | tr -d '[:space:]') [[ "$arch" == "aarch64" ]] || warn "Unexpected arch '${arch}' — expected aarch64" - log "Verify OK: ${ONBOARD_SSH_USER}@${TS_HOSTNAME} reachable, arch=${arch}" +else + msg="Verify SSH to ${ONBOARD_SSH_USER}@${TS_HOSTNAME} failed (Tailscale not yet joined?)" + if [ "${DRY_RUN:-0}" = 1 ]; then warn "$msg"; else die "$msg"; fi fi -log "[$STEP_NAME] done — SSH key installed, Tailscale active, Tailscale SSH verified" +log "[$STEP_NAME] done"