fix(onboard): propagate dry-run into steps via run() helper

DRY_RUN now uses 1/0 instead of "true"/"false" across all onboard scripts.

common.sh: add run() — wraps mutations; prints "[dry-run] would: ..." when
  DRY_RUN=1. Exported via `export -f run` so child bash processes inherit it.

onboard.sh: remove the `--dry-run → dryrun "Would execute" → continue` bypass.
  Steps now always execute; DRY_RUN=1 is exported so each step's own run()
  calls handle simulation. The orchestrator no longer needs to know step internals.

remote.sh: update DRY_RUN checks to [ "${DRY_RUN:-0}" = 1 ] for consistency.

00-access.sh: remove all if/else DRY_RUN blocks; replace with:
  - Mutations (ssh-copy-id, curl install, tailscale up) wrapped in run()
  - Probes (SSH BatchMode test, command -v, _ts_state) run unconditionally
    so dry-run reports real current state ("key present → skip" vs "would: ...")
  - Stage 3 verify always runs; an SSH failure is fatal (die) in live mode but
    only a warning in dry-run (on a fresh node Tailscale is not yet joined, so
    the failure is expected)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Oskar Kapala 2026-06-08 15:01:09 +02:00
parent 9012a36827
commit 931fd46e62
4 changed files with 94 additions and 91 deletions

View file

@ -25,6 +25,19 @@ dryrun() { echo -e "${_C_YELLOW}[dry-run]${_C_RESET} ${*}"; }
# ── command detection ─────────────────────────────────────────────────────────
have_cmd() { command -v "$1" >/dev/null 2>&1; }
# ── dry-run execution wrapper ─────────────────────────────────────────────────
# run CMD [ARGS…] — executes CMD in live mode; prints intent in dry-run.
# Wrap MUTATIONS with this. Read-only probes (SSH BatchMode tests, status
# queries, command -v checks) must run unconditionally — never wrap them.
run() {
  # Dispatch on DRY_RUN (an unset or empty DRY_RUN is treated as 0):
  #   1        → only announce the command on stdout, do not execute it
  #   anything → execute the command verbatim; its exit status is ours
  # Plain echo is used for the announcement so the function has no
  # dependency on other helpers when it is inherited via `export -f`.
  case "${DRY_RUN:-0}" in
    1) echo "[dry-run] would: $*" ;;
    *) "$@" ;;
  esac
}
export -f run
# ── file helpers ──────────────────────────────────────────────────────────────
# ensure_line FILE LINE — appends LINE to FILE if it is not already present (idempotent)
ensure_line() {

View file

@ -7,7 +7,7 @@ set -euo pipefail
: "${ONBOARD_SSH_USER:?remote.sh: ONBOARD_SSH_USER is not set}"
: "${ONBOARD_SSH_HOST:?remote.sh: ONBOARD_SSH_HOST is not set}"
: "${DRY_RUN:=false}"
: "${DRY_RUN:=0}"
_SSH_OPTS=(
-o StrictHostKeyChecking=accept-new
@ -17,7 +17,7 @@ _SSH_OPTS=(
# rrun CMD [ARGS…] — run a command on the remote node via SSH
rrun() {
if [[ "$DRY_RUN" == "true" ]]; then
if [ "${DRY_RUN:-0}" = 1 ]; then
dryrun "ssh ${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST} -- $*"
return 0
fi
@ -27,7 +27,7 @@ rrun() {
# rcopy LOCAL_PATH REMOTE_PATH — copy a file to the remote node via scp
rcopy() {
local src="$1" dst="$2"
if [[ "$DRY_RUN" == "true" ]]; then
if [ "${DRY_RUN:-0}" = 1 ]; then
dryrun "scp $src ${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}:$dst"
return 0
fi
@ -38,7 +38,7 @@ rcopy() {
rsync_dir() {
local src="$1" dst="$2"
shift 2
if [[ "$DRY_RUN" == "true" ]]; then
if [ "${DRY_RUN:-0}" = 1 ]; then
dryrun "rsync -az $src ${ONBOARD_SSH_USER}@${ONBOARD_SSH_HOST}:$dst"
return 0
fi

View file

@ -29,7 +29,7 @@ source "${LIB_DIR}/common.sh"
NODE_NAME=""
ONLY_STEP=""
FROM_STEP=""
DRY_RUN="false"
DRY_RUN=0
export DRY_RUN REPO_ROOT
# ── argument parsing ──────────────────────────────────────────────────────────
@ -56,7 +56,7 @@ while [[ $# -gt 0 ]]; do
--node) NODE_NAME="${2:?--node requires a value}"; shift 2 ;;
--step) ONLY_STEP="${2:?--step requires a value}"; shift 2 ;;
--from) FROM_STEP="${2:?--from requires a value}"; shift 2 ;;
--dry-run) DRY_RUN="true"; shift ;;
--dry-run) DRY_RUN=1; shift ;;
-h|--help) usage ;;
*) die "Unknown argument: $1" ;;
esac
@ -159,11 +159,6 @@ for step_path in "${STEPS_TO_RUN[@]}"; do
step "Running: $step_file"
if [[ "$DRY_RUN" == "true" ]]; then
dryrun "Would execute: $step_path"
continue
fi
if bash "$step_path"; then
log "$step_file — OK"
else
@ -180,8 +175,8 @@ if [[ ${#FAILED_STEPS[@]} -gt 0 ]]; then
die "Onboarding finished with failures: ${FAILED_STEPS[*]}"
fi
if [[ "$DRY_RUN" == "true" ]]; then
log "Dry-run complete — no changes made."
# Closing summary: one line for a dry-run, another for a live run.
# NOTE(review): "no mutations performed" is only accurate if every step routes
# its mutations through run()/rrun — confirm for steps not visible here.
case "${DRY_RUN:-0}" in
  1) log "Dry-run complete — no mutations performed." ;;
  *) log "All steps completed successfully for node ${NODE_NAME}." ;;
esac

View file

@ -6,9 +6,13 @@
# 2. ensure_tailscale — install Tailscale and join network (interactive auth URL)
# 3. verify — confirm SSH over Tailscale, assert arch=aarch64
#
# Does NOT configure NOPASSWD, /opt/homelab, or any host mutation beyond Tailscale.
# Reads: first_contact (e.g. pi@pimirror2.local) and tailscale.hostname from node.yaml.
# pi user on Raspberry Pi OS has passwordless sudo by default — required for `tailscale up`.
# Dry-run convention (DRY_RUN=1):
# - Read-only probes (SSH BatchMode test, tailscale status, command -v) run ALWAYS
# so the plan reflects real current state ("key present → skip" vs "would: install")
# - Mutations (ssh-copy-id, curl installer, tailscale up) are wrapped with run()
#
# Does NOT configure NOPASSWD or /opt/homelab — those are later steps.
# pi user on Raspberry Pi OS has passwordless sudo — required for `tailscale up`.
set -euo pipefail
@ -16,7 +20,7 @@ STEP_NAME="00-access"
: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
: "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}"
: "${DRY_RUN:=false}"
: "${DRY_RUN:=0}"
# Source common.sh when run standalone (orchestrator sources it before calling steps)
if ! declare -f log >/dev/null 2>&1; then
@ -33,8 +37,8 @@ TS_HOSTNAME=$(yaml_get "$NODE_YAML" "tailscale.hostname")
FC_USER="${FIRST_CONTACT%%@*}"
# remote.sh binds to ONBOARD_SSH_USER / ONBOARD_SSH_HOST — set from orchestrator
# (post-Tailscale target); fall back to first_contact coordinates for standalone use.
# ONBOARD_SSH_USER/HOST are set by the orchestrator to post-Tailscale coordinates;
# for standalone invocation they fall back to the first_contact user and
# tailscale.hostname respectively.
export ONBOARD_SSH_USER="${ONBOARD_SSH_USER:-${FC_USER}}"
export ONBOARD_SSH_HOST="${ONBOARD_SSH_HOST:-${TS_HOSTNAME}}"
@ -42,41 +46,42 @@ export ONBOARD_SSH_HOST="${ONBOARD_SSH_HOST:-${TS_HOSTNAME}}"
source "${REPO_ROOT}/scripts/onboard/lib/remote.sh"
# ── SSH option arrays ─────────────────────────────────────────────────────────
# Interactive — no BatchMode; used for ssh-copy-id (may need password)
# No BatchMode — used for ssh-copy-id where a password prompt may appear
_FC_SSH_NOKEY=(-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10)
# Post-key — BatchMode; used once the key is installed
# BatchMode — used for all probes and post-key-install operations
_FC_SSH=(-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes)
# ── tailscale state probe helper ──────────────────────────────────────────────
# Always runs; returns BackendState or "unknown" on any SSH/parse failure.
_ts_state() {
  # Command executed on the node: dump `tailscale status --json`, let python3
  # pull out the BackendState field, and print "unknown" if that remote
  # pipeline fails. The string is passed to ssh as a single argument.
  local remote_probe
  remote_probe='tailscale status --json 2>/dev/null | python3 -c \
"import sys,json; print(json.load(sys.stdin).get(\"BackendState\",\"unknown\"))" \
2>/dev/null || echo "unknown"'

  # Local fallback: if ssh itself exits non-zero (its stderr is discarded),
  # print "unknown" so callers always receive a value on stdout.
  if ! ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" "$remote_probe" 2>/dev/null; then
    echo "unknown"
  fi
}
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 1 — ensure_ssh_key
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 1/3 ensure_ssh_key → ${FIRST_CONTACT}"
if [[ "$DRY_RUN" == "true" ]]; then
dryrun "ssh -o BatchMode=yes -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 ${FIRST_CONTACT} true"
dryrun "# if key not present:"
dryrun "ssh-copy-id -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -i ~/.ssh/id_*.pub ${FIRST_CONTACT}"
# Probe: test key-based auth — always runs so dry-run reports real current state
if ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" true 2>/dev/null; then
log "SSH key already accepted by ${FIRST_CONTACT} — skip"
else
if ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" true 2>/dev/null; then
log "SSH key already accepted by ${FIRST_CONTACT} — skip"
else
# Find the first available public key
pubkeys=( "$HOME"/.ssh/id_*.pub )
[[ -f "${pubkeys[0]}" ]] || die "No public key found at ~/.ssh/id_*.pub on SATURN"
pubkeys=( "$HOME"/.ssh/id_*.pub )
[[ -f "${pubkeys[0]}" ]] || die "No public key found at ~/.ssh/id_*.pub on SATURN"
log "Installing public key ${pubkeys[0]} on ${FIRST_CONTACT}"
log "(password prompt for ${FIRST_CONTACT} expected)"
ssh-copy-id \
-o StrictHostKeyChecking=accept-new \
-o ConnectTimeout=10 \
-i "${pubkeys[0]}" \
"$FIRST_CONTACT"
log "Verifying key-based access..."
ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" true \
|| die "Key-based SSH still fails after ssh-copy-id — check ~/.ssh/authorized_keys on ${FIRST_CONTACT}"
log "Key installed and verified"
fi
log "Key not yet installed on ${FIRST_CONTACT} (password prompt expected)"
# Mutation: install public key (run() only prints the command in dry-run)
run ssh-copy-id \
  "${_FC_SSH_NOKEY[@]}" \
  -i "${pubkeys[0]}" \
  "$FIRST_CONTACT"
# Post-install verification — only meaningful after the mutation actually ran.
# This is a read-only probe, so it is NOT wrapped in run(); it is skipped in
# dry-run because ssh-copy-id was not executed and the probe would fail.
# The success message is likewise limited to live mode so a dry-run never
# claims that a key was installed.
if [ "${DRY_RUN:-0}" != 1 ]; then
  ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" true \
    || die "Key-based SSH still fails after ssh-copy-id — check ~/.ssh/authorized_keys on ${FIRST_CONTACT}"
  log "Key installed and verified"
fi
fi
# ═══════════════════════════════════════════════════════════════════════════════
@ -84,46 +89,36 @@ fi
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 2/3 ensure_tailscale on ${FIRST_CONTACT} → hostname=${TS_HOSTNAME}"
_ts_state() {
# Returns BackendState string or "unknown". Uses python3 (available on RPi OS).
ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" \
'tailscale status --json 2>/dev/null | python3 -c \
"import sys,json; print(json.load(sys.stdin).get(\"BackendState\",\"unknown\"))" \
2>/dev/null || echo "unknown"'
}
if [[ "$DRY_RUN" == "true" ]]; then
dryrun "ssh ${FIRST_CONTACT} 'command -v tailscale' # check if installed"
dryrun "# if missing: ssh ${FIRST_CONTACT} 'curl -fsSL https://tailscale.com/install.sh | sh'"
dryrun "ssh ${FIRST_CONTACT} 'tailscale status --json' # check BackendState"
dryrun "# if not Running: ssh ${FIRST_CONTACT} 'sudo tailscale up --hostname=${TS_HOSTNAME}'"
# Probe: check if tailscale binary present — always runs.
# SSH auth failure (key not yet installed in dry-run) falls through to the
# "not found" branch, which is correct for a fresh node.
if ! ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" 'command -v tailscale' >/dev/null 2>&1; then
log "Tailscale not found on ${FIRST_CONTACT}"
# Mutation: install tailscale
run ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" \
'curl -fsSL https://tailscale.com/install.sh | sh'
else
# 2a — install if missing
if ! ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" 'command -v tailscale' >/dev/null 2>&1; then
log "Tailscale not found — installing on ${FIRST_CONTACT}..."
ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" 'curl -fsSL https://tailscale.com/install.sh | sh'
log "Tailscale installed"
else
log "Tailscale already installed on ${FIRST_CONTACT}"
fi
log "Tailscale already installed on ${FIRST_CONTACT}"
fi
# 2b — join if not active
ts_state=$(_ts_state)
if [[ "$ts_state" == "Running" ]]; then
log "Tailscale already active (BackendState=Running) — skip"
else
warn "Tailscale BackendState=${ts_state} — running tailscale up"
echo ""
echo -e "${_C_BOLD}┌─────────────────────────────────────────────────────────────┐"
echo -e "│ ACTION REQUIRED: open the URL below in your browser to │"
echo -e "│ authorize ${TS_HOSTNAME} in your Tailscale account. │"
echo -e "└─────────────────────────────────────────────────────────────┘${_C_RESET}"
echo ""
# pi user has passwordless sudo on Raspberry Pi OS; tailscale up blocks
# until the user authenticates via the URL it prints to stdout.
ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" "sudo tailscale up --hostname=${TS_HOSTNAME}"
echo ""
# Probe: check backend state — always runs
ts_state=$(_ts_state)
if [[ "$ts_state" == "Running" ]]; then
log "Tailscale already active (BackendState=Running) — skip"
else
warn "Tailscale BackendState=${ts_state} — joining network..."
echo ""
echo -e "${_C_BOLD}┌─────────────────────────────────────────────────────────────┐"
echo -e "│ ACTION REQUIRED: open the URL below in your browser to │"
echo -e "│ authorize ${TS_HOSTNAME} in your Tailscale account. │"
echo -e "└─────────────────────────────────────────────────────────────┘${_C_RESET}"
echo ""
# Mutation: tailscale up — blocks until user authenticates via printed URL
run ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" "sudo tailscale up --hostname=${TS_HOSTNAME}"
echo ""
# Post-join state check — only meaningful after the mutation actually ran
if [ "${DRY_RUN:-0}" != 1 ]; then
ts_state2=$(_ts_state)
[[ "$ts_state2" == "Running" ]] \
|| die "Tailscale still not active after tailscale up (BackendState=${ts_state2})"
@ -136,18 +131,18 @@ fi
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 3/3 verify SSH over Tailscale → ${ONBOARD_SSH_USER}@${TS_HOSTNAME}"
if [[ "$DRY_RUN" == "true" ]]; then
dryrun "ssh -o BatchMode=yes ${ONBOARD_SSH_USER}@${TS_HOSTNAME} 'echo ok && uname -m'"
dryrun "# expected output: ok / aarch64"
else
out=$(ssh "${_FC_SSH[@]}" "${ONBOARD_SSH_USER}@${TS_HOSTNAME}" 'echo ok && uname -m' 2>&1) \
|| die "Verification SSH to ${TS_HOSTNAME} failed:\n ${out}"
echo "$out" | grep -q '^ok' || die "Verification output missing 'ok' line: ${out}"
# Probe: always runs — on a node already joined this works even in dry-run.
# On a fresh node in dry-run mode Tailscale isn't set up yet, so SSH will fail;
# that is reported as a warning (not a fatal error) to keep dry-run informative.
# ssh's stderr is merged into $out (2>&1) so a failure message can be inspected,
# which also means ssh's own notices may precede the remote output.
if out=$(ssh "${_FC_SSH[@]}" "${ONBOARD_SSH_USER}@${TS_HOSTNAME}" \
  'echo ok && uname -m' 2>&1); then
  grep -q '^ok' <<<"$out" || warn "Unexpected verify output: ${out}"
  # The architecture is the line printed right after the "ok" marker. Anchoring
  # on the marker ignores any ssh notice that precedes it, and awk exits 0 even
  # when nothing matches, so pipefail + set -e cannot abort the script here;
  # an empty result simply triggers the warning below.
  arch=$(awk 'seen { print; exit } /^ok/ { seen = 1 }' <<<"$out" | tr -d '[:space:]')
  [[ "$arch" == "aarch64" ]] || warn "Unexpected arch '${arch}' — expected aarch64"
  log "Verify OK: ${ONBOARD_SSH_USER}@${TS_HOSTNAME} reachable, arch=${arch}"
else
  msg="Verify SSH to ${ONBOARD_SSH_USER}@${TS_HOSTNAME} failed (Tailscale not yet joined?)"
  # Explicit if/else: with `A && warn || die`, die would also fire in dry-run
  # whenever warn itself returned non-zero.
  if [ "${DRY_RUN:-0}" = 1 ]; then
    warn "$msg"
  else
    die "$msg"
  fi
fi
log "[$STEP_NAME] done — SSH key installed, Tailscale active, Tailscale SSH verified"
log "[$STEP_NAME] done"