homelab-codex-ws/scripts/onboard/steps/00-access.sh

149 lines
8.3 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
# scripts/onboard/steps/00-access.sh — establish remote access channel
#
# Stages:
# 1. ensure_ssh_key — copy SATURN public key to first_contact (idempotent)
# 2. ensure_tailscale — install Tailscale and join network (interactive auth URL)
# 3. verify — confirm SSH over Tailscale; warn if arch is not aarch64
#
# Dry-run convention (DRY_RUN=1):
# - Read-only probes (SSH BatchMode test, tailscale status, command -v) run ALWAYS
# so the plan reflects real current state ("key present → skip" vs "would: install")
# - Mutations (ssh-copy-id, curl installer, tailscale up) are wrapped with run()
#
# Does NOT configure NOPASSWD or /opt/homelab — those are later steps.
# pi user on Raspberry Pi OS has passwordless sudo — required for `tailscale up`.
set -euo pipefail

STEP_NAME="00-access"

# Required inputs — abort immediately (with a hint) when either is missing.
: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
: "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}"

# Dry-run is opt-in: keep the caller's value, otherwise default to a real run.
DRY_RUN="${DRY_RUN:-0}"

# Standalone invocation: load the shared helpers only when `log` is not
# already defined (the orchestrator sources common.sh before calling steps).
# shellcheck source=../lib/common.sh
declare -f log >/dev/null 2>&1 \
  || source "${REPO_ROOT}/scripts/onboard/lib/common.sh"
# ── parse node.yaml ───────────────────────────────────────────────────────────
# first_contact      — SSH target used for every bootstrap command in this step
# tailscale.hostname — passed to `tailscale up --hostname=` and used as the
#                      SSH host for the final verification
FIRST_CONTACT=$(yaml_get "$NODE_YAML" "first_contact")
TS_HOSTNAME=$(yaml_get "$NODE_YAML" "tailscale.hostname")
[[ -n "$FIRST_CONTACT" ]] || die "first_contact not set in $NODE_YAML"
[[ -n "$TS_HOSTNAME" ]] || die "tailscale.hostname not set in $NODE_YAML"

# Derive the login user from first_contact. When first_contact is a bare host
# (no "user@" prefix) the %%@* expansion would yield the host itself, which is
# not a user name; ssh logs in as the local user in that case, so mirror that.
if [[ "$FIRST_CONTACT" == *@* ]]; then
  FC_USER="${FIRST_CONTACT%%@*}"
else
  FC_USER="${USER:-$(id -un)}"
fi

# ONBOARD_SSH_USER/HOST are normally set by the orchestrator to the
# post-Tailscale coordinates. For standalone invocation the user falls back to
# the first_contact user and the host falls back to the Tailscale hostname.
export ONBOARD_SSH_USER="${ONBOARD_SSH_USER:-${FC_USER}}"
export ONBOARD_SSH_HOST="${ONBOARD_SSH_HOST:-${TS_HOSTNAME}}"

# shellcheck source=../lib/remote.sh
source "${REPO_ROOT}/scripts/onboard/lib/remote.sh"
# ── SSH option arrays ─────────────────────────────────────────────────────────
# Interactive variant (no BatchMode): used for ssh-copy-id, where a password
# prompt may appear.
_FC_SSH_NOKEY=(
  -o StrictHostKeyChecking=accept-new
  -o ConnectTimeout=10
)
# Non-interactive variant: the same options plus BatchMode, used for all probes
# and for every operation after the key has been installed.
_FC_SSH=("${_FC_SSH_NOKEY[@]}" -o BatchMode=yes)
# ── tailscale state probe helper ──────────────────────────────────────────────
# Prints the BackendState reported by `tailscale status --json` on
# $FIRST_CONTACT, or "unknown" when the SSH call or the remote parse fails.
# Globals: _FC_SSH (read), FIRST_CONTACT (read)
# Outputs: one word on stdout; stderr of the SSH call is discarded
_ts_state() {
  # Remote pipeline: feed the JSON status into python3 and print BackendState;
  # any remote failure prints "unknown" instead. Continuation lines are kept
  # flush-left so the string handed to the remote shell is unchanged.
  local probe_cmd='tailscale status --json 2>/dev/null | python3 -c \
"import sys,json; print(json.load(sys.stdin).get(\"BackendState\",\"unknown\"))" \
2>/dev/null || echo "unknown"'

  # A failing ssh (unreachable host, auth refused, ...) also maps to "unknown".
  if ! ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" "$probe_cmd" 2>/dev/null; then
    echo "unknown"
  fi
}
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 1 — ensure_ssh_key
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 1/3 ensure_ssh_key → ${FIRST_CONTACT}"
# Probe: test key-based auth — always runs so dry-run reports real current state
if ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" true 2>/dev/null; then
  log "SSH key already accepted by ${FIRST_CONTACT} — skip"
else
  # Without nullglob an unmatched glob stays literal, so the -f test below
  # also covers the "no key at all" case.
  pubkeys=("$HOME"/.ssh/id_*.pub)
  [[ -f "${pubkeys[0]}" ]] || die "No public key found at ~/.ssh/id_*.pub on SATURN"
  log "Key not yet installed on ${FIRST_CONTACT} (password prompt expected)"
  # Mutation: install public key (no BatchMode so the password prompt can appear)
  run ssh-copy-id \
    "${_FC_SSH_NOKEY[@]}" \
    -i "${pubkeys[0]}" \
    "$FIRST_CONTACT"
  # Verify the key was installed. Wrapped in run() so that in dry-run it only
  # prints the planned command instead of failing after a skipped ssh-copy-id.
  run ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" true
  # Only claim success when the mutation and the verification really ran;
  # in dry-run nothing was installed, so the message would be misleading.
  if [[ "${DRY_RUN:-0}" != 1 ]]; then
    log "Key installed and verified"
  fi
fi
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 2 — ensure_tailscale
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 2/3 ensure_tailscale on ${FIRST_CONTACT} → hostname=${TS_HOSTNAME}"
# Probe (always runs): is the tailscale binary on the remote PATH?
# Any SSH failure — e.g. the key is not installed yet during a dry-run — is
# indistinguishable from "binary missing" here and lands in the install
# branch, which is the right plan for a fresh node.
if ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" 'command -v tailscale' >/dev/null 2>&1; then
  log "Tailscale already installed on ${FIRST_CONTACT}"
else
  log "Tailscale not found on ${FIRST_CONTACT}"
  # Mutation: run the Tailscale install script on the remote node
  run ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" \
    'curl -fsSL https://tailscale.com/install.sh | sh'
fi
# Probe: check backend state — always runs
ts_state=$(_ts_state)
if [[ "$ts_state" == "Running" ]]; then
  log "Tailscale already active (BackendState=Running) — skip"
else
  warn "Tailscale BackendState=${ts_state} — joining network..."
  echo ""
  echo -e "${_C_BOLD}┌─────────────────────────────────────────────────────────────┐"
  echo -e "│ ACTION REQUIRED: open the URL below in your browser to │"
  echo -e "│ authorize ${TS_HOSTNAME} in your Tailscale account. │"
  echo -e "└─────────────────────────────────────────────────────────────┘${_C_RESET}"
  echo ""
  # The command string is re-parsed by the remote shell, so the hostname is
  # shell-quoted first: a value with spaces or metacharacters stays a single
  # argument instead of being split or interpreted. For an ordinary hostname
  # %q leaves the text unchanged, so the dry-run plan reads the same as before.
  printf -v ts_up_cmd 'sudo tailscale up --hostname=%q' "$TS_HOSTNAME"
  # Mutation: tailscale up — blocks until user authenticates via printed URL
  run ssh "${_FC_SSH[@]}" "$FIRST_CONTACT" "$ts_up_cmd"
  echo ""
  # Post-join state check — only meaningful after the mutation actually ran
  if [[ "${DRY_RUN:-0}" != 1 ]]; then
    ts_state2=$(_ts_state)
    [[ "$ts_state2" == "Running" ]] \
      || die "Tailscale still not active after tailscale up (BackendState=${ts_state2})"
    log "Tailscale joined successfully (BackendState=Running)"
  fi
fi
# ═══════════════════════════════════════════════════════════════════════════════
# Stage 3 — verify over Tailscale
# ═══════════════════════════════════════════════════════════════════════════════
step "[$STEP_NAME] 3/3 verify SSH over Tailscale → ${ONBOARD_SSH_USER}@${TS_HOSTNAME}"
# Probe: always runs — on a node already joined this works even in dry-run.
# On a fresh node in dry-run mode Tailscale isn't set up yet, so SSH will fail;
# that is reported as a warning (not a fatal error) to keep dry-run informative.
if out=$(ssh "${_FC_SSH[@]}" "${ONBOARD_SSH_USER}@${TS_HOSTNAME}" \
  'echo ok && uname -m' 2>&1); then
  grep -q '^ok' <<<"$out" || warn "Unexpected verify output: ${out}"
  # stderr is merged into $out, so ssh notices (e.g. "Warning: Permanently
  # added ..." on the first connection to a new host name) can precede the
  # remote output. Take the line that directly follows the "ok" marker rather
  # than the first non-"ok" line. awk exits 0 even when nothing matches, so a
  # missing arch line yields an empty value and a warning instead of tripping
  # pipefail + errexit.
  arch=$(awk 'seen { print; exit } /^ok/ { seen = 1 }' <<<"$out" | tr -d '[:space:]')
  [[ "$arch" == "aarch64" ]] || warn "Unexpected arch '${arch}' — expected aarch64"
  log "Verify OK: ${ONBOARD_SSH_USER}@${TS_HOSTNAME} reachable, arch=${arch}"
else
  msg="Verify SSH to ${ONBOARD_SSH_USER}@${TS_HOSTNAME} failed (Tailscale not yet joined?)"
  # Explicit if/else: with `A && warn || die`, a non-zero return from warn
  # would fall through to die even in dry-run.
  if [[ "${DRY_RUN:-0}" == 1 ]]; then
    warn "$msg"
  else
    die "$msg"
  fi
fi
log "[$STEP_NAME] done"