From 1304c8449f50898d9f9aab7f1b8e59af8c871e08 Mon Sep 17 00:00:00 2001
From: Oskar Kapala
Date: Tue, 9 Jun 2026 20:36:00 +0200
Subject: [PATCH] feat(onboard): implement 40-register + 50-verify, remove dead scaffold
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 40-register.sh: idempotent — dopisuje lustro do topology.yaml + tworzy
  hosts/<node>/services.yaml, commituje na bieżącym branchu (bez push)
- 50-verify.sh: 4 checki — node-agent running, eventy, observer restart +
  heartbeat poll, world/nodes.json; tabela pass/fail; exit 1 on failure
- 40-deploy-node-agent.sh: usunięty (martwy scaffold; deploy w 30-node-agent.sh)

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (ignored by git am — not part of the commit message):
- 40-register.sh: the commit decision and the commit itself are now scoped to
  the two registration paths, so unrelated dirty/staged files neither break a
  re-run nor get swept into the commit; the duplicated services.yaml heredoc
  is rendered by one helper; the unused _changed_files array is gone; a missing
  topology.yaml is an error instead of being created by the append.
- 50-verify.sh: a failing `docker restart` is recorded as FAIL so the summary
  table is always printed; non-numeric heartbeat ages count as stale; VPS_*
  values can be overridden from the environment; typo fixed in one warning.
- This patch text was reconstructed from a whitespace-collapsed copy: YAML
  indentation inside the heredocs, the <node>/<service> placeholders and the
  removed (pre-image) lines are best-effort and must be checked against the
  tree; author e-mail addresses and post-image blob hashes were not restored.

 scripts/onboard/steps/40-deploy-node-agent.sh |  16 --
 scripts/onboard/steps/40-register.sh          | 129 +++++++++++++
 scripts/onboard/steps/50-verify.sh            | 183 ++++++++++++++++--
 3 files changed, 302 insertions(+), 26 deletions(-)
 delete mode 100755 scripts/onboard/steps/40-deploy-node-agent.sh
 create mode 100755 scripts/onboard/steps/40-register.sh

diff --git a/scripts/onboard/steps/40-deploy-node-agent.sh b/scripts/onboard/steps/40-deploy-node-agent.sh
deleted file mode 100755
index 54406be..0000000
--- a/scripts/onboard/steps/40-deploy-node-agent.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-# scripts/onboard/steps/40-deploy-node-agent.sh — deploy node-agent to remote node
-#
-# TODO: rsync services/node-agent/ and hosts/<node>/runtime/node-agent/ to remote
-# TODO: populate /opt/homelab/config/node-agent/.env from env.example + operator-provided secrets
-# TODO: docker compose up -d (or docker-compose for CHELSTY nodes using v1)
-# TODO: wait for healthcheck to pass
-# TODO: emit deployment_completed event via scripts/lib/events.sh
-# TODO: gate on git_control flag — if false, skip and print manual instructions
-
-set -euo pipefail
-: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
-source "${REPO_ROOT}/scripts/onboard/lib/remote.sh"
-
-STEP_NAME="40-deploy-node-agent"
-step "[$STEP_NAME] TODO — not yet implemented"
diff --git a/scripts/onboard/steps/40-register.sh b/scripts/onboard/steps/40-register.sh
new file mode 100755
index 0000000..d3e1eff
--- /dev/null
+++ b/scripts/onboard/steps/40-register.sh
@@ -0,0 +1,129 @@
+#!/usr/bin/env bash
+# scripts/onboard/steps/40-register.sh — record the node in the inventory and commit on the branch
+#
+# Effects (all idempotent):
+#   1. Appends a <node> block to inventory/topology.yaml
+#   2. Creates hosts/<node>/services.yaml if it does not exist
+#   3. git add + git commit on the current branch (NO push — merging is left to the operator)
+#
+# Reloading the observer is deliberately outside this step — it is done manually after
+# merge→master, git pull on the VPS and a run of 50-verify.sh.
+#
+# Env: REPO_ROOT and NODE_YAML are required; DRY_RUN=1 previews without writing.
+
+set -euo pipefail
+
+STEP_NAME="40-register"
+
+: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
+: "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}"
+: "${DRY_RUN:=0}"
+
+if ! declare -f log >/dev/null 2>&1; then
+  # shellcheck source=../lib/common.sh
+  source "${REPO_ROOT}/scripts/onboard/lib/common.sh"
+fi
+
+NODE_ENTRY=$(yaml_get "${NODE_YAML}" "tailscale.hostname")
+[[ -z "${NODE_ENTRY}" ]] && die "tailscale.hostname not set in ${NODE_YAML}"
+
+# git is always given repo-relative paths; file I/O uses the absolute ones.
+TOPOLOGY_REL="inventory/topology.yaml"
+SERVICES_REL="hosts/${NODE_ENTRY}/services.yaml"
+TOPOLOGY="${REPO_ROOT}/${TOPOLOGY_REL}"
+SERVICES_YAML="${REPO_ROOT}/${SERVICES_REL}"
+
+# Appending to a missing file would create a topology that holds only this block.
+[[ -f "${TOPOLOGY}" ]] || die "topology file not found: ${TOPOLOGY}"
+
+# Becomes 1 when a dry run would have modified a file, so step 3 can preview the commit.
+_pending=0
+
+# Print the default services.yaml for this node (shared by the dry-run preview and the write).
+_render_services_yaml() {
+  cat << EOF
+host: ${NODE_ENTRY}
+
+services:
+  node-agent:
+    role: node-stability-monitor
+    deployment_model: docker-compose
+    exposure: local-only
+    offline_required: true
+    depends_on:
+      local: []
+      external: []
+    runtime:
+      config_path: /opt/homelab/config/node-agent
+      data_path: /opt/homelab/state
+      logs_path: /opt/homelab/events
+EOF
+}
+
+# ── 1. inventory/topology.yaml ────────────────────────────────────────────────
+step "[${STEP_NAME}] 1/3 inventory/topology.yaml"
+
+# NOTE(review): the two-space indent assumes node entries sit directly under a
+# top-level mapping key that ends topology.yaml — confirm against the real file.
+_TOPOLOGY_BLOCK=$(cat << 'EOF'
+
+  PLACEHOLDER:
+    roles:
+      - edge
+    services:
+      - node-agent
+EOF
+)
+# Replace the PLACEHOLDER with the actual node name
+_TOPOLOGY_BLOCK="${_TOPOLOGY_BLOCK//PLACEHOLDER/${NODE_ENTRY}}"
+
+if grep -q "^  ${NODE_ENTRY}:" "${TOPOLOGY}"; then
+  log "${NODE_ENTRY} already present in topology.yaml — skip"
+elif [[ "${DRY_RUN}" == 1 ]]; then
+  dryrun "Would append to ${TOPOLOGY}:"
+  printf '%s\n' "${_TOPOLOGY_BLOCK}"
+  _pending=1
+else
+  printf '%s\n' "${_TOPOLOGY_BLOCK}" >> "${TOPOLOGY}"
+  log "Appended ${NODE_ENTRY} block to topology.yaml"
+fi
+
+# ── 2. hosts/<node>/services.yaml ─────────────────────────────────────────────
+step "[${STEP_NAME}] 2/3 hosts/${NODE_ENTRY}/services.yaml"
+
+if [[ -f "${SERVICES_YAML}" ]]; then
+  log "services.yaml already exists — skip"
+elif [[ "${DRY_RUN}" == 1 ]]; then
+  dryrun "Would create ${SERVICES_YAML}:"
+  _render_services_yaml
+  _pending=1
+else
+  mkdir -p "${REPO_ROOT}/hosts/${NODE_ENTRY}"
+  _render_services_yaml > "${SERVICES_YAML}"
+  log "Created ${SERVICES_YAML}"
+fi
+
+# ── 3. git commit ─────────────────────────────────────────────────────────────
+step "[${STEP_NAME}] 3/3 git commit"
+
+cd "${REPO_ROOT}"
+
+# Only these two paths decide whether a commit is needed; unrelated work in the
+# tree (staged or not) must neither trigger the commit nor be swept into it.
+# --porcelain also reports untracked files, which plain `git diff` does not.
+_dirty=$(git status --porcelain -- "${TOPOLOGY_REL}" "${SERVICES_REL}")
+
+if [[ -z "${_dirty}" && "${_pending}" -eq 0 ]]; then
+  log "Nothing to commit — ${NODE_ENTRY} already registered and committed"
+else
+  run git add -- "${TOPOLOGY_REL}" "${SERVICES_REL}"
+  # The trailing pathspec limits the commit to the two registration files.
+  run git commit -m "feat(onboard): register ${NODE_ENTRY} in topology + services.yaml" \
+    -- "${TOPOLOGY_REL}" "${SERVICES_REL}"
+  if [[ "${DRY_RUN}" != 1 ]]; then
+    log "Committed on $(git branch --show-current)"
+    log "Next: agent.sh merge task/node-onboarding → master, git pull VPS, run 50-verify.sh"
+  fi
+fi
+
+log "[${STEP_NAME}] done"
diff --git a/scripts/onboard/steps/50-verify.sh b/scripts/onboard/steps/50-verify.sh
index 28d5c5b..2b9ff84 100755
--- a/scripts/onboard/steps/50-verify.sh
+++ b/scripts/onboard/steps/50-verify.sh
@@ -1,16 +1,179 @@
 #!/usr/bin/env bash
-# scripts/onboard/steps/50-verify.sh — end-to-end verification of onboarded node
+# scripts/onboard/steps/50-verify.sh — restart the observer + smoke-test the node in the panel
 #
-# TODO: rcheck SSH reachability
-# TODO: rrun docker ps — confirm node-agent container is running
-# TODO: check /opt/homelab/state/heartbeat timestamp is recent (< 5 min)
-# TODO: verify node appears in Observer world state (/opt/homelab/world/nodes.json on control node)
-# TODO: run services/<service>/healthcheck.sh for each enabled service
-# TODO: print pass/fail summary table; exit 1 if any check failed
+# Run AFTER: merging task/node-onboarding → master + git pull on the VPS.
+#
+# Checks:
+#   1. SSH <node>: node-agent container running
+#   2. SSH <node>: events present in /opt/homelab/events/<node>/
+#   3. SSH VPS: docker restart control-plane-observer + poll observer.heartbeat
+#   4. SSH VPS: <node> visible in /opt/homelab/world/nodes.json
+#
+# Every check is recorded (PASS / FAIL, or skip under DRY_RUN=1) and the summary table
+# is always printed. Exit 0 — no FAIL | Exit 1 — at least one FAIL.
+#
+# Env: REPO_ROOT and NODE_YAML are required; VPS_SSH_USER, VPS_SSH_HOST and
+# VPS_REPO_PATH may be overridden and otherwise keep the defaults set below.
 
 set -euo pipefail
-: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
-source "${REPO_ROOT}/scripts/onboard/lib/remote.sh"
 
 STEP_NAME="50-verify"
-step "[$STEP_NAME] TODO — not yet implemented"
+
+: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
+: "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}"
+: "${DRY_RUN:=0}"
+
+if ! declare -f log >/dev/null 2>&1; then
+  # shellcheck source=../lib/common.sh
+  source "${REPO_ROOT}/scripts/onboard/lib/common.sh"
+fi
+
+SSH_USER=$(yaml_get "${NODE_YAML}" "ssh_user")
+TS_HOSTNAME=$(yaml_get "${NODE_YAML}" "tailscale.hostname")
+[[ -z "${SSH_USER}" ]] && die "ssh_user not set in ${NODE_YAML}"
+[[ -z "${TS_HOSTNAME}" ]] && die "tailscale.hostname not set in ${NODE_YAML}"
+
+# Control-node (VPS) coordinates; values already present in the environment win.
+VPS_SSH_USER="${VPS_SSH_USER:-oskar}"
+VPS_SSH_HOST="${VPS_SSH_HOST:-100.95.58.48}"
+# shellcheck disable=SC2034  # not referenced by the checks in this script
+VPS_REPO_PATH="${VPS_REPO_PATH:-/home/oskar/homelab-codex-ws}"
+
+_SSH_OPTS=(-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes)
+
+# Run a command on the onboarded node / on the VPS; BatchMode makes ssh fail instead of prompting.
+_ssh_node() { ssh "${_SSH_OPTS[@]}" "${SSH_USER}@${TS_HOSTNAME}" -- "$@"; }
+_ssh_vps()  { ssh "${_SSH_OPTS[@]}" "${VPS_SSH_USER}@${VPS_SSH_HOST}" -- "$@"; }
+
+declare -A RESULTS=()
+
+# ── 1. node-agent running on <node> ───────────────────────────────────────────
+step "[${STEP_NAME}] 1/4 ${TS_HOSTNAME}: node-agent container"
+
+if [[ "${DRY_RUN}" == 1 ]]; then
+  dryrun "ssh ${SSH_USER}@${TS_HOSTNAME} docker ps --filter name=^node-agent\$"
+  RESULTS["node-agent-running"]="skip"
+elif _ssh_node "docker ps --filter name=^node-agent\$ --filter status=running --format '{{.Names}}'" 2>/dev/null \
+  | grep -q "node-agent"; then
+  log "OK: node-agent running"
+  # The status table is informational only; failing to fetch it must not fail the check.
+  _ssh_node "docker ps --filter name=node-agent --format 'table {{.Names}}\t{{.Status}}'" 2>/dev/null || true
+  RESULTS["node-agent-running"]="PASS"
+else
+  warn "FAIL: node-agent nie działa na ${TS_HOSTNAME}"
+  RESULTS["node-agent-running"]="FAIL"
+fi
+
+# ── 2. events in /opt/homelab/events/<node>/ ──────────────────────────────────
+step "[${STEP_NAME}] 2/4 ${TS_HOSTNAME}: eventy"
+
+if [[ "${DRY_RUN}" == 1 ]]; then
+  dryrun "ssh ${SSH_USER}@${TS_HOSTNAME} find /opt/homelab/events/${TS_HOSTNAME}/ -name '*.json'"
+  RESULTS["events-present"]="skip"
+elif _ssh_node "find /opt/homelab/events/${TS_HOSTNAME}/ -name '*.json' 2>/dev/null | head -1" 2>/dev/null \
+  | grep -q '\.json$'; then
+  # find recurses but the ls glob does not, so the listing may be empty — fall back to "?".
+  _latest=$(_ssh_node "ls -t /opt/homelab/events/${TS_HOSTNAME}/*.json 2>/dev/null | head -1" || echo "?")
+  log "OK: eventy obecne (ostatni: ${_latest:-?})"
+  RESULTS["events-present"]="PASS"
+else
+  warn "FAIL: brak eventów w /opt/homelab/events/${TS_HOSTNAME}/"
+  RESULTS["events-present"]="FAIL"
+fi
+
+# ── 3. observer restart + healthcheck ─────────────────────────────────────────
+step "[${STEP_NAME}] 3/4 VPS: restart control-plane-observer"
+
+if [[ "${DRY_RUN}" == 1 ]]; then
+  dryrun "ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} docker restart control-plane-observer"
+  dryrun "poll /opt/homelab/state/observer.heartbeat (max 30s)"
+  RESULTS["observer-healthy"]="skip"
+else
+  log "Restarting control-plane-observer na VPS..."
+  # A failed restart is recorded as FAIL rather than aborting through `set -e`,
+  # so check 4 and the summary table still run.
+  if ! _ssh_vps "docker restart control-plane-observer"; then
+    warn "FAIL: docker restart control-plane-observer nie powiódł się"
+    RESULTS["observer-healthy"]="FAIL"
+  else
+    log "Polling observer.heartbeat (max 30s)..."
+    _ok=0
+    for _i in {1..6}; do
+      sleep 5
+      _age=$(_ssh_vps "python3 -c \
+        \"import os,time; s=os.stat('/opt/homelab/state/observer.heartbeat'); \
+        print(int(time.time()-s.st_mtime))\" 2>/dev/null" || echo "999")
+      # Anything that is not a plain integer (e.g. empty output) counts as stale.
+      [[ "${_age}" =~ ^[0-9]+$ ]] || _age=999
+      if [[ "${_age}" -lt 20 ]]; then
+        log "OK: observer.heartbeat fresh (${_age}s temu)"
+        _ok=1
+        break
+      fi
+      log "  ... ${_i}×5s, heartbeat ${_age}s old..."
+    done
+
+    if [[ "${_ok}" -eq 1 ]]; then
+      RESULTS["observer-healthy"]="PASS"
+    else
+      warn "FAIL: observer.heartbeat nie odświeżony po 30s"
+      warn "Sprawdź: ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} docker logs control-plane-observer --tail 30"
+      RESULTS["observer-healthy"]="FAIL"
+    fi
+  fi
+fi
+
+# ── 4. <node> visible in world/nodes.json ─────────────────────────────────────
+step "[${STEP_NAME}] 4/4 VPS: ${TS_HOSTNAME} w world/nodes.json"
+
+if [[ "${DRY_RUN}" == 1 ]]; then
+  dryrun "ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} python3 -c \"json.load(.../world/nodes.json)['${TS_HOSTNAME}']\""
+  RESULTS["world-state"]="skip"
+else
+  _node_status=$(_ssh_vps "python3 -c \"
+import json, sys
+try:
+    d = json.load(open('/opt/homelab/world/nodes.json'))
+    node = d.get('${TS_HOSTNAME}', {})
+    print(node.get('status', 'missing'))
+except Exception as e:
+    print('error:' + str(e))
+\"" 2>/dev/null || echo "ssh-error")
+
+  case "${_node_status}" in
+    online|offline)
+      log "OK: ${TS_HOSTNAME} w world/nodes.json (status=${_node_status})"
+      RESULTS["world-state"]="PASS"
+      ;;
+    missing)
+      warn "FAIL: ${TS_HOSTNAME} nie ma wpisu w world/nodes.json"
+      warn "Możliwa przyczyna: observer nie przetworzył jeszcze eventów (poczekaj 60s i spróbuj ponownie)"
+      RESULTS["world-state"]="FAIL"
+      ;;
+    *)
+      warn "FAIL: nieoczekiwana odpowiedź: ${_node_status}"
+      RESULTS["world-state"]="FAIL"
+      ;;
+  esac
+fi
+
+# ── summary table ─────────────────────────────────────────────────────────────
+echo ""
+printf '%s\n' "══════════════════════════════════════════"
+printf "  %-30s %s\n" "CHECK" "RESULT"
+printf '%s\n' "──────────────────────────────────────────"
+for _key in "node-agent-running" "events-present" "observer-healthy" "world-state"; do
+  _val="${RESULTS[${_key}]:-???}"
+  printf "  %-30s %s\n" "${_key}" "${_val}"
+done
+printf '%s\n' "══════════════════════════════════════════"
+echo ""
+
+for _val in "${RESULTS[@]}"; do
+  if [[ "${_val}" == "FAIL" ]]; then
+    warn "Verify: co najmniej jeden check nie przeszedł"
+    exit 1
+  fi
+done
+
+log "[${STEP_NAME}] Verify OK — ${TS_HOSTNAME} zarejestrowany i widoczny w panelu"