homelab-codex-ws/scripts/onboard/steps/50-verify.sh
Oskar Kapala 1304c8449f feat(onboard): implement 40-register + 50-verify, remove dead scaffold
- 40-register.sh: idempotent — dopisuje lustro do topology.yaml + tworzy
  hosts/<node>/services.yaml, commituje na bieżącym branchu (bez push)
- 50-verify.sh: 4 checki — node-agent running, eventy, observer restart +
  heartbeat poll, world/nodes.json; tabela pass/fail; exit 1 on failure
- 40-deploy-node-agent.sh: usunięty (martwy scaffold; deploy w 30-node-agent.sh)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-09 20:36:00 +02:00

161 lines
6.8 KiB
Bash
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# scripts/onboard/steps/50-verify.sh — restart observera + smoke test węzła w panelu
#
# Uruchamiaj PO: merge task/node-onboarding → master + git pull na VPS.
#
# Sprawdzenia:
# 1. SSH <node>: node-agent container running
# 2. SSH <node>: eventy obecne w /opt/homelab/events/<node>/
# 3. SSH VPS: docker restart control-plane-observer + poll observer.heartbeat
# 4. SSH VPS: <node> widoczny w /opt/homelab/world/nodes.json
#
# Exit 0 — wszystkie OK | Exit 1 — co najmniej jedno FAIL (tabela podsumowująca)
# Strict mode: abort on unhandled command failures, on unset variables and on
# any failing pipeline stage.
set -euo pipefail

STEP_NAME="50-verify"

# Required inputs. The error text points at onboard.sh as the expected caller.
: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
: "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}"
# DRY_RUN defaults to 0 when the caller did not set it.
: "${DRY_RUN:=0}"

# Load the shared helper library only when `log` is not already defined as a
# function in the current shell.
if ! declare -f log >/dev/null 2>&1; then
  # shellcheck source=../lib/common.sh
  source "${REPO_ROOT}/scripts/onboard/lib/common.sh"
fi

# Connection details of the node under verification, read from the node YAML.
SSH_USER=$(yaml_get "${NODE_YAML}" "ssh_user")
TS_HOSTNAME=$(yaml_get "${NODE_YAML}" "tailscale.hostname")
[[ -n "${SSH_USER}" ]] || die "ssh_user not set in ${NODE_YAML}"
[[ -n "${TS_HOSTNAME}" ]] || die "tailscale.hostname not set in ${NODE_YAML}"

# VPS connection details. Each value may be overridden from the environment;
# the defaults are the values that used to be hard-coded here.
VPS_SSH_USER="${VPS_SSH_USER:-oskar}"
VPS_SSH_HOST="${VPS_SSH_HOST:-100.95.58.48}"
# NOTE(review): VPS_REPO_PATH is not referenced in the visible part of this
# script — kept in case sourced helpers or later steps read it; confirm.
# shellcheck disable=SC2034
VPS_REPO_PATH="${VPS_REPO_PATH:-/home/oskar/homelab-codex-ws}"

# Non-interactive SSH: accept unknown host keys on first contact, give up after
# 10 s, and never prompt for a password (BatchMode).
_SSH_OPTS=(-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes)

# Run the given command line on the node being onboarded.
_ssh_node() { ssh "${_SSH_OPTS[@]}" "${SSH_USER}@${TS_HOSTNAME}" -- "$@"; }

# Run the given command line on the VPS.
_ssh_vps() { ssh "${_SSH_OPTS[@]}" "${VPS_SSH_USER}@${VPS_SSH_HOST}" -- "$@"; }

# Check name -> "PASS" | "FAIL" | "skip"; filled in by the checks below.
declare -A RESULTS=()
# ── Check 1/4: node-agent container is running on the node ───────────────────
step "[${STEP_NAME}] 1/4 ${TS_HOSTNAME}: node-agent container"
if [[ "${DRY_RUN:-0}" == 1 ]]; then
  # Dry run: only announce the command, record the check as skipped.
  dryrun "ssh ${SSH_USER}@${TS_HOSTNAME} docker ps --filter name=^node-agent\$"
  RESULTS["node-agent-running"]="skip"
else
  # Probe: names of running containers whose name matches ^node-agent$.
  _probe_cmd="docker ps --filter name=^node-agent\$ --filter status=running --format '{{.Names}}'"
  # Human-readable status table, shown only after a successful probe.
  _table_cmd="docker ps --filter name=node-agent --format 'table {{.Names}}\t{{.Status}}'"

  if _ssh_node "${_probe_cmd}" 2>/dev/null | grep -q "node-agent"; then
    log "OK: node-agent running"
    # Purely informational; a failure here must not affect the result.
    _ssh_node "${_table_cmd}" 2>/dev/null || true
    RESULTS["node-agent-running"]="PASS"
  else
    # Reached when SSH fails as well as when no matching container is listed.
    warn "FAIL: node-agent nie działa na ${TS_HOSTNAME}"
    RESULTS["node-agent-running"]="FAIL"
  fi
fi
# ── Check 2/4: event files exist under /opt/homelab/events/<node>/ ───────────
step "[${STEP_NAME}] 2/4 ${TS_HOSTNAME}: eventy"
if [[ "${DRY_RUN:-0}" == 1 ]]; then
  # Dry run: only announce the command, record the check as skipped.
  dryrun "ssh ${SSH_USER}@${TS_HOSTNAME} find /opt/homelab/events/${TS_HOSTNAME}/ -name '*.json'"
  RESULTS["events-present"]="skip"
else
  # Ask the node for at most one *.json path; any hit means events exist.
  _find_cmd="find /opt/homelab/events/${TS_HOSTNAME}/ -name '*.json' 2>/dev/null | head -1"
  # Most recently modified event file, used only for the log message.
  _newest_cmd="ls -t /opt/homelab/events/${TS_HOSTNAME}/*.json 2>/dev/null | head -1"

  if _ssh_node "${_find_cmd}" 2>/dev/null | grep -q ".json"; then
    # Fall back to "?" when the follow-up SSH call fails.
    _newest=$(_ssh_node "${_newest_cmd}" || echo "?")
    log "OK: eventy obecne (ostatni: ${_newest})"
    RESULTS["events-present"]="PASS"
  else
    warn "FAIL: brak eventów w /opt/homelab/events/${TS_HOSTNAME}/"
    RESULTS["events-present"]="FAIL"
  fi
fi
# ── Check 3/4: restart the observer on the VPS and wait for its heartbeat ────
# Restarts the control-plane-observer container, then polls the modification
# time of /opt/homelab/state/observer.heartbeat up to 6 times, 5 s apart.
# The check passes as soon as the file is younger than 20 s.
#
# A failed restart is recorded as FAIL instead of aborting the script through
# `set -e`, so the summary table at the end is always printed.
#
# NOTE(review): the freshness test only looks at the file age. A heartbeat
# written shortly before the restart can still be younger than 20 s on the
# first poll, so a PASS does not strictly prove the restarted observer wrote it.
step "[${STEP_NAME}] 3/4 VPS: restart control-plane-observer"
if [ "${DRY_RUN:-0}" = 1 ]; then
  dryrun "ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} docker restart control-plane-observer"
  dryrun "poll /opt/homelab/state/observer.heartbeat (max 30s)"
  RESULTS["observer-healthy"]="skip"
else
  _ok=0
  _fail_reason="FAIL: observer.heartbeat nie odświeżony po 30s"

  log "Restarting control-plane-observer na VPS..."
  if ! _ssh_vps "docker restart control-plane-observer"; then
    # SSH or docker failed: skip polling and report the real cause.
    _fail_reason="FAIL: nie udało się zrestartować control-plane-observer na VPS"
  else
    log "Polling observer.heartbeat (max 30s)..."
    for (( _i = 1; _i <= 6; _i++ )); do
      sleep 5
      # Heartbeat age in whole seconds, computed on the VPS. "999" stands in
      # for any failure (SSH error, missing file, python error).
      _age=$(_ssh_vps "python3 -c \
\"import os,time; s=os.stat('/opt/homelab/state/observer.heartbeat'); \
print(int(time.time()-s.st_mtime))\" 2>/dev/null" || echo "999")
      # Anything that is not a plain integer (empty output, login banner,
      # multi-line noise) is treated as stale rather than breaking -lt below.
      [[ "${_age}" =~ ^-?[0-9]+$ ]] || _age=999
      if [[ "${_age}" -lt 20 ]]; then
        log "OK: observer.heartbeat fresh (${_age}s temu)"
        _ok=1
        break
      fi
      log " ... ${_i}×5s, heartbeat ${_age}s old..."
    done
  fi

  if [[ "${_ok}" -eq 1 ]]; then
    RESULTS["observer-healthy"]="PASS"
  else
    warn "${_fail_reason}"
    warn "Sprawdź: ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} docker logs control-plane-observer --tail 30"
    RESULTS["observer-healthy"]="FAIL"
  fi
fi
# ── Check 4/4: the node has an entry in world/nodes.json on the VPS ──────────
# Runs a small Python snippet on the VPS that loads
# /opt/homelab/world/nodes.json and prints one of:
#   <status>      value of the node's "status" key
#   missing       no entry for the node, or the entry has no "status" key
#   error:<msg>   the file could not be read or parsed
# Locally, "ssh-error" is substituted when the SSH call itself fails.
# Only the statuses "online" and "offline" count as PASS.
step "[${STEP_NAME}] 4/4 VPS: ${TS_HOSTNAME} w world/nodes.json"
if [ "${DRY_RUN:-0}" = 1 ]; then
  dryrun "ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} python3 -c \"json.load(.../world/nodes.json)['${TS_HOSTNAME}']\""
  RESULTS["world-state"]="skip"
else
  # The hostname is interpolated into the Python source as a string literal,
  # so it must not contain quote characters.
  # The Python lines start at column 0 on purpose: they are part of the
  # quoted string and Python rejects an indented top-level statement.
  _node_status=$(_ssh_vps "python3 -c \"
import json
try:
    d = json.load(open('/opt/homelab/world/nodes.json'))
    node = d.get('${TS_HOSTNAME}', {})
    print(node.get('status', 'missing'))
except Exception as e:
    print('error:' + str(e))
\"" 2>/dev/null || echo "ssh-error")

  case "${_node_status}" in
    online|offline)
      log "OK: ${TS_HOSTNAME} w world/nodes.json (status=${_node_status})"
      RESULTS["world-state"]="PASS"
      ;;
    missing)
      warn "FAIL: ${TS_HOSTNAME} nie ma wpisu w world/nodes.json"
      # Fixed: the message contained a Cyrillic letter instead of Polish "ł".
      warn "Możliwa przyczyna: observer nie przetworzył jeszcze eventów (poczekaj 60s i spróbuj ponownie)"
      RESULTS["world-state"]="FAIL"
      ;;
    *)
      # Covers "error:<msg>", "ssh-error" and any unrecognised status value.
      warn "FAIL: nieoczekiwana odpowiedź: ${_node_status}"
      RESULTS["world-state"]="FAIL"
      ;;
  esac
fi
# ── Summary table and exit status ────────────────────────────────────────────
# Prints one row per check in a fixed order, then exits 1 if any recorded
# result is "FAIL". "skip" results (dry run) do not count as failures.
_rule_heavy="══════════════════════════════════════════"
_rule_light="──────────────────────────────────────────"
_row_fmt=" %-30s %s\n"
_check_order=("node-agent-running" "events-present" "observer-healthy" "world-state")

printf '\n'
printf '%s\n' "${_rule_heavy}"
# shellcheck disable=SC2059 — the format string is a fixed constant defined above
printf "${_row_fmt}" "CHECK" "RESULT"
printf '%s\n' "${_rule_light}"
for _check in "${_check_order[@]}"; do
  # A check that never recorded a result is shown as "???".
  # shellcheck disable=SC2059
  printf "${_row_fmt}" "${_check}" "${RESULTS[${_check}]:-???}"
done
printf '%s\n' "${_rule_heavy}"
printf '\n'

# Scan every recorded result; a single FAIL is enough to fail the step.
_any_failed=0
for _outcome in "${RESULTS[@]}"; do
  if [[ "${_outcome}" == "FAIL" ]]; then
    _any_failed=1
    break
  fi
done

if [[ "${_any_failed}" -eq 1 ]]; then
  warn "Verify: co najmniej jeden check nie przeszedł"
  exit 1
fi

log "[${STEP_NAME}] Verify OK — ${TS_HOSTNAME} zarejestrowany i widoczny w panelu"