#!/usr/bin/env bash
# scripts/onboard/steps/50-verify.sh — restart the observer and smoke-test the node in the panel
#
# Run AFTER: merging task/node-onboarding → master and `git pull` on the VPS.
#
# Checks:
#   1. SSH node: node-agent container is running
#   2. SSH node: events exist under /opt/homelab/events/<tailscale hostname>/
#   3. SSH VPS:  docker restart control-plane-observer + poll observer.heartbeat
#   4. SSH VPS:  node has an entry in /opt/homelab/world/nodes.json
#
# Environment: REPO_ROOT and NODE_YAML are required; DRY_RUN is optional (default 0).
#
# Exit 0 — all checks OK | Exit 1 — at least one FAIL (the summary table is printed first)

set -euo pipefail

STEP_NAME="50-verify"

: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
: "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}"
: "${DRY_RUN:=0}"

# Load the shared helpers only when they are not already defined in this shell.
# NOTE(review): step/warn/die/dryrun/yaml_get are presumably defined by common.sh
# alongside log — confirm.
if ! declare -f log >/dev/null 2>&1; then
  # shellcheck source=../lib/common.sh
  source "${REPO_ROOT}/scripts/onboard/lib/common.sh"
fi

SSH_USER=$(yaml_get "${NODE_YAML}" "ssh_user")
TS_HOSTNAME=$(yaml_get "${NODE_YAML}" "tailscale.hostname")
[[ -n "${SSH_USER}" ]] || die "ssh_user not set in ${NODE_YAML}"
[[ -n "${TS_HOSTNAME}" ]] || die "tailscale.hostname not set in ${NODE_YAML}"

VPS_SSH_USER="oskar"
VPS_SSH_HOST="100.95.58.48"
# shellcheck disable=SC2034 — not referenced in this script; kept in case a caller reads it
VPS_REPO_PATH="/home/oskar/homelab-codex-ws"

# BatchMode=yes: never prompt (fail instead); accept-new: trust a host key on
# first contact but reject a changed one.
_SSH_OPTS=(-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes)

# Run a command string on the node being onboarded.
_ssh_node() { ssh "${_SSH_OPTS[@]}" "${SSH_USER}@${TS_HOSTNAME}" -- "$@"; }
# Run a command string on the VPS.
_ssh_vps() { ssh "${_SSH_OPTS[@]}" "${VPS_SSH_USER}@${VPS_SSH_HOST}" -- "$@"; }

# Check name → PASS | FAIL | skip
declare -A RESULTS=()

# ── 1. node-agent running on the node ────────────────────────────────────────
step "[${STEP_NAME}] 1/4 ${TS_HOSTNAME}: node-agent container"

if [[ "${DRY_RUN}" == 1 ]]; then
  dryrun "ssh ${SSH_USER}@${TS_HOSTNAME} docker ps --filter name=^node-agent\$"
  RESULTS["node-agent-running"]="skip"
# Capture the output instead of piping into `grep -q`: under pipefail an early
# grep exit can SIGPIPE ssh and turn a match into a failure.
elif _running=$(_ssh_node "docker ps --filter name=^node-agent\$ --filter status=running --format '{{.Names}}'" 2>/dev/null) \
  && [[ "${_running}" == *node-agent* ]]; then
  log "OK: node-agent running"
  # Informational only; a failure here must not affect the result.
  _ssh_node "docker ps --filter name=node-agent --format 'table {{.Names}}\t{{.Status}}'" 2>/dev/null || true
  RESULTS["node-agent-running"]="PASS"
else
  warn "FAIL: node-agent nie działa na ${TS_HOSTNAME}"
  RESULTS["node-agent-running"]="FAIL"
fi

# ── 2. events under /opt/homelab/events/<hostname>/ ──────────────────────────
step "[${STEP_NAME}] 2/4 ${TS_HOSTNAME}: eventy"

if [[ "${DRY_RUN}" == 1 ]]; then
  dryrun "ssh ${SSH_USER}@${TS_HOSTNAME} find /opt/homelab/events/${TS_HOSTNAME}/ -name '*.json'"
  RESULTS["events-present"]="skip"
elif _found=$(_ssh_node "find /opt/homelab/events/${TS_HOSTNAME}/ -name '*.json' 2>/dev/null | head -1" 2>/dev/null) \
  && [[ "${_found}" == *.json ]]; then
  _latest=$(_ssh_node "ls -t /opt/homelab/events/${TS_HOSTNAME}/*.json 2>/dev/null | head -1" || echo "?")
  # find is recursive but the ls glob is not; if the events live in a
  # subdirectory, report the file find saw rather than an empty name.
  [[ -n "${_latest}" ]] || _latest="${_found}"
  log "OK: eventy obecne (ostatni: ${_latest})"
  RESULTS["events-present"]="PASS"
else
  warn "FAIL: brak eventów w /opt/homelab/events/${TS_HOSTNAME}/"
  RESULTS["events-present"]="FAIL"
fi

# ── 3. restart the observer + health check ───────────────────────────────────
step "[${STEP_NAME}] 3/4 VPS: restart control-plane-observer"

# Remote probe: prints "<heartbeat mtime epoch> <age in seconds>".
_HEARTBEAT_PROBE="python3 -c \"import os,time; s=os.stat('/opt/homelab/state/observer.heartbeat'); print(int(s.st_mtime), int(time.time()-s.st_mtime))\" 2>/dev/null"

if [[ "${DRY_RUN}" == 1 ]]; then
  dryrun "ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} docker restart control-plane-observer"
  dryrun "poll /opt/homelab/state/observer.heartbeat (max 30s)"
  RESULTS["observer-healthy"]="skip"
else
  log "Restarting control-plane-observer na VPS..."
  # Take the VPS clock just before the restart (same SSH call, so no clock
  # skew): a heartbeat older than this was written by the previous container
  # and proves nothing. docker's own output goes to stderr so that stdout
  # carries only the epoch. Running inside `if` keeps a failed restart from
  # aborting the script under `set -e` before the summary table is printed.
  if _restart_epoch=$(_ssh_vps "date +%s && docker restart control-plane-observer >&2"); then
    # If the epoch is unusable, fall back to the age-only check.
    [[ "${_restart_epoch}" =~ ^[0-9]+$ ]] || _restart_epoch=0

    log "Polling observer.heartbeat (max 30s)..."
    _ok=0
    for _attempt in {1..6}; do
      sleep 5
      _probe_out=$(_ssh_vps "${_HEARTBEAT_PROBE}") || _probe_out=""
      # Validate before doing arithmetic: an empty string would evaluate
      # as 0 and be reported as a fresh heartbeat.
      if [[ "${_probe_out}" =~ ^([0-9]+)[[:space:]]+(-?[0-9]+)$ ]]; then
        _mtime="${BASH_REMATCH[1]}"
        _age="${BASH_REMATCH[2]}"
      else
        _mtime=0
        _age=999
      fi
      if ((_age < 20 && _mtime >= _restart_epoch)); then
        log "OK: observer.heartbeat fresh (${_age}s temu)"
        _ok=1
        break
      fi
      log " ... ${_attempt}×5s, heartbeat ${_age}s old..."
    done

    if ((_ok == 1)); then
      RESULTS["observer-healthy"]="PASS"
    else
      warn "FAIL: observer.heartbeat nie odświeżony po 30s"
      warn "Sprawdź: ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} docker logs control-plane-observer --tail 30"
      RESULTS["observer-healthy"]="FAIL"
    fi
  else
    warn "FAIL: docker restart control-plane-observer nie powiódł się"
    warn "Sprawdź: ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} docker logs control-plane-observer --tail 30"
    RESULTS["observer-healthy"]="FAIL"
  fi
fi

# ── 4. node visible in world/nodes.json ──────────────────────────────────────
step "[${STEP_NAME}] 4/4 VPS: ${TS_HOSTNAME} w world/nodes.json"

if [[ "${DRY_RUN}" == 1 ]]; then
  dryrun "ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} python3 -c \"json.load(.../world/nodes.json)['${TS_HOSTNAME}']\""
  RESULTS["world-state"]="skip"
else
  # The remote Python prints the node's status, 'missing' when there is no
  # entry/status, or 'error:<reason>' when the file cannot be read or parsed.
  # NOTE(review): TS_HOSTNAME is interpolated into the Python source — this
  # assumes the hostname contains no quote characters; confirm for NODE_YAML.
  _node_status=$(_ssh_vps "python3 -c \"
import json
try:
    d = json.load(open('/opt/homelab/world/nodes.json'))
    node = d.get('${TS_HOSTNAME}', {})
    print(node.get('status', 'missing'))
except Exception as e:
    print('error:' + str(e))
\"" 2>/dev/null || echo "ssh-error")

  case "${_node_status}" in
    online | offline)
      log "OK: ${TS_HOSTNAME} w world/nodes.json (status=${_node_status})"
      RESULTS["world-state"]="PASS"
      ;;
    missing)
      warn "FAIL: ${TS_HOSTNAME} nie ma wpisu w world/nodes.json"
      warn "Możliwa przyczyna: observer nie przetworzył jeszcze eventów (poczekaj 60s i spróbuj ponownie)"
      RESULTS["world-state"]="FAIL"
      ;;
    *)
      warn "FAIL: nieoczekiwana odpowiedź: ${_node_status}"
      RESULTS["world-state"]="FAIL"
      ;;
  esac
fi

# ── summary table ────────────────────────────────────────────────────────────
echo ""
printf '%s\n' "══════════════════════════════════════════"
printf " %-30s %s\n" "CHECK" "RESULT"
printf '%s\n' "──────────────────────────────────────────"
for _key in "node-agent-running" "events-present" "observer-healthy" "world-state"; do
  _val="${RESULTS[${_key}]:-???}"
  printf " %-30s %s\n" "${_key}" "${_val}"
done
printf '%s\n' "══════════════════════════════════════════"
echo ""

# Any FAIL fails the whole step; "skip" (dry run) does not.
for _val in "${RESULTS[@]}"; do
  if [[ "${_val}" == "FAIL" ]]; then
    warn "Verify: co najmniej jeden check nie przeszedł"
    exit 1
  fi
done

log "[${STEP_NAME}] Verify OK — ${TS_HOSTNAME} zarejestrowany i widoczny w panelu"