feat(onboard): implement 40-register + 50-verify, remove dead scaffold
- 40-register.sh: idempotent — dopisuje lustro do topology.yaml + tworzy hosts/<node>/services.yaml, commituje na bieżącym branchu (bez push) - 50-verify.sh: 4 checki — node-agent running, eventy, observer restart + heartbeat poll, world/nodes.json; tabela pass/fail; exit 1 on failure - 40-deploy-node-agent.sh: usunięty (martwy scaffold; deploy w 30-node-agent.sh) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
a99bf9dadc
commit
1304c8449f
|
|
@ -1,16 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# scripts/onboard/steps/40-deploy-node-agent.sh — deploy node-agent to remote node
|
||||
#
|
||||
# TODO: rsync services/node-agent/ and hosts/<node>/runtime/node-agent/ to remote
|
||||
# TODO: populate /opt/homelab/config/node-agent/.env from env.example + operator-provided secrets
|
||||
# TODO: docker compose up -d (or docker-compose for CHELSTY nodes using v1)
|
||||
# TODO: wait for healthcheck to pass
|
||||
# TODO: emit deployment_completed event via scripts/lib/events.sh
|
||||
# TODO: gate on git_control flag — if false, skip and print manual instructions
|
||||
|
||||
set -euo pipefail
|
||||
: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
|
||||
source "${REPO_ROOT}/scripts/onboard/lib/remote.sh"
|
||||
|
||||
STEP_NAME="40-deploy-node-agent"
|
||||
step "[$STEP_NAME] TODO — not yet implemented"
|
||||
140
scripts/onboard/steps/40-register.sh
Executable file
140
scripts/onboard/steps/40-register.sh
Executable file
|
|
@ -0,0 +1,140 @@
|
|||
#!/usr/bin/env bash
|
||||
# scripts/onboard/steps/40-register.sh — wpisz node do inventory i commituj na branchu
|
||||
#
|
||||
# Efekty (wszystkie idempotentne):
|
||||
# 1. Dopisuje blok <node> do inventory/topology.yaml
|
||||
# 2. Tworzy hosts/<node>/services.yaml jeśli nie istnieje
|
||||
# 3. git add + git commit na aktualnym branchu (NIE push — merge należy do operatora)
|
||||
#
|
||||
# Reload observera celowo poza tym krokiem — wykonywany ręcznie po merge→master,
|
||||
# git pull na VPS i uruchomieniu 50-verify.sh.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
STEP_NAME="40-register"
|
||||
|
||||
: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
|
||||
: "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}"
|
||||
: "${DRY_RUN:=0}"
|
||||
|
||||
if ! declare -f log >/dev/null 2>&1; then
|
||||
# shellcheck source=../lib/common.sh
|
||||
source "${REPO_ROOT}/scripts/onboard/lib/common.sh"
|
||||
fi
|
||||
|
||||
NODE_ENTRY=$(yaml_get "${NODE_YAML}" "tailscale.hostname")
|
||||
[[ -z "${NODE_ENTRY}" ]] && die "tailscale.hostname not set in ${NODE_YAML}"
|
||||
|
||||
TOPOLOGY="${REPO_ROOT}/inventory/topology.yaml"
|
||||
SERVICES_YAML="${REPO_ROOT}/hosts/${NODE_ENTRY}/services.yaml"
|
||||
|
||||
# ── 1. inventory/topology.yaml ────────────────────────────────────────────────
|
||||
step "[${STEP_NAME}] 1/3 inventory/topology.yaml"
|
||||
|
||||
_TOPOLOGY_BLOCK=$(cat << 'EOF'
|
||||
|
||||
PLACEHOLDER:
|
||||
roles:
|
||||
- edge
|
||||
services:
|
||||
- node-agent
|
||||
EOF
|
||||
)
|
||||
# Replace the PLACEHOLDER with the actual node name
|
||||
_TOPOLOGY_BLOCK="${_TOPOLOGY_BLOCK//PLACEHOLDER/${NODE_ENTRY}}"
|
||||
|
||||
if grep -q "^ ${NODE_ENTRY}:" "${TOPOLOGY}"; then
|
||||
log "${NODE_ENTRY} already present in topology.yaml — skip"
|
||||
else
|
||||
if [ "${DRY_RUN:-0}" = 1 ]; then
|
||||
dryrun "Would append to ${TOPOLOGY}:"
|
||||
echo "${_TOPOLOGY_BLOCK}"
|
||||
else
|
||||
printf '%s\n' "${_TOPOLOGY_BLOCK}" >> "${TOPOLOGY}"
|
||||
log "Appended ${NODE_ENTRY} block to topology.yaml"
|
||||
fi
|
||||
fi
|
||||
|
||||
# ── 2. hosts/<node>/services.yaml ────────────────────────────────────────────
|
||||
step "[${STEP_NAME}] 2/3 hosts/${NODE_ENTRY}/services.yaml"
|
||||
|
||||
if [[ -f "${SERVICES_YAML}" ]]; then
|
||||
log "services.yaml already exists — skip"
|
||||
else
|
||||
if [ "${DRY_RUN:-0}" = 1 ]; then
|
||||
dryrun "Would create ${SERVICES_YAML}:"
|
||||
cat << EOF
|
||||
host: ${NODE_ENTRY}
|
||||
|
||||
services:
|
||||
node-agent:
|
||||
role: node-stability-monitor
|
||||
deployment_model: docker-compose
|
||||
exposure: local-only
|
||||
offline_required: true
|
||||
depends_on:
|
||||
local: []
|
||||
external: []
|
||||
runtime:
|
||||
config_path: /opt/homelab/config/node-agent
|
||||
data_path: /opt/homelab/state
|
||||
logs_path: /opt/homelab/events
|
||||
EOF
|
||||
else
|
||||
mkdir -p "${REPO_ROOT}/hosts/${NODE_ENTRY}"
|
||||
cat > "${SERVICES_YAML}" << EOF
|
||||
host: ${NODE_ENTRY}
|
||||
|
||||
services:
|
||||
node-agent:
|
||||
role: node-stability-monitor
|
||||
deployment_model: docker-compose
|
||||
exposure: local-only
|
||||
offline_required: true
|
||||
depends_on:
|
||||
local: []
|
||||
external: []
|
||||
runtime:
|
||||
config_path: /opt/homelab/config/node-agent
|
||||
data_path: /opt/homelab/state
|
||||
logs_path: /opt/homelab/events
|
||||
EOF
|
||||
log "Created ${SERVICES_YAML}"
|
||||
fi
|
||||
fi
|
||||
|
||||
# ── 3. git commit ─────────────────────────────────────────────────────────────
# Commits the registration on the current branch. Nothing is pushed here;
# merging is left to the operator.
#
# Every git command below is limited to the two files this step owns, so
# unrelated work in the checkout neither forces a commit attempt nor gets
# swept into the registration commit.
step "[${STEP_NAME}] 3/3 git commit"

cd "${REPO_ROOT}"

# Repo-relative paths written by steps 1/3 and 2/3.
_reg_paths=("inventory/topology.yaml" "hosts/${NODE_ENTRY}/services.yaml")

_needs_commit=0
if [[ -n "$(git status --porcelain -- "${_reg_paths[@]}")" ]]; then
  # Modified, staged or untracked content under one of our paths.
  _needs_commit=1
elif ! git ls-files --error-unmatch -- "${_reg_paths[@]}" >/dev/null 2>&1; then
  # Status is clean but at least one path is unknown to git (for example the
  # file was not created because this is a dry run). As before, go through
  # the add/commit path so the operator sees what would happen.
  _needs_commit=1
fi

if [[ "${_needs_commit}" -eq 1 ]]; then
  run git add -- "${_reg_paths[@]}"
  # The trailing pathspec makes git commit only these files, even when other
  # changes are already staged in the index.
  run git commit -m "feat(onboard): register ${NODE_ENTRY} in topology + services.yaml" -- "${_reg_paths[@]}"
  if [ "${DRY_RUN:-0}" != 1 ]; then
    log "Committed on $(git branch --show-current)"
    log "Next: agent.sh merge task/node-onboarding → master, git pull VPS, run 50-verify.sh"
  fi
else
  log "Nothing to commit — ${NODE_ENTRY} already registered and committed"
fi
|
||||
|
||||
log "[${STEP_NAME}] done"
|
||||
|
|
@ -1,16 +1,160 @@
|
|||
#!/usr/bin/env bash
|
||||
# scripts/onboard/steps/50-verify.sh — end-to-end verification of onboarded node
|
||||
# scripts/onboard/steps/50-verify.sh — restart observera + smoke test węzła w panelu
|
||||
#
|
||||
# TODO: rcheck SSH reachability
|
||||
# TODO: rrun docker ps — confirm node-agent container is running
|
||||
# TODO: check /opt/homelab/state/heartbeat timestamp is recent (< 5 min)
|
||||
# TODO: verify node appears in Observer world state (/opt/homelab/world/nodes.json on control node)
|
||||
# TODO: run services/<service>/healthcheck.sh for each enabled service
|
||||
# TODO: print pass/fail summary table; exit 1 if any check failed
|
||||
# Uruchamiaj PO: merge task/node-onboarding → master + git pull na VPS.
|
||||
#
|
||||
# Sprawdzenia:
|
||||
# 1. SSH <node>: node-agent container running
|
||||
# 2. SSH <node>: eventy obecne w /opt/homelab/events/<node>/
|
||||
# 3. SSH VPS: docker restart control-plane-observer + poll observer.heartbeat
|
||||
# 4. SSH VPS: <node> widoczny w /opt/homelab/world/nodes.json
|
||||
#
|
||||
# Exit 0 — wszystkie OK | Exit 1 — co najmniej jedno FAIL (tabela podsumowująca)
|
||||
|
||||
set -euo pipefail
|
||||
: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
|
||||
source "${REPO_ROOT}/scripts/onboard/lib/remote.sh"
|
||||
|
||||
STEP_NAME="50-verify"
|
||||
step "[$STEP_NAME] TODO — not yet implemented"
|
||||
|
||||
: "${REPO_ROOT:?REPO_ROOT is not set — run via onboard.sh}"
|
||||
: "${NODE_YAML:?NODE_YAML is not set — run via onboard.sh}"
|
||||
: "${DRY_RUN:=0}"
|
||||
|
||||
if ! declare -f log >/dev/null 2>&1; then
|
||||
# shellcheck source=../lib/common.sh
|
||||
source "${REPO_ROOT}/scripts/onboard/lib/common.sh"
|
||||
fi
|
||||
|
||||
SSH_USER=$(yaml_get "${NODE_YAML}" "ssh_user")
|
||||
TS_HOSTNAME=$(yaml_get "${NODE_YAML}" "tailscale.hostname")
|
||||
[[ -z "${SSH_USER}" ]] && die "ssh_user not set in ${NODE_YAML}"
|
||||
[[ -z "${TS_HOSTNAME}" ]] && die "tailscale.hostname not set in ${NODE_YAML}"
|
||||
|
||||
# Connection settings for the control-plane VPS. Each value can be overridden
# from the environment; when unset or empty, the previously hard-coded value is
# used, so existing invocations behave exactly as before.
VPS_SSH_USER="${VPS_SSH_USER:-oskar}"
VPS_SSH_HOST="${VPS_SSH_HOST:-100.95.58.48}"
# NOTE(review): VPS_REPO_PATH is not referenced in the visible part of this
# script; kept for compatibility.
VPS_REPO_PATH="${VPS_REPO_PATH:-/home/oskar/homelab-codex-ws}"
|
||||
|
||||
_SSH_OPTS=(-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes)
|
||||
|
||||
# Run a command on the node being verified, over SSH.
# Globals:   _SSH_OPTS (read), SSH_USER (read), TS_HOSTNAME (read)
# Arguments: the remote command; ssh joins multiple words with spaces before
#            handing them to the remote shell, so callers in this script pass
#            one pre-quoted string.
# Returns:   the exit status of ssh.
_ssh_node() {
  local target="${SSH_USER}@${TS_HOSTNAME}"
  ssh "${_SSH_OPTS[@]}" "${target}" -- "$@"
}
|
||||
# Run a command on the control-plane VPS, over SSH.
# Globals:   _SSH_OPTS (read), VPS_SSH_USER (read), VPS_SSH_HOST (read)
# Arguments: the remote command; ssh joins multiple words with spaces before
#            handing them to the remote shell, so callers in this script pass
#            one pre-quoted string.
# Returns:   the exit status of ssh.
_ssh_vps() {
  local -a ssh_cmd=(ssh "${_SSH_OPTS[@]}" "${VPS_SSH_USER}@${VPS_SSH_HOST}" --)
  "${ssh_cmd[@]}" "$@"
}
|
||||
|
||||
declare -A RESULTS=()
|
||||
|
||||
# ── 1. node-agent running on <node> ──────────────────────────────────────────
|
||||
step "[${STEP_NAME}] 1/4 ${TS_HOSTNAME}: node-agent container"
|
||||
|
||||
if [ "${DRY_RUN:-0}" = 1 ]; then
|
||||
dryrun "ssh ${SSH_USER}@${TS_HOSTNAME} docker ps --filter name=^node-agent\$"
|
||||
RESULTS["node-agent-running"]="skip"
|
||||
elif _ssh_node "docker ps --filter name=^node-agent\$ --filter status=running --format '{{.Names}}'" 2>/dev/null \
|
||||
| grep -q "node-agent"; then
|
||||
log "OK: node-agent running"
|
||||
_ssh_node "docker ps --filter name=node-agent --format 'table {{.Names}}\t{{.Status}}'" 2>/dev/null || true
|
||||
RESULTS["node-agent-running"]="PASS"
|
||||
else
|
||||
warn "FAIL: node-agent nie działa na ${TS_HOSTNAME}"
|
||||
RESULTS["node-agent-running"]="FAIL"
|
||||
fi
|
||||
|
||||
# ── 2. eventy w /opt/homelab/events/<node>/ ───────────────────────────────────
|
||||
step "[${STEP_NAME}] 2/4 ${TS_HOSTNAME}: eventy"
|
||||
|
||||
if [ "${DRY_RUN:-0}" = 1 ]; then
|
||||
dryrun "ssh ${SSH_USER}@${TS_HOSTNAME} find /opt/homelab/events/${TS_HOSTNAME}/ -name '*.json'"
|
||||
RESULTS["events-present"]="skip"
|
||||
elif _ssh_node "find /opt/homelab/events/${TS_HOSTNAME}/ -name '*.json' 2>/dev/null | head -1" 2>/dev/null \
|
||||
| grep -q ".json"; then
|
||||
_latest=$(_ssh_node "ls -t /opt/homelab/events/${TS_HOSTNAME}/*.json 2>/dev/null | head -1" || echo "?")
|
||||
log "OK: eventy obecne (ostatni: ${_latest})"
|
||||
RESULTS["events-present"]="PASS"
|
||||
else
|
||||
warn "FAIL: brak eventów w /opt/homelab/events/${TS_HOSTNAME}/"
|
||||
RESULTS["events-present"]="FAIL"
|
||||
fi
|
||||
|
||||
# ── 3. observer restart + heartbeat check ─────────────────────────────────────
# Restarts the control-plane-observer container on the VPS, then polls the
# age of /opt/homelab/state/observer.heartbeat every 5 s (6 attempts, 30 s in
# total) until it is younger than 20 s. The outcome is stored in
# RESULTS["observer-healthy"] as PASS, FAIL or skip (dry run).
step "[${STEP_NAME}] 3/4 VPS: restart control-plane-observer"

if [ "${DRY_RUN:-0}" = 1 ]; then
  dryrun "ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} docker restart control-plane-observer"
  dryrun "poll /opt/homelab/state/observer.heartbeat (max 30s)"
  RESULTS["observer-healthy"]="skip"
else
  log "Restarting control-plane-observer na VPS..."
  # Guarded on purpose: an unguarded failure would trip `set -e` and abort the
  # script before the summary table is printed. Like every other check, a
  # failed restart is recorded as FAIL and verification continues.
  if ! _ssh_vps "docker restart control-plane-observer"; then
    warn "FAIL: docker restart control-plane-observer nie powiódł się"
    warn "Sprawdź: ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} docker logs control-plane-observer --tail 30"
    RESULTS["observer-healthy"]="FAIL"
  else
    log "Polling observer.heartbeat (max 30s)..."
    _ok=0
    for _i in 1 2 3 4 5 6; do
      sleep 5
      # Heartbeat age in seconds, computed on the VPS from the file's mtime;
      # 999 stands in for "could not be determined".
      _age=$(_ssh_vps "python3 -c \"import os,time; s=os.stat('/opt/homelab/state/observer.heartbeat'); print(int(time.time()-s.st_mtime))\" 2>/dev/null" || echo "999")
      # Anything that is not a plain non-negative integer (empty output, a
      # login banner, partial output followed by the fallback) counts as stale
      # rather than reaching the numeric comparison.
      [[ "${_age}" =~ ^[0-9]+$ ]] || _age=999
      if [[ "${_age}" -lt 20 ]]; then
        log "OK: observer.heartbeat fresh (${_age}s temu)"
        _ok=1
        break
      fi
      log " ... ${_i}×5s, heartbeat ${_age}s old..."
    done

    if [[ "${_ok}" -eq 1 ]]; then
      RESULTS["observer-healthy"]="PASS"
    else
      warn "FAIL: observer.heartbeat nie odświeżony po 30s"
      warn "Sprawdź: ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} docker logs control-plane-observer --tail 30"
      RESULTS["observer-healthy"]="FAIL"
    fi
  fi
fi
|
||||
|
||||
# ── 4. <node> widoczny w world/nodes.json ─────────────────────────────────────
|
||||
step "[${STEP_NAME}] 4/4 VPS: ${TS_HOSTNAME} w world/nodes.json"
|
||||
|
||||
if [ "${DRY_RUN:-0}" = 1 ]; then
|
||||
dryrun "ssh ${VPS_SSH_USER}@${VPS_SSH_HOST} python3 -c \"json.load(.../world/nodes.json)['${TS_HOSTNAME}']\""
|
||||
RESULTS["world-state"]="skip"
|
||||
else
|
||||
_node_status=$(_ssh_vps "python3 -c \"
|
||||
import json, sys
|
||||
try:
|
||||
d = json.load(open('/opt/homelab/world/nodes.json'))
|
||||
node = d.get('${TS_HOSTNAME}', {})
|
||||
print(node.get('status', 'missing'))
|
||||
except Exception as e:
|
||||
print('error:' + str(e))
|
||||
\"" 2>/dev/null || echo "ssh-error")
|
||||
|
||||
# Translate the status string obtained for this node from world/nodes.json
# into a PASS/FAIL entry. `_node_status` is the node's "status" value,
# "missing" when the node has no entry or no status, "error:<msg>" when the
# remote Python raised, or "ssh-error" when the SSH call itself failed.
case "${_node_status}" in
  online|offline)
    # Either status proves the observer knows the node; reachability is
    # covered by the earlier checks.
    log "OK: ${TS_HOSTNAME} w world/nodes.json (status=${_node_status})"
    RESULTS["world-state"]="PASS"
    ;;
  missing)
    warn "FAIL: ${TS_HOSTNAME} nie ma wpisu w world/nodes.json"
    warn "Możliwa przyczyna: observer nie przetworzył jeszcze eventów (poczekaj 60s i spróbuj ponownie)"
    RESULTS["world-state"]="FAIL"
    ;;
  *)
    warn "FAIL: nieoczekiwana odpowiedź: ${_node_status}"
    RESULTS["world-state"]="FAIL"
    ;;
esac
|
||||
fi
|
||||
|
||||
# ── tabela podsumowująca ──────────────────────────────────────────────────────
|
||||
echo ""
|
||||
printf '%s\n' "══════════════════════════════════════════"
|
||||
printf " %-30s %s\n" "CHECK" "RESULT"
|
||||
printf '%s\n' "──────────────────────────────────────────"
|
||||
for _key in "node-agent-running" "events-present" "observer-healthy" "world-state"; do
|
||||
_val="${RESULTS[${_key}]:-???}"
|
||||
printf " %-30s %s\n" "${_key}" "${_val}"
|
||||
done
|
||||
printf '%s\n' "══════════════════════════════════════════"
|
||||
echo ""
|
||||
|
||||
for _val in "${RESULTS[@]}"; do
|
||||
[[ "${_val}" == "FAIL" ]] && { warn "Verify: co najmniej jeden check nie przeszedł"; exit 1; }
|
||||
done
|
||||
|
||||
log "[${STEP_NAME}] Verify OK — ${TS_HOSTNAME} zarejestrowany i widoczny w panelu"
|
||||
|
|
|
|||
Loading…
Reference in a new issue