feat(piha): brain-watchdog — external watchdog for control-plane
Polls /summary on VPS over Tailscale every 60s; computes freshness locally from the last_update epoch (never trusts self-reported status). Alerts via Telegram Bot API directly after 3 consecutive failures; sends a recovery message on heal. State (fail_count, alerted) is persisted to a volume so debounce survives restarts.

- services/brain-watchdog/: Python service, no external deps (stdlib only)
- hosts/piha/runtime/brain-watchdog/: override with mem_limit 64m
- hosts/piha/services.yaml + inventory/topology.yaml: manifest entries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
495741e7ac
commit
039f9f7247
|
|
@ -0,0 +1,4 @@
|
||||||
|
services:
|
||||||
|
brain-watchdog:
|
||||||
|
mem_limit: 64m
|
||||||
|
restart: unless-stopped
|
||||||
|
|
@ -29,3 +29,14 @@ services:
|
||||||
config_path: /opt/homelab/config/node-agent
|
config_path: /opt/homelab/config/node-agent
|
||||||
data_path: /opt/homelab/state
|
data_path: /opt/homelab/state
|
||||||
logs_path: /opt/homelab/events
|
logs_path: /opt/homelab/events
|
||||||
|
|
||||||
|
brain-watchdog:
|
||||||
|
role: control-plane-watchdog
|
||||||
|
deployment_model: docker-compose
|
||||||
|
exposure: private
|
||||||
|
offline_required: false
|
||||||
|
depends_on:
|
||||||
|
local: []
|
||||||
|
external: [control-plane]
|
||||||
|
runtime:
|
||||||
|
config_path: /opt/homelab/config/brain-watchdog
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,10 @@ nodes:
|
||||||
roles:
|
roles:
|
||||||
- infra
|
- infra
|
||||||
- monitoring
|
- monitoring
|
||||||
|
services:
|
||||||
|
- node-agent
|
||||||
|
- ha-diag-agent
|
||||||
|
- brain-watchdog
|
||||||
|
|
||||||
solaria:
|
solaria:
|
||||||
roles:
|
roles:
|
||||||
|
|
@ -28,6 +32,17 @@ nodes:
|
||||||
- edge
|
- edge
|
||||||
- ingress
|
- ingress
|
||||||
- control-plane
|
- control-plane
|
||||||
|
services:
|
||||||
|
# Repo-managed GitOps services (hosts/vps/services.yaml is authoritative)
|
||||||
|
- node-agent
|
||||||
|
- control-plane # executor, observer, supervisor, operator-ui
|
||||||
|
- node_exporter
|
||||||
|
- stability-agent
|
||||||
|
- npm # Nginx Proxy Manager — public ingress, TLS termination
|
||||||
|
- outline # Team wiki (outline + postgres + redis)
|
||||||
|
- joplin # Note sync server (joplin-server + postgres)
|
||||||
|
- ai-cluster # AI workers: codex-worker, openclaw, planner-worker,
|
||||||
|
# service-ops-worker, redis, mosquitto
|
||||||
|
|
||||||
chelsty-infra:
|
chelsty-infra:
|
||||||
site: chelsty
|
site: chelsty
|
||||||
|
|
|
||||||
9
services/brain-watchdog/Dockerfile
Normal file
9
services/brain-watchdog/Dockerfile
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
FROM python:3.11-slim

WORKDIR /app

# The package ends up at /app/src/brain_watchdog.
COPY src/ src/

# PYTHONPATH: `python -m` starts in /app, so only /app (not /app/src) is on
# the module search path by default; without this entry the CMD below cannot
# import `brain_watchdog`.
ENV PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app/src

CMD ["python", "-m", "brain_watchdog.main"]
|
||||||
30
services/brain-watchdog/docker-compose.yml
Normal file
30
services/brain-watchdog/docker-compose.yml
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
services:
|
||||||
|
brain-watchdog:
|
||||||
|
build: .
|
||||||
|
container_name: brain-watchdog
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
env_file:
|
||||||
|
- /opt/homelab/config/brain-watchdog/.env
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
- brain_watchdog_data:/data
|
||||||
|
|
||||||
|
healthcheck:
|
||||||
|
test:
|
||||||
|
- "CMD"
|
||||||
|
- "python"
|
||||||
|
- "-c"
|
||||||
|
- |
|
||||||
|
import os, time, json, sys
|
||||||
|
p = '/data/state.json'
|
||||||
|
if not os.path.exists(p): sys.exit(1)
|
||||||
|
age = time.time() - os.path.getmtime(p)
|
||||||
|
sys.exit(0 if age < 300 else 1)
|
||||||
|
interval: 1m
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 30s
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
brain_watchdog_data:
|
||||||
7
services/brain-watchdog/env.example
Normal file
7
services/brain-watchdog/env.example
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
CONTROL_PLANE_URL=
|
||||||
|
STALE_THRESHOLD=600
|
||||||
|
INTERVAL=60
|
||||||
|
FAILS_BEFORE_ALERT=3
|
||||||
|
TG_TOKEN=
|
||||||
|
TG_CHAT_ID=
|
||||||
|
HEALTHCHECKS_URL=
|
||||||
10
services/brain-watchdog/healthcheck.sh
Executable file
10
services/brain-watchdog/healthcheck.sh
Executable file
|
|
@ -0,0 +1,10 @@
|
||||||
|
#!/bin/sh
# Health probe: exit 0 only when /data/state.json exists and was modified
# less than 300 seconds (5 minutes) ago; exit 1 otherwise.
exec python - <<'PY'
import os
import sys
import time

state_path = '/data/state.json'

if not os.path.exists(state_path):
    sys.exit(1)

seconds_since_write = time.time() - os.path.getmtime(state_path)
sys.exit(0 if seconds_since_write < 300 else 1)
PY
|
||||||
34
services/brain-watchdog/service.yaml
Normal file
34
services/brain-watchdog/service.yaml
Normal file
|
|
@ -0,0 +1,34 @@
|
||||||
|
service:
|
||||||
|
name: brain-watchdog
|
||||||
|
owner_node: piha
|
||||||
|
exposure: private
|
||||||
|
description: >
|
||||||
|
External watchdog for the control-plane on VPS. Queries /summary over
|
||||||
|
Tailscale and alerts via Telegram Bot API directly — no dependency on the
|
||||||
|
control-plane itself. Freshness is computed locally from last_update epoch.
|
||||||
|
|
||||||
|
dependencies:
|
||||||
|
- control-plane # external — on VPS; deliberately untrusted for liveness
|
||||||
|
|
||||||
|
healthcheck:
|
||||||
|
type: docker
|
||||||
|
interval: 60s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 30s
|
||||||
|
|
||||||
|
restart_policy: unless-stopped
|
||||||
|
|
||||||
|
persistence:
|
||||||
|
paths:
|
||||||
|
- /data # state.json: fail_count, alerted, last_ok
|
||||||
|
|
||||||
|
runtime:
|
||||||
|
env_vars:
|
||||||
|
- CONTROL_PLANE_URL # Tailscale IP + port of operator-ui (required)
|
||||||
|
- STALE_THRESHOLD # seconds before brain is considered stale (default: 600)
|
||||||
|
- INTERVAL # poll interval seconds (default: 60)
|
||||||
|
- FAILS_BEFORE_ALERT # consecutive failures before Telegram alert (default: 3)
|
||||||
|
- TG_TOKEN # Telegram Bot API token (required)
|
||||||
|
- TG_CHAT_ID # Telegram chat/user ID (required)
|
||||||
|
- HEALTHCHECKS_URL # optional healthchecks.io ping URL
|
||||||
157
services/brain-watchdog/src/brain_watchdog/main.py
Normal file
157
services/brain-watchdog/src/brain_watchdog/main.py
Normal file
|
|
@ -0,0 +1,157 @@
|
||||||
|
"""
|
||||||
|
brain-watchdog: external watchdog for the control-plane on VPS.
|
||||||
|
|
||||||
|
Runs on PIHA; queries /summary directly over Tailscale and alerts via
|
||||||
|
Telegram Bot API without going through the control-plane itself.
|
||||||
|
Never trusts the self-reported "status" field — freshness is computed
|
||||||
|
locally from last_update epoch vs. time.time().
|
||||||
|
"""
|
||||||
|
|
||||||
|
import html
import json
import os
import time
import urllib.error
import urllib.request
from pathlib import Path
|
||||||
|
|
||||||
|
# Runtime configuration, read once from the environment at import time.
# Variables accessed through os.environ[...] are mandatory: a missing one
# raises KeyError before the watchdog starts. The rest fall back to defaults.

# Base URL of the control-plane panel, stored without a trailing slash.
CONTROL_PLANE_URL = os.environ["CONTROL_PLANE_URL"].rstrip("/")
# Maximum accepted age of `last_update`, in seconds.
STALE_THRESHOLD = int(os.getenv("STALE_THRESHOLD", "600"))
# Seconds to sleep between polls.
INTERVAL = int(os.getenv("INTERVAL", "60"))
# Consecutive failed checks required before an alert is sent.
FAILS_BEFORE_ALERT = int(os.getenv("FAILS_BEFORE_ALERT", "3"))
# Telegram Bot API credentials and destination chat.
TG_TOKEN = os.environ["TG_TOKEN"]
TG_CHAT_ID = os.environ["TG_CHAT_ID"]
# Optional ping URL; an empty string disables the ping.
HEALTHCHECKS_URL = os.getenv("HEALTHCHECKS_URL", "").strip()

# Location of the persisted debounce state.
STATE_FILE = Path("/data") / "state.json"
|
||||||
|
|
||||||
|
|
||||||
|
def load_state() -> dict:
    """Load the persisted debounce state, falling back to safe defaults.

    The contents of ``STATE_FILE`` are overlaid on the default state, so the
    returned dict always contains ``fail_count``, ``alerted`` and ``last_ok``
    even when the file is missing, unreadable, not valid JSON, not a JSON
    object, or only holds some of the keys.

    Returns:
        A new dict with at least the three default keys. Extra keys found in
        the file are preserved.
    """
    state = {"fail_count": 0, "alerted": False, "last_ok": 0.0}

    try:
        loaded = json.loads(STATE_FILE.read_text())
    except FileNotFoundError:
        # Nothing persisted yet (first run): defaults are the correct answer.
        return state
    except (OSError, ValueError) as exc:
        # ValueError covers malformed JSON as well as undecodable bytes.
        print(
            f"[state] cannot read {STATE_FILE}: {exc}; using defaults",
            flush=True,
        )
        return state

    if not isinstance(loaded, dict):
        # Valid JSON of the wrong shape (list, number, ...) would break every
        # later key lookup, so it is discarded.
        print(
            f"[state] {STATE_FILE} does not hold a JSON object; using defaults",
            flush=True,
        )
        return state

    state.update(loaded)
    return state
|
||||||
|
|
||||||
|
|
||||||
|
def save_state(state: dict) -> None:
    """Persist *state* to ``STATE_FILE`` as JSON without exposing partial writes.

    The JSON is written to a sibling temporary file which is then renamed over
    ``STATE_FILE``. A process killed mid-write therefore leaves the previous
    state file intact instead of a truncated one.

    Args:
        state: JSON-serialisable watchdog state.

    Raises:
        OSError: if the directory or file cannot be written.
        TypeError: if *state* is not JSON-serialisable.
    """
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as the target, so the rename stays on one filesystem.
    tmp_path = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    tmp_path.write_text(json.dumps(state))
    tmp_path.replace(STATE_FILE)
|
||||||
|
|
||||||
|
|
||||||
|
def http_get(url: str, timeout: int = 10) -> tuple[int | None, dict | None]:
|
||||||
|
try:
|
||||||
|
with urllib.request.urlopen(url, timeout=timeout) as resp:
|
||||||
|
return resp.status, json.loads(resp.read())
|
||||||
|
except urllib.error.HTTPError as exc:
|
||||||
|
return exc.code, None
|
||||||
|
except Exception:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
def send_telegram(message: str) -> bool:
    """POST *message* to the Telegram Bot API ``sendMessage`` endpoint.

    The text is sent with ``parse_mode`` set to ``HTML`` to the chat given by
    ``TG_CHAT_ID``, authenticated with ``TG_TOKEN``.

    Returns:
        ``True`` when the API responds with HTTP 200. ``False`` on any other
        status or when the request raises; the exception is printed.
    """
    body = {"chat_id": TG_CHAT_ID, "text": message, "parse_mode": "HTML"}
    request = urllib.request.Request(
        f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )

    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            delivered = response.status == 200
    except Exception as exc:
        print(f"[telegram] send failed: {exc}", flush=True)
        return False
    return delivered
|
||||||
|
|
||||||
|
|
||||||
|
def ping_healthchecks() -> None:
    """Send a best-effort GET to ``HEALTHCHECKS_URL`` if one is configured.

    Does nothing when ``HEALTHCHECKS_URL`` is empty. Failures are printed and
    swallowed so that a ping problem never interrupts the caller.
    """
    if not HEALTHCHECKS_URL:
        return
    try:
        # The context manager closes the response so the connection is
        # released on every call instead of waiting for garbage collection.
        with urllib.request.urlopen(HEALTHCHECKS_URL, timeout=10):
            pass
    except Exception as exc:
        print(f"[healthchecks] ping failed: {exc}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def check() -> tuple[bool, str]:
    """Probe the control-plane once and judge its health locally.

    Fetches ``{CONTROL_PLANE_URL}/summary`` through ``http_get`` and compares
    the ``last_update`` epoch in the response against ``time.time()``. The
    self-reported ``status`` field is never read.

    Returns:
        ``(ok, reason)`` where *reason* is a human-readable explanation.
        Every malformed response is reported as a failure rather than raised,
        so one bad payload cannot terminate the polling loop.
    """
    status, body = http_get(f"{CONTROL_PLANE_URL}/summary")

    if status is None:
        return False, "panel unreachable (connection error)"
    if status != 200:
        return False, f"panel returned HTTP {status}"
    if not body:
        return False, "panel returned empty / invalid JSON"
    if not isinstance(body, dict):
        # A JSON array, string or number has no .get().
        return False, f"summary is not a JSON object ({type(body).__name__})"

    raw = body.get("last_update")
    if raw is None:
        return False, "summary missing last_update field"

    try:
        last_update_ts = float(raw)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an integer too large to be represented as a float.
        return False, f"last_update not parseable: {raw!r}"

    # json.loads accepts NaN / Infinity; those would make int(age) raise
    # below. Comparisons against NaN are always false, so this single range
    # test rejects NaN as well as both infinities.
    if not (float("-inf") < last_update_ts < float("inf")):
        return False, f"last_update not parseable: {raw!r}"

    # NOTE(review): a last_update in the future gives a negative age, which
    # counts as fresh here — confirm that is the intended policy.
    age = time.time() - last_update_ts
    if age > STALE_THRESHOLD:
        return False, (
            f"brain stale: last update {int(age // 60)}m ago "
            f"(threshold {STALE_THRESHOLD // 60}m)"
        )

    return True, f"ok (age {int(age)}s)"
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Run the watchdog loop forever.

    Each iteration calls ``check()``, logs the outcome, updates the persisted
    state and sleeps ``INTERVAL`` seconds.

    * After ``FAILS_BEFORE_ALERT`` consecutive failures a DOWN alert is sent.
      ``alerted`` is set only once Telegram accepted the message, so a failed
      send is retried on the next iteration.
    * On the first successful check while ``alerted`` is set, a RECOVERED
      message is sent. ``alerted`` is cleared only once that message was
      accepted, mirroring the DOWN path.
    """
    print(
        f"[brain-watchdog] starting — "
        f"url={CONTROL_PLANE_URL} "
        f"stale_threshold={STALE_THRESHOLD}s "
        f"interval={INTERVAL}s "
        f"fails_before_alert={FAILS_BEFORE_ALERT}",
        flush=True,
    )

    state = load_state()
    # Tolerate a state file that lacks some keys.
    state.setdefault("fail_count", 0)
    state.setdefault("alerted", False)
    state.setdefault("last_ok", 0.0)

    while True:
        ok, reason = check()
        ts = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        print(f"[{ts}] {'OK  ' if ok else 'FAIL'} — {reason}", flush=True)

        # Messages are sent with parse_mode=HTML and `reason` can embed
        # arbitrary remote data, so markup characters must be escaped.
        safe_reason = html.escape(reason, quote=False)

        if ok:
            state["fail_count"] = 0
            state["last_ok"] = time.time()
            if state["alerted"]:
                sent = send_telegram(
                    "✅ <b>brain-watchdog: control-plane RECOVERED</b>\n"
                    f"{safe_reason}"
                )
                if sent:
                    state["alerted"] = False
                    print("[telegram] sent recovery alert", flush=True)
            save_state(state)
            ping_healthchecks()
        else:
            state["fail_count"] += 1
            save_state(state)

            if state["fail_count"] >= FAILS_BEFORE_ALERT and not state["alerted"]:
                sent = send_telegram(
                    "🚨 <b>brain-watchdog: control-plane DOWN</b>\n"
                    f"Reason: {safe_reason}\n"
                    f"Consecutive failures: {state['fail_count']}\n"
                    f"URL: <code>{html.escape(CONTROL_PLANE_URL, quote=False)}</code>"
                )
                if sent:
                    state["alerted"] = True
                    save_state(state)
                    print("[telegram] sent alert", flush=True)

        time.sleep(INTERVAL)


if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in a new issue