feat(piha): brain-watchdog — external watchdog for control-plane

Polls /summary on VPS over Tailscale every 60s; computes freshness
locally from last_update epoch (never trusts self-reported status).
Alerts via Telegram Bot API directly after 3 consecutive failures;
sends recovery message on heal. State (fail_count, alerted) persisted
to volume so debounce survives restarts.

- services/brain-watchdog/: Python service, no external deps (stdlib only)
- hosts/piha/runtime/brain-watchdog/: override with mem_limit 64m
- hosts/piha/services.yaml + inventory/topology.yaml: manifest entries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Oskar Kapala 2026-06-01 17:54:36 +02:00
parent 495741e7ac
commit 039f9f7247
10 changed files with 277 additions and 0 deletions

View file

@ -0,0 +1,4 @@
# Host-level Compose override for brain-watchdog: caps container memory
# and keeps the container running unless it is explicitly stopped.
services:
  brain-watchdog:
    restart: unless-stopped
    mem_limit: 64m

View file

@ -29,3 +29,14 @@ services:
config_path: /opt/homelab/config/node-agent
data_path: /opt/homelab/state
logs_path: /opt/homelab/events
# Manifest entry for the brain-watchdog service.
# NOTE(review): indentation is not visible in this view; the keys below are
# presumably nested under this entry — confirm against the real file.
brain-watchdog:
role: control-plane-watchdog
deployment_model: docker-compose
exposure: private
offline_required: false
# No local service dependencies; control-plane is declared as external.
depends_on:
local: []
external: [control-plane]
runtime:
# Same directory that the service's compose env_file (.env) is read from.
config_path: /opt/homelab/config/brain-watchdog

View file

@ -17,6 +17,10 @@ nodes:
roles:
- infra
- monitoring
# Services assigned to this node.
# NOTE(review): the node key is outside the visible hunk — confirm which
# node this list belongs to.
services:
- node-agent
- ha-diag-agent
- brain-watchdog
solaria:
roles:
@ -28,6 +32,17 @@ nodes:
- edge
- ingress
- control-plane
# Services assigned to this node (the node key is outside the visible hunk).
services:
# Repo-managed GitOps services (hosts/vps/services.yaml is authoritative)
- node-agent
- control-plane # executor, observer, supervisor, operator-ui
- node_exporter
- stability-agent
- npm # Nginx Proxy Manager — public ingress, TLS termination
- outline # Team wiki (outline + postgres + redis)
- joplin # Note sync server (joplin-server + postgres)
- ai-cluster # AI workers: codex-worker, openclaw, planner-worker,
# service-ops-worker, redis, mosquitto
chelsty-infra:
site: chelsty

View file

@ -0,0 +1,9 @@
# Image for the brain-watchdog service.
FROM python:3.11-slim
WORKDIR /app
# Sources land in /app/src, which is not on Python's default import path
# when the process starts from /app.
COPY src/ src/
# PYTHONUNBUFFERED: emit log lines immediately.
# PYTHONPATH: make packages under /app/src importable for `python -m`.
# NOTE(review): assumes the package lives at src/brain_watchdog/ — confirm.
ENV PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app/src
CMD ["python", "-m", "brain_watchdog.main"]

View file

@ -0,0 +1,30 @@
# Compose definition for the brain-watchdog container.
services:
  brain-watchdog:
    build: .
    container_name: brain-watchdog
    restart: unless-stopped
    env_file:
      - /opt/homelab/config/brain-watchdog/.env
    volumes:
      # Holds state.json so the debounce state survives restarts.
      - brain_watchdog_data:/data
    healthcheck:
      # Healthy while /data/state.json has been modified recently. The limit
      # is 300 s, or three poll intervals when INTERVAL is configured larger,
      # so a long INTERVAL does not make the container report unhealthy.
      test:
        - "CMD"
        - "python"
        - "-c"
        - |
          import os, sys, time
          p = '/data/state.json'
          if not os.path.exists(p): sys.exit(1)
          limit = max(300, 3 * int(os.environ.get('INTERVAL', '60')))
          age = time.time() - os.path.getmtime(p)
          sys.exit(0 if age < limit else 1)
      interval: 1m
      timeout: 10s
      retries: 3
      start_period: 30s
volumes:
  brain_watchdog_data:

View file

@ -0,0 +1,7 @@
# Environment template for brain-watchdog.
# Required: base URL of the control-plane; the service polls <URL>/summary.
CONTROL_PLANE_URL=
# Seconds after which last_update is considered stale.
STALE_THRESHOLD=600
# Seconds to sleep between polls.
INTERVAL=60
# Consecutive failed checks before a Telegram alert is sent.
FAILS_BEFORE_ALERT=3
# Required: Telegram Bot API token and target chat ID.
TG_TOKEN=
TG_CHAT_ID=
# Optional: URL pinged after each successful check; leave empty to disable.
HEALTHCHECKS_URL=

View file

@ -0,0 +1,10 @@
#!/bin/sh
# Exit 0 when /data/state.json exists and was modified less than 300 seconds
# ago; exit 1 otherwise.
python - <<'PY'
import os
import sys
import time

STATE_PATH = '/data/state.json'
MAX_AGE_SECONDS = 300

if not os.path.exists(STATE_PATH):
    sys.exit(1)

seconds_since_write = time.time() - os.path.getmtime(STATE_PATH)
if seconds_since_write < MAX_AGE_SECONDS:
    sys.exit(0)
sys.exit(1)
PY

View file

@ -0,0 +1,34 @@
# Service manifest for brain-watchdog.
# NOTE(review): indentation is not visible in this view, so the nesting of
# the sections below cannot be determined here — confirm against the file.
service:
name: brain-watchdog
owner_node: piha
exposure: private
description: >
External watchdog for the control-plane on VPS. Queries /summary over
Tailscale and alerts via Telegram Bot API directly — no dependency on the
control-plane itself. Freshness is computed locally from last_update epoch.
dependencies:
- control-plane # external — on VPS; deliberately untrusted for liveness
# Timing values match the healthcheck declared in the service's compose file.
healthcheck:
type: docker
interval: 60s
timeout: 10s
retries: 3
start_period: 30s
restart_policy: unless-stopped
persistence:
paths:
- /data # state.json: fail_count, alerted, last_ok
# Environment variables read by the service at startup.
runtime:
env_vars:
- CONTROL_PLANE_URL # Tailscale IP + port of operator-ui (required)
- STALE_THRESHOLD # seconds before brain is considered stale (default: 600)
- INTERVAL # poll interval seconds (default: 60)
- FAILS_BEFORE_ALERT # consecutive failures before Telegram alert (default: 3)
- TG_TOKEN # Telegram Bot API token (required)
- TG_CHAT_ID # Telegram chat/user ID (required)
- HEALTHCHECKS_URL # optional healthchecks.io ping URL

View file

@ -0,0 +1,157 @@
"""
brain-watchdog: external watchdog for the control-plane on VPS.
Runs on PIHA; queries /summary directly over Tailscale and alerts via
Telegram Bot API without going through the control-plane itself.
Never trusts the self-reported "status" field — freshness is computed
locally from last_update epoch vs. time.time().
"""
import json
import os
import time
import urllib.error
import urllib.request
from pathlib import Path
# Configuration, read once from the environment at import time.
# Settings read with os.environ[...] are required: a missing variable raises
# KeyError during startup. The others fall back to the defaults shown.

# Base URL of the control-plane, stored without a trailing slash.
CONTROL_PLANE_URL = os.environ["CONTROL_PLANE_URL"].rstrip("/")
# Maximum accepted age of last_update, in seconds.
STALE_THRESHOLD = int(os.getenv("STALE_THRESHOLD", "600"))
# Pause between polls, in seconds.
INTERVAL = int(os.getenv("INTERVAL", "60"))
# Consecutive failures required before an alert is sent.
FAILS_BEFORE_ALERT = int(os.getenv("FAILS_BEFORE_ALERT", "3"))
# Telegram credentials and destination chat.
TG_TOKEN = os.environ["TG_TOKEN"]
TG_CHAT_ID = os.environ["TG_CHAT_ID"]
# Optional ping URL; an empty string disables the ping.
HEALTHCHECKS_URL = os.getenv("HEALTHCHECKS_URL", "").strip()
# Location of the persisted watchdog state.
STATE_FILE = Path("/data/state.json")
def load_state() -> dict:
    """Load the persisted watchdog state from STATE_FILE.

    Returns:
        A dict that always contains ``fail_count``, ``alerted`` and
        ``last_ok``. Values found in the file override the defaults; a
        missing, unreadable, malformed or non-object file yields the
        defaults alone.
    """
    state = {"fail_count": 0, "alerted": False, "last_ok": 0.0}
    try:
        loaded = json.loads(STATE_FILE.read_text())
    except FileNotFoundError:
        # First start: nothing persisted yet.
        return state
    except (OSError, ValueError) as exc:
        # ValueError covers both invalid JSON and undecodable bytes.
        print(f"[state] ignoring unreadable {STATE_FILE}: {exc}", flush=True)
        return state
    if not isinstance(loaded, dict):
        print(f"[state] ignoring non-object state in {STATE_FILE}", flush=True)
        return state
    # Merge over the defaults so callers can index the expected keys safely.
    state.update(loaded)
    return state
def save_state(state: dict) -> None:
    """Persist *state* to STATE_FILE as JSON.

    The data is written to a sibling temporary file and then moved over
    STATE_FILE with os.replace, so an interrupted write cannot leave a
    truncated state file behind.

    Args:
        state: JSON-serialisable watchdog state.
    """
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    tmp_path.write_text(json.dumps(state))
    os.replace(tmp_path, STATE_FILE)
def http_get(url: str, timeout: int = 10) -> tuple[int | None, dict | None]:
try:
with urllib.request.urlopen(url, timeout=timeout) as resp:
return resp.status, json.loads(resp.read())
except urllib.error.HTTPError as exc:
return exc.code, None
except Exception:
return None, None
def send_telegram(message: str) -> bool:
    """Send *message* through the Telegram Bot API ``sendMessage`` method.

    The text is posted as JSON with ``parse_mode`` set to ``HTML`` to the
    chat given by TG_CHAT_ID, authenticated with TG_TOKEN.

    Returns:
        True when the API answers with HTTP 200; False on any other status
        or when the request raises (the error is printed).
    """
    endpoint = f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage"
    fields = {"chat_id": TG_CHAT_ID, "text": message, "parse_mode": "HTML"}
    request = urllib.request.Request(
        endpoint,
        data=json.dumps(fields).encode(),
        headers={"Content-Type": "application/json"},
    )
    delivered = False
    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            delivered = response.status == 200
    except Exception as exc:
        print(f"[telegram] send failed: {exc}", flush=True)
    return delivered
def ping_healthchecks() -> None:
    """Ping HEALTHCHECKS_URL, if one is configured.

    Best effort: a failed ping is printed and otherwise ignored. The
    response is closed as soon as the request completes so the connection
    is not left open.
    """
    if not HEALTHCHECKS_URL:
        return
    try:
        with urllib.request.urlopen(HEALTHCHECKS_URL, timeout=10):
            pass
    except Exception as exc:
        print(f"[healthchecks] ping failed: {exc}", flush=True)
def check() -> tuple[bool, str]:
    """Probe the control-plane once and judge its freshness locally.

    Fetches ``{CONTROL_PLANE_URL}/summary`` and compares the reported
    ``last_update`` epoch with the local clock. The response's own
    ``status`` field is never read.

    Returns:
        ``(ok, reason)`` where ``reason`` is a human-readable explanation.
        ``ok`` is False when the endpoint is unreachable, answers with a
        non-200 status, returns an empty or non-object body, lacks a usable
        finite ``last_update``, or when that timestamp is older than
        STALE_THRESHOLD seconds.
    """
    import math  # local import: only needed for the finiteness check below

    status, body = http_get(f"{CONTROL_PLANE_URL}/summary")
    if status is None:
        return False, "panel unreachable (connection error)"
    if status != 200:
        return False, f"panel returned HTTP {status}"
    # A JSON list or scalar has no .get(); treat it like an invalid payload.
    if not body or not isinstance(body, dict):
        return False, "panel returned empty / invalid JSON"
    raw = body.get("last_update")
    if raw is None:
        return False, "summary missing last_update field"
    try:
        last_update_ts = float(raw)
    except (TypeError, ValueError):
        return False, f"last_update not parseable: {raw!r}"
    # NaN/inf would slip past the staleness comparison and then break int(age).
    if not math.isfinite(last_update_ts):
        return False, f"last_update not parseable: {raw!r}"
    age = time.time() - last_update_ts
    if age > STALE_THRESHOLD:
        return False, (
            f"brain stale: last update {int(age // 60)}m ago "
            f"(threshold {STALE_THRESHOLD // 60}m)"
        )
    return True, f"ok (age {int(age)}s)"
def main() -> None:
    """Run the watchdog loop forever.

    Each iteration calls check(), logs the outcome, updates and persists
    the state, and sleeps INTERVAL seconds. A DOWN alert is sent once
    fail_count reaches FAILS_BEFORE_ALERT. A RECOVERED message is sent on a
    successful check while an alert is outstanding. Either notification is
    retried on a later iteration if Telegram delivery fails, because the
    ``alerted`` flag only changes after a confirmed send.
    """
    import html  # local import: only needed to escape Telegram HTML text

    print(
        f"[brain-watchdog] starting — "
        f"url={CONTROL_PLANE_URL} "
        f"stale_threshold={STALE_THRESHOLD}s "
        f"interval={INTERVAL}s "
        f"fails_before_alert={FAILS_BEFORE_ALERT}",
        flush=True,
    )
    state = load_state()
    while True:
        ok, reason = check()
        ts = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        # Keep a separator between the status word and the reason.
        print(f"[{ts}] {'OK' if ok else 'FAIL'} {reason}", flush=True)
        # reason can embed a repr of remote data; with parse_mode=HTML the
        # characters <, > and & must be escaped or the send is rejected.
        safe_reason = html.escape(reason, quote=False)
        if ok:
            if state.get("alerted"):
                recovered = send_telegram(
                    "✅ <b>brain-watchdog: control-plane RECOVERED</b>\n"
                    f"{safe_reason}"
                )
                if recovered:
                    state["alerted"] = False
                    print("[telegram] sent recovery alert", flush=True)
                # On failure keep alerted=True so the recovery is retried.
            else:
                state["alerted"] = False
            state["fail_count"] = 0
            state["last_ok"] = time.time()
            save_state(state)
            ping_healthchecks()
        else:
            state["fail_count"] = state.get("fail_count", 0) + 1
            save_state(state)
            if state["fail_count"] >= FAILS_BEFORE_ALERT and not state.get("alerted"):
                sent = send_telegram(
                    "🚨 <b>brain-watchdog: control-plane DOWN</b>\n"
                    f"Reason: {safe_reason}\n"
                    f"Consecutive failures: {state['fail_count']}\n"
                    f"URL: <code>{html.escape(CONTROL_PLANE_URL, quote=False)}</code>"
                )
                if sent:
                    state["alerted"] = True
                    save_state(state)
                    print("[telegram] sent alert", flush=True)
        time.sleep(INTERVAL)


if __name__ == "__main__":
    main()