From 039f9f7247c9dd0064fe91e5ce89381b0f871c25 Mon Sep 17 00:00:00 2001 From: Oskar Kapala Date: Mon, 1 Jun 2026 17:54:36 +0200 Subject: [PATCH] =?UTF-8?q?feat(piha):=20brain-watchdog=20=E2=80=94=20exte?= =?UTF-8?q?rnal=20watchdog=20for=20control-plane?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Polls /summary on VPS over Tailscale every 60s; computes freshness locally from last_update epoch (never trusts self-reported status). Alerts via Telegram Bot API directly after 3 consecutive failures; sends recovery message on heal. State (fail_count, alerted) persisted to volume so debounce survives restarts. - services/brain-watchdog/: Python service, no external deps (stdlib only) - hosts/piha/runtime/brain-watchdog/: override with mem_limit 64m - hosts/piha/services.yaml + inventory/topology.yaml: manifest entries Co-Authored-By: Claude Sonnet 4.6 --- .../docker-compose.override.yml | 4 + hosts/piha/services.yaml | 11 ++ inventory/topology.yaml | 15 ++ services/brain-watchdog/Dockerfile | 9 + services/brain-watchdog/docker-compose.yml | 30 ++++ services/brain-watchdog/env.example | 7 + services/brain-watchdog/healthcheck.sh | 10 ++ services/brain-watchdog/service.yaml | 34 ++++ .../src/brain_watchdog/__init__.py | 0 .../brain-watchdog/src/brain_watchdog/main.py | 157 ++++++++++++++++++ 10 files changed, 277 insertions(+) create mode 100644 hosts/piha/runtime/brain-watchdog/docker-compose.override.yml create mode 100644 services/brain-watchdog/Dockerfile create mode 100644 services/brain-watchdog/docker-compose.yml create mode 100644 services/brain-watchdog/env.example create mode 100755 services/brain-watchdog/healthcheck.sh create mode 100644 services/brain-watchdog/service.yaml create mode 100644 services/brain-watchdog/src/brain_watchdog/__init__.py create mode 100644 services/brain-watchdog/src/brain_watchdog/main.py diff --git a/hosts/piha/runtime/brain-watchdog/docker-compose.override.yml b/hosts/piha/runtime/brain-watchdog/docker-compose.override.yml new file mode 100644 index 0000000..0ed6604 --- /dev/null +++ b/hosts/piha/runtime/brain-watchdog/docker-compose.override.yml @@ -0,0 +1,4 @@ +services: + brain-watchdog: + mem_limit: 64m + restart: unless-stopped diff --git a/hosts/piha/services.yaml b/hosts/piha/services.yaml index 03fea12..5dcad97 100644 --- a/hosts/piha/services.yaml +++ b/hosts/piha/services.yaml @@ -29,3 +29,14 @@ services: config_path: /opt/homelab/config/node-agent data_path: /opt/homelab/state logs_path: /opt/homelab/events + + brain-watchdog: + role: control-plane-watchdog + deployment_model: docker-compose + exposure: private + offline_required: false + depends_on: + local: [] + external: [control-plane] + runtime: + config_path: /opt/homelab/config/brain-watchdog diff --git a/inventory/topology.yaml b/inventory/topology.yaml index 19d11c1..614fbe1 100644 --- a/inventory/topology.yaml +++ b/inventory/topology.yaml @@ -17,6 +17,10 @@ nodes: roles: - infra - monitoring + services: + - node-agent + - ha-diag-agent + - brain-watchdog solaria: roles: @@ -28,6 +32,17 @@ nodes: - edge - ingress - control-plane + services: + # Repo-managed GitOps services (hosts/vps/services.yaml is authoritative) + - node-agent + - control-plane # executor, observer, supervisor, operator-ui + - node_exporter + - stability-agent + - npm # Nginx Proxy Manager — public ingress, TLS termination + - outline # Team wiki (outline + postgres + redis) + - joplin # Note sync server (joplin-server + postgres) + - ai-cluster # AI workers: codex-worker, openclaw, planner-worker, + # service-ops-worker, redis, mosquitto chelsty-infra: site: chelsty diff --git a/services/brain-watchdog/Dockerfile b/services/brain-watchdog/Dockerfile new file mode 100644 index 0000000..e6fe6d3 --- /dev/null +++ b/services/brain-watchdog/Dockerfile @@ -0,0 +1,9 @@ +FROM python:3.11-slim + +WORKDIR /app + +COPY src/ src/ + +ENV PYTHONUNBUFFERED=1 + +CMD ["python", "-m", "brain_watchdog.main"] diff --git a/services/brain-watchdog/docker-compose.yml b/services/brain-watchdog/docker-compose.yml new file mode 100644 index 0000000..19a8d86 --- /dev/null +++ b/services/brain-watchdog/docker-compose.yml @@ -0,0 +1,30 @@ +services: + brain-watchdog: + build: . + container_name: brain-watchdog + restart: unless-stopped + + env_file: + - /opt/homelab/config/brain-watchdog/.env + + volumes: + - brain_watchdog_data:/data + + healthcheck: + test: + - "CMD" + - "python" + - "-c" + - | + import os, time, json, sys + p = '/data/state.json' + if not os.path.exists(p): sys.exit(1) + age = time.time() - os.path.getmtime(p) + sys.exit(0 if age < 300 else 1) + interval: 1m + timeout: 10s + retries: 3 + start_period: 30s + +volumes: + brain_watchdog_data: diff --git a/services/brain-watchdog/env.example b/services/brain-watchdog/env.example new file mode 100644 index 0000000..f09c999 --- /dev/null +++ b/services/brain-watchdog/env.example @@ -0,0 +1,7 @@ +CONTROL_PLANE_URL= +STALE_THRESHOLD=600 +INTERVAL=60 +FAILS_BEFORE_ALERT=3 +TG_TOKEN= +TG_CHAT_ID= +HEALTHCHECKS_URL= diff --git a/services/brain-watchdog/healthcheck.sh b/services/brain-watchdog/healthcheck.sh new file mode 100755 index 0000000..e3c1ab6 --- /dev/null +++ b/services/brain-watchdog/healthcheck.sh @@ -0,0 +1,10 @@ +#!/bin/sh +# Healthy if state.json was written within the last 5 minutes. +python -c " +import os, time, sys +p = '/data/state.json' +if not os.path.exists(p): + sys.exit(1) +age = time.time() - os.path.getmtime(p) +sys.exit(0 if age < 300 else 1) +" diff --git a/services/brain-watchdog/service.yaml b/services/brain-watchdog/service.yaml new file mode 100644 index 0000000..0eec949 --- /dev/null +++ b/services/brain-watchdog/service.yaml @@ -0,0 +1,34 @@ +service: + name: brain-watchdog + owner_node: piha + exposure: private + description: > + External watchdog for the control-plane on VPS. Queries /summary over + Tailscale and alerts via Telegram Bot API directly — no dependency on the + control-plane itself. Freshness is computed locally from last_update epoch. + + dependencies: + - control-plane # external — on VPS; deliberately untrusted for liveness + + healthcheck: + type: docker + interval: 60s + timeout: 10s + retries: 3 + start_period: 30s + + restart_policy: unless-stopped + + persistence: + paths: + - /data # state.json: fail_count, alerted, last_ok + + runtime: + env_vars: + - CONTROL_PLANE_URL # Tailscale IP + port of operator-ui (required) + - STALE_THRESHOLD # seconds before brain is considered stale (default: 600) + - INTERVAL # poll interval seconds (default: 60) + - FAILS_BEFORE_ALERT # consecutive failures before Telegram alert (default: 3) + - TG_TOKEN # Telegram Bot API token (required) + - TG_CHAT_ID # Telegram chat/user ID (required) + - HEALTHCHECKS_URL # optional healthchecks.io ping URL diff --git a/services/brain-watchdog/src/brain_watchdog/__init__.py b/services/brain-watchdog/src/brain_watchdog/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/services/brain-watchdog/src/brain_watchdog/main.py b/services/brain-watchdog/src/brain_watchdog/main.py new file mode 100644 index 0000000..96ed3b7 --- /dev/null +++ b/services/brain-watchdog/src/brain_watchdog/main.py @@ -0,0 +1,157 @@ +""" +brain-watchdog: external watchdog for the control-plane on VPS. + +Runs on PIHA; queries /summary directly over Tailscale and alerts via +Telegram Bot API without going through the control-plane itself. +Never trusts the self-reported "status" field — freshness is computed +locally from last_update epoch vs. time.time(). +""" + +import json +import os +import time +import urllib.error +import urllib.request +from pathlib import Path + +CONTROL_PLANE_URL = os.environ["CONTROL_PLANE_URL"].rstrip("/") +STALE_THRESHOLD = int(os.environ.get("STALE_THRESHOLD", "600")) +INTERVAL = int(os.environ.get("INTERVAL", "60")) +FAILS_BEFORE_ALERT = int(os.environ.get("FAILS_BEFORE_ALERT", "3")) +TG_TOKEN = os.environ["TG_TOKEN"] +TG_CHAT_ID = os.environ["TG_CHAT_ID"] +HEALTHCHECKS_URL = os.environ.get("HEALTHCHECKS_URL", "").strip() + +STATE_FILE = Path("/data/state.json") + + +def load_state() -> dict: + if STATE_FILE.exists(): + try: + return json.loads(STATE_FILE.read_text()) + except Exception: + pass + return {"fail_count": 0, "alerted": False, "last_ok": 0.0} + + +def save_state(state: dict) -> None: + STATE_FILE.parent.mkdir(parents=True, exist_ok=True) + STATE_FILE.write_text(json.dumps(state)) + + +def http_get(url: str, timeout: int = 10) -> tuple[int | None, dict | None]: + try: + with urllib.request.urlopen(url, timeout=timeout) as resp: + return resp.status, json.loads(resp.read()) + except urllib.error.HTTPError as exc: + return exc.code, None + except Exception: + return None, None + + +def send_telegram(message: str) -> bool: + url = f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage" + payload = json.dumps( + {"chat_id": TG_CHAT_ID, "text": message, "parse_mode": "HTML"} + ).encode() + req = urllib.request.Request( + url, data=payload, headers={"Content-Type": "application/json"} + ) + try: + with urllib.request.urlopen(req, timeout=10) as resp: + return resp.status == 200 + except Exception as exc: + print(f"[telegram] send failed: {exc}", flush=True) + return False + + +def ping_healthchecks() -> None: + if not HEALTHCHECKS_URL: + return + try: + urllib.request.urlopen(HEALTHCHECKS_URL, timeout=10) + except Exception as exc: + print(f"[healthchecks] ping failed: {exc}", flush=True) + + +def check() -> tuple[bool, str]: + """Return (ok, human-readable reason). Never reads 'status' field.""" + status, body = http_get(f"{CONTROL_PLANE_URL}/summary") + + if status is None: + return False, "panel unreachable (connection error)" + + if status != 200: + return False, f"panel returned HTTP {status}" + + if not body: + return False, "panel returned empty / invalid JSON" + + raw = body.get("last_update") + if raw is None: + return False, "summary missing last_update field" + + try: + last_update_ts = float(raw) + except (TypeError, ValueError): + return False, f"last_update not parseable: {raw!r}" + + age = time.time() - last_update_ts + if age > STALE_THRESHOLD: + return False, ( + f"brain stale: last update {int(age // 60)}m ago " + f"(threshold {STALE_THRESHOLD // 60}m)" + ) + + return True, f"ok (age {int(age)}s)" + + +def main() -> None: + print( + f"[brain-watchdog] starting — " + f"url={CONTROL_PLANE_URL} " + f"stale_threshold={STALE_THRESHOLD}s " + f"interval={INTERVAL}s " + f"fails_before_alert={FAILS_BEFORE_ALERT}", + flush=True, + ) + state = load_state() + + while True: + ok, reason = check() + ts = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + print(f"[{ts}] {'OK ' if ok else 'FAIL'} — {reason}", flush=True) + + if ok: + if state["alerted"]: + send_telegram( + "✅ brain-watchdog: control-plane RECOVERED\n" + f"{reason}" + ) + print("[telegram] sent recovery alert", flush=True) + state["fail_count"] = 0 + state["alerted"] = False + state["last_ok"] = time.time() + save_state(state) + ping_healthchecks() + else: + state["fail_count"] = state.get("fail_count", 0) + 1 + save_state(state) + + if state["fail_count"] >= FAILS_BEFORE_ALERT and not state["alerted"]: + sent = send_telegram( + "🚨 brain-watchdog: control-plane DOWN\n" + f"Reason: {reason}\n" + f"Consecutive failures: {state['fail_count']}\n" + f"URL: {CONTROL_PLANE_URL}" + ) + if sent: + state["alerted"] = True + save_state(state) + print("[telegram] sent alert", flush=True) + + time.sleep(INTERVAL) + + +if __name__ == "__main__": + main()