diff --git a/hosts/piha/runtime/brain-watchdog/docker-compose.override.yml b/hosts/piha/runtime/brain-watchdog/docker-compose.override.yml
new file mode 100644
index 0000000..0ed6604
--- /dev/null
+++ b/hosts/piha/runtime/brain-watchdog/docker-compose.override.yml
@@ -0,0 +1,4 @@
+# Host-specific compose overrides for brain-watchdog on piha.
+services:
+  brain-watchdog:
+    # Hard memory ceiling for the container.
+    mem_limit: 64m
+    # Bring the container back after crashes/reboots unless it was stopped
+    # deliberately.
+    restart: unless-stopped
diff --git a/hosts/piha/services.yaml b/hosts/piha/services.yaml
index 03fea12..5dcad97 100644
--- a/hosts/piha/services.yaml
+++ b/hosts/piha/services.yaml
@@ -29,3 +29,14 @@ services:
config_path: /opt/homelab/config/node-agent
data_path: /opt/homelab/state
logs_path: /opt/homelab/events
+
+ # External watchdog for the control-plane; see services/brain-watchdog.
+ brain-watchdog:
+ role: control-plane-watchdog
+ deployment_model: docker-compose
+ exposure: private
+ offline_required: false
+ depends_on:
+ local: []
+ # The monitored control-plane is declared as an external dependency.
+ external: [control-plane]
+ runtime:
+ # Directory that holds the .env file referenced by the service's compose env_file.
+ config_path: /opt/homelab/config/brain-watchdog
diff --git a/inventory/topology.yaml b/inventory/topology.yaml
index 19d11c1..614fbe1 100644
--- a/inventory/topology.yaml
+++ b/inventory/topology.yaml
@@ -17,6 +17,10 @@ nodes:
roles:
- infra
- monitoring
+ services:
+ - node-agent
+ - ha-diag-agent
+ - brain-watchdog
solaria:
roles:
@@ -28,6 +32,17 @@ nodes:
- edge
- ingress
- control-plane
+ services:
+ # Repo-managed GitOps services (hosts/vps/services.yaml is authoritative)
+ - node-agent
+ - control-plane # executor, observer, supervisor, operator-ui
+ - node_exporter
+ - stability-agent
+ - npm # Nginx Proxy Manager — public ingress, TLS termination
+ - outline # Team wiki (outline + postgres + redis)
+ - joplin # Note sync server (joplin-server + postgres)
+ - ai-cluster # AI workers: codex-worker, openclaw, planner-worker,
+ # service-ops-worker, redis, mosquitto
chelsty-infra:
site: chelsty
diff --git a/services/brain-watchdog/Dockerfile b/services/brain-watchdog/Dockerfile
new file mode 100644
index 0000000..e6fe6d3
--- /dev/null
+++ b/services/brain-watchdog/Dockerfile
@@ -0,0 +1,9 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Copies the package to /app/src/brain_watchdog.
+COPY src/ src/
+
+# PYTHONUNBUFFERED: flush log lines immediately so `docker logs` stays current.
+# PYTHONPATH: the package lives under /app/src, which is not on sys.path when
+# the interpreter starts in /app; without it `python -m brain_watchdog.main`
+# fails with "No module named brain_watchdog".
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONPATH=/app/src
+
+CMD ["python", "-m", "brain_watchdog.main"]
diff --git a/services/brain-watchdog/docker-compose.yml b/services/brain-watchdog/docker-compose.yml
new file mode 100644
index 0000000..19a8d86
--- /dev/null
+++ b/services/brain-watchdog/docker-compose.yml
@@ -0,0 +1,30 @@
+services:
+  brain-watchdog:
+    build: .
+    container_name: brain-watchdog
+    restart: unless-stopped
+
+    # Runtime settings live outside the repo; env.example lists the keys.
+    env_file:
+      - /opt/homelab/config/brain-watchdog/.env
+
+    # Keeps /data/state.json across container restarts.
+    volumes:
+      - brain_watchdog_data:/data
+
+    # Healthy while /data/state.json was modified less than 300 seconds ago.
+    # The watchdog rewrites that file once per poll cycle, so INTERVAL has to
+    # stay well below 300 or the container is reported unhealthy.
+    healthcheck:
+      test:
+        - "CMD"
+        - "python"
+        - "-c"
+        - |
+          import os, sys, time
+          try:
+              age = time.time() - os.path.getmtime('/data/state.json')
+          except OSError:
+              sys.exit(1)
+          sys.exit(0 if age < 300 else 1)
+      interval: 1m
+      timeout: 10s
+      retries: 3
+      start_period: 30s
+
+volumes:
+  brain_watchdog_data:
diff --git a/services/brain-watchdog/env.example b/services/brain-watchdog/env.example
new file mode 100644
index 0000000..f09c999
--- /dev/null
+++ b/services/brain-watchdog/env.example
@@ -0,0 +1,7 @@
+# Base URL of the control-plane panel; "/summary" is appended to it (required).
+CONTROL_PLANE_URL=
+# Age of last_update, in seconds, above which the brain counts as stale.
+STALE_THRESHOLD=600
+# Seconds to sleep between polls.
+INTERVAL=60
+# Consecutive failed polls before a Telegram alert is sent.
+FAILS_BEFORE_ALERT=3
+# Telegram Bot API token (required).
+TG_TOKEN=
+# Telegram chat ID that receives the alerts (required).
+TG_CHAT_ID=
+# Optional URL requested after every successful poll; leave empty to disable.
+HEALTHCHECKS_URL=
diff --git a/services/brain-watchdog/healthcheck.sh b/services/brain-watchdog/healthcheck.sh
new file mode 100755
index 0000000..e3c1ab6
--- /dev/null
+++ b/services/brain-watchdog/healthcheck.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+# Container health probe for brain-watchdog.
+# Healthy if state.json was written within the last 5 minutes.
+# Exit status: 0 = healthy; 1 = /data/state.json is missing or its
+# modification time is 300 seconds old or more.
+# NOTE(review): the Dockerfile added in this change copies only src/, so this
+# script is not in the image unless it is added separately — confirm intent.
+python -c "
+import os, time, sys
+p = '/data/state.json'
+if not os.path.exists(p):
+ sys.exit(1)
+age = time.time() - os.path.getmtime(p)
+sys.exit(0 if age < 300 else 1)
+"
diff --git a/services/brain-watchdog/service.yaml b/services/brain-watchdog/service.yaml
new file mode 100644
index 0000000..0eec949
--- /dev/null
+++ b/services/brain-watchdog/service.yaml
@@ -0,0 +1,34 @@
+# Service manifest for brain-watchdog.
+service:
+ name: brain-watchdog
+ owner_node: piha
+ exposure: private
+ description: >
+ External watchdog for the control-plane on VPS. Queries /summary over
+ Tailscale and alerts via Telegram Bot API directly — no dependency on the
+ control-plane itself. Freshness is computed locally from last_update epoch.
+
+ dependencies:
+ - control-plane # external — on VPS; deliberately untrusted for liveness
+
+ # Timing values match the healthcheck section of the service's docker-compose.yml.
+ healthcheck:
+ type: docker
+ interval: 60s
+ timeout: 10s
+ retries: 3
+ start_period: 30s
+
+ restart_policy: unless-stopped
+
+ persistence:
+ paths:
+ - /data # state.json: fail_count, alerted, last_ok
+
+ # Environment variables read by brain_watchdog.main at start-up.
+ runtime:
+ env_vars:
+ - CONTROL_PLANE_URL # base URL of operator-ui over Tailscale (scheme + IP + port); "/summary" is appended (required)
+ - STALE_THRESHOLD # seconds before brain is considered stale (default: 600)
+ - INTERVAL # poll interval seconds (default: 60)
+ - FAILS_BEFORE_ALERT # consecutive failures before Telegram alert (default: 3)
+ - TG_TOKEN # Telegram Bot API token (required)
+ - TG_CHAT_ID # Telegram chat/user ID (required)
+ - HEALTHCHECKS_URL # optional URL pinged after each successful poll
diff --git a/services/brain-watchdog/src/brain_watchdog/__init__.py b/services/brain-watchdog/src/brain_watchdog/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/services/brain-watchdog/src/brain_watchdog/main.py b/services/brain-watchdog/src/brain_watchdog/main.py
new file mode 100644
index 0000000..96ed3b7
--- /dev/null
+++ b/services/brain-watchdog/src/brain_watchdog/main.py
@@ -0,0 +1,157 @@
+"""
+brain-watchdog: external watchdog for the control-plane on VPS.
+
+Runs on PIHA; queries /summary directly over Tailscale and alerts via
+Telegram Bot API without going through the control-plane itself.
+Never trusts the self-reported "status" field — freshness is computed
+locally from last_update epoch vs. time.time().
+"""
+
+import json
+import math
+import os
+import time
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+# Configuration is read once from the environment at import time.
+# Variables accessed through os.environ[...] are mandatory (KeyError when
+# absent); the others fall back to the default given here.
+CONTROL_PLANE_URL = os.environ["CONTROL_PLANE_URL"].rstrip("/")  # no trailing "/"
+STALE_THRESHOLD = int(os.getenv("STALE_THRESHOLD", "600"))  # seconds
+INTERVAL = int(os.getenv("INTERVAL", "60"))  # seconds between polls
+FAILS_BEFORE_ALERT = int(os.getenv("FAILS_BEFORE_ALERT", "3"))
+TG_TOKEN = os.environ["TG_TOKEN"]
+TG_CHAT_ID = os.environ["TG_CHAT_ID"]
+HEALTHCHECKS_URL = os.getenv("HEALTHCHECKS_URL", "").strip()  # "" disables pings
+
+# Persisted watchdog state (fail counter, alert flag, last success time).
+STATE_FILE = Path("/data") / "state.json"
+
+
+def load_state() -> dict:
+    """Load the persisted watchdog state, falling back to defaults.
+
+    Returns:
+        A dict that always contains ``fail_count``, ``alerted`` and
+        ``last_ok``. Values stored in STATE_FILE override the defaults. A
+        missing, unreadable, malformed or non-object state file yields the
+        defaults unchanged.
+    """
+    state = {"fail_count": 0, "alerted": False, "last_ok": 0.0}
+    try:
+        loaded = json.loads(STATE_FILE.read_text())
+    except (OSError, ValueError):
+        # Best effort: a missing or corrupt state file must not block start-up.
+        return state
+    if isinstance(loaded, dict):
+        # Merge over the defaults so a partial file cannot cause KeyError
+        # when the caller indexes the state.
+        state.update(loaded)
+    return state
+
+
+def save_state(state: dict) -> None:
+    """Persist *state* to STATE_FILE as JSON.
+
+    The data is written to a sibling temporary file and then moved over
+    STATE_FILE, so an interrupted write cannot leave a truncated state file.
+
+    Args:
+        state: JSON-serialisable watchdog state.
+
+    Raises:
+        OSError: if the directory or file cannot be written.
+        TypeError: if *state* is not JSON-serialisable.
+    """
+    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
+    tmp_path.write_text(json.dumps(state))
+    # Same directory, hence same filesystem: the rename replaces atomically.
+    os.replace(tmp_path, STATE_FILE)
+
+
+def http_get(url: str, timeout: int = 10) -> tuple[int | None, dict | None]:
+    """Fetch *url* and decode the response body as a JSON object.
+
+    Args:
+        url: Address to request.
+        timeout: Timeout in seconds passed to ``urlopen``.
+
+    Returns:
+        ``(status, body)`` where
+        * ``status`` is the HTTP status code, or ``None`` when no response was
+          obtained (connection error, timeout, invalid URL, ...);
+        * ``body`` is the decoded JSON object, or ``None`` when the request
+          failed or the payload is not a JSON object.
+    """
+    try:
+        with urllib.request.urlopen(url, timeout=timeout) as resp:
+            status = resp.status
+            raw = resp.read()
+    except urllib.error.HTTPError as exc:
+        return exc.code, None
+    except Exception:
+        return None, None
+
+    # Decode separately so a reachable server that sends garbage keeps its
+    # real status code instead of looking like a connection failure.
+    try:
+        body = json.loads(raw)
+    except ValueError:
+        return status, None
+    return status, body if isinstance(body, dict) else None
+
+
+def send_telegram(message: str) -> bool:
+    """Send *message* to TG_CHAT_ID through the Telegram Bot API.
+
+    The text is sent without ``parse_mode``: alerts embed values derived from
+    the remote response, and with HTML parsing enabled an unescaped ``<`` or
+    ``&`` in them can make Telegram reject the whole message.
+
+    Args:
+        message: Plain-text message body.
+
+    Returns:
+        True when the API answered with HTTP 200, False on any error (the
+        error is logged, never raised).
+    """
+    url = f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage"
+    payload = json.dumps({"chat_id": TG_CHAT_ID, "text": message}).encode()
+    req = urllib.request.Request(
+        url, data=payload, headers={"Content-Type": "application/json"}
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            return resp.status == 200
+    except Exception as exc:
+        print(f"[telegram] send failed: {exc}", flush=True)
+        return False
+
+
+def ping_healthchecks() -> None:
+    """Request HEALTHCHECKS_URL once; do nothing when it is empty.
+
+    Failures are logged and swallowed so that an unreachable ping endpoint
+    never interrupts the caller.
+    """
+    if not HEALTHCHECKS_URL:
+        return
+    try:
+        # The context manager closes the response instead of leaving the
+        # socket open until garbage collection.
+        with urllib.request.urlopen(HEALTHCHECKS_URL, timeout=10):
+            pass
+    except Exception as exc:
+        print(f"[healthchecks] ping failed: {exc}", flush=True)
+
+
+def check() -> tuple[bool, str]:
+    """Poll ``{CONTROL_PLANE_URL}/summary`` and judge freshness locally.
+
+    The 'status' field of the response is never read; only ``last_update``
+    (epoch seconds) is compared against the local clock.
+
+    Returns:
+        ``(ok, reason)`` where *reason* is a human-readable explanation.
+        *ok* is False when the panel is unreachable, answers with a non-200
+        status, returns no usable JSON object, lacks ``last_update``, reports
+        a value that is not a finite number, or is older than STALE_THRESHOLD
+        seconds.
+    """
+    status, body = http_get(f"{CONTROL_PLANE_URL}/summary")
+
+    if status is None:
+        return False, "panel unreachable (connection error)"
+
+    if status != 200:
+        return False, f"panel returned HTTP {status}"
+
+    # Anything other than a non-empty JSON object cannot be inspected with
+    # .get() below and must not crash the watchdog loop.
+    if not body or not isinstance(body, dict):
+        return False, "panel returned empty / invalid JSON"
+
+    raw = body.get("last_update")
+    if raw is None:
+        return False, "summary missing last_update field"
+
+    try:
+        last_update_ts = float(raw)
+    except (TypeError, ValueError):
+        return False, f"last_update not parseable: {raw!r}"
+    # float() accepts "nan" and "inf"; both would slip past the comparison
+    # below and be reported as fresh.
+    if not math.isfinite(last_update_ts):
+        return False, f"last_update not parseable: {raw!r}"
+
+    age = time.time() - last_update_ts
+    if age > STALE_THRESHOLD:
+        return False, (
+            f"brain stale: last update {int(age // 60)}m ago "
+            f"(threshold {STALE_THRESHOLD // 60}m)"
+        )
+
+    return True, f"ok (age {int(age)}s)"
+
+
+def main() -> None:
+    """Run the watchdog loop forever.
+
+    Each cycle calls check(), logs the outcome, updates and persists the
+    state, then sleeps INTERVAL seconds:
+
+    * success: a recovery message is sent if an alert was outstanding, the
+      failure counter and alert flag are reset, ``last_ok`` is stamped and
+      the healthchecks URL is pinged;
+    * failure: the counter is incremented, and once it reaches
+      FAILS_BEFORE_ALERT a single alert is sent. The alert flag is only set
+      when the send succeeded, so a failed send is retried next cycle.
+    """
+    print(
+        f"[brain-watchdog] starting — "
+        f"url={CONTROL_PLANE_URL} "
+        f"stale_threshold={STALE_THRESHOLD}s "
+        f"interval={INTERVAL}s "
+        f"fails_before_alert={FAILS_BEFORE_ALERT}",
+        flush=True,
+    )
+    state = load_state()
+
+    while True:
+        ok, reason = check()
+        ts = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+        print(f"[{ts}] {'OK ' if ok else 'FAIL'} — {reason}", flush=True)
+
+        if ok:
+            # .get() keeps the loop alive if the loaded state lacks a key.
+            if state.get("alerted"):
+                recovered = send_telegram(
+                    "✅ brain-watchdog: control-plane RECOVERED\n"
+                    f"{reason}"
+                )
+                # Only claim success in the log when the send succeeded;
+                # send_telegram already logs its own failures.
+                if recovered:
+                    print("[telegram] sent recovery alert", flush=True)
+            state["fail_count"] = 0
+            state["alerted"] = False
+            state["last_ok"] = time.time()
+            save_state(state)
+            ping_healthchecks()
+        else:
+            state["fail_count"] = state.get("fail_count", 0) + 1
+            save_state(state)
+
+            if state["fail_count"] >= FAILS_BEFORE_ALERT and not state.get(
+                "alerted"
+            ):
+                sent = send_telegram(
+                    "🚨 brain-watchdog: control-plane DOWN\n"
+                    f"Reason: {reason}\n"
+                    f"Consecutive failures: {state['fail_count']}\n"
+                    f"URL: {CONTROL_PLANE_URL}"
+                )
+                if sent:
+                    state["alerted"] = True
+                    save_state(state)
+                    print("[telegram] sent alert", flush=True)
+
+        time.sleep(INTERVAL)
+
+
+if __name__ == "__main__":
+    main()