diff --git a/services/control-plane/src/supervisor.py b/services/control-plane/src/supervisor.py index 5d901ff..a0f8c97 100644 --- a/services/control-plane/src/supervisor.py +++ b/services/control-plane/src/supervisor.py @@ -75,6 +75,11 @@ HA_ALERT_COOLDOWN = 3600 # within this window — HA is in a planned restart/update and alerts would be noise. HA_TRANSITION_WINDOW = 300 # 5 minutes +# When True, events that would generate container_restart are downgraded to alert_only +# with a "[SHADOW MODE]" note. Safe default for initial deployment; set +# HA_DIAG_SHADOW_MODE=false on the control-plane node when ready for live actions. +HA_DIAG_SHADOW_MODE = os.getenv("HA_DIAG_SHADOW_MODE", "true").lower() == "true" + # Logging setup logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger("supervisor") @@ -88,6 +93,11 @@ class Supervisor: # on each reconcile cycle. Grows to at most ~hundreds of entries/day. self._ha_processed_event_ids: set = set() self._ensure_dirs() + logger.info( + "shadow_mode=%s — HA container_restart actions %s", + HA_DIAG_SHADOW_MODE, + "downgraded to alert_only" if HA_DIAG_SHADOW_MODE else "enabled", + ) def _ensure_dirs(self): ACTIONS_DIR.mkdir(parents=True, exist_ok=True) @@ -529,7 +539,13 @@ class Supervisor: f"Suppressing {event_type} on {node}: homeassistant in transition" ) return - self._generate_ha_container_restart(node, event) + if HA_DIAG_SHADOW_MODE: + logger.info( + "shadow_mode: suppressed container_restart for %s", event_type + ) + self._generate_ha_shadow_alert(node, event) + else: + self._generate_ha_container_restart(node, event) elif event_type == "ha_websocket_recovered": self._cancel_ha_container_restart(node) @@ -617,6 +633,48 @@ class Supervisor: } self._write_pending_action(action) + def _generate_ha_shadow_alert(self, node: str, event: dict): + """Shadow-mode downgrade: emit alert_only instead of container_restart. 
+ + Uses the same action_id and cooldown as the real restart so that + cooldown semantics are identical regardless of shadow mode state. + """ + service = "homeassistant" + action_id = f"container-restart-{node}-{service}" + + for state in ("pending", "approved", "running"): + if (ACTIONS_DIR / state / f"{action_id}.json").exists(): + logger.debug(f"Skipping {action_id}: already in state '{state}'") + return + + if self._ha_action_recently_completed(action_id, HA_WEBSOCKET_RESTART_COOLDOWN): + logger.debug( + f"Skipping {action_id}: within {HA_WEBSOCKET_RESTART_COOLDOWN}s cooldown" + ) + return + + payload = dict(event.get("payload", {})) + payload["reason"] = "ha_websocket_dead" + payload["svc_key"] = f"{node}/{service}" + payload["shadow_mode"] = True + + action = { + "action_id": action_id, + "timestamp": time.time(), + "type": "alert_only", + "node": node, + "service": service, + "risk_level": "info", + "confidence": 0.9, + "description": ( + f"[SHADOW MODE] would have triggered container_restart " + f"for {service} on {node}: HA WebSocket unresponsive" + ), + "status": "pending", + "payload": payload, + } + self._write_pending_action(action) + def _generate_ha_alert_only(self, node: str, event: dict): event_type = event.get("type", "") suffix = _HA_ALERT_ID_SUFFIX.get(event_type, event_type.replace("_", "-")) diff --git a/services/control-plane/tests/test_supervisor_ha.py b/services/control-plane/tests/test_supervisor_ha.py index b707361..78d7b12 100644 --- a/services/control-plane/tests/test_supervisor_ha.py +++ b/services/control-plane/tests/test_supervisor_ha.py @@ -74,6 +74,7 @@ def _read_action(tmp_path: Path, state: str, action_id: str) -> dict: # --------------------------------------------------------------------------- def test_ha_websocket_dead_generates_container_restart(tmp_path, monkeypatch): + monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", False) sup = _setup_supervisor(tmp_path, monkeypatch) events_dir = tmp_path / "events" 
_write_event(events_dir, _make_event("ha_websocket_dead")) @@ -326,7 +327,57 @@ def test_alert_only_dedup_second_event_skipped(tmp_path, monkeypatch): # --------------------------------------------------------------------------- -# 7. Non-HA events are ignored +# 7. Shadow mode +# --------------------------------------------------------------------------- + +def test_shadow_mode_websocket_dead_generates_alert_not_restart(tmp_path, monkeypatch): + """shadow_mode=True: ha_websocket_dead → alert_only with [SHADOW MODE], not container_restart.""" + monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", True) + sup = _setup_supervisor(tmp_path, monkeypatch) + _write_event(tmp_path / "events", _make_event("ha_websocket_dead")) + + sup._process_ha_events() + + action_id = "container-restart-chelsty-ha-homeassistant" + assert _pending(tmp_path, action_id).exists(), "Shadow alert should be written" + action = _read_action(tmp_path, "pending", action_id) + assert action["type"] == "alert_only" + assert "[SHADOW MODE]" in action["description"] + assert action["payload"].get("shadow_mode") is True + + +def test_no_shadow_mode_websocket_dead_generates_container_restart(tmp_path, monkeypatch): + """shadow_mode=False: ha_websocket_dead → container_restart (normal path).""" + monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", False) + sup = _setup_supervisor(tmp_path, monkeypatch) + _write_event(tmp_path / "events", _make_event("ha_websocket_dead")) + + sup._process_ha_events() + + action_id = "container-restart-chelsty-ha-homeassistant" + assert _pending(tmp_path, action_id).exists() + action = _read_action(tmp_path, "pending", action_id) + assert action["type"] == "container_restart" + assert "[SHADOW MODE]" not in action["description"] + + +def test_shadow_mode_alert_only_events_unaffected(tmp_path, monkeypatch): + """shadow_mode=True: alert-only events (ha_entity_unavailable_long) are still routed normally.""" + monkeypatch.setattr(supervisor_module, 
"HA_DIAG_SHADOW_MODE", True) + sup = _setup_supervisor(tmp_path, monkeypatch) + _write_event(tmp_path / "events", _make_event("ha_entity_unavailable_long")) + + sup._process_ha_events() + + action_id = "alert-ha-entity-unavailable-chelsty-ha" + assert _pending(tmp_path, action_id).exists() + action = _read_action(tmp_path, "pending", action_id) + assert action["type"] == "alert_only" + assert "[SHADOW MODE]" not in action["description"] + + +# --------------------------------------------------------------------------- +# 8. Non-HA events are ignored # --------------------------------------------------------------------------- def test_non_ha_events_not_routed(tmp_path, monkeypatch): diff --git a/services/ha-diag-agent/DEPLOY.md b/services/ha-diag-agent/DEPLOY.md new file mode 100644 index 0000000..0e382d4 --- /dev/null +++ b/services/ha-diag-agent/DEPLOY.md @@ -0,0 +1,239 @@ +# ha-diag-agent Deployment Guide + +## Section 1: Prerequisites + +### HA long-lived access token + +The agent authenticates to Home Assistant with a long-lived token issued by a +dedicated service account. Do not use a personal admin token. + +1. In HA: **Settings → People → Add Person** + - Name: `diag_agent` + - Do **not** add to any group (no admin rights needed) +2. Log in to HA as `diag_agent` +3. Go to **Profile → Long-Lived Access Tokens → Create token** + - Name: `ha-diag-agent` + - Copy the token — it is shown only once +4. Store the token in the node's `.env` file (see Section 2) + +### Tailnet reachability check (chelsty-infra only) + +`chelsty-infra` reaches Home Assistant on `chelsty-ha` over Tailscale. +Verify before deploying: + +```bash +curl -sf http://100.70.180.90:8123/api/ \ + -H "Authorization: Bearer " | python3 -m json.tool +# Expect: {"message": "API running."} +``` + +If the request times out, check that both nodes are on the Tailscale mesh +(`tailscale status`) and that `chelsty-ha` is powered on. 
+ +--- + +## Section 2: Per-host config + +Create `/opt/homelab/config/ha-diag-agent/.env` on **each target node**: + +### piha + +```bash +mkdir -p /opt/homelab/config/ha-diag-agent +cat > /opt/homelab/config/ha-diag-agent/.env << 'EOF' +HA_URL=http://localhost:8123 +HA_TOKEN= +NODE_NAME=piha +LOCATION_TAG=ken +CHECK_INTERVAL=60 +CHECK_INTERVAL_UNAVAILABLE=3600 +UNAVAILABLE_THRESHOLD_HOURS=24 +ALERT_COOLDOWN_HOURS=6 +LOG_LEVEL=info +EOF +chmod 600 /opt/homelab/config/ha-diag-agent/.env +``` + +### chelsty-infra + +```bash +mkdir -p /opt/homelab/config/ha-diag-agent +cat > /opt/homelab/config/ha-diag-agent/.env << 'EOF' +HA_URL=http://100.70.180.90:8123 +HA_TOKEN= +NODE_NAME=chelsty-infra +LOCATION_TAG=chelsty +CHECK_INTERVAL=60 +CHECK_INTERVAL_UNAVAILABLE=3600 +UNAVAILABLE_THRESHOLD_HOURS=24 +ALERT_COOLDOWN_HOURS=6 +LOG_LEVEL=info +EOF +chmod 600 /opt/homelab/config/ha-diag-agent/.env +``` + +> If `chelsty-ha` gets a new Tailscale IP, update `HA_URL` in this file and +> restart the container. + +--- + +## Section 3: Deploy procedure + +### From SATURN (standard flow) + +```bash +# 1. Commit and push changes from SATURN +git push + +# 2. SSH to target node +ssh oskar@piha # or chelsty-infra + +# 3. Pull latest and deploy +cd ~/homelab-codex-ws +git pull +scripts/deploy/deploy.sh --service ha-diag-agent +``` + +### chelsty-infra (docker-compose v1) + +`chelsty-infra` runs docker-compose v1 (1.29.2). The deploy script calls +`docker-compose` (hyphenated), which is correct. 
If you need to run manually: + +```bash +cd ~/homelab-codex-ws/services/ha-diag-agent +docker-compose up -d --build +``` + +--- + +## Section 4: Verification + +```bash +# Container is up +docker ps | grep ha-diag-agent + +# Last 50 log lines +docker logs ha-diag-agent --tail 50 + +# FastAPI health endpoint +curl http://localhost:8087/health +# Expect: {"status": "ok", "ws_connected": true, ...} + +# Events are being written +ls /opt/homelab/events/<node>/ +# Expect: ha_*.json files appearing within the first CHECK_INTERVAL seconds + +# Supervisor is picking up events (check on VPS / control-plane) +tail -f /opt/homelab/logs/supervisor.log | grep ha_ +``` + +--- + +## Section 5: First-48h observation (shadow mode) + +The supervisor starts with `HA_DIAG_SHADOW_MODE=true` (default). During this +window, `ha_websocket_dead` events are downgraded to `alert_only` actions +tagged `[SHADOW MODE]` rather than triggering an automatic restart. + +Watch for these signals in Telegram: + +- `[SHADOW MODE] would have triggered container_restart for homeassistant` — + confirms the detection path works end-to-end +- `ha_entity_unavailable_long` / `ha_integration_failed` / etc. — these are + always `alert_only` regardless of shadow mode; verify descriptions look + accurate and thresholds are reasonable + +Things to evaluate: + +| Question | Good sign | +|----------|-----------| +| Are shadow alerts firing at reasonable frequency? | ≤ 1 per 30 min per node | +| Are there false positives? | No alerts during known-good uptime | +| Are entity-unavailable alerts describing real entities? | Yes, names match HA UI | +| Are integration-failed alerts genuine? | Yes, not noise from startup | + +Note any false positives or noisy thresholds before enabling production mode. + +--- + +## Section 6: Enabling production mode + +`HA_DIAG_SHADOW_MODE` is an environment variable read by the supervisor +container.
The VPS supervisor env vars live in the version-controlled +override file at `hosts/vps/runtime/control-plane/docker-compose.override.yml` +(not in a runtime `.env` file — the supervisor has no `env_file:` directive). + +When the 48h observation period looks clean: + +**1. Edit the override file on SATURN:** + +```yaml +# hosts/vps/runtime/control-plane/docker-compose.override.yml +services: + supervisor: + environment: + - NODE_ALIAS_MAP={"node-2":"chelsty"} + - HA_DIAG_SHADOW_MODE=false # add this line +``` + +**2. Commit and push from SATURN:** + +```bash +git add hosts/vps/runtime/control-plane/docker-compose.override.yml +git commit -m "feat(control-plane): disable HA shadow mode — production ready" +git push +``` + +**3. Apply on VPS:** + +```bash +ssh oskar@100.95.58.48 +cd ~/homelab-codex-ws && git pull +docker compose \ + -f services/control-plane/docker-compose.yml \ + -f hosts/vps/runtime/control-plane/docker-compose.override.yml \ + up -d supervisor +``` + +**4. Confirm:** + +```bash +docker logs control-plane-supervisor --tail 5 +# Expect: shadow_mode=False — HA container_restart actions enabled +``` + +From this point, the next `ha_websocket_dead` event will generate a +`container_restart` action in the approval queue. The 30-minute cooldown +still applies after each restart. 
+ +--- + +## Section 7: Rollback + +If production mode causes unexpected behaviour: + +```bash +# Option A — re-enable shadow mode +# On SATURN: edit hosts/vps/runtime/control-plane/docker-compose.override.yml +# Set HA_DIAG_SHADOW_MODE=true (or remove the line — default is true) +# Commit, push, then on VPS: +ssh oskar@100.95.58.48 +cd ~/homelab-codex-ws && git pull +docker compose \ + -f services/control-plane/docker-compose.yml \ + -f hosts/vps/runtime/control-plane/docker-compose.override.yml \ + up -d supervisor + +# Option B — stop ha-diag-agent entirely on affected nodes +ssh oskar@<node> +docker stop ha-diag-agent + +# Events written before rollback remain in /opt/homelab/events/<node>/ +# and are historical only — no automated action will be taken on them +# unless the supervisor re-processes them. Already-processed IDs are skipped via +# _ha_processed_event_ids, but that set is in-memory and starts empty in Supervisor.__init__, so re-check after the Option A restart. +``` + +Any `container_restart` actions still in `pending/` after rollback can be +manually rejected via the Telegram bot or by deleting the action files from +`/opt/homelab/actions/pending/` on the VPS. diff --git a/services/ha-diag-agent/README.md b/services/ha-diag-agent/README.md index f74008b..c35e499 100644 --- a/services/ha-diag-agent/README.md +++ b/services/ha-diag-agent/README.md @@ -52,6 +52,17 @@ checks are APScheduler intervals (stateless REST polls). Event routing in supervisor (Phase 5) maps these to `notify` actions. `ha_websocket_recovered` should be routed to clear any active `ha_websocket_dead` incident. +## First-time deployment + +See **[DEPLOY.md](DEPLOY.md)** for the full procedure: HA token creation, +per-host `.env` config, deploy commands, verification steps, 48h shadow-mode +observation, and rollback. + +**Shadow mode** (`HA_DIAG_SHADOW_MODE`, default `true` on the control-plane): +`ha_websocket_dead` events are downgraded to `alert_only` with a `[SHADOW MODE]` +note instead of queuing an automatic `container_restart`.
Set to `false` in +`hosts/vps/runtime/control-plane/docker-compose.override.yml` (the supervisor has no `env_file:`; see DEPLOY.md Section 6) when ready for live actions. + ## Deployment model The agent is deployed **per-host** but targets a potentially remote HA instance: