feat(control-plane): shadow_mode for HA event auto-actions + deploy docs

- HA_DIAG_SHADOW_MODE env flag in supervisor (default true)
- shadow_mode downgrades container_restart actions to alert_only with
  [SHADOW MODE] note; same action_id and 30-min cooldown apply
- alert_only events unaffected (always routed normally)
- 3 new tests: shadow on/off for ha_websocket_dead, alert-only unaffected
- DEPLOY.md with token gen, per-host config, verification, 48h observation,
  production-mode enablement, rollback
- README.md updated with shadow mode flag summary and DEPLOY.md link

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 17:04:39 +02:00 · 2026-05-29 17:04:39 +02:00 · 52607a7cdd
parent b9ed118b8c
commit 52607a7cdd
4 changed files with 361 additions and 2 deletions
--- a/services/control-plane/src/supervisor.py
+++ b/services/control-plane/src/supervisor.py
@ -75,6 +75,11 @@ HA_ALERT_COOLDOWN = 3600
 # within this window — HA is in a planned restart/update and alerts would be noise.
 HA_TRANSITION_WINDOW = 300  # 5 minutes

+# When True, events that would generate container_restart are downgraded to alert_only
+# with a "[SHADOW MODE]" note. Safe default for initial deployment; set
+# HA_DIAG_SHADOW_MODE=false on the control-plane node when ready for live actions.
+# Fail-safe parsing: only an explicit false/0/no/off turns shadow mode off, so an
+# unexpected value (e.g. "1", a typo, stray whitespace) can never enable live
+# restarts by accident.
+HA_DIAG_SHADOW_MODE = os.getenv("HA_DIAG_SHADOW_MODE", "true").strip().lower() not in (
+    "false",
+    "0",
+    "no",
+    "off",
+)
+
 # Logging setup
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger("supervisor")
@ -88,6 +93,11 @@ class Supervisor:
        # on each reconcile cycle. Grows to at most ~hundreds of entries/day.
        self._ha_processed_event_ids: set = set()
        self._ensure_dirs()
+        logger.info(
+            "shadow_mode=%s — HA container_restart actions %s",
+            HA_DIAG_SHADOW_MODE,
+            "downgraded to alert_only" if HA_DIAG_SHADOW_MODE else "enabled",
+        )

    def _ensure_dirs(self):
        ACTIONS_DIR.mkdir(parents=True, exist_ok=True)
@ -529,7 +539,13 @@ class Supervisor:
                    f"Suppressing {event_type} on {node}: homeassistant in transition"
                )
                return
-            self._generate_ha_container_restart(node, event)
+            if HA_DIAG_SHADOW_MODE:
+                logger.info(
+                    "shadow_mode: suppressed container_restart for %s", event_type
+                )
+                self._generate_ha_shadow_alert(node, event)
+            else:
+                self._generate_ha_container_restart(node, event)

        elif event_type == "ha_websocket_recovered":
            self._cancel_ha_container_restart(node)
@ -617,6 +633,48 @@ class Supervisor:
        }
        self._write_pending_action(action)

+    def _generate_ha_shadow_alert(self, node: str, event: dict):
+        """Shadow-mode downgrade: emit alert_only instead of container_restart.
+
+        Uses the same action_id and cooldown as the real restart so that
+        cooldown semantics are identical regardless of shadow mode state.
+
+        Args:
+            node: Name of the node the event was reported for.
+            event: The triggering HA event; its optional ``payload`` mapping is
+                copied into the action payload.
+
+        Nothing is written when an action with this id already exists under
+        pending/approved/running, or when ``_ha_action_recently_completed``
+        reports that the HA_WEBSOCKET_RESTART_COOLDOWN window is still active.
+
+        NOTE(review): because the id is shared, a shadow alert left in
+        ``pending`` presumably also blocks a real restart right after shadow
+        mode is switched off — confirm against _generate_ha_container_restart.
+        """
+        service = "homeassistant"
+        action_id = f"container-restart-{node}-{service}"
+
+        for state in ("pending", "approved", "running"):
+            if (ACTIONS_DIR / state / f"{action_id}.json").exists():
+                logger.debug("Skipping %s: already in state '%s'", action_id, state)
+                return
+
+        if self._ha_action_recently_completed(action_id, HA_WEBSOCKET_RESTART_COOLDOWN):
+            logger.debug(
+                "Skipping %s: within %ss cooldown",
+                action_id,
+                HA_WEBSOCKET_RESTART_COOLDOWN,
+            )
+            return
+
+        # `or {}` also covers an explicit `"payload": null`, for which
+        # dict(None) would raise TypeError. Copy so the event is not mutated.
+        payload = dict(event.get("payload") or {})
+        payload["reason"] = "ha_websocket_dead"
+        payload["svc_key"] = f"{node}/{service}"
+        payload["shadow_mode"] = True
+
+        action = {
+            "action_id": action_id,
+            "timestamp": time.time(),
+            "type": "alert_only",
+            "node": node,
+            "service": service,
+            "risk_level": "info",
+            "confidence": 0.9,
+            "description": (
+                f"[SHADOW MODE] would have triggered container_restart "
+                f"for {service} on {node}: HA WebSocket unresponsive"
+            ),
+            "status": "pending",
+            "payload": payload,
+        }
+        self._write_pending_action(action)
+
    def _generate_ha_alert_only(self, node: str, event: dict):
        event_type = event.get("type", "")
        suffix = _HA_ALERT_ID_SUFFIX.get(event_type, event_type.replace("_", "-"))
--- a/services/control-plane/tests/test_supervisor_ha.py
+++ b/services/control-plane/tests/test_supervisor_ha.py
@ -74,6 +74,7 @@ def _read_action(tmp_path: Path, state: str, action_id: str) -> dict:
 # ---------------------------------------------------------------------------

 def test_ha_websocket_dead_generates_container_restart(tmp_path, monkeypatch):
+    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", False)
    sup = _setup_supervisor(tmp_path, monkeypatch)
    events_dir = tmp_path / "events"
    _write_event(events_dir, _make_event("ha_websocket_dead"))
@ -326,7 +327,57 @@ def test_alert_only_dedup_second_event_skipped(tmp_path, monkeypatch):


 # ---------------------------------------------------------------------------
-# 7. Non-HA events are ignored
+# 7. Shadow mode
+# ---------------------------------------------------------------------------
+
+def test_shadow_mode_websocket_dead_generates_alert_not_restart(tmp_path, monkeypatch):
+    """With shadow mode on, ha_websocket_dead is queued as a tagged alert_only action."""
+    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", True)
+    supervisor = _setup_supervisor(tmp_path, monkeypatch)
+    events_dir = tmp_path / "events"
+    _write_event(events_dir, _make_event("ha_websocket_dead"))
+
+    supervisor._process_ha_events()
+
+    # The shadow alert deliberately reuses the container-restart action id.
+    shadow_id = "container-restart-chelsty-ha-homeassistant"
+    assert _pending(tmp_path, shadow_id).exists(), "Shadow alert should be written"
+    queued = _read_action(tmp_path, "pending", shadow_id)
+    assert queued["type"] == "alert_only"
+    assert "[SHADOW MODE]" in queued["description"]
+    assert queued["payload"].get("shadow_mode") is True
+
+
+def test_no_shadow_mode_websocket_dead_generates_container_restart(tmp_path, monkeypatch):
+    """With shadow mode off, ha_websocket_dead takes the normal container_restart path."""
+    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", False)
+    supervisor = _setup_supervisor(tmp_path, monkeypatch)
+    events_dir = tmp_path / "events"
+    _write_event(events_dir, _make_event("ha_websocket_dead"))
+
+    supervisor._process_ha_events()
+
+    restart_id = "container-restart-chelsty-ha-homeassistant"
+    assert _pending(tmp_path, restart_id).exists()
+    queued = _read_action(tmp_path, "pending", restart_id)
+    assert queued["type"] == "container_restart"
+    assert "[SHADOW MODE]" not in queued["description"]
+
+
+def test_shadow_mode_alert_only_events_unaffected(tmp_path, monkeypatch):
+    """With shadow mode on, an alert-only event (ha_entity_unavailable_long) is routed untagged."""
+    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", True)
+    supervisor = _setup_supervisor(tmp_path, monkeypatch)
+    events_dir = tmp_path / "events"
+    _write_event(events_dir, _make_event("ha_entity_unavailable_long"))
+
+    supervisor._process_ha_events()
+
+    alert_id = "alert-ha-entity-unavailable-chelsty-ha"
+    assert _pending(tmp_path, alert_id).exists()
+    queued = _read_action(tmp_path, "pending", alert_id)
+    assert queued["type"] == "alert_only"
+    assert "[SHADOW MODE]" not in queued["description"]
+
+
+# ---------------------------------------------------------------------------
+# 8. Non-HA events are ignored
 # ---------------------------------------------------------------------------

 def test_non_ha_events_not_routed(tmp_path, monkeypatch):
--- a/services/ha-diag-agent/DEPLOY.md
+++ b/services/ha-diag-agent/DEPLOY.md
@ -0,0 +1,239 @@
+# ha-diag-agent Deployment Guide
+
+## Section 1: Prerequisites
+
+### HA long-lived access token
+
+The agent authenticates to Home Assistant with a long-lived token issued by a
+dedicated service account. Do not use a personal admin token.
+
+1. In HA: **Settings → People → Add Person**
+   - Name: `diag_agent`
+   - Enable login for this person and set a username/password (needed for step 2)
+   - Do **not** add to any group (no admin rights needed)
+2. Log in to HA as `diag_agent`
+3. Go to **Profile → Long-Lived Access Tokens → Create token**
+   - Name: `ha-diag-agent`
+   - Copy the token — it is shown only once
+4. Store the token in the node's `.env` file (see Section 2)
+
+### Tailnet reachability check (chelsty-infra only)
+
+`chelsty-infra` reaches Home Assistant on `chelsty-ha` over Tailscale.
+Verify before deploying:
+
+```bash
+curl -sf http://100.70.180.90:8123/api/ \
+  -H "Authorization: Bearer <token>" | python3 -m json.tool
+# Expect: {"message": "API running."}
+```
+
+If the request times out, check that both nodes are on the Tailscale mesh
+(`tailscale status`) and that `chelsty-ha` is powered on.
+
+---
+
+## Section 2: Per-host config
+
+Create `/opt/homelab/config/ha-diag-agent/.env` on **each target node**:
+
+### piha
+
+```bash
+mkdir -p /opt/homelab/config/ha-diag-agent
+cat > /opt/homelab/config/ha-diag-agent/.env << 'EOF'
+HA_URL=http://localhost:8123
+HA_TOKEN=<long-lived-token-for-piha>
+NODE_NAME=piha
+LOCATION_TAG=ken
+CHECK_INTERVAL=60
+CHECK_INTERVAL_UNAVAILABLE=3600
+UNAVAILABLE_THRESHOLD_HOURS=24
+ALERT_COOLDOWN_HOURS=6
+LOG_LEVEL=info
+EOF
+chmod 600 /opt/homelab/config/ha-diag-agent/.env
+```
+
+### chelsty-infra
+
+```bash
+mkdir -p /opt/homelab/config/ha-diag-agent
+cat > /opt/homelab/config/ha-diag-agent/.env << 'EOF'
+HA_URL=http://100.70.180.90:8123
+HA_TOKEN=<long-lived-token-for-chelsty-ha>
+NODE_NAME=chelsty-infra
+LOCATION_TAG=chelsty
+CHECK_INTERVAL=60
+CHECK_INTERVAL_UNAVAILABLE=3600
+UNAVAILABLE_THRESHOLD_HOURS=24
+ALERT_COOLDOWN_HOURS=6
+LOG_LEVEL=info
+EOF
+chmod 600 /opt/homelab/config/ha-diag-agent/.env
+```
+
+> If `chelsty-ha` gets a new Tailscale IP, update `HA_URL` in this file and
+> restart the container.
+
+---
+
+## Section 3: Deploy procedure
+
+### From SATURN (standard flow)
+
+```bash
+# 1. Commit and push changes from SATURN
+git push
+
+# 2. SSH to target node
+ssh oskar@piha            # or chelsty-infra
+
+# 3. Pull latest and deploy
+cd ~/homelab-codex-ws
+git pull
+scripts/deploy/deploy.sh --service ha-diag-agent
+```
+
+### chelsty-infra (docker-compose v1)
+
+`chelsty-infra` runs docker-compose v1 (1.29.2). The deploy script calls
+`docker-compose` (hyphenated), which is correct. If you need to run manually:
+
+```bash
+cd ~/homelab-codex-ws/services/ha-diag-agent
+docker-compose up -d --build
+```
+
+---
+
+## Section 4: Verification
+
+```bash
+# Container is up
+docker ps | grep ha-diag-agent
+
+# Last 50 log lines
+docker logs ha-diag-agent --tail 50
+
+# FastAPI health endpoint
+curl http://localhost:8087/health
+# Expect: {"status": "ok", "ws_connected": true, ...}
+
+# Events are being written
+ls /opt/homelab/events/<node-name>/
+# Expect: ha_*.json files appearing within the first CHECK_INTERVAL seconds
+
+# Supervisor is picking up events (check on VPS / control-plane)
+tail -f /opt/homelab/logs/supervisor.log | grep ha_
+```
+
+---
+
+## Section 5: First-48h observation (shadow mode)
+
+The supervisor starts with `HA_DIAG_SHADOW_MODE=true` (default). During this
+window, `ha_websocket_dead` events are downgraded to `alert_only` actions
+tagged `[SHADOW MODE]` rather than triggering an automatic restart.
+
+Watch for these signals in Telegram:
+
+- `[SHADOW MODE] would have triggered container_restart for homeassistant` —
+  confirms the detection path works end-to-end
+- `ha_entity_unavailable_long` / `ha_integration_failed` / etc. — these are
+  always `alert_only` regardless of shadow mode; verify descriptions look
+  accurate and thresholds are reasonable
+
+Things to evaluate:
+
+| Question | Good sign |
+|----------|-----------|
+| Are shadow alerts firing at reasonable frequency? | ≤ 1 per 30 min per node |
+| Are there false positives? | No alerts during known-good uptime |
+| Are entity-unavailable alerts describing real entities? | Yes, names match HA UI |
+| Are integration-failed alerts genuine? | Yes, not noise from startup |
+
+Note any false positives or noisy thresholds before enabling production mode.
+
+---
+
+## Section 6: Enabling production mode
+
+`HA_DIAG_SHADOW_MODE` is an environment variable read by the supervisor
+container. The VPS supervisor env vars live in the version-controlled
+override file at `hosts/vps/runtime/control-plane/docker-compose.override.yml`
+(not in a runtime `.env` file — the supervisor has no `env_file:` directive).
+
+When the 48h observation period looks clean:
+
+**1. Edit the override file on SATURN:**
+
+```yaml
+# hosts/vps/runtime/control-plane/docker-compose.override.yml
+services:
+  supervisor:
+    environment:
+      - NODE_ALIAS_MAP={"node-2":"chelsty"}
+      - HA_DIAG_SHADOW_MODE=false      # add this line
+```
+
+**2. Commit and push from SATURN:**
+
+```bash
+git add hosts/vps/runtime/control-plane/docker-compose.override.yml
+git commit -m "feat(control-plane): disable HA shadow mode — production ready"
+git push
+```
+
+**3. Apply on VPS:**
+
+```bash
+ssh oskar@100.95.58.48
+cd ~/homelab-codex-ws && git pull
+docker compose \
+  -f services/control-plane/docker-compose.yml \
+  -f hosts/vps/runtime/control-plane/docker-compose.override.yml \
+  up -d supervisor
+```
+
+**4. Confirm:**
+
+```bash
+docker logs control-plane-supervisor --tail 5
+# Expect: shadow_mode=False — HA container_restart actions enabled
+```
+
+From this point, the next `ha_websocket_dead` event will generate a
+`container_restart` action in the approval queue. The 30-minute cooldown
+still applies after each restart.
+
+---
+
+## Section 7: Rollback
+
+If production mode causes unexpected behaviour:
+
+```bash
+# Option A — re-enable shadow mode
+# On SATURN: edit hosts/vps/runtime/control-plane/docker-compose.override.yml
+# Set HA_DIAG_SHADOW_MODE=true (or remove the line — default is true)
+# Commit, push, then on VPS:
+ssh oskar@100.95.58.48
+cd ~/homelab-codex-ws && git pull
+docker compose \
+  -f services/control-plane/docker-compose.yml \
+  -f hosts/vps/runtime/control-plane/docker-compose.override.yml \
+  up -d supervisor
+
+# Option B — stop ha-diag-agent entirely on affected nodes
+ssh oskar@<node>
+docker stop ha-diag-agent
+
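+# Events written before rollback remain in /opt/homelab/events/<node>/
+# and are historical only. A supervisor that keeps running will not
+# re-process them (they are already in _ha_processed_event_ids). That set
+# appears to be in-memory only, so after Option A recreates the supervisor
+# old events may be read again — with shadow mode back on, the most they
+# can produce is a [SHADOW MODE] alert, never a container_restart.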
+```
+
+Any `container_restart` actions still in `pending/` after rollback can be
+manually rejected via the Telegram bot or by deleting the action files from
+`/opt/homelab/actions/pending/` on the VPS.
--- a/services/ha-diag-agent/README.md
+++ b/services/ha-diag-agent/README.md
@ -52,6 +52,17 @@ checks are APScheduler intervals (stateless REST polls).
 Event routing in supervisor (Phase 5) maps these to `notify` actions.
 `ha_websocket_recovered` should be routed to clear any active `ha_websocket_dead` incident.

+## First-time deployment
+
+See **[DEPLOY.md](DEPLOY.md)** for the full procedure: HA token creation,
+per-host `.env` config, deploy commands, verification steps, 48h shadow-mode
+observation, and rollback.
+
+**Shadow mode** (`HA_DIAG_SHADOW_MODE`, default `true` on the control-plane):
+`ha_websocket_dead` events are downgraded to `alert_only` with a `[SHADOW MODE]`
+note instead of queuing an automatic `container_restart`. Set to `false` in
+`hosts/vps/runtime/control-plane/docker-compose.override.yml` when ready for live
+actions (see DEPLOY.md, Section 6).
+
 ## Deployment model

 The agent is deployed **per-host** but targets a potentially remote HA instance: