feat(control-plane): shadow_mode for HA event auto-actions + deploy docs

- HA_DIAG_SHADOW_MODE env flag in supervisor (default true) - shadow_mode downgrades container_restart actions to alert_only with [SHADOW MODE] note; same action_id and 30-min cooldown apply - alert_only events unaffected (always routed normally) - 3 new tests: shadow on/off for ha_websocket_dead, alert-only unaffected - DEPLOY.md with token gen, per-host config, verification, 48h observation, production-mode enablement, rollback - README.md updated with shadow mode flag summary and DEPLOY.md link Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 17:04:39 +02:00 · 2026-05-29 17:04:39 +02:00 · 52607a7cdd
parent b9ed118b8c
commit 52607a7cdd
4 changed files with 361 additions and 2 deletions
--- a/services/control-plane/src/supervisor.py
+++ b/services/control-plane/src/supervisor.py
@ -75,6 +75,11 @@ HA_ALERT_COOLDOWN = 3600
 # within this window — HA is in a planned restart/update and alerts would be noise.
 HA_TRANSITION_WINDOW = 300  # 5 minutes
 # Shadow mode for HA auto-actions. When True, events that would generate
 # container_restart are downgraded to alert_only with a "[SHADOW MODE]" note.
 # Safe default for initial deployment; set HA_DIAG_SHADOW_MODE=false on the
 # control-plane node when ready for live actions.
 # Parsing is fail-safe: only an explicit "false", "0", "no" or "off"
 # (case-insensitive, surrounding whitespace ignored) turns shadow mode off.
 # Unset, empty or unrecognised values (including "1", "yes", "on") keep it
 # on, so a typo or an alternative truthy spelling cannot enable live restarts.
 HA_DIAG_SHADOW_MODE = os.getenv("HA_DIAG_SHADOW_MODE", "true").strip().lower() not in (
     "false",
     "0",
     "no",
     "off",
 )
 # Logging setup
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger("supervisor")
@ -88,6 +93,11 @@ class Supervisor:
        # on each reconcile cycle. Grows to at most ~hundreds of entries/day.
        self._ha_processed_event_ids: set = set()
        self._ensure_dirs()
        logger.info(
            "shadow_mode=%s — HA container_restart actions %s",
            HA_DIAG_SHADOW_MODE,
            "downgraded to alert_only" if HA_DIAG_SHADOW_MODE else "enabled",
        )
    def _ensure_dirs(self):
        ACTIONS_DIR.mkdir(parents=True, exist_ok=True)
@ -529,6 +539,12 @@ class Supervisor:
                    f"Suppressing {event_type} on {node}: homeassistant in transition"
                )
                return
            if HA_DIAG_SHADOW_MODE:
                logger.info(
                    "shadow_mode: suppressed container_restart for %s", event_type
                )
                self._generate_ha_shadow_alert(node, event)
            else:
                self._generate_ha_container_restart(node, event)
        elif event_type == "ha_websocket_recovered":
@ -617,6 +633,48 @@ class Supervisor:
        }
        self._write_pending_action(action)
    def _generate_ha_shadow_alert(self, node: str, event: dict):
        """Queue an ``alert_only`` stand-in for a Home Assistant restart.

        Shadow-mode counterpart of the real restart path. The action is
        written under the ``container-restart-<node>-homeassistant`` id and
        is checked against ``HA_WEBSOCKET_RESTART_COOLDOWN``, with the intent
        that de-duplication and cooldown behave the same whether or not
        shadow mode is on.

        Args:
            node: Name of the node the event was reported for.
            event: Event dict; its optional ``"payload"`` entry is copied
                into the action payload.

        Returns:
            None. Nothing is queued when an action file with the same id
            already exists under pending/approved/running, or when
            ``_ha_action_recently_completed`` reports the id as inside the
            cooldown window.
        """
        service = "homeassistant"
        action_id = f"container-restart-{node}-{service}"
        action_file = f"{action_id}.json"

        # Do not queue a duplicate while an action with this id is in flight.
        in_flight = next(
            (
                state
                for state in ("pending", "approved", "running")
                if (ACTIONS_DIR / state / action_file).exists()
            ),
            None,
        )
        if in_flight is not None:
            logger.debug(f"Skipping {action_id}: already in state '{in_flight}'")
            return

        cooldown = HA_WEBSOCKET_RESTART_COOLDOWN
        if self._ha_action_recently_completed(action_id, cooldown):
            logger.debug(f"Skipping {action_id}: within {cooldown}s cooldown")
            return

        # Work on a copy so the caller's event payload is left untouched.
        payload = dict(event.get("payload", {}))
        payload.update(
            reason="ha_websocket_dead",
            svc_key=f"{node}/{service}",
            shadow_mode=True,
        )

        description = (
            f"[SHADOW MODE] would have triggered container_restart "
            f"for {service} on {node}: HA WebSocket unresponsive"
        )
        self._write_pending_action(
            {
                "action_id": action_id,
                "timestamp": time.time(),
                "type": "alert_only",
                "node": node,
                "service": service,
                "risk_level": "info",
                "confidence": 0.9,
                "description": description,
                "status": "pending",
                "payload": payload,
            }
        )
    def _generate_ha_alert_only(self, node: str, event: dict):
        event_type = event.get("type", "")
        suffix = _HA_ALERT_ID_SUFFIX.get(event_type, event_type.replace("_", "-"))
--- a/services/control-plane/tests/test_supervisor_ha.py
+++ b/services/control-plane/tests/test_supervisor_ha.py
@ -74,6 +74,7 @@ def _read_action(tmp_path: Path, state: str, action_id: str) -> dict:
 # ---------------------------------------------------------------------------
 def test_ha_websocket_dead_generates_container_restart(tmp_path, monkeypatch):
    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", False)
    sup = _setup_supervisor(tmp_path, monkeypatch)
    events_dir = tmp_path / "events"
    _write_event(events_dir, _make_event("ha_websocket_dead"))
@ -326,7 +327,57 @@ def test_alert_only_dedup_second_event_skipped(tmp_path, monkeypatch):
 # ---------------------------------------------------------------------------
-# 7. Non-HA events are ignored
+# 7. Shadow mode
 # ---------------------------------------------------------------------------
 def test_shadow_mode_websocket_dead_generates_alert_not_restart(tmp_path, monkeypatch):
    """Shadow mode on: ha_websocket_dead is queued as a [SHADOW MODE] alert_only action.

    The action is expected under the container-restart id, but with type
    ``alert_only`` and ``shadow_mode`` set in its payload.
    """
    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", True)
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    events_dir = tmp_path / "events"
    _write_event(events_dir, _make_event("ha_websocket_dead"))

    supervisor._process_ha_events()

    expected_id = "container-restart-chelsty-ha-homeassistant"
    assert _pending(tmp_path, expected_id).exists(), "Shadow alert should be written"

    written = _read_action(tmp_path, "pending", expected_id)
    assert written["type"] == "alert_only"
    assert "[SHADOW MODE]" in written["description"]
    assert written["payload"].get("shadow_mode") is True
 def test_no_shadow_mode_websocket_dead_generates_container_restart(tmp_path, monkeypatch):
    """Shadow mode off: ha_websocket_dead is queued as a real container_restart action."""
    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", False)
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    events_dir = tmp_path / "events"
    _write_event(events_dir, _make_event("ha_websocket_dead"))

    supervisor._process_ha_events()

    expected_id = "container-restart-chelsty-ha-homeassistant"
    assert _pending(tmp_path, expected_id).exists()

    written = _read_action(tmp_path, "pending", expected_id)
    assert written["type"] == "container_restart"
    # The normal path must not carry the shadow-mode marker.
    assert "[SHADOW MODE]" not in written["description"]
 def test_shadow_mode_alert_only_events_unaffected(tmp_path, monkeypatch):
    """Shadow mode on: an alert-only event (ha_entity_unavailable_long) is routed as usual."""
    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", True)
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    events_dir = tmp_path / "events"
    _write_event(events_dir, _make_event("ha_entity_unavailable_long"))

    supervisor._process_ha_events()

    expected_id = "alert-ha-entity-unavailable-chelsty-ha"
    assert _pending(tmp_path, expected_id).exists()

    written = _read_action(tmp_path, "pending", expected_id)
    assert written["type"] == "alert_only"
    # Shadow mode only rewrites restart actions; plain alerts stay unmarked.
    assert "[SHADOW MODE]" not in written["description"]
 # ---------------------------------------------------------------------------
 # 8. Non-HA events are ignored
 # ---------------------------------------------------------------------------
 def test_non_ha_events_not_routed(tmp_path, monkeypatch):
--- a/services/ha-diag-agent/DEPLOY.md
+++ b/services/ha-diag-agent/DEPLOY.md
@ -0,0 +1,239 @@
 # ha-diag-agent Deployment Guide
 ## Section 1: Prerequisites
 ### HA long-lived access token
 The agent authenticates to Home Assistant with a long-lived token issued by a
 dedicated service account. Do not use a personal admin token.
 1. In HA: **Settings → People → Add Person**
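   - Name: `diag_agent`
   - Enable login for this person and set a username and password (needed to log in as `diag_agent` in step 2)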
   - Do **not** add to any group (no admin rights needed)
 2. Log in to HA as `diag_agent`
 3. Go to **Profile → Long-Lived Access Tokens → Create token**
   - Name: `ha-diag-agent`
   - Copy the token — it is shown only once
 4. Store the token in the node's `.env` file (see Section 2)
 ### Tailnet reachability check (chelsty-infra only)
 `chelsty-infra` reaches Home Assistant on `chelsty-ha` over Tailscale.
 Verify before deploying:
 ```bash
 curl -sf http://100.70.180.90:8123/api/ \
  -H "Authorization: Bearer <token>" | python3 -m json.tool
 # Expect: {"message": "API running."}
 ```
 If the request times out, check that both nodes are on the Tailscale mesh
 (`tailscale status`) and that `chelsty-ha` is powered on.
 ---
 ## Section 2: Per-host config
 Create `/opt/homelab/config/ha-diag-agent/.env` on **each target node**:
 ### piha
 ```bash
 mkdir -p /opt/homelab/config/ha-diag-agent
 cat > /opt/homelab/config/ha-diag-agent/.env << 'EOF'
 HA_URL=http://localhost:8123
 HA_TOKEN=<long-lived-token-for-piha>
 NODE_NAME=piha
 LOCATION_TAG=ken
 CHECK_INTERVAL=60
 CHECK_INTERVAL_UNAVAILABLE=3600
 UNAVAILABLE_THRESHOLD_HOURS=24
 ALERT_COOLDOWN_HOURS=6
 LOG_LEVEL=info
 EOF
 chmod 600 /opt/homelab/config/ha-diag-agent/.env
 ```
 ### chelsty-infra
 ```bash
 mkdir -p /opt/homelab/config/ha-diag-agent
 cat > /opt/homelab/config/ha-diag-agent/.env << 'EOF'
 HA_URL=http://100.70.180.90:8123
 HA_TOKEN=<long-lived-token-for-chelsty-ha>
 NODE_NAME=chelsty-infra
 LOCATION_TAG=chelsty
 CHECK_INTERVAL=60
 CHECK_INTERVAL_UNAVAILABLE=3600
 UNAVAILABLE_THRESHOLD_HOURS=24
 ALERT_COOLDOWN_HOURS=6
 LOG_LEVEL=info
 EOF
 chmod 600 /opt/homelab/config/ha-diag-agent/.env
 ```
 > If `chelsty-ha` gets a new Tailscale IP, update `HA_URL` in this file and
 > restart the container.
 ---
 ## Section 3: Deploy procedure
 ### From SATURN (standard flow)
 ```bash
 # 1. Commit and push changes from SATURN
 git push
 # 2. SSH to target node
 ssh oskar@piha            # or chelsty-infra
 # 3. Pull latest and deploy
 cd ~/homelab-codex-ws
 git pull
 scripts/deploy/deploy.sh --service ha-diag-agent
 ```
 ### chelsty-infra (docker-compose v1)
 `chelsty-infra` runs docker-compose v1 (1.29.2). The deploy script calls
 `docker-compose` (hyphenated), which is correct. If you need to run manually:
 ```bash
 cd ~/homelab-codex-ws/services/ha-diag-agent
 docker-compose up -d --build
 ```
 ---
 ## Section 4: Verification
 ```bash
 # Container is up
 docker ps | grep ha-diag-agent
 # Last 50 log lines
 docker logs ha-diag-agent --tail 50
 # FastAPI health endpoint
 curl http://localhost:8087/health
 # Expect: {"status": "ok", "ws_connected": true, ...}
 # Events are being written
 ls /opt/homelab/events/<node-name>/
 # Expect: ha_*.json files appearing within the first CHECK_INTERVAL seconds
 # Supervisor is picking up events (check on VPS / control-plane)
 tail -f /opt/homelab/logs/supervisor.log | grep ha_
 ```
 ---
 ## Section 5: First-48h observation (shadow mode)
 The supervisor starts with `HA_DIAG_SHADOW_MODE=true` (default). During this
 window, `ha_websocket_dead` events are downgraded to `alert_only` actions
 tagged `[SHADOW MODE]` rather than triggering an automatic restart.
 Watch for these signals in Telegram:
 - `[SHADOW MODE] would have triggered container_restart for homeassistant` —
  confirms the detection path works end-to-end
 - `ha_entity_unavailable_long` / `ha_integration_failed` / etc. — these are
  always `alert_only` regardless of shadow mode; verify descriptions look
  accurate and thresholds are reasonable
 Things to evaluate:
 | Question | Good sign |
 |----------|-----------|
 | Are shadow alerts firing at reasonable frequency? | ≤ 1 per 30 min per node |
 | Are there false positives? | No alerts during known-good uptime |
 | Are entity-unavailable alerts describing real entities? | Yes, names match HA UI |
 | Are integration-failed alerts genuine? | Yes, not noise from startup |
 Note any false positives or noisy thresholds before enabling production mode.
 ---
 ## Section 6: Enabling production mode
 `HA_DIAG_SHADOW_MODE` is an environment variable read by the supervisor
 container. The VPS supervisor env vars live in the version-controlled
 override file at `hosts/vps/runtime/control-plane/docker-compose.override.yml`
 (not in a runtime `.env` file — the supervisor has no `env_file:` directive).
 When the 48h observation period looks clean:
 **1. Edit the override file on SATURN:**
 ```yaml
 # hosts/vps/runtime/control-plane/docker-compose.override.yml
 services:
  supervisor:
    environment:
      - NODE_ALIAS_MAP={"node-2":"chelsty"}
      - HA_DIAG_SHADOW_MODE=false      # add this line
 ```
 **2. Commit and push from SATURN:**
 ```bash
 git add hosts/vps/runtime/control-plane/docker-compose.override.yml
 git commit -m "feat(control-plane): disable HA shadow mode — production ready"
 git push
 ```
 **3. Apply on VPS:**
 ```bash
 ssh oskar@100.95.58.48
 cd ~/homelab-codex-ws && git pull
 docker compose \
  -f services/control-plane/docker-compose.yml \
  -f hosts/vps/runtime/control-plane/docker-compose.override.yml \
  up -d supervisor
 ```
 **4. Confirm:**
 ```bash
 docker logs control-plane-supervisor --tail 5
 # Expect: shadow_mode=False — HA container_restart actions enabled
 ```
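 From this point, the next `ha_websocket_dead` event will generate a
 `container_restart` action in the approval queue. The 30-minute cooldown
 still applies after each restart. Shadow alerts are written under the same
 `container-restart-<node>-homeassistant` action id, so a leftover
 `[SHADOW MODE]` alert that is still pending (or was completed within the
 cooldown) may suppress the first real restart — clear it from the queue first.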
 ---
 ## Section 7: Rollback
 If production mode causes unexpected behaviour:
 ```bash
 # Option A — re-enable shadow mode
 # On SATURN: edit hosts/vps/runtime/control-plane/docker-compose.override.yml
 # Set HA_DIAG_SHADOW_MODE=true (or remove the line — default is true)
 # Commit, push, then on VPS:
 ssh oskar@100.95.58.48
 cd ~/homelab-codex-ws && git pull
 docker compose \
  -f services/control-plane/docker-compose.yml \
  -f hosts/vps/runtime/control-plane/docker-compose.override.yml \
  up -d supervisor
 # Option B — stop ha-diag-agent entirely on affected nodes
 ssh oskar@<node>
 docker stop ha-diag-agent
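 # Events written before rollback remain in /opt/homelab/events/<node>/
 # and are historical only — no automated action will be taken on them
 # unless the supervisor re-processes them. It skips event ids already in
 # _ha_processed_event_ids, but that set is held in memory and starts empty
 # after a supervisor restart (as in Option A) — confirm that old events
 # are not picked up again.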
 ```
 Any `container_restart` actions still in `pending/` after rollback can be
 manually rejected via the Telegram bot or by deleting the action files from
 `/opt/homelab/actions/pending/` on the VPS.
--- a/services/ha-diag-agent/README.md
+++ b/services/ha-diag-agent/README.md
@ -52,6 +52,17 @@ checks are APScheduler intervals (stateless REST polls).
 Event routing in supervisor (Phase 5) maps these to `notify` actions.
 `ha_websocket_recovered` should be routed to clear any active `ha_websocket_dead` incident.
 ## First-time deployment
 See **[DEPLOY.md](DEPLOY.md)** for the full procedure: HA token creation,
 per-host `.env` config, deploy commands, verification steps, 48h shadow-mode
 observation, and rollback.
 **Shadow mode** (`HA_DIAG_SHADOW_MODE`, default `true` on the control-plane):
 `ha_websocket_dead` events are downgraded to `alert_only` with a `[SHADOW MODE]`
 note instead of queuing an automatic `container_restart`. Set it to `false` in
 the supervisor's `environment:` list in
 `hosts/vps/runtime/control-plane/docker-compose.override.yml` when ready for
 live actions (see DEPLOY.md, Section 6).
 ## Deployment model
 The agent is deployed **per-host** but targets a potentially remote HA instance: