feat(control-plane): shadow_mode for HA event auto-actions + deploy docs

- HA_DIAG_SHADOW_MODE env flag in supervisor (default true)
- shadow_mode downgrades container_restart actions to alert_only with
  [SHADOW MODE] note; same action_id and 30-min cooldown apply
- alert_only events unaffected (always routed normally)
- 3 new tests: shadow on/off for ha_websocket_dead, alert-only unaffected
- DEPLOY.md with token gen, per-host config, verification, 48h observation,
  production-mode enablement, rollback
- README.md updated with shadow mode flag summary and DEPLOY.md link

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Oskar Kapala 2026-05-29 17:04:39 +02:00
parent b9ed118b8c
commit 52607a7cdd
4 changed files with 361 additions and 2 deletions

View file

@ -75,6 +75,11 @@ HA_ALERT_COOLDOWN = 3600
# within this window — HA is in a planned restart/update and alerts would be noise. # within this window — HA is in a planned restart/update and alerts would be noise.
HA_TRANSITION_WINDOW = 300 # 5 minutes HA_TRANSITION_WINDOW = 300 # 5 minutes
# Safety switch for HA auto-remediation. While shadow mode is on, events that
# would generate container_restart are downgraded to alert_only with a
# "[SHADOW MODE]" note. Safe default for initial deployment.
#
# Shadow mode is the fail-safe state: it is switched off ONLY by an explicit
# false-like value (false / 0 / no / off; case-insensitive, surrounding
# whitespace ignored). Unset, empty, or unrecognised values (e.g. "1", "yes",
# a typo) keep shadow mode on, so a misconfigured variable can never enable
# live restarts by accident.
#
# Set HA_DIAG_SHADOW_MODE=false on the control-plane node when ready for live actions.
HA_DIAG_SHADOW_MODE = os.getenv("HA_DIAG_SHADOW_MODE", "true").strip().lower() not in {
    "false",
    "0",
    "no",
    "off",
}
# Logging setup # Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("supervisor") logger = logging.getLogger("supervisor")
@ -88,6 +93,11 @@ class Supervisor:
# on each reconcile cycle. Grows to at most ~hundreds of entries/day. # on each reconcile cycle. Grows to at most ~hundreds of entries/day.
self._ha_processed_event_ids: set = set() self._ha_processed_event_ids: set = set()
self._ensure_dirs() self._ensure_dirs()
logger.info(
"shadow_mode=%s — HA container_restart actions %s",
HA_DIAG_SHADOW_MODE,
"downgraded to alert_only" if HA_DIAG_SHADOW_MODE else "enabled",
)
def _ensure_dirs(self): def _ensure_dirs(self):
ACTIONS_DIR.mkdir(parents=True, exist_ok=True) ACTIONS_DIR.mkdir(parents=True, exist_ok=True)
@ -529,6 +539,12 @@ class Supervisor:
f"Suppressing {event_type} on {node}: homeassistant in transition" f"Suppressing {event_type} on {node}: homeassistant in transition"
) )
return return
if HA_DIAG_SHADOW_MODE:
logger.info(
"shadow_mode: suppressed container_restart for %s", event_type
)
self._generate_ha_shadow_alert(node, event)
else:
self._generate_ha_container_restart(node, event) self._generate_ha_container_restart(node, event)
elif event_type == "ha_websocket_recovered": elif event_type == "ha_websocket_recovered":
@ -617,6 +633,48 @@ class Supervisor:
} }
self._write_pending_action(action) self._write_pending_action(action)
def _generate_ha_shadow_alert(self, node: str, event: dict):
    """Queue an alert_only action in place of a Home Assistant container restart.

    Called while shadow mode is active. The action deliberately reuses the
    ``container-restart-<node>-homeassistant`` action id and the
    HA_WEBSOCKET_RESTART_COOLDOWN window, so de-duplication and cooldown
    behave the same whether or not shadow mode is on.

    Args:
        node: Name of the node the event was reported for.
        event: The HA event; its optional "payload" mapping is copied into
            the action payload.

    Returns:
        None. Nothing is written when an action with the same id is already
        pending/approved/running, or when the id is still inside its cooldown.
    """
    target = "homeassistant"
    restart_id = f"container-restart-{node}-{target}"
    action_file = f"{restart_id}.json"

    # Find the first in-flight state (checked in order) that already holds this action.
    active_state = next(
        (
            candidate
            for candidate in ("pending", "approved", "running")
            if (ACTIONS_DIR / candidate / action_file).exists()
        ),
        None,
    )
    if active_state is not None:
        logger.debug(f"Skipping {restart_id}: already in state '{active_state}'")
        return

    if self._ha_action_recently_completed(restart_id, HA_WEBSOCKET_RESTART_COOLDOWN):
        logger.debug(
            f"Skipping {restart_id}: within {HA_WEBSOCKET_RESTART_COOLDOWN}s cooldown"
        )
        return

    # Copy the event payload so the caller's event dict is never mutated.
    shadow_payload = dict(event.get("payload", {}))
    shadow_payload.update(
        reason="ha_websocket_dead",
        svc_key=f"{node}/{target}",
        shadow_mode=True,
    )

    summary = (
        f"[SHADOW MODE] would have triggered container_restart "
        f"for {target} on {node}: HA WebSocket unresponsive"
    )
    self._write_pending_action(
        {
            "action_id": restart_id,
            "timestamp": time.time(),
            "type": "alert_only",
            "node": node,
            "service": target,
            "risk_level": "info",
            "confidence": 0.9,
            "description": summary,
            "status": "pending",
            "payload": shadow_payload,
        }
    )
def _generate_ha_alert_only(self, node: str, event: dict): def _generate_ha_alert_only(self, node: str, event: dict):
event_type = event.get("type", "") event_type = event.get("type", "")
suffix = _HA_ALERT_ID_SUFFIX.get(event_type, event_type.replace("_", "-")) suffix = _HA_ALERT_ID_SUFFIX.get(event_type, event_type.replace("_", "-"))

View file

@ -74,6 +74,7 @@ def _read_action(tmp_path: Path, state: str, action_id: str) -> dict:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def test_ha_websocket_dead_generates_container_restart(tmp_path, monkeypatch): def test_ha_websocket_dead_generates_container_restart(tmp_path, monkeypatch):
monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", False)
sup = _setup_supervisor(tmp_path, monkeypatch) sup = _setup_supervisor(tmp_path, monkeypatch)
events_dir = tmp_path / "events" events_dir = tmp_path / "events"
_write_event(events_dir, _make_event("ha_websocket_dead")) _write_event(events_dir, _make_event("ha_websocket_dead"))
@ -326,7 +327,57 @@ def test_alert_only_dedup_second_event_skipped(tmp_path, monkeypatch):
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# 7. Non-HA events are ignored # 7. Shadow mode
# ---------------------------------------------------------------------------
def test_shadow_mode_websocket_dead_generates_alert_not_restart(tmp_path, monkeypatch):
    """Shadow mode on: ha_websocket_dead must queue a [SHADOW MODE] alert_only action."""
    # Force the module-level flag on before the supervisor is created.
    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", True)
    supervisor = _setup_supervisor(tmp_path, monkeypatch)

    dead_event = _make_event("ha_websocket_dead")
    _write_event(tmp_path / "events", dead_event)
    supervisor._process_ha_events()

    # The shadow alert reuses the id of the restart it stands in for.
    expected_id = "container-restart-chelsty-ha-homeassistant"
    assert _pending(tmp_path, expected_id).exists(), "Shadow alert should be written"

    written = _read_action(tmp_path, "pending", expected_id)
    assert written["type"] == "alert_only"
    assert "[SHADOW MODE]" in written["description"]
    assert written["payload"].get("shadow_mode") is True
def test_no_shadow_mode_websocket_dead_generates_container_restart(tmp_path, monkeypatch):
    """Shadow mode off: ha_websocket_dead must queue a real container_restart action."""
    # Force the module-level flag off before the supervisor is created.
    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", False)
    supervisor = _setup_supervisor(tmp_path, monkeypatch)

    dead_event = _make_event("ha_websocket_dead")
    _write_event(tmp_path / "events", dead_event)
    supervisor._process_ha_events()

    expected_id = "container-restart-chelsty-ha-homeassistant"
    assert _pending(tmp_path, expected_id).exists()

    written = _read_action(tmp_path, "pending", expected_id)
    assert written["type"] == "container_restart"
    # The normal path must not carry the shadow-mode marker.
    assert "[SHADOW MODE]" not in written["description"]
def test_shadow_mode_alert_only_events_unaffected(tmp_path, monkeypatch):
    """Shadow mode on: an alert-only event (ha_entity_unavailable_long) is routed as usual."""
    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", True)
    supervisor = _setup_supervisor(tmp_path, monkeypatch)

    unavailable_event = _make_event("ha_entity_unavailable_long")
    _write_event(tmp_path / "events", unavailable_event)
    supervisor._process_ha_events()

    expected_id = "alert-ha-entity-unavailable-chelsty-ha"
    assert _pending(tmp_path, expected_id).exists()

    written = _read_action(tmp_path, "pending", expected_id)
    assert written["type"] == "alert_only"
    # Shadow mode only rewrites restart actions, so no marker is expected here.
    assert "[SHADOW MODE]" not in written["description"]
# ---------------------------------------------------------------------------
# 8. Non-HA events are ignored
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def test_non_ha_events_not_routed(tmp_path, monkeypatch): def test_non_ha_events_not_routed(tmp_path, monkeypatch):

View file

@ -0,0 +1,239 @@
# ha-diag-agent Deployment Guide
## Section 1: Prerequisites
### HA long-lived access token
The agent authenticates to Home Assistant with a long-lived token issued by a
dedicated service account. Do not use a personal admin token.
1. In HA: **Settings → People → Add Person**
- Name: `diag_agent`
- Do **not** add to any group (no admin rights needed)
2. Log in to HA as `diag_agent`
3. Go to **Profile → Long-Lived Access Tokens → Create token**
- Name: `ha-diag-agent`
- Copy the token — it is shown only once
4. Store the token in the node's `.env` file (see Section 2)
### Tailnet reachability check (chelsty-infra only)
`chelsty-infra` reaches Home Assistant on `chelsty-ha` over Tailscale.
Verify before deploying:
```bash
curl -sf http://100.70.180.90:8123/api/ \
-H "Authorization: Bearer <token>" | python3 -m json.tool
# Expect: {"message": "API running."}
```
If the request times out, check that both nodes are on the Tailscale mesh
(`tailscale status`) and that `chelsty-ha` is powered on.
---
## Section 2: Per-host config
Create `/opt/homelab/config/ha-diag-agent/.env` on **each target node**:
### piha
```bash
mkdir -p /opt/homelab/config/ha-diag-agent
cat > /opt/homelab/config/ha-diag-agent/.env << 'EOF'
HA_URL=http://localhost:8123
HA_TOKEN=<long-lived-token-for-piha>
NODE_NAME=piha
LOCATION_TAG=ken
CHECK_INTERVAL=60
CHECK_INTERVAL_UNAVAILABLE=3600
UNAVAILABLE_THRESHOLD_HOURS=24
ALERT_COOLDOWN_HOURS=6
LOG_LEVEL=info
EOF
chmod 600 /opt/homelab/config/ha-diag-agent/.env
```
### chelsty-infra
```bash
mkdir -p /opt/homelab/config/ha-diag-agent
cat > /opt/homelab/config/ha-diag-agent/.env << 'EOF'
HA_URL=http://100.70.180.90:8123
HA_TOKEN=<long-lived-token-for-chelsty-ha>
NODE_NAME=chelsty-infra
LOCATION_TAG=chelsty
CHECK_INTERVAL=60
CHECK_INTERVAL_UNAVAILABLE=3600
UNAVAILABLE_THRESHOLD_HOURS=24
ALERT_COOLDOWN_HOURS=6
LOG_LEVEL=info
EOF
chmod 600 /opt/homelab/config/ha-diag-agent/.env
```
> If `chelsty-ha` gets a new Tailscale IP, update `HA_URL` in this file and
> restart the container.
---
## Section 3: Deploy procedure
### From SATURN (standard flow)
```bash
# 1. Commit and push changes from SATURN
git push
# 2. SSH to target node
ssh oskar@piha # or chelsty-infra
# 3. Pull latest and deploy
cd ~/homelab-codex-ws
git pull
scripts/deploy/deploy.sh --service ha-diag-agent
```
### chelsty-infra (docker-compose v1)
`chelsty-infra` runs docker-compose v1 (1.29.2). The deploy script calls
`docker-compose` (hyphenated), which is correct. If you need to run manually:
```bash
cd ~/homelab-codex-ws/services/ha-diag-agent
docker-compose up -d --build
```
---
## Section 4: Verification
```bash
# Container is up
docker ps | grep ha-diag-agent
# Last 50 log lines
docker logs ha-diag-agent --tail 50
# FastAPI health endpoint
curl http://localhost:8087/health
# Expect: {"status": "ok", "ws_connected": true, ...}
# Events are being written
ls /opt/homelab/events/<node-name>/
# Expect: ha_*.json files appearing within the first CHECK_INTERVAL seconds
# Supervisor is picking up events (check on VPS / control-plane)
tail -f /opt/homelab/logs/supervisor.log | grep ha_
```
---
## Section 5: First-48h observation (shadow mode)
The supervisor starts with `HA_DIAG_SHADOW_MODE=true` (default). During this
window, `ha_websocket_dead` events are downgraded to `alert_only` actions
tagged `[SHADOW MODE]` rather than triggering an automatic restart.
Watch for these signals in Telegram:
- `[SHADOW MODE] would have triggered container_restart for homeassistant`
confirms the detection path works end-to-end
- `ha_entity_unavailable_long` / `ha_integration_failed` / etc. — these are
always `alert_only` regardless of shadow mode; verify descriptions look
accurate and thresholds are reasonable
Things to evaluate:
| Question | Good sign |
|----------|-----------|
| Are shadow alerts firing at reasonable frequency? | ≤ 1 per 30 min per node |
| Are there false positives? | No alerts during known-good uptime |
| Are entity-unavailable alerts describing real entities? | Yes, names match HA UI |
| Are integration-failed alerts genuine? | Yes, not noise from startup |
Note any false positives or noisy thresholds before enabling production mode.
---
## Section 6: Enabling production mode
`HA_DIAG_SHADOW_MODE` is an environment variable read by the supervisor
container. The VPS supervisor env vars live in the version-controlled
override file at `hosts/vps/runtime/control-plane/docker-compose.override.yml`
(not in a runtime `.env` file — the supervisor has no `env_file:` directive).
When the 48h observation period looks clean:
**1. Edit the override file on SATURN:**
```yaml
# hosts/vps/runtime/control-plane/docker-compose.override.yml
services:
supervisor:
environment:
- NODE_ALIAS_MAP={"node-2":"chelsty"}
- HA_DIAG_SHADOW_MODE=false # add this line
```
**2. Commit and push from SATURN:**
```bash
git add hosts/vps/runtime/control-plane/docker-compose.override.yml
git commit -m "feat(control-plane): disable HA shadow mode — production ready"
git push
```
**3. Apply on VPS:**
```bash
ssh oskar@100.95.58.48
cd ~/homelab-codex-ws && git pull
docker compose \
-f services/control-plane/docker-compose.yml \
-f hosts/vps/runtime/control-plane/docker-compose.override.yml \
up -d supervisor
```
**4. Confirm:**
```bash
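# The shadow_mode line is logged once at supervisor start-up, so search for it
# rather than relying on the last few log lines.
docker logs control-plane-supervisor 2>&1 | grep shadow_mode | tail -n 1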
# Expect: shadow_mode=False — HA container_restart actions enabled
```
From this point, the next `ha_websocket_dead` event will generate a
`container_restart` action in the approval queue. The 30-minute cooldown
still applies after each restart.
---
## Section 7: Rollback
If production mode causes unexpected behaviour:
```bash
# Option A — re-enable shadow mode
# On SATURN: edit hosts/vps/runtime/control-plane/docker-compose.override.yml
# Set HA_DIAG_SHADOW_MODE=true (or remove the line — default is true)
# Commit, push, then on VPS:
ssh oskar@100.95.58.48
cd ~/homelab-codex-ws && git pull
docker compose \
-f services/control-plane/docker-compose.yml \
-f hosts/vps/runtime/control-plane/docker-compose.override.yml \
up -d supervisor
# Option B — stop ha-diag-agent entirely on affected nodes
ssh oskar@<node>
docker stop ha-diag-agent
# Events written before rollback remain in /opt/homelab/events/<node>/
# and are historical only — no automated action will be taken on them
# unless the supervisor re-processes them, which it won't (already in
# _ha_processed_event_ids).
```
Any `container_restart` actions still in `pending/` after rollback can be
manually rejected via the Telegram bot or by deleting the action files from
`/opt/homelab/actions/pending/` on the VPS.

View file

@ -52,6 +52,17 @@ checks are APScheduler intervals (stateless REST polls).
Event routing in supervisor (Phase 5) maps these to `notify` actions. Event routing in supervisor (Phase 5) maps these to `notify` actions.
`ha_websocket_recovered` should be routed to clear any active `ha_websocket_dead` incident. `ha_websocket_recovered` should be routed to clear any active `ha_websocket_dead` incident.
## First-time deployment
See **[DEPLOY.md](DEPLOY.md)** for the full procedure: HA token creation,
per-host `.env` config, deploy commands, verification steps, 48h shadow-mode
observation, and rollback.
**Shadow mode** (`HA_DIAG_SHADOW_MODE`, default `true` on the control-plane):
`ha_websocket_dead` events are downgraded to `alert_only` with a `[SHADOW MODE]`
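note instead of queuing an automatic `container_restart`. When ready for live
actions, set `HA_DIAG_SHADOW_MODE=false` in
`hosts/vps/runtime/control-plane/docker-compose.override.yml` and recreate the
supervisor container (see DEPLOY.md, Section 6). The supervisor has no
`env_file:` directive, so a runtime `.env` file on the VPS is not read.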
## Deployment model ## Deployment model
The agent is deployed **per-host** but targets a potentially remote HA instance: The agent is deployed **per-host** but targets a potentially remote HA instance: