feat(control-plane): shadow_mode for HA event auto-actions + deploy docs
- HA_DIAG_SHADOW_MODE env flag in supervisor (default true) - shadow_mode downgrades container_restart actions to alert_only with [SHADOW MODE] note; same action_id and 30-min cooldown apply - alert_only events unaffected (always routed normally) - 3 new tests: shadow on/off for ha_websocket_dead, alert-only unaffected - DEPLOY.md with token gen, per-host config, verification, 48h observation, production-mode enablement, rollback - README.md updated with shadow mode flag summary and DEPLOY.md link Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
b9ed118b8c
commit
52607a7cdd
|
|
@ -75,6 +75,11 @@ HA_ALERT_COOLDOWN = 3600
|
|||
# within this window — HA is in a planned restart/update and alerts would be noise.
|
||||
HA_TRANSITION_WINDOW = 300 # 5 minutes
|
||||
|
||||
# When True, events that would generate container_restart are downgraded to alert_only
|
||||
# with a "[SHADOW MODE]" note. Safe default for initial deployment; set
|
||||
# HA_DIAG_SHADOW_MODE=false on the control-plane node when ready for live actions.
|
||||
# Fail safe: shadow mode stays ON unless the variable is explicitly set to a
# recognised "false" value. A typo or unexpected value (e.g. "1", "yes",
# " true", or an empty string) therefore cannot silently enable live
# container_restart actions.
HA_DIAG_SHADOW_MODE = os.getenv("HA_DIAG_SHADOW_MODE", "true").strip().lower() not in (
    "false",
    "0",
    "no",
    "off",
)
|
||||
|
||||
# Logging setup
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger("supervisor")
|
||||
|
|
@ -88,6 +93,11 @@ class Supervisor:
|
|||
# on each reconcile cycle. Grows to at most ~hundreds of entries/day.
|
||||
self._ha_processed_event_ids: set = set()
|
||||
self._ensure_dirs()
|
||||
logger.info(
|
||||
"shadow_mode=%s — HA container_restart actions %s",
|
||||
HA_DIAG_SHADOW_MODE,
|
||||
"downgraded to alert_only" if HA_DIAG_SHADOW_MODE else "enabled",
|
||||
)
|
||||
|
||||
def _ensure_dirs(self):
|
||||
ACTIONS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
|
@ -529,7 +539,13 @@ class Supervisor:
|
|||
f"Suppressing {event_type} on {node}: homeassistant in transition"
|
||||
)
|
||||
return
|
||||
self._generate_ha_container_restart(node, event)
|
||||
if HA_DIAG_SHADOW_MODE:
|
||||
logger.info(
|
||||
"shadow_mode: suppressed container_restart for %s", event_type
|
||||
)
|
||||
self._generate_ha_shadow_alert(node, event)
|
||||
else:
|
||||
self._generate_ha_container_restart(node, event)
|
||||
|
||||
elif event_type == "ha_websocket_recovered":
|
||||
self._cancel_ha_container_restart(node)
|
||||
|
|
@ -617,6 +633,48 @@ class Supervisor:
|
|||
}
|
||||
self._write_pending_action(action)
|
||||
|
||||
def _generate_ha_shadow_alert(self, node: str, event: dict):
    """Shadow-mode downgrade: emit alert_only instead of container_restart.

    Reuses the action_id and cooldown of the real restart so that dedup and
    cooldown behave the same whether or not shadow mode is on.

    Args:
        node: Node name; used in the action id and recorded on the action.
        event: The triggering event. Its optional "payload" mapping is copied
            into the action payload (a missing or null payload is treated as
            empty).

    Returns:
        None. Writes at most one pending action via _write_pending_action.
    """
    service = "homeassistant"
    action_id = f"container-restart-{node}-{service}"

    # Dedup: an action with this id that is still in flight blocks a new one.
    for state in ("pending", "approved", "running"):
        if (ACTIONS_DIR / state / f"{action_id}.json").exists():
            logger.debug("Skipping %s: already in state '%s'", action_id, state)
            return

    if self._ha_action_recently_completed(action_id, HA_WEBSOCKET_RESTART_COOLDOWN):
        logger.debug(
            "Skipping %s: within %ss cooldown",
            action_id,
            HA_WEBSOCKET_RESTART_COOLDOWN,
        )
        return

    # `or {}` also covers an explicit null payload, which dict() would reject
    # with a TypeError.
    payload = dict(event.get("payload") or {})
    payload["reason"] = "ha_websocket_dead"
    payload["svc_key"] = f"{node}/{service}"
    # Marker so consumers can tell this alert stands in for a restart.
    payload["shadow_mode"] = True

    action = {
        "action_id": action_id,
        "timestamp": time.time(),
        "type": "alert_only",
        "node": node,
        "service": service,
        "risk_level": "info",
        "confidence": 0.9,
        "description": (
            "[SHADOW MODE] would have triggered container_restart "
            f"for {service} on {node}: HA WebSocket unresponsive"
        ),
        "status": "pending",
        "payload": payload,
    }
    self._write_pending_action(action)
|
||||
|
||||
def _generate_ha_alert_only(self, node: str, event: dict):
|
||||
event_type = event.get("type", "")
|
||||
suffix = _HA_ALERT_ID_SUFFIX.get(event_type, event_type.replace("_", "-"))
|
||||
|
|
|
|||
|
|
@ -74,6 +74,7 @@ def _read_action(tmp_path: Path, state: str, action_id: str) -> dict:
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_ha_websocket_dead_generates_container_restart(tmp_path, monkeypatch):
|
||||
monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", False)
|
||||
sup = _setup_supervisor(tmp_path, monkeypatch)
|
||||
events_dir = tmp_path / "events"
|
||||
_write_event(events_dir, _make_event("ha_websocket_dead"))
|
||||
|
|
@ -326,7 +327,57 @@ def test_alert_only_dedup_second_event_skipped(tmp_path, monkeypatch):
|
|||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 7. Non-HA events are ignored
|
||||
# 7. Shadow mode
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_shadow_mode_websocket_dead_generates_alert_not_restart(tmp_path, monkeypatch):
    """With shadow mode on, ha_websocket_dead yields a [SHADOW MODE] alert_only action."""
    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", True)
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    dead_event = _make_event("ha_websocket_dead")
    _write_event(tmp_path / "events", dead_event)

    supervisor._process_ha_events()

    # The shadow alert is filed under the id the real restart would have used.
    expected_id = "container-restart-chelsty-ha-homeassistant"
    assert _pending(tmp_path, expected_id).exists(), "Shadow alert should be written"

    written = _read_action(tmp_path, "pending", expected_id)
    assert written["type"] == "alert_only"
    assert "[SHADOW MODE]" in written["description"]
    assert written["payload"].get("shadow_mode") is True
|
||||
|
||||
|
||||
def test_no_shadow_mode_websocket_dead_generates_container_restart(tmp_path, monkeypatch):
    """With shadow mode off, ha_websocket_dead takes the normal container_restart path."""
    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", False)
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    dead_event = _make_event("ha_websocket_dead")
    _write_event(tmp_path / "events", dead_event)

    supervisor._process_ha_events()

    expected_id = "container-restart-chelsty-ha-homeassistant"
    assert _pending(tmp_path, expected_id).exists()

    written = _read_action(tmp_path, "pending", expected_id)
    assert written["type"] == "container_restart"
    # A live restart must not carry the shadow-mode tag.
    assert "[SHADOW MODE]" not in written["description"]
|
||||
|
||||
|
||||
def test_shadow_mode_alert_only_events_unaffected(tmp_path, monkeypatch):
    """With shadow mode on, an alert-only event (ha_entity_unavailable_long) is routed unchanged."""
    monkeypatch.setattr(supervisor_module, "HA_DIAG_SHADOW_MODE", True)
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    unavailable_event = _make_event("ha_entity_unavailable_long")
    _write_event(tmp_path / "events", unavailable_event)

    supervisor._process_ha_events()

    expected_id = "alert-ha-entity-unavailable-chelsty-ha"
    assert _pending(tmp_path, expected_id).exists()

    written = _read_action(tmp_path, "pending", expected_id)
    assert written["type"] == "alert_only"
    # Shadow mode only rewrites would-be restarts; ordinary alerts stay untagged.
    assert "[SHADOW MODE]" not in written["description"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 8. Non-HA events are ignored
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_non_ha_events_not_routed(tmp_path, monkeypatch):
|
||||
|
|
|
|||
239
services/ha-diag-agent/DEPLOY.md
Normal file
239
services/ha-diag-agent/DEPLOY.md
Normal file
|
|
@ -0,0 +1,239 @@
|
|||
# ha-diag-agent Deployment Guide
|
||||
|
||||
## Section 1: Prerequisites
|
||||
|
||||
### HA long-lived access token
|
||||
|
||||
The agent authenticates to Home Assistant with a long-lived token issued by a
|
||||
dedicated service account. Do not use a personal admin token.
|
||||
|
||||
1. In HA: **Settings → People → Add Person**
|
||||
- Name: `diag_agent`
|
||||
- Do **not** add to any group (no admin rights needed)
|
||||
2. Log in to HA as `diag_agent`
|
||||
3. Go to **Profile → Long-Lived Access Tokens → Create token**
|
||||
- Name: `ha-diag-agent`
|
||||
- Copy the token — it is shown only once
|
||||
4. Store the token in the node's `.env` file (see Section 2)
|
||||
|
||||
### Tailnet reachability check (chelsty-infra only)
|
||||
|
||||
`chelsty-infra` reaches Home Assistant on `chelsty-ha` over Tailscale.
|
||||
Verify before deploying:
|
||||
|
||||
```bash
|
||||
curl -sf http://100.70.180.90:8123/api/ \
|
||||
-H "Authorization: Bearer <token>" | python3 -m json.tool
|
||||
# Expect: {"message": "API running."}
|
||||
```
|
||||
|
||||
If the request times out, check that both nodes are on the Tailscale mesh
|
||||
(`tailscale status`) and that `chelsty-ha` is powered on.
|
||||
|
||||
---
|
||||
|
||||
## Section 2: Per-host config
|
||||
|
||||
Create `/opt/homelab/config/ha-diag-agent/.env` on **each target node**:
|
||||
|
||||
### piha
|
||||
|
||||
```bash
|
||||
mkdir -p /opt/homelab/config/ha-diag-agent
|
||||
cat > /opt/homelab/config/ha-diag-agent/.env << 'EOF'
|
||||
HA_URL=http://localhost:8123
|
||||
HA_TOKEN=<long-lived-token-for-piha>
|
||||
NODE_NAME=piha
|
||||
LOCATION_TAG=ken
|
||||
CHECK_INTERVAL=60
|
||||
CHECK_INTERVAL_UNAVAILABLE=3600
|
||||
UNAVAILABLE_THRESHOLD_HOURS=24
|
||||
ALERT_COOLDOWN_HOURS=6
|
||||
LOG_LEVEL=info
|
||||
EOF
|
||||
chmod 600 /opt/homelab/config/ha-diag-agent/.env
|
||||
```
|
||||
|
||||
### chelsty-infra
|
||||
|
||||
```bash
|
||||
mkdir -p /opt/homelab/config/ha-diag-agent
|
||||
cat > /opt/homelab/config/ha-diag-agent/.env << 'EOF'
|
||||
HA_URL=http://100.70.180.90:8123
|
||||
HA_TOKEN=<long-lived-token-for-chelsty-ha>
|
||||
NODE_NAME=chelsty-infra
|
||||
LOCATION_TAG=chelsty
|
||||
CHECK_INTERVAL=60
|
||||
CHECK_INTERVAL_UNAVAILABLE=3600
|
||||
UNAVAILABLE_THRESHOLD_HOURS=24
|
||||
ALERT_COOLDOWN_HOURS=6
|
||||
LOG_LEVEL=info
|
||||
EOF
|
||||
chmod 600 /opt/homelab/config/ha-diag-agent/.env
|
||||
```
|
||||
|
||||
> If `chelsty-ha` gets a new Tailscale IP, update `HA_URL` in this file and
|
||||
> restart the container.
|
||||
|
||||
---
|
||||
|
||||
## Section 3: Deploy procedure
|
||||
|
||||
### From SATURN (standard flow)
|
||||
|
||||
```bash
|
||||
# 1. Commit and push changes from SATURN
|
||||
git push
|
||||
|
||||
# 2. SSH to target node
|
||||
ssh oskar@piha # or chelsty-infra
|
||||
|
||||
# 3. Pull latest and deploy
|
||||
cd ~/homelab-codex-ws
|
||||
git pull
|
||||
scripts/deploy/deploy.sh --service ha-diag-agent
|
||||
```
|
||||
|
||||
### chelsty-infra (docker-compose v1)
|
||||
|
||||
`chelsty-infra` runs docker-compose v1 (1.29.2). The deploy script calls
|
||||
`docker-compose` (hyphenated), which is correct. If you need to run manually:
|
||||
|
||||
```bash
|
||||
cd ~/homelab-codex-ws/services/ha-diag-agent
|
||||
docker-compose up -d --build
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Section 4: Verification
|
||||
|
||||
```bash
|
||||
# Container is up
|
||||
docker ps | grep ha-diag-agent
|
||||
|
||||
# Last 50 log lines
|
||||
docker logs ha-diag-agent --tail 50
|
||||
|
||||
# FastAPI health endpoint
|
||||
curl http://localhost:8087/health
|
||||
# Expect: {"status": "ok", "ws_connected": true, ...}
|
||||
|
||||
# Events are being written
|
||||
ls /opt/homelab/events/<node-name>/
|
||||
# Expect: ha_*.json files appearing within the first CHECK_INTERVAL seconds
|
||||
|
||||
# Supervisor is picking up events (check on VPS / control-plane)
|
||||
tail -f /opt/homelab/logs/supervisor.log | grep ha_
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Section 5: First-48h observation (shadow mode)
|
||||
|
||||
The supervisor starts with `HA_DIAG_SHADOW_MODE=true` (default). During this
|
||||
window, `ha_websocket_dead` events are downgraded to `alert_only` actions
|
||||
tagged `[SHADOW MODE]` rather than triggering an automatic restart.
|
||||
|
||||
Watch for these signals in Telegram:
|
||||
|
||||
- `[SHADOW MODE] would have triggered container_restart for homeassistant` —
|
||||
confirms the detection path works end-to-end
|
||||
- `ha_entity_unavailable_long` / `ha_integration_failed` / etc. — these are
|
||||
always `alert_only` regardless of shadow mode; verify descriptions look
|
||||
accurate and thresholds are reasonable
|
||||
|
||||
Things to evaluate:
|
||||
|
||||
| Question | Good sign |
|
||||
|----------|-----------|
|
||||
| Are shadow alerts firing at reasonable frequency? | ≤ 1 per 30 min per node |
|
||||
| Are there false positives? | No alerts during known-good uptime |
|
||||
| Are entity-unavailable alerts describing real entities? | Yes, names match HA UI |
|
||||
| Are integration-failed alerts genuine? | Yes, not noise from startup |
|
||||
|
||||
Note any false positives or noisy thresholds before enabling production mode.
|
||||
|
||||
---
|
||||
|
||||
## Section 6: Enabling production mode
|
||||
|
||||
`HA_DIAG_SHADOW_MODE` is an environment variable read by the supervisor
|
||||
container. The VPS supervisor env vars live in the version-controlled
|
||||
override file at `hosts/vps/runtime/control-plane/docker-compose.override.yml`
|
||||
(not in a runtime `.env` file — the supervisor has no `env_file:` directive).
|
||||
|
||||
When the 48h observation period looks clean:
|
||||
|
||||
**1. Edit the override file on SATURN:**
|
||||
|
||||
```yaml
|
||||
# hosts/vps/runtime/control-plane/docker-compose.override.yml
|
||||
services:
|
||||
supervisor:
|
||||
environment:
|
||||
- NODE_ALIAS_MAP={"node-2":"chelsty"}
|
||||
- HA_DIAG_SHADOW_MODE=false # add this line
|
||||
```
|
||||
|
||||
**2. Commit and push from SATURN:**
|
||||
|
||||
```bash
|
||||
git add hosts/vps/runtime/control-plane/docker-compose.override.yml
|
||||
git commit -m "feat(control-plane): disable HA shadow mode — production ready"
|
||||
git push
|
||||
```
|
||||
|
||||
**3. Apply on VPS:**
|
||||
|
||||
```bash
|
||||
ssh oskar@100.95.58.48
|
||||
cd ~/homelab-codex-ws && git pull
|
||||
docker compose \
|
||||
-f services/control-plane/docker-compose.yml \
|
||||
-f hosts/vps/runtime/control-plane/docker-compose.override.yml \
|
||||
up -d supervisor
|
||||
```
|
||||
|
||||
**4. Confirm:**
|
||||
|
||||
```bash
|
||||
docker logs control-plane-supervisor --tail 5
|
||||
# Expect: shadow_mode=False — HA container_restart actions enabled
|
||||
```
|
||||
|
||||
From this point, the next `ha_websocket_dead` event will generate a
|
||||
`container_restart` action in the approval queue. The 30-minute cooldown
|
||||
still applies after each restart.
|
||||
|
||||
---
|
||||
|
||||
## Section 7: Rollback
|
||||
|
||||
If production mode causes unexpected behaviour:
|
||||
|
||||
```bash
|
||||
# Option A — re-enable shadow mode
|
||||
# On SATURN: edit hosts/vps/runtime/control-plane/docker-compose.override.yml
|
||||
# Set HA_DIAG_SHADOW_MODE=true (or remove the line — default is true)
|
||||
# Commit, push, then on VPS:
|
||||
ssh oskar@100.95.58.48
|
||||
cd ~/homelab-codex-ws && git pull
|
||||
docker compose \
|
||||
-f services/control-plane/docker-compose.yml \
|
||||
-f hosts/vps/runtime/control-plane/docker-compose.override.yml \
|
||||
up -d supervisor
|
||||
|
||||
# Option B — stop ha-diag-agent entirely on affected nodes
|
||||
ssh oskar@<node>
|
||||
docker stop ha-diag-agent
|
||||
|
||||
# Events written before rollback remain in /opt/homelab/events/<node>/
|
||||
# and are historical only — no automated action will be taken on them
|
||||
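# unless the supervisor re-processes them. A running supervisor won't (they
# are already in _ha_processed_event_ids), but that set appears to be created
# empty at supervisor start-up — TODO confirm that a restarted supervisor
# (as in Option A) does not re-process old event files.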
|
||||
```
|
||||
|
||||
Any `container_restart` actions still in `pending/` after rollback can be
|
||||
manually rejected via the Telegram bot or by deleting the action files from
|
||||
`/opt/homelab/actions/pending/` on the VPS.
|
||||
|
|
@ -52,6 +52,17 @@ checks are APScheduler intervals (stateless REST polls).
|
|||
Event routing in supervisor (Phase 5) maps these to `notify` actions.
|
||||
`ha_websocket_recovered` should be routed to clear any active `ha_websocket_dead` incident.
|
||||
|
||||
## First-time deployment
|
||||
|
||||
See **[DEPLOY.md](DEPLOY.md)** for the full procedure: HA token creation,
|
||||
per-host `.env` config, deploy commands, verification steps, 48h shadow-mode
|
||||
observation, and rollback.
|
||||
|
||||
**Shadow mode** (`HA_DIAG_SHADOW_MODE`, default `true` on the control-plane):
|
||||
`ha_websocket_dead` events are downgraded to `alert_only` with a `[SHADOW MODE]`
|
||||
note instead of queuing an automatic `container_restart`. Set to `false` in
|
||||
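`hosts/vps/runtime/control-plane/docker-compose.override.yml` (the supervisor
reads no runtime `.env` file; see DEPLOY.md, Section 6) and redeploy the
supervisor on the VPS when ready for live actions.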
|
||||
|
||||
## Deployment model
|
||||
|
||||
The agent is deployed **per-host** but targets a potentially remote HA instance:
|
||||
|
|
|
|||
Loading…
Reference in a new issue