345 lines
14 KiB
Python
345 lines
14 KiB
Python
|
|
"""Tests for HA diagnostic event routing in the supervisor."""
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
# Add src/ to path so we can import supervisor without installing
|
||
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||
|
|
import supervisor as supervisor_module
|
||
|
|
from supervisor import Supervisor
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Helpers
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def _make_event(event_type: str, node: str = "chelsty-ha", service: str = "homeassistant",
|
||
|
|
payload: dict | None = None, message: str = "") -> dict:
|
||
|
|
return {
|
||
|
|
"id": f"evt-{node}-{int(time.time())}-{event_type}-{service}-1",
|
||
|
|
"type": event_type,
|
||
|
|
"node": node,
|
||
|
|
"service": service,
|
||
|
|
"severity": "warning",
|
||
|
|
"timestamp": int(time.time()),
|
||
|
|
"message": message or f"Test event: {event_type}",
|
||
|
|
"payload": payload or {"location_tag": "chelsty"},
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
def _write_event(events_dir: Path, event: dict) -> Path:
|
||
|
|
path = events_dir / f"{event['id']}.json"
|
||
|
|
path.write_text(json.dumps(event))
|
||
|
|
return path
|
||
|
|
|
||
|
|
|
||
|
|
def _setup_supervisor(tmp_path: Path, monkeypatch) -> Supervisor:
    """Build a Supervisor whose module-level directories live under *tmp_path*.

    Creates ``actions``, ``events``, ``world``, ``repo/hosts`` and ``state``
    below *tmp_path*, points the supervisor module's ``ACTIONS_DIR``,
    ``EVENTS_DIR``, ``WORLD_DIR`` and ``REPO_ROOT`` at the matching paths via
    *monkeypatch*, and returns a fresh instance with empty desired/actual
    state.
    """
    repo_root = tmp_path / "repo"
    redirected = {
        "ACTIONS_DIR": tmp_path / "actions",
        "EVENTS_DIR": tmp_path / "events",
        "WORLD_DIR": tmp_path / "world",
        "REPO_ROOT": repo_root,
    }

    # NOTE: "state" is created on disk but not wired to any module constant
    # in this helper.
    to_create = [
        redirected["ACTIONS_DIR"],
        redirected["EVENTS_DIR"],
        redirected["WORLD_DIR"],
        repo_root / "hosts",
        tmp_path / "state",
    ]
    for directory in to_create:
        directory.mkdir(parents=True, exist_ok=True)

    for attr_name, location in redirected.items():
        monkeypatch.setattr(supervisor_module, attr_name, location)

    instance = Supervisor()
    # Empty desired/actual state so the reconcile drift loop is a no-op.
    instance.desired_state = {"services": {}}
    instance.actual_state = {"services": {}, "nodes": {}, "incidents": {}}
    return instance
|
||
|
|
|
||
|
|
|
||
|
|
def _pending(tmp_path: Path, action_id: str) -> Path:
|
||
|
|
return tmp_path / "actions" / "pending" / f"{action_id}.json"
|
||
|
|
|
||
|
|
|
||
|
|
def _read_action(tmp_path: Path, state: str, action_id: str) -> dict:
|
||
|
|
return json.loads((tmp_path / "actions" / state / f"{action_id}.json").read_text())
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 1. Each event type → correct action type
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_ha_websocket_dead_generates_container_restart(tmp_path, monkeypatch):
    """An ``ha_websocket_dead`` event yields a pending ``container_restart`` action."""
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    _write_event(tmp_path / "events", _make_event("ha_websocket_dead"))

    supervisor._process_ha_events()

    restart_id = "container-restart-chelsty-ha-homeassistant"
    assert _pending(tmp_path, restart_id).exists()

    # The action must carry the node/service the event was raised for.
    restart = _read_action(tmp_path, "pending", restart_id)
    assert restart["type"] == "container_restart"
    assert restart["service"] == "homeassistant"
    assert restart["node"] == "chelsty-ha"
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "event_type,expected_suffix",
    [
        ("ha_integration_failed", "integration-failed"),
        ("ha_entity_unavailable_long", "entity-unavailable"),
        ("ha_automation_failing", "automation-failing"),
        ("ha_update_available", "update-available"),
        ("ha_recorder_lag", "recorder-lag"),
        ("ha_system_health_degraded", "system-health-degraded"),
    ],
)
def test_alert_only_events_generate_alert_actions(
    tmp_path, monkeypatch, event_type, expected_suffix
):
    """Each listed event type yields a pending ``alert_only`` action.

    The expected action id is ``alert-ha-<expected_suffix>-chelsty-ha``.
    """
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    alert_event = _make_event(event_type)
    _write_event(tmp_path / "events", alert_event)

    supervisor._process_ha_events()

    alert_id = f"alert-ha-{expected_suffix}-chelsty-ha"
    alert_file = _pending(tmp_path, alert_id)
    assert alert_file.exists(), f"No pending action for {event_type}"

    alert = _read_action(tmp_path, "pending", alert_id)
    assert alert["type"] == "alert_only"
    assert alert["node"] == "chelsty-ha"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 2. Transition suppression
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_ha_websocket_dead_suppressed_during_transition(tmp_path, monkeypatch):
    """No restart is queued while a recent ``containers_not_running`` incident is active."""
    supervisor = _setup_supervisor(tmp_path, monkeypatch)

    # World state: homeassistant is unhealthy and linked to an active
    # containers_not_running incident.
    incident_id = "inc-123-chelsty-ha-homeassistant"
    service_record = {
        "node": "chelsty-ha",
        "service": "homeassistant",
        "status": "unhealthy",
        "incident_id": incident_id,
    }
    incident_record = {
        "id": incident_id,
        "status": "active",
        "trigger_type": "containers_not_running",
        "last_occurrence": time.time() - 60,  # 1 min ago — within 5-min window
    }
    supervisor.actual_state["services"]["chelsty-ha/homeassistant"] = service_record
    supervisor.actual_state["incidents"][incident_id] = incident_record

    _write_event(tmp_path / "events", _make_event("ha_websocket_dead"))
    supervisor._process_ha_events()

    restart_file = _pending(tmp_path, "container-restart-chelsty-ha-homeassistant")
    assert not restart_file.exists(), "Action should be suppressed during transition"
|
||
|
|
|
||
|
|
|
||
|
|
def test_ha_alert_suppressed_during_transition(tmp_path, monkeypatch):
    """No alert action is queued while a recent ``containers_not_running`` incident is active.

    Writes one event per entry in ``HA_ALERT_ONLY_EVENTS`` and checks that no
    pending file exists for any suffix in ``_HA_ALERT_ID_SUFFIX``.
    """
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    events_dir = tmp_path / "events"

    incident_id = "inc-456-chelsty-ha-homeassistant"
    service_record = {
        "node": "chelsty-ha",
        "service": "homeassistant",
        "status": "unhealthy",
        "incident_id": incident_id,
    }
    incident_record = {
        "id": incident_id,
        "status": "active",
        "trigger_type": "containers_not_running",
        "last_occurrence": time.time() - 30,
    }
    supervisor.actual_state["services"]["chelsty-ha/homeassistant"] = service_record
    supervisor.actual_state["incidents"][incident_id] = incident_record

    for alert_type in supervisor_module.HA_ALERT_ONLY_EVENTS:
        _write_event(events_dir, _make_event(alert_type))

    supervisor._process_ha_events()

    for alert_suffix in supervisor_module._HA_ALERT_ID_SUFFIX.values():
        action_id = f"alert-ha-{alert_suffix}-chelsty-ha"
        leaked = _pending(tmp_path, action_id).exists()
        assert not leaked, f"{action_id} should be suppressed"
|
||
|
|
|
||
|
|
|
||
|
|
def test_transition_suppression_expires_after_window(tmp_path, monkeypatch):
    """Once the 5-minute transition window has passed, events are routed normally."""
    supervisor = _setup_supervisor(tmp_path, monkeypatch)

    incident_id = "inc-789-chelsty-ha-homeassistant"
    service_record = {
        "node": "chelsty-ha",
        "service": "homeassistant",
        "status": "unhealthy",
        "incident_id": incident_id,
    }
    incident_record = {
        "id": incident_id,
        "status": "active",
        "trigger_type": "containers_not_running",
        "last_occurrence": time.time() - 400,  # 6.7 min ago — outside window
    }
    supervisor.actual_state["services"]["chelsty-ha/homeassistant"] = service_record
    supervisor.actual_state["incidents"][incident_id] = incident_record

    _write_event(tmp_path / "events", _make_event("ha_websocket_dead"))
    supervisor._process_ha_events()

    restart_file = _pending(tmp_path, "container-restart-chelsty-ha-homeassistant")
    assert restart_file.exists(), "Should not be suppressed after window"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 3. Recovery cancellation
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_ha_websocket_recovered_cancels_pending_restart(tmp_path, monkeypatch):
    """An ``ha_websocket_recovered`` event moves a pending restart to ``cancelled``.

    A pending ``container_restart`` action is seeded by hand; after processing
    the recovery event the pending file must be gone and a file with the same
    id must exist under ``actions/cancelled`` carrying
    ``cancelled_reason == "ha_websocket_recovered"``.
    """
    sup = _setup_supervisor(tmp_path, monkeypatch)
    events_dir = tmp_path / "events"
    actions = tmp_path / "actions"
    (actions / "cancelled").mkdir(parents=True, exist_ok=True)
    # _setup_supervisor only creates "actions" itself, so make sure the
    # "pending" directory exists before seeding a file into it.
    (actions / "pending").mkdir(parents=True, exist_ok=True)

    # Pre-create a pending container_restart for homeassistant
    action_id = "container-restart-chelsty-ha-homeassistant"
    pending_action = {
        "action_id": action_id, "type": "container_restart",
        "node": "chelsty-ha", "service": "homeassistant",
        "status": "pending", "timestamp": time.time(),
    }
    _pending(tmp_path, action_id).write_text(json.dumps(pending_action))

    _write_event(events_dir, _make_event("ha_websocket_recovered"))
    sup._process_ha_events()

    assert not _pending(tmp_path, action_id).exists(), "Pending action should be cancelled"
    cancelled = actions / "cancelled" / f"{action_id}.json"
    assert cancelled.exists()
    data = json.loads(cancelled.read_text())
    assert data["cancelled_reason"] == "ha_websocket_recovered"
|
||
|
|
|
||
|
|
|
||
|
|
def test_ha_websocket_recovered_no_pending_action_is_noop(tmp_path, monkeypatch):
    """A recovery event with no pending restart to cancel must not raise."""
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    recovery_event = _make_event("ha_websocket_recovered")
    _write_event(tmp_path / "events", recovery_event)

    # Passing means this call returns without raising; nothing else is checked.
    supervisor._process_ha_events()
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 4. Cooldown
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_ha_websocket_dead_cooldown_prevents_second_restart(tmp_path, monkeypatch):
    """Two ha_websocket_dead events within 30 min produce only one container_restart."""
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    events_dir = tmp_path / "events"
    completed_dir = tmp_path / "actions" / "completed"
    completed_dir.mkdir(parents=True, exist_ok=True)
    restart_id = "container-restart-chelsty-ha-homeassistant"

    # Round 1: the first event generates the restart action.
    first_event = _make_event("ha_websocket_dead", service="homeassistant")
    _write_event(events_dir, first_event)
    supervisor._process_ha_events()

    pending_file = _pending(tmp_path, restart_id)
    assert pending_file.exists()

    # Simulate the action having completed recently (< 30 min ago) by moving
    # an updated copy from pending to completed.
    finished = json.loads(pending_file.read_text())
    finished["status"] = "completed"
    finished["finished_at"] = time.time() - 60  # 1 min ago
    completed_file = completed_dir / f"{restart_id}.json"
    completed_file.write_text(json.dumps(finished))
    _pending(tmp_path, restart_id).unlink()

    # Round 2: a second event with a different id should hit the cooldown.
    second_event = _make_event("ha_websocket_dead", service="homeassistant")
    second_event["id"] += "-2"
    _write_event(events_dir, second_event)
    supervisor._process_ha_events()

    assert not _pending(tmp_path, restart_id).exists(), "Second restart within cooldown should be suppressed"
|
||
|
|
|
||
|
|
|
||
|
|
def test_ha_websocket_dead_cooldown_expires(tmp_path, monkeypatch):
    """Once the cooldown has elapsed, a new ha_websocket_dead generates an action again."""
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    completed_dir = tmp_path / "actions" / "completed"
    completed_dir.mkdir(parents=True, exist_ok=True)

    restart_id = "container-restart-chelsty-ha-homeassistant"
    # Seed a completed restart whose finish time is more than 30 min old.
    stale_restart = {
        "action_id": restart_id,
        "type": "container_restart",
        "status": "completed",
        "finished_at": time.time() - 3700,  # > 30 min
    }
    completed_file = completed_dir / f"{restart_id}.json"
    completed_file.write_text(json.dumps(stale_restart))

    _write_event(tmp_path / "events", _make_event("ha_websocket_dead"))
    supervisor._process_ha_events()

    restart_file = _pending(tmp_path, restart_id)
    assert restart_file.exists(), "Should generate new restart after cooldown"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 5. Location tag preserved
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_location_tag_preserved_in_container_restart_payload(tmp_path, monkeypatch):
    """The event's ``location_tag`` is carried into the restart action payload."""
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    tagged_event = _make_event(
        "ha_websocket_dead",
        payload={"location_tag": "chelsty", "extra": "data"},
    )
    _write_event(tmp_path / "events", tagged_event)

    supervisor._process_ha_events()

    restart = _read_action(tmp_path, "pending", "container-restart-chelsty-ha-homeassistant")
    assert restart["payload"]["location_tag"] == "chelsty"
|
||
|
|
|
||
|
|
|
||
|
|
def test_location_tag_preserved_in_alert_only_payload(tmp_path, monkeypatch):
    """The event's ``location_tag`` is carried into the alert action payload."""
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    tagged_event = _make_event(
        "ha_entity_unavailable_long",
        payload={"location_tag": "ken", "count": 3},
    )
    _write_event(tmp_path / "events", tagged_event)

    supervisor._process_ha_events()

    alert = _read_action(tmp_path, "pending", "alert-ha-entity-unavailable-chelsty-ha")
    assert alert["payload"]["location_tag"] == "ken"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 6. Dedup — same alert type twice → only one pending action
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_alert_only_dedup_second_event_skipped(tmp_path, monkeypatch):
    """Two events of the same alert type leave exactly one pending alert action."""
    supervisor = _setup_supervisor(tmp_path, monkeypatch)
    events_dir = tmp_path / "events"

    original = _make_event("ha_entity_unavailable_long")
    duplicate = _make_event("ha_entity_unavailable_long")
    # Distinct event id so both events land in separate files.
    duplicate["id"] += "-2"
    for event in (original, duplicate):
        _write_event(events_dir, event)

    supervisor._process_ha_events()

    assert _pending(tmp_path, "alert-ha-entity-unavailable-chelsty-ha").exists()
    # Only one file — not duplicated
    pending_dir = tmp_path / "actions" / "pending"
    matching = list(pending_dir.glob("alert-ha-entity-unavailable*.json"))
    assert len(matching) == 1
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 7. Non-HA events are ignored
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def test_non_ha_events_not_routed(tmp_path, monkeypatch):
    """Event types outside the HA set must not produce any pending action.

    Writes four non-HA event types for the ``mosquitto`` service, runs the HA
    event processing, and asserts that ``actions/pending`` holds no JSON
    files.
    """
    sup = _setup_supervisor(tmp_path, monkeypatch)
    events_dir = tmp_path / "events"

    non_ha_types = ("service_unhealthy", "containers_not_running", "node_online", "deployment_failed")
    for etype in non_ha_types:
        # _make_event already stores etype under "type"; no reassignment needed.
        _write_event(events_dir, _make_event(etype, service="mosquitto"))

    sup._process_ha_events()

    pending_files = list((tmp_path / "actions" / "pending").glob("*.json"))
    assert pending_files == [], "Non-HA events should not generate actions via HA path"
|