diff --git a/CLAUDE.md b/CLAUDE.md index fe9d251..ae31a58 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -79,6 +79,25 @@ Emit via `scripts/lib/events.sh` (shell) or `scripts/lib/events.py` (Python). Normalized event types: `deployment_started/completed/failed`, `service_unhealthy/recovered`, `node_offline/online`, `healthcheck_failed`, `remediation_started/completed`. +### Supervisor event routing table + +| Event type | Source | Action generated | Cooldown | +|---|---|---|---| +| `containers_not_running` | stability-agent | `container_restart` | dedup via stable ID | +| `mqtt_unreachable` | stability-agent | `container_restart` | dedup via stable ID | +| `service_unhealthy` / other | stability-agent | `redeploy` | dedup via stable ID | +| `disk_pressure` (high) | stability-agent | `disk_cleanup` | dedup via stable ID | +| `ha_websocket_dead` | ha-diag-agent | `container_restart` (homeassistant) | 30 min after completion | +| `ha_websocket_recovered` | ha-diag-agent | cancels matching restart | — | +| `ha_integration_failed` | ha-diag-agent | `alert_only` | 1 hour | +| `ha_entity_unavailable_long` | ha-diag-agent | `alert_only` | 1 hour | +| `ha_automation_failing` | ha-diag-agent | `alert_only` | 1 hour | +| `ha_update_available` | ha-diag-agent | `alert_only` | 1 hour | +| `ha_recorder_lag` | ha-diag-agent | `alert_only` | 1 hour | +| `ha_system_health_degraded` | ha-diag-agent | `alert_only` | 1 hour | + +HA events are routed directly from the events directory by the supervisor (not via world-state drift loop) to avoid conflicts with stability-agent's independent container health tracking. HA events are suppressed if `homeassistant` had a `containers_not_running` incident within the last 5 minutes (planned restart/update in progress). + ## Discovery Entry Points for Agents When exploring the system, use these files in order: diff --git a/services/control-plane/pyproject.toml b/services/control-plane/pyproject.toml new file mode 100644 index 0000000..da0b065 --- /dev/null +++ b/services/control-plane/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["setuptools>=68"] +build-backend = "setuptools.build_meta" + +[project] +name = "control-plane" +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "pyyaml>=6.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.1", +] + +[tool.pytest.ini_options] +testpaths = ["tests"] diff --git a/services/control-plane/src/executor.py b/services/control-plane/src/executor.py index 3dddd6d..162e9ad 100644 --- a/services/control-plane/src/executor.py +++ b/services/control-plane/src/executor.py @@ -101,6 +101,10 @@ class Executor: payload = data.get("payload", {}) success, error_msg = self._execute_disk_cleanup(node, payload) + elif action_type == "alert_only": + # Operator acknowledged the alert; no automated execution needed. + success = True + else: success = False error_msg = f"Unknown action type: {action_type}" diff --git a/services/control-plane/src/supervisor.py b/services/control-plane/src/supervisor.py index 5076e69..5d901ff 100644 --- a/services/control-plane/src/supervisor.py +++ b/services/control-plane/src/supervisor.py @@ -9,6 +9,7 @@ from pathlib import Path RUNTIME_PATH = os.getenv("RUNTIME_PATH", "/opt/homelab") WORLD_DIR = Path(RUNTIME_PATH) / "world" ACTIONS_DIR = Path(RUNTIME_PATH) / "actions" +EVENTS_DIR = Path(RUNTIME_PATH) / "events" REPO_ROOT = Path(os.getenv("REPO_ROOT", "/repo")) # Node alias map: maps alternative node names (as they appear in events/world state) @@ -32,6 +33,48 @@ CONTAINER_RESTART_TRIGGERS = {"containers_not_running", "mqtt_unreachable"} # decide explicitly (e.g. adjust Frigate retain policy or purge HA recorder). NO_DISK_CLEANUP_NODES = {"chelsty-infra", "chelsty-ha"} +# --------------------------------------------------------------------------- +# HA diagnostic event routing (ha-diag-agent events) +# --------------------------------------------------------------------------- + +# ha_websocket_dead: HA WebSocket unresponsive → restart the homeassistant container. +# Separate from CONTAINER_RESTART_TRIGGERS because these events are routed directly +# from the events dir (not via the world-state drift loop) to avoid conflicts with +# the stability-agent's independent container health tracking on the same service key. +HA_CONTAINER_RESTART_EVENTS = {"ha_websocket_dead"} + +# Alert-only events — operator notification, no automated action. +HA_ALERT_ONLY_EVENTS = { + "ha_integration_failed", + "ha_entity_unavailable_long", + "ha_automation_failing", + "ha_update_available", + "ha_recorder_lag", + "ha_system_health_degraded", +} + +# Stable action-ID suffix for each alert-only type +_HA_ALERT_ID_SUFFIX = { + "ha_integration_failed": "integration-failed", + "ha_entity_unavailable_long": "entity-unavailable", + "ha_automation_failing": "automation-failing", + "ha_update_available": "update-available", + "ha_recorder_lag": "recorder-lag", + "ha_system_health_degraded": "system-health-degraded", +} + +# 30-min cooldown after a container_restart completes; prevents restart loops +# when HA repeatedly fails to connect (e.g. bad config, slow startup). +HA_WEBSOCKET_RESTART_COOLDOWN = 1800 + +# 1-hour cooldown for alert-only events; avoids repeated Telegram noise for +# persistent conditions (e.g. an entity that stays unavailable for hours). +HA_ALERT_COOLDOWN = 3600 + +# Suppress ha_* events if homeassistant had a containers_not_running incident +# within this window — HA is in a planned restart/update and alerts would be noise. +HA_TRANSITION_WINDOW = 300 # 5 minutes + # Logging setup logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger("supervisor") @@ -41,6 +84,9 @@ class Supervisor: def __init__(self): self.desired_state = {"services": {}} self.actual_state = {"services": {}, "nodes": {}, "incidents": {}} + # In-memory set of already-routed HA event IDs; prevents re-processing + # on each reconcile cycle. Grows to at most ~hundreds of entries/day. + self._ha_processed_event_ids: set = set() self._ensure_dirs() def _ensure_dirs(self): @@ -242,6 +288,12 @@ class Supervisor: # operator can see it was auto-resolved rather than silently dropped. self._cancel_resolved_pending_actions() + # 5. Route HA diagnostic events emitted by ha-diag-agent. + # Processed directly from the events directory — not via the world-state + # drift loop — to avoid conflicts with stability-agent's independent + # container health tracking for the homeassistant service. + self._process_ha_events() + # ------------------------------------------------------------------ # Recommendation generation # ------------------------------------------------------------------ @@ -442,6 +494,199 @@ class Supervisor: except Exception as e: logger.error(f"Failed to cancel action {action_file.name}: {e}") + # ------------------------------------------------------------------ + # HA diagnostic event routing + # ------------------------------------------------------------------ + + def _process_ha_events(self): + """Scan the events directory for unprocessed ha_* events and route them.""" + if not EVENTS_DIR.exists(): + return + for event_file in sorted(EVENTS_DIR.glob("**/*.json")): + event_id = event_file.stem + if event_id in self._ha_processed_event_ids: + continue + self._ha_processed_event_ids.add(event_id) + try: + with open(event_file) as f: + event = json.load(f) + except Exception as e: + logger.debug(f"Could not read event {event_file}: {e}") + continue + if not event.get("type", "").startswith("ha_"): + continue + self._route_ha_event(event) + + def _route_ha_event(self, event: dict): + event_type = event.get("type", "") + node = event.get("node", "") + if not node: + return + + if event_type in HA_CONTAINER_RESTART_EVENTS: + if self._is_ha_in_transition(node): + logger.debug( + f"Suppressing {event_type} on {node}: homeassistant in transition" + ) + return + self._generate_ha_container_restart(node, event) + + elif event_type == "ha_websocket_recovered": + self._cancel_ha_container_restart(node) + + elif event_type in HA_ALERT_ONLY_EVENTS: + if self._is_ha_in_transition(node): + logger.debug( + f"Suppressing {event_type} on {node}: homeassistant in transition" + ) + return + self._generate_ha_alert_only(node, event) + + def _is_ha_in_transition(self, node: str) -> bool: + """Return True if homeassistant container had a recent containers_not_running incident. + + Suppresses ha_* alerts during planned HA restarts/updates to avoid + flooding the operator with secondary diagnostic alerts. + """ + svc_key = f"{node}/homeassistant" + svc_info = self.actual_state["services"].get(svc_key, {}) + incident_id = svc_info.get("incident_id") + if not incident_id: + return False + incident = self.actual_state["incidents"].get(incident_id, {}) + return ( + incident.get("status") == "active" + and incident.get("trigger_type") == "containers_not_running" + and time.time() - (incident.get("last_occurrence") or 0) < HA_TRANSITION_WINDOW + ) + + def _ha_action_recently_completed(self, action_id: str, cooldown: int) -> bool: + """Return True if action completed/rejected/cancelled within the cooldown window.""" + for state in ("completed", "rejected", "cancelled"): + path = ACTIONS_DIR / state / f"{action_id}.json" + if path.exists(): + try: + with open(path) as f: + data = json.load(f) + finished = ( + data.get("finished_at") + or data.get("cancelled_at") + or data.get("updated_at") + or 0 + ) + if time.time() - finished < cooldown: + return True + except Exception: + pass + return False + + def _generate_ha_container_restart(self, node: str, event: dict): + service = "homeassistant" + action_id = f"container-restart-{node}-{service}" + + for state in ("pending", "approved", "running"): + if (ACTIONS_DIR / state / f"{action_id}.json").exists(): + logger.debug(f"Skipping {action_id}: already in state '{state}'") + return + + if self._ha_action_recently_completed(action_id, HA_WEBSOCKET_RESTART_COOLDOWN): + logger.debug( + f"Skipping {action_id}: within {HA_WEBSOCKET_RESTART_COOLDOWN}s cooldown" + ) + return + + payload = dict(event.get("payload", {})) + payload["reason"] = "ha_websocket_dead" + payload["svc_key"] = f"{node}/{service}" + + container_name = self._get_container_name(service) + action = { + "action_id": action_id, + "timestamp": time.time(), + "type": "container_restart", + "node": node, + "service": service, + "container_name": container_name, + "risk_level": "low", + "confidence": 0.9, + "description": ( + f"Restart '{container_name}' on {node}: HA WebSocket unresponsive" + ), + "status": "pending", + "payload": payload, + } + self._write_pending_action(action) + + def _generate_ha_alert_only(self, node: str, event: dict): + event_type = event.get("type", "") + suffix = _HA_ALERT_ID_SUFFIX.get(event_type, event_type.replace("_", "-")) + action_id = f"alert-ha-{suffix}-{node}" + + for state in ("pending", "approved", "running"): + if (ACTIONS_DIR / state / f"{action_id}.json").exists(): + logger.debug(f"Skipping {action_id}: already in state '{state}'") + return + + if self._ha_action_recently_completed(action_id, HA_ALERT_COOLDOWN): + logger.debug( + f"Skipping {action_id}: within {HA_ALERT_COOLDOWN}s cooldown" + ) + return + + payload = dict(event.get("payload", {})) + payload["reason"] = event_type + + action = { + "action_id": action_id, + "timestamp": time.time(), + "type": "alert_only", + "node": node, + "service": event.get("service", "homeassistant"), + "risk_level": "info", + "confidence": 1.0, + "description": event.get( + "message", f"HA diagnostic alert: {event_type} on {node}" + ), + "status": "pending", + "payload": payload, + } + self._write_pending_action(action) + + def _cancel_ha_container_restart(self, node: str): + """Move a pending ha_websocket_dead container_restart to cancelled on recovery.""" + action_id = f"container-restart-{node}-homeassistant" + pending_path = ACTIONS_DIR / "pending" / f"{action_id}.json" + if not pending_path.exists(): + return + cancelled_dir = ACTIONS_DIR / "cancelled" + cancelled_dir.mkdir(parents=True, exist_ok=True) + dest = cancelled_dir / f"{action_id}.json" + try: + with open(pending_path) as f: + action = json.load(f) + action["status"] = "cancelled" + action["cancelled_reason"] = "ha_websocket_recovered" + action["cancelled_at"] = time.time() + with open(dest, "w") as f: + json.dump(action, f, indent=2) + pending_path.unlink() + logger.info(f"Cancelled {action_id}: ha_websocket_recovered on {node}") + except Exception as e: + logger.error(f"Failed to cancel {action_id}: {e}") + + def _write_pending_action(self, action: dict): + action_id = action["action_id"] + action_path = ACTIONS_DIR / "pending" / f"{action_id}.json" + try: + with open(action_path, "w") as f: + json.dump(action, f, indent=2) + logger.info( + f"Generated HA action: {action_id} " + f"(type={action['type']}, risk={action['risk_level']})" + ) + except Exception as e: + logger.error(f"Failed to save action {action_id}: {e}") + def loop(self, interval=30): logger.info("Starting supervisor loop") while True: diff --git a/services/control-plane/tests/__init__.py b/services/control-plane/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/services/control-plane/tests/test_supervisor_ha.py b/services/control-plane/tests/test_supervisor_ha.py new file mode 100644 index 0000000..b707361 --- /dev/null +++ b/services/control-plane/tests/test_supervisor_ha.py @@ -0,0 +1,344 @@ +"""Tests for HA diagnostic event routing in the supervisor.""" +from __future__ import annotations + +import json +import sys +import time +from pathlib import Path + +import pytest + +# Add src/ to path so we can import supervisor without installing +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) +import supervisor as supervisor_module +from supervisor import Supervisor + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_event(event_type: str, node: str = "chelsty-ha", service: str = "homeassistant", + payload: dict | None = None, message: str = "") -> dict: + return { + "id": f"evt-{node}-{int(time.time())}-{event_type}-{service}-1", + "type": event_type, + "node": node, + "service": service, + "severity": "warning", + "timestamp": int(time.time()), + "message": message or f"Test event: {event_type}", + "payload": payload or {"location_tag": "chelsty"}, + } + + +def _write_event(events_dir: Path, event: dict) -> Path: + path = events_dir / f"{event['id']}.json" + path.write_text(json.dumps(event)) + return path + + +def _setup_supervisor(tmp_path: Path, monkeypatch) -> Supervisor: + """Return a Supervisor instance with all paths redirected to tmp_path.""" + actions = tmp_path / "actions" + events = tmp_path / "events" + world = tmp_path / "world" + repo = tmp_path / "repo" + state = tmp_path / "state" + + for d in (actions, events, world, repo / "hosts", state): + d.mkdir(parents=True, exist_ok=True) + + monkeypatch.setattr(supervisor_module, "ACTIONS_DIR", actions) + monkeypatch.setattr(supervisor_module, "EVENTS_DIR", events) + monkeypatch.setattr(supervisor_module, "WORLD_DIR", world) + monkeypatch.setattr(supervisor_module, "REPO_ROOT", repo) + + sup = Supervisor() + # Empty desired/actual state so reconcile drift loop is a no-op + sup.desired_state = {"services": {}} + sup.actual_state = {"services": {}, "nodes": {}, "incidents": {}} + return sup + + +def _pending(tmp_path: Path, action_id: str) -> Path: + return tmp_path / "actions" / "pending" / f"{action_id}.json" + + +def _read_action(tmp_path: Path, state: str, action_id: str) -> dict: + return json.loads((tmp_path / "actions" / state / f"{action_id}.json").read_text()) + + +# --------------------------------------------------------------------------- +# 1. Each event type → correct action type +# --------------------------------------------------------------------------- + +def test_ha_websocket_dead_generates_container_restart(tmp_path, monkeypatch): + sup = _setup_supervisor(tmp_path, monkeypatch) + events_dir = tmp_path / "events" + _write_event(events_dir, _make_event("ha_websocket_dead")) + + sup._process_ha_events() + + action_id = "container-restart-chelsty-ha-homeassistant" + assert _pending(tmp_path, action_id).exists() + action = _read_action(tmp_path, "pending", action_id) + assert action["type"] == "container_restart" + assert action["service"] == "homeassistant" + assert action["node"] == "chelsty-ha" + + +@pytest.mark.parametrize("event_type,expected_suffix", [ + ("ha_integration_failed", "integration-failed"), + ("ha_entity_unavailable_long", "entity-unavailable"), + ("ha_automation_failing", "automation-failing"), + ("ha_update_available", "update-available"), + ("ha_recorder_lag", "recorder-lag"), + ("ha_system_health_degraded", "system-health-degraded"), +]) +def test_alert_only_events_generate_alert_actions( + tmp_path, monkeypatch, event_type, expected_suffix +): + sup = _setup_supervisor(tmp_path, monkeypatch) + _write_event(tmp_path / "events", _make_event(event_type)) + + sup._process_ha_events() + + action_id = f"alert-ha-{expected_suffix}-chelsty-ha" + assert _pending(tmp_path, action_id).exists(), f"No pending action for {event_type}" + action = _read_action(tmp_path, "pending", action_id) + assert action["type"] == "alert_only" + assert action["node"] == "chelsty-ha" + + +# --------------------------------------------------------------------------- +# 2. Transition suppression +# --------------------------------------------------------------------------- + +def test_ha_websocket_dead_suppressed_during_transition(tmp_path, monkeypatch): + sup = _setup_supervisor(tmp_path, monkeypatch) + + # Set up world state: homeassistant has an active containers_not_running incident + inc_id = "inc-123-chelsty-ha-homeassistant" + sup.actual_state["services"]["chelsty-ha/homeassistant"] = { + "node": "chelsty-ha", "service": "homeassistant", + "status": "unhealthy", "incident_id": inc_id, + } + sup.actual_state["incidents"][inc_id] = { + "id": inc_id, "status": "active", + "trigger_type": "containers_not_running", + "last_occurrence": time.time() - 60, # 1 min ago — within 5-min window + } + + _write_event(tmp_path / "events", _make_event("ha_websocket_dead")) + sup._process_ha_events() + + action_id = "container-restart-chelsty-ha-homeassistant" + assert not _pending(tmp_path, action_id).exists(), "Action should be suppressed during transition" + + +def test_ha_alert_suppressed_during_transition(tmp_path, monkeypatch): + sup = _setup_supervisor(tmp_path, monkeypatch) + + inc_id = "inc-456-chelsty-ha-homeassistant" + sup.actual_state["services"]["chelsty-ha/homeassistant"] = { + "node": "chelsty-ha", "service": "homeassistant", + "status": "unhealthy", "incident_id": inc_id, + } + sup.actual_state["incidents"][inc_id] = { + "id": inc_id, "status": "active", + "trigger_type": "containers_not_running", + "last_occurrence": time.time() - 30, + } + + for event_type in supervisor_module.HA_ALERT_ONLY_EVENTS: + _write_event(tmp_path / "events", _make_event(event_type)) + + sup._process_ha_events() + + for suffix in supervisor_module._HA_ALERT_ID_SUFFIX.values(): + action_id = f"alert-ha-{suffix}-chelsty-ha" + assert not _pending(tmp_path, action_id).exists(), \ + f"{action_id} should be suppressed" + + +def test_transition_suppression_expires_after_window(tmp_path, monkeypatch): + """After 5 min, transition window expires and events are routed normally.""" + sup = _setup_supervisor(tmp_path, monkeypatch) + + inc_id = "inc-789-chelsty-ha-homeassistant" + sup.actual_state["services"]["chelsty-ha/homeassistant"] = { + "node": "chelsty-ha", "service": "homeassistant", + "status": "unhealthy", "incident_id": inc_id, + } + sup.actual_state["incidents"][inc_id] = { + "id": inc_id, "status": "active", + "trigger_type": "containers_not_running", + "last_occurrence": time.time() - 400, # 6.7 min ago — outside window + } + + _write_event(tmp_path / "events", _make_event("ha_websocket_dead")) + sup._process_ha_events() + + action_id = "container-restart-chelsty-ha-homeassistant" + assert _pending(tmp_path, action_id).exists(), "Should not be suppressed after window" + + +# --------------------------------------------------------------------------- +# 3. Recovery cancellation +# --------------------------------------------------------------------------- + +def test_ha_websocket_recovered_cancels_pending_restart(tmp_path, monkeypatch): + sup = _setup_supervisor(tmp_path, monkeypatch) + events_dir = tmp_path / "events" + actions = tmp_path / "actions" + (actions / "cancelled").mkdir(parents=True, exist_ok=True) + + # Pre-create a pending container_restart for homeassistant + action_id = "container-restart-chelsty-ha-homeassistant" + pending_action = { + "action_id": action_id, "type": "container_restart", + "node": "chelsty-ha", "service": "homeassistant", + "status": "pending", "timestamp": time.time(), + } + _pending(tmp_path, action_id).write_text(json.dumps(pending_action)) + + _write_event(events_dir, _make_event("ha_websocket_recovered")) + sup._process_ha_events() + + assert not _pending(tmp_path, action_id).exists(), "Pending action should be cancelled" + cancelled = actions / "cancelled" / f"{action_id}.json" + assert cancelled.exists() + data = json.loads(cancelled.read_text()) + assert data["cancelled_reason"] == "ha_websocket_recovered" + + +def test_ha_websocket_recovered_no_pending_action_is_noop(tmp_path, monkeypatch): + """Recovery event when no pending restart exists must not raise.""" + sup = _setup_supervisor(tmp_path, monkeypatch) + _write_event(tmp_path / "events", _make_event("ha_websocket_recovered")) + sup._process_ha_events() # should not raise + + +# --------------------------------------------------------------------------- +# 4. Cooldown +# --------------------------------------------------------------------------- + +def test_ha_websocket_dead_cooldown_prevents_second_restart(tmp_path, monkeypatch): + """Two ha_websocket_dead events within 30 min → only one container_restart.""" + sup = _setup_supervisor(tmp_path, monkeypatch) + events_dir = tmp_path / "events" + actions = tmp_path / "actions" + (actions / "completed").mkdir(parents=True, exist_ok=True) + + # First event → action generated + _write_event(events_dir, _make_event("ha_websocket_dead", service="homeassistant")) + sup._process_ha_events() + + action_id = "container-restart-chelsty-ha-homeassistant" + assert _pending(tmp_path, action_id).exists() + + # Simulate: action completed recently (< 30 min ago) + action_data = json.loads(_pending(tmp_path, action_id).read_text()) + action_data["status"] = "completed" + action_data["finished_at"] = time.time() - 60 # 1 min ago + (actions / "completed" / f"{action_id}.json").write_text(json.dumps(action_data)) + _pending(tmp_path, action_id).unlink() + + # Second event — should be suppressed by cooldown + event2 = _make_event("ha_websocket_dead", service="homeassistant") + event2["id"] = event2["id"] + "-2" # different event ID + _write_event(events_dir, event2) + sup._process_ha_events() + + assert not _pending(tmp_path, action_id).exists(), "Second restart within cooldown should be suppressed" + + +def test_ha_websocket_dead_cooldown_expires(tmp_path, monkeypatch): + """After cooldown expires, a new ha_websocket_dead should generate an action.""" + sup = _setup_supervisor(tmp_path, monkeypatch) + events_dir = tmp_path / "events" + actions = tmp_path / "actions" + (actions / "completed").mkdir(parents=True, exist_ok=True) + + action_id = "container-restart-chelsty-ha-homeassistant" + # Pre-populate completed action with timestamp > 30 min ago + old_action = { + "action_id": action_id, "type": "container_restart", + "status": "completed", "finished_at": time.time() - 3700, # > 30 min + } + (actions / "completed" / f"{action_id}.json").write_text(json.dumps(old_action)) + + _write_event(events_dir, _make_event("ha_websocket_dead")) + sup._process_ha_events() + + assert _pending(tmp_path, action_id).exists(), "Should generate new restart after cooldown" + + +# --------------------------------------------------------------------------- +# 5. Location tag preserved +# --------------------------------------------------------------------------- + +def test_location_tag_preserved_in_container_restart_payload(tmp_path, monkeypatch): + sup = _setup_supervisor(tmp_path, monkeypatch) + _write_event(tmp_path / "events", + _make_event("ha_websocket_dead", payload={"location_tag": "chelsty", "extra": "data"})) + + sup._process_ha_events() + + action = _read_action(tmp_path, "pending", "container-restart-chelsty-ha-homeassistant") + assert action["payload"]["location_tag"] == "chelsty" + + +def test_location_tag_preserved_in_alert_only_payload(tmp_path, monkeypatch): + sup = _setup_supervisor(tmp_path, monkeypatch) + _write_event(tmp_path / "events", + _make_event("ha_entity_unavailable_long", + payload={"location_tag": "ken", "count": 3})) + + sup._process_ha_events() + + action = _read_action(tmp_path, "pending", "alert-ha-entity-unavailable-chelsty-ha") + assert action["payload"]["location_tag"] == "ken" + + +# --------------------------------------------------------------------------- +# 6. Dedup — same alert type twice → only one pending action +# --------------------------------------------------------------------------- + +def test_alert_only_dedup_second_event_skipped(tmp_path, monkeypatch): + sup = _setup_supervisor(tmp_path, monkeypatch) + events_dir = tmp_path / "events" + + event1 = _make_event("ha_entity_unavailable_long") + event2 = _make_event("ha_entity_unavailable_long") + event2["id"] = event2["id"] + "-2" + _write_event(events_dir, event1) + _write_event(events_dir, event2) + + sup._process_ha_events() + + action_id = "alert-ha-entity-unavailable-chelsty-ha" + assert _pending(tmp_path, action_id).exists() + # Only one file — not duplicated + pending_files = list((tmp_path / "actions" / "pending").glob("alert-ha-entity-unavailable*.json")) + assert len(pending_files) == 1 + + +# --------------------------------------------------------------------------- +# 7. Non-HA events are ignored +# --------------------------------------------------------------------------- + +def test_non_ha_events_not_routed(tmp_path, monkeypatch): + sup = _setup_supervisor(tmp_path, monkeypatch) + events_dir = tmp_path / "events" + + for etype in ("service_unhealthy", "containers_not_running", "node_online", "deployment_failed"): + e = _make_event(etype, service="mosquitto") + e["type"] = etype + _write_event(events_dir, e) + + sup._process_ha_events() + + pending_files = list((tmp_path / "actions" / "pending").glob("*.json")) + assert pending_files == [], "Non-HA events should not generate actions via HA path"