homelab-codex-ws/services/ha-diag-agent/tests/integration/test_system_health_integration.py

152 lines
4.9 KiB
Python
Raw Permalink Normal View History

feat(ha-diag-agent): three REST diagnostic checks + Phase 3 flag fixes New checks: - SystemHealthCheck (15min interval): detects newly-failing HA integrations via /api/system_health snapshot diff; transition-based dedup (ok→error fires, sustained error silent, error→ok clears alert) - UpdatesAvailableCheck (daily cron 09:00): per-update ha_update_available events with 7-day dedup; release notes truncated at 2000 chars - UpdatesDigestCheck (Sunday cron 09:00): single digest event with all pending updates; weekly ISO-week dedup, independent of daily dedup key - AutomationFailuresCheck (30min interval): detects automations with N consecutive failures (default 3) via /api/trace/automation/<id>; 6h cooldown per automation Phase 3 flag fixes: - Flag #1 (since field): UnavailableEntitiesCheck now uses min(state.last_changed, baseline.first_seen) as effective "since", giving accurate duration when agent was offline at entity's first fail - Flag #3 (registry cache): HAClient.get_entity_registry() caches response in-process with configurable TTL (default 300s); avoids repeated API calls across concurrent check cycles; invalidate_registry_cache() for manual invalidation Storage: system_health_snapshot table (component, last_status, last_seen_at, payload) created automatically on next Storage.open() call Config additions (all with defaults): entity_registry_cache_ttl=300, system_health_check_interval=900, automation_check_interval=1800, automation_failure_threshold=3, updates_check_hour=9, updates_check_minute=0, updates_cooldown_days=7 Tests: 95 unit tests pass (49 new), 13 integration tests pass (9 new); 3 skipped (live-HA token not set in CI) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 14:43:10 +02:00
"""Integration tests for SystemHealthCheck using aioresponses.
Uses real aiosqlite Storage + EventEmitter + mocked HTTP.
Marked 'integration' because it exercises the full stack end-to-end.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import AsyncGenerator
import pytest
import pytest_asyncio
from aioresponses import aioresponses
from ha_diag.checks.system_health import SystemHealthCheck
from ha_diag.config import Settings
from ha_diag.event_emitter import EventEmitter
from ha_diag.ha_client import HAClient, make_session
from ha_diag.models import HAEventType
from ha_diag.storage import Storage
# Base URL of the Home Assistant instance under test. The tests in this module
# pass it to HAClient and prefix it to the mocked `/api/system_health` route.
HA_URL = "http://ha-test-ken:8123"
def _settings(**overrides) -> Settings:
    """Build a ``Settings`` instance for these tests.

    A fixed baseline of keyword values is used; any keyword passed in
    ``overrides`` replaces the baseline entry of the same name (or adds a new
    one) before the merged mapping is handed to ``Settings``.
    """
    baseline: dict = {
        "ha_url": HA_URL,
        "ha_token": "test-token",
        "node_name": "piha",
        "location_tag": "ken",
        "alert_cooldown_hours": 0.0,
        "check_interval": 60,
        "check_interval_unavailable": 3600,
    }
    # Later entries win, so caller-supplied overrides take precedence.
    merged = {**baseline, **overrides}
    return Settings(**merged)
@pytest_asyncio.fixture
async def storage(tmp_path: Path) -> AsyncGenerator[Storage, None]:
    """Yield an opened ``Storage`` backed by a file under ``tmp_path``.

    The store is opened before the test body runs and closed once the
    fixture is resumed for teardown.
    """
    db_path = tmp_path / "integration_test.db"
    store = Storage(db_path)
    await store.open()
    yield store
    await store.close()
@pytest.fixture
def events_dir(tmp_path: Path) -> Path:
    """Create an ``events`` directory inside ``tmp_path`` and return its path."""
    target = tmp_path.joinpath("events")
    target.mkdir()
    return target
@pytest.mark.integration
async def test_system_health_ok_components_no_event(
    storage: Storage, events_dir: Path
):
    """All components healthy on first run → no events emitted.

    A single check cycle is run against a mocked ``/api/system_health``
    response in which every component reports ``"type": "result"``; the check
    must return an empty list and no ``*.json`` file may exist in
    ``events_dir``.
    """
    healthy_payload = {
        "homeassistant": {"type": "result", "data": {"version": "2025.5.0"}},
        "recorder": {"type": "result", "data": {"backlog": 0}},
    }
    # NOTE(review): the emitter is constructed but never handed to the check
    # or called in this test; kept as-is in case its constructor matters.
    emitter = EventEmitter(events_dir, node_name="piha", location_tag="ken")

    with aioresponses() as mocked:
        mocked.get(f"{HA_URL}/api/system_health", payload=healthy_payload)
        async with make_session("test-token") as session:
            results = await SystemHealthCheck(
                HAClient(HA_URL, session), storage, _settings()
            ).run()

    assert results == []
    assert list(events_dir.glob("*.json")) == []
@pytest.mark.integration
async def test_system_health_degraded_emits_event_and_writes_file(
    storage: Storage, events_dir: Path
):
    """Component degrades: event emitted + file written with correct structure.

    Two check cycles share the same ``storage``: the first sees ``cloud``
    reporting a result, the second sees it reporting an error. The single
    result of the second cycle is then passed to an ``EventEmitter`` and the
    JSON file found in ``events_dir`` is inspected.
    """
    health_url = f"{HA_URL}/api/system_health"
    baseline_health = {"cloud": {"type": "result", "data": {}}}
    degraded_health = {"cloud": {"type": "error", "error": "Cloud connection lost"}}
    emitter = EventEmitter(events_dir, node_name="piha", location_tag="ken")

    # Cycle 1: everything ok; the return value is intentionally ignored.
    with aioresponses() as mocked:
        mocked.get(health_url, payload=baseline_health)
        async with make_session("test-token") as session:
            await SystemHealthCheck(
                HAClient(HA_URL, session), storage, _settings()
            ).run()

    # Cycle 2: the cloud component now reports an error.
    with aioresponses() as mocked:
        mocked.get(health_url, payload=degraded_health)
        async with make_session("test-token") as session:
            results = await SystemHealthCheck(
                HAClient(HA_URL, session), storage, _settings()
            ).run()

    assert len(results) == 1
    degraded = results[0]
    assert degraded.event_type == HAEventType.ha_system_health_degraded

    emitter.emit(
        event_type=degraded.event_type,
        severity=degraded.severity.value,
        service="homeassistant",
        message=degraded.message,
        payload=degraded.payload,
    )

    written = list(events_dir.glob("*.json"))
    assert len(written) == 1

    record = json.loads(written[0].read_text())
    assert record["type"] == "ha_system_health_degraded"
    assert record["payload"]["component"] == "cloud"
    assert record["payload"]["location_tag"] == "ken"
@pytest.mark.integration
async def test_system_health_recovery_and_re_degradation(storage: Storage):
    """Full ok→error→ok→error cycle: events fire on degradation, not on recovery.

    Five check cycles are run against the same ``storage``. Only the two
    cycles where ``cloud`` changes from a result to an error are expected to
    return a result; the baseline, the repeated error and the recovery are
    expected to return an empty list.
    """
    settings = _settings()

    async def run_once(health):
        """Run one check cycle with ``health`` as the mocked API response."""
        with aioresponses() as m:
            m.get(f"{HA_URL}/api/system_health", payload=health)
            async with make_session("test-token") as session:
                return await SystemHealthCheck(
                    HAClient(HA_URL, session), storage, settings
                ).run()

    ok_h = {"cloud": {"type": "result", "data": {}}}
    err_h = {"cloud": {"type": "error", "error": "timeout"}}

    r1 = await run_once(ok_h)   # baseline ok
    r2 = await run_once(err_h)  # first degradation
    r3 = await run_once(err_h)  # sustained error (no dup)
    r4 = await run_once(ok_h)   # recovery
    r5 = await run_once(err_h)  # second degradation

    assert r1 == []
    assert len(r2) == 1
    assert r3 == []
    assert r4 == []
    assert len(r5) == 1