homelab-codex-ws/services/ha-diag-agent/src/ha_diag/main.py
Oskar Kapala 20f6761a67 feat(ha-diag-agent): UnavailableEntitiesCheck with root cause dedup
- shared aiohttp ClientSession in HAClient (Phase 1 Flag #2 fixed):
  make_session() factory, session injected at startup, closed on shutdown
- Check.run() → list[CheckResult]: clean multi-event interface
- first real diagnostic check: entity unavailable > 24h
  (INSERT OR IGNORE baseline preserves first-seen timestamp)
- root cause grouping: emit ha_integration_failed instead of N entity
  events when ≥50% of integration's entities are unavailable (≥3 min)
- alert deduplication via SQLite cooldown window (default 6h)
- recovery clears baseline + dedup for immediate re-alert
- configurable thresholds: duration, integration %, cooldown
- 38 unit tests + 7 integration tests (42 pass, 3 skip w/o live HA)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 13:41:55 +02:00

151 lines
4.4 KiB
Python

from __future__ import annotations
import asyncio
import json
import logging
import time
from datetime import datetime
import structlog
import uvicorn
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from .api import app, register_checks
from .checks.heartbeat import HeartbeatCheck
from .checks.unavailable_entities import UnavailableEntitiesCheck
from .config import Settings
from .event_emitter import EventEmitter
from .ha_client import HAClient, make_session
from .storage import Storage
# Module-level structlog logger used by the functions in this file.
_log = structlog.get_logger()
def _configure_structlog(log_level: str) -> None:
    """Configure structlog's JSON pipeline and the stdlib root log level.

    Args:
        log_level: Level name, upper-cased and looked up as an attribute of
            the ``logging`` module; when no such attribute exists the stdlib
            level falls back to ``logging.INFO``.
    """
    # Processor order matters: the JSON renderer must come last.
    pipeline = [
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.JSONRenderer(),
    ]
    structlog.configure(
        processors=pipeline,
        logger_factory=structlog.PrintLoggerFactory(),
    )

    stdlib_level = getattr(logging, log_level.upper(), logging.INFO)
    logging.basicConfig(level=stdlib_level)
async def _run_check_and_emit(
    check, emitter: EventEmitter, storage: Storage
) -> None:
    """Run a check, record it to check_history, and emit an event per issue.

    A result counts as an issue when its ``event_type`` is truthy. The run is
    recorded once (with all results serialized into the payload), then every
    issue is forwarded to ``emitter`` and logged as ``check_unhealthy``.

    Any exception raised while running, recording or emitting is logged as
    ``check_error`` and swallowed, so it never propagates to the caller.

    Args:
        check: Object exposing ``name`` and an awaitable ``run()`` that
            returns a list of results.
        emitter: Receives one ``emit(...)`` call per issue.
        storage: Receives one ``record_check(...)`` call per run.
    """
    try:
        results = await check.run()

        # Only results that carry an event type are problems; a check may
        # also return purely informational results.
        issues = [r for r in results if r.event_type]
        healthy = not issues
        summary = f"{len(issues)} issue(s)" if issues else "ok"

        await storage.record_check(
            check_name=check.name,
            ran_at=time.time(),
            healthy=healthy,
            message=summary,
            payload=json.dumps([r.model_dump() for r in results]),
        )

        for issue in issues:
            emitter.emit(
                event_type=issue.event_type,
                severity=issue.severity.value,
                service="homeassistant",
                message=issue.message,
                payload=issue.payload,
            )
            # structlog binds the first positional argument to ``event``, so
            # the event type must use a different key; passing ``event=`` as
            # well raises TypeError and aborts the remaining emits.
            _log.warning(
                "check_unhealthy",
                check=check.name,
                event_type=issue.event_type,
                msg=issue.message,
            )

        if healthy:
            _log.info("check_ok", check=check.name)
    except Exception as exc:
        # Top-level boundary for a scheduled job: log and keep the agent alive.
        _log.error("check_error", check=check.name, error=str(exc), exc_info=True)
async def run(settings: Settings) -> None:
    """Start the agent and block until the HTTP server stops.

    Configures logging, opens storage, creates the shared HA session and the
    checks, schedules every check on its own interval, then serves the API
    with uvicorn. Each resource is released in reverse order of acquisition,
    including when startup fails part-way or an earlier cleanup step raises.

    Args:
        settings: Agent configuration (HA connection, intervals, paths, port).
    """
    _configure_structlog(settings.log_level)
    _log.info(
        "ha_diag_agent_starting",
        node=settings.node_name,
        location=settings.location_tag,
        ha_url=settings.ha_url,
        heartbeat_interval=settings.check_interval,
        unavailable_interval=settings.check_interval_unavailable,
    )

    storage = Storage(settings.data_dir / "ha_diag.db")
    await storage.open()
    try:
        emitter = EventEmitter(
            settings.events_dir, settings.node_name, settings.location_tag
        )

        # Shared session — created once at startup, closed on shutdown
        session = make_session(settings.ha_token, settings.ha_timeout)
        try:
            ha_client = HAClient(settings.ha_url, session)

            heartbeat = HeartbeatCheck(ha_client)
            unavailable = UnavailableEntitiesCheck(ha_client, storage, settings)
            all_checks = [heartbeat, unavailable]
            register_checks(all_checks, settings.node_name, settings.location_tag)

            scheduler = AsyncIOScheduler()
            schedule = (
                ("check_heartbeat", heartbeat, settings.check_interval),
                (
                    "check_unavailable_entities",
                    unavailable,
                    settings.check_interval_unavailable,
                ),
            )
            for job_id, check, interval in schedule:
                scheduler.add_job(
                    _run_check_and_emit,
                    "interval",
                    seconds=interval,
                    args=[check, emitter, storage],
                    id=job_id,
                    # Due immediately instead of after the first interval.
                    next_run_time=datetime.now(),
                )
            scheduler.start()
            # Shutdown is only attempted once the scheduler actually started.
            try:
                _log.info(
                    "scheduler_started",
                    checks=[c.name for c in all_checks],
                    heartbeat_interval=settings.check_interval,
                    unavailable_interval=settings.check_interval_unavailable,
                )

                config = uvicorn.Config(
                    app,
                    host="0.0.0.0",
                    port=settings.port,
                    log_level=settings.log_level.lower(),
                )
                server = uvicorn.Server(config)
                await server.serve()
            finally:
                scheduler.shutdown(wait=False)
        finally:
            await session.close()
    finally:
        await storage.close()
def main() -> None:
    """Load the settings and drive the async ``run`` entry point to completion."""
    asyncio.run(run(Settings.load()))


# Start the agent only when executed as a script, not on import.
if __name__ == "__main__":
    main()