- Shared aiohttp ClientSession in HAClient (Phase 1 Flag #2 fixed): make_session() factory; session injected at startup and closed on shutdown.
- Check.run() → list[CheckResult]: clean multi-event interface.
- First real diagnostic check: entity unavailable > 24h (INSERT OR IGNORE baseline preserves the first-seen timestamp).
- Root-cause grouping: emit ha_integration_failed instead of N entity events when ≥50% of an integration's entities are unavailable (≥3 min).
- Alert deduplication via SQLite cooldown window (default 6h).
- Recovery clears baseline + dedup state for immediate re-alert.
- Configurable thresholds: duration, integration %, cooldown.
- 38 unit tests + 7 integration tests (42 pass, 3 skip w/o live HA).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
151 lines
4.4 KiB
Python
151 lines
4.4 KiB
Python
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import time
|
|
from datetime import datetime
|
|
|
|
import structlog
|
|
import uvicorn
|
|
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
|
|
|
from .api import app, register_checks
|
|
from .checks.heartbeat import HeartbeatCheck
|
|
from .checks.unavailable_entities import UnavailableEntitiesCheck
|
|
from .config import Settings
|
|
from .event_emitter import EventEmitter
|
|
from .ha_client import HAClient, make_session
|
|
from .storage import Storage
|
|
|
|
# Module-level structlog logger used by the check runner and by run() below.
_log = structlog.get_logger()
|
|
|
|
|
|
def _configure_structlog(log_level: str) -> None:
    """Set up structlog JSON output and the stdlib logging level.

    structlog is configured with a fixed processor chain that ends in a JSON
    renderer, and with a print-based logger factory. ``log_level`` is passed
    only to ``logging.basicConfig``; a name that is not an attribute of the
    ``logging`` module falls back to ``logging.INFO``.

    Args:
        log_level: Level name such as ``"info"`` or ``"DEBUG"``; it is
            upper-cased before being looked up on the ``logging`` module.
    """
    processor_chain = [
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.JSONRenderer(),
    ]
    structlog.configure(
        processors=processor_chain,
        logger_factory=structlog.PrintLoggerFactory(),
    )

    # NOTE(review): the level is applied to stdlib logging only; the structlog
    # configuration above does not receive it.
    stdlib_level = getattr(logging, log_level.upper(), logging.INFO)
    logging.basicConfig(level=stdlib_level)
|
|
|
|
|
|
async def _run_check_and_emit(
    check, emitter: EventEmitter, storage: Storage
) -> None:
    """Run one check, record the outcome, and emit an event per problem.

    The results are first written to storage through ``record_check``. Then
    every result that carries an ``event_type`` is passed to ``emitter.emit``
    and logged as ``check_unhealthy``. A run in which no result carries an
    ``event_type`` is treated as healthy and logged as ``check_ok``.

    Args:
        check: Object exposing ``name`` and an awaitable ``run()`` that
            returns a list of results. Each result provides ``event_type``,
            ``severity``, ``message``, ``payload`` and ``model_dump()``.
        emitter: Receives one ``emit`` call per problem result.
        storage: Receives one ``record_check`` call per run.

    Any exception raised while running, recording or emitting is logged as
    ``check_error`` and swallowed, so it never propagates to the caller.
    """
    try:
        results = await check.run()

        # Only results carrying an event_type are problems. Counting them
        # (rather than every result) keeps the stored summary consistent with
        # the ``healthy`` flag when a check also returns informational results.
        issues = [r for r in results if r.event_type]
        healthy = not issues
        summary = "ok" if healthy else f"{len(issues)} issue(s)"

        await storage.record_check(
            check_name=check.name,
            ran_at=time.time(),
            healthy=healthy,
            message=summary,
            payload=json.dumps([r.model_dump() for r in results]),
        )

        for issue in issues:
            emitter.emit(
                event_type=issue.event_type,
                severity=issue.severity.value,
                service="homeassistant",
                message=issue.message,
                payload=issue.payload,
            )
            # structlog's log methods take the message as their ``event``
            # parameter, so passing ``event=`` as a keyword as well raises
            # TypeError. The event type is logged under ``event_type``.
            _log.warning(
                "check_unhealthy",
                check=check.name,
                event_type=issue.event_type,
                msg=issue.message,
            )

        if healthy:
            _log.info("check_ok", check=check.name)

    except Exception as exc:
        # Top-level boundary for a scheduled job: log with traceback and carry
        # on so one failing run does not break later runs.
        _log.error("check_error", check=check.name, error=str(exc), exc_info=True)
|
|
|
|
|
|
async def run(settings: Settings) -> None:
    """Start the agent: storage, HA client, scheduled checks and the HTTP API.

    Startup order: configure logging, open storage, create the event emitter
    and the shared HTTP session, register the checks with the API, schedule
    each check as an interval job whose first run is due immediately, then
    serve the API with uvicorn until ``server.serve()`` returns.

    Shutdown order: stop the scheduler (without waiting for running jobs),
    close storage, close the HTTP session. Cleanup also runs when startup
    fails part-way, and the session is closed even if closing storage raises.

    Args:
        settings: Loaded configuration. This function reads ``log_level``,
            ``node_name``, ``location_tag``, ``ha_url``, ``ha_token``,
            ``ha_timeout``, ``check_interval``,
            ``check_interval_unavailable``, ``data_dir``, ``events_dir`` and
            ``port``.
    """
    _configure_structlog(settings.log_level)
    _log.info(
        "ha_diag_agent_starting",
        node=settings.node_name,
        location=settings.location_tag,
        ha_url=settings.ha_url,
        heartbeat_interval=settings.check_interval,
        unavailable_interval=settings.check_interval_unavailable,
    )

    storage = Storage(settings.data_dir / "ha_diag.db")
    await storage.open()

    session = None
    try:
        emitter = EventEmitter(
            settings.events_dir, settings.node_name, settings.location_tag
        )

        # Shared session: created once at startup, closed on shutdown.
        session = make_session(settings.ha_token, settings.ha_timeout)
        ha_client = HAClient(settings.ha_url, session)

        heartbeat = HeartbeatCheck(ha_client)
        unavailable = UnavailableEntitiesCheck(ha_client, storage, settings)

        all_checks = [heartbeat, unavailable]
        register_checks(all_checks, settings.node_name, settings.location_tag)

        scheduler = AsyncIOScheduler()
        scheduled = (
            (heartbeat, settings.check_interval, "check_heartbeat"),
            (
                unavailable,
                settings.check_interval_unavailable,
                "check_unavailable_entities",
            ),
        )
        for check, interval, job_id in scheduled:
            scheduler.add_job(
                _run_check_and_emit,
                "interval",
                seconds=interval,
                args=[check, emitter, storage],
                id=job_id,
                # Due right away instead of after the first full interval.
                next_run_time=datetime.now(),
            )
        scheduler.start()

        # From here on the scheduler is running, so it must be shut down on
        # every exit path. It is not shut down earlier because it was never
        # started.
        try:
            _log.info(
                "scheduler_started",
                checks=[c.name for c in all_checks],
                heartbeat_interval=settings.check_interval,
                unavailable_interval=settings.check_interval_unavailable,
            )

            config = uvicorn.Config(
                app,
                host="0.0.0.0",
                port=settings.port,
                log_level=settings.log_level.lower(),
            )
            server = uvicorn.Server(config)
            await server.serve()
        finally:
            scheduler.shutdown(wait=False)
    finally:
        # Same order as before (storage, then session), but a failure closing
        # storage no longer leaks the HTTP session.
        try:
            await storage.close()
        finally:
            if session is not None:
                await session.close()
|
|
|
|
|
|
def main() -> None:
    """Load settings and drive ``run`` to completion with ``asyncio.run``."""
    asyncio.run(run(Settings.load()))


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()
|