diff --git a/hosts/chelsty-infra/services.yaml b/hosts/chelsty-infra/services.yaml index 5049e79..0bdc4d7 100644 --- a/hosts/chelsty-infra/services.yaml +++ b/hosts/chelsty-infra/services.yaml @@ -2,6 +2,22 @@ host: chelsty-infra site: chelsty services: + ha-diag-agent: + role: ha-diagnostic-agent + deployment_model: docker-compose + exposure: local-only + offline_required: false + depends_on: + local: [] + external: [homeassistant] + config: + target_url: http://localhost:8123 + location_tag: "chelsty" + events_dir: /opt/homelab/events/chelsty-infra + runtime: + config_path: /opt/homelab/config/ha-diag-agent + data_path: /var/lib/ha-diag-agent + node-agent: role: node-stability-monitor # LTE node: node-agent monitors and emits events but does NO Docker cleanup. diff --git a/hosts/piha/services.yaml b/hosts/piha/services.yaml index 6676f97..03fea12 100644 --- a/hosts/piha/services.yaml +++ b/hosts/piha/services.yaml @@ -1,6 +1,22 @@ host: piha services: + ha-diag-agent: + role: ha-diagnostic-agent + deployment_model: docker-compose + exposure: local-only + offline_required: false + depends_on: + local: [] + external: [homeassistant] + config: + target_url: http://localhost:8123 + location_tag: "ken" + events_dir: /opt/homelab/events/piha + runtime: + config_path: /opt/homelab/config/ha-diag-agent + data_path: /var/lib/ha-diag-agent + node-agent: role: node-stability-monitor deployment_model: docker-compose diff --git a/services/ha-diag-agent/Dockerfile b/services/ha-diag-agent/Dockerfile new file mode 100644 index 0000000..3fd0fc8 --- /dev/null +++ b/services/ha-diag-agent/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.11-slim + +WORKDIR /app + +COPY pyproject.toml . +RUN mkdir -p src/ha_diag && touch src/ha_diag/__init__.py && \ + pip install --no-cache-dir -e . + +COPY src/ src/ + +ENV PYTHONUNBUFFERED=1 + +CMD ["python", "-m", "ha_diag.main"] diff --git a/services/ha-diag-agent/README.md b/services/ha-diag-agent/README.md new file mode 100644 index 0000000..3cd391f --- /dev/null +++ b/services/ha-diag-agent/README.md @@ -0,0 +1,91 @@ +# ha-diag-agent + +Per-host Home Assistant diagnostic agent. Polls HA REST API on a schedule, +emits structured events to `/opt/homelab/events//`, and exposes an +HTTP API for health checks and manual check triggers. + +Follows the same event-pipeline pattern as `node-agent`: filesystem-first, +no direct supervisor integration, events processed by the VPS observer. + +## Architecture + +``` +APScheduler (every CHECK_INTERVAL s) + └─ HeartbeatCheck → pings /api/, emits ha_websocket_dead on failure + [Phase 3: EntityUnavailableCheck, SystemHealthCheck, UpdateCheck, ...] + +FastAPI (port 8087) + GET /health → liveness probe + POST /trigger/ → run a named check on demand + +SQLite (/data/ha_diag.db) + entity_baseline → last-known entity states + check_history → per-check run log + alerts_sent → dedup gate for alert events +``` + +## Event Types + +| Type | Severity | Trigger | +|------|----------|---------| +| `ha_websocket_dead` | error | HA /api/ unreachable | +| `ha_integration_failed` | error | Integration in error state | +| `ha_entity_unavailable_long` | warning | Entity unavailable > threshold | +| `ha_automation_failing` | warning | Automation last run errored | +| `ha_update_available` | info | HA or integration update pending | +| `ha_recorder_lag` | warning | Recorder write lag > threshold | +| `ha_system_health_degraded` | warning | System health check failed | + +Event routing in supervisor (Phase 5) maps these to `notify` actions. + +## Deployment + +```bash +# 1. Create config on target node +ssh oskar@ +mkdir -p /opt/homelab/config/ha-diag-agent /var/lib/ha-diag-agent +cat > /opt/homelab/config/ha-diag-agent/.env << 'EOF' +HA_URL=http://homeassistant.local:8123 +HA_TOKEN= +NODE_NAME=piha +LOCATION_TAG=ken +CHECK_INTERVAL=60 +EOF + +# 2. Deploy +scripts/deploy/deploy.sh --service ha-diag-agent + +# 3. Verify +docker ps --filter name=ha-diag-agent +curl http://localhost:8087/health +``` + +### chelsty-infra note + +`chelsty-infra` runs docker-compose v1 (1.29.2). Use `docker-compose` (hyphenated): +```bash +docker-compose -f docker-compose.yml up -d --build +``` + +### HA long-lived token + +In HA UI: Profile → Long-Lived Access Tokens → Create token. + +## Running Tests + +```bash +cd services/ha-diag-agent +pip install -e ".[dev]" +pytest tests/ -v +``` + +## Optional YAML config + +Place `/opt/homelab/config/ha-diag-agent/ha-diag-agent.yaml` on the node. +Values there are defaults; env vars take priority. + +```yaml +ha_url: http://homeassistant.local:8123 +location_tag: ken +check_interval: 60 +``` diff --git a/services/ha-diag-agent/docker-compose.yml b/services/ha-diag-agent/docker-compose.yml new file mode 100644 index 0000000..dcba387 --- /dev/null +++ b/services/ha-diag-agent/docker-compose.yml @@ -0,0 +1,30 @@ +services: + ha-diag-agent: + build: . + container_name: ha-diag-agent + restart: unless-stopped + + env_file: + - /opt/homelab/config/ha-diag-agent/.env + + ports: + - "8087:8087" + + volumes: + # Events dir: host path includes node name; inside container always /events + - /opt/homelab/events/${NODE_NAME:-ha-diag}:/events + # SQLite baseline cache and check history + - /var/lib/ha-diag-agent:/data + # Optional YAML config (read-only) + - /opt/homelab/config/ha-diag-agent:/config:ro + + healthcheck: + test: + - "CMD" + - "python" + - "-c" + - "import urllib.request; urllib.request.urlopen('http://localhost:8087/health', timeout=5)" + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s diff --git a/services/ha-diag-agent/env.example b/services/ha-diag-agent/env.example new file mode 100644 index 0000000..cf09776 --- /dev/null +++ b/services/ha-diag-agent/env.example @@ -0,0 +1,19 @@ +# ha-diag-agent environment variables +# Copy to /opt/homelab/config/ha-diag-agent/.env on the target node + +# Home Assistant connection (required) +HA_URL=http://homeassistant.local:8123 +HA_TOKEN=your-long-lived-token-here + +# Node identity +NODE_NAME=piha +LOCATION_TAG=ken + +# Timing +CHECK_INTERVAL=60 + +# API server +PORT=8087 + +# Logging: debug, info, warning, error +LOG_LEVEL=info diff --git a/services/ha-diag-agent/healthcheck.sh b/services/ha-diag-agent/healthcheck.sh new file mode 100755 index 0000000..e3be09d --- /dev/null +++ b/services/ha-diag-agent/healthcheck.sh @@ -0,0 +1,12 @@ +#!/bin/sh +# Healthcheck: probe the FastAPI /health endpoint +set -e +PORT="${PORT:-8087}" +python -c " +import urllib.request, sys +try: + r = urllib.request.urlopen('http://localhost:${PORT}/health', timeout=5) + sys.exit(0 if r.status == 200 else 1) +except Exception: + sys.exit(1) +" diff --git a/services/ha-diag-agent/pyproject.toml b/services/ha-diag-agent/pyproject.toml new file mode 100644 index 0000000..92836da --- /dev/null +++ b/services/ha-diag-agent/pyproject.toml @@ -0,0 +1,33 @@ +[build-system] +requires = ["setuptools>=68"] +build-backend = "setuptools.build_meta" + +[project] +name = "ha-diag-agent" +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "aiohttp>=3.9", + "fastapi>=0.110", + "uvicorn[standard]>=0.29", + "pydantic>=2.6", + "pydantic-settings>=2.2", + "apscheduler>=3.10", + "aiosqlite>=0.20", + "structlog>=24.1", + "pyyaml>=6.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.1", + "pytest-asyncio>=0.23", + "aioresponses>=0.7", +] + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +asyncio_mode = "auto" +testpaths = ["tests"] diff --git a/services/ha-diag-agent/service.yaml b/services/ha-diag-agent/service.yaml new file mode 100644 index 0000000..4da8ffe --- /dev/null +++ b/services/ha-diag-agent/service.yaml @@ -0,0 +1,37 @@ +service: + name: ha-diag-agent + # Deployed per-host: piha (site: ken) and chelsty-infra (site: chelsty) + owner_node: per-host + exposure: local-only + monitor: true + + dependencies: + - homeassistant + + ports: + - 8087 + + healthcheck: + type: http + path: /health + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + + restart_policy: unless-stopped + + persistence: + paths: + - /opt/homelab/events + - /var/lib/ha-diag-agent + + runtime: + env_vars: + - HA_TOKEN # long-lived HA access token (required) + - HA_URL # http://homeassistant.local:8123 + - NODE_NAME # canonical node name: piha, chelsty-infra, ... + - LOCATION_TAG # human site label: ken, chelsty, ... + - CHECK_INTERVAL # seconds between check cycles (default: 60) + - PORT # FastAPI port (default: 8087) + - LOG_LEVEL # default: info diff --git a/services/ha-diag-agent/src/ha_diag/__init__.py b/services/ha-diag-agent/src/ha_diag/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/services/ha-diag-agent/src/ha_diag/api.py b/services/ha-diag-agent/src/ha_diag/api.py new file mode 100644 index 0000000..5e7cda4 --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag/api.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from fastapi import FastAPI, HTTPException + +if TYPE_CHECKING: + from .checks.base import Check + +app = FastAPI(title="ha-diag-agent", version="0.1.0") + +# Populated by main.py during startup +_checks: dict[str, "Check"] = {} +_node_name: str = "unknown" +_location_tag: str = "default" + + +def register_checks(checks: list["Check"], node_name: str, location_tag: str) -> None: + global _node_name, _location_tag + _checks.update({c.name: c for c in checks}) + _node_name = node_name + _location_tag = location_tag + + +@app.get("/health") +async def health() -> dict: + return { + "status": "ok", + "node": _node_name, + "location_tag": _location_tag, + "checks": list(_checks.keys()), + } + + +@app.post("/trigger/{check_name}") +async def trigger(check_name: str) -> dict: + check = _checks.get(check_name) + if check is None: + raise HTTPException(status_code=404, detail=f"Unknown check: {check_name!r}") + result = await check.run() + return { + "check": check_name, + "healthy": result.healthy, + "event_type": result.event_type, + "severity": result.severity, + "message": result.message, + "payload": result.payload, + } diff --git a/services/ha-diag-agent/src/ha_diag/checks/__init__.py b/services/ha-diag-agent/src/ha_diag/checks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/services/ha-diag-agent/src/ha_diag/checks/base.py b/services/ha-diag-agent/src/ha_diag/checks/base.py new file mode 100644 index 0000000..37a5c06 --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag/checks/base.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from ..models import CheckResult + + +class Check(ABC): + """Base class for all HA diagnostic checks.""" + + name: str # unique slug used in /trigger/ and check_history + + @abstractmethod + async def run(self) -> CheckResult: + """Execute the check and return a result. + + The caller is responsible for emitting events when result.event_type is set. + """ diff --git a/services/ha-diag-agent/src/ha_diag/checks/heartbeat.py b/services/ha-diag-agent/src/ha_diag/checks/heartbeat.py new file mode 100644 index 0000000..4aa6212 --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag/checks/heartbeat.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from ..ha_client import HAClient +from ..models import CheckResult, HAEventType, Severity +from .base import Check + + +class HeartbeatCheck(Check): + """Pings HA /api/ to verify the API is reachable. + + Validates the end-to-end pipeline: HA client → check result → event emitter. + Real diagnostic checks (entity availability, system health, etc.) come in Phase 3. + """ + + name = "heartbeat" + + def __init__(self, ha_client: HAClient) -> None: + self._client = ha_client + + async def run(self) -> CheckResult: + try: + async with self._client: + data = await self._client.get_api_status() + if isinstance(data, dict) and "message" in data: + return CheckResult( + healthy=True, + event_type=None, + severity=Severity.info, + message="HA API reachable", + payload={"response": data}, + ) + return CheckResult( + healthy=False, + event_type=HAEventType.ha_websocket_dead, + severity=Severity.error, + message=f"HA API returned unexpected response: {data!r}", + payload={"response": str(data)}, + ) + except Exception as exc: + return CheckResult( + healthy=False, + event_type=HAEventType.ha_websocket_dead, + severity=Severity.error, + message=f"HA API unreachable: {exc}", + payload={"error": str(exc)}, + ) + diff --git a/services/ha-diag-agent/src/ha_diag/config.py b/services/ha-diag-agent/src/ha_diag/config.py new file mode 100644 index 0000000..e00ff07 --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag/config.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import os +from pathlib import Path + +import yaml +from pydantic import field_validator +from pydantic_settings import BaseSettings + +_CONFIG_YAML = Path("/config/ha-diag-agent.yaml") + + +class Settings(BaseSettings): + ha_url: str = "http://homeassistant.local:8123" + ha_token: str = "" + node_name: str = "unknown" + location_tag: str = "default" + check_interval: int = 60 + port: int = 8087 + log_level: str = "info" + events_dir: Path = Path("/events") + data_dir: Path = Path("/data") + + model_config = {"extra": "ignore", "case_sensitive": False} + + @field_validator("ha_url") + @classmethod + def strip_trailing_slash(cls, v: str) -> str: + return v.rstrip("/") + + @classmethod + def load(cls) -> "Settings": + """Load settings: YAML file provides defaults; env vars override.""" + if _CONFIG_YAML.exists(): + try: + with _CONFIG_YAML.open() as f: + data = yaml.safe_load(f) or {} + for k, v in data.items(): + os.environ.setdefault(k.upper(), str(v)) + except Exception: + pass + return cls() diff --git a/services/ha-diag-agent/src/ha_diag/event_emitter.py b/services/ha-diag-agent/src/ha_diag/event_emitter.py new file mode 100644 index 0000000..5f55d63 --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag/event_emitter.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import json +import re +import time +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from .models import EventRecord + + +class EventEmitter: + """Writes atomic JSON event files to the events directory.""" + + def __init__(self, events_dir: Path, node_name: str) -> None: + self._events_dir = events_dir + self._node_name = node_name + self._seq = 0 + events_dir.mkdir(parents=True, exist_ok=True) + + def _make_id(self, event_type: str, service: str) -> str: + # Sequence suffix guarantees uniqueness even when multiple events of the + # same type are emitted within the same millisecond. + self._seq += 1 + ts = int(time.time()) + svc_slug = re.sub(r"[^a-z0-9]", "-", (service or "ha").lower())[:32].strip("-") + return f"evt-{self._node_name}-{ts}-{event_type}-{svc_slug}-{self._seq}" + + def emit( + self, + event_type: str, + severity: str, + service: str, + message: str, + payload: dict[str, Any] | None = None, + ) -> str: + event_id = self._make_id(event_type, service) + record = EventRecord( + id=event_id, + timestamp=int(time.time()), + date=datetime.now(timezone.utc).isoformat(), + type=event_type, + severity=severity, + node=self._node_name, + service=service, + message=message, + payload=payload or {}, + ) + path = self._events_dir / f"{event_id}.json" + tmp = path.with_suffix(".tmp") + tmp.write_text(json.dumps(record.model_dump(), indent=2)) + tmp.rename(path) + return event_id diff --git a/services/ha-diag-agent/src/ha_diag/ha_client.py b/services/ha-diag-agent/src/ha_diag/ha_client.py new file mode 100644 index 0000000..9ed1e12 --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag/ha_client.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +from typing import Any + +import aiohttp + + +class HAClient: + """Async Home Assistant REST API client using long-lived token auth.""" + + def __init__(self, base_url: str, token: str, timeout: float = 10.0) -> None: + self._base_url = base_url.rstrip("/") + self._headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + } + self._timeout = aiohttp.ClientTimeout(total=timeout) + self._session: aiohttp.ClientSession | None = None + + async def __aenter__(self) -> "HAClient": + self._session = aiohttp.ClientSession( + headers=self._headers, + timeout=self._timeout, + ) + return self + + async def __aexit__(self, *_: Any) -> None: + if self._session: + await self._session.close() + self._session = None + + def _session_or_raise(self) -> aiohttp.ClientSession: + if self._session is None: + raise RuntimeError("HAClient must be used as an async context manager") + return self._session + + async def get_api_status(self) -> dict[str, Any]: + """GET /api/ — returns {"message": "API running."} when HA is up.""" + async with self._session_or_raise().get(f"{self._base_url}/api/") as resp: + resp.raise_for_status() + return await resp.json() + + async def get_states(self) -> list[dict[str, Any]]: + """GET /api/states — full entity state list.""" + async with self._session_or_raise().get(f"{self._base_url}/api/states") as resp: + resp.raise_for_status() + return await resp.json() + + async def get_system_health(self) -> dict[str, Any]: + """GET /api/system_health — per-integration health summary.""" + async with self._session_or_raise().get( + f"{self._base_url}/api/system_health" + ) as resp: + resp.raise_for_status() + return await resp.json() + + async def get_config(self) -> dict[str, Any]: + """GET /api/config — HA configuration including version.""" + async with self._session_or_raise().get(f"{self._base_url}/api/config") as resp: + resp.raise_for_status() + return await resp.json() + + async def get_automation_traces(self, automation_id: str) -> list[dict[str, Any]]: + """GET /api/trace/automation/ — last run traces for an automation.""" + url = f"{self._base_url}/api/trace/automation/{automation_id}" + async with self._session_or_raise().get(url) as resp: + resp.raise_for_status() + return await resp.json() + + async def get_error_log(self) -> str: + """GET /api/error_log — plaintext error log.""" + async with self._session_or_raise().get( + f"{self._base_url}/api/error_log" + ) as resp: + resp.raise_for_status() + return await resp.text() diff --git a/services/ha-diag-agent/src/ha_diag/main.py b/services/ha-diag-agent/src/ha_diag/main.py new file mode 100644 index 0000000..128a5c2 --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag/main.py @@ -0,0 +1,119 @@ +from __future__ import annotations + +import asyncio +import json +import logging +import time +from datetime import datetime + +import structlog +import uvicorn +from apscheduler.schedulers.asyncio import AsyncIOScheduler + +from .api import app, register_checks +from .checks.heartbeat import HeartbeatCheck +from .config import Settings +from .event_emitter import EventEmitter +from .ha_client import HAClient +from .storage import Storage + +_log = structlog.get_logger() + + +def _configure_structlog(log_level: str) -> None: + structlog.configure( + processors=[ + structlog.processors.add_log_level, + structlog.processors.TimeStamper(fmt="iso"), + structlog.processors.StackInfoRenderer(), + structlog.processors.format_exc_info, + structlog.processors.JSONRenderer(), + ], + logger_factory=structlog.PrintLoggerFactory(), + ) + logging.basicConfig(level=getattr(logging, log_level.upper(), logging.INFO)) + + +async def _run_check_and_emit(check, emitter: EventEmitter, storage: Storage) -> None: + try: + result = await check.run() + await storage.record_check( + check_name=check.name, + ran_at=time.time(), + healthy=result.healthy, + message=result.message, + payload=json.dumps(result.payload), + ) + if result.event_type: + emitter.emit( + event_type=result.event_type, + severity=result.severity.value, + service="homeassistant", + message=result.message, + payload=result.payload, + ) + _log.warning( + "check_unhealthy", + check=check.name, + event=result.event_type, + msg=result.message, + ) + else: + _log.info("check_ok", check=check.name) + except Exception as exc: + _log.error("check_error", check=check.name, error=str(exc), exc_info=True) + + +async def run(settings: Settings) -> None: + _configure_structlog(settings.log_level) + _log.info( + "ha_diag_agent_starting", + node=settings.node_name, + location=settings.location_tag, + ha_url=settings.ha_url, + interval=settings.check_interval, + ) + + storage = Storage(settings.data_dir / "ha_diag.db") + await storage.open() + + emitter = EventEmitter(settings.events_dir, settings.node_name) + ha_client = HAClient(settings.ha_url, settings.ha_token) + + checks = [HeartbeatCheck(ha_client)] + register_checks(checks, settings.node_name, settings.location_tag) + + scheduler = AsyncIOScheduler() + for check in checks: + scheduler.add_job( + _run_check_and_emit, + "interval", + seconds=settings.check_interval, + args=[check, emitter, storage], + id=f"check_{check.name}", + next_run_time=datetime.now(), + ) + scheduler.start() + _log.info("scheduler_started", checks=[c.name for c in checks]) + + config = uvicorn.Config( + app, + host="0.0.0.0", + port=settings.port, + log_level=settings.log_level.lower(), + ) + server = uvicorn.Server(config) + try: + await server.serve() + finally: + scheduler.shutdown(wait=False) + await storage.close() + + +def main() -> None: + settings = Settings.load() + asyncio.run(run(settings)) + + +if __name__ == "__main__": + main() diff --git a/services/ha-diag-agent/src/ha_diag/models.py b/services/ha-diag-agent/src/ha_diag/models.py new file mode 100644 index 0000000..55d6477 --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag/models.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from enum import Enum +from typing import Any + +from pydantic import BaseModel + + +class Severity(str, Enum): + info = "info" + warning = "warning" + error = "error" + + +class HAEventType(str, Enum): + ha_integration_failed = "ha_integration_failed" + ha_entity_unavailable_long = "ha_entity_unavailable_long" + ha_websocket_dead = "ha_websocket_dead" + ha_automation_failing = "ha_automation_failing" + ha_update_available = "ha_update_available" + ha_recorder_lag = "ha_recorder_lag" + ha_system_health_degraded = "ha_system_health_degraded" + + +class EventRecord(BaseModel): + id: str + timestamp: int + date: str + type: str + severity: str + node: str + service: str + message: str + payload: dict[str, Any] = {} + + +class CheckResult(BaseModel): + healthy: bool + event_type: str | None = None # None means no event to emit + severity: Severity = Severity.info + message: str = "" + payload: dict[str, Any] = {} diff --git a/services/ha-diag-agent/src/ha_diag/storage.py b/services/ha-diag-agent/src/ha_diag/storage.py new file mode 100644 index 0000000..19362de --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag/storage.py @@ -0,0 +1,126 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import aiosqlite + +_SCHEMA = """ +CREATE TABLE IF NOT EXISTS entity_baseline ( + entity_id TEXT PRIMARY KEY, + state TEXT NOT NULL, + attributes TEXT NOT NULL DEFAULT '{}', + updated_at REAL NOT NULL +); + +CREATE TABLE IF NOT EXISTS check_history ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + check_name TEXT NOT NULL, + ran_at REAL NOT NULL, + healthy INTEGER NOT NULL, + message TEXT NOT NULL DEFAULT '', + payload TEXT NOT NULL DEFAULT '{}' +); + +CREATE TABLE IF NOT EXISTS alerts_sent ( + alert_key TEXT PRIMARY KEY, + sent_at REAL NOT NULL +); +""" + + +class Storage: + def __init__(self, db_path: Path) -> None: + self._db_path = db_path + self._db: aiosqlite.Connection | None = None + + async def open(self) -> None: + self._db_path.parent.mkdir(parents=True, exist_ok=True) + self._db = await aiosqlite.connect(self._db_path) + self._db.row_factory = aiosqlite.Row + await self._db.executescript(_SCHEMA) + await self._db.commit() + + async def close(self) -> None: + if self._db: + await self._db.close() + self._db = None + + def _conn(self) -> aiosqlite.Connection: + if self._db is None: + raise RuntimeError("Storage not open — call await storage.open() first") + return self._db + + # ------------------------------------------------------------------ + # entity_baseline + # ------------------------------------------------------------------ + + async def upsert_entity_baseline( + self, entity_id: str, state: str, attributes: str, updated_at: float + ) -> None: + await self._conn().execute( + """ + INSERT INTO entity_baseline (entity_id, state, attributes, updated_at) + VALUES (?, ?, ?, ?) + ON CONFLICT(entity_id) DO UPDATE SET + state = excluded.state, + attributes = excluded.attributes, + updated_at = excluded.updated_at + """, + (entity_id, state, attributes, updated_at), + ) + await self._conn().commit() + + async def get_entity_baseline(self, entity_id: str) -> dict[str, Any] | None: + async with self._conn().execute( + "SELECT * FROM entity_baseline WHERE entity_id = ?", (entity_id,) + ) as cur: + row = await cur.fetchone() + return dict(row) if row else None + + # ------------------------------------------------------------------ + # check_history + # ------------------------------------------------------------------ + + async def record_check( + self, + check_name: str, + ran_at: float, + healthy: bool, + message: str, + payload: str, + ) -> None: + await self._conn().execute( + """ + INSERT INTO check_history (check_name, ran_at, healthy, message, payload) + VALUES (?, ?, ?, ?, ?) + """, + (check_name, ran_at, int(healthy), message, payload), + ) + await self._conn().commit() + + # ------------------------------------------------------------------ + # alerts_sent (dedup gate) + # ------------------------------------------------------------------ + + async def was_alert_sent(self, alert_key: str, within_seconds: float) -> bool: + import time + + cutoff = time.time() - within_seconds + async with self._conn().execute( + "SELECT sent_at FROM alerts_sent WHERE alert_key = ? AND sent_at > ?", + (alert_key, cutoff), + ) as cur: + return (await cur.fetchone()) is not None + + async def mark_alert_sent(self, alert_key: str) -> None: + import time + + await self._conn().execute( + """ + INSERT INTO alerts_sent (alert_key, sent_at) VALUES (?, ?) + ON CONFLICT(alert_key) DO UPDATE SET sent_at = excluded.sent_at + """, + (alert_key, time.time()), + ) + await self._conn().commit() diff --git a/services/ha-diag-agent/src/ha_diag_agent.egg-info/PKG-INFO b/services/ha-diag-agent/src/ha_diag_agent.egg-info/PKG-INFO new file mode 100644 index 0000000..eaae6ea --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag_agent.egg-info/PKG-INFO @@ -0,0 +1,17 @@ +Metadata-Version: 2.4 +Name: ha-diag-agent +Version: 0.1.0 +Requires-Python: >=3.11 +Requires-Dist: aiohttp>=3.9 +Requires-Dist: fastapi>=0.110 +Requires-Dist: uvicorn[standard]>=0.29 +Requires-Dist: pydantic>=2.6 +Requires-Dist: pydantic-settings>=2.2 +Requires-Dist: apscheduler>=3.10 +Requires-Dist: aiosqlite>=0.20 +Requires-Dist: structlog>=24.1 +Requires-Dist: pyyaml>=6.0 +Provides-Extra: dev +Requires-Dist: pytest>=8.1; extra == "dev" +Requires-Dist: pytest-asyncio>=0.23; extra == "dev" +Requires-Dist: aioresponses>=0.7; extra == "dev" diff --git a/services/ha-diag-agent/src/ha_diag_agent.egg-info/SOURCES.txt b/services/ha-diag-agent/src/ha_diag_agent.egg-info/SOURCES.txt new file mode 100644 index 0000000..31418b6 --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag_agent.egg-info/SOURCES.txt @@ -0,0 +1,21 @@ +README.md +pyproject.toml +src/ha_diag/__init__.py +src/ha_diag/api.py +src/ha_diag/config.py +src/ha_diag/event_emitter.py +src/ha_diag/ha_client.py +src/ha_diag/main.py +src/ha_diag/models.py +src/ha_diag/storage.py +src/ha_diag/checks/__init__.py +src/ha_diag/checks/base.py +src/ha_diag/checks/heartbeat.py +src/ha_diag_agent.egg-info/PKG-INFO +src/ha_diag_agent.egg-info/SOURCES.txt +src/ha_diag_agent.egg-info/dependency_links.txt +src/ha_diag_agent.egg-info/requires.txt +src/ha_diag_agent.egg-info/top_level.txt +tests/test_event_emitter.py +tests/test_ha_client.py +tests/test_heartbeat_check.py \ No newline at end of file diff --git a/services/ha-diag-agent/src/ha_diag_agent.egg-info/dependency_links.txt b/services/ha-diag-agent/src/ha_diag_agent.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag_agent.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/services/ha-diag-agent/src/ha_diag_agent.egg-info/requires.txt b/services/ha-diag-agent/src/ha_diag_agent.egg-info/requires.txt new file mode 100644 index 0000000..0d57093 --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag_agent.egg-info/requires.txt @@ -0,0 +1,14 @@ +aiohttp>=3.9 +fastapi>=0.110 +uvicorn[standard]>=0.29 +pydantic>=2.6 +pydantic-settings>=2.2 +apscheduler>=3.10 +aiosqlite>=0.20 +structlog>=24.1 +pyyaml>=6.0 + +[dev] +pytest>=8.1 +pytest-asyncio>=0.23 +aioresponses>=0.7 diff --git a/services/ha-diag-agent/src/ha_diag_agent.egg-info/top_level.txt b/services/ha-diag-agent/src/ha_diag_agent.egg-info/top_level.txt new file mode 100644 index 0000000..5903d9e --- /dev/null +++ b/services/ha-diag-agent/src/ha_diag_agent.egg-info/top_level.txt @@ -0,0 +1 @@ +ha_diag diff --git a/services/ha-diag-agent/tests/conftest.py b/services/ha-diag-agent/tests/conftest.py new file mode 100644 index 0000000..faf0064 --- /dev/null +++ b/services/ha-diag-agent/tests/conftest.py @@ -0,0 +1,63 @@ +"""Shared fixtures for ha-diag-agent tests.""" +from __future__ import annotations + +import os +from pathlib import Path +from typing import AsyncGenerator +from unittest.mock import AsyncMock, MagicMock + +import pytest +import pytest_asyncio + +from ha_diag.event_emitter import EventEmitter +from ha_diag.storage import Storage + + +# --------------------------------------------------------------------------- +# Event dir fixture +# --------------------------------------------------------------------------- + + +@pytest.fixture +def tmp_events_dir(tmp_path: Path) -> Path: + events = tmp_path / "events" + events.mkdir() + return events + + +# --------------------------------------------------------------------------- +# Storage fixture (in-memory via tmp SQLite) +# --------------------------------------------------------------------------- + + +@pytest_asyncio.fixture +async def storage(tmp_path: Path) -> AsyncGenerator[Storage, None]: + s = Storage(tmp_path / "test.db") + await s.open() + yield s + await s.close() + + +# --------------------------------------------------------------------------- +# EventEmitter fixture +# --------------------------------------------------------------------------- + + +@pytest.fixture +def emitter(tmp_events_dir: Path) -> EventEmitter: + return EventEmitter(tmp_events_dir, node_name="test-node") + + +# --------------------------------------------------------------------------- +# Mock HA client fixture +# --------------------------------------------------------------------------- + + +@pytest.fixture +def mock_ha_client(): + """HAClient mock that behaves as an async context manager.""" + client = MagicMock() + client.__aenter__ = AsyncMock(return_value=client) + client.__aexit__ = AsyncMock(return_value=None) + client.get_api_status = AsyncMock(return_value={"message": "API running."}) + return client diff --git a/services/ha-diag-agent/tests/test_event_emitter.py b/services/ha-diag-agent/tests/test_event_emitter.py new file mode 100644 index 0000000..3b7a62e --- /dev/null +++ b/services/ha-diag-agent/tests/test_event_emitter.py @@ -0,0 +1,62 @@ +"""Tests for EventEmitter.""" +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from ha_diag.event_emitter import EventEmitter + + +def test_emit_creates_json_file(tmp_events_dir: Path, emitter: EventEmitter): + event_id = emitter.emit( + event_type="ha_websocket_dead", + severity="error", + service="homeassistant", + message="HA unreachable", + payload={"error": "timeout"}, + ) + files = list(tmp_events_dir.glob("*.json")) + assert len(files) == 1 + assert files[0].name == f"{event_id}.json" + + +def test_emit_event_schema(tmp_events_dir: Path, emitter: EventEmitter): + event_id = emitter.emit( + event_type="ha_websocket_dead", + severity="error", + service="homeassistant", + message="HA unreachable", + payload={"error": "timeout"}, + ) + data = json.loads((tmp_events_dir / f"{event_id}.json").read_text()) + assert data["id"] == event_id + assert data["type"] == "ha_websocket_dead" + assert data["severity"] == "error" + assert data["node"] == "test-node" + assert data["service"] == "homeassistant" + assert data["message"] == "HA unreachable" + assert data["payload"] == {"error": "timeout"} + assert "timestamp" in data + assert "date" in data + + +def test_emit_multiple_events_unique_files(tmp_events_dir: Path, emitter: EventEmitter): + ids = [ + emitter.emit("ha_websocket_dead", "error", "homeassistant", f"msg {i}") + for i in range(3) + ] + assert len(set(ids)) == 3 + assert len(list(tmp_events_dir.glob("*.json"))) == 3 + + +def test_emit_no_tmp_file_left(tmp_events_dir: Path, emitter: EventEmitter): + emitter.emit("ha_websocket_dead", "error", "homeassistant", "msg") + assert not list(tmp_events_dir.glob("*.tmp")) + + +def test_emitter_creates_events_dir(tmp_path: Path): + new_dir = tmp_path / "nested" / "events" + emitter = EventEmitter(new_dir, "my-node") + assert new_dir.exists() diff --git a/services/ha-diag-agent/tests/test_ha_client.py b/services/ha-diag-agent/tests/test_ha_client.py new file mode 100644 index 0000000..3d6a780 --- /dev/null +++ b/services/ha-diag-agent/tests/test_ha_client.py @@ -0,0 +1,56 @@ +"""Tests for HAClient using aioresponses to mock aiohttp.""" +from __future__ import annotations + +import pytest +from aioresponses import aioresponses + +from ha_diag.ha_client import HAClient + +HA_URL = "http://homeassistant.test:8123" +TOKEN = "test-token" + + +@pytest.mark.asyncio +async def test_get_api_status_ok(): + with aioresponses() as m: + m.get(f"{HA_URL}/api/", payload={"message": "API running."}) + async with HAClient(HA_URL, TOKEN) as client: + result = await client.get_api_status() + assert result == {"message": "API running."} + + +@pytest.mark.asyncio +async def test_get_api_status_unauthorized(): + with aioresponses() as m: + m.get(f"{HA_URL}/api/", status=401) + async with HAClient(HA_URL, TOKEN) as client: + with pytest.raises(Exception): + await client.get_api_status() + + +@pytest.mark.asyncio +async def test_get_states_returns_list(): + payload = [{"entity_id": "light.living_room", "state": "on"}] + with aioresponses() as m: + m.get(f"{HA_URL}/api/states", payload=payload) + async with HAClient(HA_URL, TOKEN) as client: + states = await client.get_states() + assert isinstance(states, list) + assert states[0]["entity_id"] == "light.living_room" + + +@pytest.mark.asyncio +async def test_get_config_returns_dict(): + payload = {"version": "2024.1.0", "location_name": "Home"} + with aioresponses() as m: + m.get(f"{HA_URL}/api/config", payload=payload) + async with HAClient(HA_URL, TOKEN) as client: + config = await client.get_config() + assert config["version"] == "2024.1.0" + + +@pytest.mark.asyncio +async def test_session_required_without_context_manager(): + client = HAClient(HA_URL, TOKEN) + with pytest.raises(RuntimeError, match="context manager"): + await client.get_api_status() diff --git a/services/ha-diag-agent/tests/test_heartbeat_check.py b/services/ha-diag-agent/tests/test_heartbeat_check.py new file mode 100644 index 0000000..37c3557 --- /dev/null +++ b/services/ha-diag-agent/tests/test_heartbeat_check.py @@ -0,0 +1,65 @@ +"""Tests for HeartbeatCheck.""" +from __future__ import annotations + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from ha_diag.checks.heartbeat import HeartbeatCheck +from ha_diag.models import HAEventType, Severity + + +def _make_client(api_status=None, side_effect=None): + client = MagicMock() + client.__aenter__ = AsyncMock(return_value=client) + client.__aexit__ = AsyncMock(return_value=None) + if side_effect: + client.get_api_status = AsyncMock(side_effect=side_effect) + else: + client.get_api_status = AsyncMock(return_value=api_status) + return client + + +@pytest.mark.asyncio +async def test_heartbeat_ok(): + client = _make_client(api_status={"message": "API running."}) + check = HeartbeatCheck(client) + result = await check.run() + assert result.healthy is True + assert result.event_type is None + + +@pytest.mark.asyncio +async def test_heartbeat_connection_error(): + client = _make_client(side_effect=ConnectionError("refused")) + check = HeartbeatCheck(client) + result = await check.run() + assert result.healthy is False + assert result.event_type == HAEventType.ha_websocket_dead + assert result.severity == Severity.error + assert "refused" in result.message + + +@pytest.mark.asyncio +async def test_heartbeat_unexpected_response(): + client = _make_client(api_status={"unexpected": "key"}) + check = HeartbeatCheck(client) + result = await check.run() + assert result.healthy is False + assert result.event_type == HAEventType.ha_websocket_dead + + +@pytest.mark.asyncio +async def test_heartbeat_timeout(): + client = _make_client(side_effect=TimeoutError("timed out")) + check = HeartbeatCheck(client) + result = await check.run() + assert result.healthy is False + assert result.event_type == HAEventType.ha_websocket_dead + assert "timed out" in result.message + + +def test_heartbeat_check_name(): + client = MagicMock() + check = HeartbeatCheck(client) + assert check.name == "heartbeat"