diff --git a/docs/chelsty-stability-agent.md b/docs/chelsty-stability-agent.md new file mode 100644 index 0000000..e6f63d3 --- /dev/null +++ b/docs/chelsty-stability-agent.md @@ -0,0 +1,42 @@ +### CHELSTY Stability Agent + +The stability-agent on CHELSTY provides local observability and health monitoring for the node's services and infrastructure. + +#### Purpose + +It acts as a filesystem-first watchdog that detects anomalies in the local runtime environment without taking autonomous destructive actions (like restarts). It serves as the primary data source for node-level stability metrics. + +#### Monitoring Scope + +* **Docker Containers**: Monitors all local containers. If a container is not in the `running` state, a `containers_not_running` event is generated. +* **Disk Usage**: Monitors the root filesystem. Generates `disk_usage_high` events if usage exceeds the configured threshold. +* **Connectivity**: + * Checks if the Tailscale socket or interface is available. + * Checks reachability of the local Mosquitto MQTT broker. +* **Zigbee2MQTT**: Specifically tracks the presence and status of the Zigbee2MQTT service. + +#### Storage and Integration + +* **Heartbeat**: Updated every cycle at `/opt/homelab/state/stability-agent.heartbeat`. +* **State Summary**: A JSON summary of all latest checks at `/opt/homelab/state/stability-agent.json`. +* **Events**: Append-only JSON lines at `/opt/homelab/events/YYYY-MM-DD/chelsty/events.jsonl`. + +#### Deployment + +The service is deployed via Docker Compose on CHELSTY. + +```bash +cd services/stability-agent +docker compose up -d +``` + +#### Configuration + +Configuration is managed via environment variables in `docker-compose.override.yml` on the host. 
+ +| Variable | Description | Default | +|----------|-------------|---------| +| `STABILITY_CHECK_INTERVAL` | Seconds between checks | `60` | +| `DISK_THRESHOLD_PCT` | Disk usage alert threshold | `90` | +| `MQTT_HOST` | MQTT broker hostname (the MQTT check is skipped when unset) | none | +| `MQTT_PORT` | MQTT broker port | `1883` | diff --git a/hosts/chelsty/runtime/stability-agent/docker-compose.override.yml b/hosts/chelsty/runtime/stability-agent/docker-compose.override.yml new file mode 100644 index 0000000..c8ed0e0 --- /dev/null +++ b/hosts/chelsty/runtime/stability-agent/docker-compose.override.yml @@ -0,0 +1,7 @@ +services: + stability-agent: + environment: + - STABILITY_CHECK_INTERVAL=60 + - DISK_THRESHOLD_PCT=85 + - MQTT_HOST=mosquitto + - MQTT_PORT=1883 diff --git a/hosts/chelsty/services.yaml b/hosts/chelsty/services.yaml index b7ee27e..9e7091a 100644 --- a/hosts/chelsty/services.yaml +++ b/hosts/chelsty/services.yaml @@ -106,3 +106,21 @@ services: - /opt/homelab/data/mosquitto notes: - Retain ACL, password, persistence, and bridge configuration if enabled. + + stability-agent: + role: node-stability-monitor + deployment_model: docker-compose + exposure: local-only + offline_required: true + depends_on: + local: + - mosquitto + external: [] + runtime: + config_path: null + data_path: /opt/homelab/state + logs_path: /opt/homelab/events + backup: + recommended: false + notes: + - Events and state are transient or can be reconstructed; high-frequency writes. diff --git a/services/stability-agent/Dockerfile b/services/stability-agent/Dockerfile new file mode 100644 index 0000000..403964d --- /dev/null +++ b/services/stability-agent/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.11-slim + +WORKDIR /app + +# No extra dependencies needed beyond standard library for the current script +# But we might need them if we decide to use libraries later. + +COPY src/stability_agent.py . +COPY healthcheck.sh . 
+RUN chmod +x healthcheck.sh + +# Create the expected directories +RUN mkdir -p /opt/homelab/state /opt/homelab/events + +# Run the agent +CMD ["python", "stability_agent.py"] diff --git a/services/stability-agent/README.md b/services/stability-agent/README.md new file mode 100644 index 0000000..621c635 --- /dev/null +++ b/services/stability-agent/README.md @@ -0,0 +1,43 @@ +### Stability Agent + +A lightweight filesystem-first watchdog and observer agent for CHELSTY. + +#### Features + +* **Continuous Monitoring**: Runs as a background service. +* **Docker Inspection**: Checks container status via read-only Docker socket. +* **Disk Usage**: Monitors local disk utilization. +* **Tailscale Check**: Verifies Tailscale availability. +* **MQTT Reachability**: Checks connectivity to the local MQTT broker. +* **Zigbee2MQTT Monitoring**: Specifically monitors the Zigbee2MQTT container. +* **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/chelsty/`. +* **State Reporting**: Writes heartbeat and status summary to `/opt/homelab/state/`. + +#### Configuration + +Environment variables: + +* `STABILITY_CHECK_INTERVAL`: Interval between checks in seconds (default: 60). +* `DISK_THRESHOLD_PCT`: Disk usage percentage to trigger warning (default: 90). +* `MQTT_HOST`: Hostname or IP of the MQTT broker to check (no default; the MQTT check is skipped when unset). +* `MQTT_PORT`: Port of the MQTT broker (default: 1883). + +#### Safety + +* No automatic restarts are performed. +* Read-only use of the Docker socket: it is mounted `:ro` and only `GET` requests are issued. +* No configuration mutation. +* No secrets stored in the repository. + +#### Event Schema + +Events are written as JSON lines with the following fields: + +* `id`: Unique event UUID. +* `timestamp`: ISO 8601 timestamp (UTC). +* `node`: `chelsty`. +* `source`: `stability-agent`. +* `type`: Type of event (e.g., `disk_usage_high`, `containers_not_running`). +* `severity`: `info`, `warning`, or `error`. +* `message`: Human-readable description. 
+* `details`: Object containing specific check results. diff --git a/services/stability-agent/docker-compose.yml b/services/stability-agent/docker-compose.yml new file mode 100644 index 0000000..4d0d848 --- /dev/null +++ b/services/stability-agent/docker-compose.yml @@ -0,0 +1,25 @@ +services: + stability-agent: + build: . + container_name: stability-agent + restart: unless-stopped + volumes: + - /opt/homelab:/opt/homelab + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/run/tailscale/tailscaled.sock:/var/run/tailscale/tailscaled.sock:ro + environment: + - STABILITY_CHECK_INTERVAL=${STABILITY_CHECK_INTERVAL:-60} + - DISK_THRESHOLD_PCT=${DISK_THRESHOLD_PCT:-90} + - MQTT_HOST=${MQTT_HOST} + - MQTT_PORT=${MQTT_PORT:-1883} + - NODE_NAME=chelsty + healthcheck: + test: ["CMD", "/bin/sh", "/app/healthcheck.sh"] + interval: 1m + timeout: 10s + retries: 3 + +volumes: + opt_homelab: + external: true + name: homelab_data # This might vary, but /opt/homelab mount is preferred as direct path. diff --git a/services/stability-agent/env.example b/services/stability-agent/env.example new file mode 100644 index 0000000..07edb37 --- /dev/null +++ b/services/stability-agent/env.example @@ -0,0 +1,4 @@ +STABILITY_CHECK_INTERVAL=60 +DISK_THRESHOLD_PCT=90 +MQTT_HOST=mosquitto +MQTT_PORT=1883 diff --git a/services/stability-agent/healthcheck.sh b/services/stability-agent/healthcheck.sh new file mode 100644 index 0000000..d717c8f --- /dev/null +++ b/services/stability-agent/healthcheck.sh @@ -0,0 +1,25 @@ +#!/bin/sh + +HEARTBEAT_FILE="/opt/homelab/state/stability-agent.heartbeat" +MAX_AGE_SECONDS=300 # 5 minutes + +if [ ! 
-f "$HEARTBEAT_FILE" ]; then + echo "Heartbeat file missing" + exit 1 +fi + +# Get current time in seconds +NOW=$(date +%s) + +# Get file modification time in seconds +# GNU coreutils stat (python:3.11-slim is Debian-based) supports -c %Y, as does BusyBox stat +FILE_TIME=$(stat -c %Y "$HEARTBEAT_FILE") + +AGE=$((NOW - FILE_TIME)) + +if [ "$AGE" -gt "$MAX_AGE_SECONDS" ]; then + echo "Heartbeat is too old: ${AGE}s" + exit 1 +fi + +exit 0 diff --git a/services/stability-agent/service.yaml b/services/stability-agent/service.yaml new file mode 100644 index 0000000..e75e6ca --- /dev/null +++ b/services/stability-agent/service.yaml @@ -0,0 +1,24 @@ +service: + name: stability-agent + owner_node: chelsty + exposure: private + dependencies: [] + healthcheck: + type: custom + interval: 60s + timeout: 10s + retries: 3 + restart_policy: unless-stopped + persistence: + paths: + - /opt/homelab/state + - /opt/homelab/events + runtime: + directories: + - /opt/homelab/state + - /opt/homelab/events + env_vars: + - STABILITY_CHECK_INTERVAL + - DISK_THRESHOLD_PCT + - MQTT_HOST + - MQTT_PORT diff --git a/services/stability-agent/src/stability_agent.py b/services/stability-agent/src/stability_agent.py new file mode 100644 index 0000000..2e9caf7 --- /dev/null +++ b/services/stability-agent/src/stability_agent.py @@ -0,0 +1,210 @@ +import os +import time +import json +import datetime +import uuid +import socket +import shutil +import http.client + +# Configuration from environment +CHECK_INTERVAL = int(os.environ.get("STABILITY_CHECK_INTERVAL", "60")) +DISK_THRESHOLD_PCT = float(os.environ.get("DISK_THRESHOLD_PCT", "90.0")) +MQTT_HOST = os.environ.get("MQTT_HOST") +MQTT_PORT = int(os.environ.get("MQTT_PORT", "1883")) +NODE_NAME = os.environ.get("NODE_NAME", "chelsty") +SOURCE = "stability-agent" + +STATE_DIR = "/opt/homelab/state" +EVENTS_BASE_DIR = "/opt/homelab/events" +HEARTBEAT_FILE = os.path.join(STATE_DIR, "stability-agent.heartbeat") +STATUS_FILE = os.path.join(STATE_DIR, "stability-agent.json") + +def get_timestamp(): +
return datetime.datetime.utcnow().isoformat() + "Z" + +def get_datestamp(): + return datetime.datetime.utcnow().strftime("%Y-%m-%d") + +def emit_event(event_type, severity, message, service=None, details=None): + event = { + "id": str(uuid.uuid4()), + "timestamp": get_timestamp(), + "node": NODE_NAME, + "source": SOURCE, + "type": event_type, + "severity": severity, + "message": message, + "details": details or {} + } + if service: + event["service"] = service + + date_str = get_datestamp() + event_dir = os.path.join(EVENTS_BASE_DIR, date_str, NODE_NAME) + try: + os.makedirs(event_dir, exist_ok=True) + event_file = os.path.join(event_dir, "events.jsonl") + with open(event_file, "a") as f: + f.write(json.dumps(event) + "\n") + except Exception as e: + print(f"Failed to write event to filesystem: {e}") + + print(f"[{severity}] {message}") + +def check_disk(): + total, used, free = shutil.disk_usage("/") + percent = (used / total) * 100 + details = { + "total_gb": total // (2**30), + "used_gb": used // (2**30), + "free_gb": free // (2**30), + "percent": round(percent, 2) + } + + if percent > DISK_THRESHOLD_PCT: + emit_event("disk_usage_high", "warning", f"Disk usage is high: {details['percent']}%", details=details) + + return details + +class DockerClient: + def __init__(self, socket_path="/var/run/docker.sock"): + self.socket_path = socket_path + + def _request(self, path): + class UnixHTTPConnection(http.client.HTTPConnection): + def __init__(self, socket_path): + super().__init__("localhost") + self.socket_path = socket_path + def connect(self): + self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + self.sock.settimeout(5.0) + self.sock.connect(self.socket_path) + + if not os.path.exists(self.socket_path): + return None + + conn = UnixHTTPConnection(self.socket_path) + try: + conn.request("GET", path) + res = conn.getresponse() + if res.status == 200: + return json.loads(res.read().decode()) + return None + except Exception as e: + print(f"Docker API 
error: {e}") + return None + finally: + conn.close() + + def get_containers(self): + return self._request("/containers/json?all=1") + +def check_docker(): + client = DockerClient() + containers = client.get_containers() + if containers is None: + emit_event("docker_socket_error", "error", "Could not connect to Docker socket or socket missing") + return {"status": "error", "error": "Could not connect to Docker socket"} + + summary = [] + unhealthy_containers = [] + for c in containers: + state = c.get("State", "") + status = c.get("Status", "") + name = c.get("Names", ["unknown"])[0].lstrip("/") + + container_info = { + "name": name, + "state": state, + "status": status + } + summary.append(container_info) + + if state != "running": + unhealthy_containers.append(container_info) + + if unhealthy_containers: + names = [c["name"] for c in unhealthy_containers] + # Only emit warning for containers that should be running? + # For now, we report any non-running container found by Docker. + emit_event("containers_not_running", "warning", f"Some containers are not running: {', '.join(names)}", details={"containers": unhealthy_containers}) + + return {"status": "ok", "containers": summary} + +def check_tailscale(): + # Check for tailscale socket or interface + socket_path = "/var/run/tailscale/tailscaled.sock" + socket_available = os.path.exists(socket_path) + interface_available = os.path.exists("/sys/class/net/tailscale0") + + return { + "available": socket_available or interface_available, + "details": { + "socket": socket_available, + "interface": interface_available + } + } + +def check_mqtt(): + if not MQTT_HOST: + return {"configured": False} + + try: + with socket.create_connection((MQTT_HOST, MQTT_PORT), timeout=5): + return {"configured": True, "reachable": True} + except Exception as e: + emit_event("mqtt_unreachable", "error", f"MQTT broker at {MQTT_HOST}:{MQTT_PORT} is unreachable", details={"error": str(e)}) + return {"configured": True, "reachable": False, 
"error": str(e)} + +def main(): + print(f"Starting stability-agent on {NODE_NAME}...") + + # Ensure directories exist + os.makedirs(STATE_DIR, exist_ok=True) + os.makedirs(EVENTS_BASE_DIR, exist_ok=True) + + while True: + try: + status = { + "timestamp": get_timestamp(), + "node": NODE_NAME, + "checks": {} + } + + status["checks"]["disk"] = check_disk() + status["checks"]["docker"] = check_docker() + status["checks"]["tailscale"] = check_tailscale() + status["checks"]["mqtt"] = check_mqtt() + + # Zigbee2MQTT container check + z2m_present = False + z2m_running = False + if status["checks"]["docker"]["status"] == "ok": + for c in status["checks"]["docker"]["containers"]: + if "zigbee2mqtt" in c["name"]: + z2m_present = True + if c["state"] == "running": + z2m_running = True + + status["checks"]["zigbee2mqtt"] = { + "present": z2m_present, + "running": z2m_running + } + + # Write heartbeat + with open(HEARTBEAT_FILE, "w") as f: + f.write(get_timestamp()) + + # Write status summary + with open(STATUS_FILE, "w") as f: + json.dump(status, f, indent=2) + + except Exception as e: + print(f"Error in main loop: {e}") + emit_event("agent_error", "error", f"Internal agent error: {e}", details={"error": str(e)}) + + time.sleep(CHECK_INTERVAL) + +if __name__ == "__main__": + main()