diff --git a/docs/stability-agent-rollout.md b/docs/stability-agent-rollout.md new file mode 100644 index 0000000..412a2a6 --- /dev/null +++ b/docs/stability-agent-rollout.md @@ -0,0 +1,75 @@ +# Stability Agent Multi-Node Rollout + +## Architecture Summary +The `stability-agent` is a lightweight Python service that monitors node health (disk, Docker containers, Tailscale, MQTT) and publishes state to a central Redis instance running on **PIHA**. + +- **Source**: `services/stability-agent` +- **State Path**: `/opt/homelab/state` +- **Events Path**: `/opt/homelab/events` +- **Redis Target**: `100.108.208.3:6379` (PIHA) + +## Why UI only showed CHELSTY +Previously, the `stability-agent` had `NODE_NAME` defaulted to `chelsty` and was only deployed there. The Agent System UI materializer on PIHA filters nodes based on the Redis keys `homelab:nodes:`. Without other agents publishing their specific `NODE_NAME`, the UI remained limited to the single active node. + +## Deployment Commands + +Use the helper script to generate commands: +```bash +./scripts/deploy/deploy-stability-agent.sh +``` + +### PIHA +```bash +cd ~/homelab-codex-ws +git pull +cd services/stability-agent +NODE_NAME=piha REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate +``` + +### CHELSTY +```bash +cd ~/homelab-codex-ws +git pull +cd services/stability-agent +NODE_NAME=chelsty REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate +``` + +### SOLARIA +```bash +cd ~/homelab-codex-ws +git pull +cd services/stability-agent +NODE_NAME=solaria REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate +``` + +### VPS +```bash +cd ~/homelab-codex-ws +git pull +cd services/stability-agent +NODE_NAME=vps REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate +``` + +### SATURN (Optional) +Saturn is the 
orchestrator and can optionally run the stability-agent. If deployed, follow the same pattern with `NODE_NAME=saturn`. + +## Verification (on PIHA) + +Verify Redis keys: +```bash +docker exec agent-system-redis redis-cli KEYS 'homelab:nodes:*' +docker exec agent-system-redis redis-cli HGETALL homelab:nodes:<node> +``` + +Verify Web UI backend: +```bash +curl -s http://127.0.0.1:18180/nodes +curl -k https://agents.okit.pl/nodes +``` + +## Troubleshooting + +- **Redis empty after compose down**: The `agent-system-redis` on PIHA uses transient storage if not configured with a volume. If it restarts, agents must republish their state (they do this automatically every `CHECK_INTERVAL`). +- **Secrets**: `.env` files and local secrets are not committed to the repo. Ensure `MQTT_HOST` and other specific secrets are set via overrides if needed. +- **Telegram**: Telegram bot notifications can remain disabled if `TELEGRAM_BOT_TOKEN` is absent. +- **Docker Socket**: If the agent reports `unavailable` for Docker, ensure `/var/run/docker.sock` is mounted and the user has permissions. 
diff --git a/hosts/chelsty/runtime/stability-agent/docker-compose.override.yml b/hosts/chelsty/runtime/stability-agent/docker-compose.override.yml index c8ed0e0..f1a826e 100644 --- a/hosts/chelsty/runtime/stability-agent/docker-compose.override.yml +++ b/hosts/chelsty/runtime/stability-agent/docker-compose.override.yml @@ -1,6 +1,10 @@ services: stability-agent: environment: + - NODE_NAME=chelsty + - REDIS_HOST=100.108.208.3 + - REDIS_PORT=6379 + - REDIS_ENABLED=true - STABILITY_CHECK_INTERVAL=60 - DISK_THRESHOLD_PCT=85 - MQTT_HOST=mosquitto diff --git a/hosts/chelsty/services.yaml b/hosts/chelsty/services.yaml index 9e7091a..c7b51c1 100644 --- a/hosts/chelsty/services.yaml +++ b/hosts/chelsty/services.yaml @@ -117,7 +117,7 @@ services: - mosquitto external: [] runtime: - config_path: null + config_path: /opt/homelab/config/stability-agent data_path: /opt/homelab/state logs_path: /opt/homelab/events backup: diff --git a/hosts/piha/runtime/stability-agent/docker-compose.override.yml b/hosts/piha/runtime/stability-agent/docker-compose.override.yml new file mode 100644 index 0000000..48aafe2 --- /dev/null +++ b/hosts/piha/runtime/stability-agent/docker-compose.override.yml @@ -0,0 +1,7 @@ +services: + stability-agent: + environment: + - NODE_NAME=piha + - REDIS_HOST=100.108.208.3 + - REDIS_PORT=6379 + - REDIS_ENABLED=true diff --git a/hosts/piha/services.yaml b/hosts/piha/services.yaml new file mode 100644 index 0000000..6dc2f80 --- /dev/null +++ b/hosts/piha/services.yaml @@ -0,0 +1,15 @@ +host: piha + +services: + stability-agent: + role: node-stability-monitor + deployment_model: docker-compose + exposure: local-only + offline_required: true + depends_on: + local: [] + external: [] + runtime: + config_path: /opt/homelab/config/stability-agent + data_path: /opt/homelab/state + logs_path: /opt/homelab/events diff --git a/hosts/solaria/runtime/stability-agent/docker-compose.override.yml b/hosts/solaria/runtime/stability-agent/docker-compose.override.yml new file mode 
100644 index 0000000..f8acaf2 --- /dev/null +++ b/hosts/solaria/runtime/stability-agent/docker-compose.override.yml @@ -0,0 +1,7 @@ +services: + stability-agent: + environment: + - NODE_NAME=solaria + - REDIS_HOST=100.108.208.3 + - REDIS_PORT=6379 + - REDIS_ENABLED=true diff --git a/hosts/solaria/services.yaml b/hosts/solaria/services.yaml new file mode 100644 index 0000000..5324beb --- /dev/null +++ b/hosts/solaria/services.yaml @@ -0,0 +1,15 @@ +host: solaria + +services: + stability-agent: + role: node-stability-monitor + deployment_model: docker-compose + exposure: local-only + offline_required: true + depends_on: + local: [] + external: [] + runtime: + config_path: /opt/homelab/config/stability-agent + data_path: /opt/homelab/state + logs_path: /opt/homelab/events diff --git a/hosts/vps/runtime/stability-agent/docker-compose.override.yml b/hosts/vps/runtime/stability-agent/docker-compose.override.yml new file mode 100644 index 0000000..f5ae248 --- /dev/null +++ b/hosts/vps/runtime/stability-agent/docker-compose.override.yml @@ -0,0 +1,7 @@ +services: + stability-agent: + environment: + - NODE_NAME=vps + - REDIS_HOST=100.108.208.3 + - REDIS_PORT=6379 + - REDIS_ENABLED=true diff --git a/hosts/vps/services.yaml b/hosts/vps/services.yaml new file mode 100644 index 0000000..7115a92 --- /dev/null +++ b/hosts/vps/services.yaml @@ -0,0 +1,15 @@ +host: vps + +services: + stability-agent: + role: node-stability-monitor + deployment_model: docker-compose + exposure: local-only + offline_required: true + depends_on: + local: [] + external: [] + runtime: + config_path: /opt/homelab/config/stability-agent + data_path: /opt/homelab/state + logs_path: /opt/homelab/events diff --git a/scripts/deploy/deploy-stability-agent.sh b/scripts/deploy/deploy-stability-agent.sh new file mode 100755 index 0000000..b0a82ec --- /dev/null +++ b/scripts/deploy/deploy-stability-agent.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# deploy-stability-agent.sh - Helper to print deployment commands for 
stability-agent + +NODE=$1 +REPO_PATH="~/homelab-codex-ws" + +if [[ -z "$NODE" ]]; then + echo "Usage: $0 " + echo "Supported nodes: chelsty, piha, solaria, vps" + exit 1 +fi + +case "$NODE" in + chelsty|piha|solaria|vps) + ;; + *) + echo "Error: Unknown node '$NODE'" + echo "Supported nodes: chelsty, piha, solaria, vps" + exit 1 + ;; +esac + +echo "# --- Deployment commands for $NODE ---" +echo "cd $REPO_PATH" +echo "git fetch origin" +echo "git checkout master" +echo "git pull" +echo "cd services/stability-agent" +echo "" +echo "# Command (Docker Compose V2):" +echo "NODE_NAME=$NODE REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate" +echo "" +echo "# Command (Docker Compose V1):" +echo "NODE_NAME=$NODE REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker-compose up -d --build --force-recreate" +echo "" +echo "# Notes:" +echo "# - If using host-specific overrides: add '-f ../../hosts/$NODE/runtime/stability-agent/docker-compose.override.yml'" +echo "# - Ensure /opt/homelab/state and /opt/homelab/events exist on the host." diff --git a/services/stability-agent/README.md b/services/stability-agent/README.md index 8fdbb6c..b4c31dd 100644 --- a/services/stability-agent/README.md +++ b/services/stability-agent/README.md @@ -1,19 +1,25 @@ ### Stability Agent -A lightweight filesystem-first watchdog and observer agent for CHELSTY. +A lightweight filesystem-first watchdog and observer agent for homelab nodes. #### Features * **Continuous Monitoring**: Runs as a background service. -* **Docker Inspection**: Checks container status via read-only Docker socket. +* **Docker Inspection**: Checks container status via read-only Docker socket (optional). * **Disk Usage**: Monitors local disk utilization. -* **Tailscale Check**: Verifies Tailscale availability. -* **MQTT Reachability**: Checks connectivity to the local MQTT broker. -* **Zigbee2MQTT Monitoring**: Specifically monitors the Zigbee2MQTT container. 
-* **Redis Publishing**: (Optional) Publishes runtime state and events to a central Redis server. -* **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/chelsty/`. +* **Tailscale Check**: Verifies Tailscale availability (optional). +* **MQTT Reachability**: Checks connectivity to a configured MQTT broker (optional). +* **Redis Publishing**: Publishes runtime state and events to a central Redis server (PIHA). +* **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/<node>/`. * **State Reporting**: Writes heartbeat and status summary to `/opt/homelab/state/`. +#### Deployment + +Use the deployment helper script: +```bash +./scripts/deploy/deploy-stability-agent.sh <node> +``` + #### Configuration Environment variables: @@ -33,10 +39,10 @@ You can verify the Redis publishing using `redis-cli`: ```bash # Check node state -redis-cli -h 100.108.208.3 HGETALL homelab:nodes:chelsty +redis-cli -h 100.108.208.3 HGETALL homelab:nodes:<node> # Check service discovery -redis-cli -h 100.108.208.3 HGETALL homelab:services:chelsty:stability-agent +redis-cli -h 100.108.208.3 HGETALL homelab:services:<node>:stability-agent # Check event stream redis-cli -h 100.108.208.3 XRANGE homelab:events - + @@ -55,7 +61,7 @@ Events are written as JSON lines with the following fields: * `id`: Unique event UUID. * `timestamp`: ISO 8601 timestamp (UTC). -* `node`: `chelsty`. +* `node`: `<node>`. * `source`: `stability-agent`. * `type`: Type of event (e.g., `disk_usage_high`, `containers_not_running`). * `severity`: `info`, `warning`, or `error`. 
diff --git a/services/stability-agent/src/stability_agent.py b/services/stability-agent/src/stability_agent.py index 12baef8..18f09f8 100644 --- a/services/stability-agent/src/stability_agent.py +++ b/services/stability-agent/src/stability_agent.py @@ -43,7 +43,7 @@ def emit_event(event_type, severity, message, service=None, details=None): } if service: event["service"] = service - + date_str = get_datestamp() event_dir = os.path.join(EVENTS_BASE_DIR, date_str, NODE_NAME) try: @@ -53,7 +53,7 @@ def emit_event(event_type, severity, message, service=None, details=None): f.write(json.dumps(event) + "\n") except Exception as e: print(f"Failed to write event to filesystem: {e}") - + # Redis publishing if REDIS_ENABLED and redis_client: try: @@ -68,7 +68,7 @@ def emit_event(event_type, severity, message, service=None, details=None): except Exception as e: print(f"Failed to publish event to Redis: {e}") # Do not crash, already logged to filesystem - + print(f"[{severity}] {message}") def check_disk(): @@ -80,10 +80,10 @@ def check_disk(): "free_gb": free // (2**30), "percent": round(percent, 2) } - + if percent > DISK_THRESHOLD_PCT: emit_event("disk_usage_high", "warning", f"Disk usage is high: {details['percent']}%", details=details) - + return details class DockerClient: @@ -121,34 +121,37 @@ class DockerClient: def check_docker(): client = DockerClient() + if not os.path.exists(client.socket_path): + return {"status": "unavailable", "message": "Docker socket not found"} + containers = client.get_containers() if containers is None: - emit_event("docker_socket_error", "error", "Could not connect to Docker socket or socket missing") - return {"status": "error", "error": "Could not connect to Docker socket"} - + emit_event("docker_api_error", "warning", "Could not connect to Docker socket API") + return {"status": "error", "error": "Could not connect to Docker socket API"} + summary = [] unhealthy_containers = [] for c in containers: state = c.get("State", "") status = 
c.get("Status", "") name = c.get("Names", ["unknown"])[0].lstrip("/") - + container_info = { "name": name, "state": state, "status": status } summary.append(container_info) - + if state != "running": unhealthy_containers.append(container_info) - + if unhealthy_containers: names = [c["name"] for c in unhealthy_containers] - # Only emit warning for containers that should be running? + # Only emit warning for containers that should be running? # For now, we report any non-running container found by Docker. emit_event("containers_not_running", "warning", f"Some containers are not running: {', '.join(names)}", details={"containers": unhealthy_containers}) - + return {"status": "ok", "containers": summary} def check_tailscale(): @@ -156,7 +159,7 @@ def check_tailscale(): socket_path = "/var/run/tailscale/tailscaled.sock" socket_available = os.path.exists(socket_path) interface_available = os.path.exists("/sys/class/net/tailscale0") - + return { "available": socket_available or interface_available, "details": { @@ -168,7 +171,7 @@ def check_tailscale(): def check_mqtt(): if not MQTT_HOST: return {"configured": False} - + try: with socket.create_connection((MQTT_HOST, MQTT_PORT), timeout=5): return {"configured": True, "reachable": True} @@ -203,13 +206,13 @@ class RedisClient: def _send_command(self, *args): if not self._connect(): return False - + # RESP array cmd = f"*{len(args)}\r\n" for arg in args: s_arg = str(arg) cmd += f"${len(s_arg.encode('utf-8'))}\r\n{s_arg}\r\n" - + try: self.sock.sendall(cmd.encode('utf-8')) # Basic response reading @@ -241,11 +244,11 @@ redis_client = RedisClient(REDIS_HOST, REDIS_PORT) if REDIS_ENABLED else None def main(): print(f"Starting stability-agent on {NODE_NAME}...") - + # Ensure directories exist os.makedirs(STATE_DIR, exist_ok=True) os.makedirs(EVENTS_BASE_DIR, exist_ok=True) - + while True: try: status = { @@ -253,12 +256,12 @@ def main(): "node": NODE_NAME, "checks": {} } - + status["checks"]["disk"] = check_disk() 
status["checks"]["docker"] = check_docker() status["checks"]["tailscale"] = check_tailscale() status["checks"]["mqtt"] = check_mqtt() - + # Zigbee2MQTT container check z2m_present = False z2m_running = False @@ -268,20 +271,20 @@ def main(): z2m_present = True if c["state"] == "running": z2m_running = True - + status["checks"]["zigbee2mqtt"] = { "present": z2m_present, "running": z2m_running } - + # Write heartbeat with open(HEARTBEAT_FILE, "w") as f: f.write(get_timestamp()) - + # Write status summary with open(STATUS_FILE, "w") as f: json.dump(status, f, indent=2) - + # Redis publishing if REDIS_ENABLED and redis_client: try: @@ -290,7 +293,8 @@ def main(): for check in status["checks"].values(): if isinstance(check, dict) and check.get("status") == "error": node_health = "unhealthy" - + + # Redis publishing for node state redis_client.hset(f"homelab:nodes:{NODE_NAME}", { "id": NODE_NAME, "hostname": NODE_NAME, @@ -300,13 +304,29 @@ def main(): "capabilities": json.dumps(["docker", "tailscale", "mqtt", "disk"]), "checks": json.dumps(status["checks"]) }) - + + # Always publish stability-agent itself as a service + redis_client.hset(f"homelab:services:{NODE_NAME}:stability-agent", { + "name": "stability-agent", + "node": NODE_NAME, + "health": "healthy", + "desired_state": "running", + "actual_state": "running", + "deployment_state": "deployed", + "updated_at": status["timestamp"], + "dependencies": json.dumps([]), + "recommendations": json.dumps([]) + }) + # Services discovered from Docker if status["checks"]["docker"]["status"] == "ok": for c in status["checks"]["docker"]["containers"]: service_name = c["name"] + if service_name == "stability-agent": + continue # Already published above + service_health = "healthy" if c["state"] == "running" else "unhealthy" - + redis_client.hset(f"homelab:services:{NODE_NAME}:{service_name}", { "name": service_name, "node": NODE_NAME, @@ -326,7 +346,7 @@ def main(): except Exception as e: print(f"Error in main loop: {e}") 
emit_event("agent_error", "error", f"Internal agent error: {e}", details={"error": str(e)}) - + time.sleep(CHECK_INTERVAL) if __name__ == "__main__":