Roll out stability agent to homelab nodes

This commit is contained in:
oskar 2026-05-17 15:54:19 +02:00
parent 3233cf07cd
commit c9ddfa9ac1
12 changed files with 249 additions and 40 deletions

View file

@ -0,0 +1,75 @@
# Stability Agent Multi-Node Rollout
## Architecture Summary
The `stability-agent` is a lightweight Python service that monitors node health (disk, Docker containers, Tailscale, MQTT) and publishes state to a central Redis instance running on **PIHA**.
- **Source**: `services/stability-agent`
- **State Path**: `/opt/homelab/state`
- **Events Path**: `/opt/homelab/events`
- **Redis Target**: `100.108.208.3:6379` (PIHA)
## Why UI only showed CHELSTY
Previously, the `stability-agent` had `NODE_NAME` defaulted to `chelsty` and was only deployed there. The Agent System UI materializer on PIHA filters nodes based on the Redis keys `homelab:nodes:<NODE_NAME>`. Without other agents publishing their specific `NODE_NAME`, the UI remained limited to the single active node.
## Deployment Commands
Use the helper script to generate commands:
```bash
./scripts/deploy/deploy-stability-agent.sh <node-name>
```
### PIHA
```bash
cd ~/homelab-codex-ws
git pull
cd services/stability-agent
NODE_NAME=piha REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
```
### CHELSTY
```bash
cd ~/homelab-codex-ws
git pull
cd services/stability-agent
NODE_NAME=chelsty REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
```
### SOLARIA
```bash
cd ~/homelab-codex-ws
git pull
cd services/stability-agent
NODE_NAME=solaria REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
```
### VPS
```bash
cd ~/homelab-codex-ws
git pull
cd services/stability-agent
NODE_NAME=vps REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
```
### SATURN (Optional)
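Saturn is the orchestrator and can optionally run the stability-agent. If deployed, follow the same pattern with `NODE_NAME=saturn`. Note that the helper script only accepts `chelsty`, `piha`, `solaria`, and `vps`, so for Saturn run the `docker compose` command manually.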
## Verification (on PIHA)
Verify Redis keys:
```bash
docker exec agent-system-redis redis-cli KEYS 'homelab:nodes:*'
docker exec agent-system-redis redis-cli HGETALL homelab:nodes:<node-name>
```
Verify Web UI backend:
```bash
curl -s http://127.0.0.1:18180/nodes
curl -k https://agents.okit.pl/nodes
```
## Troubleshooting
- **Redis empty after compose down**: The `agent-system-redis` container on PIHA uses transient storage unless it is configured with a volume, so its keys are lost when it restarts. No manual action is needed: agents republish their state automatically on every check cycle (`STABILITY_CHECK_INTERVAL`).
- **Secrets**: `.env` files and local secrets are not committed to the repo. Ensure `MQTT_HOST` and other specific secrets are set via overrides if needed.
- **Telegram**: Telegram bot notifications can remain disabled if `TELEGRAM_BOT_TOKEN` is absent.
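- **Docker Socket**: If the agent reports `unavailable` for Docker, the socket was not found; ensure `/var/run/docker.sock` is mounted into the container. If it reports `error` instead, the socket exists but the Docker API could not be reached; check that the user running the agent has permission to access the socket.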

View file

@ -1,6 +1,10 @@
services: services:
stability-agent: stability-agent:
environment: environment:
- NODE_NAME=chelsty
- REDIS_HOST=100.108.208.3
- REDIS_PORT=6379
- REDIS_ENABLED=true
- STABILITY_CHECK_INTERVAL=60 - STABILITY_CHECK_INTERVAL=60
- DISK_THRESHOLD_PCT=85 - DISK_THRESHOLD_PCT=85
- MQTT_HOST=mosquitto - MQTT_HOST=mosquitto

View file

@ -117,7 +117,7 @@ services:
- mosquitto - mosquitto
external: [] external: []
runtime: runtime:
config_path: null config_path: /opt/homelab/config/stability-agent
data_path: /opt/homelab/state data_path: /opt/homelab/state
logs_path: /opt/homelab/events logs_path: /opt/homelab/events
backup: backup:

View file

@ -0,0 +1,7 @@
services:
stability-agent:
environment:
- NODE_NAME=piha
- REDIS_HOST=100.108.208.3
- REDIS_PORT=6379
- REDIS_ENABLED=true

15
hosts/piha/services.yaml Normal file
View file

@ -0,0 +1,15 @@
host: piha
services:
stability-agent:
role: node-stability-monitor
deployment_model: docker-compose
exposure: local-only
offline_required: true
depends_on:
local: []
external: []
runtime:
config_path: /opt/homelab/config/stability-agent
data_path: /opt/homelab/state
logs_path: /opt/homelab/events

View file

@ -0,0 +1,7 @@
services:
stability-agent:
environment:
- NODE_NAME=solaria
- REDIS_HOST=100.108.208.3
- REDIS_PORT=6379
- REDIS_ENABLED=true

View file

@ -0,0 +1,15 @@
host: solaria
services:
stability-agent:
role: node-stability-monitor
deployment_model: docker-compose
exposure: local-only
offline_required: true
depends_on:
local: []
external: []
runtime:
config_path: /opt/homelab/config/stability-agent
data_path: /opt/homelab/state
logs_path: /opt/homelab/events

View file

@ -0,0 +1,7 @@
services:
stability-agent:
environment:
- NODE_NAME=vps
- REDIS_HOST=100.108.208.3
- REDIS_PORT=6379
- REDIS_ENABLED=true

15
hosts/vps/services.yaml Normal file
View file

@ -0,0 +1,15 @@
host: vps
services:
stability-agent:
role: node-stability-monitor
deployment_model: docker-compose
exposure: local-only
offline_required: true
depends_on:
local: []
external: []
runtime:
config_path: /opt/homelab/config/stability-agent
data_path: /opt/homelab/state
logs_path: /opt/homelab/events

View file

@ -0,0 +1,38 @@
#!/usr/bin/env bash
# deploy-stability-agent.sh - Helper to print deployment commands for stability-agent
#
# Usage: deploy-stability-agent.sh <node-name>
#
# Nothing is executed by this script: it only prints the commands that should
# be run on the target node. Diagnostics (usage, unknown node) go to stderr so
# that stdout carries nothing but the generated commands.
#
# Exit status: 0 when commands were printed, 1 on a missing or unknown node.
set -euo pipefail

# Default to empty so that a missing argument reaches the usage message
# instead of tripping `set -u`.
NODE="${1:-}"

# Single-quoted on purpose: the tilde must be printed literally so that it is
# expanded on the target node, not on the machine running this helper.
REPO_PATH='~/homelab-codex-ws'

# Human-readable list for messages; keep in sync with the case pattern below.
SUPPORTED_NODES="chelsty, piha, solaria, vps"

if [[ -z "$NODE" ]]; then
  echo "Usage: $0 <node-name>" >&2
  echo "Supported nodes: $SUPPORTED_NODES" >&2
  exit 1
fi

case "$NODE" in
  chelsty|piha|solaria|vps)
    ;;
  *)
    echo "Error: Unknown node '$NODE'" >&2
    echo "Supported nodes: $SUPPORTED_NODES" >&2
    exit 1
    ;;
esac

# Shared by the Compose V2 and V1 variants so the two cannot drift apart.
COMPOSE_ENV="NODE_NAME=$NODE REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true"
COMPOSE_ARGS="up -d --build --force-recreate"

# Unquoted heredoc: variables expand, while the single quotes and the tilde
# are emitted verbatim.
cat <<EOF
# --- Deployment commands for $NODE ---
cd $REPO_PATH
git fetch origin
git checkout master
git pull
cd services/stability-agent

# Command (Docker Compose V2):
$COMPOSE_ENV docker compose $COMPOSE_ARGS

# Command (Docker Compose V1):
$COMPOSE_ENV docker-compose $COMPOSE_ARGS

# Notes:
# - If using host-specific overrides: add '-f ../../hosts/$NODE/runtime/stability-agent/docker-compose.override.yml'
# - Ensure /opt/homelab/state and /opt/homelab/events exist on the host.
EOF

View file

@ -1,19 +1,25 @@
### Stability Agent ### Stability Agent
A lightweight filesystem-first watchdog and observer agent for CHELSTY. A lightweight filesystem-first watchdog and observer agent for homelab nodes.
#### Features #### Features
* **Continuous Monitoring**: Runs as a background service. * **Continuous Monitoring**: Runs as a background service.
* **Docker Inspection**: Checks container status via read-only Docker socket. * **Docker Inspection**: Checks container status via read-only Docker socket (optional).
* **Disk Usage**: Monitors local disk utilization. * **Disk Usage**: Monitors local disk utilization.
* **Tailscale Check**: Verifies Tailscale availability. * **Tailscale Check**: Verifies Tailscale availability (optional).
* **MQTT Reachability**: Checks connectivity to the local MQTT broker. * **MQTT Reachability**: Checks connectivity to a configured MQTT broker (optional).
* **Zigbee2MQTT Monitoring**: Specifically monitors the Zigbee2MQTT container. * **Redis Publishing**: Publishes runtime state and events to a central Redis server (PIHA).
* **Redis Publishing**: (Optional) Publishes runtime state and events to a central Redis server. * **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/<NODE_NAME>/`.
* **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/chelsty/`.
* **State Reporting**: Writes heartbeat and status summary to `/opt/homelab/state/`. * **State Reporting**: Writes heartbeat and status summary to `/opt/homelab/state/`.
#### Deployment
Use the deployment helper script:
```bash
./scripts/deploy/deploy-stability-agent.sh <NODE_NAME>
```
#### Configuration #### Configuration
Environment variables: Environment variables:
@ -33,10 +39,10 @@ You can verify the Redis publishing using `redis-cli`:
```bash ```bash
# Check node state # Check node state
redis-cli -h 100.108.208.3 HGETALL homelab:nodes:chelsty redis-cli -h 100.108.208.3 HGETALL homelab:nodes:<NODE_NAME>
# Check service discovery # Check service discovery
redis-cli -h 100.108.208.3 HGETALL homelab:services:chelsty:stability-agent redis-cli -h 100.108.208.3 HGETALL homelab:services:<NODE_NAME>:stability-agent
# Check event stream # Check event stream
redis-cli -h 100.108.208.3 XRANGE homelab:events - + redis-cli -h 100.108.208.3 XRANGE homelab:events - +
@ -55,7 +61,7 @@ Events are written as JSON lines with the following fields:
* `id`: Unique event UUID. * `id`: Unique event UUID.
* `timestamp`: ISO 8601 timestamp (UTC). * `timestamp`: ISO 8601 timestamp (UTC).
* `node`: `chelsty`. * `node`: `<NODE_NAME>`.
* `source`: `stability-agent`. * `source`: `stability-agent`.
* `type`: Type of event (e.g., `disk_usage_high`, `containers_not_running`). * `type`: Type of event (e.g., `disk_usage_high`, `containers_not_running`).
* `severity`: `info`, `warning`, or `error`. * `severity`: `info`, `warning`, or `error`.

View file

@ -121,10 +121,13 @@ class DockerClient:
def check_docker(): def check_docker():
client = DockerClient() client = DockerClient()
if not os.path.exists(client.socket_path):
return {"status": "unavailable", "message": "Docker socket not found"}
containers = client.get_containers() containers = client.get_containers()
if containers is None: if containers is None:
emit_event("docker_socket_error", "error", "Could not connect to Docker socket or socket missing") emit_event("docker_api_error", "warning", "Could not connect to Docker socket API")
return {"status": "error", "error": "Could not connect to Docker socket"} return {"status": "error", "error": "Could not connect to Docker socket API"}
summary = [] summary = []
unhealthy_containers = [] unhealthy_containers = []
@ -291,6 +294,7 @@ def main():
if isinstance(check, dict) and check.get("status") == "error": if isinstance(check, dict) and check.get("status") == "error":
node_health = "unhealthy" node_health = "unhealthy"
# Redis publishing for node state
redis_client.hset(f"homelab:nodes:{NODE_NAME}", { redis_client.hset(f"homelab:nodes:{NODE_NAME}", {
"id": NODE_NAME, "id": NODE_NAME,
"hostname": NODE_NAME, "hostname": NODE_NAME,
@ -301,10 +305,26 @@ def main():
"checks": json.dumps(status["checks"]) "checks": json.dumps(status["checks"])
}) })
# Always publish stability-agent itself as a service
redis_client.hset(f"homelab:services:{NODE_NAME}:stability-agent", {
"name": "stability-agent",
"node": NODE_NAME,
"health": "healthy",
"desired_state": "running",
"actual_state": "running",
"deployment_state": "deployed",
"updated_at": status["timestamp"],
"dependencies": json.dumps([]),
"recommendations": json.dumps([])
})
# Services discovered from Docker # Services discovered from Docker
if status["checks"]["docker"]["status"] == "ok": if status["checks"]["docker"]["status"] == "ok":
for c in status["checks"]["docker"]["containers"]: for c in status["checks"]["docker"]["containers"]:
service_name = c["name"] service_name = c["name"]
if service_name == "stability-agent":
continue # Already published above
service_health = "healthy" if c["state"] == "running" else "unhealthy" service_health = "healthy" if c["state"] == "running" else "unhealthy"
redis_client.hset(f"homelab:services:{NODE_NAME}:{service_name}", { redis_client.hset(f"homelab:services:{NODE_NAME}:{service_name}", {