Roll out stability agent to homelab nodes
This commit is contained in:
parent
3233cf07cd
commit
c9ddfa9ac1
75
docs/stability-agent-rollout.md
Normal file
75
docs/stability-agent-rollout.md
Normal file
|
|
@ -0,0 +1,75 @@
|
||||||
|
# Stability Agent Multi-Node Rollout
|
||||||
|
|
||||||
|
## Architecture Summary
|
||||||
|
The `stability-agent` is a lightweight Python service that monitors node health (disk, Docker containers, Tailscale, MQTT) and publishes state to a central Redis instance running on **PIHA**.
|
||||||
|
|
||||||
|
- **Source**: `services/stability-agent`
|
||||||
|
- **State Path**: `/opt/homelab/state`
|
||||||
|
- **Events Path**: `/opt/homelab/events`
|
||||||
|
- **Redis Target**: `100.108.208.3:6379` (PIHA)
|
||||||
|
|
||||||
|
## Why the UI only showed CHELSTY
|
||||||
|
Previously, the `stability-agent` defaulted `NODE_NAME` to `chelsty` and was only deployed there. The Agent System UI materializer on PIHA filters nodes based on the Redis keys `homelab:nodes:<NODE_NAME>`. Because no other agents were publishing under their own `NODE_NAME`, the UI remained limited to that single active node.
|
||||||
|
|
||||||
|
## Deployment Commands
|
||||||
|
|
||||||
|
Use the helper script to print the deployment commands for a node (it only prints them; it does not execute anything):
|
||||||
|
```bash
|
||||||
|
./scripts/deploy/deploy-stability-agent.sh <node-name>
|
||||||
|
```
|
||||||
|
|
||||||
|
### PIHA
|
||||||
|
```bash
|
||||||
|
cd ~/homelab-codex-ws
|
||||||
|
git pull
|
||||||
|
cd services/stability-agent
|
||||||
|
NODE_NAME=piha REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
|
||||||
|
```
|
||||||
|
|
||||||
|
### CHELSTY
|
||||||
|
```bash
|
||||||
|
cd ~/homelab-codex-ws
|
||||||
|
git pull
|
||||||
|
cd services/stability-agent
|
||||||
|
NODE_NAME=chelsty REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
|
||||||
|
```
|
||||||
|
|
||||||
|
### SOLARIA
|
||||||
|
```bash
|
||||||
|
cd ~/homelab-codex-ws
|
||||||
|
git pull
|
||||||
|
cd services/stability-agent
|
||||||
|
NODE_NAME=solaria REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
|
||||||
|
```
|
||||||
|
|
||||||
|
### VPS
|
||||||
|
```bash
|
||||||
|
cd ~/homelab-codex-ws
|
||||||
|
git pull
|
||||||
|
cd services/stability-agent
|
||||||
|
NODE_NAME=vps REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
|
||||||
|
```
|
||||||
|
|
||||||
|
### SATURN (Optional)
|
||||||
|
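Saturn is the orchestrator and can optionally run the stability-agent. If deployed, follow the same pattern with `NODE_NAME=saturn`. Note that the helper script only accepts `chelsty`, `piha`, `solaria`, and `vps`, so run the compose command manually on Saturn.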
|
||||||
|
|
||||||
|
## Verification (on PIHA)
|
||||||
|
|
||||||
|
Verify Redis keys:
|
||||||
|
```bash
|
||||||
|
docker exec agent-system-redis redis-cli KEYS 'homelab:nodes:*'
|
||||||
|
docker exec agent-system-redis redis-cli HGETALL homelab:nodes:<node-name>
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify Web UI backend:
|
||||||
|
```bash
|
||||||
|
curl -s http://127.0.0.1:18180/nodes
|
||||||
|
curl -k https://agents.okit.pl/nodes
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
- **Redis empty after compose down**: The `agent-system-redis` container on PIHA uses transient storage unless it is configured with a volume. If it restarts, its keys are lost until the agents republish their state (they do this automatically every `STABILITY_CHECK_INTERVAL`).
|
||||||
|
- **Secrets**: `.env` files and local secrets are not committed to the repo. Ensure `MQTT_HOST` and any other host-specific settings or secrets are set via overrides if needed.
|
||||||
|
- **Telegram**: Telegram bot notifications are optional; leave `TELEGRAM_BOT_TOKEN` unset to keep them disabled.
|
||||||
|
- **Docker Socket**: If the agent reports `unavailable` for Docker, ensure `/var/run/docker.sock` is mounted into the container and that the user the agent runs as has permission to access it.
|
||||||
|
|
@ -1,6 +1,10 @@
|
||||||
services:
|
services:
|
||||||
stability-agent:
|
stability-agent:
|
||||||
environment:
|
environment:
|
||||||
|
- NODE_NAME=chelsty
|
||||||
|
- REDIS_HOST=100.108.208.3
|
||||||
|
- REDIS_PORT=6379
|
||||||
|
- REDIS_ENABLED=true
|
||||||
- STABILITY_CHECK_INTERVAL=60
|
- STABILITY_CHECK_INTERVAL=60
|
||||||
- DISK_THRESHOLD_PCT=85
|
- DISK_THRESHOLD_PCT=85
|
||||||
- MQTT_HOST=mosquitto
|
- MQTT_HOST=mosquitto
|
||||||
|
|
|
||||||
|
|
@ -117,7 +117,7 @@ services:
|
||||||
- mosquitto
|
- mosquitto
|
||||||
external: []
|
external: []
|
||||||
runtime:
|
runtime:
|
||||||
config_path: null
|
config_path: /opt/homelab/config/stability-agent
|
||||||
data_path: /opt/homelab/state
|
data_path: /opt/homelab/state
|
||||||
logs_path: /opt/homelab/events
|
logs_path: /opt/homelab/events
|
||||||
backup:
|
backup:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
services:
|
||||||
|
stability-agent:
|
||||||
|
environment:
|
||||||
|
- NODE_NAME=piha
|
||||||
|
- REDIS_HOST=100.108.208.3
|
||||||
|
- REDIS_PORT=6379
|
||||||
|
- REDIS_ENABLED=true
|
||||||
15
hosts/piha/services.yaml
Normal file
15
hosts/piha/services.yaml
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
host: piha
|
||||||
|
|
||||||
|
services:
|
||||||
|
stability-agent:
|
||||||
|
role: node-stability-monitor
|
||||||
|
deployment_model: docker-compose
|
||||||
|
exposure: local-only
|
||||||
|
offline_required: true
|
||||||
|
depends_on:
|
||||||
|
local: []
|
||||||
|
external: []
|
||||||
|
runtime:
|
||||||
|
config_path: /opt/homelab/config/stability-agent
|
||||||
|
data_path: /opt/homelab/state
|
||||||
|
logs_path: /opt/homelab/events
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
services:
|
||||||
|
stability-agent:
|
||||||
|
environment:
|
||||||
|
- NODE_NAME=solaria
|
||||||
|
- REDIS_HOST=100.108.208.3
|
||||||
|
- REDIS_PORT=6379
|
||||||
|
- REDIS_ENABLED=true
|
||||||
15
hosts/solaria/services.yaml
Normal file
15
hosts/solaria/services.yaml
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
host: solaria
|
||||||
|
|
||||||
|
services:
|
||||||
|
stability-agent:
|
||||||
|
role: node-stability-monitor
|
||||||
|
deployment_model: docker-compose
|
||||||
|
exposure: local-only
|
||||||
|
offline_required: true
|
||||||
|
depends_on:
|
||||||
|
local: []
|
||||||
|
external: []
|
||||||
|
runtime:
|
||||||
|
config_path: /opt/homelab/config/stability-agent
|
||||||
|
data_path: /opt/homelab/state
|
||||||
|
logs_path: /opt/homelab/events
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
services:
|
||||||
|
stability-agent:
|
||||||
|
environment:
|
||||||
|
- NODE_NAME=vps
|
||||||
|
- REDIS_HOST=100.108.208.3
|
||||||
|
- REDIS_PORT=6379
|
||||||
|
- REDIS_ENABLED=true
|
||||||
15
hosts/vps/services.yaml
Normal file
15
hosts/vps/services.yaml
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
host: vps
|
||||||
|
|
||||||
|
services:
|
||||||
|
stability-agent:
|
||||||
|
role: node-stability-monitor
|
||||||
|
deployment_model: docker-compose
|
||||||
|
exposure: local-only
|
||||||
|
offline_required: true
|
||||||
|
depends_on:
|
||||||
|
local: []
|
||||||
|
external: []
|
||||||
|
runtime:
|
||||||
|
config_path: /opt/homelab/config/stability-agent
|
||||||
|
data_path: /opt/homelab/state
|
||||||
|
logs_path: /opt/homelab/events
|
||||||
38
scripts/deploy/deploy-stability-agent.sh
Executable file
38
scripts/deploy/deploy-stability-agent.sh
Executable file
|
|
@ -0,0 +1,38 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# deploy-stability-agent.sh - Helper to print deployment commands for stability-agent
|
||||||
|
|
||||||
|
NODE=$1
|
||||||
|
REPO_PATH="~/homelab-codex-ws"
|
||||||
|
|
||||||
|
if [[ -z "$NODE" ]]; then
|
||||||
|
echo "Usage: $0 <node-name>"
|
||||||
|
echo "Supported nodes: chelsty, piha, solaria, vps"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
case "$NODE" in
|
||||||
|
chelsty|piha|solaria|vps)
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "Error: Unknown node '$NODE'"
|
||||||
|
echo "Supported nodes: chelsty, piha, solaria, vps"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
echo "# --- Deployment commands for $NODE ---"
|
||||||
|
echo "cd $REPO_PATH"
|
||||||
|
echo "git fetch origin"
|
||||||
|
echo "git checkout master"
|
||||||
|
echo "git pull"
|
||||||
|
echo "cd services/stability-agent"
|
||||||
|
echo ""
|
||||||
|
echo "# Command (Docker Compose V2):"
|
||||||
|
echo "NODE_NAME=$NODE REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate"
|
||||||
|
echo ""
|
||||||
|
echo "# Command (Docker Compose V1):"
|
||||||
|
echo "NODE_NAME=$NODE REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker-compose up -d --build --force-recreate"
|
||||||
|
echo ""
|
||||||
|
echo "# Notes:"
|
||||||
|
echo "# - If using host-specific overrides: add '-f ../../hosts/$NODE/runtime/stability-agent/docker-compose.override.yml'"
|
||||||
|
echo "# - Ensure /opt/homelab/state and /opt/homelab/events exist on the host."
|
||||||
|
|
@ -1,19 +1,25 @@
|
||||||
### Stability Agent
|
### Stability Agent
|
||||||
|
|
||||||
A lightweight filesystem-first watchdog and observer agent for CHELSTY.
|
A lightweight filesystem-first watchdog and observer agent for homelab nodes.
|
||||||
|
|
||||||
#### Features
|
#### Features
|
||||||
|
|
||||||
* **Continuous Monitoring**: Runs as a background service.
|
* **Continuous Monitoring**: Runs as a background service.
|
||||||
* **Docker Inspection**: Checks container status via read-only Docker socket.
|
* **Docker Inspection**: Checks container status via read-only Docker socket (optional).
|
||||||
* **Disk Usage**: Monitors local disk utilization.
|
* **Disk Usage**: Monitors local disk utilization.
|
||||||
* **Tailscale Check**: Verifies Tailscale availability.
|
* **Tailscale Check**: Verifies Tailscale availability (optional).
|
||||||
* **MQTT Reachability**: Checks connectivity to the local MQTT broker.
|
* **MQTT Reachability**: Checks connectivity to a configured MQTT broker (optional).
|
||||||
* **Zigbee2MQTT Monitoring**: Specifically monitors the Zigbee2MQTT container.
|
* **Redis Publishing**: Publishes runtime state and events to a central Redis server (PIHA).
|
||||||
* **Redis Publishing**: (Optional) Publishes runtime state and events to a central Redis server.
|
* **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/<NODE_NAME>/`.
|
||||||
* **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/chelsty/`.
|
|
||||||
* **State Reporting**: Writes heartbeat and status summary to `/opt/homelab/state/`.
|
* **State Reporting**: Writes heartbeat and status summary to `/opt/homelab/state/`.
|
||||||
|
|
||||||
|
#### Deployment
|
||||||
|
|
||||||
|
Use the deployment helper script:
|
||||||
|
```bash
|
||||||
|
./scripts/deploy/deploy-stability-agent.sh <NODE_NAME>
|
||||||
|
```
|
||||||
|
|
||||||
#### Configuration
|
#### Configuration
|
||||||
|
|
||||||
Environment variables:
|
Environment variables:
|
||||||
|
|
@ -33,10 +39,10 @@ You can verify the Redis publishing using `redis-cli`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Check node state
|
# Check node state
|
||||||
redis-cli -h 100.108.208.3 HGETALL homelab:nodes:chelsty
|
redis-cli -h 100.108.208.3 HGETALL homelab:nodes:<NODE_NAME>
|
||||||
|
|
||||||
# Check service discovery
|
# Check service discovery
|
||||||
redis-cli -h 100.108.208.3 HGETALL homelab:services:chelsty:stability-agent
|
redis-cli -h 100.108.208.3 HGETALL homelab:services:<NODE_NAME>:stability-agent
|
||||||
|
|
||||||
# Check event stream
|
# Check event stream
|
||||||
redis-cli -h 100.108.208.3 XRANGE homelab:events - +
|
redis-cli -h 100.108.208.3 XRANGE homelab:events - +
|
||||||
|
|
@ -55,7 +61,7 @@ Events are written as JSON lines with the following fields:
|
||||||
|
|
||||||
* `id`: Unique event UUID.
|
* `id`: Unique event UUID.
|
||||||
* `timestamp`: ISO 8601 timestamp (UTC).
|
* `timestamp`: ISO 8601 timestamp (UTC).
|
||||||
* `node`: `chelsty`.
|
* `node`: `<NODE_NAME>`.
|
||||||
* `source`: `stability-agent`.
|
* `source`: `stability-agent`.
|
||||||
* `type`: Type of event (e.g., `disk_usage_high`, `containers_not_running`).
|
* `type`: Type of event (e.g., `disk_usage_high`, `containers_not_running`).
|
||||||
* `severity`: `info`, `warning`, or `error`.
|
* `severity`: `info`, `warning`, or `error`.
|
||||||
|
|
|
||||||
|
|
@ -121,10 +121,13 @@ class DockerClient:
|
||||||
|
|
||||||
def check_docker():
|
def check_docker():
|
||||||
client = DockerClient()
|
client = DockerClient()
|
||||||
|
if not os.path.exists(client.socket_path):
|
||||||
|
return {"status": "unavailable", "message": "Docker socket not found"}
|
||||||
|
|
||||||
containers = client.get_containers()
|
containers = client.get_containers()
|
||||||
if containers is None:
|
if containers is None:
|
||||||
emit_event("docker_socket_error", "error", "Could not connect to Docker socket or socket missing")
|
emit_event("docker_api_error", "warning", "Could not connect to Docker socket API")
|
||||||
return {"status": "error", "error": "Could not connect to Docker socket"}
|
return {"status": "error", "error": "Could not connect to Docker socket API"}
|
||||||
|
|
||||||
summary = []
|
summary = []
|
||||||
unhealthy_containers = []
|
unhealthy_containers = []
|
||||||
|
|
@ -291,6 +294,7 @@ def main():
|
||||||
if isinstance(check, dict) and check.get("status") == "error":
|
if isinstance(check, dict) and check.get("status") == "error":
|
||||||
node_health = "unhealthy"
|
node_health = "unhealthy"
|
||||||
|
|
||||||
|
# Redis publishing for node state
|
||||||
redis_client.hset(f"homelab:nodes:{NODE_NAME}", {
|
redis_client.hset(f"homelab:nodes:{NODE_NAME}", {
|
||||||
"id": NODE_NAME,
|
"id": NODE_NAME,
|
||||||
"hostname": NODE_NAME,
|
"hostname": NODE_NAME,
|
||||||
|
|
@ -301,10 +305,26 @@ def main():
|
||||||
"checks": json.dumps(status["checks"])
|
"checks": json.dumps(status["checks"])
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Always publish stability-agent itself as a service
|
||||||
|
redis_client.hset(f"homelab:services:{NODE_NAME}:stability-agent", {
|
||||||
|
"name": "stability-agent",
|
||||||
|
"node": NODE_NAME,
|
||||||
|
"health": "healthy",
|
||||||
|
"desired_state": "running",
|
||||||
|
"actual_state": "running",
|
||||||
|
"deployment_state": "deployed",
|
||||||
|
"updated_at": status["timestamp"],
|
||||||
|
"dependencies": json.dumps([]),
|
||||||
|
"recommendations": json.dumps([])
|
||||||
|
})
|
||||||
|
|
||||||
# Services discovered from Docker
|
# Services discovered from Docker
|
||||||
if status["checks"]["docker"]["status"] == "ok":
|
if status["checks"]["docker"]["status"] == "ok":
|
||||||
for c in status["checks"]["docker"]["containers"]:
|
for c in status["checks"]["docker"]["containers"]:
|
||||||
service_name = c["name"]
|
service_name = c["name"]
|
||||||
|
if service_name == "stability-agent":
|
||||||
|
continue # Already published above
|
||||||
|
|
||||||
service_health = "healthy" if c["state"] == "running" else "unhealthy"
|
service_health = "healthy" if c["state"] == "running" else "unhealthy"
|
||||||
|
|
||||||
redis_client.hset(f"homelab:services:{NODE_NAME}:{service_name}", {
|
redis_client.hset(f"homelab:services:{NODE_NAME}:{service_name}", {
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue