Roll out stability agent to homelab nodes

This commit is contained in:
oskar 2026-05-17 15:54:19 +02:00
parent 3233cf07cd
commit c9ddfa9ac1
12 changed files with 249 additions and 40 deletions

View file

@ -0,0 +1,75 @@
# Stability Agent Multi-Node Rollout
## Architecture Summary
The `stability-agent` is a lightweight Python service that monitors node health (disk, Docker containers, Tailscale, MQTT) and publishes state to a central Redis instance running on **PIHA**.
- **Source**: `services/stability-agent`
- **State Path**: `/opt/homelab/state`
- **Events Path**: `/opt/homelab/events`
- **Redis Target**: `100.108.208.3:6379` (PIHA)
## Why the UI only showed CHELSTY
Previously, the `stability-agent` had `NODE_NAME` defaulted to `chelsty` and was only deployed there. The Agent System UI materializer on PIHA filters nodes based on the Redis keys `homelab:nodes:<NODE_NAME>`. Without other agents publishing their specific `NODE_NAME`, the UI remained limited to the single active node.
## Deployment Commands
Use the helper script to print the deployment commands for a node (the script only prints them; it does not run anything):
```bash
./scripts/deploy/deploy-stability-agent.sh <node-name>
```
### PIHA
```bash
cd ~/homelab-codex-ws
git pull
cd services/stability-agent
NODE_NAME=piha REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
```
### CHELSTY
```bash
cd ~/homelab-codex-ws
git pull
cd services/stability-agent
NODE_NAME=chelsty REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
```
### SOLARIA
```bash
cd ~/homelab-codex-ws
git pull
cd services/stability-agent
NODE_NAME=solaria REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
```
### VPS
```bash
cd ~/homelab-codex-ws
git pull
cd services/stability-agent
NODE_NAME=vps REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
```
### SATURN (Optional)
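Saturn is the orchestrator and can optionally run the stability-agent. If deployed, follow the same pattern with `NODE_NAME=saturn`. Note that the helper script only accepts `chelsty`, `piha`, `solaria`, and `vps`, so the Saturn commands must be entered manually.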
## Verification (on PIHA)
Verify Redis keys:
```bash
docker exec agent-system-redis redis-cli KEYS 'homelab:nodes:*'
docker exec agent-system-redis redis-cli HGETALL homelab:nodes:<node-name>
```
Verify Web UI backend:
```bash
curl -s http://127.0.0.1:18180/nodes
curl -k https://agents.okit.pl/nodes
```
## Troubleshooting
- **Redis empty after compose down**: The `agent-system-redis` container on PIHA stores its data transiently unless it is configured with a volume, so its keys are lost whenever it restarts or is recreated. No manual action is needed: each agent republishes its state automatically on its next check cycle (every `CHECK_INTERVAL`).
- **Secrets**: `.env` files and local secrets are not committed to the repo. Ensure `MQTT_HOST` and any other host-specific settings or secrets are set via overrides if needed.
- **Telegram**: Telegram bot notifications can remain disabled if `TELEGRAM_BOT_TOKEN` is absent.
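- **Docker Socket**: If the agent reports `unavailable` for Docker, the socket file was not found; ensure `/var/run/docker.sock` is mounted into the agent container. If it reports `error`, the socket exists but the Docker API could not be reached; check that the agent's user has permission to access the socket.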

View file

@ -1,6 +1,10 @@
services:
stability-agent:
environment:
- NODE_NAME=chelsty
- REDIS_HOST=100.108.208.3
- REDIS_PORT=6379
- REDIS_ENABLED=true
- STABILITY_CHECK_INTERVAL=60
- DISK_THRESHOLD_PCT=85
- MQTT_HOST=mosquitto

View file

@ -117,7 +117,7 @@ services:
- mosquitto
external: []
runtime:
config_path: null
config_path: /opt/homelab/config/stability-agent
data_path: /opt/homelab/state
logs_path: /opt/homelab/events
backup:

View file

@ -0,0 +1,7 @@
services:
stability-agent:
environment:
- NODE_NAME=piha
- REDIS_HOST=100.108.208.3
- REDIS_PORT=6379
- REDIS_ENABLED=true

15
hosts/piha/services.yaml Normal file
View file

@ -0,0 +1,15 @@
host: piha
services:
stability-agent:
role: node-stability-monitor
deployment_model: docker-compose
exposure: local-only
offline_required: true
depends_on:
local: []
external: []
runtime:
config_path: /opt/homelab/config/stability-agent
data_path: /opt/homelab/state
logs_path: /opt/homelab/events

View file

@ -0,0 +1,7 @@
services:
stability-agent:
environment:
- NODE_NAME=solaria
- REDIS_HOST=100.108.208.3
- REDIS_PORT=6379
- REDIS_ENABLED=true

View file

@ -0,0 +1,15 @@
host: solaria
services:
stability-agent:
role: node-stability-monitor
deployment_model: docker-compose
exposure: local-only
offline_required: true
depends_on:
local: []
external: []
runtime:
config_path: /opt/homelab/config/stability-agent
data_path: /opt/homelab/state
logs_path: /opt/homelab/events

View file

@ -0,0 +1,7 @@
services:
stability-agent:
environment:
- NODE_NAME=vps
- REDIS_HOST=100.108.208.3
- REDIS_PORT=6379
- REDIS_ENABLED=true

15
hosts/vps/services.yaml Normal file
View file

@ -0,0 +1,15 @@
host: vps
services:
stability-agent:
role: node-stability-monitor
deployment_model: docker-compose
exposure: local-only
offline_required: true
depends_on:
local: []
external: []
runtime:
config_path: /opt/homelab/config/stability-agent
data_path: /opt/homelab/state
logs_path: /opt/homelab/events

View file

@ -0,0 +1,38 @@
#!/usr/bin/env bash
# deploy-stability-agent.sh - Helper to print deployment commands for stability-agent
#
# Usage: deploy-stability-agent.sh <node-name>
#
# Prints, but never executes, the shell commands needed to (re)deploy the
# stability-agent on the given node. The commands go to stdout; usage and
# error messages go to stderr so that captured or piped output only ever
# contains commands.
#
# Exit status: 0 on success, 1 if the node name is missing or unsupported.

# Default to an empty string so a missing argument reaches the usage branch.
NODE="${1:-}"

# Single-quoted on purpose: the tilde is printed literally and is expanded by
# the shell on the node where the operator pastes the commands.
REPO_PATH='~/homelab-codex-ws'

# Human-readable list used in both the usage and the error message.
# Keep it in sync with the case pattern below.
SUPPORTED_NODES="chelsty, piha, solaria, vps"

if [[ -z "$NODE" ]]; then
    echo "Usage: $0 <node-name>" >&2
    echo "Supported nodes: $SUPPORTED_NODES" >&2
    exit 1
fi

case "$NODE" in
    chelsty|piha|solaria|vps)
        ;;
    *)
        echo "Error: Unknown node '$NODE'" >&2
        echo "Supported nodes: $SUPPORTED_NODES" >&2
        exit 1
        ;;
esac

# Environment and arguments shared by the Compose V2 and V1 command lines.
AGENT_ENV="NODE_NAME=$NODE REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true"
COMPOSE_ARGS="up -d --build --force-recreate"
OVERRIDE_FILE="../../hosts/$NODE/runtime/stability-agent/docker-compose.override.yml"

# Unquoted heredoc delimiter: variables are expanded, everything else
# (including the tilde and the single quotes) is printed verbatim.
cat <<EOF
# --- Deployment commands for $NODE ---
cd $REPO_PATH
git fetch origin
git checkout master
git pull
cd services/stability-agent

# Command (Docker Compose V2):
$AGENT_ENV docker compose $COMPOSE_ARGS

# Command (Docker Compose V1):
$AGENT_ENV docker-compose $COMPOSE_ARGS

# Notes:
# - If using host-specific overrides: add '-f docker-compose.yml -f $OVERRIDE_FILE'
#   (an explicit -f disables auto-discovery, so the base file must be listed first)
# - Ensure /opt/homelab/state and /opt/homelab/events exist on the host.
EOF

View file

@ -1,19 +1,25 @@
### Stability Agent
A lightweight filesystem-first watchdog and observer agent for CHELSTY.
A lightweight filesystem-first watchdog and observer agent for homelab nodes.
#### Features
* **Continuous Monitoring**: Runs as a background service.
* **Docker Inspection**: Checks container status via read-only Docker socket.
* **Docker Inspection**: Checks container status via read-only Docker socket (optional).
* **Disk Usage**: Monitors local disk utilization.
* **Tailscale Check**: Verifies Tailscale availability.
* **MQTT Reachability**: Checks connectivity to the local MQTT broker.
* **Zigbee2MQTT Monitoring**: Specifically monitors the Zigbee2MQTT container.
* **Redis Publishing**: (Optional) Publishes runtime state and events to a central Redis server.
* **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/chelsty/`.
* **Tailscale Check**: Verifies Tailscale availability (optional).
* **MQTT Reachability**: Checks connectivity to a configured MQTT broker (optional).
* **Redis Publishing**: Publishes runtime state and events to a central Redis server (PIHA).
* **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/<NODE_NAME>/`.
* **State Reporting**: Writes heartbeat and status summary to `/opt/homelab/state/`.
#### Deployment
Use the deployment helper script:
```bash
./scripts/deploy/deploy-stability-agent.sh <NODE_NAME>
```
#### Configuration
Environment variables:
@ -33,10 +39,10 @@ You can verify the Redis publishing using `redis-cli`:
```bash
# Check node state
redis-cli -h 100.108.208.3 HGETALL homelab:nodes:chelsty
redis-cli -h 100.108.208.3 HGETALL homelab:nodes:<NODE_NAME>
# Check service discovery
redis-cli -h 100.108.208.3 HGETALL homelab:services:chelsty:stability-agent
redis-cli -h 100.108.208.3 HGETALL homelab:services:<NODE_NAME>:stability-agent
# Check event stream
redis-cli -h 100.108.208.3 XRANGE homelab:events - +
@ -55,7 +61,7 @@ Events are written as JSON lines with the following fields:
* `id`: Unique event UUID.
* `timestamp`: ISO 8601 timestamp (UTC).
* `node`: `chelsty`.
* `node`: `<NODE_NAME>`.
* `source`: `stability-agent`.
* `type`: Type of event (e.g., `disk_usage_high`, `containers_not_running`).
* `severity`: `info`, `warning`, or `error`.

View file

@ -43,7 +43,7 @@ def emit_event(event_type, severity, message, service=None, details=None):
}
if service:
event["service"] = service
date_str = get_datestamp()
event_dir = os.path.join(EVENTS_BASE_DIR, date_str, NODE_NAME)
try:
@ -53,7 +53,7 @@ def emit_event(event_type, severity, message, service=None, details=None):
f.write(json.dumps(event) + "\n")
except Exception as e:
print(f"Failed to write event to filesystem: {e}")
# Redis publishing
if REDIS_ENABLED and redis_client:
try:
@ -68,7 +68,7 @@ def emit_event(event_type, severity, message, service=None, details=None):
except Exception as e:
print(f"Failed to publish event to Redis: {e}")
# Do not crash, already logged to filesystem
print(f"[{severity}] {message}")
def check_disk():
@ -80,10 +80,10 @@ def check_disk():
"free_gb": free // (2**30),
"percent": round(percent, 2)
}
if percent > DISK_THRESHOLD_PCT:
emit_event("disk_usage_high", "warning", f"Disk usage is high: {details['percent']}%", details=details)
return details
class DockerClient:
@ -121,34 +121,37 @@ class DockerClient:
def check_docker():
client = DockerClient()
if not os.path.exists(client.socket_path):
return {"status": "unavailable", "message": "Docker socket not found"}
containers = client.get_containers()
if containers is None:
emit_event("docker_socket_error", "error", "Could not connect to Docker socket or socket missing")
return {"status": "error", "error": "Could not connect to Docker socket"}
emit_event("docker_api_error", "warning", "Could not connect to Docker socket API")
return {"status": "error", "error": "Could not connect to Docker socket API"}
summary = []
unhealthy_containers = []
for c in containers:
state = c.get("State", "")
status = c.get("Status", "")
name = c.get("Names", ["unknown"])[0].lstrip("/")
container_info = {
"name": name,
"state": state,
"status": status
}
summary.append(container_info)
if state != "running":
unhealthy_containers.append(container_info)
if unhealthy_containers:
names = [c["name"] for c in unhealthy_containers]
# Only emit warning for containers that should be running?
# Only emit warning for containers that should be running?
# For now, we report any non-running container found by Docker.
emit_event("containers_not_running", "warning", f"Some containers are not running: {', '.join(names)}", details={"containers": unhealthy_containers})
return {"status": "ok", "containers": summary}
def check_tailscale():
@ -156,7 +159,7 @@ def check_tailscale():
socket_path = "/var/run/tailscale/tailscaled.sock"
socket_available = os.path.exists(socket_path)
interface_available = os.path.exists("/sys/class/net/tailscale0")
return {
"available": socket_available or interface_available,
"details": {
@ -168,7 +171,7 @@ def check_tailscale():
def check_mqtt():
if not MQTT_HOST:
return {"configured": False}
try:
with socket.create_connection((MQTT_HOST, MQTT_PORT), timeout=5):
return {"configured": True, "reachable": True}
@ -203,13 +206,13 @@ class RedisClient:
def _send_command(self, *args):
if not self._connect():
return False
# RESP array
cmd = f"*{len(args)}\r\n"
for arg in args:
s_arg = str(arg)
cmd += f"${len(s_arg.encode('utf-8'))}\r\n{s_arg}\r\n"
try:
self.sock.sendall(cmd.encode('utf-8'))
# Basic response reading
@ -241,11 +244,11 @@ redis_client = RedisClient(REDIS_HOST, REDIS_PORT) if REDIS_ENABLED else None
def main():
print(f"Starting stability-agent on {NODE_NAME}...")
# Ensure directories exist
os.makedirs(STATE_DIR, exist_ok=True)
os.makedirs(EVENTS_BASE_DIR, exist_ok=True)
while True:
try:
status = {
@ -253,12 +256,12 @@ def main():
"node": NODE_NAME,
"checks": {}
}
status["checks"]["disk"] = check_disk()
status["checks"]["docker"] = check_docker()
status["checks"]["tailscale"] = check_tailscale()
status["checks"]["mqtt"] = check_mqtt()
# Zigbee2MQTT container check
z2m_present = False
z2m_running = False
@ -268,20 +271,20 @@ def main():
z2m_present = True
if c["state"] == "running":
z2m_running = True
status["checks"]["zigbee2mqtt"] = {
"present": z2m_present,
"running": z2m_running
}
# Write heartbeat
with open(HEARTBEAT_FILE, "w") as f:
f.write(get_timestamp())
# Write status summary
with open(STATUS_FILE, "w") as f:
json.dump(status, f, indent=2)
# Redis publishing
if REDIS_ENABLED and redis_client:
try:
@ -290,7 +293,8 @@ def main():
for check in status["checks"].values():
if isinstance(check, dict) and check.get("status") == "error":
node_health = "unhealthy"
# Redis publishing for node state
redis_client.hset(f"homelab:nodes:{NODE_NAME}", {
"id": NODE_NAME,
"hostname": NODE_NAME,
@ -300,13 +304,29 @@ def main():
"capabilities": json.dumps(["docker", "tailscale", "mqtt", "disk"]),
"checks": json.dumps(status["checks"])
})
# Always publish stability-agent itself as a service
redis_client.hset(f"homelab:services:{NODE_NAME}:stability-agent", {
"name": "stability-agent",
"node": NODE_NAME,
"health": "healthy",
"desired_state": "running",
"actual_state": "running",
"deployment_state": "deployed",
"updated_at": status["timestamp"],
"dependencies": json.dumps([]),
"recommendations": json.dumps([])
})
# Services discovered from Docker
if status["checks"]["docker"]["status"] == "ok":
for c in status["checks"]["docker"]["containers"]:
service_name = c["name"]
if service_name == "stability-agent":
continue # Already published above
service_health = "healthy" if c["state"] == "running" else "unhealthy"
redis_client.hset(f"homelab:services:{NODE_NAME}:{service_name}", {
"name": service_name,
"node": NODE_NAME,
@ -326,7 +346,7 @@ def main():
except Exception as e:
print(f"Error in main loop: {e}")
emit_event("agent_error", "error", f"Internal agent error: {e}", details={"error": str(e)})
time.sleep(CHECK_INTERVAL)
if __name__ == "__main__":