Roll out stability agent to homelab nodes
This commit is contained in:
parent
3233cf07cd
commit
c9ddfa9ac1
75
docs/stability-agent-rollout.md
Normal file
75
docs/stability-agent-rollout.md
Normal file
|
|
@ -0,0 +1,75 @@
|
||||||
|
# Stability Agent Multi-Node Rollout
|
||||||
|
|
||||||
|
## Architecture Summary
|
||||||
|
The `stability-agent` is a lightweight Python service that monitors node health (disk, Docker containers, Tailscale, MQTT) and publishes state to a central Redis instance running on **PIHA**.
|
||||||
|
|
||||||
|
- **Source**: `services/stability-agent`
|
||||||
|
- **State Path**: `/opt/homelab/state`
|
||||||
|
- **Events Path**: `/opt/homelab/events`
|
||||||
|
- **Redis Target**: `100.108.208.3:6379` (PIHA)
|
||||||
|
|
||||||
|
## Why the UI only showed CHELSTY
|
||||||
|
Previously, `NODE_NAME` in the `stability-agent` defaulted to `chelsty`, and the agent was deployed only on that node. The Agent System UI materializer on PIHA filters nodes based on the Redis keys `homelab:nodes:<NODE_NAME>`. Because no other agent was publishing under its own `NODE_NAME`, the UI showed only that single active node.
|
||||||
|
|
||||||
|
## Deployment Commands
|
||||||
|
|
||||||
|
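Use the helper script to print the deployment commands for a node (the script only prints them; run the printed commands on the target node yourself):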
|
||||||
|
```bash
|
||||||
|
./scripts/deploy/deploy-stability-agent.sh <node-name>
|
||||||
|
```
|
||||||
|
|
||||||
|
### PIHA
|
||||||
|
```bash
|
||||||
|
cd ~/homelab-codex-ws
|
||||||
|
git pull
|
||||||
|
cd services/stability-agent
|
||||||
|
NODE_NAME=piha REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
|
||||||
|
```
|
||||||
|
|
||||||
|
### CHELSTY
|
||||||
|
```bash
|
||||||
|
cd ~/homelab-codex-ws
|
||||||
|
git pull
|
||||||
|
cd services/stability-agent
|
||||||
|
NODE_NAME=chelsty REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
|
||||||
|
```
|
||||||
|
|
||||||
|
### SOLARIA
|
||||||
|
```bash
|
||||||
|
cd ~/homelab-codex-ws
|
||||||
|
git pull
|
||||||
|
cd services/stability-agent
|
||||||
|
NODE_NAME=solaria REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
|
||||||
|
```
|
||||||
|
|
||||||
|
### VPS
|
||||||
|
```bash
|
||||||
|
cd ~/homelab-codex-ws
|
||||||
|
git pull
|
||||||
|
cd services/stability-agent
|
||||||
|
NODE_NAME=vps REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate
|
||||||
|
```
|
||||||
|
|
||||||
|
### SATURN (Optional)
|
||||||
|
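Saturn is the orchestrator and can optionally run the stability-agent. If deployed, follow the same pattern as the nodes above with `NODE_NAME=saturn`. Note that `deploy-stability-agent.sh` only accepts `chelsty`, `piha`, `solaria` and `vps`, so run the compose command for Saturn manually.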
|
||||||
|
|
||||||
|
## Verification (on PIHA)
|
||||||
|
|
||||||
|
Verify Redis keys:
|
||||||
|
```bash
|
||||||
|
docker exec agent-system-redis redis-cli KEYS 'homelab:nodes:*'
|
||||||
|
docker exec agent-system-redis redis-cli HGETALL homelab:nodes:<node-name>
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify Web UI backend:
|
||||||
|
```bash
|
||||||
|
curl -s http://127.0.0.1:18180/nodes
|
||||||
|
curl -k https://agents.okit.pl/nodes
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
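- **Redis empty after compose down**: The `agent-system-redis` container on PIHA keeps its data in transient storage unless it is configured with a volume. If it restarts, the keys stay empty until the agents republish their state, which they do automatically on every check cycle (`CHECK_INTERVAL`; the compose files set `STABILITY_CHECK_INTERVAL=60`).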
|
||||||
|
- **Secrets**: `.env` files and local secrets are not committed to the repo. Set `MQTT_HOST` and any other host-specific values or secrets through overrides where needed.
|
||||||
|
- **Telegram**: Telegram bot notifications can remain disabled if `TELEGRAM_BOT_TOKEN` is absent.
|
||||||
|
- **Docker Socket**: If the agent reports `unavailable` for Docker, ensure `/var/run/docker.sock` is mounted and the user has permissions.
|
||||||
|
|
@ -1,6 +1,10 @@
|
||||||
services:
|
services:
|
||||||
stability-agent:
|
stability-agent:
|
||||||
environment:
|
environment:
|
||||||
|
- NODE_NAME=chelsty
|
||||||
|
- REDIS_HOST=100.108.208.3
|
||||||
|
- REDIS_PORT=6379
|
||||||
|
- REDIS_ENABLED=true
|
||||||
- STABILITY_CHECK_INTERVAL=60
|
- STABILITY_CHECK_INTERVAL=60
|
||||||
- DISK_THRESHOLD_PCT=85
|
- DISK_THRESHOLD_PCT=85
|
||||||
- MQTT_HOST=mosquitto
|
- MQTT_HOST=mosquitto
|
||||||
|
|
|
||||||
|
|
@ -117,7 +117,7 @@ services:
|
||||||
- mosquitto
|
- mosquitto
|
||||||
external: []
|
external: []
|
||||||
runtime:
|
runtime:
|
||||||
config_path: null
|
config_path: /opt/homelab/config/stability-agent
|
||||||
data_path: /opt/homelab/state
|
data_path: /opt/homelab/state
|
||||||
logs_path: /opt/homelab/events
|
logs_path: /opt/homelab/events
|
||||||
backup:
|
backup:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
services:
|
||||||
|
stability-agent:
|
||||||
|
environment:
|
||||||
|
- NODE_NAME=piha
|
||||||
|
- REDIS_HOST=100.108.208.3
|
||||||
|
- REDIS_PORT=6379
|
||||||
|
- REDIS_ENABLED=true
|
||||||
15
hosts/piha/services.yaml
Normal file
15
hosts/piha/services.yaml
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
host: piha
|
||||||
|
|
||||||
|
services:
|
||||||
|
stability-agent:
|
||||||
|
role: node-stability-monitor
|
||||||
|
deployment_model: docker-compose
|
||||||
|
exposure: local-only
|
||||||
|
offline_required: true
|
||||||
|
depends_on:
|
||||||
|
local: []
|
||||||
|
external: []
|
||||||
|
runtime:
|
||||||
|
config_path: /opt/homelab/config/stability-agent
|
||||||
|
data_path: /opt/homelab/state
|
||||||
|
logs_path: /opt/homelab/events
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
services:
|
||||||
|
stability-agent:
|
||||||
|
environment:
|
||||||
|
- NODE_NAME=solaria
|
||||||
|
- REDIS_HOST=100.108.208.3
|
||||||
|
- REDIS_PORT=6379
|
||||||
|
- REDIS_ENABLED=true
|
||||||
15
hosts/solaria/services.yaml
Normal file
15
hosts/solaria/services.yaml
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
host: solaria
|
||||||
|
|
||||||
|
services:
|
||||||
|
stability-agent:
|
||||||
|
role: node-stability-monitor
|
||||||
|
deployment_model: docker-compose
|
||||||
|
exposure: local-only
|
||||||
|
offline_required: true
|
||||||
|
depends_on:
|
||||||
|
local: []
|
||||||
|
external: []
|
||||||
|
runtime:
|
||||||
|
config_path: /opt/homelab/config/stability-agent
|
||||||
|
data_path: /opt/homelab/state
|
||||||
|
logs_path: /opt/homelab/events
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
services:
|
||||||
|
stability-agent:
|
||||||
|
environment:
|
||||||
|
- NODE_NAME=vps
|
||||||
|
- REDIS_HOST=100.108.208.3
|
||||||
|
- REDIS_PORT=6379
|
||||||
|
- REDIS_ENABLED=true
|
||||||
15
hosts/vps/services.yaml
Normal file
15
hosts/vps/services.yaml
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
host: vps
|
||||||
|
|
||||||
|
services:
|
||||||
|
stability-agent:
|
||||||
|
role: node-stability-monitor
|
||||||
|
deployment_model: docker-compose
|
||||||
|
exposure: local-only
|
||||||
|
offline_required: true
|
||||||
|
depends_on:
|
||||||
|
local: []
|
||||||
|
external: []
|
||||||
|
runtime:
|
||||||
|
config_path: /opt/homelab/config/stability-agent
|
||||||
|
data_path: /opt/homelab/state
|
||||||
|
logs_path: /opt/homelab/events
|
||||||
38
scripts/deploy/deploy-stability-agent.sh
Executable file
38
scripts/deploy/deploy-stability-agent.sh
Executable file
|
|
@ -0,0 +1,38 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# deploy-stability-agent.sh - Helper to print deployment commands for stability-agent
|
||||||
|
|
||||||
|
NODE=$1
|
||||||
|
REPO_PATH="~/homelab-codex-ws"
|
||||||
|
|
||||||
|
if [[ -z "$NODE" ]]; then
|
||||||
|
echo "Usage: $0 <node-name>"
|
||||||
|
echo "Supported nodes: chelsty, piha, solaria, vps"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
case "$NODE" in
|
||||||
|
chelsty|piha|solaria|vps)
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "Error: Unknown node '$NODE'"
|
||||||
|
echo "Supported nodes: chelsty, piha, solaria, vps"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
echo "# --- Deployment commands for $NODE ---"
|
||||||
|
echo "cd $REPO_PATH"
|
||||||
|
echo "git fetch origin"
|
||||||
|
echo "git checkout master"
|
||||||
|
echo "git pull"
|
||||||
|
echo "cd services/stability-agent"
|
||||||
|
echo ""
|
||||||
|
echo "# Command (Docker Compose V2):"
|
||||||
|
echo "NODE_NAME=$NODE REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker compose up -d --build --force-recreate"
|
||||||
|
echo ""
|
||||||
|
echo "# Command (Docker Compose V1):"
|
||||||
|
echo "NODE_NAME=$NODE REDIS_HOST=100.108.208.3 REDIS_PORT=6379 REDIS_ENABLED=true docker-compose up -d --build --force-recreate"
|
||||||
|
echo ""
|
||||||
|
echo "# Notes:"
|
||||||
|
echo "# - If using host-specific overrides: add '-f ../../hosts/$NODE/runtime/stability-agent/docker-compose.override.yml'"
|
||||||
|
echo "# - Ensure /opt/homelab/state and /opt/homelab/events exist on the host."
|
||||||
|
|
@ -1,19 +1,25 @@
|
||||||
### Stability Agent
|
### Stability Agent
|
||||||
|
|
||||||
A lightweight filesystem-first watchdog and observer agent for CHELSTY.
|
A lightweight filesystem-first watchdog and observer agent for homelab nodes.
|
||||||
|
|
||||||
#### Features
|
#### Features
|
||||||
|
|
||||||
* **Continuous Monitoring**: Runs as a background service.
|
* **Continuous Monitoring**: Runs as a background service.
|
||||||
* **Docker Inspection**: Checks container status via read-only Docker socket.
|
* **Docker Inspection**: Checks container status via read-only Docker socket (optional).
|
||||||
* **Disk Usage**: Monitors local disk utilization.
|
* **Disk Usage**: Monitors local disk utilization.
|
||||||
* **Tailscale Check**: Verifies Tailscale availability.
|
* **Tailscale Check**: Verifies Tailscale availability (optional).
|
||||||
* **MQTT Reachability**: Checks connectivity to the local MQTT broker.
|
* **MQTT Reachability**: Checks connectivity to a configured MQTT broker (optional).
|
||||||
* **Zigbee2MQTT Monitoring**: Specifically monitors the Zigbee2MQTT container.
|
* **Redis Publishing**: Publishes runtime state and events to a central Redis server (PIHA).
|
||||||
* **Redis Publishing**: (Optional) Publishes runtime state and events to a central Redis server.
|
* **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/<NODE_NAME>/`.
|
||||||
* **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/chelsty/`.
|
|
||||||
* **State Reporting**: Writes heartbeat and status summary to `/opt/homelab/state/`.
|
* **State Reporting**: Writes heartbeat and status summary to `/opt/homelab/state/`.
|
||||||
|
|
||||||
|
#### Deployment
|
||||||
|
|
||||||
|
Use the deployment helper script, which prints the commands to run on the target node:
|
||||||
|
```bash
|
||||||
|
./scripts/deploy/deploy-stability-agent.sh <NODE_NAME>
|
||||||
|
```
|
||||||
|
|
||||||
#### Configuration
|
#### Configuration
|
||||||
|
|
||||||
Environment variables:
|
Environment variables:
|
||||||
|
|
@ -33,10 +39,10 @@ You can verify the Redis publishing using `redis-cli`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Check node state
|
# Check node state
|
||||||
redis-cli -h 100.108.208.3 HGETALL homelab:nodes:chelsty
|
redis-cli -h 100.108.208.3 HGETALL homelab:nodes:<NODE_NAME>
|
||||||
|
|
||||||
# Check service discovery
|
# Check service discovery
|
||||||
redis-cli -h 100.108.208.3 HGETALL homelab:services:chelsty:stability-agent
|
redis-cli -h 100.108.208.3 HGETALL homelab:services:<NODE_NAME>:stability-agent
|
||||||
|
|
||||||
# Check event stream
|
# Check event stream
|
||||||
redis-cli -h 100.108.208.3 XRANGE homelab:events - +
|
redis-cli -h 100.108.208.3 XRANGE homelab:events - +
|
||||||
|
|
@ -55,7 +61,7 @@ Events are written as JSON lines with the following fields:
|
||||||
|
|
||||||
* `id`: Unique event UUID.
|
* `id`: Unique event UUID.
|
||||||
* `timestamp`: ISO 8601 timestamp (UTC).
|
* `timestamp`: ISO 8601 timestamp (UTC).
|
||||||
* `node`: `chelsty`.
|
* `node`: `<NODE_NAME>`.
|
||||||
* `source`: `stability-agent`.
|
* `source`: `stability-agent`.
|
||||||
* `type`: Type of event (e.g., `disk_usage_high`, `containers_not_running`).
|
* `type`: Type of event (e.g., `disk_usage_high`, `containers_not_running`).
|
||||||
* `severity`: `info`, `warning`, or `error`.
|
* `severity`: `info`, `warning`, or `error`.
|
||||||
|
|
|
||||||
|
|
@ -43,7 +43,7 @@ def emit_event(event_type, severity, message, service=None, details=None):
|
||||||
}
|
}
|
||||||
if service:
|
if service:
|
||||||
event["service"] = service
|
event["service"] = service
|
||||||
|
|
||||||
date_str = get_datestamp()
|
date_str = get_datestamp()
|
||||||
event_dir = os.path.join(EVENTS_BASE_DIR, date_str, NODE_NAME)
|
event_dir = os.path.join(EVENTS_BASE_DIR, date_str, NODE_NAME)
|
||||||
try:
|
try:
|
||||||
|
|
@ -53,7 +53,7 @@ def emit_event(event_type, severity, message, service=None, details=None):
|
||||||
f.write(json.dumps(event) + "\n")
|
f.write(json.dumps(event) + "\n")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Failed to write event to filesystem: {e}")
|
print(f"Failed to write event to filesystem: {e}")
|
||||||
|
|
||||||
# Redis publishing
|
# Redis publishing
|
||||||
if REDIS_ENABLED and redis_client:
|
if REDIS_ENABLED and redis_client:
|
||||||
try:
|
try:
|
||||||
|
|
@ -68,7 +68,7 @@ def emit_event(event_type, severity, message, service=None, details=None):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Failed to publish event to Redis: {e}")
|
print(f"Failed to publish event to Redis: {e}")
|
||||||
# Do not crash, already logged to filesystem
|
# Do not crash, already logged to filesystem
|
||||||
|
|
||||||
print(f"[{severity}] {message}")
|
print(f"[{severity}] {message}")
|
||||||
|
|
||||||
def check_disk():
|
def check_disk():
|
||||||
|
|
@ -80,10 +80,10 @@ def check_disk():
|
||||||
"free_gb": free // (2**30),
|
"free_gb": free // (2**30),
|
||||||
"percent": round(percent, 2)
|
"percent": round(percent, 2)
|
||||||
}
|
}
|
||||||
|
|
||||||
if percent > DISK_THRESHOLD_PCT:
|
if percent > DISK_THRESHOLD_PCT:
|
||||||
emit_event("disk_usage_high", "warning", f"Disk usage is high: {details['percent']}%", details=details)
|
emit_event("disk_usage_high", "warning", f"Disk usage is high: {details['percent']}%", details=details)
|
||||||
|
|
||||||
return details
|
return details
|
||||||
|
|
||||||
class DockerClient:
|
class DockerClient:
|
||||||
|
|
@ -121,34 +121,37 @@ class DockerClient:
|
||||||
|
|
||||||
def check_docker():
|
def check_docker():
|
||||||
client = DockerClient()
|
client = DockerClient()
|
||||||
|
if not os.path.exists(client.socket_path):
|
||||||
|
return {"status": "unavailable", "message": "Docker socket not found"}
|
||||||
|
|
||||||
containers = client.get_containers()
|
containers = client.get_containers()
|
||||||
if containers is None:
|
if containers is None:
|
||||||
emit_event("docker_socket_error", "error", "Could not connect to Docker socket or socket missing")
|
emit_event("docker_api_error", "warning", "Could not connect to Docker socket API")
|
||||||
return {"status": "error", "error": "Could not connect to Docker socket"}
|
return {"status": "error", "error": "Could not connect to Docker socket API"}
|
||||||
|
|
||||||
summary = []
|
summary = []
|
||||||
unhealthy_containers = []
|
unhealthy_containers = []
|
||||||
for c in containers:
|
for c in containers:
|
||||||
state = c.get("State", "")
|
state = c.get("State", "")
|
||||||
status = c.get("Status", "")
|
status = c.get("Status", "")
|
||||||
name = c.get("Names", ["unknown"])[0].lstrip("/")
|
name = c.get("Names", ["unknown"])[0].lstrip("/")
|
||||||
|
|
||||||
container_info = {
|
container_info = {
|
||||||
"name": name,
|
"name": name,
|
||||||
"state": state,
|
"state": state,
|
||||||
"status": status
|
"status": status
|
||||||
}
|
}
|
||||||
summary.append(container_info)
|
summary.append(container_info)
|
||||||
|
|
||||||
if state != "running":
|
if state != "running":
|
||||||
unhealthy_containers.append(container_info)
|
unhealthy_containers.append(container_info)
|
||||||
|
|
||||||
if unhealthy_containers:
|
if unhealthy_containers:
|
||||||
names = [c["name"] for c in unhealthy_containers]
|
names = [c["name"] for c in unhealthy_containers]
|
||||||
# Only emit warning for containers that should be running?
|
# Only emit warning for containers that should be running?
|
||||||
# For now, we report any non-running container found by Docker.
|
# For now, we report any non-running container found by Docker.
|
||||||
emit_event("containers_not_running", "warning", f"Some containers are not running: {', '.join(names)}", details={"containers": unhealthy_containers})
|
emit_event("containers_not_running", "warning", f"Some containers are not running: {', '.join(names)}", details={"containers": unhealthy_containers})
|
||||||
|
|
||||||
return {"status": "ok", "containers": summary}
|
return {"status": "ok", "containers": summary}
|
||||||
|
|
||||||
def check_tailscale():
|
def check_tailscale():
|
||||||
|
|
@ -156,7 +159,7 @@ def check_tailscale():
|
||||||
socket_path = "/var/run/tailscale/tailscaled.sock"
|
socket_path = "/var/run/tailscale/tailscaled.sock"
|
||||||
socket_available = os.path.exists(socket_path)
|
socket_available = os.path.exists(socket_path)
|
||||||
interface_available = os.path.exists("/sys/class/net/tailscale0")
|
interface_available = os.path.exists("/sys/class/net/tailscale0")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"available": socket_available or interface_available,
|
"available": socket_available or interface_available,
|
||||||
"details": {
|
"details": {
|
||||||
|
|
@ -168,7 +171,7 @@ def check_tailscale():
|
||||||
def check_mqtt():
|
def check_mqtt():
|
||||||
if not MQTT_HOST:
|
if not MQTT_HOST:
|
||||||
return {"configured": False}
|
return {"configured": False}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with socket.create_connection((MQTT_HOST, MQTT_PORT), timeout=5):
|
with socket.create_connection((MQTT_HOST, MQTT_PORT), timeout=5):
|
||||||
return {"configured": True, "reachable": True}
|
return {"configured": True, "reachable": True}
|
||||||
|
|
@ -203,13 +206,13 @@ class RedisClient:
|
||||||
def _send_command(self, *args):
|
def _send_command(self, *args):
|
||||||
if not self._connect():
|
if not self._connect():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# RESP array
|
# RESP array
|
||||||
cmd = f"*{len(args)}\r\n"
|
cmd = f"*{len(args)}\r\n"
|
||||||
for arg in args:
|
for arg in args:
|
||||||
s_arg = str(arg)
|
s_arg = str(arg)
|
||||||
cmd += f"${len(s_arg.encode('utf-8'))}\r\n{s_arg}\r\n"
|
cmd += f"${len(s_arg.encode('utf-8'))}\r\n{s_arg}\r\n"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.sock.sendall(cmd.encode('utf-8'))
|
self.sock.sendall(cmd.encode('utf-8'))
|
||||||
# Basic response reading
|
# Basic response reading
|
||||||
|
|
@ -241,11 +244,11 @@ redis_client = RedisClient(REDIS_HOST, REDIS_PORT) if REDIS_ENABLED else None
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
print(f"Starting stability-agent on {NODE_NAME}...")
|
print(f"Starting stability-agent on {NODE_NAME}...")
|
||||||
|
|
||||||
# Ensure directories exist
|
# Ensure directories exist
|
||||||
os.makedirs(STATE_DIR, exist_ok=True)
|
os.makedirs(STATE_DIR, exist_ok=True)
|
||||||
os.makedirs(EVENTS_BASE_DIR, exist_ok=True)
|
os.makedirs(EVENTS_BASE_DIR, exist_ok=True)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
status = {
|
status = {
|
||||||
|
|
@ -253,12 +256,12 @@ def main():
|
||||||
"node": NODE_NAME,
|
"node": NODE_NAME,
|
||||||
"checks": {}
|
"checks": {}
|
||||||
}
|
}
|
||||||
|
|
||||||
status["checks"]["disk"] = check_disk()
|
status["checks"]["disk"] = check_disk()
|
||||||
status["checks"]["docker"] = check_docker()
|
status["checks"]["docker"] = check_docker()
|
||||||
status["checks"]["tailscale"] = check_tailscale()
|
status["checks"]["tailscale"] = check_tailscale()
|
||||||
status["checks"]["mqtt"] = check_mqtt()
|
status["checks"]["mqtt"] = check_mqtt()
|
||||||
|
|
||||||
# Zigbee2MQTT container check
|
# Zigbee2MQTT container check
|
||||||
z2m_present = False
|
z2m_present = False
|
||||||
z2m_running = False
|
z2m_running = False
|
||||||
|
|
@ -268,20 +271,20 @@ def main():
|
||||||
z2m_present = True
|
z2m_present = True
|
||||||
if c["state"] == "running":
|
if c["state"] == "running":
|
||||||
z2m_running = True
|
z2m_running = True
|
||||||
|
|
||||||
status["checks"]["zigbee2mqtt"] = {
|
status["checks"]["zigbee2mqtt"] = {
|
||||||
"present": z2m_present,
|
"present": z2m_present,
|
||||||
"running": z2m_running
|
"running": z2m_running
|
||||||
}
|
}
|
||||||
|
|
||||||
# Write heartbeat
|
# Write heartbeat
|
||||||
with open(HEARTBEAT_FILE, "w") as f:
|
with open(HEARTBEAT_FILE, "w") as f:
|
||||||
f.write(get_timestamp())
|
f.write(get_timestamp())
|
||||||
|
|
||||||
# Write status summary
|
# Write status summary
|
||||||
with open(STATUS_FILE, "w") as f:
|
with open(STATUS_FILE, "w") as f:
|
||||||
json.dump(status, f, indent=2)
|
json.dump(status, f, indent=2)
|
||||||
|
|
||||||
# Redis publishing
|
# Redis publishing
|
||||||
if REDIS_ENABLED and redis_client:
|
if REDIS_ENABLED and redis_client:
|
||||||
try:
|
try:
|
||||||
|
|
@ -290,7 +293,8 @@ def main():
|
||||||
for check in status["checks"].values():
|
for check in status["checks"].values():
|
||||||
if isinstance(check, dict) and check.get("status") == "error":
|
if isinstance(check, dict) and check.get("status") == "error":
|
||||||
node_health = "unhealthy"
|
node_health = "unhealthy"
|
||||||
|
|
||||||
|
# Redis publishing for node state
|
||||||
redis_client.hset(f"homelab:nodes:{NODE_NAME}", {
|
redis_client.hset(f"homelab:nodes:{NODE_NAME}", {
|
||||||
"id": NODE_NAME,
|
"id": NODE_NAME,
|
||||||
"hostname": NODE_NAME,
|
"hostname": NODE_NAME,
|
||||||
|
|
@ -300,13 +304,29 @@ def main():
|
||||||
"capabilities": json.dumps(["docker", "tailscale", "mqtt", "disk"]),
|
"capabilities": json.dumps(["docker", "tailscale", "mqtt", "disk"]),
|
||||||
"checks": json.dumps(status["checks"])
|
"checks": json.dumps(status["checks"])
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Always publish stability-agent itself as a service
|
||||||
|
redis_client.hset(f"homelab:services:{NODE_NAME}:stability-agent", {
|
||||||
|
"name": "stability-agent",
|
||||||
|
"node": NODE_NAME,
|
||||||
|
"health": "healthy",
|
||||||
|
"desired_state": "running",
|
||||||
|
"actual_state": "running",
|
||||||
|
"deployment_state": "deployed",
|
||||||
|
"updated_at": status["timestamp"],
|
||||||
|
"dependencies": json.dumps([]),
|
||||||
|
"recommendations": json.dumps([])
|
||||||
|
})
|
||||||
|
|
||||||
# Services discovered from Docker
|
# Services discovered from Docker
|
||||||
if status["checks"]["docker"]["status"] == "ok":
|
if status["checks"]["docker"]["status"] == "ok":
|
||||||
for c in status["checks"]["docker"]["containers"]:
|
for c in status["checks"]["docker"]["containers"]:
|
||||||
service_name = c["name"]
|
service_name = c["name"]
|
||||||
|
if service_name == "stability-agent":
|
||||||
|
continue # Already published above
|
||||||
|
|
||||||
service_health = "healthy" if c["state"] == "running" else "unhealthy"
|
service_health = "healthy" if c["state"] == "running" else "unhealthy"
|
||||||
|
|
||||||
redis_client.hset(f"homelab:services:{NODE_NAME}:{service_name}", {
|
redis_client.hset(f"homelab:services:{NODE_NAME}:{service_name}", {
|
||||||
"name": service_name,
|
"name": service_name,
|
||||||
"node": NODE_NAME,
|
"node": NODE_NAME,
|
||||||
|
|
@ -326,7 +346,7 @@ def main():
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error in main loop: {e}")
|
print(f"Error in main loop: {e}")
|
||||||
emit_event("agent_error", "error", f"Internal agent error: {e}", details={"error": str(e)})
|
emit_event("agent_error", "error", f"Internal agent error: {e}", details={"error": str(e)})
|
||||||
|
|
||||||
time.sleep(CHECK_INTERVAL)
|
time.sleep(CHECK_INTERVAL)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue