Compare commits
No commits in common. "ac90acfac81104eb4b3e4d243a64ccf2eaf27343" and "90b2a5d0e9546bef95c64235c45c5c0ac92f54e3" have entirely different histories.
ac90acfac8
...
90b2a5d0e9
|
|
@ -1,42 +0,0 @@
|
||||||
### CHELSTY Stability Agent
|
|
||||||
|
|
||||||
The stability-agent on CHELSTY provides local observability and health monitoring for the node's services and infrastructure.
|
|
||||||
|
|
||||||
#### Purpose
|
|
||||||
|
|
||||||
It acts as a filesystem-first watchdog that detects anomalies in the local runtime environment without taking autonomous destructive actions (like restarts). It serves as the primary data source for node-level stability metrics.
|
|
||||||
|
|
||||||
#### Monitoring Scope
|
|
||||||
|
|
||||||
* **Docker Containers**: Monitors all local containers. If a container is not in the `running` state, a `containers_not_running` event is generated.
|
|
||||||
* **Disk Usage**: Monitors the root filesystem. Generates `disk_usage_high` events if usage exceeds the configured threshold.
|
|
||||||
* **Connectivity**:
|
|
||||||
* Checks if the Tailscale socket or interface is available.
|
|
||||||
* Checks reachability of the local Mosquitto MQTT broker.
|
|
||||||
* **Zigbee2MQTT**: Specifically tracks the presence and status of the Zigbee2MQTT service.
|
|
||||||
|
|
||||||
#### Storage and Integration
|
|
||||||
|
|
||||||
* **Heartbeat**: Updated every cycle at `/opt/homelab/state/stability-agent.heartbeat`.
|
|
||||||
* **State Summary**: A JSON summary of all latest checks at `/opt/homelab/state/stability-agent.json`.
|
|
||||||
* **Events**: Append-only JSON lines at `/opt/homelab/events/YYYY-MM-DD/chelsty/events.jsonl`.
|
|
||||||
|
|
||||||
#### Deployment
|
|
||||||
|
|
||||||
The service is deployed via Docker Compose on CHELSTY.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd services/stability-agent
|
|
||||||
docker compose up -d
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Configuration
|
|
||||||
|
|
||||||
Configuration is managed via environment variables in `docker-compose.override.yml` on the host.
|
|
||||||
|
|
||||||
| Variable | Description | Default |
|
|
||||||
|----------|-------------|---------|
|
|
||||||
| `STABILITY_CHECK_INTERVAL` | Seconds between checks | `60` |
|
|
||||||
| `DISK_THRESHOLD_PCT` | Disk usage alert threshold | `90` |
|
|
||||||
| `MQTT_HOST` | MQTT broker hostname | `mosquitto` |
|
|
||||||
| `MQTT_PORT` | MQTT broker port | `1883` |
|
|
||||||
|
|
@ -1,7 +0,0 @@
|
||||||
services:
|
|
||||||
stability-agent:
|
|
||||||
environment:
|
|
||||||
- STABILITY_CHECK_INTERVAL=60
|
|
||||||
- DISK_THRESHOLD_PCT=85
|
|
||||||
- MQTT_HOST=mosquitto
|
|
||||||
- MQTT_PORT=1883
|
|
||||||
|
|
@ -106,21 +106,3 @@ services:
|
||||||
- /opt/homelab/data/mosquitto
|
- /opt/homelab/data/mosquitto
|
||||||
notes:
|
notes:
|
||||||
- Retain ACL, password, persistence, and bridge configuration if enabled.
|
- Retain ACL, password, persistence, and bridge configuration if enabled.
|
||||||
|
|
||||||
stability-agent:
|
|
||||||
role: node-stability-monitor
|
|
||||||
deployment_model: docker-compose
|
|
||||||
exposure: local-only
|
|
||||||
offline_required: true
|
|
||||||
depends_on:
|
|
||||||
local:
|
|
||||||
- mosquitto
|
|
||||||
external: []
|
|
||||||
runtime:
|
|
||||||
config_path: null
|
|
||||||
data_path: /opt/homelab/state
|
|
||||||
logs_path: /opt/homelab/events
|
|
||||||
backup:
|
|
||||||
recommended: false
|
|
||||||
notes:
|
|
||||||
- Events and state are transient or can be reconstructed; high-frequency writes.
|
|
||||||
|
|
|
||||||
|
|
@ -1,37 +0,0 @@
|
||||||
### Agent System
|
|
||||||
Central runtime materializer and Operator Control Plane UI.
|
|
||||||
|
|
||||||
#### Components
|
|
||||||
- **Redis**: Central state store (on PIHA).
|
|
||||||
- **Runtime Materializer**: Converts Redis state to JSON files in `/opt/homelab/world`.
|
|
||||||
- **Web UI**: Exposes API endpoints and serving the Operator UI.
|
|
||||||
|
|
||||||
#### Deployment (on PIHA)
|
|
||||||
```bash
|
|
||||||
cd services/agent-system
|
|
||||||
./deploy.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Deployment (on CHELSTY)
|
|
||||||
```bash
|
|
||||||
cd services/stability-agent
|
|
||||||
docker compose up -d --build
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Verification
|
|
||||||
The `deploy.sh` script automatically verifies the local endpoints.
|
|
||||||
You can also manually check:
|
|
||||||
```bash
|
|
||||||
# Check runtime summary
|
|
||||||
curl http://localhost:18180/summary
|
|
||||||
|
|
||||||
# Check discovered nodes
|
|
||||||
curl http://localhost:18180/nodes
|
|
||||||
|
|
||||||
# Check discovered services
|
|
||||||
curl http://localhost:18180/services
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Directory Structure
|
|
||||||
- `/opt/homelab/world`: Contains materialized JSON state.
|
|
||||||
- `/opt/homelab/state`: Contains operator configuration and local heartbeats.
|
|
||||||
|
|
@ -1,22 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
echo ">>> Validating docker-compose configuration..."
|
|
||||||
docker compose config
|
|
||||||
|
|
||||||
echo ">>> Building and starting Agent System services..."
|
|
||||||
docker compose up -d --build
|
|
||||||
|
|
||||||
echo ">>> Services status:"
|
|
||||||
docker ps --filter "name=agent-system"
|
|
||||||
|
|
||||||
echo ">>> Verifying API endpoints..."
|
|
||||||
sleep 5 # Give it a moment to start
|
|
||||||
|
|
||||||
endpoints=("summary" "nodes" "services")
|
|
||||||
for ep in "${endpoints[@]}"; do
|
|
||||||
echo "Checking /$ep..."
|
|
||||||
curl -s -f http://localhost:18180/$ep > /dev/null && echo " OK" || echo " FAILED"
|
|
||||||
done
|
|
||||||
|
|
||||||
echo ">>> Deployment complete."
|
|
||||||
|
|
@ -1,33 +0,0 @@
|
||||||
services:
|
|
||||||
redis:
|
|
||||||
image: redis:7
|
|
||||||
container_name: agent-system-redis
|
|
||||||
ports:
|
|
||||||
- "6379:6379"
|
|
||||||
restart: unless-stopped
|
|
||||||
|
|
||||||
webui:
|
|
||||||
build: ./webui
|
|
||||||
container_name: agent-system-webui
|
|
||||||
ports:
|
|
||||||
- "18180:8080"
|
|
||||||
volumes:
|
|
||||||
- /opt/homelab:/opt/homelab
|
|
||||||
depends_on:
|
|
||||||
- redis
|
|
||||||
restart: unless-stopped
|
|
||||||
|
|
||||||
runtime-materializer:
|
|
||||||
build: ./runtime-materializer
|
|
||||||
container_name: agent-system-runtime-materializer
|
|
||||||
environment:
|
|
||||||
REDIS_HOST: redis
|
|
||||||
REDIS_PORT: "6379"
|
|
||||||
HOMELAB_WORLD_ROOT: /opt/homelab/world
|
|
||||||
WORLD_DIR: /opt/homelab/world
|
|
||||||
MATERIALIZE_INTERVAL: "10"
|
|
||||||
volumes:
|
|
||||||
- /opt/homelab:/opt/homelab
|
|
||||||
depends_on:
|
|
||||||
- redis
|
|
||||||
restart: unless-stopped
|
|
||||||
|
|
@ -1,16 +0,0 @@
|
||||||
FROM python:3.11-slim
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
# Install redis python package as requested
|
|
||||||
RUN pip install --no-cache-dir redis
|
|
||||||
|
|
||||||
COPY materializer.py .
|
|
||||||
|
|
||||||
# Ensure the world directory exists in the container (though it will likely be a volume)
|
|
||||||
RUN mkdir -p /opt/homelab/world
|
|
||||||
|
|
||||||
# Use unbuffered output to see logs in docker
|
|
||||||
ENV PYTHONUNBUFFERED=1
|
|
||||||
|
|
||||||
CMD ["python", "materializer.py"]
|
|
||||||
|
|
@ -1,181 +0,0 @@
|
||||||
import redis
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import argparse
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
# Configuration from environment variables
|
|
||||||
REDIS_HOST = os.environ.get("REDIS_HOST", "redis")
|
|
||||||
REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379))
|
|
||||||
WORLD_DIR = os.environ.get("WORLD_DIR", "/opt/homelab/world")
|
|
||||||
|
|
||||||
def get_redis_client():
|
|
||||||
"""Returns a Redis client with decoding enabled."""
|
|
||||||
return redis.Redis(
|
|
||||||
host=REDIS_HOST,
|
|
||||||
port=REDIS_PORT,
|
|
||||||
decode_responses=True,
|
|
||||||
socket_timeout=5
|
|
||||||
)
|
|
||||||
|
|
||||||
def safe_json_loads(data, default=None):
|
|
||||||
"""Safely loads JSON from a string."""
|
|
||||||
if not data:
|
|
||||||
return default
|
|
||||||
try:
|
|
||||||
if isinstance(data, (dict, list)):
|
|
||||||
return data
|
|
||||||
return json.loads(data)
|
|
||||||
except (json.JSONDecodeError, TypeError):
|
|
||||||
return data
|
|
||||||
|
|
||||||
def normalize_health(health):
|
|
||||||
"""Normalizes health values for the UI."""
|
|
||||||
if not health:
|
|
||||||
return "nominal"
|
|
||||||
h = str(health).lower()
|
|
||||||
if h in ["healthy", "ok", "running", "nominal"]:
|
|
||||||
return "nominal"
|
|
||||||
if h in ["degraded", "warning"]:
|
|
||||||
return "degraded"
|
|
||||||
return "error"
|
|
||||||
|
|
||||||
def materialize():
|
|
||||||
"""Reads state from Redis and writes JSON files to the world directory."""
|
|
||||||
print(f"[{datetime.now().isoformat()}] Materializing world state...")
|
|
||||||
try:
|
|
||||||
r = get_redis_client()
|
|
||||||
|
|
||||||
# 1. Nodes
|
|
||||||
nodes = []
|
|
||||||
node_keys = r.keys("homelab:nodes:*")
|
|
||||||
for key in node_keys:
|
|
||||||
node_data = r.hgetall(key)
|
|
||||||
if node_data:
|
|
||||||
# Normalize health
|
|
||||||
if "health" in node_data:
|
|
||||||
node_data["health"] = normalize_health(node_data["health"])
|
|
||||||
# Parse JSON fields if they exist
|
|
||||||
if "capabilities" in node_data:
|
|
||||||
node_data["capabilities"] = safe_json_loads(node_data["capabilities"], [])
|
|
||||||
if "checks" in node_data:
|
|
||||||
node_data["checks"] = safe_json_loads(node_data["checks"], {})
|
|
||||||
nodes.append(node_data)
|
|
||||||
|
|
||||||
# 2. Services
|
|
||||||
services = []
|
|
||||||
service_keys = r.keys("homelab:services:*")
|
|
||||||
for key in service_keys:
|
|
||||||
svc_data = r.hgetall(key)
|
|
||||||
if svc_data:
|
|
||||||
# Normalize health
|
|
||||||
if "health" in svc_data:
|
|
||||||
svc_data["health"] = normalize_health(svc_data["health"])
|
|
||||||
if "dependencies" in svc_data:
|
|
||||||
svc_data["dependencies"] = safe_json_loads(svc_data["dependencies"], [])
|
|
||||||
if "recommendations" in svc_data:
|
|
||||||
svc_data["recommendations"] = safe_json_loads(svc_data["recommendations"], [])
|
|
||||||
services.append(svc_data)
|
|
||||||
|
|
||||||
# 3. Events (Stream)
|
|
||||||
events = []
|
|
||||||
try:
|
|
||||||
# Get last 100 events from the stream
|
|
||||||
raw_events = r.xrevrange("homelab:events", count=100)
|
|
||||||
for event_id, data in raw_events:
|
|
||||||
event = data.copy()
|
|
||||||
event["id"] = event_id
|
|
||||||
if "details" in event:
|
|
||||||
event["details"] = safe_json_loads(event["details"], {})
|
|
||||||
events.append(event)
|
|
||||||
except redis.exceptions.ResponseError:
|
|
||||||
# homelab:events might not be a stream or doesn't exist
|
|
||||||
pass
|
|
||||||
|
|
||||||
# 4. Incidents (Hash)
|
|
||||||
incidents = []
|
|
||||||
incident_keys = r.keys("homelab:incidents:*")
|
|
||||||
for key in incident_keys:
|
|
||||||
incident_data = r.hgetall(key)
|
|
||||||
if incident_data:
|
|
||||||
# Normalize health if present
|
|
||||||
if "health" in incident_data:
|
|
||||||
incident_data["health"] = normalize_health(incident_data["health"])
|
|
||||||
incidents.append(incident_data)
|
|
||||||
|
|
||||||
# 5. Deployments (Hash)
|
|
||||||
deployments = []
|
|
||||||
deployment_keys = r.keys("homelab:deployments:*")
|
|
||||||
for key in deployment_keys:
|
|
||||||
dep_data = r.hgetall(key)
|
|
||||||
if dep_data:
|
|
||||||
deployments.append(dep_data)
|
|
||||||
|
|
||||||
# 6. Recommendations (Hash)
|
|
||||||
recommendations = []
|
|
||||||
recommendation_keys = r.keys("homelab:recommendations:*")
|
|
||||||
for key in recommendation_keys:
|
|
||||||
rec_data = r.hgetall(key)
|
|
||||||
if rec_data:
|
|
||||||
recommendations.append(rec_data)
|
|
||||||
|
|
||||||
# 7. Runtime Summary
|
|
||||||
unhealthy_services = [s for s in services if s.get("health") != "nominal"]
|
|
||||||
active_incidents = [i for i in incidents if i.get("status") not in ["resolved", "closed"]]
|
|
||||||
|
|
||||||
status = "nominal"
|
|
||||||
if len(active_incidents) > 0 or len(unhealthy_services) > 5:
|
|
||||||
status = "error"
|
|
||||||
elif len(unhealthy_services) > 0:
|
|
||||||
status = "degraded"
|
|
||||||
|
|
||||||
summary = {
|
|
||||||
"status": status,
|
|
||||||
"timestamp": datetime.utcnow().isoformat() + "Z",
|
|
||||||
"last_update": int(time.time()),
|
|
||||||
"node_count": len(nodes),
|
|
||||||
"service_count": len(services),
|
|
||||||
"active_incidents_count": len(active_incidents),
|
|
||||||
"unhealthy_services_count": len(unhealthy_services),
|
|
||||||
"incident_count": len(incidents),
|
|
||||||
"recent_events_count": len(events),
|
|
||||||
"stale": False
|
|
||||||
}
|
|
||||||
|
|
||||||
# Ensure directory exists
|
|
||||||
os.makedirs(WORLD_DIR, exist_ok=True)
|
|
||||||
|
|
||||||
def write_json(filename, data):
|
|
||||||
path = os.path.join(WORLD_DIR, filename)
|
|
||||||
with open(path, "w") as f:
|
|
||||||
json.dump(data, f, indent=2)
|
|
||||||
|
|
||||||
write_json("runtime-summary.json", summary)
|
|
||||||
write_json("nodes.json", nodes)
|
|
||||||
write_json("services.json", services)
|
|
||||||
write_json("incidents.json", incidents)
|
|
||||||
write_json("events.json", events)
|
|
||||||
write_json("deployments.json", deployments)
|
|
||||||
write_json("recommendations.json", recommendations)
|
|
||||||
|
|
||||||
print(f"[{datetime.now().isoformat()}] Successfully materialized to {WORLD_DIR}")
|
|
||||||
|
|
||||||
except redis.exceptions.ConnectionError as e:
|
|
||||||
print(f"Redis connection error: {e}")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Unexpected error during materialization: {e}")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser(description="Homelab Runtime Materializer")
|
|
||||||
parser.add_argument("--once", action="store_true", help="Run once and exit")
|
|
||||||
parser.add_argument("--interval", type=int, default=30, help="Sleep interval between runs (seconds)")
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
if args.once:
|
|
||||||
materialize()
|
|
||||||
else:
|
|
||||||
print(f"Starting materializer loop (interval: {args.interval}s)...")
|
|
||||||
while True:
|
|
||||||
materialize()
|
|
||||||
time.sleep(args.interval)
|
|
||||||
|
|
@ -1,7 +0,0 @@
|
||||||
FROM python:3.11-slim
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
COPY web.py index.html ./
|
|
||||||
|
|
||||||
EXPOSE 8080
|
|
||||||
CMD ["python", "web.py"]
|
|
||||||
|
|
@ -1,701 +0,0 @@
|
||||||
<!doctype html>
|
|
||||||
<html lang="en">
|
|
||||||
<head>
|
|
||||||
<meta charset="utf-8">
|
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
||||||
<title>Operator Control Plane</title>
|
|
||||||
<style>
|
|
||||||
:root {
|
|
||||||
--bg-color: #0a0c0e;
|
|
||||||
--sidebar-color: #14171a;
|
|
||||||
--card-color: #1c2024;
|
|
||||||
--border-color: #2a3540;
|
|
||||||
--text-color: #e7edf3;
|
|
||||||
--text-muted: #94a3b8;
|
|
||||||
--accent-color: #3eaf7c;
|
|
||||||
--nominal: #3eaf7c;
|
|
||||||
--degraded: #e7c000;
|
|
||||||
--unstable: #e67e22;
|
|
||||||
--reconciling: #3498db;
|
|
||||||
--error: #c0392b;
|
|
||||||
--safe: #3eaf7c;
|
|
||||||
--guarded: #e67e22;
|
|
||||||
--dangerous: #c0392b;
|
|
||||||
}
|
|
||||||
|
|
||||||
body {
|
|
||||||
margin: 0;
|
|
||||||
font-family: 'Inter', system-ui, -apple-system, sans-serif;
|
|
||||||
background: var(--bg-color);
|
|
||||||
color: var(--text-color);
|
|
||||||
display: flex;
|
|
||||||
height: 100vh;
|
|
||||||
overflow: hidden;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Sidebar */
|
|
||||||
.sidebar {
|
|
||||||
width: 240px;
|
|
||||||
background: var(--sidebar-color);
|
|
||||||
border-right: 1px solid var(--border-color);
|
|
||||||
display: flex;
|
|
||||||
flex-direction: column;
|
|
||||||
flex-shrink: 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
.sidebar-header {
|
|
||||||
padding: 24px;
|
|
||||||
font-weight: 800;
|
|
||||||
font-size: 14px;
|
|
||||||
letter-spacing: 0.1em;
|
|
||||||
color: var(--accent-color);
|
|
||||||
border-bottom: 1px solid var(--border-color);
|
|
||||||
}
|
|
||||||
|
|
||||||
.nav-list {
|
|
||||||
list-style: none;
|
|
||||||
padding: 12px 0;
|
|
||||||
margin: 0;
|
|
||||||
flex-grow: 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
.nav-item {
|
|
||||||
padding: 12px 24px;
|
|
||||||
cursor: pointer;
|
|
||||||
font-size: 14px;
|
|
||||||
color: var(--text-muted);
|
|
||||||
transition: all 0.2s;
|
|
||||||
display: flex;
|
|
||||||
align-items: center;
|
|
||||||
gap: 12px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.nav-item:hover {
|
|
||||||
background: rgba(255, 255, 255, 0.05);
|
|
||||||
color: var(--text-color);
|
|
||||||
}
|
|
||||||
|
|
||||||
.nav-item.active {
|
|
||||||
background: rgba(62, 175, 124, 0.1);
|
|
||||||
color: var(--accent-color);
|
|
||||||
border-left: 3px solid var(--accent-color);
|
|
||||||
}
|
|
||||||
|
|
||||||
.sidebar-footer {
|
|
||||||
padding: 16px;
|
|
||||||
border-top: 1px solid var(--border-color);
|
|
||||||
font-size: 12px;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Content Area */
|
|
||||||
.main-content {
|
|
||||||
flex-grow: 1;
|
|
||||||
display: flex;
|
|
||||||
flex-direction: column;
|
|
||||||
overflow: hidden;
|
|
||||||
}
|
|
||||||
|
|
||||||
header {
|
|
||||||
height: 64px;
|
|
||||||
border-bottom: 1px solid var(--border-color);
|
|
||||||
display: flex;
|
|
||||||
align-items: center;
|
|
||||||
padding: 0 24px;
|
|
||||||
justify-content: space-between;
|
|
||||||
background: var(--bg-color);
|
|
||||||
}
|
|
||||||
|
|
||||||
.view-title {
|
|
||||||
font-size: 18px;
|
|
||||||
font-weight: 600;
|
|
||||||
}
|
|
||||||
|
|
||||||
.content-scroll {
|
|
||||||
flex-grow: 1;
|
|
||||||
overflow-y: auto;
|
|
||||||
padding: 24px;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Cards & Grids */
|
|
||||||
.grid {
|
|
||||||
display: grid;
|
|
||||||
grid-template-columns: repeat(auto-fill, minmax(350px, 1fr));
|
|
||||||
gap: 20px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.card {
|
|
||||||
background: var(--card-color);
|
|
||||||
border: 1px solid var(--border-color);
|
|
||||||
padding: 20px;
|
|
||||||
border-radius: 4px;
|
|
||||||
position: relative;
|
|
||||||
}
|
|
||||||
|
|
||||||
.card-header {
|
|
||||||
display: flex;
|
|
||||||
justify-content: space-between;
|
|
||||||
align-items: center;
|
|
||||||
margin-bottom: 16px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.card-title {
|
|
||||||
font-weight: 700;
|
|
||||||
font-size: 16px;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Status Badges */
|
|
||||||
.badge {
|
|
||||||
padding: 4px 8px;
|
|
||||||
border-radius: 4px;
|
|
||||||
font-size: 11px;
|
|
||||||
font-weight: 700;
|
|
||||||
text-transform: uppercase;
|
|
||||||
}
|
|
||||||
|
|
||||||
.status-nominal { background: rgba(62, 175, 124, 0.1); color: var(--nominal); }
|
|
||||||
.status-degraded { background: rgba(231, 192, 0, 0.1); color: var(--degraded); }
|
|
||||||
.status-unstable { background: rgba(230, 126, 34, 0.1); color: var(--unstable); }
|
|
||||||
.status-reconciling { background: rgba(52, 152, 219, 0.1); color: var(--reconciling); }
|
|
||||||
.status-error { background: rgba(192, 57, 43, 0.1); color: var(--error); }
|
|
||||||
|
|
||||||
/* Timeline */
|
|
||||||
.timeline {
|
|
||||||
display: flex;
|
|
||||||
flex-direction: column;
|
|
||||||
gap: 12px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.event {
|
|
||||||
padding: 12px;
|
|
||||||
border-left: 2px solid var(--border-color);
|
|
||||||
background: rgba(255, 255, 255, 0.02);
|
|
||||||
font-family: ui-monospace, monospace;
|
|
||||||
font-size: 13px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.event.high { border-left-color: var(--error); }
|
|
||||||
.event.medium { border-left-color: var(--unstable); }
|
|
||||||
.event.low { border-left-color: var(--nominal); }
|
|
||||||
|
|
||||||
.event-header {
|
|
||||||
display: flex;
|
|
||||||
justify-content: space-between;
|
|
||||||
margin-bottom: 4px;
|
|
||||||
color: var(--text-muted);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Forms & Inputs */
|
|
||||||
.controls {
|
|
||||||
display: flex;
|
|
||||||
gap: 12px;
|
|
||||||
margin-top: 20px;
|
|
||||||
}
|
|
||||||
|
|
||||||
input, button {
|
|
||||||
background: var(--card-color);
|
|
||||||
border: 1px solid var(--border-color);
|
|
||||||
color: var(--text-color);
|
|
||||||
padding: 8px 16px;
|
|
||||||
font-size: 14px;
|
|
||||||
border-radius: 4px;
|
|
||||||
}
|
|
||||||
|
|
||||||
button {
|
|
||||||
cursor: pointer;
|
|
||||||
font-weight: 600;
|
|
||||||
}
|
|
||||||
|
|
||||||
button:hover { background: var(--border-color); }
|
|
||||||
|
|
||||||
.btn-primary { background: var(--accent-color); color: white; border: none; }
|
|
||||||
.btn-primary:hover { background: #359b6d; }
|
|
||||||
|
|
||||||
/* Utility */
|
|
||||||
.hidden { display: none !important; }
|
|
||||||
.mono { font-family: ui-monospace, monospace; }
|
|
||||||
.label { color: var(--text-muted); font-size: 12px; margin-bottom: 4px; }
|
|
||||||
.value { font-weight: 500; margin-bottom: 12px; }
|
|
||||||
|
|
||||||
.risk-safe { background: rgba(62, 175, 124, 0.1); color: var(--safe); }
|
|
||||||
.risk-guarded { background: rgba(230, 126, 34, 0.1); color: var(--guarded); }
|
|
||||||
.risk-dangerous { background: rgba(192, 57, 43, 0.1); color: var(--dangerous); }
|
|
||||||
|
|
||||||
</style>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<aside class="sidebar">
|
|
||||||
<div class="sidebar-header">HOMELAB OPERATOR</div>
|
|
||||||
<ul class="nav-list">
|
|
||||||
<li class="nav-item active" onclick="showView('dashboard', this)">
|
|
||||||
<span>Dashboard</span>
|
|
||||||
</li>
|
|
||||||
<li class="nav-item" onclick="showView('actions', this)">
|
|
||||||
<span>Action Queue</span>
|
|
||||||
</li>
|
|
||||||
<li class="nav-item" onclick="showView('nodes', this)">
|
|
||||||
<span>Nodes</span>
|
|
||||||
</li>
|
|
||||||
<li class="nav-item" onclick="showView('services', this)">
|
|
||||||
<span>Services</span>
|
|
||||||
</li>
|
|
||||||
<li class="nav-item" onclick="showView('deployments', this)">
|
|
||||||
<span>Deployments</span>
|
|
||||||
</li>
|
|
||||||
<li class="nav-item" onclick="showView('topology', this)">
|
|
||||||
<span>Topology</span>
|
|
||||||
</li>
|
|
||||||
<li class="nav-item" onclick="showView('events', this)">
|
|
||||||
<span>Events</span>
|
|
||||||
</li>
|
|
||||||
<li class="nav-item" onclick="showView('correlation', this)">
|
|
||||||
<span>Correlation</span>
|
|
||||||
</li>
|
|
||||||
<li class="nav-item" onclick="showView('recommendations', this)">
|
|
||||||
<span>Recommendations</span>
|
|
||||||
</li>
|
|
||||||
<li class="nav-item" onclick="showView('settings', this)">
|
|
||||||
<span>Settings</span>
|
|
||||||
</li>
|
|
||||||
</ul>
|
|
||||||
<div class="sidebar-footer">
|
|
||||||
<div id="summary-status">System Status: Loading...</div>
|
|
||||||
</div>
|
|
||||||
</aside>
|
|
||||||
|
|
||||||
<main class="main-content">
|
|
||||||
<div id="stale-banner" class="hidden" style="background:var(--error); color:white; padding:8px 24px; font-weight:bold; font-size:12px; text-align:center; letter-spacing:0.05em">
|
|
||||||
RUNTIME STATE IS STALE
|
|
||||||
</div>
|
|
||||||
<header>
|
|
||||||
<div style="display:flex; align-items:center; gap:20px">
|
|
||||||
<div class="view-title" id="current-view-title">Dashboard</div>
|
|
||||||
<select id="operator-mode" onchange="setOperatorMode(this.value)" style="background:var(--sidebar-color); border:1px solid var(--border-color); color:var(--accent-color); font-weight:bold; font-size:12px; padding:4px 8px">
|
|
||||||
<option value="observe">OBSERVE</option>
|
|
||||||
<option value="recommend">RECOMMEND</option>
|
|
||||||
<option value="approval" selected>APPROVAL</option>
|
|
||||||
<option value="autonomous">AUTONOMOUS</option>
|
|
||||||
<option value="maintenance">MAINTENANCE</option>
|
|
||||||
</select>
|
|
||||||
</div>
|
|
||||||
<div class="header-actions">
|
|
||||||
<button onclick="refreshData()">Refresh</button>
|
|
||||||
</div>
|
|
||||||
</header>
|
|
||||||
|
|
||||||
<div class="content-scroll">
|
|
||||||
<!-- Dashboard View -->
|
|
||||||
<div id="view-dashboard" class="view">
|
|
||||||
<div class="grid">
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-title">System Overview</div>
|
|
||||||
<div id="dashboard-summary" style="margin-top:20px"></div>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-title">Pending Actions</div>
|
|
||||||
<div id="dashboard-actions-summary" style="margin-top:20px"></div>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-title">Active Incidents</div>
|
|
||||||
<div id="dashboard-incidents" style="margin-top:20px"></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Actions View -->
|
|
||||||
<div id="view-actions" class="view hidden">
|
|
||||||
<div style="display:grid; grid-template-columns: 1fr 1fr; gap:24px">
|
|
||||||
<div>
|
|
||||||
<h3>Pending Approval</h3>
|
|
||||||
<div id="actions-pending" class="timeline"></div>
|
|
||||||
</div>
|
|
||||||
<div>
|
|
||||||
<h3>Active / History</h3>
|
|
||||||
<div id="actions-history" class="timeline"></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Nodes View -->
|
|
||||||
<div id="view-nodes" class="view hidden">
|
|
||||||
<div class="grid" id="nodes-list"></div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Services View -->
|
|
||||||
<div id="view-services" class="view hidden">
|
|
||||||
<div class="grid" id="services-list"></div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Deployments View -->
|
|
||||||
<div id="view-deployments" class="view hidden">
|
|
||||||
<div class="grid" id="deployments-list"></div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Topology View -->
|
|
||||||
<div id="view-topology" class="view hidden">
|
|
||||||
<div class="card" style="min-height:500px">
|
|
||||||
<div class="card-title">Runtime Topology</div>
|
|
||||||
<div id="topology-map" style="margin-top:20px; display:flex; flex-wrap:wrap; gap:40px; justify-content:center"></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Events View -->
|
|
||||||
<div id="view-events" class="view hidden">
|
|
||||||
<div class="timeline" id="events-timeline"></div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Correlation View -->
|
|
||||||
<div id="view-correlation" class="view hidden">
|
|
||||||
<div id="correlation-chains" class="grid"></div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Recommendations View -->
|
|
||||||
<div id="view-recommendations" class="view hidden">
|
|
||||||
<div class="grid" id="recommendations-list"></div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Settings View -->
|
|
||||||
<div id="view-settings" class="view hidden">
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-title">Configuration</div>
|
|
||||||
<div id="settings-content" style="margin-top:20px"></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</main>
|
|
||||||
|
|
||||||
<script>
|
|
||||||
let currentView = 'dashboard';
|
|
||||||
const pollInterval = 5000;
|
|
||||||
|
|
||||||
function showView(viewId, el) {
|
|
||||||
document.querySelectorAll('.view').forEach(v => v.classList.add('hidden'));
|
|
||||||
document.getElementById('view-' + viewId).classList.remove('hidden');
|
|
||||||
document.querySelectorAll('.nav-item').forEach(i => i.classList.remove('active'));
|
|
||||||
if (el) el.classList.add('active');
|
|
||||||
currentView = viewId;
|
|
||||||
document.getElementById('current-view-title').textContent = viewId.charAt(0).toUpperCase() + viewId.slice(1);
|
|
||||||
refreshData();
|
|
||||||
}
|
|
||||||
|
|
||||||
async function fetchData(endpoint) {
|
|
||||||
try {
|
|
||||||
const res = await fetch(endpoint, {cache: 'no-store'});
|
|
||||||
return await res.json();
|
|
||||||
} catch (e) {
|
|
||||||
console.error('Fetch error:', endpoint, e);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async function postData(endpoint, data) {
|
|
||||||
try {
|
|
||||||
const res = await fetch(endpoint, {
|
|
||||||
method: 'POST',
|
|
||||||
headers: {'Content-Type': 'application/json'},
|
|
||||||
body: JSON.stringify(data)
|
|
||||||
});
|
|
||||||
return await res.json();
|
|
||||||
} catch (e) {
|
|
||||||
console.error('Post error:', endpoint, e);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async function mutateAction(id, status) {
|
|
||||||
const res = await postData('/action/mutate', {id, status});
|
|
||||||
if (res && res.status === 'ok') {
|
|
||||||
refreshData();
|
|
||||||
} else {
|
|
||||||
alert('Mutation failed');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async function setOperatorMode(mode) {
|
|
||||||
console.log('Operator mode set to:', mode);
|
|
||||||
const res = await postData('/mode', {mode});
|
|
||||||
if (res && res.status === 'ok') {
|
|
||||||
console.log('Mode updated successfully');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function formatTime(ts) {
|
|
||||||
if (!ts) return 'N/A';
|
|
||||||
return new Date(ts * 1000).toLocaleString();
|
|
||||||
}
|
|
||||||
|
|
||||||
function getStatusClass(status) {
|
|
||||||
status = (status || '').toLowerCase();
|
|
||||||
if (['nominal', 'healthy', 'ok', 'up'].includes(status)) return 'status-nominal';
|
|
||||||
if (['degraded', 'warning'].includes(status)) return 'status-degraded';
|
|
||||||
if (['unstable'].includes(status)) return 'status-unstable';
|
|
||||||
if (['reconciling'].includes(status)) return 'status-reconciling';
|
|
||||||
if (['error', 'down', 'failed'].includes(status)) return 'status-error';
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
|
|
||||||
async function refreshData() {
|
|
||||||
// Refresh summary always
|
|
||||||
const summary = await fetchData('/summary');
|
|
||||||
if (summary) {
|
|
||||||
const statusEl = document.getElementById('summary-status');
|
|
||||||
statusEl.textContent = `System Status: ${summary.status.toUpperCase()}`;
|
|
||||||
statusEl.className = 'sidebar-footer ' + getStatusClass(summary.status);
|
|
||||||
|
|
||||||
// Handle stale state
|
|
||||||
const staleBanner = document.getElementById('stale-banner');
|
|
||||||
if (summary.stale) {
|
|
||||||
staleBanner.classList.remove('hidden');
|
|
||||||
staleBanner.textContent = `CRITICAL: Runtime state is STALE (Last update: ${formatTime(summary.last_update)})`;
|
|
||||||
} else {
|
|
||||||
staleBanner.classList.add('hidden');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (currentView === 'dashboard') {
|
|
||||||
const dashSummary = document.getElementById('dashboard-summary');
|
|
||||||
dashSummary.innerHTML = `
|
|
||||||
<div class="label">Nodes</div><div class="value">${summary.node_count}</div>
|
|
||||||
<div class="label">Services</div><div class="value">${summary.service_count}</div>
|
|
||||||
<div class="label">Last Update</div><div class="value">${formatTime(summary.last_update)}</div>
|
|
||||||
`;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (currentView === 'dashboard' || currentView === 'actions') {
|
|
||||||
const actions = await fetchData('/actions');
|
|
||||||
if (actions) {
|
|
||||||
if (currentView === 'dashboard') {
|
|
||||||
const dashActions = document.getElementById('dashboard-actions-summary');
|
|
||||||
const pendingCount = actions.pending.length;
|
|
||||||
dashActions.innerHTML = `
|
|
||||||
<div class="label">Pending</div><div class="value" style="color:var(--guarded)">${pendingCount}</div>
|
|
||||||
<div class="label">Running</div><div class="value" style="color:var(--reconciling)">${actions.running.length}</div>
|
|
||||||
`;
|
|
||||||
}
|
|
||||||
if (currentView === 'actions') {
|
|
||||||
const pendingEl = document.getElementById('actions-pending');
|
|
||||||
const historyEl = document.getElementById('actions-history');
|
|
||||||
|
|
||||||
pendingEl.innerHTML = actions.pending.map(a => `
|
|
||||||
<div class="card" style="margin-bottom:12px">
|
|
||||||
<div class="card-header">
|
|
||||||
<div class="card-title">${(a.action_type || a.type || 'unknown').toUpperCase()}</div>
|
|
||||||
<span class="badge risk-${a.risk_level}">${a.risk_level}</span>
|
|
||||||
</div>
|
|
||||||
<p>${a.description || a.action_type || 'No description'}</p>
|
|
||||||
<div class="label">Target</div><div class="value">${a.node || (a.target && a.target.node) || 'unknown'} ${(a.service || (a.target && a.target.service)) || ''}</div>
|
|
||||||
<div class="label">Confidence</div><div class="value">${Math.round((a.confidence || 0)*100)}%</div>
|
|
||||||
<div class="controls">
|
|
||||||
<button class="btn-primary" onclick="mutateAction('${a.id}', 'approved')">Approve</button>
|
|
||||||
<button onclick="mutateAction('${a.id}', 'rejected')">Reject</button>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
`).join('') || 'No pending actions.';
|
|
||||||
|
|
||||||
const history = [...actions.approved, ...actions.running, ...actions.completed, ...actions.failed, ...actions.rejected];
|
|
||||||
historyEl.innerHTML = history.sort((a,b) => (b.timestamp || b.updated_at || 0) - (a.timestamp || a.updated_at || 0)).map(a => `
|
|
||||||
<div class="event">
|
|
||||||
<div class="event-header">
|
|
||||||
<span>${(a.action_type || a.type || 'unknown').toUpperCase()}</span>
|
|
||||||
<span class="badge ${getStatusClass(a.status)}">${a.status}</span>
|
|
||||||
</div>
|
|
||||||
<div>${a.description || a.action_type || 'No description'}</div>
|
|
||||||
<small>${formatTime(a.timestamp || a.updated_at)} | Target: ${a.node || (a.target && a.target.node)}</small>
|
|
||||||
${a.status === 'approved' ? `<div class="controls"><button class="btn-primary" onclick="mutateAction('${a.id}', 'running')">Execute</button></div>` : ''}
|
|
||||||
${a.transition_history ? `
|
|
||||||
<div style="margin-top:8px; font-size:10px; color:var(--text-muted)">
|
|
||||||
<strong>Trace:</strong> ${a.transition_history.map(h => `${h.from}->${h.to}`).join(' → ')}
|
|
||||||
</div>
|
|
||||||
` : ''}
|
|
||||||
</div>
|
|
||||||
`).join('') || 'No history.';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (currentView === 'dashboard' || currentView === 'events') {
|
|
||||||
const incidents = await fetchData('/incidents');
|
|
||||||
if (currentView === 'dashboard') {
|
|
||||||
const dashIncidents = document.getElementById('dashboard-incidents');
|
|
||||||
if (!incidents || incidents.length === 0) {
|
|
||||||
dashIncidents.textContent = 'No active incidents.';
|
|
||||||
} else {
|
|
||||||
dashIncidents.innerHTML = incidents.map(inc => `
|
|
||||||
<div class="event ${inc.severity}">
|
|
||||||
<strong>${inc.severity.toUpperCase()}:</strong> ${inc.message}<br>
|
|
||||||
<small>${formatTime(inc.timestamp)} | Node: ${inc.node}</small>
|
|
||||||
</div>
|
|
||||||
`).join('');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (currentView === 'nodes') {
|
|
||||||
const nodes = await fetchData('/nodes');
|
|
||||||
const list = document.getElementById('nodes-list');
|
|
||||||
list.innerHTML = nodes.map(node => `
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<div class="card-title">${node.hostname}</div>
|
|
||||||
<span class="badge ${getStatusClass(node.health)}">${node.health}</span>
|
|
||||||
</div>
|
|
||||||
<div class="label">ID</div><div class="value mono">${node.id}</div>
|
|
||||||
<div class="label">Capabilities</div><div class="value">${node.capabilities.join(', ')}</div>
|
|
||||||
<div class="label">Connectivity</div><div class="value">${node.connectivity}</div>
|
|
||||||
<div class="label">Incidents (24h)</div><div class="value">${node.incidents}</div>
|
|
||||||
<div class="label">Last Seen</div><div class="value">${formatTime(node.last_seen)}</div>
|
|
||||||
<div class="label">Runtime Status</div><div class="value">${node.status}</div>
|
|
||||||
</div>
|
|
||||||
`).join('');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (currentView === 'services') {
|
|
||||||
const services = await fetchData('/services');
|
|
||||||
const list = document.getElementById('services-list');
|
|
||||||
list.innerHTML = services.map(svc => `
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<div class="card-title">${svc.name}</div>
|
|
||||||
<span class="badge ${getStatusClass(svc.health)}">${svc.health}</span>
|
|
||||||
</div>
|
|
||||||
<div class="label">State (Desired/Actual)</div><div class="value">${svc.desired_state} / ${svc.actual_state}</div>
|
|
||||||
<div class="label">Deployment</div><div class="value">${svc.deployment_state}</div>
|
|
||||||
<div class="label">Dependencies</div><div class="value">${svc.dependencies.join(', ') || 'None'}</div>
|
|
||||||
<div class="label">Recommendations</div><div class="value">${svc.recommendations.join(', ') || 'None'}</div>
|
|
||||||
</div>
|
|
||||||
`).join('');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (currentView === 'deployments') {
|
|
||||||
const deps = await fetchData('/deployments');
|
|
||||||
const list = document.getElementById('deployments-list');
|
|
||||||
list.innerHTML = deps.map(dep => `
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<div class="card-title">${dep.service}</div>
|
|
||||||
<span class="badge ${dep.status === 'failed' ? 'status-error' : 'status-reconciling'}">${dep.status}</span>
|
|
||||||
</div>
|
|
||||||
<div class="label">ID</div><div class="value mono">${dep.id}</div>
|
|
||||||
<div class="label">Stage</div><div class="value">${dep.stage}</div>
|
|
||||||
<div class="label">Diagnostics</div><div class="value">${dep.diagnostics || 'No data'}</div>
|
|
||||||
<div class="label">Resumable</div><div class="value">${dep.resumable ? 'Yes' : 'No'}</div>
|
|
||||||
${dep.resumable ? '<button class="btn-primary">Resume</button>' : ''}
|
|
||||||
</div>
|
|
||||||
`).join('');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (currentView === 'events') {
|
|
||||||
const events = await fetchData('/events');
|
|
||||||
const timeline = document.getElementById('events-timeline');
|
|
||||||
timeline.innerHTML = events.map(ev => `
|
|
||||||
<div class="event ${ev.severity}">
|
|
||||||
<div class="event-header">
|
|
||||||
<span>${ev.type.toUpperCase()}</span>
|
|
||||||
<span>${formatTime(ev.timestamp)}</span>
|
|
||||||
</div>
|
|
||||||
<div>${ev.message}</div>
|
|
||||||
<div class="label" style="margin-top:8px">Node: ${ev.node} ${ev.service ? '| Service: ' + ev.service : ''}</div>
|
|
||||||
</div>
|
|
||||||
`).join('');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (currentView === 'recommendations') {
|
|
||||||
const recs = await fetchData('/recommendations');
|
|
||||||
const list = document.getElementById('recommendations-list');
|
|
||||||
list.innerHTML = recs.map(rec => `
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<div class="card-title">${rec.title}</div>
|
|
||||||
<span class="badge risk-${rec.risk_level}">${rec.risk_level}</span>
|
|
||||||
</div>
|
|
||||||
<p>${rec.description}</p>
|
|
||||||
<div class="label">Confidence</div><div class="value">${Math.round(rec.confidence * 100)}%</div>
|
|
||||||
<div class="label">Autonomous Eligible</div><div class="value">${rec.autonomous_eligible ? 'Yes' : 'No'}</div>
|
|
||||||
<div class="label">Blocked Actions</div><div class="value">${rec.blocked_actions.join(', ') || 'None'}</div>
|
|
||||||
<div class="controls">
|
|
||||||
<button class="btn-primary" ${rec.risk_level === 'dangerous' ? 'style="background:var(--dangerous)"' : ''}>Approve Action</button>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
`).join('');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (currentView === 'topology') {
|
|
||||||
const nodes = await fetchData('/nodes');
|
|
||||||
const services = await fetchData('/services');
|
|
||||||
const topMap = document.getElementById('topology-map');
|
|
||||||
if (nodes && services) {
|
|
||||||
topMap.innerHTML = nodes.map(node => {
|
|
||||||
const nodeServices = services.filter(s => s.node === node.hostname || s.node === node.id);
|
|
||||||
return `
|
|
||||||
<div class="card" style="width:250px; border: 1px solid ${node.health === 'nominal' ? 'var(--border-color)' : 'var(--error)'}">
|
|
||||||
<div class="card-header">
|
|
||||||
<div class="card-title">${node.hostname}</div>
|
|
||||||
<span class="badge ${getStatusClass(node.health)}">${node.health}</span>
|
|
||||||
</div>
|
|
||||||
<div class="label">Capabilities</div>
|
|
||||||
<div class="value" style="font-size:11px">${node.capabilities.join(', ')}</div>
|
|
||||||
<div class="label">Services</div>
|
|
||||||
<div style="font-size:12px; margin-bottom:10px">
|
|
||||||
${nodeServices.length > 0 ? nodeServices.map(s => `
|
|
||||||
<div style="display:flex; justify-content:space-between; margin-bottom:4px; padding:4px; background:rgba(255,255,255,0.03)">
|
|
||||||
<span>${s.name}</span>
|
|
||||||
<span class="${getStatusClass(s.health)}" style="font-size:10px">${s.health}</span>
|
|
||||||
</div>
|
|
||||||
${s.dependencies.length > 0 ? `<div style="font-size:9px; color:var(--text-muted); margin-left:8px; margin-bottom:4px">dep: ${s.dependencies.join(', ')}</div>` : ''}
|
|
||||||
`).join('') : '<div class="value">None</div>'}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
`;
|
|
||||||
}).join('');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (currentView === 'correlation') {
|
|
||||||
const incidents = await fetchData('/incidents');
|
|
||||||
const actions = await fetchData('/actions');
|
|
||||||
const list = document.getElementById('correlation-chains');
|
|
||||||
if (incidents && actions) {
|
|
||||||
const allActions = Object.values(actions).flat();
|
|
||||||
list.innerHTML = incidents.map(inc => {
|
|
||||||
const related = allActions.filter(a => a.correlation_chain && a.correlation_chain.includes(inc.id));
|
|
||||||
return `
|
|
||||||
<div class="card">
|
|
||||||
<div class="card-header">
|
|
||||||
<div class="card-title">Incident: ${inc.id || 'INC-001'}</div>
|
|
||||||
<span class="badge status-error">Active</span>
|
|
||||||
</div>
|
|
||||||
<p>${inc.message}</p>
|
|
||||||
<div class="label">Related Actions</div>
|
|
||||||
${related.map(a => `
|
|
||||||
<div class="event" style="margin-top:5px">
|
|
||||||
<strong>${a.type}</strong> (${a.status})<br>
|
|
||||||
<small>${a.description}</small>
|
|
||||||
</div>
|
|
||||||
`).join('') || '<div class="value">No actions yet</div>'}
|
|
||||||
</div>
|
|
||||||
`;
|
|
||||||
}).join('');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (currentView === 'settings') {
|
|
||||||
const config = await fetchData('/config');
|
|
||||||
const content = document.getElementById('settings-content');
|
|
||||||
content.innerHTML = `
|
|
||||||
<div class="label">Auto Mode</div>
|
|
||||||
<div class="value">${config.auto_mode ? 'Enabled' : 'Disabled'}</div>
|
|
||||||
<div class="label">Action Thresholds</div>
|
|
||||||
<div class="value mono">${JSON.stringify(config.action_thresholds, null, 2)}</div>
|
|
||||||
<div class="label">Telegram Integration</div>
|
|
||||||
<div class="value" style="color:var(--text-muted)">Ready for mobile approval flows. Hook: /api/v1/telegram/webhook</div>
|
|
||||||
<button onclick="alert('Settings update not implemented in this demo')">Edit Configuration</button>
|
|
||||||
`;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Initial load
|
|
||||||
refreshData();
|
|
||||||
// Poll for updates
|
|
||||||
setInterval(refreshData, pollInterval);
|
|
||||||
|
|
||||||
</script>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
|
@ -1,264 +0,0 @@
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
STATE_DIR = Path(os.getenv("HOMELAB_STATE_ROOT", "/opt/homelab/state"))
|
|
||||||
EVENTS_DIR = Path(os.getenv("HOMELAB_EVENTS_ROOT", "/opt/homelab/events"))
|
|
||||||
WORLD_DIR = Path(os.getenv("HOMELAB_WORLD_ROOT", "/opt/homelab/world"))
|
|
||||||
ACTIONS_DIR = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
|
|
||||||
CONFIG_DIR = Path(os.getenv("HOMELAB_CONFIG_ROOT", "/opt/homelab/config"))
|
|
||||||
|
|
||||||
STATIC_DIR = Path(__file__).parent
|
|
||||||
|
|
||||||
DEFAULT_CONFIG = {
|
|
||||||
"operator_mode": "approval",
|
|
||||||
"auto_mode": True,
|
|
||||||
"action_thresholds": {
|
|
||||||
"restart_ha": 0.8,
|
|
||||||
"check_network": 0.9,
|
|
||||||
},
|
|
||||||
"default_threshold": 0.9,
|
|
||||||
"allowed_auto_actions": ["restart_ha"],
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def read_json_file(path, default=None):
|
|
||||||
if not path.exists():
|
|
||||||
return default if default is not None else []
|
|
||||||
try:
|
|
||||||
return json.loads(path.read_text())
|
|
||||||
except Exception:
|
|
||||||
return default if default is not None else []
|
|
||||||
|
|
||||||
|
|
||||||
def get_config():
|
|
||||||
config_path = STATE_DIR / "operator-config.json"
|
|
||||||
if config_path.exists():
|
|
||||||
return read_json_file(config_path, DEFAULT_CONFIG)
|
|
||||||
return DEFAULT_CONFIG
|
|
||||||
|
|
||||||
|
|
||||||
def save_config(config):
|
|
||||||
STATE_DIR.mkdir(parents=True, exist_ok=True)
|
|
||||||
(STATE_DIR / "operator-config.json").write_text(json.dumps(config, indent=2))
|
|
||||||
|
|
||||||
|
|
||||||
def current_nodes():
|
|
||||||
return read_json_file(WORLD_DIR / "nodes.json")
|
|
||||||
|
|
||||||
|
|
||||||
def current_services():
|
|
||||||
return read_json_file(WORLD_DIR / "services.json")
|
|
||||||
|
|
||||||
|
|
||||||
def current_deployments():
|
|
||||||
return read_json_file(WORLD_DIR / "deployments.json")
|
|
||||||
|
|
||||||
|
|
||||||
def current_incidents():
|
|
||||||
return read_json_file(WORLD_DIR / "incidents.json")
|
|
||||||
|
|
||||||
|
|
||||||
def current_recommendations():
|
|
||||||
return read_json_file(WORLD_DIR / "recommendations.json")
|
|
||||||
|
|
||||||
|
|
||||||
def current_summary():
|
|
||||||
summary = read_json_file(WORLD_DIR / "runtime-summary.json", default={})
|
|
||||||
if summary:
|
|
||||||
# Check for staleness
|
|
||||||
mtime = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
|
|
||||||
summary["last_update"] = mtime
|
|
||||||
summary["stale"] = (time.time() - mtime) > 60 # Stale if older than 60s
|
|
||||||
return summary
|
|
||||||
|
|
||||||
|
|
||||||
def current_events():
|
|
||||||
return read_json_file(WORLD_DIR / "events.json", default=[])
|
|
||||||
|
|
||||||
|
|
||||||
def current_actions():
|
|
||||||
actions = {}
|
|
||||||
statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
|
|
||||||
for status in statuses:
|
|
||||||
actions[status] = []
|
|
||||||
status_dir = ACTIONS_DIR / status
|
|
||||||
if status_dir.exists():
|
|
||||||
for f in status_dir.glob("*.json"):
|
|
||||||
data = read_json_file(f)
|
|
||||||
if data:
|
|
||||||
# Injects some metadata for UI
|
|
||||||
data["id"] = data.get("action_id") or f.stem
|
|
||||||
data["status"] = status
|
|
||||||
actions[status].append(data)
|
|
||||||
return actions
|
|
||||||
|
|
||||||
|
|
||||||
def mutate_action(action_id, target_status):
|
|
||||||
statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
|
|
||||||
if target_status not in statuses:
|
|
||||||
return False, f"Invalid target status: {target_status}"
|
|
||||||
|
|
||||||
# Find where the action is
|
|
||||||
source_path = None
|
|
||||||
current_status = None
|
|
||||||
for status in statuses:
|
|
||||||
p = ACTIONS_DIR / status / f"{action_id}.json"
|
|
||||||
if p.exists():
|
|
||||||
source_path = p
|
|
||||||
current_status = status
|
|
||||||
break
|
|
||||||
|
|
||||||
if not source_path:
|
|
||||||
return False, f"Action {action_id} not found"
|
|
||||||
|
|
||||||
target_dir = ACTIONS_DIR / target_status
|
|
||||||
target_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
target_path = target_dir / f"{action_id}.json"
|
|
||||||
|
|
||||||
try:
|
|
||||||
data = json.loads(source_path.read_text())
|
|
||||||
data["status"] = target_status
|
|
||||||
data["updated_at"] = time.time()
|
|
||||||
|
|
||||||
# Keep history of transitions
|
|
||||||
history = data.get("transition_history", [])
|
|
||||||
history.append({
|
|
||||||
"from": current_status,
|
|
||||||
"to": target_status,
|
|
||||||
"timestamp": time.time()
|
|
||||||
})
|
|
||||||
data["transition_history"] = history
|
|
||||||
|
|
||||||
target_path.write_text(json.dumps(data, indent=2))
|
|
||||||
if source_path != target_path:
|
|
||||||
source_path.unlink()
|
|
||||||
return True, "Success"
|
|
||||||
except Exception as e:
|
|
||||||
return False, str(e)
|
|
||||||
|
|
||||||
|
|
||||||
def send_json(status, payload, handler):
|
|
||||||
body = (json.dumps(payload) + "\n").encode("utf-8")
|
|
||||||
handler.send_response(status)
|
|
||||||
handler.send_header("Content-Type", "application/json")
|
|
||||||
handler.send_header("Content-Length", str(len(body)))
|
|
||||||
handler.end_headers()
|
|
||||||
handler.wfile.write(body)
|
|
||||||
|
|
||||||
|
|
||||||
class Handler(BaseHTTPRequestHandler):
|
|
||||||
def do_GET(self):
|
|
||||||
if self.path == "/config":
|
|
||||||
send_json(200, get_config(), self)
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.path == "/nodes":
|
|
||||||
send_json(200, current_nodes(), self)
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.path == "/services":
|
|
||||||
send_json(200, current_services(), self)
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.path == "/deployments":
|
|
||||||
send_json(200, current_deployments(), self)
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.path == "/incidents":
|
|
||||||
send_json(200, current_incidents(), self)
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.path == "/recommendations":
|
|
||||||
send_json(200, current_recommendations(), self)
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.path == "/summary":
|
|
||||||
send_json(200, current_summary(), self)
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.path == "/events":
|
|
||||||
send_json(200, current_events(), self)
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.path == "/actions":
|
|
||||||
send_json(200, current_actions(), self)
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.path in ("/", "/index.html"):
|
|
||||||
body = (STATIC_DIR / "index.html").read_bytes()
|
|
||||||
self.send_response(200)
|
|
||||||
self.send_header("Content-Type", "text/html; charset=utf-8")
|
|
||||||
self.send_header("Content-Length", str(len(body)))
|
|
||||||
self.end_headers()
|
|
||||||
self.wfile.write(body)
|
|
||||||
return
|
|
||||||
|
|
||||||
self.send_error(404)
|
|
||||||
|
|
||||||
def do_POST(self):
|
|
||||||
if self.path not in (
|
|
||||||
"/config",
|
|
||||||
"/action/mutate",
|
|
||||||
"/mode",
|
|
||||||
):
|
|
||||||
self.send_error(404)
|
|
||||||
return
|
|
||||||
|
|
||||||
length = int(self.headers.get("Content-Length", "0"))
|
|
||||||
raw_body = self.rfile.read(length).decode("utf-8")
|
|
||||||
try:
|
|
||||||
payload = json.loads(raw_body)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
self.send_error(400, "Invalid JSON")
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.path == "/config":
|
|
||||||
config = get_config()
|
|
||||||
config.update(payload)
|
|
||||||
save_config(config)
|
|
||||||
send_json(200, {"status": "ok"}, self)
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.path == "/mode":
|
|
||||||
mode = payload.get("mode")
|
|
||||||
if not mode:
|
|
||||||
self.send_error(400, "mode is required")
|
|
||||||
return
|
|
||||||
config = get_config()
|
|
||||||
config["operator_mode"] = mode
|
|
||||||
save_config(config)
|
|
||||||
send_json(200, {"status": "ok"}, self)
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.path == "/action/mutate":
|
|
||||||
action_id = payload.get("id")
|
|
||||||
target = payload.get("status")
|
|
||||||
if not action_id or not target:
|
|
||||||
self.send_error(400, "id and status are required")
|
|
||||||
return
|
|
||||||
success, msg = mutate_action(action_id, target)
|
|
||||||
if success:
|
|
||||||
send_json(200, {"status": "ok"}, self)
|
|
||||||
else:
|
|
||||||
self.send_error(500, msg)
|
|
||||||
return
|
|
||||||
|
|
||||||
def log_message(self, format, *args):
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# Ensure directories exist
|
|
||||||
for d in [STATE_DIR, EVENTS_DIR, WORLD_DIR, ACTIONS_DIR, CONFIG_DIR]:
|
|
||||||
d.mkdir(parents=True, exist_ok=True)
|
|
||||||
for s in ["pending", "approved", "running", "completed", "failed", "rejected"]:
|
|
||||||
(ACTIONS_DIR / s).mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
port = int(os.getenv("PORT", "8080"))
|
|
||||||
print(f"Operator Control Plane starting on 0.0.0.0:{port}")
|
|
||||||
server = ThreadingHTTPServer(("0.0.0.0", port), Handler)
|
|
||||||
server.serve_forever()
|
|
||||||
|
|
@ -1,16 +0,0 @@
|
||||||
FROM python:3.11-slim
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
# No extra dependencies needed beyond standard library for the current script
|
|
||||||
# But we might need them if we decide to use libraries later.
|
|
||||||
|
|
||||||
COPY src/stability_agent.py .
|
|
||||||
COPY healthcheck.sh .
|
|
||||||
RUN chmod +x healthcheck.sh
|
|
||||||
|
|
||||||
# Create the expected directories
|
|
||||||
RUN mkdir -p /opt/homelab/state /opt/homelab/events
|
|
||||||
|
|
||||||
# Run the agent
|
|
||||||
CMD ["python", "stability_agent.py"]
|
|
||||||
|
|
@ -1,63 +0,0 @@
|
||||||
### Stability Agent
|
|
||||||
|
|
||||||
A lightweight filesystem-first watchdog and observer agent for CHELSTY.
|
|
||||||
|
|
||||||
#### Features
|
|
||||||
|
|
||||||
* **Continuous Monitoring**: Runs as a background service.
|
|
||||||
* **Docker Inspection**: Checks container status via read-only Docker socket.
|
|
||||||
* **Disk Usage**: Monitors local disk utilization.
|
|
||||||
* **Tailscale Check**: Verifies Tailscale availability.
|
|
||||||
* **MQTT Reachability**: Checks connectivity to the local MQTT broker.
|
|
||||||
* **Zigbee2MQTT Monitoring**: Specifically monitors the Zigbee2MQTT container.
|
|
||||||
* **Redis Publishing**: (Optional) Publishes runtime state and events to a central Redis server.
|
|
||||||
* **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/chelsty/`.
|
|
||||||
* **State Reporting**: Writes heartbeat and status summary to `/opt/homelab/state/`.
|
|
||||||
|
|
||||||
#### Configuration
|
|
||||||
|
|
||||||
Environment variables:
|
|
||||||
|
|
||||||
* `STABILITY_CHECK_INTERVAL`: Interval between checks in seconds (default: 60).
|
|
||||||
* `DISK_THRESHOLD_PCT`: Disk usage percentage to trigger warning (default: 90).
|
|
||||||
* `MQTT_HOST`: Hostname or IP of the MQTT broker to check.
|
|
||||||
* `MQTT_PORT`: Port of the MQTT broker (default: 1883).
|
|
||||||
* `REDIS_HOST`: Hostname or IP of the Redis server (e.g., PIHA at 100.108.208.3).
|
|
||||||
* `REDIS_PORT`: Port of the Redis server (default: 6379).
|
|
||||||
* `REDIS_ENABLED`: Whether to enable Redis publishing (default: true if REDIS_HOST is set).
|
|
||||||
* `NODE_NAME`: Name of the current node (default: chelsty).
|
|
||||||
|
|
||||||
#### Verification
|
|
||||||
|
|
||||||
You can verify the Redis publishing using `redis-cli`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Check node state
|
|
||||||
redis-cli -h 100.108.208.3 HGETALL homelab:nodes:chelsty
|
|
||||||
|
|
||||||
# Check service discovery
|
|
||||||
redis-cli -h 100.108.208.3 HGETALL homelab:services:chelsty:stability-agent
|
|
||||||
|
|
||||||
# Check event stream
|
|
||||||
redis-cli -h 100.108.208.3 XRANGE homelab:events - +
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Safety
|
|
||||||
|
|
||||||
* No automatic restarts are performed.
|
|
||||||
* Read-only access to Docker socket.
|
|
||||||
* No configuration mutation.
|
|
||||||
* No secrets stored in the repository.
|
|
||||||
|
|
||||||
#### Event Schema
|
|
||||||
|
|
||||||
Events are written as JSON lines with the following fields:
|
|
||||||
|
|
||||||
* `id`: Unique event UUID.
|
|
||||||
* `timestamp`: ISO 8601 timestamp (UTC).
|
|
||||||
* `node`: `chelsty`.
|
|
||||||
* `source`: `stability-agent`.
|
|
||||||
* `type`: Type of event (e.g., `disk_usage_high`, `containers_not_running`).
|
|
||||||
* `severity`: `info`, `warning`, or `error`.
|
|
||||||
* `message`: Human-readable description.
|
|
||||||
* `details`: Object containing specific check results.
|
|
||||||
|
|
@ -1,28 +0,0 @@
|
||||||
services:
|
|
||||||
stability-agent:
|
|
||||||
build: .
|
|
||||||
container_name: stability-agent
|
|
||||||
restart: unless-stopped
|
|
||||||
volumes:
|
|
||||||
- /opt/homelab:/opt/homelab
|
|
||||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
|
||||||
- /var/run/tailscale/tailscaled.sock:/var/run/tailscale/tailscaled.sock:ro
|
|
||||||
environment:
|
|
||||||
- STABILITY_CHECK_INTERVAL=${STABILITY_CHECK_INTERVAL:-60}
|
|
||||||
- DISK_THRESHOLD_PCT=${DISK_THRESHOLD_PCT:-90}
|
|
||||||
- MQTT_HOST=${MQTT_HOST}
|
|
||||||
- MQTT_PORT=${MQTT_PORT:-1883}
|
|
||||||
- REDIS_HOST=${REDIS_HOST:-100.108.208.3}
|
|
||||||
- REDIS_PORT=${REDIS_PORT:-6379}
|
|
||||||
- REDIS_ENABLED=${REDIS_ENABLED:-true}
|
|
||||||
- NODE_NAME=${NODE_NAME:-chelsty}
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "/bin/sh", "/app/healthcheck.sh"]
|
|
||||||
interval: 1m
|
|
||||||
timeout: 10s
|
|
||||||
retries: 3
|
|
||||||
|
|
||||||
volumes:
|
|
||||||
opt_homelab:
|
|
||||||
external: true
|
|
||||||
name: homelab_data # This might vary, but /opt/homelab mount is preferred as direct path.
|
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
STABILITY_CHECK_INTERVAL=60
|
|
||||||
DISK_THRESHOLD_PCT=90
|
|
||||||
MQTT_HOST=mosquitto
|
|
||||||
MQTT_PORT=1883
|
|
||||||
REDIS_HOST=100.108.208.3
|
|
||||||
REDIS_PORT=6379
|
|
||||||
REDIS_ENABLED=true
|
|
||||||
NODE_NAME=chelsty
|
|
||||||
|
|
@ -1,25 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
|
|
||||||
HEARTBEAT_FILE="/opt/homelab/state/stability-agent.heartbeat"
|
|
||||||
MAX_AGE_SECONDS=300 # 5 minutes
|
|
||||||
|
|
||||||
if [ ! -f "$HEARTBEAT_FILE" ]; then
|
|
||||||
echo "Heartbeat file missing"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Get current time in seconds
|
|
||||||
NOW=$(date +%s)
|
|
||||||
|
|
||||||
# Get file modification time in seconds
|
|
||||||
# Busybox stat (standard in alpine/slim) uses -c %Y
|
|
||||||
FILE_TIME=$(stat -c %Y "$HEARTBEAT_FILE")
|
|
||||||
|
|
||||||
AGE=$((NOW - FILE_TIME))
|
|
||||||
|
|
||||||
if [ "$AGE" -gt "$MAX_AGE_SECONDS" ]; then
|
|
||||||
echo "Heartbeat is too old: ${AGE}s"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
exit 0
|
|
||||||
|
|
@ -1,24 +0,0 @@
|
||||||
service:
|
|
||||||
name: stability-agent
|
|
||||||
owner_node: chelsty
|
|
||||||
exposure: private
|
|
||||||
dependencies: []
|
|
||||||
healthcheck:
|
|
||||||
type: custom
|
|
||||||
interval: 60s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 3
|
|
||||||
restart_policy: unless-stopped
|
|
||||||
persistence:
|
|
||||||
paths:
|
|
||||||
- /opt/homelab/state
|
|
||||||
- /opt/homelab/events
|
|
||||||
runtime:
|
|
||||||
directories:
|
|
||||||
- /opt/homelab/state
|
|
||||||
- /opt/homelab/events
|
|
||||||
env_vars:
|
|
||||||
- STABILITY_CHECK_INTERVAL
|
|
||||||
- DISK_THRESHOLD_PCT
|
|
||||||
- MQTT_HOST
|
|
||||||
- MQTT_PORT
|
|
||||||
|
|
@ -1,333 +0,0 @@
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import datetime
|
|
||||||
import uuid
|
|
||||||
import socket
|
|
||||||
import shutil
|
|
||||||
import http.client
|
|
||||||
|
|
||||||
# Configuration from environment
|
|
||||||
CHECK_INTERVAL = int(os.environ.get("STABILITY_CHECK_INTERVAL", "60"))
|
|
||||||
DISK_THRESHOLD_PCT = float(os.environ.get("DISK_THRESHOLD_PCT", "90.0"))
|
|
||||||
MQTT_HOST = os.environ.get("MQTT_HOST")
|
|
||||||
MQTT_PORT = int(os.environ.get("MQTT_PORT", "1883"))
|
|
||||||
NODE_NAME = os.environ.get("NODE_NAME", "chelsty")
|
|
||||||
REDIS_HOST = os.environ.get("REDIS_HOST")
|
|
||||||
REDIS_PORT = int(os.environ.get("REDIS_PORT", "6379"))
|
|
||||||
REDIS_ENABLED = os.environ.get("REDIS_ENABLED", "true").lower() == "true" if REDIS_HOST else False
|
|
||||||
SOURCE = "stability-agent"
|
|
||||||
|
|
||||||
STATE_DIR = "/opt/homelab/state"
|
|
||||||
EVENTS_BASE_DIR = "/opt/homelab/events"
|
|
||||||
HEARTBEAT_FILE = os.path.join(STATE_DIR, "stability-agent.heartbeat")
|
|
||||||
STATUS_FILE = os.path.join(STATE_DIR, "stability-agent.json")
|
|
||||||
|
|
||||||
def get_timestamp():
|
|
||||||
return datetime.datetime.utcnow().isoformat() + "Z"
|
|
||||||
|
|
||||||
def get_datestamp():
|
|
||||||
return datetime.datetime.utcnow().strftime("%Y-%m-%d")
|
|
||||||
|
|
||||||
def emit_event(event_type, severity, message, service=None, details=None):
|
|
||||||
timestamp = get_timestamp()
|
|
||||||
event = {
|
|
||||||
"id": str(uuid.uuid4()),
|
|
||||||
"timestamp": timestamp,
|
|
||||||
"node": NODE_NAME,
|
|
||||||
"source": SOURCE,
|
|
||||||
"type": event_type,
|
|
||||||
"severity": severity,
|
|
||||||
"message": message,
|
|
||||||
"details": details or {}
|
|
||||||
}
|
|
||||||
if service:
|
|
||||||
event["service"] = service
|
|
||||||
|
|
||||||
date_str = get_datestamp()
|
|
||||||
event_dir = os.path.join(EVENTS_BASE_DIR, date_str, NODE_NAME)
|
|
||||||
try:
|
|
||||||
os.makedirs(event_dir, exist_ok=True)
|
|
||||||
event_file = os.path.join(event_dir, "events.jsonl")
|
|
||||||
with open(event_file, "a") as f:
|
|
||||||
f.write(json.dumps(event) + "\n")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Failed to write event to filesystem: {e}")
|
|
||||||
|
|
||||||
# Redis publishing
|
|
||||||
if REDIS_ENABLED and redis_client:
|
|
||||||
try:
|
|
||||||
redis_client.xadd("homelab:events", {
|
|
||||||
"node": NODE_NAME,
|
|
||||||
"type": event_type,
|
|
||||||
"severity": severity,
|
|
||||||
"timestamp": str(int(time.time())),
|
|
||||||
"message": message,
|
|
||||||
"details": json.dumps(details or {})
|
|
||||||
})
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Failed to publish event to Redis: {e}")
|
|
||||||
# Do not crash, already logged to filesystem
|
|
||||||
|
|
||||||
print(f"[{severity}] {message}")
|
|
||||||
|
|
||||||
def check_disk():
|
|
||||||
total, used, free = shutil.disk_usage("/")
|
|
||||||
percent = (used / total) * 100
|
|
||||||
details = {
|
|
||||||
"total_gb": total // (2**30),
|
|
||||||
"used_gb": used // (2**30),
|
|
||||||
"free_gb": free // (2**30),
|
|
||||||
"percent": round(percent, 2)
|
|
||||||
}
|
|
||||||
|
|
||||||
if percent > DISK_THRESHOLD_PCT:
|
|
||||||
emit_event("disk_usage_high", "warning", f"Disk usage is high: {details['percent']}%", details=details)
|
|
||||||
|
|
||||||
return details
|
|
||||||
|
|
||||||
class DockerClient:
|
|
||||||
def __init__(self, socket_path="/var/run/docker.sock"):
|
|
||||||
self.socket_path = socket_path
|
|
||||||
|
|
||||||
def _request(self, path):
|
|
||||||
class UnixHTTPConnection(http.client.HTTPConnection):
|
|
||||||
def __init__(self, socket_path):
|
|
||||||
super().__init__("localhost")
|
|
||||||
self.socket_path = socket_path
|
|
||||||
def connect(self):
|
|
||||||
self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
|
|
||||||
self.sock.settimeout(5.0)
|
|
||||||
self.sock.connect(self.socket_path)
|
|
||||||
|
|
||||||
if not os.path.exists(self.socket_path):
|
|
||||||
return None
|
|
||||||
|
|
||||||
conn = UnixHTTPConnection(self.socket_path)
|
|
||||||
try:
|
|
||||||
conn.request("GET", path)
|
|
||||||
res = conn.getresponse()
|
|
||||||
if res.status == 200:
|
|
||||||
return json.loads(res.read().decode())
|
|
||||||
return None
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Docker API error: {e}")
|
|
||||||
return None
|
|
||||||
finally:
|
|
||||||
conn.close()
|
|
||||||
|
|
||||||
def get_containers(self):
|
|
||||||
return self._request("/containers/json?all=1")
|
|
||||||
|
|
||||||
def check_docker():
|
|
||||||
client = DockerClient()
|
|
||||||
containers = client.get_containers()
|
|
||||||
if containers is None:
|
|
||||||
emit_event("docker_socket_error", "error", "Could not connect to Docker socket or socket missing")
|
|
||||||
return {"status": "error", "error": "Could not connect to Docker socket"}
|
|
||||||
|
|
||||||
summary = []
|
|
||||||
unhealthy_containers = []
|
|
||||||
for c in containers:
|
|
||||||
state = c.get("State", "")
|
|
||||||
status = c.get("Status", "")
|
|
||||||
name = c.get("Names", ["unknown"])[0].lstrip("/")
|
|
||||||
|
|
||||||
container_info = {
|
|
||||||
"name": name,
|
|
||||||
"state": state,
|
|
||||||
"status": status
|
|
||||||
}
|
|
||||||
summary.append(container_info)
|
|
||||||
|
|
||||||
if state != "running":
|
|
||||||
unhealthy_containers.append(container_info)
|
|
||||||
|
|
||||||
if unhealthy_containers:
|
|
||||||
names = [c["name"] for c in unhealthy_containers]
|
|
||||||
# Only emit warning for containers that should be running?
|
|
||||||
# For now, we report any non-running container found by Docker.
|
|
||||||
emit_event("containers_not_running", "warning", f"Some containers are not running: {', '.join(names)}", details={"containers": unhealthy_containers})
|
|
||||||
|
|
||||||
return {"status": "ok", "containers": summary}
|
|
||||||
|
|
||||||
def check_tailscale():
|
|
||||||
# Check for tailscale socket or interface
|
|
||||||
socket_path = "/var/run/tailscale/tailscaled.sock"
|
|
||||||
socket_available = os.path.exists(socket_path)
|
|
||||||
interface_available = os.path.exists("/sys/class/net/tailscale0")
|
|
||||||
|
|
||||||
return {
|
|
||||||
"available": socket_available or interface_available,
|
|
||||||
"details": {
|
|
||||||
"socket": socket_available,
|
|
||||||
"interface": interface_available
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def check_mqtt():
|
|
||||||
if not MQTT_HOST:
|
|
||||||
return {"configured": False}
|
|
||||||
|
|
||||||
try:
|
|
||||||
with socket.create_connection((MQTT_HOST, MQTT_PORT), timeout=5):
|
|
||||||
return {"configured": True, "reachable": True}
|
|
||||||
except Exception as e:
|
|
||||||
emit_event("mqtt_unreachable", "error", f"MQTT broker at {MQTT_HOST}:{MQTT_PORT} is unreachable", details={"error": str(e)})
|
|
||||||
return {"configured": True, "reachable": False, "error": str(e)}
|
|
||||||
|
|
||||||
class RedisClient:
|
|
||||||
def __init__(self, host, port=6379):
|
|
||||||
self.host = host
|
|
||||||
self.port = port
|
|
||||||
self.sock = None
|
|
||||||
|
|
||||||
def _connect(self):
|
|
||||||
if self.sock:
|
|
||||||
try:
|
|
||||||
# Check if socket is still alive
|
|
||||||
self.sock.send(b"", socket.MSG_DONTWAIT)
|
|
||||||
return True
|
|
||||||
except (socket.error, AttributeError):
|
|
||||||
self.sock = None
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.sock = socket.create_connection((self.host, self.port), timeout=2)
|
|
||||||
self.sock.settimeout(2.0)
|
|
||||||
return True
|
|
||||||
except Exception as e:
|
|
||||||
self.sock = None
|
|
||||||
print(f"Redis connection error: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _send_command(self, *args):
|
|
||||||
if not self._connect():
|
|
||||||
return False
|
|
||||||
|
|
||||||
# RESP array
|
|
||||||
cmd = f"*{len(args)}\r\n"
|
|
||||||
for arg in args:
|
|
||||||
s_arg = str(arg)
|
|
||||||
cmd += f"${len(s_arg.encode('utf-8'))}\r\n{s_arg}\r\n"
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.sock.sendall(cmd.encode('utf-8'))
|
|
||||||
# Basic response reading
|
|
||||||
resp = self.sock.recv(4096)
|
|
||||||
if resp.startswith(b"-"):
|
|
||||||
print(f"Redis error response: {resp.decode().strip()}")
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Redis send error: {e}")
|
|
||||||
if self.sock:
|
|
||||||
self.sock.close()
|
|
||||||
self.sock = None
|
|
||||||
return False
|
|
||||||
|
|
||||||
def hset(self, key, mapping):
|
|
||||||
args = ["HSET", key]
|
|
||||||
for k, v in mapping.items():
|
|
||||||
args.extend([k, v])
|
|
||||||
return self._send_command(*args)
|
|
||||||
|
|
||||||
def xadd(self, key, fields):
|
|
||||||
args = ["XADD", key, "*"]
|
|
||||||
for k, v in fields.items():
|
|
||||||
args.extend([k, v])
|
|
||||||
return self._send_command(*args)
|
|
||||||
|
|
||||||
redis_client = RedisClient(REDIS_HOST, REDIS_PORT) if REDIS_ENABLED else None
|
|
||||||
|
|
||||||
def main():
|
|
||||||
print(f"Starting stability-agent on {NODE_NAME}...")
|
|
||||||
|
|
||||||
# Ensure directories exist
|
|
||||||
os.makedirs(STATE_DIR, exist_ok=True)
|
|
||||||
os.makedirs(EVENTS_BASE_DIR, exist_ok=True)
|
|
||||||
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
status = {
|
|
||||||
"timestamp": get_timestamp(),
|
|
||||||
"node": NODE_NAME,
|
|
||||||
"checks": {}
|
|
||||||
}
|
|
||||||
|
|
||||||
status["checks"]["disk"] = check_disk()
|
|
||||||
status["checks"]["docker"] = check_docker()
|
|
||||||
status["checks"]["tailscale"] = check_tailscale()
|
|
||||||
status["checks"]["mqtt"] = check_mqtt()
|
|
||||||
|
|
||||||
# Zigbee2MQTT container check
|
|
||||||
z2m_present = False
|
|
||||||
z2m_running = False
|
|
||||||
if status["checks"]["docker"]["status"] == "ok":
|
|
||||||
for c in status["checks"]["docker"]["containers"]:
|
|
||||||
if "zigbee2mqtt" in c["name"]:
|
|
||||||
z2m_present = True
|
|
||||||
if c["state"] == "running":
|
|
||||||
z2m_running = True
|
|
||||||
|
|
||||||
status["checks"]["zigbee2mqtt"] = {
|
|
||||||
"present": z2m_present,
|
|
||||||
"running": z2m_running
|
|
||||||
}
|
|
||||||
|
|
||||||
# Write heartbeat
|
|
||||||
with open(HEARTBEAT_FILE, "w") as f:
|
|
||||||
f.write(get_timestamp())
|
|
||||||
|
|
||||||
# Write status summary
|
|
||||||
with open(STATUS_FILE, "w") as f:
|
|
||||||
json.dump(status, f, indent=2)
|
|
||||||
|
|
||||||
# Redis publishing
|
|
||||||
if REDIS_ENABLED and redis_client:
|
|
||||||
try:
|
|
||||||
# Node state
|
|
||||||
node_health = "healthy"
|
|
||||||
for check in status["checks"].values():
|
|
||||||
if isinstance(check, dict) and check.get("status") == "error":
|
|
||||||
node_health = "unhealthy"
|
|
||||||
|
|
||||||
redis_client.hset(f"homelab:nodes:{NODE_NAME}", {
|
|
||||||
"id": NODE_NAME,
|
|
||||||
"hostname": NODE_NAME,
|
|
||||||
"health": node_health,
|
|
||||||
"status": "online",
|
|
||||||
"last_seen": status["timestamp"],
|
|
||||||
"capabilities": json.dumps(["docker", "tailscale", "mqtt", "disk"]),
|
|
||||||
"checks": json.dumps(status["checks"])
|
|
||||||
})
|
|
||||||
|
|
||||||
# Services discovered from Docker
|
|
||||||
if status["checks"]["docker"]["status"] == "ok":
|
|
||||||
for c in status["checks"]["docker"]["containers"]:
|
|
||||||
service_name = c["name"]
|
|
||||||
service_health = "healthy" if c["state"] == "running" else "unhealthy"
|
|
||||||
|
|
||||||
redis_client.hset(f"homelab:services:{NODE_NAME}:{service_name}", {
|
|
||||||
"name": service_name,
|
|
||||||
"node": NODE_NAME,
|
|
||||||
"health": service_health,
|
|
||||||
"desired_state": "running",
|
|
||||||
"actual_state": c["state"],
|
|
||||||
"deployment_state": "deployed",
|
|
||||||
"updated_at": status["timestamp"],
|
|
||||||
"dependencies": json.dumps([]),
|
|
||||||
"recommendations": json.dumps([])
|
|
||||||
})
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Failed to publish to Redis: {e}")
|
|
||||||
# Local event for Redis error
|
|
||||||
emit_event("redis_publish_error", "warning", f"Failed to publish to Redis: {e}", details={"error": str(e)})
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error in main loop: {e}")
|
|
||||||
emit_event("agent_error", "error", f"Internal agent error: {e}", details={"error": str(e)})
|
|
||||||
|
|
||||||
time.sleep(CHECK_INTERVAL)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
Loading…
Reference in a new issue