diff --git a/services/agent-system/README.md b/services/agent-system/README.md
new file mode 100644
index 0000000..c811dd6
--- /dev/null
+++ b/services/agent-system/README.md
@@ -0,0 +1,37 @@
+### Agent System
+Central runtime materializer and Operator Control Plane UI.
+
+#### Components
+- **Redis**: Central state store (on PIHA).
+- **Runtime Materializer**: Converts Redis state to JSON files in `/opt/homelab/world`.
+- **Web UI**: Exposes API endpoints and serves the Operator UI.
+
+#### Deployment (on PIHA)
+```bash
+cd services/agent-system
+./deploy.sh
+```
+
+#### Deployment (on CHELSTY)
+```bash
+cd services/stability-agent
+docker compose up -d --build
+```
+
+#### Verification
+The `deploy.sh` script automatically verifies the local endpoints and exits non-zero if any check fails.
+You can also manually check:
+```bash
+# Check runtime summary
+curl http://localhost:18180/summary
+
+# Check discovered nodes
+curl http://localhost:18180/nodes
+
+# Check discovered services
+curl http://localhost:18180/services
+```
+
+#### Directory Structure
+- `/opt/homelab/world`: Contains materialized JSON state.
+- `/opt/homelab/state`: Contains operator configuration and local heartbeats.
diff --git a/services/agent-system/deploy.sh b/services/agent-system/deploy.sh
new file mode 100755
index 0000000..3d6c015
--- /dev/null
+++ b/services/agent-system/deploy.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+
+echo ">>> Validating docker-compose configuration..."
+# --quiet validates without echoing the fully resolved config into the log.
+docker compose config --quiet
+
+echo ">>> Building and starting Agent System services..."
+docker compose up -d --build
+
+echo ">>> Services status:"
+docker ps --filter "name=agent-system"
+
+echo ">>> Verifying API endpoints..."
+sleep 5  # Give it a moment to start
+
+# Track failures explicitly: `cmd && ok || fail` never trips `set -e`, so
+# without this the script would exit 0 even when an endpoint is down.
+failed=0
+endpoints=("summary" "nodes" "services")
+for ep in "${endpoints[@]}"; do
+    echo "Checking /$ep..."
+    if curl -s -f "http://localhost:18180/$ep" > /dev/null; then
+        echo " OK"
+    else
+        echo " FAILED"
+        failed=1
+    fi
+done
+
+if [ "$failed" -ne 0 ]; then
+    echo ">>> Deployment verification failed." >&2
+    exit 1
+fi
+
+echo ">>> Deployment complete."
diff --git a/services/agent-system/runtime-materializer/materializer.py b/services/agent-system/runtime-materializer/materializer.py
index 7dcd42f..cbf22ea 100644
--- a/services/agent-system/runtime-materializer/materializer.py
+++ b/services/agent-system/runtime-materializer/materializer.py
@@ -30,6 +30,25 @@ def safe_json_loads(data, default=None):
     except (json.JSONDecodeError, TypeError):
         return data
 
+def normalize_health(health):
+    """Map a raw health value onto the vocabulary used by the UI.
+
+    Args:
+        health: Raw health value from a Redis hash; may be empty or None.
+
+    Returns:
+        "nominal" for empty or known-good values, "degraded" for
+        warning-level values, and "error" for anything unrecognised.
+    """
+    # Ignore case and stray whitespace so values such as "Healthy " are
+    # not misreported as errors.
+    h = str(health).strip().lower() if health else ""
+    if not h or h in ("healthy", "ok", "running", "nominal"):
+        return "nominal"
+    if h in ("degraded", "warning"):
+        return "degraded"
+    return "error"
+
 def materialize():
     """Reads state from Redis and writes JSON files to the world directory."""
     print(f"[{datetime.now().isoformat()}] Materializing world state...")
@@ -42,6 +61,9 @@ def materialize():
     for key in node_keys:
         node_data = r.hgetall(key)
         if node_data:
+            # Normalize health
+            if "health" in node_data:
+                node_data["health"] = normalize_health(node_data["health"])
             # Parse JSON fields if they exist
             if "capabilities" in node_data:
                 node_data["capabilities"] = safe_json_loads(node_data["capabilities"], [])
@@ -55,6 +77,9 @@ def materialize():
     for key in service_keys:
         svc_data = r.hgetall(key)
         if svc_data:
+            # Normalize health
+            if "health" in svc_data:
+                svc_data["health"] = normalize_health(svc_data["health"])
             if "dependencies" in svc_data:
                 svc_data["dependencies"] = safe_json_loads(svc_data["dependencies"], [])
             if "recommendations" in svc_data:
@@ -82,6 +107,9 @@ def materialize():
     for key in incident_keys:
         incident_data = r.hgetall(key)
         if incident_data:
+            # Normalize health if present
+            if "health" in incident_data:
+                incident_data["health"] = normalize_health(incident_data["health"])
             incidents.append(incident_data)
 
     # 5. Deployments (Hash)
@@ -101,13 +129,29 @@ def materialize():
             recommendations.append(rec_data)
 
     # 7. Runtime Summary
+    unhealthy_services = [s for s in services if s.get("health") != "nominal"]
+    active_incidents = [i for i in incidents if i.get("status") not in ("resolved", "closed")]
+
+    # Any open incident, or more than five unhealthy services, is an error;
+    # a smaller number of unhealthy services only degrades the runtime.
+    if active_incidents or len(unhealthy_services) > 5:
+        status = "error"
+    elif unhealthy_services:
+        status = "degraded"
+    else:
+        status = "nominal"
+
     summary = {
+        "status": status,
         "timestamp": datetime.utcnow().isoformat() + "Z",
+        "last_update": int(time.time()),
         "node_count": len(nodes),
         "service_count": len(services),
-        "unhealthy_services_count": len([s for s in services if s.get("health") != "healthy"]),
+        "active_incidents_count": len(active_incidents),
+        "unhealthy_services_count": len(unhealthy_services),
         "incident_count": len(incidents),
-        "recent_events_count": len(events)
+        "recent_events_count": len(events),
+        "stale": False
     }
 
     # Ensure directory exists
diff --git a/services/agent-system/webui/web.py b/services/agent-system/webui/web.py
index 4332474..49fd021 100644
--- a/services/agent-system/webui/web.py
+++ b/services/agent-system/webui/web.py
@@ -47,45 +47,37 @@ def save_config(config):
 
 
 def current_nodes():
-    return read_json_file(STATE_DIR / "nodes.json")
+    return read_json_file(WORLD_DIR / "nodes.json")
 
 
 def current_services():
-    return read_json_file(STATE_DIR / "services.json")
+    return read_json_file(WORLD_DIR / "services.json")
 
 
 def current_deployments():
-    return read_json_file(STATE_DIR / "deployments.json")
+    return read_json_file(WORLD_DIR / "deployments.json")
 
 
 def current_incidents():
-    return read_json_file(STATE_DIR / "incidents.json")
+    return read_json_file(WORLD_DIR / "incidents.json")
 
 
 def current_recommendations():
-    return read_json_file(STATE_DIR / "recommendations.json")
+    return read_json_file(WORLD_DIR / "recommendations.json")
 
 
 def current_summary():
-    summary = read_json_file(STATE_DIR / "runtime-summary.json", default={})
+    summary = read_json_file(WORLD_DIR / "runtime-summary.json", default={})
     if summary:
         # Check for staleness
-        mtime = os.path.getmtime(STATE_DIR / "runtime-summary.json")
+        mtime = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
         summary["last_update"] = mtime
         summary["stale"] = (time.time() - mtime) > 60  # Stale if older than 60s
     return summary
 
 
 def current_events():
-    events = []
-    if EVENTS_DIR.exists():
-        for f in EVENTS_DIR.glob("*.json"):
-            data = read_json_file(f)
-            if data:
-                # Add source file for traceability
-                data["_source"] = f.name
-                events.append(data)
-    return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True)
+    return read_json_file(WORLD_DIR / "events.json", default=[])
 
 
 def current_actions():
diff --git a/services/stability-agent/src/stability_agent.py b/services/stability-agent/src/stability_agent.py
index c775fc7..12baef8 100644
--- a/services/stability-agent/src/stability_agent.py
+++ b/services/stability-agent/src/stability_agent.py
@@ -293,7 +293,7 @@ def main():
 
         redis_client.hset(f"homelab:nodes:{NODE_NAME}", {
             "id": NODE_NAME,
-            "hostname": socket.gethostname(),
+            "hostname": NODE_NAME,
             "health": node_health,
             "status": "online",
             "last_seen": status["timestamp"],