Finish repo-first implementation of Agent System UI pipeline
Co-authored-by: Junie <junie@jetbrains.com>
This commit is contained in:
parent
41c05f42b5
commit
12a775c834
37
services/agent-system/README.md
Normal file
37
services/agent-system/README.md
Normal file
|
|
@@ -0,0 +1,37 @@
|
||||||
|
### Agent System
|
||||||
|
Central runtime materializer and Operator Control Plane UI.
|
||||||
|
|
||||||
|
#### Components
|
||||||
|
- **Redis**: Central state store (on PIHA).
|
||||||
|
- **Runtime Materializer**: Converts Redis state to JSON files in `/opt/homelab/world`.
|
||||||
|
- **Web UI**: Exposes API endpoints and serves the Operator UI.
|
||||||
|
|
||||||
|
#### Deployment (on PIHA)
|
||||||
|
```bash
|
||||||
|
cd services/agent-system
|
||||||
|
./deploy.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Deployment (on CHELSTY)
|
||||||
|
```bash
|
||||||
|
cd services/stability-agent
|
||||||
|
docker compose up -d --build
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Verification
|
||||||
|
The `deploy.sh` script automatically checks the local endpoints after starting the services and prints `OK` or `FAILED` for each one.
|
||||||
|
You can also manually check:
|
||||||
|
```bash
|
||||||
|
# Check runtime summary
|
||||||
|
curl http://localhost:18180/summary
|
||||||
|
|
||||||
|
# Check discovered nodes
|
||||||
|
curl http://localhost:18180/nodes
|
||||||
|
|
||||||
|
# Check discovered services
|
||||||
|
curl http://localhost:18180/services
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Directory Structure
|
||||||
|
- `/opt/homelab/world`: Contains materialized JSON state.
|
||||||
|
- `/opt/homelab/state`: Contains operator configuration and local heartbeats.
|
||||||
22
services/agent-system/deploy.sh
Executable file
22
services/agent-system/deploy.sh
Executable file
|
|
@@ -0,0 +1,22 @@
|
||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
echo ">>> Validating docker-compose configuration..."
|
||||||
|
docker compose config
|
||||||
|
|
||||||
|
echo ">>> Building and starting Agent System services..."
|
||||||
|
docker compose up -d --build
|
||||||
|
|
||||||
|
echo ">>> Services status:"
|
||||||
|
docker ps --filter "name=agent-system"
|
||||||
|
|
||||||
|
echo ">>> Verifying API endpoints..."
|
||||||
|
sleep 5 # Give it a moment to start
|
||||||
|
|
||||||
|
endpoints=("summary" "nodes" "services")
|
||||||
|
for ep in "${endpoints[@]}"; do
|
||||||
|
echo "Checking /$ep..."
|
||||||
|
curl -s -f http://localhost:18180/$ep > /dev/null && echo " OK" || echo " FAILED"
|
||||||
|
done
|
||||||
|
|
||||||
|
echo ">>> Deployment complete."
|
||||||
|
|
@@ -30,6 +30,17 @@ def safe_json_loads(data, default=None):
|
||||||
except (json.JSONDecodeError, TypeError):
|
except (json.JSONDecodeError, TypeError):
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def normalize_health(health):
|
||||||
|
"""Normalizes health values for the UI."""
|
||||||
|
if not health:
|
||||||
|
return "nominal"
|
||||||
|
h = str(health).lower()
|
||||||
|
if h in ["healthy", "ok", "running", "nominal"]:
|
||||||
|
return "nominal"
|
||||||
|
if h in ["degraded", "warning"]:
|
||||||
|
return "degraded"
|
||||||
|
return "error"
|
||||||
|
|
||||||
def materialize():
|
def materialize():
|
||||||
"""Reads state from Redis and writes JSON files to the world directory."""
|
"""Reads state from Redis and writes JSON files to the world directory."""
|
||||||
print(f"[{datetime.now().isoformat()}] Materializing world state...")
|
print(f"[{datetime.now().isoformat()}] Materializing world state...")
|
||||||
|
|
@@ -42,6 +53,9 @@ def materialize():
|
||||||
for key in node_keys:
|
for key in node_keys:
|
||||||
node_data = r.hgetall(key)
|
node_data = r.hgetall(key)
|
||||||
if node_data:
|
if node_data:
|
||||||
|
# Normalize health
|
||||||
|
if "health" in node_data:
|
||||||
|
node_data["health"] = normalize_health(node_data["health"])
|
||||||
# Parse JSON fields if they exist
|
# Parse JSON fields if they exist
|
||||||
if "capabilities" in node_data:
|
if "capabilities" in node_data:
|
||||||
node_data["capabilities"] = safe_json_loads(node_data["capabilities"], [])
|
node_data["capabilities"] = safe_json_loads(node_data["capabilities"], [])
|
||||||
|
|
@@ -55,6 +69,9 @@ def materialize():
|
||||||
for key in service_keys:
|
for key in service_keys:
|
||||||
svc_data = r.hgetall(key)
|
svc_data = r.hgetall(key)
|
||||||
if svc_data:
|
if svc_data:
|
||||||
|
# Normalize health
|
||||||
|
if "health" in svc_data:
|
||||||
|
svc_data["health"] = normalize_health(svc_data["health"])
|
||||||
if "dependencies" in svc_data:
|
if "dependencies" in svc_data:
|
||||||
svc_data["dependencies"] = safe_json_loads(svc_data["dependencies"], [])
|
svc_data["dependencies"] = safe_json_loads(svc_data["dependencies"], [])
|
||||||
if "recommendations" in svc_data:
|
if "recommendations" in svc_data:
|
||||||
|
|
@@ -82,6 +99,9 @@ def materialize():
|
||||||
for key in incident_keys:
|
for key in incident_keys:
|
||||||
incident_data = r.hgetall(key)
|
incident_data = r.hgetall(key)
|
||||||
if incident_data:
|
if incident_data:
|
||||||
|
# Normalize health if present
|
||||||
|
if "health" in incident_data:
|
||||||
|
incident_data["health"] = normalize_health(incident_data["health"])
|
||||||
incidents.append(incident_data)
|
incidents.append(incident_data)
|
||||||
|
|
||||||
# 5. Deployments (Hash)
|
# 5. Deployments (Hash)
|
||||||
|
|
@@ -101,13 +121,26 @@ def materialize():
|
||||||
recommendations.append(rec_data)
|
recommendations.append(rec_data)
|
||||||
|
|
||||||
# 7. Runtime Summary
|
# 7. Runtime Summary
|
||||||
|
unhealthy_services = [s for s in services if s.get("health") != "nominal"]
|
||||||
|
active_incidents = [i for i in incidents if i.get("status") not in ["resolved", "closed"]]
|
||||||
|
|
||||||
|
status = "nominal"
|
||||||
|
if len(active_incidents) > 0 or len(unhealthy_services) > 5:
|
||||||
|
status = "error"
|
||||||
|
elif len(unhealthy_services) > 0:
|
||||||
|
status = "degraded"
|
||||||
|
|
||||||
summary = {
|
summary = {
|
||||||
|
"status": status,
|
||||||
"timestamp": datetime.utcnow().isoformat() + "Z",
|
"timestamp": datetime.utcnow().isoformat() + "Z",
|
||||||
|
"last_update": int(time.time()),
|
||||||
"node_count": len(nodes),
|
"node_count": len(nodes),
|
||||||
"service_count": len(services),
|
"service_count": len(services),
|
||||||
"unhealthy_services_count": len([s for s in services if s.get("health") != "healthy"]),
|
"active_incidents_count": len(active_incidents),
|
||||||
|
"unhealthy_services_count": len(unhealthy_services),
|
||||||
"incident_count": len(incidents),
|
"incident_count": len(incidents),
|
||||||
"recent_events_count": len(events)
|
"recent_events_count": len(events),
|
||||||
|
"stale": False
|
||||||
}
|
}
|
||||||
|
|
||||||
# Ensure directory exists
|
# Ensure directory exists
|
||||||
|
|
|
||||||
|
|
@@ -47,45 +47,37 @@ def save_config(config):
|
||||||
|
|
||||||
|
|
||||||
def current_nodes():
|
def current_nodes():
|
||||||
return read_json_file(STATE_DIR / "nodes.json")
|
return read_json_file(WORLD_DIR / "nodes.json")
|
||||||
|
|
||||||
|
|
||||||
def current_services():
|
def current_services():
|
||||||
return read_json_file(STATE_DIR / "services.json")
|
return read_json_file(WORLD_DIR / "services.json")
|
||||||
|
|
||||||
|
|
||||||
def current_deployments():
|
def current_deployments():
|
||||||
return read_json_file(STATE_DIR / "deployments.json")
|
return read_json_file(WORLD_DIR / "deployments.json")
|
||||||
|
|
||||||
|
|
||||||
def current_incidents():
|
def current_incidents():
|
||||||
return read_json_file(STATE_DIR / "incidents.json")
|
return read_json_file(WORLD_DIR / "incidents.json")
|
||||||
|
|
||||||
|
|
||||||
def current_recommendations():
|
def current_recommendations():
|
||||||
return read_json_file(STATE_DIR / "recommendations.json")
|
return read_json_file(WORLD_DIR / "recommendations.json")
|
||||||
|
|
||||||
|
|
||||||
def current_summary():
|
def current_summary():
|
||||||
summary = read_json_file(STATE_DIR / "runtime-summary.json", default={})
|
summary = read_json_file(WORLD_DIR / "runtime-summary.json", default={})
|
||||||
if summary:
|
if summary:
|
||||||
# Check for staleness
|
# Check for staleness
|
||||||
mtime = os.path.getmtime(STATE_DIR / "runtime-summary.json")
|
mtime = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
|
||||||
summary["last_update"] = mtime
|
summary["last_update"] = mtime
|
||||||
summary["stale"] = (time.time() - mtime) > 60 # Stale if older than 60s
|
summary["stale"] = (time.time() - mtime) > 60 # Stale if older than 60s
|
||||||
return summary
|
return summary
|
||||||
|
|
||||||
|
|
||||||
def current_events():
|
def current_events():
|
||||||
events = []
|
return read_json_file(WORLD_DIR / "events.json", default=[])
|
||||||
if EVENTS_DIR.exists():
|
|
||||||
for f in EVENTS_DIR.glob("*.json"):
|
|
||||||
data = read_json_file(f)
|
|
||||||
if data:
|
|
||||||
# Add source file for traceability
|
|
||||||
data["_source"] = f.name
|
|
||||||
events.append(data)
|
|
||||||
return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True)
|
|
||||||
|
|
||||||
|
|
||||||
def current_actions():
|
def current_actions():
|
||||||
|
|
|
||||||
|
|
@@ -293,7 +293,7 @@ def main():
|
||||||
|
|
||||||
redis_client.hset(f"homelab:nodes:{NODE_NAME}", {
|
redis_client.hset(f"homelab:nodes:{NODE_NAME}", {
|
||||||
"id": NODE_NAME,
|
"id": NODE_NAME,
|
||||||
"hostname": socket.gethostname(),
|
"hostname": NODE_NAME,
|
||||||
"health": node_health,
|
"health": node_health,
|
||||||
"status": "online",
|
"status": "online",
|
||||||
"last_seen": status["timestamp"],
|
"last_seen": status["timestamp"],
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue