Finish repo-first implementation of Agent System UI pipeline

Co-authored-by: Junie <junie@jetbrains.com>
This commit is contained in:
oskar 2026-05-16 19:36:43 +02:00
parent 41c05f42b5
commit 12a775c834
5 changed files with 103 additions and 19 deletions

View file

@ -0,0 +1,37 @@
### Agent System
Central runtime materializer and Operator Control Plane UI.
#### Components
- **Redis**: Central state store (on PIHA).
- **Runtime Materializer**: Converts Redis state to JSON files in `/opt/homelab/world`.
- **Web UI**: Exposes API endpoints and serves the Operator UI.
#### Deployment (on PIHA)
```bash
cd services/agent-system
./deploy.sh
```
#### Deployment (on CHELSTY)
```bash
cd services/stability-agent
docker compose up -d --build
```
#### Verification
The `deploy.sh` script automatically verifies the local endpoints.
You can also manually check:
```bash
# Check runtime summary
curl http://localhost:18180/summary
# Check discovered nodes
curl http://localhost:18180/nodes
# Check discovered services
curl http://localhost:18180/services
```
#### Directory Structure
- `/opt/homelab/world`: Contains materialized JSON state.
- `/opt/homelab/state`: Contains operator configuration and local heartbeats.

22
services/agent-system/deploy.sh Executable file
View file

@ -0,0 +1,22 @@
#!/bin/bash
# Deploy the Agent System stack with Docker Compose and smoke-test its HTTP API.
#
# Steps: validate the compose file, build and start the services, list the
# matching containers, then probe each API endpoint on localhost:18180.
# Exits non-zero if any step fails or if any endpoint probe fails.
set -e

BASE_URL="http://localhost:18180"

echo ">>> Validating docker-compose configuration..."
# --quiet validates only; it avoids dumping the fully resolved configuration
# (including interpolated environment values) into the deployment log.
docker compose config --quiet

echo ">>> Building and starting Agent System services..."
docker compose up -d --build

echo ">>> Services status:"
docker ps --filter "name=agent-system"

echo ">>> Verifying API endpoints..."
sleep 5 # Give it a moment to start
endpoints=("summary" "nodes" "services")
failed=0
for ep in "${endpoints[@]}"; do
    echo "Checking /$ep..."
    # Use an explicit if/else so a failed probe is recorded instead of being
    # swallowed by an `&& ... || ...` chain; --max-time keeps a hung service
    # from blocking the deployment indefinitely.
    if curl -s -f --max-time 10 "$BASE_URL/$ep" > /dev/null; then
        echo " OK"
    else
        echo " FAILED"
        failed=1
    fi
done

# Report every endpoint first, then fail the script so callers and CI can
# detect a broken deployment from the exit status.
if [ "$failed" -ne 0 ]; then
    echo ">>> Deployment finished, but one or more endpoints failed verification." >&2
    exit 1
fi
echo ">>> Deployment complete."

View file

@ -30,6 +30,17 @@ def safe_json_loads(data, default=None):
except (json.JSONDecodeError, TypeError):
return data
def normalize_health(health):
    """Normalize a raw health value to one of the three states used by the UI.

    Args:
        health: Raw health value. Anything falsy (``None``, ``""``, ...) is
            treated as "no problem reported".

    Returns:
        ``"nominal"``, ``"degraded"`` or ``"error"``. Matching is
        case-insensitive and ignores surrounding whitespace; any value that
        is not recognised maps to ``"error"``.
    """
    if not health:
        return "nominal"
    # Strip before comparing so padded values (e.g. "ok\n") are not
    # misclassified as errors.
    label = str(health).strip().lower()
    if label in ("healthy", "ok", "running", "nominal"):
        return "nominal"
    if label in ("degraded", "warning"):
        return "degraded"
    # Unknown states are surfaced as errors rather than hidden as nominal.
    return "error"
def materialize():
"""Reads state from Redis and writes JSON files to the world directory."""
print(f"[{datetime.now().isoformat()}] Materializing world state...")
@ -42,6 +53,9 @@ def materialize():
for key in node_keys:
node_data = r.hgetall(key)
if node_data:
# Normalize health
if "health" in node_data:
node_data["health"] = normalize_health(node_data["health"])
# Parse JSON fields if they exist
if "capabilities" in node_data:
node_data["capabilities"] = safe_json_loads(node_data["capabilities"], [])
@ -55,6 +69,9 @@ def materialize():
for key in service_keys:
svc_data = r.hgetall(key)
if svc_data:
# Normalize health
if "health" in svc_data:
svc_data["health"] = normalize_health(svc_data["health"])
if "dependencies" in svc_data:
svc_data["dependencies"] = safe_json_loads(svc_data["dependencies"], [])
if "recommendations" in svc_data:
@ -82,6 +99,9 @@ def materialize():
for key in incident_keys:
incident_data = r.hgetall(key)
if incident_data:
# Normalize health if present
if "health" in incident_data:
incident_data["health"] = normalize_health(incident_data["health"])
incidents.append(incident_data)
# 5. Deployments (Hash)
@ -101,13 +121,26 @@ def materialize():
recommendations.append(rec_data)
# 7. Runtime Summary
unhealthy_services = [s for s in services if s.get("health") != "nominal"]
active_incidents = [i for i in incidents if i.get("status") not in ["resolved", "closed"]]
status = "nominal"
if len(active_incidents) > 0 or len(unhealthy_services) > 5:
status = "error"
elif len(unhealthy_services) > 0:
status = "degraded"
summary = {
"status": status,
"timestamp": datetime.utcnow().isoformat() + "Z",
"last_update": int(time.time()),
"node_count": len(nodes),
"service_count": len(services),
"unhealthy_services_count": len([s for s in services if s.get("health") != "healthy"]),
"active_incidents_count": len(active_incidents),
"unhealthy_services_count": len(unhealthy_services),
"incident_count": len(incidents),
"recent_events_count": len(events)
"recent_events_count": len(events),
"stale": False
}
# Ensure directory exists

View file

@ -47,45 +47,37 @@ def save_config(config):
def current_nodes():
return read_json_file(STATE_DIR / "nodes.json")
return read_json_file(WORLD_DIR / "nodes.json")
def current_services():
return read_json_file(STATE_DIR / "services.json")
return read_json_file(WORLD_DIR / "services.json")
def current_deployments():
return read_json_file(STATE_DIR / "deployments.json")
return read_json_file(WORLD_DIR / "deployments.json")
def current_incidents():
return read_json_file(STATE_DIR / "incidents.json")
return read_json_file(WORLD_DIR / "incidents.json")
def current_recommendations():
return read_json_file(STATE_DIR / "recommendations.json")
return read_json_file(WORLD_DIR / "recommendations.json")
def current_summary():
summary = read_json_file(STATE_DIR / "runtime-summary.json", default={})
summary = read_json_file(WORLD_DIR / "runtime-summary.json", default={})
if summary:
# Check for staleness
mtime = os.path.getmtime(STATE_DIR / "runtime-summary.json")
mtime = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
summary["last_update"] = mtime
summary["stale"] = (time.time() - mtime) > 60 # Stale if older than 60s
return summary
def current_events():
events = []
if EVENTS_DIR.exists():
for f in EVENTS_DIR.glob("*.json"):
data = read_json_file(f)
if data:
# Add source file for traceability
data["_source"] = f.name
events.append(data)
return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True)
return read_json_file(WORLD_DIR / "events.json", default=[])
def current_actions():

View file

@ -293,7 +293,7 @@ def main():
redis_client.hset(f"homelab:nodes:{NODE_NAME}", {
"id": NODE_NAME,
"hostname": socket.gethostname(),
"hostname": NODE_NAME,
"health": node_health,
"status": "online",
"last_seen": status["timestamp"],