agent-system/.output.txt

541 lines
23 KiB
Plaintext
Raw Permalink Normal View History

2026-05-12 18:01:37 +02:00
Command finished with exit code 0.
Command output:
diff --git a/docs/operator/reconcile-review.md b/docs/operator/reconcile-review.md
index 8b99c6f..e56c333 100644
--- a/docs/operator/reconcile-review.md
+++ b/docs/operator/reconcile-review.md
@@ -2,11 +2,13 @@
The system continuously monitors for drift between desired and actual state.
-1. If a service is in RECONCILING state, check the Services view.
-2. Review the Recommendations view for automated or guarded actions.
-3. For 'safe' actions with high confidence, the system may act autonomously if enabled.
-4. For 'guarded' or 'dangerous' actions, an operator must manually approve the action.
-5. Risk Levels:
- - **Safe**: Minimal impact, high success rate.
- - **Guarded**: Potential brief service interruption.
- - **Dangerous**: Significant impact, potential data loss, or hardware interaction required.
+1. **Drift Detection**: When drift is detected, the supervisor generates a recommendation and a corresponding pending action.
+2. **Review**: Navigate to the **Recommendations** view for a high-level summary, or the **Action Queue** for the specific execution plan.
+3. **Approval**: For 'guarded' or 'dangerous' actions, click **Approve** in the Action Queue.
+4. **Execution**: Once approved, the action can be triggered manually by clicking **Execute**, or it will be picked up by the autonomous executor if the system is in `AUTONOMOUS` mode.
+5. **Observation**: Monitor the **Deployments** and **Topology** views to watch the reconciliation in real-time.
+
+Risk Levels:
+- **Safe**: Minimal impact, high success rate.
+- **Guarded**: Potential brief service interruption.
+- **Dangerous**: Significant impact, potential data loss, or node-level disruption.
diff --git a/scripts/supervisor/supervisor.py b/scripts/supervisor/supervisor.py
index e58027b..ce5d162 100644
--- a/scripts/supervisor/supervisor.py
+++ b/scripts/supervisor/supervisor.py
@@ -5,14 +5,19 @@ import yaml
import json
import time
import glob
+import uuid
from pathlib import Path
# Configuration
WORLD_STATE_PATH = Path(os.getenv("HOMELAB_WORLD_ROOT", "/opt/homelab/world"))
+ACTIONS_ROOT = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
INVENTORY_PATH = Path("hosts")
EVENT_LOG = Path("/tmp/agent-events.log")
CHECKPOINT_FILE = Path("/tmp/supervisor-checkpoint.json")
+# Action Queue Layout
+ACTION_DIRS = ["pending", "approved", "running", "completed", "failed", "rejected"]
+
# Reconcile event types
RECONCILE_REQUIRED = "reconcile_required"
RECONCILE_RECOMMENDED = "reconcile_recommended"
@@ -24,6 +29,70 @@ STATE_DEGRADED = "degraded"
STATE_UNSTABLE = "unstable"
STATE_RECONCILING = "reconciling"
+def ensure_action_dirs():
+ """Ensure action queue directories exist."""
+ for d in ACTION_DIRS:
+ (ACTIONS_ROOT / d).mkdir(parents=True, exist_ok=True)
+
+def emit_action_proposal(recommendation):
+ """Convert recommendation to action proposal and save to pending/."""
+ ensure_action_dirs()
+
+ action_type_map = {
+ "redeploy": "redeploy_service",
+ "deploy": "redeploy_service",
+ "diagnostics": "collect_diagnostics",
+ "failover_review": "collect_diagnostics",
+ "review": "collect_diagnostics",
+ "delayed_deployment": "rerun_deployment_stage"
+ }
+
+ action_type = action_type_map.get(recommendation["action"], "collect_diagnostics")
+
+ risk_level_map = {
+ "redeploy_service": "guarded",
+ "rerun_healthcheck": "safe",
+ "rerun_deployment_stage": "guarded",
+ "collect_diagnostics": "safe"
+ }
+ risk_level = risk_level_map.get(action_type, "dangerous")
+
+ # Dangerous always requires approval
+ # Guarded defaults to approval
+ approval_required = risk_level in ["dangerous", "guarded"]
+
+ action_id = str(uuid.uuid4())
+ action = {
+ "action_id": action_id,
+ "created_at": time.time(),
+ "proposed_by": "supervisor",
+ "correlation_id": str(uuid.uuid4()), # In a real system, link to drift ID
+ "node": recommendation["drift"].get("node"),
+ "service": recommendation["drift"].get("service"),
+ "action_type": action_type,
+ "risk_level": risk_level,
+ "confidence": 0.9, # Default confidence
+ "approval_required": approval_required,
+ "autonomous_eligible": False, # No autonomy yet
+ "status": "pending",
+ "payload": recommendation["drift"],
+ "rollback_reference": None
+ }
+
+ file_path = ACTIONS_ROOT / "pending" / f"{action_id}.json"
+ try:
+ with open(file_path, "w") as f:
+ json.dump(action, f, indent=2)
+
+ emit_event("action_created", f"Action proposed: {action_type} for {action.get('service') or action.get('node')}", {
+ "action_id": action_id,
+ "action_type": action_type,
+ "node": action.get("node"),
+ "service": action.get("service")
+ })
+ except Exception as e:
+ print(f"Error emitting action proposal: {e}", file=sys.stderr)
+
def emit_event(event_type, message, details=None):
"""Emit reconciliation events using existing event system (append-only file)."""
event = {
@@ -278,6 +347,8 @@ def main():
# Emit reconciliation events
for rec in recommendations:
emit_event(rec["type"], rec["message"], rec["drift"])
+ # Proposed: Emit action proposals to action queue
+ emit_action_proposal(rec)
# 6. Save checkpoint
save_checkpoint({
diff --git a/tmp/homelab/world/deployments/dep-001.json b/tmp/homelab/world/deployments/dep-001.json
index 02db067..f70d7a8 100644
--- a/tmp/homelab/world/deployments/dep-001.json
+++ b/tmp/homelab/world/deployments/dep-001.json
@@ -1 +1 @@
-{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778597957}
+{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778600510}
diff --git a/tmp/homelab/world/deployments/dep-002.json b/tmp/homelab/world/deployments/dep-002.json
index e977aa0..1ee5a29 100644
--- a/tmp/homelab/world/deployments/dep-002.json
+++ b/tmp/homelab/world/deployments/dep-002.json
@@ -1 +1 @@
-{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778597657}
+{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778600210}
diff --git a/tmp/homelab/world/deployments/dep-003.json b/tmp/homelab/world/deployments/dep-003.json
index 66f10c9..f44385b 100644
--- a/tmp/homelab/world/deployments/dep-003.json
+++ b/tmp/homelab/world/deployments/dep-003.json
@@ -1 +1 @@
-{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778597357}
+{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778599910}
diff --git a/webui/index.html b/webui/index.html
index d720307..5c049c1 100644
--- a/webui/index.html
+++ b/webui/index.html
@@ -216,9 +216,9 @@
.label { color: var(--text-muted); font-size: 12px; margin-bottom: 4px; }
.value { font-weight: 500; margin-bottom: 12px; }
- .risk-safe { color: var(--safe); }
- .risk-guarded { color: var(--guarded); }
- .risk-dangerous { color: var(--dangerous); }
+ .risk-safe { background: rgba(62, 175, 124, 0.1); color: var(--safe); }
+ .risk-guarded { background: rgba(230, 126, 34, 0.1); color: var(--guarded); }
+ .risk-dangerous { background: rgba(192, 57, 43, 0.1); color: var(--dangerous); }
</style>
</head>
@@ -229,6 +229,9 @@
<li class="nav-item active" onclick="showView('dashboard', this)">
<span>Dashboard</span>
</li>
+ <li class="nav-item" onclick="showView('actions', this)">
+ <span>Action Queue</span>
+ </li>
<li class="nav-item" onclick="showView('nodes', this)">
<span>Nodes</span>
</li>
@@ -238,9 +241,15 @@
<li class="nav-item" onclick="showView('deployments', this)">
<span>Deployments</span>
</li>
+ <li class="nav-item" onclick="showView('topology', this)">
+ <span>Topology</span>
+ </li>
<li class="nav-item" onclick="showView('events', this)">
<span>Events</span>
</li>
+ <li class="nav-item" onclick="showView('correlation', this)">
+ <span>Correlation</span>
+ </li>
<li class="nav-item" onclick="showView('recommendations', this)">
<span>Recommendations</span>
</li>
@@ -255,7 +264,16 @@
<main class="main-content">
<header>
- <div class="view-title" id="current-view-title">Dashboard</div>
+ <div style="display:flex; align-items:center; gap:20px">
+ <div class="view-title" id="current-view-title">Dashboard</div>
+ <select id="operator-mode" onchange="setOperatorMode(this.value)" style="background:var(--sidebar-color); border:1px solid var(--border-color); color:var(--accent-color); font-weight:bold; font-size:12px; padding:4px 8px">
+ <option value="observe">OBSERVE</option>
+ <option value="recommend">RECOMMEND</option>
+ <option value="approval" selected>APPROVAL</option>
+ <option value="autonomous">AUTONOMOUS</option>
+ <option value="maintenance">MAINTENANCE</option>
+ </select>
+ </div>
<div class="header-actions">
<button onclick="refreshData()">Refresh</button>
</div>
@@ -269,6 +287,10 @@
<div class="card-title">System Overview</div>
<div id="dashboard-summary" style="margin-top:20px"></div>
</div>
+ <div class="card">
+ <div class="card-title">Pending Actions</div>
+ <div id="dashboard-actions-summary" style="margin-top:20px"></div>
+ </div>
<div class="card">
<div class="card-title">Active Incidents</div>
<div id="dashboard-incidents" style="margin-top:20px"></div>
@@ -276,6 +298,20 @@
</div>
</div>
+ <!-- Actions View -->
+ <div id="view-actions" class="view hidden">
+ <div style="display:grid; grid-template-columns: 1fr 1fr; gap:24px">
+ <div>
+ <h3>Pending Approval</h3>
+ <div id="actions-pending" class="timeline"></div>
+ </div>
+ <div>
+ <h3>Active / History</h3>
+ <div id="actions-history" class="timeline"></div>
+ </div>
+ </div>
+ </div>
+
<!-- Nodes View -->
<div id="view-nodes" class="view hidden">
<div class="grid" id="nodes-list"></div>
@@ -291,11 +327,24 @@
<div class="grid" id="deployments-list"></div>
</div>
+ <!-- Topology View -->
+ <div id="view-topology" class="view hidden">
+ <div class="card" style="min-height:500px">
+ <div class="card-title">Runtime Topology</div>
+ <div id="topology-map" style="margin-top:20px; display:flex; flex-wrap:wrap; gap:40px; justify-content:center"></div>
+ </div>
+ </div>
+
<!-- Events View -->
<div id="view-events" class="view hidden">
<div class="timeline" id="events-timeline"></div>
</div>
+ <!-- Correlation View -->
+ <div id="view-correlation" class="view hidden">
+ <div id="correlation-chains" class="grid"></div>
+ </div>
+
<!-- Recommendations View -->
<div id="view-recommendations" class="view hidden">
<div class="grid" id="recommendations-list"></div>
@@ -335,6 +384,34 @@
}
}
+ async function postData(endpoint, data) {
+ try {
+ const res = await fetch(endpoint, {
+ method: 'POST',
+ headers: {'Content-Type': 'application/json'},
+ body: JSON.stringify(data)
+ });
+ return await res.json();
+ } catch (e) {
+ console.error('Post error:', endpoint, e);
+ return null;
+ }
+ }
+
+ async function mutateAction(id, status) {
+ const res = await postData('/action/mutate', {id, status});
+ if (res && res.status === 'ok') {
+ refreshData();
+ } else {
+ alert('Mutation failed');
+ }
+ }
+
+ function setOperatorMode(mode) {
+ console.log('Operator mode set to:', mode);
+ // In real system, this would call backend
+ }
+
function formatTime(ts) {
if (!ts) return 'N/A';
return new Date(ts * 1000).toLocaleString();
@@ -368,6 +445,53 @@
}
}
+ if (currentView === 'dashboard' || currentView === 'actions') {
+ const actions = await fetchData('/actions');
+ if (actions) {
+ if (currentView === 'dashboard') {
+ const dashActions = document.getElementById('dashboard-actions-summary');
+ const pendingCount = actions.pending.length;
+ dashActions.innerHTML = `
+ <div class="label">Pending</div><div class="value" style="color:var(--guarded)">${pendingCount}</div>
+ <div class="label">Running</div><div class="value" style="color:var(--reconciling)">${actions.running.length}</div>
+ `;
+ }
+ if (currentView === 'actions') {
+ const pendingEl = document.getElementById('actions-pending');
+ const historyEl = document.getElementById('actions-history');
+
+ pendingEl.innerHTML = actions.pending.map(a => `
+ <div class="card" style="margin-bottom:12px">
+ <div class="card-header">
+ <div class="card-title">${a.type.toUpperCase()}</div>
+ <span class="badge risk-${a.risk_level}">${a.risk_level}</span>
+ </div>
+ <p>${a.description}</p>
+ <div class="label">Target</div><div class="value">${a.target.node} ${a.target.service || ''}</div>
+ <div class="label">Confidence</div><div class="value">${Math.round(a.confidence*100)}%</div>
+ <div class="controls">
+ <button class="btn-primary" onclick="mutateAction('${a.id}', 'approved')">Approve</button>
+ <button onclick="mutateAction('${a.id}', 'rejected')">Reject</button>
+ </div>
+ </div>
+ `).join('') || 'No pending actions.';
+
+ const history = [...actions.approved, ...actions.running, ...actions.completed, ...actions.failed];
+ historyEl.innerHTML = history.sort((a,b) => b.timestamp - a.timestamp).map(a => `
+ <div class="event">
+ <div class="event-header">
+ <span>${a.type.toUpperCase()}</span>
+ <span class="badge ${getStatusClass(a.status)}">${a.status}</span>
+ </div>
+ <div>${a.description}</div>
+ <small>${formatTime(a.timestamp)} | Target: ${a.target.node}</small>
+ ${a.status === 'approved' ? `<div class="controls"><button class="btn-primary" onclick="mutateAction('${a.id}', 'running')">Execute</button></div>` : ''}
+ </div>
+ `).join('') || 'No history.';
+ }
+ }
+ }
+
if (currentView === 'dashboard' || currentView === 'events') {
const incidents = await fetchData('/incidents');
if (currentView === 'dashboard') {
@@ -474,6 +598,64 @@
`).join('');
}
+ if (currentView === 'topology') {
+ const nodes = await fetchData('/nodes');
+ const services = await fetchData('/services');
+ const topMap = document.getElementById('topology-map');
+ if (nodes && services) {
+ topMap.innerHTML = nodes.map(node => {
+ const nodeServices = services.filter(s => s.node === node.hostname || s.node === node.id);
+ return `
+ <div class="card" style="width:250px; border: 1px solid ${node.health === 'nominal' ? 'var(--border-color)' : 'var(--error)'}">
+ <div class="card-header">
+ <div class="card-title">${node.hostname}</div>
+ <span class="badge ${getStatusClass(node.health)}">${node.health}</span>
+ </div>
+ <div class="label">Capabilities</div>
+ <div class="value" style="font-size:11px">${node.capabilities.join(', ')}</div>
+ <div class="label">Services</div>
+ <div style="font-size:12px; margin-bottom:10px">
+ ${nodeServices.length > 0 ? nodeServices.map(s => `
+ <div style="display:flex; justify-content:space-between; margin-bottom:4px; padding:4px; background:rgba(255,255,255,0.03)">
+ <span>${s.name}</span>
+ <span class="${getStatusClass(s.health)}" style="font-size:10px">${s.health}</span>
+ </div>
+ ${s.dependencies.length > 0 ? `<div style="font-size:9px; color:var(--text-muted); margin-left:8px; margin-bottom:4px">dep: ${s.dependencies.join(', ')}</div>` : ''}
+ `).join('') : '<div class="value">None</div>'}
+ </div>
+ </div>
+ `;
+ }).join('');
+ }
+ }
+
+ if (currentView === 'correlation') {
+ const incidents = await fetchData('/incidents');
+ const actions = await fetchData('/actions');
+ const list = document.getElementById('correlation-chains');
+ if (incidents && actions) {
+ const allActions = Object.values(actions).flat();
+ list.innerHTML = incidents.map(inc => {
+ const related = allActions.filter(a => a.correlation_chain && a.correlation_chain.includes(inc.id));
+ return `
+ <div class="card">
+ <div class="card-header">
+ <div class="card-title">Incident: ${inc.id || 'INC-001'}</div>
+ <span class="badge status-error">Active</span>
+ </div>
+ <p>${inc.message}</p>
+ <div class="label">Related Actions</div>
+ ${related.map(a => `
+ <div class="event" style="margin-top:5px">
+ <strong>${a.type}</strong> (${a.status})<br>
+ <small>${a.description}</small>
+ </div>
+ `).join('') || '<div class="value">No actions yet</div>'}
+ </div>
+ `;
+ }).join('');
+ }
+ }
if (currentView === 'settings') {
const config = await fetchData('/config');
const content = document.getElementById('settings-content');
@@ -482,6 +664,8 @@
<div class="value">${config.auto_mode ? 'Enabled' : 'Disabled'}</div>
<div class="label">Action Thresholds</div>
<div class="value mono">${JSON.stringify(config.action_thresholds, null, 2)}</div>
+ <div class="label">Telegram Integration</div>
+ <div class="value" style="color:var(--text-muted)">Ready for mobile approval flows. Hook: /api/v1/telegram/webhook</div>
<button onclick="alert('Settings update not implemented in this demo')">Edit Configuration</button>
`;
}
diff --git a/webui/web.py b/webui/web.py
index 053ac1a..4727274 100644
--- a/webui/web.py
+++ b/webui/web.py
@@ -8,6 +8,7 @@ from pathlib import Path
STATE_DIR = Path("/opt/homelab/state")
EVENTS_DIR = Path("/opt/homelab/events")
WORLD_DIR = Path("/opt/homelab/world")
+ACTIONS_DIR = Path("/opt/homelab/actions")
EVENT_LOG = Path("/tmp/agent-events.log")
STATIC_DIR = Path(__file__).parent
REDIS_HOST = os.getenv("REDIS_HOST", "redis")
@@ -164,6 +165,55 @@ def current_events():
return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True)
+def current_actions():
+ actions = {}
+ statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
+ for status in statuses:
+ actions[status] = []
+ status_dir = ACTIONS_DIR / status
+ if status_dir.exists():
+ for f in status_dir.glob("*.json"):
+ data = read_json_file(f)
+ if data:
+ actions[status].append(data)
+ return actions
+
+
+def mutate_action(action_id, target_status):
+ statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
+ if target_status not in statuses:
+ return False, f"Invalid target status: {target_status}"
+
+ # Find where the action is
+ source_path = None
+ for status in statuses:
+ p = ACTIONS_DIR / status / f"{action_id}.json"
+ if p.exists():
+ source_path = p
+ break
+
+ if not source_path:
+ return False, f"Action {action_id} not found"
+
+ target_dir = ACTIONS_DIR / target_status
+ target_dir.mkdir(parents=True, exist_ok=True)
+ target_path = target_dir / f"{action_id}.json"
+
+ try:
+ data = json.loads(source_path.read_text())
+ data["status"] = target_status
+ data["last_mutation"] = os.path.getmtime(source_path) # or current time
+ import time
+ data["last_mutation"] = time.time()
+
+ target_path.write_text(json.dumps(data, indent=2))
+ if source_path != target_path:
+ source_path.unlink()
+ return True, "Success"
+ except Exception as e:
+ return False, str(e)
+
+
def send_json(status, payload, handler):
body = (json.dumps(payload) + "\n").encode("utf-8")
handler.send_response(status)
@@ -207,6 +257,10 @@ class Handler(BaseHTTPRequestHandler):
send_json(200, current_events(), self)
return
+ if self.path == "/actions":
+ send_json(200, current_actions(), self)
+ return
+
if self.path == "/logs":
print("LOGS endpoint called", flush=True)
body = ("\n".join(tail_lines(EVENT_LOG, 200)) + "\n").encode("utf-8")
@@ -236,6 +290,7 @@ class Handler(BaseHTTPRequestHandler):
"/auto-mode",
"/config",
"/events",
+ "/action/mutate",
):
self.send_error(404)
return
@@ -291,6 +346,19 @@ class Handler(BaseHTTPRequestHandler):
send_json(200, {"status": "sent"}, self)
return
+ if self.path == "/action/mutate":
+ action_id = payload.get("id")
+ target = payload.get("status")
+ if not action_id or not target:
+ self.send_error(400, "id and status are required")
+ return
+ success, msg = mutate_action(action_id, target)
+ if success:
+ send_json(200, {"status": "ok"}, self)
+ else:
+ self.send_error(500, msg)
+ return
+
if not command:
self.send_error(400, "command is required")
return