Resolve merge conflicts

2026-05-12 18:01:37 +02:00 · 2026-05-12 18:01:37 +02:00 · 50d03b9f96
parent 5ff4ddc2c9
commit 50d03b9f96
29 changed files with 1615 additions and 15 deletions
--- a/.output.txt
+++ b/.output.txt
@ -0,0 +1,541 @@
 Command finished with exit code 0.
 Command output:
 diff --git a/docs/operator/reconcile-review.md b/docs/operator/reconcile-review.md
 index 8b99c6f..e56c333 100644
 --- a/docs/operator/reconcile-review.md
 +++ b/docs/operator/reconcile-review.md
@@ -2,11 +2,13 @@
 The system continuously monitors for drift between desired and actual state.
 -1. If a service is in RECONCILING state, check the Services view.
 -2. Review the Recommendations view for automated or guarded actions.
 -3. For 'safe' actions with high confidence, the system may act autonomously if enabled.
 -4. For 'guarded' or 'dangerous' actions, an operator must manually approve the action.
 -5. Risk Levels:
 -   - **Safe**: Minimal impact, high success rate.
 -   - **Guarded**: Potential brief service interruption.
 -   - **Dangerous**: Significant impact, potential data loss, or hardware interaction required.
 +1. **Drift Detection**: When drift is detected, the supervisor generates a recommendation and a corresponding pending action.
 +2. **Review**: Navigate to the **Recommendations** view for a high-level summary, or the **Action Queue** for the specific execution plan.
 +3. **Approval**: For 'guarded' or 'dangerous' actions, click **Approve** in the Action Queue.
 +4. **Execution**: Once approved, the action can be triggered manually by clicking **Execute**, or it will be picked up by the autonomous executor if the system is in `AUTONOMOUS` mode.
 +5. **Observation**: Monitor the **Deployments** and **Topology** views to watch the reconciliation in real-time.
 +
 +Risk Levels:
 +- **Safe**: Minimal impact, high success rate.
 +- **Guarded**: Potential brief service interruption.
 +- **Dangerous**: Significant impact, potential data loss, or node-level disruption.
 diff --git a/scripts/supervisor/supervisor.py b/scripts/supervisor/supervisor.py
 index e58027b..ce5d162 100644
 --- a/scripts/supervisor/supervisor.py
 +++ b/scripts/supervisor/supervisor.py
@@ -5,14 +5,19 @@ import yaml
 import json
 import time
 import glob
 +import uuid
 from pathlib import Path
 # Configuration
 WORLD_STATE_PATH = Path(os.getenv("HOMELAB_WORLD_ROOT", "/opt/homelab/world"))
 +ACTIONS_ROOT = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
 INVENTORY_PATH = Path("hosts")
 EVENT_LOG = Path("/tmp/agent-events.log")
 CHECKPOINT_FILE = Path("/tmp/supervisor-checkpoint.json")
 +# Action Queue Layout
 +ACTION_DIRS = ["pending", "approved", "running", "completed", "failed", "rejected"]
 +
 # Reconcile event types
 RECONCILE_REQUIRED = "reconcile_required"
 RECONCILE_RECOMMENDED = "reconcile_recommended"
@@ -24,6 +29,70 @@ STATE_DEGRADED = "degraded"
 STATE_UNSTABLE = "unstable"
 STATE_RECONCILING = "reconciling"
 +def ensure_action_dirs():
 +    """Ensure action queue directories exist."""
 +    for d in ACTION_DIRS:
 +        (ACTIONS_ROOT / d).mkdir(parents=True, exist_ok=True)
 +
 +def emit_action_proposal(recommendation):
 +    """Convert recommendation to action proposal and save to pending/."""
 +    ensure_action_dirs()
 +    
 +    action_type_map = {
 +        "redeploy": "redeploy_service",
 +        "deploy": "redeploy_service",
 +        "diagnostics": "collect_diagnostics",
 +        "failover_review": "collect_diagnostics",
 +        "review": "collect_diagnostics",
 +        "delayed_deployment": "rerun_deployment_stage"
 +    }
 +    
 +    action_type = action_type_map.get(recommendation["action"], "collect_diagnostics")
 +    
 +    risk_level_map = {
 +        "redeploy_service": "guarded",
 +        "rerun_healthcheck": "safe",
 +        "rerun_deployment_stage": "guarded",
 +        "collect_diagnostics": "safe"
 +    }
 +    risk_level = risk_level_map.get(action_type, "dangerous")
 +    
 +    # Dangerous always requires approval
 +    # Guarded defaults to approval
 +    approval_required = risk_level in ["dangerous", "guarded"]
 +    
 +    action_id = str(uuid.uuid4())
 +    action = {
 +        "action_id": action_id,
 +        "created_at": time.time(),
 +        "proposed_by": "supervisor",
 +        "correlation_id": str(uuid.uuid4()), # In a real system, link to drift ID
 +        "node": recommendation["drift"].get("node"),
 +        "service": recommendation["drift"].get("service"),
 +        "action_type": action_type,
 +        "risk_level": risk_level,
 +        "confidence": 0.9, # Default confidence
 +        "approval_required": approval_required,
 +        "autonomous_eligible": False, # No autonomy yet
 +        "status": "pending",
 +        "payload": recommendation["drift"],
 +        "rollback_reference": None
 +    }
 +    
 +    file_path = ACTIONS_ROOT / "pending" / f"{action_id}.json"
 +    try:
 +        with open(file_path, "w") as f:
 +            json.dump(action, f, indent=2)
 +        
 +        emit_event("action_created", f"Action proposed: {action_type} for {action.get('service') or action.get('node')}", {
 +            "action_id": action_id,
 +            "action_type": action_type,
 +            "node": action.get("node"),
 +            "service": action.get("service")
 +        })
 +    except Exception as e:
 +        print(f"Error emitting action proposal: {e}", file=sys.stderr)
 +
 def emit_event(event_type, message, details=None):
     """Emit reconciliation events using existing event system (append-only file)."""
     event = {
@@ -278,6 +347,8 @@ def main():
         # Emit reconciliation events
         for rec in recommendations:
             emit_event(rec["type"], rec["message"], rec["drift"])
 +            # Proposed: Emit action proposals to action queue
 +            emit_action_proposal(rec)
     # 6. Save checkpoint
     save_checkpoint({
 diff --git a/tmp/homelab/world/deployments/dep-001.json b/tmp/homelab/world/deployments/dep-001.json
 index 02db067..f70d7a8 100644
 --- a/tmp/homelab/world/deployments/dep-001.json
 +++ b/tmp/homelab/world/deployments/dep-001.json
@@ -1 +1 @@
 -{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778597957}
 +{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778600510}
 diff --git a/tmp/homelab/world/deployments/dep-002.json b/tmp/homelab/world/deployments/dep-002.json
 index e977aa0..1ee5a29 100644
 --- a/tmp/homelab/world/deployments/dep-002.json
 +++ b/tmp/homelab/world/deployments/dep-002.json
@@ -1 +1 @@
 -{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778597657}
 +{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778600210}
 diff --git a/tmp/homelab/world/deployments/dep-003.json b/tmp/homelab/world/deployments/dep-003.json
 index 66f10c9..f44385b 100644
 --- a/tmp/homelab/world/deployments/dep-003.json
 +++ b/tmp/homelab/world/deployments/dep-003.json
@@ -1 +1 @@
 -{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778597357}
 +{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778599910}
 diff --git a/webui/index.html b/webui/index.html
 index d720307..5c049c1 100644
 --- a/webui/index.html
 +++ b/webui/index.html
@@ -216,9 +216,9 @@
     .label { color: var(--text-muted); font-size: 12px; margin-bottom: 4px; }
     .value { font-weight: 500; margin-bottom: 12px; }
 -    .risk-safe { color: var(--safe); }
 -    .risk-guarded { color: var(--guarded); }
 -    .risk-dangerous { color: var(--dangerous); }
 +    .risk-safe { background: rgba(62, 175, 124, 0.1); color: var(--safe); }
 +    .risk-guarded { background: rgba(230, 126, 34, 0.1); color: var(--guarded); }
 +    .risk-dangerous { background: rgba(192, 57, 43, 0.1); color: var(--dangerous); }
   </style>
 </head>
@@ -229,6 +229,9 @@
       <li class="nav-item active" onclick="showView('dashboard', this)">
         <span>Dashboard</span>
       </li>
 +      <li class="nav-item" onclick="showView('actions', this)">
 +        <span>Action Queue</span>
 +      </li>
       <li class="nav-item" onclick="showView('nodes', this)">
         <span>Nodes</span>
       </li>
@@ -238,9 +241,15 @@
       <li class="nav-item" onclick="showView('deployments', this)">
         <span>Deployments</span>
       </li>
 +      <li class="nav-item" onclick="showView('topology', this)">
 +        <span>Topology</span>
 +      </li>
       <li class="nav-item" onclick="showView('events', this)">
         <span>Events</span>
       </li>
 +      <li class="nav-item" onclick="showView('correlation', this)">
 +        <span>Correlation</span>
 +      </li>
       <li class="nav-item" onclick="showView('recommendations', this)">
         <span>Recommendations</span>
       </li>
@@ -255,7 +264,16 @@
   <main class="main-content">
     <header>
 -      <div class="view-title" id="current-view-title">Dashboard</div>
 +      <div style="display:flex; align-items:center; gap:20px">
 +        <div class="view-title" id="current-view-title">Dashboard</div>
 +        <select id="operator-mode" onchange="setOperatorMode(this.value)" style="background:var(--sidebar-color); border:1px solid var(--border-color); color:var(--accent-color); font-weight:bold; font-size:12px; padding:4px 8px">
 +          <option value="observe">OBSERVE</option>
 +          <option value="recommend">RECOMMEND</option>
 +          <option value="approval" selected>APPROVAL</option>
 +          <option value="autonomous">AUTONOMOUS</option>
 +          <option value="maintenance">MAINTENANCE</option>
 +        </select>
 +      </div>
       <div class="header-actions">
         <button onclick="refreshData()">Refresh</button>
       </div>
@@ -269,6 +287,10 @@
             <div class="card-title">System Overview</div>
             <div id="dashboard-summary" style="margin-top:20px"></div>
           </div>
 +          <div class="card">
 +            <div class="card-title">Pending Actions</div>
 +            <div id="dashboard-actions-summary" style="margin-top:20px"></div>
 +          </div>
           <div class="card">
             <div class="card-title">Active Incidents</div>
             <div id="dashboard-incidents" style="margin-top:20px"></div>
@@ -276,6 +298,20 @@
         </div>
       </div>
 +      <!-- Actions View -->
 +      <div id="view-actions" class="view hidden">
 +        <div style="display:grid; grid-template-columns: 1fr 1fr; gap:24px">
 +          <div>
 +            <h3>Pending Approval</h3>
 +            <div id="actions-pending" class="timeline"></div>
 +          </div>
 +          <div>
 +            <h3>Active / History</h3>
 +            <div id="actions-history" class="timeline"></div>
 +          </div>
 +        </div>
 +      </div>
 +
       <!-- Nodes View -->
       <div id="view-nodes" class="view hidden">
         <div class="grid" id="nodes-list"></div>
@@ -291,11 +327,24 @@
         <div class="grid" id="deployments-list"></div>
       </div>
 +      <!-- Topology View -->
 +      <div id="view-topology" class="view hidden">
 +        <div class="card" style="min-height:500px">
 +          <div class="card-title">Runtime Topology</div>
 +          <div id="topology-map" style="margin-top:20px; display:flex; flex-wrap:wrap; gap:40px; justify-content:center"></div>
 +        </div>
 +      </div>
 +
       <!-- Events View -->
       <div id="view-events" class="view hidden">
         <div class="timeline" id="events-timeline"></div>
       </div>
 +      <!-- Correlation View -->
 +      <div id="view-correlation" class="view hidden">
 +        <div id="correlation-chains" class="grid"></div>
 +      </div>
 +
       <!-- Recommendations View -->
       <div id="view-recommendations" class="view hidden">
         <div class="grid" id="recommendations-list"></div>
@@ -335,6 +384,34 @@
       }
     }
 +    async function postData(endpoint, data) {
 +      try {
 +        const res = await fetch(endpoint, {
 +          method: 'POST',
 +          headers: {'Content-Type': 'application/json'},
 +          body: JSON.stringify(data)
 +        });
 +        return await res.json();
 +      } catch (e) {
 +        console.error('Post error:', endpoint, e);
 +        return null;
 +      }
 +    }
 +
 +    async function mutateAction(id, status) {
 +      const res = await postData('/action/mutate', {id, status});
 +      if (res && res.status === 'ok') {
 +        refreshData();
 +      } else {
 +        alert('Mutation failed');
 +      }
 +    }
 +
 +    function setOperatorMode(mode) {
 +      console.log('Operator mode set to:', mode);
 +      // In real system, this would call backend
 +    }
 +
     function formatTime(ts) {
       if (!ts) return 'N/A';
       return new Date(ts * 1000).toLocaleString();
@@ -368,6 +445,53 @@
         }
       }
 +      if (currentView === 'dashboard' || currentView === 'actions') {
 +          const actions = await fetchData('/actions');
 +          if (actions) {
 +              if (currentView === 'dashboard') {
 +                  const dashActions = document.getElementById('dashboard-actions-summary');
 +                  const pendingCount = actions.pending.length;
 +                  dashActions.innerHTML = `
 +                    <div class="label">Pending</div><div class="value" style="color:var(--guarded)">${pendingCount}</div>
 +                    <div class="label">Running</div><div class="value" style="color:var(--reconciling)">${actions.running.length}</div>
 +                  `;
 +              }
 +              if (currentView === 'actions') {
 +                  const pendingEl = document.getElementById('actions-pending');
 +                  const historyEl = document.getElementById('actions-history');
 +                  
 +                  pendingEl.innerHTML = actions.pending.map(a => `
 +                    <div class="card" style="margin-bottom:12px">
 +                        <div class="card-header">
 +                            <div class="card-title">${a.type.toUpperCase()}</div>
 +                            <span class="badge risk-${a.risk_level}">${a.risk_level}</span>
 +                        </div>
 +                        <p>${a.description}</p>
 +                        <div class="label">Target</div><div class="value">${a.target.node} ${a.target.service || ''}</div>
 +                        <div class="label">Confidence</div><div class="value">${Math.round(a.confidence*100)}%</div>
 +                        <div class="controls">
 +                            <button class="btn-primary" onclick="mutateAction('${a.id}', 'approved')">Approve</button>
 +                            <button onclick="mutateAction('${a.id}', 'rejected')">Reject</button>
 +                        </div>
 +                    </div>
 +                  `).join('') || 'No pending actions.';
 +
 +                  const history = [...actions.approved, ...actions.running, ...actions.completed, ...actions.failed];
 +                  historyEl.innerHTML = history.sort((a,b) => b.timestamp - a.timestamp).map(a => `
 +                    <div class="event">
 +                        <div class="event-header">
 +                            <span>${a.type.toUpperCase()}</span>
 +                            <span class="badge ${getStatusClass(a.status)}">${a.status}</span>
 +                        </div>
 +                        <div>${a.description}</div>
 +                        <small>${formatTime(a.timestamp)} | Target: ${a.target.node}</small>
 +                        ${a.status === 'approved' ? `<div class="controls"><button class="btn-primary" onclick="mutateAction('${a.id}', 'running')">Execute</button></div>` : ''}
 +                    </div>
 +                  `).join('') || 'No history.';
 +              }
 +          }
 +      }
 +
       if (currentView === 'dashboard' || currentView === 'events') {
           const incidents = await fetchData('/incidents');
           if (currentView === 'dashboard') {
@@ -474,6 +598,64 @@
         `).join('');
       }
 +      if (currentView === 'topology') {
 +          const nodes = await fetchData('/nodes');
 +          const services = await fetchData('/services');
 +          const topMap = document.getElementById('topology-map');
 +          if (nodes && services) {
 +              topMap.innerHTML = nodes.map(node => {
 +                  const nodeServices = services.filter(s => s.node === node.hostname || s.node === node.id);
 +                  return `
 +                    <div class="card" style="width:250px; border: 1px solid ${node.health === 'nominal' ? 'var(--border-color)' : 'var(--error)'}">
 +                        <div class="card-header">
 +                            <div class="card-title">${node.hostname}</div>
 +                            <span class="badge ${getStatusClass(node.health)}">${node.health}</span>
 +                        </div>
 +                        <div class="label">Capabilities</div>
 +                        <div class="value" style="font-size:11px">${node.capabilities.join(', ')}</div>
 +                        <div class="label">Services</div>
 +                        <div style="font-size:12px; margin-bottom:10px">
 +                            ${nodeServices.length > 0 ? nodeServices.map(s => `
 +                                <div style="display:flex; justify-content:space-between; margin-bottom:4px; padding:4px; background:rgba(255,255,255,0.03)">
 +                                    <span>${s.name}</span>
 +                                    <span class="${getStatusClass(s.health)}" style="font-size:10px">${s.health}</span>
 +                                </div>
 +                                ${s.dependencies.length > 0 ? `<div style="font-size:9px; color:var(--text-muted); margin-left:8px; margin-bottom:4px">dep: ${s.dependencies.join(', ')}</div>` : ''}
 +                            `).join('') : '<div class="value">None</div>'}
 +                        </div>
 +                    </div>
 +                  `;
 +              }).join('');
 +          }
 +      }
 +
 +      if (currentView === 'correlation') {
 +          const incidents = await fetchData('/incidents');
 +          const actions = await fetchData('/actions');
 +          const list = document.getElementById('correlation-chains');
 +          if (incidents && actions) {
 +              const allActions = Object.values(actions).flat();
 +              list.innerHTML = incidents.map(inc => {
 +                  const related = allActions.filter(a => a.correlation_chain && a.correlation_chain.includes(inc.id));
 +                  return `
 +                    <div class="card">
 +                        <div class="card-header">
 +                            <div class="card-title">Incident: ${inc.id || 'INC-001'}</div>
 +                            <span class="badge status-error">Active</span>
 +                        </div>
 +                        <p>${inc.message}</p>
 +                        <div class="label">Related Actions</div>
 +                        ${related.map(a => `
 +                            <div class="event" style="margin-top:5px">
 +                                <strong>${a.type}</strong> (${a.status})<br>
 +                                <small>${a.description}</small>
 +                            </div>
 +                        `).join('') || '<div class="value">No actions yet</div>'}
 +                    </div>
 +                  `;
 +              }).join('');
 +          }
 +      }
       if (currentView === 'settings') {
           const config = await fetchData('/config');
           const content = document.getElementById('settings-content');
@@ -482,6 +664,8 @@
               <div class="value">${config.auto_mode ? 'Enabled' : 'Disabled'}</div>
               <div class="label">Action Thresholds</div>
               <div class="value mono">${JSON.stringify(config.action_thresholds, null, 2)}</div>
 +              <div class="label">Telegram Integration</div>
 +              <div class="value" style="color:var(--text-muted)">Ready for mobile approval flows. Hook: /api/v1/telegram/webhook</div>
               <button onclick="alert('Settings update not implemented in this demo')">Edit Configuration</button>
           `;
       }
 diff --git a/webui/web.py b/webui/web.py
 index 053ac1a..4727274 100644
 --- a/webui/web.py
 +++ b/webui/web.py
@@ -8,6 +8,7 @@ from pathlib import Path
 STATE_DIR = Path("/opt/homelab/state")
 EVENTS_DIR = Path("/opt/homelab/events")
 WORLD_DIR = Path("/opt/homelab/world")
 +ACTIONS_DIR = Path("/opt/homelab/actions")
 EVENT_LOG = Path("/tmp/agent-events.log")
 STATIC_DIR = Path(__file__).parent
 REDIS_HOST = os.getenv("REDIS_HOST", "redis")
@@ -164,6 +165,55 @@ def current_events():
     return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True)
 +def current_actions():
 +    actions = {}
 +    statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
 +    for status in statuses:
 +        actions[status] = []
 +        status_dir = ACTIONS_DIR / status
 +        if status_dir.exists():
 +            for f in status_dir.glob("*.json"):
 +                data = read_json_file(f)
 +                if data:
 +                    actions[status].append(data)
 +    return actions
 +
 +
 +def mutate_action(action_id, target_status):
 +    statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
 +    if target_status not in statuses:
 +        return False, f"Invalid target status: {target_status}"
 +
 +    # Find where the action is
 +    source_path = None
 +    for status in statuses:
 +        p = ACTIONS_DIR / status / f"{action_id}.json"
 +        if p.exists():
 +            source_path = p
 +            break
 +
 +    if not source_path:
 +        return False, f"Action {action_id} not found"
 +
 +    target_dir = ACTIONS_DIR / target_status
 +    target_dir.mkdir(parents=True, exist_ok=True)
 +    target_path = target_dir / f"{action_id}.json"
 +
 +    try:
 +        data = json.loads(source_path.read_text())
 +        data["status"] = target_status
 +        data["last_mutation"] = os.path.getmtime(source_path) # or current time
 +        import time
 +        data["last_mutation"] = time.time()
 +        
 +        target_path.write_text(json.dumps(data, indent=2))
 +        if source_path != target_path:
 +            source_path.unlink()
 +        return True, "Success"
 +    except Exception as e:
 +        return False, str(e)
 +
 +
 def send_json(status, payload, handler):
     body = (json.dumps(payload) + "\n").encode("utf-8")
     handler.send_response(status)
@@ -207,6 +257,10 @@ class Handler(BaseHTTPRequestHandler):
             send_json(200, current_events(), self)
             return
 +        if self.path == "/actions":
 +            send_json(200, current_actions(), self)
 +            return
 +
         if self.path == "/logs":
             print("LOGS endpoint called", flush=True)
             body = ("\n".join(tail_lines(EVENT_LOG, 200)) + "\n").encode("utf-8")
@@ -236,6 +290,7 @@ class Handler(BaseHTTPRequestHandler):
             "/auto-mode",
             "/config",
             "/events",
 +            "/action/mutate",
         ):
             self.send_error(404)
             return
@@ -291,6 +346,19 @@ class Handler(BaseHTTPRequestHandler):
             send_json(200, {"status": "sent"}, self)
             return
 +        if self.path == "/action/mutate":
 +            action_id = payload.get("id")
 +            target = payload.get("status")
 +            if not action_id or not target:
 +                self.send_error(400, "id and status are required")
 +                return
 +            success, msg = mutate_action(action_id, target)
 +            if success:
 +                send_json(200, {"status": "ok"}, self)
 +            else:
 +                self.send_error(500, msg)
 +            return
 +
         if not command:
             self.send_error(400, "command is required")
             return
--- a/docs/action-queue-system.md
+++ b/docs/action-queue-system.md
@ -0,0 +1,75 @@
 # Action Queue System
 The Action Queue System provides a safe, filesystem-first lifecycle for operational actions in the homelab platform. It enables controlled execution with mandatory approval for high-risk operations.
 ## Action Lifecycle
 Actions move through various states, represented by directories under `/opt/homelab/actions/`:
 1.  **Pending** (`pending/`): Actions proposed by the Supervisor or other agents.
 2.  **Approved** (`approved/`): Actions that have been reviewed and approved for execution.
 3.  **Running** (`running/`): Actions currently being processed by the Executor.
 4.  **Completed** (`completed/`): Successfully executed actions.
 5.  **Failed** (`failed/`): Actions that encountered errors during execution.
 6.  **Rejected** (`rejected/`): Proposed actions that were explicitly denied.
 ## Action Schema
 Actions are stored as JSON documents with the following structure:
 ```json
 {
  "action_id": "uuid",
  "created_at": 1620000000.0,
  "proposed_by": "supervisor",
  "correlation_id": "uuid",
  "node": "node-name",
  "service": "service-name",
  "action_type": "redeploy_service",
  "risk_level": "guarded",
  "confidence": 0.9,
  "approval_required": true,
  "autonomous_eligible": false,
  "status": "pending",
  "payload": { ... },
  "rollback_reference": null
 }
 ```
 ## Safety Model
 Actions are categorized into safety classes:
 -   **Safe**: Low-risk actions that may be eligible for autonomous execution in the future (e.g., `collect_diagnostics`, `rerun_healthcheck`).
 -   **Guarded**: Actions that default to requiring approval but could be automated under strict conditions (e.g., `redeploy_service`, `rerun_deployment_stage`).
 -   **Dangerous**: High-risk actions that ALWAYS require manual approval.
 Currently, the platform operates in a **Recommendation-Only** mode where even `safe` actions require explicit approval.
 ## Initial Action Types
 -   `redeploy_service`: Restarts or redeploys a service container.
 -   `rerun_healthcheck`: Triggers an immediate health check.
 -   `rerun_deployment_stage`: Retries a specific stage of a failed deployment.
 -   `collect_diagnostics`: Gathers logs and metrics for troubleshooting.
 ## Executor
 The Executor (`scripts/executor/executor.py`) is responsible for processing approved actions. It features:
 -   **Process Approved Only**: Only actions in the `approved/` directory are processed.
 -   **Recommendation-Safe**: Simulation-based execution that logs intended mutations without side effects.
 -   **Idempotency**: Designed to be safe to run multiple times.
 -   **Resumable State**: If interrupted, it will pick up actions in the `running/` state.
 -   **Append-Only History**: Maintains a `history.log` of all action transitions.
 ## Rollback Concepts
 Every action schema includes a `rollback_reference`. In future iterations, this will point to the previous stable state or a reverse action that can be triggered if the current action fails or causes further instability.
 ## Future Autonomous Execution
 The system is designed to transition to autonomous execution by:
 1.  Identifying `safe` actions with high `confidence` scores.
 2.  Matching them against a `policy-engine`.
 3.  Automatically moving them from `pending/` to `approved/` based on allowed safety guardrails.
--- a/docs/operator/approval-workflow.md
+++ b/docs/operator/approval-workflow.md
@ -0,0 +1,27 @@
 # Operator Approval Workflow
 This document describes the process of reviewing and approving actions generated by the reconciliation supervisor.
 ## Workflow Stages
 ### 1. Action Identification
 When the supervisor identifies a delta between desired and actual state, it generates a pending action in `/opt/homelab/actions/pending/`.
 ### 2. Risk Assessment
 Actions are categorized by risk level:
 - **Safe**: Low impact, high confidence. Can be auto-approved in autonomous mode.
 - **Guarded**: Moderate impact. Requires explicit operator approval.
 - **Dangerous**: High impact (e.g., node redeploy). Requires multi-step approval or senior operator override.
 ### 3. Review Process
 1. Navigate to the **Action Queue** view.
 2. Review the **Confidence Score** and **Correlation Chain** to understand why the action was proposed.
 3. Check the **Rollback Availability**.
 ### 4. Decision
 - **Approve**: Moves action to `approved` state.
 - **Reject**: Moves action to `rejected` state and suppresses similar recommendations for a cooldown period.
 - **Execute**: Transitions an approved action to `running` status.
 ## Mobile Approvals
 Approval requests can be acknowledged via the Telegram bot integration, allowing for remote operational control.
--- a/docs/operator/incident-remediation.md
+++ b/docs/operator/incident-remediation.md
@ -0,0 +1,24 @@
 # Incident Remediation Guide
 Guide for operators responding to system incidents using the Control Plane.
 ## Remediation Flow
 ### 1. Detection
 Incidents appear in the **Active Incidents** card on the Dashboard and in the **Events** timeline.
 ### 2. Correlation
 Use the **Correlation** view to see:
 - The event chain leading to the incident.
 - Automated recommendations generated in response.
 - Any manual actions already taken.
 ### 3. Intervention
 1. Review the recommended actions in the **Action Queue**.
 2. If the automated recommendation is not sufficient, use the **Nodes** or **Services** view to manually trigger commands.
 3. Observe the **Runtime Topology** to ensure no cascading failures occur during remediation.
 ### 4. Verification
 Once actions are completed, verify the system state:
 - Health badges should transition back to **Nominal**.
 - The **System Status** in the sidebar should reflect a healthy state.
--- a/docs/operator/reconcile-review.md
+++ b/docs/operator/reconcile-review.md
@ -2,11 +2,13 @@
 The system continuously monitors for drift between desired and actual state.
-1. If a service is in RECONCILING state, check the Services view.
+1. **Drift Detection**: When drift is detected, the supervisor generates a recommendation and a corresponding pending action.
-2. Review the Recommendations view for automated or guarded actions.
+2. **Review**: Navigate to the **Recommendations** view for a high-level summary, or the **Action Queue** for the specific execution plan.
-3. For 'safe' actions with high confidence, the system may act autonomously if enabled.
+3. **Approval**: For 'guarded' or 'dangerous' actions, click **Approve** in the Action Queue.
-4. For 'guarded' or 'dangerous' actions, an operator must manually approve the action.
+4. **Execution**: Once approved, the action can be triggered manually by clicking **Execute**, or it will be picked up by the autonomous executor if the system is in `AUTONOMOUS` mode.
-5. Risk Levels:
+5. **Observation**: Monitor the **Deployments** and **Topology** views to watch the reconciliation in real-time.
-   - **Safe**: Minimal impact, high success rate.
+
-   - **Guarded**: Potential brief service interruption.
+Risk Levels:
-   - **Dangerous**: Significant impact, potential data loss, or hardware interaction required.
+- **Safe**: Minimal impact, high success rate.
 - **Guarded**: Potential brief service interruption.
 - **Dangerous**: Significant impact, potential data loss, or node-level disruption.
--- a/scripts/executor/executor.py
+++ b/scripts/executor/executor.py
@ -0,0 +1,225 @@
 #!/usr/bin/env python3
 import os
 import json
 import time
 import sys
 import shutil
 import uuid
 from pathlib import Path
 # Configuration
 ACTIONS_ROOT = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
 EVENT_LOG = Path("/tmp/agent-events.log")
 HISTORY_LOG = ACTIONS_ROOT / "history.log"
 def emit_event(event_type, message, details=None):
    """Emit action lifecycle events."""
    event = {
        "type": event_type,
        "message": message,
        "timestamp": time.time(),
        "details": details or {}
    }
    line = json.dumps(event)
    print(line)
    try:
        with open(EVENT_LOG, "a") as f:
            f.write(line + "\n")
            f.flush()
    except Exception as e:
        print(f"Error writing to event log: {e}", file=sys.stderr)
 def log_history(action_id, status, message):
    """Append-only execution history."""
    entry = {
        "timestamp": time.time(),
        "action_id": action_id,
        "status": status,
        "message": message
    }
    try:
        with open(HISTORY_LOG, "a") as f:
            f.write(json.dumps(entry) + "\n")
            f.flush()
    except Exception as e:
        print(f"Error writing history: {e}", file=sys.stderr)
 def ensure_dirs():
    for d in ["pending", "approved", "running", "completed", "failed", "rejected"]:
        (ACTIONS_ROOT / d).mkdir(parents=True, exist_ok=True)
 def approve_action(action_id):
    ensure_dirs()
    if not action_id.endswith(".json"):
        filename = f"{action_id}.json"
    else:
        filename = action_id
    pending_path = ACTIONS_ROOT / "pending" / filename
    if not pending_path.exists():
        print(f"Action {filename} not found in pending.")
        return False
    approved_path = ACTIONS_ROOT / "approved" / filename
    try:
        with open(pending_path, "r") as f:
            action = json.load(f)
        action["status"] = "approved"
        action["approved_at"] = time.time()
        with open(pending_path, "w") as f:
            json.dump(action, f, indent=2)
        shutil.move(pending_path, approved_path)
        emit_event("action_approved", f"Action approved: {action['action_id']}", {"action_id": action['action_id']})
        log_history(action['action_id'], "approved", "Manual approval received")
        print(f"Action {action['action_id']} approved.")
        return True
    except Exception as e:
        print(f"Error approving action: {e}")
        return False
 def reject_action(action_id):
    ensure_dirs()
    if not action_id.endswith(".json"):
        filename = f"{action_id}.json"
    else:
        filename = action_id
    pending_path = ACTIONS_ROOT / "pending" / filename
    if not pending_path.exists():
        print(f"Action {filename} not found in pending.")
        return False
    rejected_path = ACTIONS_ROOT / "rejected" / filename
    try:
        with open(pending_path, "r") as f:
            action = json.load(f)
        action["status"] = "rejected"
        action["rejected_at"] = time.time()
        with open(pending_path, "w") as f:
            json.dump(action, f, indent=2)
        shutil.move(pending_path, rejected_path)
        emit_event("action_rejected", f"Action rejected: {action['action_id']}", {"action_id": action['action_id']})
        log_history(action['action_id'], "rejected", "Manual rejection received")
        print(f"Action {action['action_id']} rejected.")
        return True
    except Exception as e:
        print(f"Error rejecting action: {e}")
        return False
 def process_action(action_path, dry_run=False):
    """Process a single approved action."""
    try:
        with open(action_path, "r") as f:
            action = json.load(f)
    except Exception as e:
        print(f"Error reading action {action_path}: {e}")
        return
    action_id = action["action_id"]
    action_type = action["action_type"]
    # Move to running (Resumable execution state)
    running_path = ACTIONS_ROOT / "running" / action_path.name
    shutil.move(action_path, running_path)
    action["status"] = "running"
    action["started_at"] = time.time()
    with open(running_path, "w") as f:
        json.dump(action, f, indent=2)
    emit_event("action_started", f"Started action {action_id} ({action_type})", {"action_id": action_id})
    log_history(action_id, "running", f"Execution started (dry_run={dry_run})")
    # Simulation logic (Recommendation-safe execution model)
    print(f"Executing {action_type} for {action.get('service') or action.get('node')}...")
    # Idempotent simulation: in a real world, we'd check if it's already done
    time.sleep(0.5) 
    success = True
    if dry_run:
        print(f"[DRY-RUN] Would execute {action_type} logic here.")
    else:
        # Initial action types implementation (Simulation)
        if action_type == "redeploy_service":
            print(f"DEBUG: Triggering container restart/redeploy for {action.get('service')}")
        elif action_type == "rerun_healthcheck":
            print(f"DEBUG: Running healthcheck for {action.get('service')}")
        elif action_type == "rerun_deployment_stage":
            print(f"DEBUG: Retrying deployment stage for {action.get('service')}")
        elif action_type == "collect_diagnostics":
            print(f"DEBUG: Collecting logs and metrics for {action.get('service') or action.get('node')}")
        else:
            print(f"DEBUG: Executing unknown action type: {action_type}")
    # Finalize
    if success:
        final_status = "completed"
        target_dir = ACTIONS_ROOT / "completed"
    else:
        final_status = "failed"
        target_dir = ACTIONS_ROOT / "failed"
    final_path = target_dir / action_path.name
    action["status"] = final_status
    action["finished_at"] = time.time()
    with open(running_path, "w") as f:
        json.dump(action, f, indent=2)
    shutil.move(running_path, final_path)
    emit_event(f"action_{final_status}", f"Action {action_id} {final_status}", {"action_id": action_id})
    log_history(action_id, final_status, "Execution finished")
 def run_executor(dry_run=False):
    ensure_dirs()
    print(f"--- Executor Run: {time.ctime()} (dry_run={dry_run}) ---")
    # 1. Resume running actions
    running_actions = list((ACTIONS_ROOT / "running").glob("*.json"))
    for action_file in running_actions:
        print(f"Resuming action: {action_file.name}")
        process_action(action_file, dry_run=dry_run)
    # 2. Process approved actions
    approved_actions = list((ACTIONS_ROOT / "approved").glob("*.json"))
    if not approved_actions:
        print("No approved actions found.")
    else:
        for action_file in approved_actions:
            process_action(action_file, dry_run=dry_run)
    print("Run complete.")
 if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Homelab Action Executor")
    parser.add_argument("command", choices=["run", "approve", "reject"], nargs="?", default="run")
    parser.add_argument("action_id", nargs="?")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    if args.command == "run":
        run_executor(dry_run=args.dry_run)
    elif args.command == "approve":
        if not args.action_id:
            print("Error: action_id required for approve")
            sys.exit(1)
        approve_action(args.action_id)
    elif args.command == "reject":
        if not args.action_id:
            print("Error: action_id required for reject")
            sys.exit(1)
        reject_action(args.action_id)
--- a/scripts/executor/test_actions.sh
+++ b/scripts/executor/test_actions.sh
@ -0,0 +1,74 @@
 #!/bin/bash
 # Validation script for Homelab Action Queue System
 set -e
 BASE_DIR=$(pwd)
 export HOMELAB_WORLD_ROOT="$BASE_DIR/tmp/homelab/world"
 export HOMELAB_ACTIONS_ROOT="$BASE_DIR/tmp/homelab/actions"
 EVENT_LOG="/tmp/agent-events.log"
 echo "=== Starting Action Queue Validation ==="
 # 1. Setup drift scenarios
 echo "Setting up drift scenarios..."
 bash scripts/supervisor/test_scenarios.sh
 # 2. Run supervisor to generate action proposals
 echo "Running supervisor..."
 python3 scripts/supervisor/supervisor.py
 # 3. Check for pending actions
 echo "Checking pending actions..."
 ls -l "$HOMELAB_ACTIONS_ROOT/pending/"
 # Get an action ID from pending
 ACTION_FILE=$(ls "$HOMELAB_ACTIONS_ROOT/pending/" | head -n 1)
 if [ -z "$ACTION_FILE" ]; then
    echo "Error: No pending actions found!"
    exit 1
 fi
 ACTION_ID="${ACTION_FILE%.json}"
 echo "Found action: $ACTION_ID"
 # 4. Approve the action
 echo "Approving action $ACTION_ID..."
 python3 scripts/executor/executor.py approve "$ACTION_ID"
 # 5. Run executor
 echo "Running executor..."
 python3 scripts/executor/executor.py run
 # 6. Verify completion
 if [ -f "$HOMELAB_ACTIONS_ROOT/completed/$ACTION_FILE" ]; then
    echo "SUCCESS: Action $ACTION_ID moved to completed."
 else
    echo "FAILURE: Action $ACTION_ID NOT found in completed."
    exit 1
 fi
 # 7. Test rejection
 echo "Testing rejection..."
 NEXT_ACTION_FILE=$(ls "$HOMELAB_ACTIONS_ROOT/pending/" | head -n 1)
 if [ -n "$NEXT_ACTION_FILE" ]; then
    NEXT_ACTION_ID="${NEXT_ACTION_FILE%.json}"
    echo "Rejecting action $NEXT_ACTION_ID..."
    python3 scripts/executor/executor.py reject "$NEXT_ACTION_ID"
    if [ -f "$HOMELAB_ACTIONS_ROOT/rejected/$NEXT_ACTION_FILE" ]; then
        echo "SUCCESS: Action $NEXT_ACTION_ID moved to rejected."
    else
        echo "FAILURE: Action $NEXT_ACTION_ID NOT found in rejected."
        exit 1
    fi
 fi
 # 8. Verify events
 echo "Verifying events in $EVENT_LOG..."
 grep "action_created" "$EVENT_LOG" | tail -n 1
 grep "action_approved" "$EVENT_LOG" | tail -n 1
 grep "action_started" "$EVENT_LOG" | tail -n 1
 grep "action_completed" "$EVENT_LOG" | tail -n 1
 grep "action_rejected" "$EVENT_LOG" | tail -n 1
 echo "=== Validation Complete ==="
--- a/scripts/supervisor/supervisor.py
+++ b/scripts/supervisor/supervisor.py
@ -5,14 +5,19 @@ import yaml
 import json
 import time
 import glob
 import uuid
 from pathlib import Path
 # Configuration
 WORLD_STATE_PATH = Path(os.getenv("HOMELAB_WORLD_ROOT", "/opt/homelab/world"))
 ACTIONS_ROOT = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
 INVENTORY_PATH = Path("hosts")
 EVENT_LOG = Path("/tmp/agent-events.log")
 CHECKPOINT_FILE = Path("/tmp/supervisor-checkpoint.json")
 # Action Queue Layout
 ACTION_DIRS = ["pending", "approved", "running", "completed", "failed", "rejected"]
 # Reconcile event types
 RECONCILE_REQUIRED = "reconcile_required"
 RECONCILE_RECOMMENDED = "reconcile_recommended"
@ -24,6 +29,70 @@ STATE_DEGRADED = "degraded"
 STATE_UNSTABLE = "unstable"
 STATE_RECONCILING = "reconciling"
 def ensure_action_dirs():
    """Ensure action queue directories exist."""
    for d in ACTION_DIRS:
        (ACTIONS_ROOT / d).mkdir(parents=True, exist_ok=True)
 def emit_action_proposal(recommendation):
    """Convert recommendation to action proposal and save to pending/."""
    ensure_action_dirs()
    action_type_map = {
        "redeploy": "redeploy_service",
        "deploy": "redeploy_service",
        "diagnostics": "collect_diagnostics",
        "failover_review": "collect_diagnostics",
        "review": "collect_diagnostics",
        "delayed_deployment": "rerun_deployment_stage"
    }
    action_type = action_type_map.get(recommendation["action"], "collect_diagnostics")
    risk_level_map = {
        "redeploy_service": "guarded",
        "rerun_healthcheck": "safe",
        "rerun_deployment_stage": "guarded",
        "collect_diagnostics": "safe"
    }
    risk_level = risk_level_map.get(action_type, "dangerous")
    # Dangerous always requires approval
    # Guarded defaults to approval
    approval_required = risk_level in ["dangerous", "guarded"]
    action_id = str(uuid.uuid4())
    action = {
        "action_id": action_id,
        "created_at": time.time(),
        "proposed_by": "supervisor",
        "correlation_id": str(uuid.uuid4()), # In a real system, link to drift ID
        "node": recommendation["drift"].get("node"),
        "service": recommendation["drift"].get("service"),
        "action_type": action_type,
        "risk_level": risk_level,
        "confidence": 0.9, # Default confidence
        "approval_required": approval_required,
        "autonomous_eligible": False, # No autonomy yet
        "status": "pending",
        "payload": recommendation["drift"],
        "rollback_reference": None
    }
    file_path = ACTIONS_ROOT / "pending" / f"{action_id}.json"
    try:
        with open(file_path, "w") as f:
            json.dump(action, f, indent=2)
        emit_event("action_created", f"Action proposed: {action_type} for {action.get('service') or action.get('node')}", {
            "action_id": action_id,
            "action_type": action_type,
            "node": action.get("node"),
            "service": action.get("service")
        })
    except Exception as e:
        print(f"Error emitting action proposal: {e}", file=sys.stderr)
 def emit_event(event_type, message, details=None):
    """Emit reconciliation events using existing event system (append-only file)."""
    event = {
@ -278,6 +347,8 @@ def main():
        # Emit reconciliation events
        for rec in recommendations:
            emit_event(rec["type"], rec["message"], rec["drift"])
            # Proposed: Emit action proposals to action queue
            emit_action_proposal(rec)
    # 6. Save checkpoint
    save_checkpoint({
--- a/tmp/homelab/actions/completed/0083f8ad-1f2b-47a4-81a8-81e59740879e.json
+++ b/tmp/homelab/actions/completed/0083f8ad-1f2b-47a4-81a8-81e59740879e.json
@ -0,0 +1,24 @@
 {
  "action_id": "0083f8ad-1f2b-47a4-81a8-81e59740879e",
  "created_at": 1778600485.050643,
  "proposed_by": "supervisor",
  "correlation_id": "6d88755b-ca89-45eb-bf2d-506fca631144",
  "node": "node1",
  "service": "homeassistant",
  "action_type": "redeploy_service",
  "risk_level": "guarded",
  "confidence": 0.9,
  "approval_required": true,
  "autonomous_eligible": false,
  "status": "completed",
  "payload": {
    "type": "unhealthy_service",
    "service": "homeassistant",
    "status": "unhealthy",
    "node": "node1"
  },
  "rollback_reference": null,
  "approved_at": 1778600485.1278665,
  "started_at": 1778600485.1792338,
  "finished_at": 1778600485.6797137
 }
--- a/tmp/homelab/actions/completed/050add79-3265-4e35-bb88-41c368bbccda.json
+++ b/tmp/homelab/actions/completed/050add79-3265-4e35-bb88-41c368bbccda.json
@ -0,0 +1,23 @@
 {
  "action_id": "050add79-3265-4e35-bb88-41c368bbccda",
  "created_at": 1778600510.7529757,
  "proposed_by": "supervisor",
  "correlation_id": "d8ba7d84-74dd-46c8-a085-5ed8ba186770",
  "node": null,
  "service": "webapp",
  "action_type": "collect_diagnostics",
  "risk_level": "safe",
  "confidence": 0.9,
  "approval_required": false,
  "autonomous_eligible": false,
  "status": "completed",
  "payload": {
    "type": "failed_deployment",
    "deployment_id": "dep-001",
    "service": "webapp"
  },
  "rollback_reference": null,
  "approved_at": 1778600510.8252015,
  "started_at": 1778600510.8744874,
  "finished_at": 1778600511.3750403
 }
--- a/tmp/homelab/actions/completed/resumable-task.json
+++ b/tmp/homelab/actions/completed/resumable-task.json
@ -0,0 +1,7 @@
 {
  "action_id": "resumable-task",
  "action_type": "rerun_healthcheck",
  "status": "completed",
  "started_at": 1778600488.5642526,
  "finished_at": 1778600489.0646975
 }
--- a/tmp/homelab/actions/history.log
+++ b/tmp/homelab/actions/history.log
@ -0,0 +1,10 @@
 {"timestamp": 1778600485.1282582, "action_id": "0083f8ad-1f2b-47a4-81a8-81e59740879e", "status": "approved", "message": "Manual approval received"}
 {"timestamp": 1778600485.179484, "action_id": "0083f8ad-1f2b-47a4-81a8-81e59740879e", "status": "running", "message": "Execution started (dry_run=False)"}
 {"timestamp": 1778600485.680433, "action_id": "0083f8ad-1f2b-47a4-81a8-81e59740879e", "status": "completed", "message": "Execution finished"}
 {"timestamp": 1778600485.7410686, "action_id": "2143ae5b-bcc6-410b-b925-e7def70fc013", "status": "rejected", "message": "Manual rejection received"}
 {"timestamp": 1778600488.5644836, "action_id": "resumable-task", "status": "running", "message": "Execution started (dry_run=False)"}
 {"timestamp": 1778600489.0652084, "action_id": "resumable-task", "status": "completed", "message": "Execution finished"}
 {"timestamp": 1778600510.825529, "action_id": "050add79-3265-4e35-bb88-41c368bbccda", "status": "approved", "message": "Manual approval received"}
 {"timestamp": 1778600510.8747966, "action_id": "050add79-3265-4e35-bb88-41c368bbccda", "status": "running", "message": "Execution started (dry_run=False)"}
 {"timestamp": 1778600511.3755214, "action_id": "050add79-3265-4e35-bb88-41c368bbccda", "status": "completed", "message": "Execution finished"}
 {"timestamp": 1778600511.4307747, "action_id": "240cbbc0-891e-4032-bf73-1fa40ff850b4", "status": "rejected", "message": "Manual rejection received"}
--- a/tmp/homelab/actions/pending/50d7cdab-2f12-449f-965a-0383e32babaa.json
+++ b/tmp/homelab/actions/pending/50d7cdab-2f12-449f-965a-0383e32babaa.json
@ -0,0 +1,21 @@
 {
  "action_id": "50d7cdab-2f12-449f-965a-0383e32babaa",
  "created_at": 1778600485.053174,
  "proposed_by": "supervisor",
  "correlation_id": "a2899a7f-548f-455d-a8dd-4e208be58e00",
  "node": null,
  "service": null,
  "action_type": "collect_diagnostics",
  "risk_level": "safe",
  "confidence": 0.9,
  "approval_required": false,
  "autonomous_eligible": false,
  "status": "pending",
  "payload": {
    "type": "unresolved_incident",
    "incident_id": "inc-99",
    "description": "High memory usage on node1",
    "status": "investigating"
  },
  "rollback_reference": null
 }
--- a/tmp/homelab/actions/pending/5e239d96-ff3f-48a3-a71a-ad5aa6b7ff88.json
+++ b/tmp/homelab/actions/pending/5e239d96-ff3f-48a3-a71a-ad5aa6b7ff88.json
@ -0,0 +1,20 @@
 {
  "action_id": "5e239d96-ff3f-48a3-a71a-ad5aa6b7ff88",
  "created_at": 1778600485.05199,
  "proposed_by": "supervisor",
  "correlation_id": "c5fa628e-35a1-44f9-9119-07d93f20af80",
  "node": null,
  "service": "webapp",
  "action_type": "collect_diagnostics",
  "risk_level": "safe",
  "confidence": 0.9,
  "approval_required": false,
  "autonomous_eligible": false,
  "status": "pending",
  "payload": {
    "type": "failed_deployment",
    "deployment_id": "dep-002",
    "service": "webapp"
  },
  "rollback_reference": null
 }
--- a/tmp/homelab/actions/pending/7cde5093-3394-43af-9391-321c50ac5362.json
+++ b/tmp/homelab/actions/pending/7cde5093-3394-43af-9391-321c50ac5362.json
@ -0,0 +1,20 @@
 {
  "action_id": "7cde5093-3394-43af-9391-321c50ac5362",
  "created_at": 1778600510.7521193,
  "proposed_by": "supervisor",
  "correlation_id": "2a91f58e-e10d-4de5-abd7-5f4fe6fdc325",
  "node": null,
  "service": "webapp",
  "action_type": "collect_diagnostics",
  "risk_level": "safe",
  "confidence": 0.9,
  "approval_required": false,
  "autonomous_eligible": false,
  "status": "pending",
  "payload": {
    "type": "failed_deployment",
    "deployment_id": "dep-002",
    "service": "webapp"
  },
  "rollback_reference": null
 }
--- a/tmp/homelab/actions/pending/a42e2183-ca22-4a50-97a7-eb53ab0e039a.json
+++ b/tmp/homelab/actions/pending/a42e2183-ca22-4a50-97a7-eb53ab0e039a.json
@ -0,0 +1,20 @@
 {
  "action_id": "a42e2183-ca22-4a50-97a7-eb53ab0e039a",
  "created_at": 1778600510.75163,
  "proposed_by": "supervisor",
  "correlation_id": "ec2a1960-5baa-453a-8380-65fc9376cc82",
  "node": "node2",
  "service": null,
  "action_type": "collect_diagnostics",
  "risk_level": "safe",
  "confidence": 0.9,
  "approval_required": false,
  "autonomous_eligible": false,
  "status": "pending",
  "payload": {
    "type": "offline_node",
    "node": "node2",
    "status": "offline"
  },
  "rollback_reference": null
 }
--- a/tmp/homelab/actions/pending/aae83bcd-455f-4b59-bab0-7c7994116468.json
+++ b/tmp/homelab/actions/pending/aae83bcd-455f-4b59-bab0-7c7994116468.json
@ -0,0 +1,21 @@
 {
  "action_id": "aae83bcd-455f-4b59-bab0-7c7994116468",
  "created_at": 1778600510.7506568,
  "proposed_by": "supervisor",
  "correlation_id": "0a786305-46cb-4837-8725-53d99203f39e",
  "node": "node1",
  "service": "homeassistant",
  "action_type": "redeploy_service",
  "risk_level": "guarded",
  "confidence": 0.9,
  "approval_required": true,
  "autonomous_eligible": false,
  "status": "pending",
  "payload": {
    "type": "unhealthy_service",
    "service": "homeassistant",
    "status": "unhealthy",
    "node": "node1"
  },
  "rollback_reference": null
 }
--- a/tmp/homelab/actions/pending/c2e6c844-6d96-4ea7-b924-5e33764e5493.json
+++ b/tmp/homelab/actions/pending/c2e6c844-6d96-4ea7-b924-5e33764e5493.json
@ -0,0 +1,21 @@
 {
  "action_id": "c2e6c844-6d96-4ea7-b924-5e33764e5493",
  "created_at": 1778600510.7533653,
  "proposed_by": "supervisor",
  "correlation_id": "6ffc0579-71ac-417f-8ea1-fc46e54527c6",
  "node": null,
  "service": null,
  "action_type": "collect_diagnostics",
  "risk_level": "safe",
  "confidence": 0.9,
  "approval_required": false,
  "autonomous_eligible": false,
  "status": "pending",
  "payload": {
    "type": "unresolved_incident",
    "incident_id": "inc-99",
    "description": "High memory usage on node1",
    "status": "investigating"
  },
  "rollback_reference": null
 }
--- a/tmp/homelab/actions/pending/c91a4171-e636-4194-a146-6e003d2f2586.json
+++ b/tmp/homelab/actions/pending/c91a4171-e636-4194-a146-6e003d2f2586.json
@ -0,0 +1,20 @@
 {
  "action_id": "c91a4171-e636-4194-a146-6e003d2f2586",
  "created_at": 1778600510.7511823,
  "proposed_by": "supervisor",
  "correlation_id": "966a62ee-f81b-497d-96cb-7749f4da0c6f",
  "node": "node2",
  "service": "webapp",
  "action_type": "rerun_deployment_stage",
  "risk_level": "guarded",
  "confidence": 0.9,
  "approval_required": true,
  "autonomous_eligible": false,
  "status": "pending",
  "payload": {
    "type": "missing_service",
    "service": "webapp",
    "node": "node2"
  },
  "rollback_reference": null
 }
--- a/tmp/homelab/actions/pending/e6d3f0d6-c294-4282-b9f4-a730f9cec9dc.json
+++ b/tmp/homelab/actions/pending/e6d3f0d6-c294-4282-b9f4-a730f9cec9dc.json
@ -0,0 +1,20 @@
 {
  "action_id": "e6d3f0d6-c294-4282-b9f4-a730f9cec9dc",
  "created_at": 1778600485.0515254,
  "proposed_by": "supervisor",
  "correlation_id": "bf51852b-0b34-4b4b-98c9-fffff38f77ce",
  "node": "node2",
  "service": null,
  "action_type": "collect_diagnostics",
  "risk_level": "safe",
  "confidence": 0.9,
  "approval_required": false,
  "autonomous_eligible": false,
  "status": "pending",
  "payload": {
    "type": "offline_node",
    "node": "node2",
    "status": "offline"
  },
  "rollback_reference": null
 }
--- a/tmp/homelab/actions/pending/f4c56df2-6775-484b-806e-cdecdcc19584.json
+++ b/tmp/homelab/actions/pending/f4c56df2-6775-484b-806e-cdecdcc19584.json
@ -0,0 +1,20 @@
 {
  "action_id": "f4c56df2-6775-484b-806e-cdecdcc19584",
  "created_at": 1778600485.0527768,
  "proposed_by": "supervisor",
  "correlation_id": "f974d640-d0fb-4a85-bf8a-eda100182181",
  "node": null,
  "service": "webapp",
  "action_type": "collect_diagnostics",
  "risk_level": "safe",
  "confidence": 0.9,
  "approval_required": false,
  "autonomous_eligible": false,
  "status": "pending",
  "payload": {
    "type": "failed_deployment",
    "deployment_id": "dep-001",
    "service": "webapp"
  },
  "rollback_reference": null
 }
--- a/tmp/homelab/actions/pending/ff3da03c-fffa-49a7-985d-ed4589ab6856.json
+++ b/tmp/homelab/actions/pending/ff3da03c-fffa-49a7-985d-ed4589ab6856.json
@ -0,0 +1,20 @@
 {
  "action_id": "ff3da03c-fffa-49a7-985d-ed4589ab6856",
  "created_at": 1778600485.0510974,
  "proposed_by": "supervisor",
  "correlation_id": "37da2d5b-3ecd-4a29-97c2-7e9461b1792e",
  "node": "node2",
  "service": "webapp",
  "action_type": "rerun_deployment_stage",
  "risk_level": "guarded",
  "confidence": 0.9,
  "approval_required": true,
  "autonomous_eligible": false,
  "status": "pending",
  "payload": {
    "type": "missing_service",
    "service": "webapp",
    "node": "node2"
  },
  "rollback_reference": null
 }
--- a/tmp/homelab/actions/rejected/2143ae5b-bcc6-410b-b925-e7def70fc013.json
+++ b/tmp/homelab/actions/rejected/2143ae5b-bcc6-410b-b925-e7def70fc013.json
@ -0,0 +1,21 @@
 {
  "action_id": "2143ae5b-bcc6-410b-b925-e7def70fc013",
  "created_at": 1778600485.0523734,
  "proposed_by": "supervisor",
  "correlation_id": "dc23556c-68d2-41a3-a5d2-9ad66705f989",
  "node": null,
  "service": "webapp",
  "action_type": "collect_diagnostics",
  "risk_level": "safe",
  "confidence": 0.9,
  "approval_required": false,
  "autonomous_eligible": false,
  "status": "rejected",
  "payload": {
    "type": "failed_deployment",
    "deployment_id": "dep-003",
    "service": "webapp"
  },
  "rollback_reference": null,
  "rejected_at": 1778600485.740686
 }
--- a/tmp/homelab/actions/rejected/240cbbc0-891e-4032-bf73-1fa40ff850b4.json
+++ b/tmp/homelab/actions/rejected/240cbbc0-891e-4032-bf73-1fa40ff850b4.json
@ -0,0 +1,21 @@
 {
  "action_id": "240cbbc0-891e-4032-bf73-1fa40ff850b4",
  "created_at": 1778600510.7525399,
  "proposed_by": "supervisor",
  "correlation_id": "fd234809-82aa-459d-858b-18bc3205a6c5",
  "node": null,
  "service": "webapp",
  "action_type": "collect_diagnostics",
  "risk_level": "safe",
  "confidence": 0.9,
  "approval_required": false,
  "autonomous_eligible": false,
  "status": "rejected",
  "payload": {
    "type": "failed_deployment",
    "deployment_id": "dep-003",
    "service": "webapp"
  },
  "rollback_reference": null,
  "rejected_at": 1778600511.4303465
 }
--- a/tmp/homelab/world/deployments/dep-001.json
+++ b/tmp/homelab/world/deployments/dep-001.json
@ -1 +1 @@
-{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778597957}
+{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778600510}
--- a/tmp/homelab/world/deployments/dep-002.json
+++ b/tmp/homelab/world/deployments/dep-002.json
@ -1 +1 @@
-{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778597657}
+{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778600210}
--- a/tmp/homelab/world/deployments/dep-003.json
+++ b/tmp/homelab/world/deployments/dep-003.json
@ -1 +1 @@
-{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778597357}
+{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778599910}
--- a/webui/index.html
+++ b/webui/index.html
@ -216,9 +216,9 @@
    .label { color: var(--text-muted); font-size: 12px; margin-bottom: 4px; }
    .value { font-weight: 500; margin-bottom: 12px; }
-    .risk-safe { color: var(--safe); }
+    .risk-safe { background: rgba(62, 175, 124, 0.1); color: var(--safe); }
-    .risk-guarded { color: var(--guarded); }
+    .risk-guarded { background: rgba(230, 126, 34, 0.1); color: var(--guarded); }
-    .risk-dangerous { color: var(--dangerous); }
+    .risk-dangerous { background: rgba(192, 57, 43, 0.1); color: var(--dangerous); }
  </style>
 </head>
@ -229,6 +229,9 @@
      <li class="nav-item active" onclick="showView('dashboard', this)">
        <span>Dashboard</span>
      </li>
      <li class="nav-item" onclick="showView('actions', this)">
        <span>Action Queue</span>
      </li>
      <li class="nav-item" onclick="showView('nodes', this)">
        <span>Nodes</span>
      </li>
@ -238,9 +241,15 @@
      <li class="nav-item" onclick="showView('deployments', this)">
        <span>Deployments</span>
      </li>
      <li class="nav-item" onclick="showView('topology', this)">
        <span>Topology</span>
      </li>
      <li class="nav-item" onclick="showView('events', this)">
        <span>Events</span>
      </li>
      <li class="nav-item" onclick="showView('correlation', this)">
        <span>Correlation</span>
      </li>
      <li class="nav-item" onclick="showView('recommendations', this)">
        <span>Recommendations</span>
      </li>
@ -255,7 +264,16 @@
  <main class="main-content">
    <header>
-      <div class="view-title" id="current-view-title">Dashboard</div>
+      <div style="display:flex; align-items:center; gap:20px">
        <div class="view-title" id="current-view-title">Dashboard</div>
        <select id="operator-mode" onchange="setOperatorMode(this.value)" style="background:var(--sidebar-color); border:1px solid var(--border-color); color:var(--accent-color); font-weight:bold; font-size:12px; padding:4px 8px">
          <option value="observe">OBSERVE</option>
          <option value="recommend">RECOMMEND</option>
          <option value="approval" selected>APPROVAL</option>
          <option value="autonomous">AUTONOMOUS</option>
          <option value="maintenance">MAINTENANCE</option>
        </select>
      </div>
      <div class="header-actions">
        <button onclick="refreshData()">Refresh</button>
      </div>
@ -269,6 +287,10 @@
            <div class="card-title">System Overview</div>
            <div id="dashboard-summary" style="margin-top:20px"></div>
          </div>
          <div class="card">
            <div class="card-title">Pending Actions</div>
            <div id="dashboard-actions-summary" style="margin-top:20px"></div>
          </div>
          <div class="card">
            <div class="card-title">Active Incidents</div>
            <div id="dashboard-incidents" style="margin-top:20px"></div>
@ -276,6 +298,20 @@
        </div>
      </div>
      <!-- Actions View -->
      <div id="view-actions" class="view hidden">
        <div style="display:grid; grid-template-columns: 1fr 1fr; gap:24px">
          <div>
            <h3>Pending Approval</h3>
            <div id="actions-pending" class="timeline"></div>
          </div>
          <div>
            <h3>Active / History</h3>
            <div id="actions-history" class="timeline"></div>
          </div>
        </div>
      </div>
      <!-- Nodes View -->
      <div id="view-nodes" class="view hidden">
        <div class="grid" id="nodes-list"></div>
@ -291,11 +327,24 @@
        <div class="grid" id="deployments-list"></div>
      </div>
      <!-- Topology View -->
      <div id="view-topology" class="view hidden">
        <div class="card" style="min-height:500px">
          <div class="card-title">Runtime Topology</div>
          <div id="topology-map" style="margin-top:20px; display:flex; flex-wrap:wrap; gap:40px; justify-content:center"></div>
        </div>
      </div>
      <!-- Events View -->
      <div id="view-events" class="view hidden">
        <div class="timeline" id="events-timeline"></div>
      </div>
      <!-- Correlation View -->
      <div id="view-correlation" class="view hidden">
        <div id="correlation-chains" class="grid"></div>
      </div>
      <!-- Recommendations View -->
      <div id="view-recommendations" class="view hidden">
        <div class="grid" id="recommendations-list"></div>
@ -335,6 +384,34 @@
      }
    }
    async function postData(endpoint, data) {
      try {
        const res = await fetch(endpoint, {
          method: 'POST',
          headers: {'Content-Type': 'application/json'},
          body: JSON.stringify(data)
        });
        return await res.json();
      } catch (e) {
        console.error('Post error:', endpoint, e);
        return null;
      }
    }
    async function mutateAction(id, status) {
      const res = await postData('/action/mutate', {id, status});
      if (res && res.status === 'ok') {
        refreshData();
      } else {
        alert('Mutation failed');
      }
    }
    function setOperatorMode(mode) {
      console.log('Operator mode set to:', mode);
      // In real system, this would call backend
    }
    function formatTime(ts) {
      if (!ts) return 'N/A';
      return new Date(ts * 1000).toLocaleString();
@ -368,6 +445,53 @@
        }
      }
      if (currentView === 'dashboard' || currentView === 'actions') {
          const actions = await fetchData('/actions');
          if (actions) {
              if (currentView === 'dashboard') {
                  const dashActions = document.getElementById('dashboard-actions-summary');
                  const pendingCount = actions.pending.length;
                  dashActions.innerHTML = `
                    <div class="label">Pending</div><div class="value" style="color:var(--guarded)">${pendingCount}</div>
                    <div class="label">Running</div><div class="value" style="color:var(--reconciling)">${actions.running.length}</div>
                  `;
              }
              if (currentView === 'actions') {
                  const pendingEl = document.getElementById('actions-pending');
                  const historyEl = document.getElementById('actions-history');
                  pendingEl.innerHTML = actions.pending.map(a => `
                    <div class="card" style="margin-bottom:12px">
                        <div class="card-header">
                            <div class="card-title">${a.type.toUpperCase()}</div>
                            <span class="badge risk-${a.risk_level}">${a.risk_level}</span>
                        </div>
                        <p>${a.description}</p>
                        <div class="label">Target</div><div class="value">${a.target.node} ${a.target.service || ''}</div>
                        <div class="label">Confidence</div><div class="value">${Math.round(a.confidence*100)}%</div>
                        <div class="controls">
                            <button class="btn-primary" onclick="mutateAction('${a.id}', 'approved')">Approve</button>
                            <button onclick="mutateAction('${a.id}', 'rejected')">Reject</button>
                        </div>
                    </div>
                  `).join('') || 'No pending actions.';
                  const history = [...actions.approved, ...actions.running, ...actions.completed, ...actions.failed];
                  historyEl.innerHTML = history.sort((a,b) => b.timestamp - a.timestamp).map(a => `
                    <div class="event">
                        <div class="event-header">
                            <span>${a.type.toUpperCase()}</span>
                            <span class="badge ${getStatusClass(a.status)}">${a.status}</span>
                        </div>
                        <div>${a.description}</div>
                        <small>${formatTime(a.timestamp)} | Target: ${a.target.node}</small>
                        ${a.status === 'approved' ? `<div class="controls"><button class="btn-primary" onclick="mutateAction('${a.id}', 'running')">Execute</button></div>` : ''}
                    </div>
                  `).join('') || 'No history.';
              }
          }
      }
      if (currentView === 'dashboard' || currentView === 'events') {
          const incidents = await fetchData('/incidents');
          if (currentView === 'dashboard') {
@ -474,6 +598,64 @@
        `).join('');
      }
      if (currentView === 'topology') {
          const nodes = await fetchData('/nodes');
          const services = await fetchData('/services');
          const topMap = document.getElementById('topology-map');
          if (nodes && services) {
              topMap.innerHTML = nodes.map(node => {
                  const nodeServices = services.filter(s => s.node === node.hostname || s.node === node.id);
                  return `
                    <div class="card" style="width:250px; border: 1px solid ${node.health === 'nominal' ? 'var(--border-color)' : 'var(--error)'}">
                        <div class="card-header">
                            <div class="card-title">${node.hostname}</div>
                            <span class="badge ${getStatusClass(node.health)}">${node.health}</span>
                        </div>
                        <div class="label">Capabilities</div>
                        <div class="value" style="font-size:11px">${node.capabilities.join(', ')}</div>
                        <div class="label">Services</div>
                        <div style="font-size:12px; margin-bottom:10px">
                            ${nodeServices.length > 0 ? nodeServices.map(s => `
                                <div style="display:flex; justify-content:space-between; margin-bottom:4px; padding:4px; background:rgba(255,255,255,0.03)">
                                    <span>${s.name}</span>
                                    <span class="${getStatusClass(s.health)}" style="font-size:10px">${s.health}</span>
                                </div>
                                ${s.dependencies.length > 0 ? `<div style="font-size:9px; color:var(--text-muted); margin-left:8px; margin-bottom:4px">dep: ${s.dependencies.join(', ')}</div>` : ''}
                            `).join('') : '<div class="value">None</div>'}
                        </div>
                    </div>
                  `;
              }).join('');
          }
      }
      if (currentView === 'correlation') {
          const incidents = await fetchData('/incidents');
          const actions = await fetchData('/actions');
          const list = document.getElementById('correlation-chains');
          if (incidents && actions) {
              const allActions = Object.values(actions).flat();
              list.innerHTML = incidents.map(inc => {
                  const related = allActions.filter(a => a.correlation_chain && a.correlation_chain.includes(inc.id));
                  return `
                    <div class="card">
                        <div class="card-header">
                            <div class="card-title">Incident: ${inc.id || 'INC-001'}</div>
                            <span class="badge status-error">Active</span>
                        </div>
                        <p>${inc.message}</p>
                        <div class="label">Related Actions</div>
                        ${related.map(a => `
                            <div class="event" style="margin-top:5px">
                                <strong>${a.type}</strong> (${a.status})<br>
                                <small>${a.description}</small>
                            </div>
                        `).join('') || '<div class="value">No actions yet</div>'}
                    </div>
                  `;
              }).join('');
          }
      }
      if (currentView === 'settings') {
          const config = await fetchData('/config');
          const content = document.getElementById('settings-content');
@ -482,6 +664,8 @@
              <div class="value">${config.auto_mode ? 'Enabled' : 'Disabled'}</div>
              <div class="label">Action Thresholds</div>
              <div class="value mono">${JSON.stringify(config.action_thresholds, null, 2)}</div>
              <div class="label">Telegram Integration</div>
              <div class="value" style="color:var(--text-muted)">Ready for mobile approval flows. Hook: /api/v1/telegram/webhook</div>
              <button onclick="alert('Settings update not implemented in this demo')">Edit Configuration</button>
          `;
      }
--- a/webui/web.py
+++ b/webui/web.py
@ -8,6 +8,7 @@ from pathlib import Path
 STATE_DIR = Path("/opt/homelab/state")
 EVENTS_DIR = Path("/opt/homelab/events")
 WORLD_DIR = Path("/opt/homelab/world")
 ACTIONS_DIR = Path("/opt/homelab/actions")
 EVENT_LOG = Path("/tmp/agent-events.log")
 STATIC_DIR = Path(__file__).parent
 REDIS_HOST = os.getenv("REDIS_HOST", "redis")
@ -164,6 +165,55 @@ def current_events():
    return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True)
 def current_actions():
    actions = {}
    statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
    for status in statuses:
        actions[status] = []
        status_dir = ACTIONS_DIR / status
        if status_dir.exists():
            for f in status_dir.glob("*.json"):
                data = read_json_file(f)
                if data:
                    actions[status].append(data)
    return actions
 def mutate_action(action_id, target_status):
    statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
    if target_status not in statuses:
        return False, f"Invalid target status: {target_status}"
    # Find where the action is
    source_path = None
    for status in statuses:
        p = ACTIONS_DIR / status / f"{action_id}.json"
        if p.exists():
            source_path = p
            break
    if not source_path:
        return False, f"Action {action_id} not found"
    target_dir = ACTIONS_DIR / target_status
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = target_dir / f"{action_id}.json"
    try:
        data = json.loads(source_path.read_text())
        data["status"] = target_status
        data["last_mutation"] = os.path.getmtime(source_path) # or current time
        import time
        data["last_mutation"] = time.time()
        target_path.write_text(json.dumps(data, indent=2))
        if source_path != target_path:
            source_path.unlink()
        return True, "Success"
    except Exception as e:
        return False, str(e)
 def send_json(status, payload, handler):
    body = (json.dumps(payload) + "\n").encode("utf-8")
    handler.send_response(status)
@ -207,6 +257,10 @@ class Handler(BaseHTTPRequestHandler):
            send_json(200, current_events(), self)
            return
        if self.path == "/actions":
            send_json(200, current_actions(), self)
            return
        if self.path == "/logs":
            print("LOGS endpoint called", flush=True)
            body = ("\n".join(tail_lines(EVENT_LOG, 200)) + "\n").encode("utf-8")
@ -236,6 +290,7 @@ class Handler(BaseHTTPRequestHandler):
            "/auto-mode",
            "/config",
            "/events",
            "/action/mutate",
        ):
            self.send_error(404)
            return
@ -291,6 +346,19 @@ class Handler(BaseHTTPRequestHandler):
            send_json(200, {"status": "sent"}, self)
            return
        if self.path == "/action/mutate":
            action_id = payload.get("id")
            target = payload.get("status")
            if not action_id or not target:
                self.send_error(400, "id and status are required")
                return
            success, msg = mutate_action(action_id, target)
            if success:
                send_json(200, {"status": "ok"}, self)
            else:
                self.send_error(500, msg)
            return
        if not command:
            self.send_error(400, "command is required")
            return
`@ -1 +1 @@`
	`{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778597957}`	`{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778600510}`
`@ -1 +1 @@`
	`{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778597657}`	`{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778600210}`
`@ -1 +1 @@`
	`{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778597357}`	`{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778599910}`