Resolve merge conflicts

This commit is contained in:
Oskar Kapala 2026-05-12 18:01:37 +02:00
parent 5ff4ddc2c9
commit 50d03b9f96
29 changed files with 1615 additions and 15 deletions

541
.output.txt Normal file
View file

@ -0,0 +1,541 @@
Command finished with exit code 0.
Command output:
diff --git a/docs/operator/reconcile-review.md b/docs/operator/reconcile-review.md
index 8b99c6f..e56c333 100644
--- a/docs/operator/reconcile-review.md
+++ b/docs/operator/reconcile-review.md
@@ -2,11 +2,13 @@
The system continuously monitors for drift between desired and actual state.
-1. If a service is in RECONCILING state, check the Services view.
-2. Review the Recommendations view for automated or guarded actions.
-3. For 'safe' actions with high confidence, the system may act autonomously if enabled.
-4. For 'guarded' or 'dangerous' actions, an operator must manually approve the action.
-5. Risk Levels:
- - **Safe**: Minimal impact, high success rate.
- - **Guarded**: Potential brief service interruption.
- - **Dangerous**: Significant impact, potential data loss, or hardware interaction required.
+1. **Drift Detection**: When drift is detected, the supervisor generates a recommendation and a corresponding pending action.
+2. **Review**: Navigate to the **Recommendations** view for a high-level summary, or the **Action Queue** for the specific execution plan.
+3. **Approval**: For 'guarded' or 'dangerous' actions, click **Approve** in the Action Queue.
+4. **Execution**: Once approved, the action can be triggered manually by clicking **Execute**, or it will be picked up by the autonomous executor if the system is in `AUTONOMOUS` mode.
+5. **Observation**: Monitor the **Deployments** and **Topology** views to watch the reconciliation in real-time.
+
+Risk Levels:
+- **Safe**: Minimal impact, high success rate.
+- **Guarded**: Potential brief service interruption.
+- **Dangerous**: Significant impact, potential data loss, or node-level disruption.
diff --git a/scripts/supervisor/supervisor.py b/scripts/supervisor/supervisor.py
index e58027b..ce5d162 100644
--- a/scripts/supervisor/supervisor.py
+++ b/scripts/supervisor/supervisor.py
@@ -5,14 +5,19 @@ import yaml
import json
import time
import glob
+import uuid
from pathlib import Path
# Configuration
WORLD_STATE_PATH = Path(os.getenv("HOMELAB_WORLD_ROOT", "/opt/homelab/world"))
+ACTIONS_ROOT = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
INVENTORY_PATH = Path("hosts")
EVENT_LOG = Path("/tmp/agent-events.log")
CHECKPOINT_FILE = Path("/tmp/supervisor-checkpoint.json")
+# Action Queue Layout
+ACTION_DIRS = ["pending", "approved", "running", "completed", "failed", "rejected"]
+
# Reconcile event types
RECONCILE_REQUIRED = "reconcile_required"
RECONCILE_RECOMMENDED = "reconcile_recommended"
@@ -24,6 +29,70 @@ STATE_DEGRADED = "degraded"
STATE_UNSTABLE = "unstable"
STATE_RECONCILING = "reconciling"
+def ensure_action_dirs():
+ """Ensure action queue directories exist."""
+ for d in ACTION_DIRS:
+ (ACTIONS_ROOT / d).mkdir(parents=True, exist_ok=True)
+
+def emit_action_proposal(recommendation):
+ """Convert recommendation to action proposal and save to pending/."""
+ ensure_action_dirs()
+
+ action_type_map = {
+ "redeploy": "redeploy_service",
+ "deploy": "redeploy_service",
+ "diagnostics": "collect_diagnostics",
+ "failover_review": "collect_diagnostics",
+ "review": "collect_diagnostics",
+ "delayed_deployment": "rerun_deployment_stage"
+ }
+
+ action_type = action_type_map.get(recommendation["action"], "collect_diagnostics")
+
+ risk_level_map = {
+ "redeploy_service": "guarded",
+ "rerun_healthcheck": "safe",
+ "rerun_deployment_stage": "guarded",
+ "collect_diagnostics": "safe"
+ }
+ risk_level = risk_level_map.get(action_type, "dangerous")
+
+ # Dangerous always requires approval
+ # Guarded defaults to approval
+ approval_required = risk_level in ["dangerous", "guarded"]
+
+ action_id = str(uuid.uuid4())
+ action = {
+ "action_id": action_id,
+ "created_at": time.time(),
+ "proposed_by": "supervisor",
+ "correlation_id": str(uuid.uuid4()), # In a real system, link to drift ID
+ "node": recommendation["drift"].get("node"),
+ "service": recommendation["drift"].get("service"),
+ "action_type": action_type,
+ "risk_level": risk_level,
+ "confidence": 0.9, # Default confidence
+ "approval_required": approval_required,
+ "autonomous_eligible": False, # No autonomy yet
+ "status": "pending",
+ "payload": recommendation["drift"],
+ "rollback_reference": None
+ }
+
+ file_path = ACTIONS_ROOT / "pending" / f"{action_id}.json"
+ try:
+ with open(file_path, "w") as f:
+ json.dump(action, f, indent=2)
+
+ emit_event("action_created", f"Action proposed: {action_type} for {action.get('service') or action.get('node')}", {
+ "action_id": action_id,
+ "action_type": action_type,
+ "node": action.get("node"),
+ "service": action.get("service")
+ })
+ except Exception as e:
+ print(f"Error emitting action proposal: {e}", file=sys.stderr)
+
def emit_event(event_type, message, details=None):
"""Emit reconciliation events using existing event system (append-only file)."""
event = {
@@ -278,6 +347,8 @@ def main():
# Emit reconciliation events
for rec in recommendations:
emit_event(rec["type"], rec["message"], rec["drift"])
+ # Proposed: Emit action proposals to action queue
+ emit_action_proposal(rec)
# 6. Save checkpoint
save_checkpoint({
diff --git a/tmp/homelab/world/deployments/dep-001.json b/tmp/homelab/world/deployments/dep-001.json
index 02db067..f70d7a8 100644
--- a/tmp/homelab/world/deployments/dep-001.json
+++ b/tmp/homelab/world/deployments/dep-001.json
@@ -1 +1 @@
-{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778597957}
+{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778600510}
diff --git a/tmp/homelab/world/deployments/dep-002.json b/tmp/homelab/world/deployments/dep-002.json
index e977aa0..1ee5a29 100644
--- a/tmp/homelab/world/deployments/dep-002.json
+++ b/tmp/homelab/world/deployments/dep-002.json
@@ -1 +1 @@
-{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778597657}
+{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778600210}
diff --git a/tmp/homelab/world/deployments/dep-003.json b/tmp/homelab/world/deployments/dep-003.json
index 66f10c9..f44385b 100644
--- a/tmp/homelab/world/deployments/dep-003.json
+++ b/tmp/homelab/world/deployments/dep-003.json
@@ -1 +1 @@
-{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778597357}
+{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778599910}
diff --git a/webui/index.html b/webui/index.html
index d720307..5c049c1 100644
--- a/webui/index.html
+++ b/webui/index.html
@@ -216,9 +216,9 @@
.label { color: var(--text-muted); font-size: 12px; margin-bottom: 4px; }
.value { font-weight: 500; margin-bottom: 12px; }
- .risk-safe { color: var(--safe); }
- .risk-guarded { color: var(--guarded); }
- .risk-dangerous { color: var(--dangerous); }
+ .risk-safe { background: rgba(62, 175, 124, 0.1); color: var(--safe); }
+ .risk-guarded { background: rgba(230, 126, 34, 0.1); color: var(--guarded); }
+ .risk-dangerous { background: rgba(192, 57, 43, 0.1); color: var(--dangerous); }
</style>
</head>
@@ -229,6 +229,9 @@
<li class="nav-item active" onclick="showView('dashboard', this)">
<span>Dashboard</span>
</li>
+ <li class="nav-item" onclick="showView('actions', this)">
+ <span>Action Queue</span>
+ </li>
<li class="nav-item" onclick="showView('nodes', this)">
<span>Nodes</span>
</li>
@@ -238,9 +241,15 @@
<li class="nav-item" onclick="showView('deployments', this)">
<span>Deployments</span>
</li>
+ <li class="nav-item" onclick="showView('topology', this)">
+ <span>Topology</span>
+ </li>
<li class="nav-item" onclick="showView('events', this)">
<span>Events</span>
</li>
+ <li class="nav-item" onclick="showView('correlation', this)">
+ <span>Correlation</span>
+ </li>
<li class="nav-item" onclick="showView('recommendations', this)">
<span>Recommendations</span>
</li>
@@ -255,7 +264,16 @@
<main class="main-content">
<header>
- <div class="view-title" id="current-view-title">Dashboard</div>
+ <div style="display:flex; align-items:center; gap:20px">
+ <div class="view-title" id="current-view-title">Dashboard</div>
+ <select id="operator-mode" onchange="setOperatorMode(this.value)" style="background:var(--sidebar-color); border:1px solid var(--border-color); color:var(--accent-color); font-weight:bold; font-size:12px; padding:4px 8px">
+ <option value="observe">OBSERVE</option>
+ <option value="recommend">RECOMMEND</option>
+ <option value="approval" selected>APPROVAL</option>
+ <option value="autonomous">AUTONOMOUS</option>
+ <option value="maintenance">MAINTENANCE</option>
+ </select>
+ </div>
<div class="header-actions">
<button onclick="refreshData()">Refresh</button>
</div>
@@ -269,6 +287,10 @@
<div class="card-title">System Overview</div>
<div id="dashboard-summary" style="margin-top:20px"></div>
</div>
+ <div class="card">
+ <div class="card-title">Pending Actions</div>
+ <div id="dashboard-actions-summary" style="margin-top:20px"></div>
+ </div>
<div class="card">
<div class="card-title">Active Incidents</div>
<div id="dashboard-incidents" style="margin-top:20px"></div>
@@ -276,6 +298,20 @@
</div>
</div>
+ <!-- Actions View -->
+ <div id="view-actions" class="view hidden">
+ <div style="display:grid; grid-template-columns: 1fr 1fr; gap:24px">
+ <div>
+ <h3>Pending Approval</h3>
+ <div id="actions-pending" class="timeline"></div>
+ </div>
+ <div>
+ <h3>Active / History</h3>
+ <div id="actions-history" class="timeline"></div>
+ </div>
+ </div>
+ </div>
+
<!-- Nodes View -->
<div id="view-nodes" class="view hidden">
<div class="grid" id="nodes-list"></div>
@@ -291,11 +327,24 @@
<div class="grid" id="deployments-list"></div>
</div>
+ <!-- Topology View -->
+ <div id="view-topology" class="view hidden">
+ <div class="card" style="min-height:500px">
+ <div class="card-title">Runtime Topology</div>
+ <div id="topology-map" style="margin-top:20px; display:flex; flex-wrap:wrap; gap:40px; justify-content:center"></div>
+ </div>
+ </div>
+
<!-- Events View -->
<div id="view-events" class="view hidden">
<div class="timeline" id="events-timeline"></div>
</div>
+ <!-- Correlation View -->
+ <div id="view-correlation" class="view hidden">
+ <div id="correlation-chains" class="grid"></div>
+ </div>
+
<!-- Recommendations View -->
<div id="view-recommendations" class="view hidden">
<div class="grid" id="recommendations-list"></div>
@@ -335,6 +384,34 @@
}
}
+ async function postData(endpoint, data) {
+ try {
+ const res = await fetch(endpoint, {
+ method: 'POST',
+ headers: {'Content-Type': 'application/json'},
+ body: JSON.stringify(data)
+ });
+ return await res.json();
+ } catch (e) {
+ console.error('Post error:', endpoint, e);
+ return null;
+ }
+ }
+
+ async function mutateAction(id, status) {
+ const res = await postData('/action/mutate', {id, status});
+ if (res && res.status === 'ok') {
+ refreshData();
+ } else {
+ alert('Mutation failed');
+ }
+ }
+
+ function setOperatorMode(mode) {
+ console.log('Operator mode set to:', mode);
+ // In real system, this would call backend
+ }
+
function formatTime(ts) {
if (!ts) return 'N/A';
return new Date(ts * 1000).toLocaleString();
@@ -368,6 +445,53 @@
}
}
+ if (currentView === 'dashboard' || currentView === 'actions') {
+ const actions = await fetchData('/actions');
+ if (actions) {
+ if (currentView === 'dashboard') {
+ const dashActions = document.getElementById('dashboard-actions-summary');
+ const pendingCount = actions.pending.length;
+ dashActions.innerHTML = `
+ <div class="label">Pending</div><div class="value" style="color:var(--guarded)">${pendingCount}</div>
+ <div class="label">Running</div><div class="value" style="color:var(--reconciling)">${actions.running.length}</div>
+ `;
+ }
+ if (currentView === 'actions') {
+ const pendingEl = document.getElementById('actions-pending');
+ const historyEl = document.getElementById('actions-history');
+
+ pendingEl.innerHTML = actions.pending.map(a => `
+ <div class="card" style="margin-bottom:12px">
+ <div class="card-header">
+ <div class="card-title">${a.type.toUpperCase()}</div>
+ <span class="badge risk-${a.risk_level}">${a.risk_level}</span>
+ </div>
+ <p>${a.description}</p>
+ <div class="label">Target</div><div class="value">${a.target.node} ${a.target.service || ''}</div>
+ <div class="label">Confidence</div><div class="value">${Math.round(a.confidence*100)}%</div>
+ <div class="controls">
+ <button class="btn-primary" onclick="mutateAction('${a.id}', 'approved')">Approve</button>
+ <button onclick="mutateAction('${a.id}', 'rejected')">Reject</button>
+ </div>
+ </div>
+ `).join('') || 'No pending actions.';
+
+ const history = [...actions.approved, ...actions.running, ...actions.completed, ...actions.failed];
+ historyEl.innerHTML = history.sort((a,b) => b.timestamp - a.timestamp).map(a => `
+ <div class="event">
+ <div class="event-header">
+ <span>${a.type.toUpperCase()}</span>
+ <span class="badge ${getStatusClass(a.status)}">${a.status}</span>
+ </div>
+ <div>${a.description}</div>
+ <small>${formatTime(a.timestamp)} | Target: ${a.target.node}</small>
+ ${a.status === 'approved' ? `<div class="controls"><button class="btn-primary" onclick="mutateAction('${a.id}', 'running')">Execute</button></div>` : ''}
+ </div>
+ `).join('') || 'No history.';
+ }
+ }
+ }
+
if (currentView === 'dashboard' || currentView === 'events') {
const incidents = await fetchData('/incidents');
if (currentView === 'dashboard') {
@@ -474,6 +598,64 @@
`).join('');
}
+ if (currentView === 'topology') {
+ const nodes = await fetchData('/nodes');
+ const services = await fetchData('/services');
+ const topMap = document.getElementById('topology-map');
+ if (nodes && services) {
+ topMap.innerHTML = nodes.map(node => {
+ const nodeServices = services.filter(s => s.node === node.hostname || s.node === node.id);
+ return `
+ <div class="card" style="width:250px; border: 1px solid ${node.health === 'nominal' ? 'var(--border-color)' : 'var(--error)'}">
+ <div class="card-header">
+ <div class="card-title">${node.hostname}</div>
+ <span class="badge ${getStatusClass(node.health)}">${node.health}</span>
+ </div>
+ <div class="label">Capabilities</div>
+ <div class="value" style="font-size:11px">${node.capabilities.join(', ')}</div>
+ <div class="label">Services</div>
+ <div style="font-size:12px; margin-bottom:10px">
+ ${nodeServices.length > 0 ? nodeServices.map(s => `
+ <div style="display:flex; justify-content:space-between; margin-bottom:4px; padding:4px; background:rgba(255,255,255,0.03)">
+ <span>${s.name}</span>
+ <span class="${getStatusClass(s.health)}" style="font-size:10px">${s.health}</span>
+ </div>
+ ${s.dependencies.length > 0 ? `<div style="font-size:9px; color:var(--text-muted); margin-left:8px; margin-bottom:4px">dep: ${s.dependencies.join(', ')}</div>` : ''}
+ `).join('') : '<div class="value">None</div>'}
+ </div>
+ </div>
+ `;
+ }).join('');
+ }
+ }
+
+ if (currentView === 'correlation') {
+ const incidents = await fetchData('/incidents');
+ const actions = await fetchData('/actions');
+ const list = document.getElementById('correlation-chains');
+ if (incidents && actions) {
+ const allActions = Object.values(actions).flat();
+ list.innerHTML = incidents.map(inc => {
+ const related = allActions.filter(a => a.correlation_chain && a.correlation_chain.includes(inc.id));
+ return `
+ <div class="card">
+ <div class="card-header">
+ <div class="card-title">Incident: ${inc.id || 'INC-001'}</div>
+ <span class="badge status-error">Active</span>
+ </div>
+ <p>${inc.message}</p>
+ <div class="label">Related Actions</div>
+ ${related.map(a => `
+ <div class="event" style="margin-top:5px">
+ <strong>${a.type}</strong> (${a.status})<br>
+ <small>${a.description}</small>
+ </div>
+ `).join('') || '<div class="value">No actions yet</div>'}
+ </div>
+ `;
+ }).join('');
+ }
+ }
if (currentView === 'settings') {
const config = await fetchData('/config');
const content = document.getElementById('settings-content');
@@ -482,6 +664,8 @@
<div class="value">${config.auto_mode ? 'Enabled' : 'Disabled'}</div>
<div class="label">Action Thresholds</div>
<div class="value mono">${JSON.stringify(config.action_thresholds, null, 2)}</div>
+ <div class="label">Telegram Integration</div>
+ <div class="value" style="color:var(--text-muted)">Ready for mobile approval flows. Hook: /api/v1/telegram/webhook</div>
<button onclick="alert('Settings update not implemented in this demo')">Edit Configuration</button>
`;
}
diff --git a/webui/web.py b/webui/web.py
index 053ac1a..4727274 100644
--- a/webui/web.py
+++ b/webui/web.py
@@ -8,6 +8,7 @@ from pathlib import Path
STATE_DIR = Path("/opt/homelab/state")
EVENTS_DIR = Path("/opt/homelab/events")
WORLD_DIR = Path("/opt/homelab/world")
+ACTIONS_DIR = Path("/opt/homelab/actions")
EVENT_LOG = Path("/tmp/agent-events.log")
STATIC_DIR = Path(__file__).parent
REDIS_HOST = os.getenv("REDIS_HOST", "redis")
@@ -164,6 +165,55 @@ def current_events():
return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True)
+def current_actions():
+ actions = {}
+ statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
+ for status in statuses:
+ actions[status] = []
+ status_dir = ACTIONS_DIR / status
+ if status_dir.exists():
+ for f in status_dir.glob("*.json"):
+ data = read_json_file(f)
+ if data:
+ actions[status].append(data)
+ return actions
+
+
+def mutate_action(action_id, target_status):
+ statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
+ if target_status not in statuses:
+ return False, f"Invalid target status: {target_status}"
+
+ # Find where the action is
+ source_path = None
+ for status in statuses:
+ p = ACTIONS_DIR / status / f"{action_id}.json"
+ if p.exists():
+ source_path = p
+ break
+
+ if not source_path:
+ return False, f"Action {action_id} not found"
+
+ target_dir = ACTIONS_DIR / target_status
+ target_dir.mkdir(parents=True, exist_ok=True)
+ target_path = target_dir / f"{action_id}.json"
+
+ try:
+ data = json.loads(source_path.read_text())
+ data["status"] = target_status
+ data["last_mutation"] = os.path.getmtime(source_path) # or current time
+ import time
+ data["last_mutation"] = time.time()
+
+ target_path.write_text(json.dumps(data, indent=2))
+ if source_path != target_path:
+ source_path.unlink()
+ return True, "Success"
+ except Exception as e:
+ return False, str(e)
+
+
def send_json(status, payload, handler):
body = (json.dumps(payload) + "\n").encode("utf-8")
handler.send_response(status)
@@ -207,6 +257,10 @@ class Handler(BaseHTTPRequestHandler):
send_json(200, current_events(), self)
return
+ if self.path == "/actions":
+ send_json(200, current_actions(), self)
+ return
+
if self.path == "/logs":
print("LOGS endpoint called", flush=True)
body = ("\n".join(tail_lines(EVENT_LOG, 200)) + "\n").encode("utf-8")
@@ -236,6 +290,7 @@ class Handler(BaseHTTPRequestHandler):
"/auto-mode",
"/config",
"/events",
+ "/action/mutate",
):
self.send_error(404)
return
@@ -291,6 +346,19 @@ class Handler(BaseHTTPRequestHandler):
send_json(200, {"status": "sent"}, self)
return
+ if self.path == "/action/mutate":
+ action_id = payload.get("id")
+ target = payload.get("status")
+ if not action_id or not target:
+ self.send_error(400, "id and status are required")
+ return
+ success, msg = mutate_action(action_id, target)
+ if success:
+ send_json(200, {"status": "ok"}, self)
+ else:
+ self.send_error(500, msg)
+ return
+
if not command:
self.send_error(400, "command is required")
return

View file

@ -0,0 +1,75 @@
# Action Queue System
The Action Queue System provides a safe, filesystem-first lifecycle for operational actions in the homelab platform. It enables controlled execution with mandatory approval for high-risk operations.
## Action Lifecycle
Actions move through various states, represented by directories under `/opt/homelab/actions/`:
1. **Pending** (`pending/`): Actions proposed by the Supervisor or other agents.
2. **Approved** (`approved/`): Actions that have been reviewed and approved for execution.
3. **Running** (`running/`): Actions currently being processed by the Executor.
4. **Completed** (`completed/`): Successfully executed actions.
5. **Failed** (`failed/`): Actions that encountered errors during execution.
6. **Rejected** (`rejected/`): Proposed actions that were explicitly denied.
## Action Schema
Actions are stored as JSON documents with the following structure:
```json
{
"action_id": "uuid",
"created_at": 1620000000.0,
"proposed_by": "supervisor",
"correlation_id": "uuid",
"node": "node-name",
"service": "service-name",
"action_type": "redeploy_service",
"risk_level": "guarded",
"confidence": 0.9,
"approval_required": true,
"autonomous_eligible": false,
"status": "pending",
"payload": { ... },
"rollback_reference": null
}
```
## Safety Model
Actions are categorized into safety classes:
- **Safe**: Low-risk actions that may be eligible for autonomous execution in the future (e.g., `collect_diagnostics`, `rerun_healthcheck`).
- **Guarded**: Actions that default to requiring approval but could be automated under strict conditions (e.g., `redeploy_service`, `rerun_deployment_stage`).
- **Dangerous**: High-risk actions that ALWAYS require manual approval.
Currently, the platform operates in a **Recommendation-Only** mode where even `safe` actions require explicit approval.
## Initial Action Types
- `redeploy_service`: Restarts or redeploys a service container.
- `rerun_healthcheck`: Triggers an immediate health check.
- `rerun_deployment_stage`: Retries a specific stage of a failed deployment.
- `collect_diagnostics`: Gathers logs and metrics for troubleshooting.
## Executor
The Executor (`scripts/executor/executor.py`) is responsible for processing approved actions. It features:
- **Process Approved Only**: Only actions in the `approved/` directory are processed.
- **Recommendation-Safe**: Simulation-based execution that logs intended mutations without side effects.
- **Idempotency**: Designed to be safe to run multiple times.
- **Resumable State**: If a run is interrupted, the next run re-processes any actions left in the `running/` directory before handling newly approved ones.
- **Append-Only History**: Maintains a `history.log` of all action transitions.
## Rollback Concepts
Every action document includes a `rollback_reference` field, which the supervisor currently sets to `null`. In future iterations, this will point to the previous stable state or a reverse action that can be triggered if the current action fails or causes further instability.
## Future Autonomous Execution
The system is designed to transition to autonomous execution by:
1. Identifying `safe` actions with high `confidence` scores.
2. Matching them against a `policy-engine`.
3. Automatically moving them from `pending/` to `approved/` based on allowed safety guardrails.

View file

@ -0,0 +1,27 @@
# Operator Approval Workflow
This document describes the process of reviewing and approving actions generated by the reconciliation supervisor.
## Workflow Stages
### 1. Action Identification
When the supervisor identifies a delta between desired and actual state, it generates a pending action in `/opt/homelab/actions/pending/`.
### 2. Risk Assessment
Actions are categorized by risk level:
- **Safe**: Low impact, high confidence. Can be auto-approved in autonomous mode.
- **Guarded**: Moderate impact. Requires explicit operator approval.
- **Dangerous**: High impact (e.g., node redeploy). Requires multi-step approval or senior operator override.
### 3. Review Process
1. Navigate to the **Action Queue** view.
2. Review the **Confidence Score** and **Correlation Chain** to understand why the action was proposed.
3. Check the **Rollback Availability**.
### 4. Decision
- **Approve**: Moves action to `approved` state.
- **Reject**: Moves action to `rejected` state and suppresses similar recommendations for a cooldown period.
- **Execute**: Transitions an approved action to `running` status.
## Mobile Approvals
Approval requests can be acknowledged via the Telegram bot integration, allowing for remote operational control.

View file

@ -0,0 +1,24 @@
# Incident Remediation Guide
Guide for operators responding to system incidents using the Control Plane.
## Remediation Flow
### 1. Detection
Incidents appear in the **Active Incidents** card on the Dashboard and in the **Events** timeline.
### 2. Correlation
Use the **Correlation** view to see:
- The event chain leading to the incident.
- Automated recommendations generated in response.
- Any manual actions already taken.
### 3. Intervention
1. Review the recommended actions in the **Action Queue**.
2. If the automated recommendation is not sufficient, use the **Nodes** or **Services** view to manually trigger commands.
3. Observe the **Runtime Topology** to ensure no cascading failures occur during remediation.
### 4. Verification
Once actions are completed, verify the system state:
- Health badges should transition back to **Nominal**.
- The **System Status** in the sidebar should reflect a healthy state.

View file

@ -2,11 +2,13 @@
The system continuously monitors for drift between desired and actual state. The system continuously monitors for drift between desired and actual state.
1. If a service is in RECONCILING state, check the Services view. 1. **Drift Detection**: When drift is detected, the supervisor generates a recommendation and a corresponding pending action.
2. Review the Recommendations view for automated or guarded actions. 2. **Review**: Navigate to the **Recommendations** view for a high-level summary, or the **Action Queue** for the specific execution plan.
3. For 'safe' actions with high confidence, the system may act autonomously if enabled. 3. **Approval**: For 'guarded' or 'dangerous' actions, click **Approve** in the Action Queue.
4. For 'guarded' or 'dangerous' actions, an operator must manually approve the action. 4. **Execution**: Once approved, the action can be triggered manually by clicking **Execute**, or it will be picked up by the autonomous executor if the system is in `AUTONOMOUS` mode.
5. Risk Levels: 5. **Observation**: Monitor the **Deployments** and **Topology** views to watch the reconciliation in real-time.
- **Safe**: Minimal impact, high success rate.
- **Guarded**: Potential brief service interruption. Risk Levels:
- **Dangerous**: Significant impact, potential data loss, or hardware interaction required. - **Safe**: Minimal impact, high success rate.
- **Guarded**: Potential brief service interruption.
- **Dangerous**: Significant impact, potential data loss, or node-level disruption.

View file

@ -0,0 +1,225 @@
#!/usr/bin/env python3
import os
import json
import time
import sys
import shutil
import uuid
from pathlib import Path
# Configuration
# Root of the action queue; overridable through the HOMELAB_ACTIONS_ROOT
# environment variable, otherwise /opt/homelab/actions.
ACTIONS_ROOT = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
# Append-only file that emit_event() writes one JSON event per line to.
EVENT_LOG = Path("/tmp/agent-events.log")
# Append-only file that log_history() writes one JSON transition per line to;
# it lives directly under ACTIONS_ROOT.
HISTORY_LOG = ACTIONS_ROOT / "history.log"
def emit_event(event_type, message, details=None):
    """Print an action lifecycle event and append it to the event log.

    The event is serialized as a single JSON object with the keys ``type``,
    ``message``, ``timestamp`` (current ``time.time()``) and ``details``
    (an empty dict when ``details`` is falsy). It is always printed to stdout;
    a failure to append to ``EVENT_LOG`` is reported on stderr and otherwise
    ignored.
    """
    serialized = json.dumps(
        {
            "type": event_type,
            "message": message,
            "timestamp": time.time(),
            "details": details or {},
        }
    )
    print(serialized)
    try:
        with open(EVENT_LOG, "a") as log_file:
            log_file.write(f"{serialized}\n")
            log_file.flush()
    except Exception as exc:
        # Best effort: the event has already been printed to stdout.
        print(f"Error writing to event log: {exc}", file=sys.stderr)
def log_history(action_id, status, message):
    """Append one action transition to the history log.

    Writes a JSON line with ``timestamp``, ``action_id``, ``status`` and
    ``message`` to ``HISTORY_LOG``. A write failure is reported on stderr and
    otherwise ignored.
    """
    record = json.dumps(
        {
            "timestamp": time.time(),
            "action_id": action_id,
            "status": status,
            "message": message,
        }
    )
    try:
        with open(HISTORY_LOG, "a") as history_file:
            history_file.write(f"{record}\n")
            history_file.flush()
    except Exception as exc:
        print(f"Error writing history: {exc}", file=sys.stderr)
def ensure_dirs():
    """Create every action queue state directory under ACTIONS_ROOT if missing."""
    state_names = ("pending", "approved", "running", "completed", "failed", "rejected")
    for state_name in state_names:
        state_dir = ACTIONS_ROOT / state_name
        state_dir.mkdir(parents=True, exist_ok=True)
def _decide_pending_action(action_id, status, stamp_key, gerund, note):
    """Move a pending action into the ``status`` directory and record it.

    Shared implementation behind approve_action() and reject_action().

    Args:
        action_id: Action id, with or without a trailing ``.json``.
        status: Target state and directory name (``"approved"``/``"rejected"``).
        stamp_key: Key under which the decision time is stored in the document.
        gerund: Verb used in the error message (``"approving"``/``"rejecting"``).
        note: Message written to the history log.

    Returns:
        True when the action file was moved, False otherwise.
    """
    ensure_dirs()
    filename = action_id if action_id.endswith(".json") else f"{action_id}.json"

    # Only a bare file name is accepted, so an id such as "../approved/x"
    # cannot address a file outside pending/.
    if Path(filename).name != filename:
        print(f"Invalid action id: {action_id!r}")
        return False

    pending_path = ACTIONS_ROOT / "pending" / filename
    if not pending_path.exists():
        print(f"Action {filename} not found in pending.")
        return False

    target_path = ACTIONS_ROOT / status / filename
    try:
        with open(pending_path, "r") as f:
            action = json.load(f)
        action["status"] = status
        action[stamp_key] = time.time()
        # Update the document in place first, then move it, so the file that
        # lands in the target directory already carries the new status.
        with open(pending_path, "w") as f:
            json.dump(action, f, indent=2)
        shutil.move(pending_path, target_path)
    except Exception as e:
        print(f"Error {gerund} action: {e}")
        return False

    # The transition is done at this point; reporting must not be able to turn
    # it into a failure, so fall back to the file stem if the id is missing.
    recorded_id = action.get("action_id") or Path(filename).stem
    emit_event(f"action_{status}", f"Action {status}: {recorded_id}", {"action_id": recorded_id})
    log_history(recorded_id, status, note)
    print(f"Action {recorded_id} {status}.")
    return True


def approve_action(action_id):
    """Approve a pending action: move it from pending/ to approved/.

    Sets ``status`` to ``"approved"`` and stamps ``approved_at``. Returns True
    on success and False when the action is missing, invalid or unreadable.
    """
    return _decide_pending_action(
        action_id, "approved", "approved_at", "approving", "Manual approval received"
    )


def reject_action(action_id):
    """Reject a pending action: move it from pending/ to rejected/.

    Sets ``status`` to ``"rejected"`` and stamps ``rejected_at``. Returns True
    on success and False when the action is missing, invalid or unreadable.
    """
    return _decide_pending_action(
        action_id, "rejected", "rejected_at", "rejecting", "Manual rejection received"
    )
def process_action(action_path, dry_run=False):
    """Process a single action file from approved/ or running/.

    The file is moved to running/, its simulated execution is printed, and it
    is then moved to completed/ (or failed/) with ``status``, ``started_at``
    and ``finished_at`` written into the document. Lifecycle events and
    history entries are emitted at start and finish.

    Args:
        action_path: Path of the action JSON file to process.
        dry_run: When True, only a "[DRY-RUN]" line is printed instead of the
            per-type simulation output. The file still moves through the
            queue exactly as in a normal run.

    Returns:
        None. Unreadable or incomplete documents are reported and left in place.
    """
    try:
        with open(action_path, "r") as f:
            action = json.load(f)
    except Exception as e:
        print(f"Error reading action {action_path}: {e}")
        return

    # A document that parses but lacks the identifying fields is skipped the
    # same way an unreadable file is, so one bad file cannot abort the whole
    # executor run with an uncaught KeyError.
    if not isinstance(action, dict) or "action_id" not in action or "action_type" not in action:
        print(f"Error reading action {action_path}: missing action_id or action_type")
        return

    action_id = action["action_id"]
    action_type = action["action_type"]

    # Move to running (Resumable execution state)
    running_path = ACTIONS_ROOT / "running" / action_path.name
    shutil.move(action_path, running_path)
    action["status"] = "running"
    action["started_at"] = time.time()
    with open(running_path, "w") as f:
        json.dump(action, f, indent=2)
    emit_event("action_started", f"Started action {action_id} ({action_type})", {"action_id": action_id})
    log_history(action_id, "running", f"Execution started (dry_run={dry_run})")

    # Simulation logic (Recommendation-safe execution model)
    print(f"Executing {action_type} for {action.get('service') or action.get('node')}...")
    # Idempotent simulation: in a real world, we'd check if it's already done
    time.sleep(0.5)
    # The simulation has no failure path yet, so the failed/ branch below is
    # only reached once real execution sets this to False.
    success = True
    if dry_run:
        print(f"[DRY-RUN] Would execute {action_type} logic here.")
    else:
        # Initial action types implementation (Simulation)
        if action_type == "redeploy_service":
            print(f"DEBUG: Triggering container restart/redeploy for {action.get('service')}")
        elif action_type == "rerun_healthcheck":
            print(f"DEBUG: Running healthcheck for {action.get('service')}")
        elif action_type == "rerun_deployment_stage":
            print(f"DEBUG: Retrying deployment stage for {action.get('service')}")
        elif action_type == "collect_diagnostics":
            print(f"DEBUG: Collecting logs and metrics for {action.get('service') or action.get('node')}")
        else:
            print(f"DEBUG: Executing unknown action type: {action_type}")

    # Finalize
    final_status = "completed" if success else "failed"
    final_path = ACTIONS_ROOT / final_status / action_path.name
    action["status"] = final_status
    action["finished_at"] = time.time()
    # Write the final state while the file is still in running/, then move it.
    with open(running_path, "w") as f:
        json.dump(action, f, indent=2)
    shutil.move(running_path, final_path)
    emit_event(f"action_{final_status}", f"Action {action_id} {final_status}", {"action_id": action_id})
    log_history(action_id, final_status, "Execution finished")
def run_executor(dry_run=False):
    """Run one executor pass over the action queue.

    Ensures the queue directories exist, then hands every JSON file found in
    ``running/`` and afterwards every JSON file found in ``approved/`` to
    ``process_action``.

    Args:
        dry_run: Forwarded unchanged to ``process_action``.
    """
    ensure_dirs()
    print(f"--- Executor Run: {time.ctime()} (dry_run={dry_run}) ---")

    # 1. Resume running actions
    # Snapshot the directory listing first: processing moves files around.
    in_flight = list((ACTIONS_ROOT / "running").glob("*.json"))
    for resumed in in_flight:
        print(f"Resuming action: {resumed.name}")
        process_action(resumed, dry_run=dry_run)

    # 2. Process approved actions
    queued = list((ACTIONS_ROOT / "approved").glob("*.json"))
    if queued:
        for approved in queued:
            process_action(approved, dry_run=dry_run)
    else:
        print("No approved actions found.")

    print("Run complete.")
if __name__ == "__main__":
    # Command-line entry point: `run` (the default) performs an executor pass,
    # while `approve` / `reject` act on a single action and need an action_id.
    import argparse

    cli = argparse.ArgumentParser(description="Homelab Action Executor")
    cli.add_argument("command", choices=["run", "approve", "reject"], nargs="?", default="run")
    cli.add_argument("action_id", nargs="?")
    cli.add_argument("--dry-run", action="store_true")
    options = cli.parse_args()

    if options.command == "run":
        run_executor(dry_run=options.dry_run)
    else:
        # argparse's `choices` restricts the command to approve or reject here.
        mutators = {
            "approve": (approve_action, "Error: action_id required for approve"),
            "reject": (reject_action, "Error: action_id required for reject"),
        }
        mutate, missing_id_message = mutators[options.command]
        if not options.action_id:
            print(missing_id_message)
            sys.exit(1)
        mutate(options.action_id)

View file

@ -0,0 +1,74 @@
#!/bin/bash
# Validation script for Homelab Action Queue System
#
# Drives one action through pending -> approved -> completed and, when another
# pending action is available, one through pending -> rejected. Finally checks
# that the matching lifecycle events are present in the event log.
# Script paths are relative, so run this from the directory containing scripts/.
set -e

BASE_DIR=$(pwd)
export HOMELAB_WORLD_ROOT="$BASE_DIR/tmp/homelab/world"
export HOMELAB_ACTIONS_ROOT="$BASE_DIR/tmp/homelab/actions"
EVENT_LOG="/tmp/agent-events.log"

echo "=== Starting Action Queue Validation ==="

# 1. Setup drift scenarios
echo "Setting up drift scenarios..."
bash scripts/supervisor/test_scenarios.sh

# 2. Run supervisor to generate action proposals
echo "Running supervisor..."
python3 scripts/supervisor/supervisor.py

# 3. Check for pending actions
echo "Checking pending actions..."
ls -l "$HOMELAB_ACTIONS_ROOT/pending/"

# Get an action ID from pending
ACTION_FILE=$(ls "$HOMELAB_ACTIONS_ROOT/pending/" | head -n 1)
if [ -z "$ACTION_FILE" ]; then
    echo "Error: No pending actions found!"
    exit 1
fi
ACTION_ID="${ACTION_FILE%.json}"
echo "Found action: $ACTION_ID"

# 4. Approve the action
echo "Approving action $ACTION_ID..."
python3 scripts/executor/executor.py approve "$ACTION_ID"

# 5. Run executor
echo "Running executor..."
python3 scripts/executor/executor.py run

# 6. Verify completion
if [ -f "$HOMELAB_ACTIONS_ROOT/completed/$ACTION_FILE" ]; then
    echo "SUCCESS: Action $ACTION_ID moved to completed."
else
    echo "FAILURE: Action $ACTION_ID NOT found in completed."
    exit 1
fi

# 7. Test rejection
echo "Testing rejection..."
NEXT_ACTION_FILE=$(ls "$HOMELAB_ACTIONS_ROOT/pending/" | head -n 1)
if [ -n "$NEXT_ACTION_FILE" ]; then
    NEXT_ACTION_ID="${NEXT_ACTION_FILE%.json}"
    echo "Rejecting action $NEXT_ACTION_ID..."
    python3 scripts/executor/executor.py reject "$NEXT_ACTION_ID"
    if [ -f "$HOMELAB_ACTIONS_ROOT/rejected/$NEXT_ACTION_FILE" ]; then
        echo "SUCCESS: Action $NEXT_ACTION_ID moved to rejected."
    else
        echo "FAILURE: Action $NEXT_ACTION_ID NOT found in rejected."
        exit 1
    fi
fi

# 8. Verify events
# A bare `grep ... | tail -n 1` pipeline reports tail's exit status, so a
# missing event would never trip `set -e`. Capture the last match and test it
# explicitly so that an absent event fails the validation.
# NOTE(review): the event log looks append-only across runs, so events left
# over from an earlier run can satisfy these checks - confirm whether the log
# should be reset before validating.
echo "Verifying events in $EVENT_LOG..."
EXPECTED_EVENTS="action_created action_approved action_started action_completed"
if [ -n "$NEXT_ACTION_FILE" ]; then
    # A rejection event is only expected when step 7 actually rejected something.
    EXPECTED_EVENTS="$EXPECTED_EVENTS action_rejected"
fi
for EVENT_TYPE in $EXPECTED_EVENTS; do
    LAST_EVENT=$(grep "$EVENT_TYPE" "$EVENT_LOG" | tail -n 1)
    if [ -z "$LAST_EVENT" ]; then
        echo "FAILURE: No $EVENT_TYPE event found in $EVENT_LOG."
        exit 1
    fi
    echo "$LAST_EVENT"
done

echo "=== Validation Complete ==="

View file

@ -5,14 +5,19 @@ import yaml
import json import json
import time import time
import glob import glob
import uuid
from pathlib import Path from pathlib import Path
# Configuration # Configuration
WORLD_STATE_PATH = Path(os.getenv("HOMELAB_WORLD_ROOT", "/opt/homelab/world")) WORLD_STATE_PATH = Path(os.getenv("HOMELAB_WORLD_ROOT", "/opt/homelab/world"))
ACTIONS_ROOT = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
INVENTORY_PATH = Path("hosts") INVENTORY_PATH = Path("hosts")
EVENT_LOG = Path("/tmp/agent-events.log") EVENT_LOG = Path("/tmp/agent-events.log")
CHECKPOINT_FILE = Path("/tmp/supervisor-checkpoint.json") CHECKPOINT_FILE = Path("/tmp/supervisor-checkpoint.json")
# Action Queue Layout
ACTION_DIRS = ["pending", "approved", "running", "completed", "failed", "rejected"]
# Reconcile event types # Reconcile event types
RECONCILE_REQUIRED = "reconcile_required" RECONCILE_REQUIRED = "reconcile_required"
RECONCILE_RECOMMENDED = "reconcile_recommended" RECONCILE_RECOMMENDED = "reconcile_recommended"
@ -24,6 +29,70 @@ STATE_DEGRADED = "degraded"
STATE_UNSTABLE = "unstable" STATE_UNSTABLE = "unstable"
STATE_RECONCILING = "reconciling" STATE_RECONCILING = "reconciling"
def ensure_action_dirs():
    """Create each action-queue state directory under ACTIONS_ROOT if it is missing."""
    for state_name in ACTION_DIRS:
        state_dir = ACTIONS_ROOT / state_name
        state_dir.mkdir(parents=True, exist_ok=True)
def emit_action_proposal(recommendation):
    """Turn a supervisor recommendation into a pending action file.

    The recommendation's "action" value is mapped to an executor action type
    (unknown values fall back to "collect_diagnostics"), a risk level is
    derived from that type, and the proposal is written to
    ``ACTIONS_ROOT/pending/<action_id>.json``. An "action_created" event is
    emitted once the file has been written. Errors raised while writing the
    file or emitting the event are printed to stderr and not re-raised.

    Args:
        recommendation: Mapping with at least an "action" key and a "drift"
            mapping; the drift is stored verbatim as the action payload.
    """
    ensure_action_dirs()

    # Recommendation action -> executor action type.
    type_for_recommendation = {
        "redeploy": "redeploy_service",
        "deploy": "redeploy_service",
        "diagnostics": "collect_diagnostics",
        "failover_review": "collect_diagnostics",
        "review": "collect_diagnostics",
        "delayed_deployment": "rerun_deployment_stage"
    }
    action_type = type_for_recommendation.get(recommendation["action"], "collect_diagnostics")

    # Executor action type -> risk level; unmapped types are treated as dangerous.
    risk_for_type = {
        "redeploy_service": "guarded",
        "rerun_healthcheck": "safe",
        "rerun_deployment_stage": "guarded",
        "collect_diagnostics": "safe"
    }
    risk_level = risk_for_type.get(action_type, "dangerous")

    # Dangerous always requires approval; guarded defaults to approval.
    needs_approval = risk_level in ("dangerous", "guarded")

    action_id = str(uuid.uuid4())
    drift = recommendation["drift"]
    node = drift.get("node")
    service = drift.get("service")

    proposal = {
        "action_id": action_id,
        "created_at": time.time(),
        "proposed_by": "supervisor",
        "correlation_id": str(uuid.uuid4()),  # In a real system, link to drift ID
        "node": node,
        "service": service,
        "action_type": action_type,
        "risk_level": risk_level,
        "confidence": 0.9,  # Default confidence
        "approval_required": needs_approval,
        "autonomous_eligible": False,  # No autonomy yet
        "status": "pending",
        "payload": drift,
        "rollback_reference": None
    }

    pending_file = ACTIONS_ROOT / "pending" / f"{action_id}.json"
    try:
        with open(pending_file, "w") as out:
            json.dump(proposal, out, indent=2)

        # Prefer the service name as the human-readable target, else the node.
        target = service or node
        event_details = {
            "action_id": action_id,
            "action_type": action_type,
            "node": node,
            "service": service
        }
        emit_event("action_created", f"Action proposed: {action_type} for {target}", event_details)
    except Exception as e:
        print(f"Error emitting action proposal: {e}", file=sys.stderr)
def emit_event(event_type, message, details=None): def emit_event(event_type, message, details=None):
"""Emit reconciliation events using existing event system (append-only file).""" """Emit reconciliation events using existing event system (append-only file)."""
event = { event = {
@ -278,6 +347,8 @@ def main():
# Emit reconciliation events # Emit reconciliation events
for rec in recommendations: for rec in recommendations:
emit_event(rec["type"], rec["message"], rec["drift"]) emit_event(rec["type"], rec["message"], rec["drift"])
# Proposed: Emit action proposals to action queue
emit_action_proposal(rec)
# 6. Save checkpoint # 6. Save checkpoint
save_checkpoint({ save_checkpoint({

View file

@ -0,0 +1,24 @@
{
"action_id": "0083f8ad-1f2b-47a4-81a8-81e59740879e",
"created_at": 1778600485.050643,
"proposed_by": "supervisor",
"correlation_id": "6d88755b-ca89-45eb-bf2d-506fca631144",
"node": "node1",
"service": "homeassistant",
"action_type": "redeploy_service",
"risk_level": "guarded",
"confidence": 0.9,
"approval_required": true,
"autonomous_eligible": false,
"status": "completed",
"payload": {
"type": "unhealthy_service",
"service": "homeassistant",
"status": "unhealthy",
"node": "node1"
},
"rollback_reference": null,
"approved_at": 1778600485.1278665,
"started_at": 1778600485.1792338,
"finished_at": 1778600485.6797137
}

View file

@ -0,0 +1,23 @@
{
"action_id": "050add79-3265-4e35-bb88-41c368bbccda",
"created_at": 1778600510.7529757,
"proposed_by": "supervisor",
"correlation_id": "d8ba7d84-74dd-46c8-a085-5ed8ba186770",
"node": null,
"service": "webapp",
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "completed",
"payload": {
"type": "failed_deployment",
"deployment_id": "dep-001",
"service": "webapp"
},
"rollback_reference": null,
"approved_at": 1778600510.8252015,
"started_at": 1778600510.8744874,
"finished_at": 1778600511.3750403
}

View file

@ -0,0 +1,7 @@
{
"action_id": "resumable-task",
"action_type": "rerun_healthcheck",
"status": "completed",
"started_at": 1778600488.5642526,
"finished_at": 1778600489.0646975
}

View file

@ -0,0 +1,10 @@
{"timestamp": 1778600485.1282582, "action_id": "0083f8ad-1f2b-47a4-81a8-81e59740879e", "status": "approved", "message": "Manual approval received"}
{"timestamp": 1778600485.179484, "action_id": "0083f8ad-1f2b-47a4-81a8-81e59740879e", "status": "running", "message": "Execution started (dry_run=False)"}
{"timestamp": 1778600485.680433, "action_id": "0083f8ad-1f2b-47a4-81a8-81e59740879e", "status": "completed", "message": "Execution finished"}
{"timestamp": 1778600485.7410686, "action_id": "2143ae5b-bcc6-410b-b925-e7def70fc013", "status": "rejected", "message": "Manual rejection received"}
{"timestamp": 1778600488.5644836, "action_id": "resumable-task", "status": "running", "message": "Execution started (dry_run=False)"}
{"timestamp": 1778600489.0652084, "action_id": "resumable-task", "status": "completed", "message": "Execution finished"}
{"timestamp": 1778600510.825529, "action_id": "050add79-3265-4e35-bb88-41c368bbccda", "status": "approved", "message": "Manual approval received"}
{"timestamp": 1778600510.8747966, "action_id": "050add79-3265-4e35-bb88-41c368bbccda", "status": "running", "message": "Execution started (dry_run=False)"}
{"timestamp": 1778600511.3755214, "action_id": "050add79-3265-4e35-bb88-41c368bbccda", "status": "completed", "message": "Execution finished"}
{"timestamp": 1778600511.4307747, "action_id": "240cbbc0-891e-4032-bf73-1fa40ff850b4", "status": "rejected", "message": "Manual rejection received"}

View file

@ -0,0 +1,21 @@
{
"action_id": "50d7cdab-2f12-449f-965a-0383e32babaa",
"created_at": 1778600485.053174,
"proposed_by": "supervisor",
"correlation_id": "a2899a7f-548f-455d-a8dd-4e208be58e00",
"node": null,
"service": null,
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "unresolved_incident",
"incident_id": "inc-99",
"description": "High memory usage on node1",
"status": "investigating"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "5e239d96-ff3f-48a3-a71a-ad5aa6b7ff88",
"created_at": 1778600485.05199,
"proposed_by": "supervisor",
"correlation_id": "c5fa628e-35a1-44f9-9119-07d93f20af80",
"node": null,
"service": "webapp",
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "failed_deployment",
"deployment_id": "dep-002",
"service": "webapp"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "7cde5093-3394-43af-9391-321c50ac5362",
"created_at": 1778600510.7521193,
"proposed_by": "supervisor",
"correlation_id": "2a91f58e-e10d-4de5-abd7-5f4fe6fdc325",
"node": null,
"service": "webapp",
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "failed_deployment",
"deployment_id": "dep-002",
"service": "webapp"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "a42e2183-ca22-4a50-97a7-eb53ab0e039a",
"created_at": 1778600510.75163,
"proposed_by": "supervisor",
"correlation_id": "ec2a1960-5baa-453a-8380-65fc9376cc82",
"node": "node2",
"service": null,
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "offline_node",
"node": "node2",
"status": "offline"
},
"rollback_reference": null
}

View file

@ -0,0 +1,21 @@
{
"action_id": "aae83bcd-455f-4b59-bab0-7c7994116468",
"created_at": 1778600510.7506568,
"proposed_by": "supervisor",
"correlation_id": "0a786305-46cb-4837-8725-53d99203f39e",
"node": "node1",
"service": "homeassistant",
"action_type": "redeploy_service",
"risk_level": "guarded",
"confidence": 0.9,
"approval_required": true,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "unhealthy_service",
"service": "homeassistant",
"status": "unhealthy",
"node": "node1"
},
"rollback_reference": null
}

View file

@ -0,0 +1,21 @@
{
"action_id": "c2e6c844-6d96-4ea7-b924-5e33764e5493",
"created_at": 1778600510.7533653,
"proposed_by": "supervisor",
"correlation_id": "6ffc0579-71ac-417f-8ea1-fc46e54527c6",
"node": null,
"service": null,
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "unresolved_incident",
"incident_id": "inc-99",
"description": "High memory usage on node1",
"status": "investigating"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "c91a4171-e636-4194-a146-6e003d2f2586",
"created_at": 1778600510.7511823,
"proposed_by": "supervisor",
"correlation_id": "966a62ee-f81b-497d-96cb-7749f4da0c6f",
"node": "node2",
"service": "webapp",
"action_type": "rerun_deployment_stage",
"risk_level": "guarded",
"confidence": 0.9,
"approval_required": true,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "missing_service",
"service": "webapp",
"node": "node2"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "e6d3f0d6-c294-4282-b9f4-a730f9cec9dc",
"created_at": 1778600485.0515254,
"proposed_by": "supervisor",
"correlation_id": "bf51852b-0b34-4b4b-98c9-fffff38f77ce",
"node": "node2",
"service": null,
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "offline_node",
"node": "node2",
"status": "offline"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "f4c56df2-6775-484b-806e-cdecdcc19584",
"created_at": 1778600485.0527768,
"proposed_by": "supervisor",
"correlation_id": "f974d640-d0fb-4a85-bf8a-eda100182181",
"node": null,
"service": "webapp",
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "failed_deployment",
"deployment_id": "dep-001",
"service": "webapp"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "ff3da03c-fffa-49a7-985d-ed4589ab6856",
"created_at": 1778600485.0510974,
"proposed_by": "supervisor",
"correlation_id": "37da2d5b-3ecd-4a29-97c2-7e9461b1792e",
"node": "node2",
"service": "webapp",
"action_type": "rerun_deployment_stage",
"risk_level": "guarded",
"confidence": 0.9,
"approval_required": true,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "missing_service",
"service": "webapp",
"node": "node2"
},
"rollback_reference": null
}

View file

@ -0,0 +1,21 @@
{
"action_id": "2143ae5b-bcc6-410b-b925-e7def70fc013",
"created_at": 1778600485.0523734,
"proposed_by": "supervisor",
"correlation_id": "dc23556c-68d2-41a3-a5d2-9ad66705f989",
"node": null,
"service": "webapp",
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "rejected",
"payload": {
"type": "failed_deployment",
"deployment_id": "dep-003",
"service": "webapp"
},
"rollback_reference": null,
"rejected_at": 1778600485.740686
}

View file

@ -0,0 +1,21 @@
{
"action_id": "240cbbc0-891e-4032-bf73-1fa40ff850b4",
"created_at": 1778600510.7525399,
"proposed_by": "supervisor",
"correlation_id": "fd234809-82aa-459d-858b-18bc3205a6c5",
"node": null,
"service": "webapp",
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "rejected",
"payload": {
"type": "failed_deployment",
"deployment_id": "dep-003",
"service": "webapp"
},
"rollback_reference": null,
"rejected_at": 1778600511.4303465
}

View file

@ -1 +1 @@
{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778597957} {"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778600510}

View file

@ -1 +1 @@
{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778597657} {"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778600210}

View file

@ -1 +1 @@
{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778597357} {"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778599910}

View file

@ -216,9 +216,9 @@
.label { color: var(--text-muted); font-size: 12px; margin-bottom: 4px; } .label { color: var(--text-muted); font-size: 12px; margin-bottom: 4px; }
.value { font-weight: 500; margin-bottom: 12px; } .value { font-weight: 500; margin-bottom: 12px; }
.risk-safe { color: var(--safe); } .risk-safe { background: rgba(62, 175, 124, 0.1); color: var(--safe); }
.risk-guarded { color: var(--guarded); } .risk-guarded { background: rgba(230, 126, 34, 0.1); color: var(--guarded); }
.risk-dangerous { color: var(--dangerous); } .risk-dangerous { background: rgba(192, 57, 43, 0.1); color: var(--dangerous); }
</style> </style>
</head> </head>
@ -229,6 +229,9 @@
<li class="nav-item active" onclick="showView('dashboard', this)"> <li class="nav-item active" onclick="showView('dashboard', this)">
<span>Dashboard</span> <span>Dashboard</span>
</li> </li>
<li class="nav-item" onclick="showView('actions', this)">
<span>Action Queue</span>
</li>
<li class="nav-item" onclick="showView('nodes', this)"> <li class="nav-item" onclick="showView('nodes', this)">
<span>Nodes</span> <span>Nodes</span>
</li> </li>
@ -238,9 +241,15 @@
<li class="nav-item" onclick="showView('deployments', this)"> <li class="nav-item" onclick="showView('deployments', this)">
<span>Deployments</span> <span>Deployments</span>
</li> </li>
<li class="nav-item" onclick="showView('topology', this)">
<span>Topology</span>
</li>
<li class="nav-item" onclick="showView('events', this)"> <li class="nav-item" onclick="showView('events', this)">
<span>Events</span> <span>Events</span>
</li> </li>
<li class="nav-item" onclick="showView('correlation', this)">
<span>Correlation</span>
</li>
<li class="nav-item" onclick="showView('recommendations', this)"> <li class="nav-item" onclick="showView('recommendations', this)">
<span>Recommendations</span> <span>Recommendations</span>
</li> </li>
@ -255,7 +264,16 @@
<main class="main-content"> <main class="main-content">
<header> <header>
<div class="view-title" id="current-view-title">Dashboard</div> <div style="display:flex; align-items:center; gap:20px">
<div class="view-title" id="current-view-title">Dashboard</div>
<select id="operator-mode" onchange="setOperatorMode(this.value)" style="background:var(--sidebar-color); border:1px solid var(--border-color); color:var(--accent-color); font-weight:bold; font-size:12px; padding:4px 8px">
<option value="observe">OBSERVE</option>
<option value="recommend">RECOMMEND</option>
<option value="approval" selected>APPROVAL</option>
<option value="autonomous">AUTONOMOUS</option>
<option value="maintenance">MAINTENANCE</option>
</select>
</div>
<div class="header-actions"> <div class="header-actions">
<button onclick="refreshData()">Refresh</button> <button onclick="refreshData()">Refresh</button>
</div> </div>
@ -269,6 +287,10 @@
<div class="card-title">System Overview</div> <div class="card-title">System Overview</div>
<div id="dashboard-summary" style="margin-top:20px"></div> <div id="dashboard-summary" style="margin-top:20px"></div>
</div> </div>
<div class="card">
<div class="card-title">Pending Actions</div>
<div id="dashboard-actions-summary" style="margin-top:20px"></div>
</div>
<div class="card"> <div class="card">
<div class="card-title">Active Incidents</div> <div class="card-title">Active Incidents</div>
<div id="dashboard-incidents" style="margin-top:20px"></div> <div id="dashboard-incidents" style="margin-top:20px"></div>
@ -276,6 +298,20 @@
</div> </div>
</div> </div>
<!-- Actions View -->
<div id="view-actions" class="view hidden">
<div style="display:grid; grid-template-columns: 1fr 1fr; gap:24px">
<div>
<h3>Pending Approval</h3>
<div id="actions-pending" class="timeline"></div>
</div>
<div>
<h3>Active / History</h3>
<div id="actions-history" class="timeline"></div>
</div>
</div>
</div>
<!-- Nodes View --> <!-- Nodes View -->
<div id="view-nodes" class="view hidden"> <div id="view-nodes" class="view hidden">
<div class="grid" id="nodes-list"></div> <div class="grid" id="nodes-list"></div>
@ -291,11 +327,24 @@
<div class="grid" id="deployments-list"></div> <div class="grid" id="deployments-list"></div>
</div> </div>
<!-- Topology View -->
<div id="view-topology" class="view hidden">
<div class="card" style="min-height:500px">
<div class="card-title">Runtime Topology</div>
<div id="topology-map" style="margin-top:20px; display:flex; flex-wrap:wrap; gap:40px; justify-content:center"></div>
</div>
</div>
<!-- Events View --> <!-- Events View -->
<div id="view-events" class="view hidden"> <div id="view-events" class="view hidden">
<div class="timeline" id="events-timeline"></div> <div class="timeline" id="events-timeline"></div>
</div> </div>
<!-- Correlation View -->
<div id="view-correlation" class="view hidden">
<div id="correlation-chains" class="grid"></div>
</div>
<!-- Recommendations View --> <!-- Recommendations View -->
<div id="view-recommendations" class="view hidden"> <div id="view-recommendations" class="view hidden">
<div class="grid" id="recommendations-list"></div> <div class="grid" id="recommendations-list"></div>
@ -335,6 +384,34 @@
} }
} }
/**
 * POST `data` as JSON to `endpoint` and resolve with the parsed JSON reply.
 * Any failure (network error, unparsable body, serialisation error) is logged
 * to the console and the function resolves with null instead of throwing.
 */
async function postData(endpoint, data) {
    let parsed = null;
    try {
        const options = {
            method: 'POST',
            headers: {'Content-Type': 'application/json'},
            body: JSON.stringify(data)
        };
        const response = await fetch(endpoint, options);
        parsed = await response.json();
    } catch (err) {
        console.error('Post error:', endpoint, err);
        parsed = null;
    }
    return parsed;
}
/**
 * Ask the backend to move action `id` to `status` via /action/mutate.
 * Refreshes the current view when the reply is {status: 'ok'}; otherwise
 * shows a generic failure alert.
 */
async function mutateAction(id, status) {
    const reply = await postData('/action/mutate', {id, status});
    const succeeded = Boolean(reply) && reply.status === 'ok';
    if (!succeeded) {
        alert('Mutation failed');
        return;
    }
    refreshData();
}
/**
 * Handler for the operator-mode selector.
 * Currently a placeholder: it only logs the chosen mode and does not contact
 * the backend.
 */
function setOperatorMode(mode) {
    // In real system, this would call backend
    const label = 'Operator mode set to:';
    console.log(label, mode);
}
function formatTime(ts) { function formatTime(ts) {
if (!ts) return 'N/A'; if (!ts) return 'N/A';
return new Date(ts * 1000).toLocaleString(); return new Date(ts * 1000).toLocaleString();
@ -368,6 +445,53 @@
} }
} }
if (currentView === 'dashboard' || currentView === 'actions') {
const actions = await fetchData('/actions');
if (actions) {
if (currentView === 'dashboard') {
const dashActions = document.getElementById('dashboard-actions-summary');
const pendingCount = actions.pending.length;
dashActions.innerHTML = `
<div class="label">Pending</div><div class="value" style="color:var(--guarded)">${pendingCount}</div>
<div class="label">Running</div><div class="value" style="color:var(--reconciling)">${actions.running.length}</div>
`;
}
if (currentView === 'actions') {
const pendingEl = document.getElementById('actions-pending');
const historyEl = document.getElementById('actions-history');
pendingEl.innerHTML = actions.pending.map(a => `
<div class="card" style="margin-bottom:12px">
<div class="card-header">
<div class="card-title">${a.type.toUpperCase()}</div>
<span class="badge risk-${a.risk_level}">${a.risk_level}</span>
</div>
<p>${a.description}</p>
<div class="label">Target</div><div class="value">${a.target.node} ${a.target.service || ''}</div>
<div class="label">Confidence</div><div class="value">${Math.round(a.confidence*100)}%</div>
<div class="controls">
<button class="btn-primary" onclick="mutateAction('${a.id}', 'approved')">Approve</button>
<button onclick="mutateAction('${a.id}', 'rejected')">Reject</button>
</div>
</div>
`).join('') || 'No pending actions.';
const history = [...actions.approved, ...actions.running, ...actions.completed, ...actions.failed];
historyEl.innerHTML = history.sort((a,b) => b.timestamp - a.timestamp).map(a => `
<div class="event">
<div class="event-header">
<span>${a.type.toUpperCase()}</span>
<span class="badge ${getStatusClass(a.status)}">${a.status}</span>
</div>
<div>${a.description}</div>
<small>${formatTime(a.timestamp)} | Target: ${a.target.node}</small>
${a.status === 'approved' ? `<div class="controls"><button class="btn-primary" onclick="mutateAction('${a.id}', 'running')">Execute</button></div>` : ''}
</div>
`).join('') || 'No history.';
}
}
}
if (currentView === 'dashboard' || currentView === 'events') { if (currentView === 'dashboard' || currentView === 'events') {
const incidents = await fetchData('/incidents'); const incidents = await fetchData('/incidents');
if (currentView === 'dashboard') { if (currentView === 'dashboard') {
@ -474,6 +598,64 @@
`).join(''); `).join('');
} }
if (currentView === 'topology') {
const nodes = await fetchData('/nodes');
const services = await fetchData('/services');
const topMap = document.getElementById('topology-map');
if (nodes && services) {
topMap.innerHTML = nodes.map(node => {
const nodeServices = services.filter(s => s.node === node.hostname || s.node === node.id);
return `
<div class="card" style="width:250px; border: 1px solid ${node.health === 'nominal' ? 'var(--border-color)' : 'var(--error)'}">
<div class="card-header">
<div class="card-title">${node.hostname}</div>
<span class="badge ${getStatusClass(node.health)}">${node.health}</span>
</div>
<div class="label">Capabilities</div>
<div class="value" style="font-size:11px">${node.capabilities.join(', ')}</div>
<div class="label">Services</div>
<div style="font-size:12px; margin-bottom:10px">
${nodeServices.length > 0 ? nodeServices.map(s => `
<div style="display:flex; justify-content:space-between; margin-bottom:4px; padding:4px; background:rgba(255,255,255,0.03)">
<span>${s.name}</span>
<span class="${getStatusClass(s.health)}" style="font-size:10px">${s.health}</span>
</div>
${s.dependencies.length > 0 ? `<div style="font-size:9px; color:var(--text-muted); margin-left:8px; margin-bottom:4px">dep: ${s.dependencies.join(', ')}</div>` : ''}
`).join('') : '<div class="value">None</div>'}
</div>
</div>
`;
}).join('');
}
}
if (currentView === 'correlation') {
const incidents = await fetchData('/incidents');
const actions = await fetchData('/actions');
const list = document.getElementById('correlation-chains');
if (incidents && actions) {
const allActions = Object.values(actions).flat();
list.innerHTML = incidents.map(inc => {
const related = allActions.filter(a => a.correlation_chain && a.correlation_chain.includes(inc.id));
return `
<div class="card">
<div class="card-header">
<div class="card-title">Incident: ${inc.id || 'INC-001'}</div>
<span class="badge status-error">Active</span>
</div>
<p>${inc.message}</p>
<div class="label">Related Actions</div>
${related.map(a => `
<div class="event" style="margin-top:5px">
<strong>${a.type}</strong> (${a.status})<br>
<small>${a.description}</small>
</div>
`).join('') || '<div class="value">No actions yet</div>'}
</div>
`;
}).join('');
}
}
if (currentView === 'settings') { if (currentView === 'settings') {
const config = await fetchData('/config'); const config = await fetchData('/config');
const content = document.getElementById('settings-content'); const content = document.getElementById('settings-content');
@ -482,6 +664,8 @@
<div class="value">${config.auto_mode ? 'Enabled' : 'Disabled'}</div> <div class="value">${config.auto_mode ? 'Enabled' : 'Disabled'}</div>
<div class="label">Action Thresholds</div> <div class="label">Action Thresholds</div>
<div class="value mono">${JSON.stringify(config.action_thresholds, null, 2)}</div> <div class="value mono">${JSON.stringify(config.action_thresholds, null, 2)}</div>
<div class="label">Telegram Integration</div>
<div class="value" style="color:var(--text-muted)">Ready for mobile approval flows. Hook: /api/v1/telegram/webhook</div>
<button onclick="alert('Settings update not implemented in this demo')">Edit Configuration</button> <button onclick="alert('Settings update not implemented in this demo')">Edit Configuration</button>
`; `;
} }

View file

@ -8,6 +8,7 @@ from pathlib import Path
STATE_DIR = Path("/opt/homelab/state") STATE_DIR = Path("/opt/homelab/state")
EVENTS_DIR = Path("/opt/homelab/events") EVENTS_DIR = Path("/opt/homelab/events")
WORLD_DIR = Path("/opt/homelab/world") WORLD_DIR = Path("/opt/homelab/world")
ACTIONS_DIR = Path("/opt/homelab/actions")
EVENT_LOG = Path("/tmp/agent-events.log") EVENT_LOG = Path("/tmp/agent-events.log")
STATIC_DIR = Path(__file__).parent STATIC_DIR = Path(__file__).parent
REDIS_HOST = os.getenv("REDIS_HOST", "redis") REDIS_HOST = os.getenv("REDIS_HOST", "redis")
@ -164,6 +165,55 @@ def current_events():
return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True) return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True)
def current_actions():
    """Return every action in the queue, grouped by state.

    Returns:
        A dict with one key per queue state (pending, approved, running,
        completed, failed, rejected). Each value is the list of parsed action
        documents found in ``ACTIONS_DIR/<state>/*.json``; files for which
        ``read_json_file`` returns a falsy value are left out, and a missing
        state directory yields an empty list.
    """
    grouped = {}
    for state in ("pending", "approved", "running", "completed", "failed", "rejected"):
        state_dir = ACTIONS_DIR / state
        if not state_dir.exists():
            grouped[state] = []
            continue
        documents = (read_json_file(path) for path in state_dir.glob("*.json"))
        grouped[state] = [doc for doc in documents if doc]
    return grouped
def mutate_action(action_id, target_status):
    """Move an action file to another queue state and update its status.

    Looks for ``<action_id>.json`` in each state directory under ACTIONS_DIR
    (the first match wins), writes it into the target state directory with
    its ``status`` set to ``target_status`` and ``last_mutation`` set to the
    current time, and removes the source file when it lives elsewhere.

    Args:
        action_id: Identifier of the action; used as the file stem.
        target_status: One of pending, approved, running, completed, failed
            or rejected.

    Returns:
        A ``(success, message)`` tuple. ``message`` is "Success" on success
        and a description of the problem otherwise.
    """
    import time

    statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
    if target_status not in statuses:
        return False, f"Invalid target status: {target_status}"

    # The id comes straight from the HTTP request body and is interpolated
    # into a filesystem path, so refuse anything that could escape the queue
    # directories (path separators) or confuse the OS (NUL bytes).
    action_name = str(action_id)
    if not action_name or any(ch in action_name for ch in ("/", "\\", "\0")):
        return False, f"Invalid action id: {action_id}"
    file_name = f"{action_name}.json"

    # Find where the action is
    source_path = None
    for status in statuses:
        candidate = ACTIONS_DIR / status / file_name
        if candidate.exists():
            source_path = candidate
            break

    if not source_path:
        return False, f"Action {action_id} not found"

    target_dir = ACTIONS_DIR / target_status
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = target_dir / file_name

    try:
        data = json.loads(source_path.read_text())
        data["status"] = target_status
        data["last_mutation"] = time.time()

        target_path.write_text(json.dumps(data, indent=2))
        # When the action is already in the target state the file was
        # rewritten in place and must not be deleted.
        if source_path != target_path:
            source_path.unlink()
        return True, "Success"
    except Exception as e:
        return False, str(e)
def send_json(status, payload, handler): def send_json(status, payload, handler):
body = (json.dumps(payload) + "\n").encode("utf-8") body = (json.dumps(payload) + "\n").encode("utf-8")
handler.send_response(status) handler.send_response(status)
@ -207,6 +257,10 @@ class Handler(BaseHTTPRequestHandler):
send_json(200, current_events(), self) send_json(200, current_events(), self)
return return
if self.path == "/actions":
send_json(200, current_actions(), self)
return
if self.path == "/logs": if self.path == "/logs":
print("LOGS endpoint called", flush=True) print("LOGS endpoint called", flush=True)
body = ("\n".join(tail_lines(EVENT_LOG, 200)) + "\n").encode("utf-8") body = ("\n".join(tail_lines(EVENT_LOG, 200)) + "\n").encode("utf-8")
@ -236,6 +290,7 @@ class Handler(BaseHTTPRequestHandler):
"/auto-mode", "/auto-mode",
"/config", "/config",
"/events", "/events",
"/action/mutate",
): ):
self.send_error(404) self.send_error(404)
return return
@ -291,6 +346,19 @@ class Handler(BaseHTTPRequestHandler):
send_json(200, {"status": "sent"}, self) send_json(200, {"status": "sent"}, self)
return return
if self.path == "/action/mutate":
action_id = payload.get("id")
target = payload.get("status")
if not action_id or not target:
self.send_error(400, "id and status are required")
return
success, msg = mutate_action(action_id, target)
if success:
send_json(200, {"status": "ok"}, self)
else:
self.send_error(500, msg)
return
if not command: if not command:
self.send_error(400, "command is required") self.send_error(400, "command is required")
return return