Resolve merge conflicts

This commit is contained in:
Oskar Kapala 2026-05-12 18:01:37 +02:00
parent 5ff4ddc2c9
commit 50d03b9f96
29 changed files with 1615 additions and 15 deletions

541
.output.txt Normal file
View file

@ -0,0 +1,541 @@
Command finished with exit code 0.
Command output:
diff --git a/docs/operator/reconcile-review.md b/docs/operator/reconcile-review.md
index 8b99c6f..e56c333 100644
--- a/docs/operator/reconcile-review.md
+++ b/docs/operator/reconcile-review.md
@@ -2,11 +2,13 @@
The system continuously monitors for drift between desired and actual state.
-1. If a service is in RECONCILING state, check the Services view.
-2. Review the Recommendations view for automated or guarded actions.
-3. For 'safe' actions with high confidence, the system may act autonomously if enabled.
-4. For 'guarded' or 'dangerous' actions, an operator must manually approve the action.
-5. Risk Levels:
- - **Safe**: Minimal impact, high success rate.
- - **Guarded**: Potential brief service interruption.
- - **Dangerous**: Significant impact, potential data loss, or hardware interaction required.
+1. **Drift Detection**: When drift is detected, the supervisor generates a recommendation and a corresponding pending action.
+2. **Review**: Navigate to the **Recommendations** view for a high-level summary, or the **Action Queue** for the specific execution plan.
+3. **Approval**: For 'guarded' or 'dangerous' actions, click **Approve** in the Action Queue.
+4. **Execution**: Once approved, the action can be triggered manually by clicking **Execute**, or it will be picked up by the autonomous executor if the system is in `AUTONOMOUS` mode.
+5. **Observation**: Monitor the **Deployments** and **Topology** views to watch the reconciliation in real-time.
+
+Risk Levels:
+- **Safe**: Minimal impact, high success rate.
+- **Guarded**: Potential brief service interruption.
+- **Dangerous**: Significant impact, potential data loss, or node-level disruption.
diff --git a/scripts/supervisor/supervisor.py b/scripts/supervisor/supervisor.py
index e58027b..ce5d162 100644
--- a/scripts/supervisor/supervisor.py
+++ b/scripts/supervisor/supervisor.py
@@ -5,14 +5,19 @@ import yaml
import json
import time
import glob
+import uuid
from pathlib import Path
# Configuration
WORLD_STATE_PATH = Path(os.getenv("HOMELAB_WORLD_ROOT", "/opt/homelab/world"))
+ACTIONS_ROOT = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
INVENTORY_PATH = Path("hosts")
EVENT_LOG = Path("/tmp/agent-events.log")
CHECKPOINT_FILE = Path("/tmp/supervisor-checkpoint.json")
+# Action Queue Layout
+ACTION_DIRS = ["pending", "approved", "running", "completed", "failed", "rejected"]
+
# Reconcile event types
RECONCILE_REQUIRED = "reconcile_required"
RECONCILE_RECOMMENDED = "reconcile_recommended"
@@ -24,6 +29,70 @@ STATE_DEGRADED = "degraded"
STATE_UNSTABLE = "unstable"
STATE_RECONCILING = "reconciling"
+def ensure_action_dirs():
+ """Ensure action queue directories exist."""
+ for d in ACTION_DIRS:
+ (ACTIONS_ROOT / d).mkdir(parents=True, exist_ok=True)
+
+def emit_action_proposal(recommendation):
+ """Convert recommendation to action proposal and save to pending/."""
+ ensure_action_dirs()
+
+ action_type_map = {
+ "redeploy": "redeploy_service",
+ "deploy": "redeploy_service",
+ "diagnostics": "collect_diagnostics",
+ "failover_review": "collect_diagnostics",
+ "review": "collect_diagnostics",
+ "delayed_deployment": "rerun_deployment_stage"
+ }
+
+ action_type = action_type_map.get(recommendation["action"], "collect_diagnostics")
+
+ risk_level_map = {
+ "redeploy_service": "guarded",
+ "rerun_healthcheck": "safe",
+ "rerun_deployment_stage": "guarded",
+ "collect_diagnostics": "safe"
+ }
+ risk_level = risk_level_map.get(action_type, "dangerous")
+
+ # Dangerous always requires approval
+ # Guarded defaults to approval
+ approval_required = risk_level in ["dangerous", "guarded"]
+
+ action_id = str(uuid.uuid4())
+ action = {
+ "action_id": action_id,
+ "created_at": time.time(),
+ "proposed_by": "supervisor",
+ "correlation_id": str(uuid.uuid4()), # In a real system, link to drift ID
+ "node": recommendation["drift"].get("node"),
+ "service": recommendation["drift"].get("service"),
+ "action_type": action_type,
+ "risk_level": risk_level,
+ "confidence": 0.9, # Default confidence
+ "approval_required": approval_required,
+ "autonomous_eligible": False, # No autonomy yet
+ "status": "pending",
+ "payload": recommendation["drift"],
+ "rollback_reference": None
+ }
+
+ file_path = ACTIONS_ROOT / "pending" / f"{action_id}.json"
+ try:
+ with open(file_path, "w") as f:
+ json.dump(action, f, indent=2)
+
+ emit_event("action_created", f"Action proposed: {action_type} for {action.get('service') or action.get('node')}", {
+ "action_id": action_id,
+ "action_type": action_type,
+ "node": action.get("node"),
+ "service": action.get("service")
+ })
+ except Exception as e:
+ print(f"Error emitting action proposal: {e}", file=sys.stderr)
+
def emit_event(event_type, message, details=None):
"""Emit reconciliation events using existing event system (append-only file)."""
event = {
@@ -278,6 +347,8 @@ def main():
# Emit reconciliation events
for rec in recommendations:
emit_event(rec["type"], rec["message"], rec["drift"])
+ # Proposed: Emit action proposals to action queue
+ emit_action_proposal(rec)
# 6. Save checkpoint
save_checkpoint({
diff --git a/tmp/homelab/world/deployments/dep-001.json b/tmp/homelab/world/deployments/dep-001.json
index 02db067..f70d7a8 100644
--- a/tmp/homelab/world/deployments/dep-001.json
+++ b/tmp/homelab/world/deployments/dep-001.json
@@ -1 +1 @@
-{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778597957}
+{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778600510}
diff --git a/tmp/homelab/world/deployments/dep-002.json b/tmp/homelab/world/deployments/dep-002.json
index e977aa0..1ee5a29 100644
--- a/tmp/homelab/world/deployments/dep-002.json
+++ b/tmp/homelab/world/deployments/dep-002.json
@@ -1 +1 @@
-{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778597657}
+{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778600210}
diff --git a/tmp/homelab/world/deployments/dep-003.json b/tmp/homelab/world/deployments/dep-003.json
index 66f10c9..f44385b 100644
--- a/tmp/homelab/world/deployments/dep-003.json
+++ b/tmp/homelab/world/deployments/dep-003.json
@@ -1 +1 @@
-{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778597357}
+{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778599910}
diff --git a/webui/index.html b/webui/index.html
index d720307..5c049c1 100644
--- a/webui/index.html
+++ b/webui/index.html
@@ -216,9 +216,9 @@
.label { color: var(--text-muted); font-size: 12px; margin-bottom: 4px; }
.value { font-weight: 500; margin-bottom: 12px; }
- .risk-safe { color: var(--safe); }
- .risk-guarded { color: var(--guarded); }
- .risk-dangerous { color: var(--dangerous); }
+ .risk-safe { background: rgba(62, 175, 124, 0.1); color: var(--safe); }
+ .risk-guarded { background: rgba(230, 126, 34, 0.1); color: var(--guarded); }
+ .risk-dangerous { background: rgba(192, 57, 43, 0.1); color: var(--dangerous); }
</style>
</head>
@@ -229,6 +229,9 @@
<li class="nav-item active" onclick="showView('dashboard', this)">
<span>Dashboard</span>
</li>
+ <li class="nav-item" onclick="showView('actions', this)">
+ <span>Action Queue</span>
+ </li>
<li class="nav-item" onclick="showView('nodes', this)">
<span>Nodes</span>
</li>
@@ -238,9 +241,15 @@
<li class="nav-item" onclick="showView('deployments', this)">
<span>Deployments</span>
</li>
+ <li class="nav-item" onclick="showView('topology', this)">
+ <span>Topology</span>
+ </li>
<li class="nav-item" onclick="showView('events', this)">
<span>Events</span>
</li>
+ <li class="nav-item" onclick="showView('correlation', this)">
+ <span>Correlation</span>
+ </li>
<li class="nav-item" onclick="showView('recommendations', this)">
<span>Recommendations</span>
</li>
@@ -255,7 +264,16 @@
<main class="main-content">
<header>
- <div class="view-title" id="current-view-title">Dashboard</div>
+ <div style="display:flex; align-items:center; gap:20px">
+ <div class="view-title" id="current-view-title">Dashboard</div>
+ <select id="operator-mode" onchange="setOperatorMode(this.value)" style="background:var(--sidebar-color); border:1px solid var(--border-color); color:var(--accent-color); font-weight:bold; font-size:12px; padding:4px 8px">
+ <option value="observe">OBSERVE</option>
+ <option value="recommend">RECOMMEND</option>
+ <option value="approval" selected>APPROVAL</option>
+ <option value="autonomous">AUTONOMOUS</option>
+ <option value="maintenance">MAINTENANCE</option>
+ </select>
+ </div>
<div class="header-actions">
<button onclick="refreshData()">Refresh</button>
</div>
@@ -269,6 +287,10 @@
<div class="card-title">System Overview</div>
<div id="dashboard-summary" style="margin-top:20px"></div>
</div>
+ <div class="card">
+ <div class="card-title">Pending Actions</div>
+ <div id="dashboard-actions-summary" style="margin-top:20px"></div>
+ </div>
<div class="card">
<div class="card-title">Active Incidents</div>
<div id="dashboard-incidents" style="margin-top:20px"></div>
@@ -276,6 +298,20 @@
</div>
</div>
+ <!-- Actions View -->
+ <div id="view-actions" class="view hidden">
+ <div style="display:grid; grid-template-columns: 1fr 1fr; gap:24px">
+ <div>
+ <h3>Pending Approval</h3>
+ <div id="actions-pending" class="timeline"></div>
+ </div>
+ <div>
+ <h3>Active / History</h3>
+ <div id="actions-history" class="timeline"></div>
+ </div>
+ </div>
+ </div>
+
<!-- Nodes View -->
<div id="view-nodes" class="view hidden">
<div class="grid" id="nodes-list"></div>
@@ -291,11 +327,24 @@
<div class="grid" id="deployments-list"></div>
</div>
+ <!-- Topology View -->
+ <div id="view-topology" class="view hidden">
+ <div class="card" style="min-height:500px">
+ <div class="card-title">Runtime Topology</div>
+ <div id="topology-map" style="margin-top:20px; display:flex; flex-wrap:wrap; gap:40px; justify-content:center"></div>
+ </div>
+ </div>
+
<!-- Events View -->
<div id="view-events" class="view hidden">
<div class="timeline" id="events-timeline"></div>
</div>
+ <!-- Correlation View -->
+ <div id="view-correlation" class="view hidden">
+ <div id="correlation-chains" class="grid"></div>
+ </div>
+
<!-- Recommendations View -->
<div id="view-recommendations" class="view hidden">
<div class="grid" id="recommendations-list"></div>
@@ -335,6 +384,34 @@
}
}
+ async function postData(endpoint, data) {
+ try {
+ const res = await fetch(endpoint, {
+ method: 'POST',
+ headers: {'Content-Type': 'application/json'},
+ body: JSON.stringify(data)
+ });
+ return await res.json();
+ } catch (e) {
+ console.error('Post error:', endpoint, e);
+ return null;
+ }
+ }
+
+ async function mutateAction(id, status) {
+ const res = await postData('/action/mutate', {id, status});
+ if (res && res.status === 'ok') {
+ refreshData();
+ } else {
+ alert('Mutation failed');
+ }
+ }
+
+ function setOperatorMode(mode) {
+ console.log('Operator mode set to:', mode);
+ // In real system, this would call backend
+ }
+
function formatTime(ts) {
if (!ts) return 'N/A';
return new Date(ts * 1000).toLocaleString();
@@ -368,6 +445,53 @@
}
}
+ if (currentView === 'dashboard' || currentView === 'actions') {
+ const actions = await fetchData('/actions');
+ if (actions) {
+ if (currentView === 'dashboard') {
+ const dashActions = document.getElementById('dashboard-actions-summary');
+ const pendingCount = actions.pending.length;
+ dashActions.innerHTML = `
+ <div class="label">Pending</div><div class="value" style="color:var(--guarded)">${pendingCount}</div>
+ <div class="label">Running</div><div class="value" style="color:var(--reconciling)">${actions.running.length}</div>
+ `;
+ }
+ if (currentView === 'actions') {
+ const pendingEl = document.getElementById('actions-pending');
+ const historyEl = document.getElementById('actions-history');
+
+ pendingEl.innerHTML = actions.pending.map(a => `
+ <div class="card" style="margin-bottom:12px">
+ <div class="card-header">
+ <div class="card-title">${a.type.toUpperCase()}</div>
+ <span class="badge risk-${a.risk_level}">${a.risk_level}</span>
+ </div>
+ <p>${a.description}</p>
+ <div class="label">Target</div><div class="value">${a.target.node} ${a.target.service || ''}</div>
+ <div class="label">Confidence</div><div class="value">${Math.round(a.confidence*100)}%</div>
+ <div class="controls">
+ <button class="btn-primary" onclick="mutateAction('${a.id}', 'approved')">Approve</button>
+ <button onclick="mutateAction('${a.id}', 'rejected')">Reject</button>
+ </div>
+ </div>
+ `).join('') || 'No pending actions.';
+
+ const history = [...actions.approved, ...actions.running, ...actions.completed, ...actions.failed];
+ historyEl.innerHTML = history.sort((a,b) => b.timestamp - a.timestamp).map(a => `
+ <div class="event">
+ <div class="event-header">
+ <span>${a.type.toUpperCase()}</span>
+ <span class="badge ${getStatusClass(a.status)}">${a.status}</span>
+ </div>
+ <div>${a.description}</div>
+ <small>${formatTime(a.timestamp)} | Target: ${a.target.node}</small>
+ ${a.status === 'approved' ? `<div class="controls"><button class="btn-primary" onclick="mutateAction('${a.id}', 'running')">Execute</button></div>` : ''}
+ </div>
+ `).join('') || 'No history.';
+ }
+ }
+ }
+
if (currentView === 'dashboard' || currentView === 'events') {
const incidents = await fetchData('/incidents');
if (currentView === 'dashboard') {
@@ -474,6 +598,64 @@
`).join('');
}
+ if (currentView === 'topology') {
+ const nodes = await fetchData('/nodes');
+ const services = await fetchData('/services');
+ const topMap = document.getElementById('topology-map');
+ if (nodes && services) {
+ topMap.innerHTML = nodes.map(node => {
+ const nodeServices = services.filter(s => s.node === node.hostname || s.node === node.id);
+ return `
+ <div class="card" style="width:250px; border: 1px solid ${node.health === 'nominal' ? 'var(--border-color)' : 'var(--error)'}">
+ <div class="card-header">
+ <div class="card-title">${node.hostname}</div>
+ <span class="badge ${getStatusClass(node.health)}">${node.health}</span>
+ </div>
+ <div class="label">Capabilities</div>
+ <div class="value" style="font-size:11px">${node.capabilities.join(', ')}</div>
+ <div class="label">Services</div>
+ <div style="font-size:12px; margin-bottom:10px">
+ ${nodeServices.length > 0 ? nodeServices.map(s => `
+ <div style="display:flex; justify-content:space-between; margin-bottom:4px; padding:4px; background:rgba(255,255,255,0.03)">
+ <span>${s.name}</span>
+ <span class="${getStatusClass(s.health)}" style="font-size:10px">${s.health}</span>
+ </div>
+ ${s.dependencies.length > 0 ? `<div style="font-size:9px; color:var(--text-muted); margin-left:8px; margin-bottom:4px">dep: ${s.dependencies.join(', ')}</div>` : ''}
+ `).join('') : '<div class="value">None</div>'}
+ </div>
+ </div>
+ `;
+ }).join('');
+ }
+ }
+
+ if (currentView === 'correlation') {
+ const incidents = await fetchData('/incidents');
+ const actions = await fetchData('/actions');
+ const list = document.getElementById('correlation-chains');
+ if (incidents && actions) {
+ const allActions = Object.values(actions).flat();
+ list.innerHTML = incidents.map(inc => {
+ const related = allActions.filter(a => a.correlation_chain && a.correlation_chain.includes(inc.id));
+ return `
+ <div class="card">
+ <div class="card-header">
+ <div class="card-title">Incident: ${inc.id || 'INC-001'}</div>
+ <span class="badge status-error">Active</span>
+ </div>
+ <p>${inc.message}</p>
+ <div class="label">Related Actions</div>
+ ${related.map(a => `
+ <div class="event" style="margin-top:5px">
+ <strong>${a.type}</strong> (${a.status})<br>
+ <small>${a.description}</small>
+ </div>
+ `).join('') || '<div class="value">No actions yet</div>'}
+ </div>
+ `;
+ }).join('');
+ }
+ }
if (currentView === 'settings') {
const config = await fetchData('/config');
const content = document.getElementById('settings-content');
@@ -482,6 +664,8 @@
<div class="value">${config.auto_mode ? 'Enabled' : 'Disabled'}</div>
<div class="label">Action Thresholds</div>
<div class="value mono">${JSON.stringify(config.action_thresholds, null, 2)}</div>
+ <div class="label">Telegram Integration</div>
+ <div class="value" style="color:var(--text-muted)">Ready for mobile approval flows. Hook: /api/v1/telegram/webhook</div>
<button onclick="alert('Settings update not implemented in this demo')">Edit Configuration</button>
`;
}
diff --git a/webui/web.py b/webui/web.py
index 053ac1a..4727274 100644
--- a/webui/web.py
+++ b/webui/web.py
@@ -8,6 +8,7 @@ from pathlib import Path
STATE_DIR = Path("/opt/homelab/state")
EVENTS_DIR = Path("/opt/homelab/events")
WORLD_DIR = Path("/opt/homelab/world")
+ACTIONS_DIR = Path("/opt/homelab/actions")
EVENT_LOG = Path("/tmp/agent-events.log")
STATIC_DIR = Path(__file__).parent
REDIS_HOST = os.getenv("REDIS_HOST", "redis")
@@ -164,6 +165,55 @@ def current_events():
return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True)
+def current_actions():
+ actions = {}
+ statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
+ for status in statuses:
+ actions[status] = []
+ status_dir = ACTIONS_DIR / status
+ if status_dir.exists():
+ for f in status_dir.glob("*.json"):
+ data = read_json_file(f)
+ if data:
+ actions[status].append(data)
+ return actions
+
+
+def mutate_action(action_id, target_status):
+ statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
+ if target_status not in statuses:
+ return False, f"Invalid target status: {target_status}"
+
+ # Find where the action is
+ source_path = None
+ for status in statuses:
+ p = ACTIONS_DIR / status / f"{action_id}.json"
+ if p.exists():
+ source_path = p
+ break
+
+ if not source_path:
+ return False, f"Action {action_id} not found"
+
+ target_dir = ACTIONS_DIR / target_status
+ target_dir.mkdir(parents=True, exist_ok=True)
+ target_path = target_dir / f"{action_id}.json"
+
+ try:
+ data = json.loads(source_path.read_text())
+ data["status"] = target_status
+ data["last_mutation"] = os.path.getmtime(source_path) # or current time
+ import time
+ data["last_mutation"] = time.time()
+
+ target_path.write_text(json.dumps(data, indent=2))
+ if source_path != target_path:
+ source_path.unlink()
+ return True, "Success"
+ except Exception as e:
+ return False, str(e)
+
+
def send_json(status, payload, handler):
body = (json.dumps(payload) + "\n").encode("utf-8")
handler.send_response(status)
@@ -207,6 +257,10 @@ class Handler(BaseHTTPRequestHandler):
send_json(200, current_events(), self)
return
+ if self.path == "/actions":
+ send_json(200, current_actions(), self)
+ return
+
if self.path == "/logs":
print("LOGS endpoint called", flush=True)
body = ("\n".join(tail_lines(EVENT_LOG, 200)) + "\n").encode("utf-8")
@@ -236,6 +290,7 @@ class Handler(BaseHTTPRequestHandler):
"/auto-mode",
"/config",
"/events",
+ "/action/mutate",
):
self.send_error(404)
return
@@ -291,6 +346,19 @@ class Handler(BaseHTTPRequestHandler):
send_json(200, {"status": "sent"}, self)
return
+ if self.path == "/action/mutate":
+ action_id = payload.get("id")
+ target = payload.get("status")
+ if not action_id or not target:
+ self.send_error(400, "id and status are required")
+ return
+ success, msg = mutate_action(action_id, target)
+ if success:
+ send_json(200, {"status": "ok"}, self)
+ else:
+ self.send_error(500, msg)
+ return
+
if not command:
self.send_error(400, "command is required")
return

View file

@ -0,0 +1,75 @@
# Action Queue System
The Action Queue System provides a safe, filesystem-first lifecycle for operational actions in the homelab platform. It enables controlled execution with mandatory approval for high-risk operations.
## Action Lifecycle
Actions move through various states, represented by directories under `/opt/homelab/actions/`:
1. **Pending** (`pending/`): Actions proposed by the Supervisor or other agents.
2. **Approved** (`approved/`): Actions that have been reviewed and approved for execution.
3. **Running** (`running/`): Actions currently being processed by the Executor.
4. **Completed** (`completed/`): Successfully executed actions.
5. **Failed** (`failed/`): Actions that encountered errors during execution.
6. **Rejected** (`rejected/`): Proposed actions that were explicitly denied.
## Action Schema
Actions are stored as JSON documents with the following structure:
```json
{
"action_id": "uuid",
"created_at": 1620000000.0,
"proposed_by": "supervisor",
"correlation_id": "uuid",
"node": "node-name",
"service": "service-name",
"action_type": "redeploy_service",
"risk_level": "guarded",
"confidence": 0.9,
"approval_required": true,
"autonomous_eligible": false,
"status": "pending",
"payload": { ... },
"rollback_reference": null
}
```
## Safety Model
Actions are categorized into safety classes:
- **Safe**: Low-risk actions that may be eligible for autonomous execution in the future (e.g., `collect_diagnostics`, `rerun_healthcheck`).
- **Guarded**: Actions that default to requiring approval but could be automated under strict conditions (e.g., `redeploy_service`, `rerun_deployment_stage`).
- **Dangerous**: High-risk actions that ALWAYS require manual approval.
Currently, the platform operates in a **Recommendation-Only** mode where even `safe` actions require explicit approval.
## Initial Action Types
- `redeploy_service`: Restarts or redeploys a service container.
- `rerun_healthcheck`: Triggers an immediate health check.
- `rerun_deployment_stage`: Retries a specific stage of a failed deployment.
- `collect_diagnostics`: Gathers logs and metrics for troubleshooting.
## Executor
The Executor (`scripts/executor/executor.py`) is responsible for processing approved actions. It features:
- **Process Approved Only**: Only actions in the `approved/` directory are processed.
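- **Recommendation-Safe**: Simulation-based execution that logs the intended mutations without touching the target nodes or services; only the action files, the event log, and the history log are written.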
- **Idempotency**: Designed to be safe to run multiple times.
- **Resumable State**: If interrupted, it will pick up actions in the `running/` state.
- **Append-Only History**: Maintains a `history.log` of all action transitions.
## Rollback Concepts
Every action document includes a `rollback_reference` field (currently `null` for supervisor-generated actions). In future iterations, this will point to the previous stable state or a reverse action that can be triggered if the current action fails or causes further instability.
## Future Autonomous Execution
The system is designed to transition to autonomous execution by:
1. Identifying `safe` actions with high `confidence` scores.
2. Matching them against a `policy-engine`.
3. Automatically moving them from `pending/` to `approved/` based on allowed safety guardrails.

View file

@ -0,0 +1,27 @@
# Operator Approval Workflow
This document describes the process of reviewing and approving actions generated by the reconciliation supervisor.
## Workflow Stages
### 1. Action Identification
When the supervisor identifies a delta between desired and actual state, it generates a pending action in `/opt/homelab/actions/pending/`.
### 2. Risk Assessment
Actions are categorized by risk level:
- **Safe**: Low impact, high confidence. Can be auto-approved in autonomous mode.
- **Guarded**: Moderate impact. Requires explicit operator approval.
- **Dangerous**: High impact (e.g., node redeploy). Requires multi-step approval or senior operator override.
### 3. Review Process
1. Navigate to the **Action Queue** view.
2. Review the **Confidence Score** and **Correlation Chain** to understand why the action was proposed.
3. Check the **Rollback Availability**.
### 4. Decision
- **Approve**: Moves action to `approved` state.
- **Reject**: Moves action to `rejected` state and suppresses similar recommendations for a cooldown period.
- **Execute**: Transitions an approved action to `running` status.
## Mobile Approvals
Approval requests can be acknowledged via the Telegram bot integration, allowing for remote operational control.

View file

@ -0,0 +1,24 @@
# Incident Remediation Guide
Guide for operators responding to system incidents using the Control Plane.
## Remediation Flow
### 1. Detection
Incidents appear in the **Active Incidents** card on the Dashboard and in the **Events** timeline.
### 2. Correlation
Use the **Correlation** view to see:
- The event chain leading to the incident.
- Automated recommendations generated in response.
- Any manual actions already taken.
### 3. Intervention
1. Review the recommended actions in the **Action Queue**.
2. If the automated recommendation is not sufficient, use the **Nodes** or **Services** view to manually trigger commands.
3. Observe the **Runtime Topology** to ensure no cascading failures occur during remediation.
### 4. Verification
Once actions are completed, verify the system state:
- Health badges should transition back to **Nominal**.
- The **System Status** in the sidebar should reflect a healthy state.

View file

@ -2,11 +2,13 @@
The system continuously monitors for drift between desired and actual state.
1. If a service is in RECONCILING state, check the Services view.
2. Review the Recommendations view for automated or guarded actions.
3. For 'safe' actions with high confidence, the system may act autonomously if enabled.
4. For 'guarded' or 'dangerous' actions, an operator must manually approve the action.
5. Risk Levels:
- **Safe**: Minimal impact, high success rate.
- **Guarded**: Potential brief service interruption.
- **Dangerous**: Significant impact, potential data loss, or hardware interaction required.
1. **Drift Detection**: When drift is detected, the supervisor generates a recommendation and a corresponding pending action.
2. **Review**: Navigate to the **Recommendations** view for a high-level summary, or the **Action Queue** for the specific execution plan.
3. **Approval**: For 'guarded' or 'dangerous' actions, click **Approve** in the Action Queue.
4. **Execution**: Once approved, the action can be triggered manually by clicking **Execute**, or it will be picked up by the autonomous executor if the system is in `AUTONOMOUS` mode.
5. **Observation**: Monitor the **Deployments** and **Topology** views to watch the reconciliation in real time.
Risk Levels:
- **Safe**: Minimal impact, high success rate.
- **Guarded**: Potential brief service interruption.
- **Dangerous**: Significant impact, potential data loss, or node-level disruption.

View file

@ -0,0 +1,225 @@
#!/usr/bin/env python3
import os
import json
import time
import sys
import shutil
import uuid
from pathlib import Path
# Configuration
ACTIONS_ROOT = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
EVENT_LOG = Path("/tmp/agent-events.log")
HISTORY_LOG = ACTIONS_ROOT / "history.log"
def emit_event(event_type, message, details=None):
    """Print an action lifecycle event as JSON and append it to the event log.

    The record carries ``type``, ``message``, a ``time.time()`` timestamp and
    ``details`` (an empty dict when none is supplied). A failure to append to
    ``EVENT_LOG`` is reported on stderr and otherwise ignored.
    """
    record = dict(
        type=event_type,
        message=message,
        timestamp=time.time(),
        details=details or {},
    )
    serialized = json.dumps(record)

    # Always echo to stdout, even if the log file turns out to be unwritable.
    print(serialized)
    try:
        with open(EVENT_LOG, "a") as log_file:
            log_file.write(f"{serialized}\n")
            log_file.flush()
    except Exception as e:
        print(f"Error writing to event log: {e}", file=sys.stderr)
def log_history(action_id, status, message):
    """Append one JSON line describing an action transition to ``HISTORY_LOG``.

    Each line holds ``timestamp``, ``action_id``, ``status`` and ``message``.
    Any error while serializing or writing is printed to stderr and swallowed.
    """
    record = {
        "timestamp": time.time(),
        "action_id": action_id,
        "status": status,
        "message": message,
    }
    try:
        with open(HISTORY_LOG, "a") as history_file:
            history_file.write(f"{json.dumps(record)}\n")
            history_file.flush()
    except Exception as e:
        print(f"Error writing history: {e}", file=sys.stderr)
def ensure_dirs():
    """Create every action-queue state directory under ``ACTIONS_ROOT`` if missing."""
    state_names = ("pending", "approved", "running", "completed", "failed", "rejected")
    for state in state_names:
        state_dir = ACTIONS_ROOT / state
        state_dir.mkdir(parents=True, exist_ok=True)
def _transition_pending_action(action_id, new_status, stamp_field, verb, history_note):
    """Move one pending action into ``new_status/`` and record the decision.

    Shared implementation behind ``approve_action`` and ``reject_action``.

    Args:
        action_id: Action ID, with or without the ``.json`` suffix.
        new_status: Destination state directory and new ``status`` value
            ("approved" or "rejected").
        stamp_field: Key that receives the decision timestamp.
        verb: Word used in the error message (e.g. "approving").
        history_note: Message written to the history log.

    Returns:
        True when the action file was updated and moved, False otherwise.
    """
    ensure_dirs()
    filename = action_id if action_id.endswith(".json") else f"{action_id}.json"

    # The ID arrives from the command line; accept only a bare file name so it
    # cannot be used to address files outside pending/.
    if Path(filename).name != filename:
        print(f"Invalid action id: {action_id}")
        return False

    pending_path = ACTIONS_ROOT / "pending" / filename
    if not pending_path.exists():
        print(f"Action {filename} not found in pending.")
        return False

    target_path = ACTIONS_ROOT / new_status / filename
    # The ".tmp" suffix keeps the staging file out of every "*.json" glob.
    staging_path = pending_path.with_name(filename + ".tmp")
    try:
        with open(pending_path, "r") as f:
            action = json.load(f)
        action["status"] = new_status
        action[stamp_field] = time.time()
        # Fall back to the file name so a document without "action_id" does
        # not turn a successful move into a reported failure.
        resolved_id = action.get("action_id") or pending_path.stem

        # Write the updated document next to the original and swap it in
        # atomically, so an interrupted write never leaves a truncated file.
        with open(staging_path, "w") as f:
            json.dump(action, f, indent=2)
        os.replace(staging_path, pending_path)
        shutil.move(pending_path, target_path)
    except Exception as e:
        try:
            staging_path.unlink()
        except OSError:
            pass  # nothing was staged, or it was already swapped in
        print(f"Error {verb} action: {e}")
        return False

    emit_event(
        f"action_{new_status}",
        f"Action {new_status}: {resolved_id}",
        {"action_id": resolved_id},
    )
    log_history(resolved_id, new_status, history_note)
    print(f"Action {resolved_id} {new_status}.")
    return True


def approve_action(action_id):
    """Approve a pending action: stamp ``approved_at`` and move it to ``approved/``.

    Returns True on success, False when the action is missing, the ID is not a
    bare file name, or the file could not be updated.
    """
    return _transition_pending_action(
        action_id, "approved", "approved_at", "approving", "Manual approval received"
    )


def reject_action(action_id):
    """Reject a pending action: stamp ``rejected_at`` and move it to ``rejected/``.

    Returns True on success, False when the action is missing, the ID is not a
    bare file name, or the file could not be updated.
    """
    return _transition_pending_action(
        action_id, "rejected", "rejected_at", "rejecting", "Manual rejection received"
    )
def process_action(action_path, dry_run=False):
    """Process a single approved (or previously interrupted) action file.

    The action is moved to ``running/``, a simulated execution step is printed,
    and the file is then moved to ``completed/`` or ``failed/``. Each
    transition is reported through ``emit_event`` and ``log_history``.

    Args:
        action_path: ``Path`` of the action JSON file, located in
            ``approved/`` or, when resuming, ``running/``.
        dry_run: When True, the type-specific step is replaced by a
            "[DRY-RUN]" message. The queue transitions still happen.

    Returns:
        None. Unreadable or malformed action files are reported and left
        where they are.
    """
    try:
        with open(action_path, "r") as f:
            action = json.load(f)
        # Validate required fields here so one malformed document is skipped
        # instead of aborting the whole executor run.
        action_id = action["action_id"]
        action_type = action["action_type"]
    except Exception as e:
        print(f"Error reading action {action_path}: {e}")
        return

    # Move to running (Resumable execution state). A resumed action is
    # already there, so only move when the location actually changes.
    running_path = ACTIONS_ROOT / "running" / action_path.name
    if action_path != running_path:
        shutil.move(action_path, running_path)

    action["status"] = "running"
    action["started_at"] = time.time()
    with open(running_path, "w") as f:
        json.dump(action, f, indent=2)

    emit_event("action_started", f"Started action {action_id} ({action_type})", {"action_id": action_id})
    log_history(action_id, "running", f"Execution started (dry_run={dry_run})")

    # Simulation logic (Recommendation-safe execution model)
    print(f"Executing {action_type} for {action.get('service') or action.get('node')}...")

    # Idempotent simulation: in a real world, we'd check if it's already done
    time.sleep(0.5)

    success = True
    if dry_run:
        print(f"[DRY-RUN] Would execute {action_type} logic here.")
    else:
        # Initial action types implementation (Simulation)
        if action_type == "redeploy_service":
            print(f"DEBUG: Triggering container restart/redeploy for {action.get('service')}")
        elif action_type == "rerun_healthcheck":
            print(f"DEBUG: Running healthcheck for {action.get('service')}")
        elif action_type == "rerun_deployment_stage":
            print(f"DEBUG: Retrying deployment stage for {action.get('service')}")
        elif action_type == "collect_diagnostics":
            print(f"DEBUG: Collecting logs and metrics for {action.get('service') or action.get('node')}")
        else:
            # Nothing was executed, so the action must not be recorded as
            # completed.
            print(f"DEBUG: Unknown action type: {action_type}; marking action as failed")
            success = False

    # Finalize
    final_status = "completed" if success else "failed"
    final_path = ACTIONS_ROOT / final_status / action_path.name

    action["status"] = final_status
    action["finished_at"] = time.time()
    with open(running_path, "w") as f:
        json.dump(action, f, indent=2)
    shutil.move(running_path, final_path)

    emit_event(f"action_{final_status}", f"Action {action_id} {final_status}", {"action_id": action_id})
    log_history(action_id, final_status, "Execution finished")
def run_executor(dry_run=False):
    """Run one executor pass over the action queue.

    Actions already sitting in ``running/`` are handed to ``process_action``
    first, then everything found in ``approved/``. ``dry_run`` is forwarded
    unchanged to every ``process_action`` call.
    """
    ensure_dirs()
    print(f"--- Executor Run: {time.ctime()} (dry_run={dry_run}) ---")

    # Phase 1: pick up whatever is still in running/. The listing is
    # materialised up front because process_action moves the files.
    interrupted = list((ACTIONS_ROOT / "running").glob("*.json"))
    for path in interrupted:
        print(f"Resuming action: {path.name}")
        process_action(path, dry_run=dry_run)

    # Phase 2: work through the approved queue (listed after phase 1 finished).
    queued = list((ACTIONS_ROOT / "approved").glob("*.json"))
    if queued:
        for path in queued:
            process_action(path, dry_run=dry_run)
    else:
        print("No approved actions found.")

    print("Run complete.")
if __name__ == "__main__":
    import argparse

    # CLI: `run` (the default) performs one executor pass; `approve` and
    # `reject` each act on a single action id given as the second positional.
    cli = argparse.ArgumentParser(description="Homelab Action Executor")
    cli.add_argument("command", choices=["run", "approve", "reject"], nargs="?", default="run")
    cli.add_argument("action_id", nargs="?")
    cli.add_argument("--dry-run", action="store_true")
    opts = cli.parse_args()

    if opts.command == "run":
        run_executor(dry_run=opts.dry_run)
    else:
        # `choices` guarantees the command is approve or reject at this point.
        per_command = {
            "approve": (approve_action, "Error: action_id required for approve"),
            "reject": (reject_action, "Error: action_id required for reject"),
        }
        handler, missing_id_message = per_command[opts.command]
        if not opts.action_id:
            print(missing_id_message)
            sys.exit(1)
        handler(opts.action_id)

View file

@ -0,0 +1,74 @@
#!/bin/bash
# Validation script for Homelab Action Queue System
#
# Drives one action through pending -> approved -> completed, rejects a second
# pending action when one is available, and then checks that every step left
# an event in the event log. Paths are relative to the current directory, so
# run it from the repository root.
set -e

BASE_DIR=$(pwd)
export HOMELAB_WORLD_ROOT="$BASE_DIR/tmp/homelab/world"
export HOMELAB_ACTIONS_ROOT="$BASE_DIR/tmp/homelab/actions"
EVENT_LOG="/tmp/agent-events.log"

# Event types that must be present in $EVENT_LOG when the run finishes.
# action_rejected is appended below only if the rejection step actually ran.
EXPECTED_EVENTS=(action_created action_approved action_started action_completed)

echo "=== Starting Action Queue Validation ==="

# 1. Setup drift scenarios
echo "Setting up drift scenarios..."
bash scripts/supervisor/test_scenarios.sh

# 2. Run supervisor to generate action proposals
echo "Running supervisor..."
python3 scripts/supervisor/supervisor.py

# 3. Check for pending actions
echo "Checking pending actions..."
ls -l "$HOMELAB_ACTIONS_ROOT/pending/"

# Get an action ID from pending
ACTION_FILE=$(ls "$HOMELAB_ACTIONS_ROOT/pending/" | head -n 1)
if [ -z "$ACTION_FILE" ]; then
    echo "Error: No pending actions found!"
    exit 1
fi
ACTION_ID="${ACTION_FILE%.json}"
echo "Found action: $ACTION_ID"

# 4. Approve the action
echo "Approving action $ACTION_ID..."
python3 scripts/executor/executor.py approve "$ACTION_ID"

# 5. Run executor
echo "Running executor..."
python3 scripts/executor/executor.py run

# 6. Verify completion
if [ -f "$HOMELAB_ACTIONS_ROOT/completed/$ACTION_FILE" ]; then
    echo "SUCCESS: Action $ACTION_ID moved to completed."
else
    echo "FAILURE: Action $ACTION_ID NOT found in completed."
    exit 1
fi

# 7. Test rejection
echo "Testing rejection..."
NEXT_ACTION_FILE=$(ls "$HOMELAB_ACTIONS_ROOT/pending/" | head -n 1)
if [ -n "$NEXT_ACTION_FILE" ]; then
    NEXT_ACTION_ID="${NEXT_ACTION_FILE%.json}"
    echo "Rejecting action $NEXT_ACTION_ID..."
    python3 scripts/executor/executor.py reject "$NEXT_ACTION_ID"
    if [ -f "$HOMELAB_ACTIONS_ROOT/rejected/$NEXT_ACTION_FILE" ]; then
        echo "SUCCESS: Action $NEXT_ACTION_ID moved to rejected."
    else
        echo "FAILURE: Action $NEXT_ACTION_ID NOT found in rejected."
        exit 1
    fi
    EXPECTED_EVENTS+=(action_rejected)
fi

# 8. Verify events
# A bare `grep ... | tail -n 1` always exits with tail's status, so a missing
# event would go unnoticed under `set -e`; capture the match and test it.
# NOTE(review): the log is never truncated by this script, so entries from an
# earlier run can still satisfy this check -- confirm whether that is acceptable.
echo "Verifying events in $EVENT_LOG..."
for EVENT_TYPE in "${EXPECTED_EVENTS[@]}"; do
    LAST_EVENT=$(grep "$EVENT_TYPE" "$EVENT_LOG" | tail -n 1)
    if [ -z "$LAST_EVENT" ]; then
        echo "FAILURE: No $EVENT_TYPE event found in $EVENT_LOG."
        exit 1
    fi
    echo "$LAST_EVENT"
done

echo "=== Validation Complete ==="

View file

@ -5,14 +5,19 @@ import yaml
import json
import time
import glob
import uuid
from pathlib import Path
# Configuration
# Both roots can be redirected through environment variables; the /opt/homelab
# paths are the defaults used when the variables are unset.
WORLD_STATE_PATH = Path(os.getenv("HOMELAB_WORLD_ROOT", "/opt/homelab/world"))
ACTIONS_ROOT = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
# Relative path: resolved against the process's current working directory.
INVENTORY_PATH = Path("hosts")
# Fixed locations under /tmp (not overridable via the environment).
EVENT_LOG = Path("/tmp/agent-events.log")
CHECKPOINT_FILE = Path("/tmp/supervisor-checkpoint.json")
# Action Queue Layout
# One subdirectory of ACTIONS_ROOT per entry; an action file lives in the
# directory named after its current status.
ACTION_DIRS = ["pending", "approved", "running", "completed", "failed", "rejected"]
# Reconcile event types
RECONCILE_REQUIRED = "reconcile_required"
RECONCILE_RECOMMENDED = "reconcile_recommended"
@ -24,6 +29,70 @@ STATE_DEGRADED = "degraded"
STATE_UNSTABLE = "unstable"
STATE_RECONCILING = "reconciling"
def ensure_action_dirs():
    """Create each queue-state directory under ACTIONS_ROOT if it is missing."""
    for state_name in ACTION_DIRS:
        state_dir = ACTIONS_ROOT / state_name
        # parents=True also creates ACTIONS_ROOT itself on a fresh install.
        state_dir.mkdir(parents=True, exist_ok=True)
def emit_action_proposal(recommendation):
    """Convert a recommendation into an action proposal and save it to pending/.

    Args:
        recommendation: Mapping with a ``"drift"`` dict (copied into the
            payload and used for the ``node``/``service`` fields) and an
            ``"action"`` name. A missing or unrecognised action name falls
            back to ``collect_diagnostics``.

    Returns:
        None. Failures while writing the file or emitting the event are
        reported on stderr instead of being raised, so one bad proposal does
        not abort the caller.
    """
    ensure_action_dirs()

    action_type_map = {
        "redeploy": "redeploy_service",
        "deploy": "redeploy_service",
        "diagnostics": "collect_diagnostics",
        "failover_review": "collect_diagnostics",
        "review": "collect_diagnostics",
        "delayed_deployment": "rerun_deployment_stage",
    }
    # .get() on the recommendation: an absent "action" key is treated the same
    # way as an unknown action name rather than raising KeyError.
    action_type = action_type_map.get(recommendation.get("action"), "collect_diagnostics")

    risk_level_map = {
        "redeploy_service": "guarded",
        "rerun_healthcheck": "safe",
        "rerun_deployment_stage": "guarded",
        "collect_diagnostics": "safe",
    }
    # Anything without an explicit rating is assumed to be dangerous.
    risk_level = risk_level_map.get(action_type, "dangerous")

    # Dangerous always requires approval
    # Guarded defaults to approval
    approval_required = risk_level in ["dangerous", "guarded"]

    drift = recommendation["drift"]
    action_id = str(uuid.uuid4())
    action = {
        "action_id": action_id,
        "created_at": time.time(),
        "proposed_by": "supervisor",
        "correlation_id": str(uuid.uuid4()),  # In a real system, link to drift ID
        "node": drift.get("node"),
        "service": drift.get("service"),
        "action_type": action_type,
        "risk_level": risk_level,
        "confidence": 0.9,  # Default confidence
        "approval_required": approval_required,
        "autonomous_eligible": False,  # No autonomy yet
        "status": "pending",
        "payload": drift,
        "rollback_reference": None,
    }

    pending_dir = ACTIONS_ROOT / "pending"
    file_path = pending_dir / f"{action_id}.json"
    # Stage under a name that "*.json" globs do not match, then rename into
    # place: a failed or half-finished dump never shows up as a pending action.
    tmp_path = pending_dir / f"{action_id}.json.tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(action, f, indent=2)
        os.replace(tmp_path, file_path)
        emit_event("action_created", f"Action proposed: {action_type} for {action.get('service') or action.get('node')}", {
            "action_id": action_id,
            "action_type": action_type,
            "node": action.get("node"),
            "service": action.get("service")
        })
    except Exception as e:
        # Deliberately broad: proposing an action is best-effort for the caller.
        print(f"Error emitting action proposal: {e}", file=sys.stderr)
        try:
            tmp_path.unlink()
        except OSError:
            pass  # nothing was staged, or the file was already published
def emit_event(event_type, message, details=None):
"""Emit reconciliation events using existing event system (append-only file)."""
event = {
@ -278,6 +347,8 @@ def main():
# Emit reconciliation events
for rec in recommendations:
emit_event(rec["type"], rec["message"], rec["drift"])
# Proposed: Emit action proposals to action queue
emit_action_proposal(rec)
# 6. Save checkpoint
save_checkpoint({

View file

@ -0,0 +1,24 @@
{
"action_id": "0083f8ad-1f2b-47a4-81a8-81e59740879e",
"created_at": 1778600485.050643,
"proposed_by": "supervisor",
"correlation_id": "6d88755b-ca89-45eb-bf2d-506fca631144",
"node": "node1",
"service": "homeassistant",
"action_type": "redeploy_service",
"risk_level": "guarded",
"confidence": 0.9,
"approval_required": true,
"autonomous_eligible": false,
"status": "completed",
"payload": {
"type": "unhealthy_service",
"service": "homeassistant",
"status": "unhealthy",
"node": "node1"
},
"rollback_reference": null,
"approved_at": 1778600485.1278665,
"started_at": 1778600485.1792338,
"finished_at": 1778600485.6797137
}

View file

@ -0,0 +1,23 @@
{
"action_id": "050add79-3265-4e35-bb88-41c368bbccda",
"created_at": 1778600510.7529757,
"proposed_by": "supervisor",
"correlation_id": "d8ba7d84-74dd-46c8-a085-5ed8ba186770",
"node": null,
"service": "webapp",
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "completed",
"payload": {
"type": "failed_deployment",
"deployment_id": "dep-001",
"service": "webapp"
},
"rollback_reference": null,
"approved_at": 1778600510.8252015,
"started_at": 1778600510.8744874,
"finished_at": 1778600511.3750403
}

View file

@ -0,0 +1,7 @@
{
"action_id": "resumable-task",
"action_type": "rerun_healthcheck",
"status": "completed",
"started_at": 1778600488.5642526,
"finished_at": 1778600489.0646975
}

View file

@ -0,0 +1,10 @@
{"timestamp": 1778600485.1282582, "action_id": "0083f8ad-1f2b-47a4-81a8-81e59740879e", "status": "approved", "message": "Manual approval received"}
{"timestamp": 1778600485.179484, "action_id": "0083f8ad-1f2b-47a4-81a8-81e59740879e", "status": "running", "message": "Execution started (dry_run=False)"}
{"timestamp": 1778600485.680433, "action_id": "0083f8ad-1f2b-47a4-81a8-81e59740879e", "status": "completed", "message": "Execution finished"}
{"timestamp": 1778600485.7410686, "action_id": "2143ae5b-bcc6-410b-b925-e7def70fc013", "status": "rejected", "message": "Manual rejection received"}
{"timestamp": 1778600488.5644836, "action_id": "resumable-task", "status": "running", "message": "Execution started (dry_run=False)"}
{"timestamp": 1778600489.0652084, "action_id": "resumable-task", "status": "completed", "message": "Execution finished"}
{"timestamp": 1778600510.825529, "action_id": "050add79-3265-4e35-bb88-41c368bbccda", "status": "approved", "message": "Manual approval received"}
{"timestamp": 1778600510.8747966, "action_id": "050add79-3265-4e35-bb88-41c368bbccda", "status": "running", "message": "Execution started (dry_run=False)"}
{"timestamp": 1778600511.3755214, "action_id": "050add79-3265-4e35-bb88-41c368bbccda", "status": "completed", "message": "Execution finished"}
{"timestamp": 1778600511.4307747, "action_id": "240cbbc0-891e-4032-bf73-1fa40ff850b4", "status": "rejected", "message": "Manual rejection received"}

View file

@ -0,0 +1,21 @@
{
"action_id": "50d7cdab-2f12-449f-965a-0383e32babaa",
"created_at": 1778600485.053174,
"proposed_by": "supervisor",
"correlation_id": "a2899a7f-548f-455d-a8dd-4e208be58e00",
"node": null,
"service": null,
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "unresolved_incident",
"incident_id": "inc-99",
"description": "High memory usage on node1",
"status": "investigating"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "5e239d96-ff3f-48a3-a71a-ad5aa6b7ff88",
"created_at": 1778600485.05199,
"proposed_by": "supervisor",
"correlation_id": "c5fa628e-35a1-44f9-9119-07d93f20af80",
"node": null,
"service": "webapp",
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "failed_deployment",
"deployment_id": "dep-002",
"service": "webapp"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "7cde5093-3394-43af-9391-321c50ac5362",
"created_at": 1778600510.7521193,
"proposed_by": "supervisor",
"correlation_id": "2a91f58e-e10d-4de5-abd7-5f4fe6fdc325",
"node": null,
"service": "webapp",
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "failed_deployment",
"deployment_id": "dep-002",
"service": "webapp"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "a42e2183-ca22-4a50-97a7-eb53ab0e039a",
"created_at": 1778600510.75163,
"proposed_by": "supervisor",
"correlation_id": "ec2a1960-5baa-453a-8380-65fc9376cc82",
"node": "node2",
"service": null,
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "offline_node",
"node": "node2",
"status": "offline"
},
"rollback_reference": null
}

View file

@ -0,0 +1,21 @@
{
"action_id": "aae83bcd-455f-4b59-bab0-7c7994116468",
"created_at": 1778600510.7506568,
"proposed_by": "supervisor",
"correlation_id": "0a786305-46cb-4837-8725-53d99203f39e",
"node": "node1",
"service": "homeassistant",
"action_type": "redeploy_service",
"risk_level": "guarded",
"confidence": 0.9,
"approval_required": true,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "unhealthy_service",
"service": "homeassistant",
"status": "unhealthy",
"node": "node1"
},
"rollback_reference": null
}

View file

@ -0,0 +1,21 @@
{
"action_id": "c2e6c844-6d96-4ea7-b924-5e33764e5493",
"created_at": 1778600510.7533653,
"proposed_by": "supervisor",
"correlation_id": "6ffc0579-71ac-417f-8ea1-fc46e54527c6",
"node": null,
"service": null,
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "unresolved_incident",
"incident_id": "inc-99",
"description": "High memory usage on node1",
"status": "investigating"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "c91a4171-e636-4194-a146-6e003d2f2586",
"created_at": 1778600510.7511823,
"proposed_by": "supervisor",
"correlation_id": "966a62ee-f81b-497d-96cb-7749f4da0c6f",
"node": "node2",
"service": "webapp",
"action_type": "rerun_deployment_stage",
"risk_level": "guarded",
"confidence": 0.9,
"approval_required": true,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "missing_service",
"service": "webapp",
"node": "node2"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "e6d3f0d6-c294-4282-b9f4-a730f9cec9dc",
"created_at": 1778600485.0515254,
"proposed_by": "supervisor",
"correlation_id": "bf51852b-0b34-4b4b-98c9-fffff38f77ce",
"node": "node2",
"service": null,
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "offline_node",
"node": "node2",
"status": "offline"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "f4c56df2-6775-484b-806e-cdecdcc19584",
"created_at": 1778600485.0527768,
"proposed_by": "supervisor",
"correlation_id": "f974d640-d0fb-4a85-bf8a-eda100182181",
"node": null,
"service": "webapp",
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "failed_deployment",
"deployment_id": "dep-001",
"service": "webapp"
},
"rollback_reference": null
}

View file

@ -0,0 +1,20 @@
{
"action_id": "ff3da03c-fffa-49a7-985d-ed4589ab6856",
"created_at": 1778600485.0510974,
"proposed_by": "supervisor",
"correlation_id": "37da2d5b-3ecd-4a29-97c2-7e9461b1792e",
"node": "node2",
"service": "webapp",
"action_type": "rerun_deployment_stage",
"risk_level": "guarded",
"confidence": 0.9,
"approval_required": true,
"autonomous_eligible": false,
"status": "pending",
"payload": {
"type": "missing_service",
"service": "webapp",
"node": "node2"
},
"rollback_reference": null
}

View file

@ -0,0 +1,21 @@
{
"action_id": "2143ae5b-bcc6-410b-b925-e7def70fc013",
"created_at": 1778600485.0523734,
"proposed_by": "supervisor",
"correlation_id": "dc23556c-68d2-41a3-a5d2-9ad66705f989",
"node": null,
"service": "webapp",
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "rejected",
"payload": {
"type": "failed_deployment",
"deployment_id": "dep-003",
"service": "webapp"
},
"rollback_reference": null,
"rejected_at": 1778600485.740686
}

View file

@ -0,0 +1,21 @@
{
"action_id": "240cbbc0-891e-4032-bf73-1fa40ff850b4",
"created_at": 1778600510.7525399,
"proposed_by": "supervisor",
"correlation_id": "fd234809-82aa-459d-858b-18bc3205a6c5",
"node": null,
"service": "webapp",
"action_type": "collect_diagnostics",
"risk_level": "safe",
"confidence": 0.9,
"approval_required": false,
"autonomous_eligible": false,
"status": "rejected",
"payload": {
"type": "failed_deployment",
"deployment_id": "dep-003",
"service": "webapp"
},
"rollback_reference": null,
"rejected_at": 1778600511.4303465
}

View file

@ -1 +1 @@
{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778597957}
{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778600510}

View file

@ -1 +1 @@
{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778597657}
{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778600210}

View file

@ -1 +1 @@
{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778597357}
{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778599910}

View file

@ -216,9 +216,9 @@
.label { color: var(--text-muted); font-size: 12px; margin-bottom: 4px; }
.value { font-weight: 500; margin-bottom: 12px; }
.risk-safe { color: var(--safe); }
.risk-guarded { color: var(--guarded); }
.risk-dangerous { color: var(--dangerous); }
.risk-safe { background: rgba(62, 175, 124, 0.1); color: var(--safe); }
.risk-guarded { background: rgba(230, 126, 34, 0.1); color: var(--guarded); }
.risk-dangerous { background: rgba(192, 57, 43, 0.1); color: var(--dangerous); }
</style>
</head>
@ -229,6 +229,9 @@
<li class="nav-item active" onclick="showView('dashboard', this)">
<span>Dashboard</span>
</li>
<li class="nav-item" onclick="showView('actions', this)">
<span>Action Queue</span>
</li>
<li class="nav-item" onclick="showView('nodes', this)">
<span>Nodes</span>
</li>
@ -238,9 +241,15 @@
<li class="nav-item" onclick="showView('deployments', this)">
<span>Deployments</span>
</li>
<li class="nav-item" onclick="showView('topology', this)">
<span>Topology</span>
</li>
<li class="nav-item" onclick="showView('events', this)">
<span>Events</span>
</li>
<li class="nav-item" onclick="showView('correlation', this)">
<span>Correlation</span>
</li>
<li class="nav-item" onclick="showView('recommendations', this)">
<span>Recommendations</span>
</li>
@ -255,7 +264,16 @@
<main class="main-content">
<header>
<div class="view-title" id="current-view-title">Dashboard</div>
<div style="display:flex; align-items:center; gap:20px">
<div class="view-title" id="current-view-title">Dashboard</div>
<select id="operator-mode" onchange="setOperatorMode(this.value)" style="background:var(--sidebar-color); border:1px solid var(--border-color); color:var(--accent-color); font-weight:bold; font-size:12px; padding:4px 8px">
<option value="observe">OBSERVE</option>
<option value="recommend">RECOMMEND</option>
<option value="approval" selected>APPROVAL</option>
<option value="autonomous">AUTONOMOUS</option>
<option value="maintenance">MAINTENANCE</option>
</select>
</div>
<div class="header-actions">
<button onclick="refreshData()">Refresh</button>
</div>
@ -269,6 +287,10 @@
<div class="card-title">System Overview</div>
<div id="dashboard-summary" style="margin-top:20px"></div>
</div>
<div class="card">
<div class="card-title">Pending Actions</div>
<div id="dashboard-actions-summary" style="margin-top:20px"></div>
</div>
<div class="card">
<div class="card-title">Active Incidents</div>
<div id="dashboard-incidents" style="margin-top:20px"></div>
@ -276,6 +298,20 @@
</div>
</div>
<!-- Actions View -->
<div id="view-actions" class="view hidden">
<div style="display:grid; grid-template-columns: 1fr 1fr; gap:24px">
<div>
<h3>Pending Approval</h3>
<div id="actions-pending" class="timeline"></div>
</div>
<div>
<h3>Active / History</h3>
<div id="actions-history" class="timeline"></div>
</div>
</div>
</div>
<!-- Nodes View -->
<div id="view-nodes" class="view hidden">
<div class="grid" id="nodes-list"></div>
@ -291,11 +327,24 @@
<div class="grid" id="deployments-list"></div>
</div>
<!-- Topology View -->
<div id="view-topology" class="view hidden">
<div class="card" style="min-height:500px">
<div class="card-title">Runtime Topology</div>
<div id="topology-map" style="margin-top:20px; display:flex; flex-wrap:wrap; gap:40px; justify-content:center"></div>
</div>
</div>
<!-- Events View -->
<div id="view-events" class="view hidden">
<div class="timeline" id="events-timeline"></div>
</div>
<!-- Correlation View -->
<div id="view-correlation" class="view hidden">
<div id="correlation-chains" class="grid"></div>
</div>
<!-- Recommendations View -->
<div id="view-recommendations" class="view hidden">
<div class="grid" id="recommendations-list"></div>
@ -335,6 +384,34 @@
}
}
/**
 * POST `data` as JSON to `endpoint` and resolve with the parsed JSON reply.
 * Any failure (network error, unserialisable payload, non-JSON response body)
 * is logged to the console and reported to the caller as `null`.
 */
async function postData(endpoint, data) {
    try {
        const request = {
            method: 'POST',
            headers: {'Content-Type': 'application/json'},
            body: JSON.stringify(data)
        };
        const response = await fetch(endpoint, request);
        const parsed = await response.json();
        return parsed;
    } catch (err) {
        console.error('Post error:', endpoint, err);
        return null;
    }
}
/**
 * Ask the backend to move action `id` to `status`, then reload the current
 * view. Shows an alert when the request fails or the reply is not
 * `{status: 'ok'}`.
 */
async function mutateAction(id, status) {
    const reply = await postData('/action/mutate', {id: id, status: status});
    const succeeded = Boolean(reply) && reply.status === 'ok';
    if (!succeeded) {
        alert('Mutation failed');
        return;
    }
    refreshData();
}
/**
 * Change handler for the operator-mode <select> in the page header.
 * Currently a stub: the chosen mode is only written to the browser console;
 * nothing is sent to the backend or stored.
 */
function setOperatorMode(mode) {
    console.log('Operator mode set to:', mode);
    // In real system, this would call backend
}
function formatTime(ts) {
if (!ts) return 'N/A';
return new Date(ts * 1000).toLocaleString();
@ -368,6 +445,53 @@
}
}
if (currentView === 'dashboard' || currentView === 'actions') {
const actions = await fetchData('/actions');
if (actions) {
if (currentView === 'dashboard') {
const dashActions = document.getElementById('dashboard-actions-summary');
const pendingCount = actions.pending.length;
dashActions.innerHTML = `
<div class="label">Pending</div><div class="value" style="color:var(--guarded)">${pendingCount}</div>
<div class="label">Running</div><div class="value" style="color:var(--reconciling)">${actions.running.length}</div>
`;
}
if (currentView === 'actions') {
const pendingEl = document.getElementById('actions-pending');
const historyEl = document.getElementById('actions-history');
pendingEl.innerHTML = actions.pending.map(a => `
<div class="card" style="margin-bottom:12px">
<div class="card-header">
<div class="card-title">${a.type.toUpperCase()}</div>
<span class="badge risk-${a.risk_level}">${a.risk_level}</span>
</div>
<p>${a.description}</p>
<div class="label">Target</div><div class="value">${a.target.node} ${a.target.service || ''}</div>
<div class="label">Confidence</div><div class="value">${Math.round(a.confidence*100)}%</div>
<div class="controls">
<button class="btn-primary" onclick="mutateAction('${a.id}', 'approved')">Approve</button>
<button onclick="mutateAction('${a.id}', 'rejected')">Reject</button>
</div>
</div>
`).join('') || 'No pending actions.';
const history = [...actions.approved, ...actions.running, ...actions.completed, ...actions.failed];
historyEl.innerHTML = history.sort((a,b) => b.timestamp - a.timestamp).map(a => `
<div class="event">
<div class="event-header">
<span>${a.type.toUpperCase()}</span>
<span class="badge ${getStatusClass(a.status)}">${a.status}</span>
</div>
<div>${a.description}</div>
<small>${formatTime(a.timestamp)} | Target: ${a.target.node}</small>
${a.status === 'approved' ? `<div class="controls"><button class="btn-primary" onclick="mutateAction('${a.id}', 'running')">Execute</button></div>` : ''}
</div>
`).join('') || 'No history.';
}
}
}
if (currentView === 'dashboard' || currentView === 'events') {
const incidents = await fetchData('/incidents');
if (currentView === 'dashboard') {
@ -474,6 +598,64 @@
`).join('');
}
if (currentView === 'topology') {
const nodes = await fetchData('/nodes');
const services = await fetchData('/services');
const topMap = document.getElementById('topology-map');
if (nodes && services) {
topMap.innerHTML = nodes.map(node => {
const nodeServices = services.filter(s => s.node === node.hostname || s.node === node.id);
return `
<div class="card" style="width:250px; border: 1px solid ${node.health === 'nominal' ? 'var(--border-color)' : 'var(--error)'}">
<div class="card-header">
<div class="card-title">${node.hostname}</div>
<span class="badge ${getStatusClass(node.health)}">${node.health}</span>
</div>
<div class="label">Capabilities</div>
<div class="value" style="font-size:11px">${node.capabilities.join(', ')}</div>
<div class="label">Services</div>
<div style="font-size:12px; margin-bottom:10px">
${nodeServices.length > 0 ? nodeServices.map(s => `
<div style="display:flex; justify-content:space-between; margin-bottom:4px; padding:4px; background:rgba(255,255,255,0.03)">
<span>${s.name}</span>
<span class="${getStatusClass(s.health)}" style="font-size:10px">${s.health}</span>
</div>
${s.dependencies.length > 0 ? `<div style="font-size:9px; color:var(--text-muted); margin-left:8px; margin-bottom:4px">dep: ${s.dependencies.join(', ')}</div>` : ''}
`).join('') : '<div class="value">None</div>'}
</div>
</div>
`;
}).join('');
}
}
if (currentView === 'correlation') {
const incidents = await fetchData('/incidents');
const actions = await fetchData('/actions');
const list = document.getElementById('correlation-chains');
if (incidents && actions) {
const allActions = Object.values(actions).flat();
list.innerHTML = incidents.map(inc => {
const related = allActions.filter(a => a.correlation_chain && a.correlation_chain.includes(inc.id));
return `
<div class="card">
<div class="card-header">
<div class="card-title">Incident: ${inc.id || 'INC-001'}</div>
<span class="badge status-error">Active</span>
</div>
<p>${inc.message}</p>
<div class="label">Related Actions</div>
${related.map(a => `
<div class="event" style="margin-top:5px">
<strong>${a.type}</strong> (${a.status})<br>
<small>${a.description}</small>
</div>
`).join('') || '<div class="value">No actions yet</div>'}
</div>
`;
}).join('');
}
}
if (currentView === 'settings') {
const config = await fetchData('/config');
const content = document.getElementById('settings-content');
@ -482,6 +664,8 @@
<div class="value">${config.auto_mode ? 'Enabled' : 'Disabled'}</div>
<div class="label">Action Thresholds</div>
<div class="value mono">${JSON.stringify(config.action_thresholds, null, 2)}</div>
<div class="label">Telegram Integration</div>
<div class="value" style="color:var(--text-muted)">Ready for mobile approval flows. Hook: /api/v1/telegram/webhook</div>
<button onclick="alert('Settings update not implemented in this demo')">Edit Configuration</button>
`;
}

View file

@ -8,6 +8,7 @@ from pathlib import Path
STATE_DIR = Path("/opt/homelab/state")
EVENTS_DIR = Path("/opt/homelab/events")
WORLD_DIR = Path("/opt/homelab/world")
# Honour HOMELAB_ACTIONS_ROOT, the same override the supervisor reads for its
# action queue, so both processes look at one directory. Default is unchanged.
ACTIONS_DIR = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
EVENT_LOG = Path("/tmp/agent-events.log")
STATIC_DIR = Path(__file__).parent
REDIS_HOST = os.getenv("REDIS_HOST", "redis")
@ -164,6 +165,55 @@ def current_events():
return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True)
def current_actions():
    """Return all action records grouped by queue status.

    The result always has one key per status, each mapping to a list of the
    records loaded from ``ACTIONS_DIR/<status>/*.json``. A missing status
    directory yields an empty list, and files for which ``read_json_file``
    returns a falsy value are left out.
    """
    grouped = {}
    for state in ("pending", "approved", "running", "completed", "failed", "rejected"):
        state_dir = ACTIONS_DIR / state
        if not state_dir.exists():
            grouped[state] = []
            continue
        loaded = (read_json_file(path) for path in state_dir.glob("*.json"))
        grouped[state] = [record for record in loaded if record]
    return grouped
def mutate_action(action_id, target_status):
    """Move an action file into another status directory and stamp the change.

    The action is looked up as ``<status>/<action_id>.json`` under
    ACTIONS_DIR, its ``status`` field is set to ``target_status``, and
    ``last_mutation`` is set to the current time. No transition rules are
    enforced here: any existing action can be moved to any valid status.

    Args:
        action_id: Identifier of the action (taken from the request payload).
        target_status: One of pending, approved, running, completed, failed,
            rejected.

    Returns:
        ``(True, "Success")`` when the file was rewritten in the target
        directory, otherwise ``(False, reason)``.
    """
    import time  # function-local, as in the original

    statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
    if target_status not in statuses:
        return False, f"Invalid target status: {target_status}"

    # The id comes straight from an HTTP request body and becomes part of a
    # file path: refuse anything that could step outside the queue directories.
    name = str(action_id)
    if not name or any(ch in name for ch in ("/", "\\", "\x00")):
        return False, f"Invalid action id: {action_id}"
    file_name = f"{name}.json"

    # Find where the action is
    source_path = None
    for status in statuses:
        candidate = ACTIONS_DIR / status / file_name
        if candidate.exists():
            source_path = candidate
            break
    if source_path is None:
        return False, f"Action {action_id} not found"

    target_dir = ACTIONS_DIR / target_status
    target_path = target_dir / file_name
    # Staged under a suffix that "*.json" globs ignore, then renamed into
    # place, so readers of the queue never see a partially written file.
    tmp_path = target_dir / f"{file_name}.tmp"
    try:
        target_dir.mkdir(parents=True, exist_ok=True)
        data = json.loads(source_path.read_text())
        data["status"] = target_status
        data["last_mutation"] = time.time()
        tmp_path.write_text(json.dumps(data, indent=2))
        os.replace(tmp_path, target_path)
        if source_path != target_path:
            source_path.unlink()
        return True, "Success"
    except (OSError, ValueError, TypeError) as e:
        # ValueError covers malformed JSON; TypeError a non-object document.
        try:
            tmp_path.unlink()
        except OSError:
            pass  # nothing was staged, or it was already renamed
        return False, str(e)
def send_json(status, payload, handler):
body = (json.dumps(payload) + "\n").encode("utf-8")
handler.send_response(status)
@ -207,6 +257,10 @@ class Handler(BaseHTTPRequestHandler):
send_json(200, current_events(), self)
return
if self.path == "/actions":
send_json(200, current_actions(), self)
return
if self.path == "/logs":
print("LOGS endpoint called", flush=True)
body = ("\n".join(tail_lines(EVENT_LOG, 200)) + "\n").encode("utf-8")
@ -236,6 +290,7 @@ class Handler(BaseHTTPRequestHandler):
"/auto-mode",
"/config",
"/events",
"/action/mutate",
):
self.send_error(404)
return
@ -291,6 +346,19 @@ class Handler(BaseHTTPRequestHandler):
send_json(200, {"status": "sent"}, self)
return
if self.path == "/action/mutate":
action_id = payload.get("id")
target = payload.get("status")
if not action_id or not target:
self.send_error(400, "id and status are required")
return
success, msg = mutate_action(action_id, target)
if success:
send_json(200, {"status": "ok"}, self)
else:
self.send_error(500, msg)
return
if not command:
self.send_error(400, "command is required")
return