diff --git a/.junie/memory/errors.md b/.junie/memory/errors.md new file mode 100644 index 0000000..e69de29 diff --git a/.junie/memory/feedback.md b/.junie/memory/feedback.md new file mode 100644 index 0000000..e69de29 diff --git a/.junie/memory/language.json b/.junie/memory/language.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/.junie/memory/language.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/.junie/memory/memory.version b/.junie/memory/memory.version new file mode 100644 index 0000000..f398a20 --- /dev/null +++ b/.junie/memory/memory.version @@ -0,0 +1 @@ +3.0 \ No newline at end of file diff --git a/.junie/memory/tasks.md b/.junie/memory/tasks.md new file mode 100644 index 0000000..e69de29 diff --git a/hosts/node1/services.yaml b/hosts/node1/services.yaml new file mode 100644 index 0000000..6854596 --- /dev/null +++ b/hosts/node1/services.yaml @@ -0,0 +1,2 @@ +services: + - name: homeassistant diff --git a/hosts/node2/services.yaml b/hosts/node2/services.yaml new file mode 100644 index 0000000..fa65cbe --- /dev/null +++ b/hosts/node2/services.yaml @@ -0,0 +1,2 @@ +services: + - name: webapp diff --git a/scripts/supervisor/supervisor.py b/scripts/supervisor/supervisor.py new file mode 100644 index 0000000..e58027b --- /dev/null +++ b/scripts/supervisor/supervisor.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python3 +import os +import sys +import yaml +import json +import time +import glob +from pathlib import Path + +# Configuration +WORLD_STATE_PATH = Path(os.getenv("HOMELAB_WORLD_ROOT", "/opt/homelab/world")) +INVENTORY_PATH = Path("hosts") +EVENT_LOG = Path("/tmp/agent-events.log") +CHECKPOINT_FILE = Path("/tmp/supervisor-checkpoint.json") + +# Reconcile event types +RECONCILE_REQUIRED = "reconcile_required" +RECONCILE_RECOMMENDED = "reconcile_recommended" +RECONCILE_BLOCKED = "reconcile_blocked" + +# Runtime summary states +STATE_NOMINAL = "nominal" +STATE_DEGRADED = "degraded" +STATE_UNSTABLE = "unstable" +STATE_RECONCILING = "reconciling" + +def emit_event(event_type, message, details=None): + """Emit reconciliation events using existing event system (append-only file).""" + event = { + "type": event_type, + "message": message, + "timestamp": time.time(), + "details": details or {} + } + line = json.dumps(event) + print(line) + try: + # Append-only semantics + with open(EVENT_LOG, "a", encoding="utf-8") as f: + f.write(line + "\n") + f.flush() + except Exception as e: + print(f"Error writing to event log: {e}", file=sys.stderr) + +def load_desired_state(): + """Load desired state from hosts/*/services.yaml.""" + desired = {"services": [], "nodes": []} + if not INVENTORY_PATH.exists(): + return desired + + # Inventory model: hosts/{node_name}/services.yaml + for yaml_file in glob.glob(str(INVENTORY_PATH / "*" / "services.yaml")): + try: + with open(yaml_file, "r") as f: + data = yaml.safe_load(f) + if data and "services" in data: + node_name = Path(yaml_file).parent.name + for svc in data["services"]: + if isinstance(svc, str): + svc = {"name": svc} + svc["node"] = node_name + desired["services"].append(svc) + if node_name not in desired["nodes"]: + desired["nodes"].append(node_name) + except Exception as e: + print(f"Error loading {yaml_file}: {e}", file=sys.stderr) + return desired + +def load_world_state(): + """Load current world state from /opt/homelab/world/ (filesystem-first).""" + world = { + "services": {}, + "nodes": {}, + "deployments": {}, + "incidents": {} + } + + if not WORLD_STATE_PATH.exists(): + return world + + # Filesystem-first design: each category is a directory, each item is a JSON file + for category in ["services", "nodes", "deployments", "incidents"]: + cat_path = WORLD_STATE_PATH / category + if cat_path.exists() and cat_path.is_dir(): + for json_file in cat_path.glob("*.json"): + try: + with open(json_file, "r") as f: + world[category][json_file.stem] = json.load(f) + except Exception as e: + print(f"Error loading {json_file}: {e}", file=sys.stderr) + + return world + +def detect_drift(desired, world): + """Compare desired infrastructure state with observed runtime state.""" + drifts = [] + + # 1. Missing service & 2. Unhealthy service + desired_service_names = set() + for d_svc in desired["services"]: + name = d_svc["name"] + desired_service_names.add(name) + a_svc = world["services"].get(name) + + if not a_svc: + drifts.append({ + "type": "missing_service", + "service": name, + "node": d_svc["node"] + }) + elif a_svc.get("status") not in ("ok", "healthy", "up"): + drifts.append({ + "type": "unhealthy_service", + "service": name, + "status": a_svc.get("status"), + "node": a_svc.get("node") or d_svc["node"] + }) + + # 4. Offline node + for node_name in desired["nodes"]: + a_node = world["nodes"].get(node_name) + if not a_node or a_node.get("status") not in ("online", "ok", "up"): + drifts.append({ + "type": "offline_node", + "node": node_name, + "status": a_node.get("status") if a_node else "missing" + }) + + # 3. Failed deployment + # Check for recent failures in world/deployments + for d_id, d_data in world["deployments"].items(): + if d_data.get("status") == "failed": + drifts.append({ + "type": "failed_deployment", + "deployment_id": d_id, + "service": d_data.get("service") + }) + + # 5. Unresolved incidents + for i_id, i_data in world["incidents"].items(): + if i_data.get("status") not in ("resolved", "closed"): + drifts.append({ + "type": "unresolved_incident", + "incident_id": i_id, + "description": i_data.get("description"), + "status": i_data.get("status") + }) + + return drifts + +def recommendation_engine(drifts, world): + """Recommendation mode only: emit proposed actions without mutation.""" + recommendations = [] + for drift in drifts: + if drift["type"] == "unhealthy_service": + recommendations.append({ + "drift": drift, + "action": "redeploy", + "message": f"Service {drift['service']} is unhealthy. Recommend redeploy.", + "type": RECONCILE_REQUIRED + }) + elif drift["type"] == "failed_deployment": + service = drift["service"] + # Recommendation: repeated deployment failures -> recommend diagnostics + failures = [d for d in world["deployments"].values() if d.get("service") == service and d.get("status") == "failed"] + if len(failures) > 2: + recommendations.append({ + "drift": drift, + "action": "diagnostics", + "message": f"Repeated deployment failures for {service}. Recommend diagnostics.", + "type": RECONCILE_BLOCKED + }) + else: + recommendations.append({ + "drift": drift, + "action": "redeploy", + "message": f"Deployment failed for {service}. Recommend retry.", + "type": RECONCILE_REQUIRED + }) + elif drift["type"] == "offline_node": + # Recommendation: node offline -> recommend failover review + recommendations.append({ + "drift": drift, + "action": "failover_review", + "message": f"Node {drift['node']} is offline. Recommend failover review.", + "type": RECONCILE_REQUIRED + }) + elif drift["type"] == "missing_service": + # Recommendation: dependency unavailable -> recommend delayed deployment + # Mock dependency check: if a service is 'webapp' and 'database' is not healthy + dependencies_met = True + if drift["service"] == "webapp": + db_svc = world["services"].get("database") + if not db_svc or db_svc.get("status") not in ("ok", "healthy", "up"): + dependencies_met = False + + if not dependencies_met: + recommendations.append({ + "drift": drift, + "action": "delayed_deployment", + "message": f"Dependency unavailable for {drift['service']}. Recommend delayed deployment.", + "type": RECONCILE_RECOMMENDED + }) + else: + recommendations.append({ + "drift": drift, + "action": "deploy", + "message": f"Service {drift['service']} is missing. Recommend deployment.", + "type": RECONCILE_REQUIRED + }) + elif drift["type"] == "unresolved_incident": + recommendations.append({ + "drift": drift, + "action": "review", + "message": f"Unresolved incident: {drift['description']}. Recommend review.", + "type": RECONCILE_RECOMMENDED + }) + return recommendations + +def save_checkpoint(state): + """Checkpoint support for idempotent operation.""" + try: + with open(CHECKPOINT_FILE, "w") as f: + json.dump({ + "last_run": time.time(), + "state": state + }, f) + except Exception as e: + print(f"Error saving checkpoint: {e}", file=sys.stderr) + +def load_checkpoint(): + """Load last run checkpoint.""" + if CHECKPOINT_FILE.exists(): + try: + with open(CHECKPOINT_FILE, "r") as f: + return json.load(f) + except Exception as e: + print(f"Error loading checkpoint: {e}", file=sys.stderr) + return None + +def main(): + print(f"--- Supervisor Run: {time.ctime()} ---") + + checkpoint = load_checkpoint() + if checkpoint: + print(f"Last run: {time.ctime(checkpoint.get('last_run', 0))}") + + # 1. Load desired state + desired = load_desired_state() + + # 2. Load world state + world = load_world_state() + + # 3. Detect drift + drifts = detect_drift(desired, world) + + # 4. Generate recommendations (Recommendation mode only) + recommendations = recommendation_engine(drifts, world) + + # 5. Emit events & Update summary state + if not recommendations and not drifts: + emit_event("summary_state", f"System state: {STATE_NOMINAL}", {"state": STATE_NOMINAL}) + else: + # Extend runtime summary states: nominal, degraded, unstable, reconciling + has_blocked = any(r["type"] == RECONCILE_BLOCKED for r in recommendations) + has_required = any(r["type"] == RECONCILE_REQUIRED for r in recommendations) + + overall_state = STATE_NOMINAL + if has_blocked: + overall_state = STATE_UNSTABLE + elif has_required: + overall_state = STATE_DEGRADED + elif recommendations: + overall_state = STATE_DEGRADED + + emit_event("summary_state", f"System state: {overall_state}", {"state": overall_state}) + + # Emit reconciliation events + for rec in recommendations: + emit_event(rec["type"], rec["message"], rec["drift"]) + + # 6. Save checkpoint + save_checkpoint({ + "drift_count": len(drifts), + "recommendation_count": len(recommendations), + "timestamp": time.time() + }) + + print("Run complete.") + +if __name__ == "__main__": + main() diff --git a/scripts/supervisor/test_scenarios.sh b/scripts/supervisor/test_scenarios.sh new file mode 100644 index 0000000..b03aed0 --- /dev/null +++ b/scripts/supervisor/test_scenarios.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# Generate realistic reconciliation drift scenarios for the homelab supervisor. + +set -e + +# Configuration +BASE_DIR=$(pwd) +# Use local directory for testing if /opt is not writable +WORLD_DIR="${HOMELAB_WORLD_ROOT:-$BASE_DIR/tmp/homelab/world}" +INVENTORY_DIR="$BASE_DIR/hosts" + +echo "Setting up homelab reconciliation scenarios..." +echo "World state: $WORLD_DIR" +echo "Inventory: $INVENTORY_DIR" + +# Cleanup +rm -rf "$WORLD_DIR" +rm -rf "$INVENTORY_DIR" + +# Create directories +mkdir -p "$INVENTORY_DIR/node1" +mkdir -p "$INVENTORY_DIR/node2" +mkdir -p "$WORLD_DIR/services" "$WORLD_DIR/nodes" "$WORLD_DIR/deployments" "$WORLD_DIR/incidents" + +# --- Scenario 1: Nominal --- +cat < "$INVENTORY_DIR/node1/services.yaml" +services: + - name: homeassistant +EOF + +cat < "$WORLD_DIR/services/homeassistant.json" +{"name": "homeassistant", "status": "healthy", "node": "node1"} +EOF + +cat < "$WORLD_DIR/nodes/node1.json" +{"name": "node1", "status": "online"} +EOF + +# --- Scenario 2: Unhealthy Service --- +cat < "$WORLD_DIR/services/homeassistant.json" +{"name": "homeassistant", "status": "unhealthy", "node": "node1"} +EOF + +# --- Scenario 3: Missing Service --- +cat < "$INVENTORY_DIR/node2/services.yaml" +services: + - name: webapp +EOF +# webapp is missing from world/services + +# --- Scenario 4: Dependency Unavailable (for Missing Service) --- +cat < "$WORLD_DIR/services/database.json" +{"name": "database", "status": "error", "node": "node2"} +EOF +# webapp depends on database in supervisor logic + +# --- Scenario 5: Offline Node --- +cat < "$WORLD_DIR/nodes/node2.json" +{"name": "node2", "status": "offline"} +EOF + +# --- Scenario 6: Repeated Deployment Failures --- +cat < "$WORLD_DIR/deployments/dep-001.json" +{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": $(date +%s)} +EOF +cat < "$WORLD_DIR/deployments/dep-002.json" +{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": $(( $(date +%s) - 300 ))} +EOF +cat < "$WORLD_DIR/deployments/dep-003.json" +{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": $(( $(date +%s) - 600 ))} +EOF + +# --- Scenario 7: Unresolved Incident --- +cat < "$WORLD_DIR/incidents/inc-99.json" +{"id": "inc-99", "description": "High memory usage on node1", "status": "investigating"} +EOF + +echo "Scenarios generated successfully." +echo "You can now run: HOMELAB_WORLD_ROOT=$WORLD_DIR python3 scripts/supervisor/supervisor.py" diff --git a/tmp/homelab/world/deployments/dep-001.json b/tmp/homelab/world/deployments/dep-001.json new file mode 100644 index 0000000..02db067 --- /dev/null +++ b/tmp/homelab/world/deployments/dep-001.json @@ -0,0 +1 @@ +{"id": "dep-001", "service": "webapp", "status": "failed", "timestamp": 1778597957} diff --git a/tmp/homelab/world/deployments/dep-002.json b/tmp/homelab/world/deployments/dep-002.json new file mode 100644 index 0000000..e977aa0 --- /dev/null +++ b/tmp/homelab/world/deployments/dep-002.json @@ -0,0 +1 @@ +{"id": "dep-002", "service": "webapp", "status": "failed", "timestamp": 1778597657} diff --git a/tmp/homelab/world/deployments/dep-003.json b/tmp/homelab/world/deployments/dep-003.json new file mode 100644 index 0000000..66f10c9 --- /dev/null +++ b/tmp/homelab/world/deployments/dep-003.json @@ -0,0 +1 @@ +{"id": "dep-003", "service": "webapp", "status": "failed", "timestamp": 1778597357} diff --git a/tmp/homelab/world/incidents/inc-99.json b/tmp/homelab/world/incidents/inc-99.json new file mode 100644 index 0000000..4f28449 --- /dev/null +++ b/tmp/homelab/world/incidents/inc-99.json @@ -0,0 +1 @@ +{"id": "inc-99", "description": "High memory usage on node1", "status": "investigating"} diff --git a/tmp/homelab/world/nodes/node1.json b/tmp/homelab/world/nodes/node1.json new file mode 100644 index 0000000..d246df0 --- /dev/null +++ b/tmp/homelab/world/nodes/node1.json @@ -0,0 +1 @@ +{"name": "node1", "status": "online"} diff --git a/tmp/homelab/world/nodes/node2.json b/tmp/homelab/world/nodes/node2.json new file mode 100644 index 0000000..bcc0d43 --- /dev/null +++ b/tmp/homelab/world/nodes/node2.json @@ -0,0 +1 @@ +{"name": "node2", "status": "offline"} diff --git a/tmp/homelab/world/services/database.json b/tmp/homelab/world/services/database.json new file mode 100644 index 0000000..4395a11 --- /dev/null +++ b/tmp/homelab/world/services/database.json @@ -0,0 +1 @@ +{"name": "database", "status": "error", "node": "node2"} diff --git a/tmp/homelab/world/services/homeassistant.json b/tmp/homelab/world/services/homeassistant.json new file mode 100644 index 0000000..50e31b7 --- /dev/null +++ b/tmp/homelab/world/services/homeassistant.json @@ -0,0 +1 @@ +{"name": "homeassistant", "status": "unhealthy", "node": "node1"}