diff --git a/docs/vps-control-plane.md b/docs/vps-control-plane.md new file mode 100644 index 0000000..59f7376 --- /dev/null +++ b/docs/vps-control-plane.md @@ -0,0 +1,78 @@ +# VPS Control Plane + +The VPS Control Plane is the orchestration brain of the homelab platform. It runs on the Hetzner VPS and provides observability, automated reconciliation, and a web-based operator interface. + +## Architecture + +The control plane consists of four core services running as a Docker Compose stack: + +1. **Observer**: Synthesizes world state from events. +2. **Supervisor**: Detects drifts between desired and actual state. +3. **Executor**: Executes approved actions from the queue. +4. **Operator UI**: Web interface for system monitoring and action approval. + +All services adhere to **filesystem-first** semantics, using `/opt/homelab/` as the primary data exchange and persistence layer. + +## Deployment Flow + +### 1. Prerequisites +- Target VPS node must be onboarded (Tailscale active, Docker installed). +- Repository cloned to `/home/oskar/homelab-codex-ws`. + +### 2. Bootstrap +Run the bootstrap script to initialize the runtime filesystem and start the stack: + +```bash +./scripts/bootstrap/vps-control-plane.sh +``` + +### 3. Verification +Verify the stack is healthy: + +```bash +cd services/control-plane +docker compose ps +curl http://localhost:8080/summary +``` + +## Operational Workflows + +### Action Approval +1. Access the Operator UI (via Tailscale IP or Nginx Proxy Manager). +2. Navigate to **Action Queue**. +3. Review **Pending** actions recommended by the Supervisor. +4. Click **Approve** to move actions to the execution queue. + +### Recovery Flow +In case of control plane failure: +1. Check logs: `docker compose logs -f`. +2. Restart stack: `docker compose restart`. +3. Rebuild world state: Delete `/opt/homelab/state/observer_checkpoint.json` and restart the observer service. + +### Upgrade Flow +1. Pull latest changes from git. +2. 
#!/usr/bin/env bash
# vps-control-plane.sh - Bootstrap script for VPS control plane
#
# Steps performed:
#   1-2. Verify that Docker and the Compose plugin are available.
#   3.   Create the runtime directory tree under $RUNTIME_DIR.
#   4.   Hand ownership of the tree to the invoking user and normalise modes.
#   5.   Install the environment file from the template if none exists yet.
#   6.   Build and start the Compose stack in services/control-plane.
#
# The script can be re-run: `mkdir -p` tolerates existing directories and an
# existing environment file is left untouched.

# -e: stop on the first failing command, -u: treat unset variables as errors,
# pipefail: a pipeline fails if any stage fails.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
RUNTIME_DIR="/opt/homelab"
VPS_CONFIG="$REPO_ROOT/hosts/vps/runtime"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

log() { echo -e "${GREEN}[INFO]${NC} $1"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
# Fatal errors go to stderr so they are not lost when stdout is redirected.
error() { echo -e "${RED}[ERROR]${NC} $1" >&2; exit 1; }

log "Starting VPS control plane bootstrap..."

# 1. Validate Docker availability
if ! command -v docker &> /dev/null; then
    error "Docker is not installed. Please install Docker first."
fi

# 2. Validate compose plugin
if ! docker compose version &> /dev/null; then
    error "Docker Compose plugin is not installed."
fi

log "Docker and Compose plugin verified."

# 3. Create filesystem-first runtime structure
log "Creating filesystem-first runtime structure in $RUNTIME_DIR..."
sudo mkdir -p "$RUNTIME_DIR/events" \
    "$RUNTIME_DIR/state" \
    "$RUNTIME_DIR/world" \
    "$RUNTIME_DIR/actions/pending" \
    "$RUNTIME_DIR/actions/approved" \
    "$RUNTIME_DIR/actions/running" \
    "$RUNTIME_DIR/actions/completed" \
    "$RUNTIME_DIR/actions/failed" \
    "$RUNTIME_DIR/actions/rejected" \
    "$RUNTIME_DIR/config" \
    "$RUNTIME_DIR/logs"

# 4. Set permissions
log "Setting permissions..."
# Use numeric ids from `id` instead of $USER, which may be unset or stale.
# NOTE(review): the Compose services run as UID 1000; confirm the invoking
# user has that UID, otherwise the containers cannot write here.
sudo chown -R "$(id -u):$(id -g)" "$RUNTIME_DIR"
# Capital X grants execute/search on directories (and on files that already
# have an execute bit) only, so data files are not marked executable.
chmod -R u=rwX,go=rX "$RUNTIME_DIR"

# 5. Install environment file
log "Installing environment configuration..."
if [ ! -f "$RUNTIME_DIR/config/control-plane.env" ]; then
    cp "$VPS_CONFIG/control-plane/env.example" "$RUNTIME_DIR/config/control-plane.env"
    log "Created $RUNTIME_DIR/config/control-plane.env from template."
else
    warn "Environment file already exists, skipping installation."
fi

# 6. Build and start the control plane
log "Building and starting control plane services..."
cd "$REPO_ROOT/services/control-plane"
docker compose build
docker compose up -d

log "VPS control plane bootstrap complete!"

echo -e "\n${YELLOW}Verification commands:${NC}"
echo "1. Check container status: docker compose ps"
echo "2. Check operator UI: curl http://localhost:8080/summary"
echo "3. Validate world state: ls -l $RUNTIME_DIR/world"
echo "4. Monitor events: tail -f $RUNTIME_DIR/events/*/*/*.json"
"""Executor service for the control plane.

Polls the ``approved`` action directory for JSON action files, runs each
action, and files the resulting record under ``completed`` or ``failed``.
"""
import os
import json
import time
import logging
import subprocess
from pathlib import Path

# Constants and Paths
RUNTIME_PATH = os.getenv("RUNTIME_PATH", "/opt/homelab")
ACTIONS_DIR = Path(RUNTIME_PATH) / "actions"
REPO_ROOT = Path(os.getenv("REPO_ROOT", "/repo"))

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("executor")


class Executor:
    """Run approved actions and record their outcome on the filesystem.

    Each action is a ``<action_id>.json`` file. Its life cycle is
    ``approved`` -> ``running`` -> ``completed`` | ``failed``; the record is
    rewritten with an updated ``status`` at every step.

    Args:
        actions_dir: Root of the action queue. Defaults to the module-level
            ``ACTIONS_DIR``.
        repo_root: Repository checkout containing ``scripts/deploy``.
            Defaults to the module-level ``REPO_ROOT``.
    """

    def __init__(self, actions_dir=None, repo_root=None):
        self.actions_dir = ACTIONS_DIR if actions_dir is None else Path(actions_dir)
        self.repo_root = REPO_ROOT if repo_root is None else Path(repo_root)
        self._ensure_dirs()

    def _ensure_dirs(self):
        """Create the state directories this service reads or writes."""
        for s in ["approved", "running", "completed", "failed", "rejected"]:
            (self.actions_dir / s).mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _write_json(path, payload):
        """Write *payload* to *path* atomically.

        The data is written to a sibling temp file and renamed into place, so
        a concurrent reader never sees a half-written document. The temp name
        ends in ``.tmp`` and therefore never matches a ``*.json`` glob.
        """
        tmp_path = path.with_name(f".{path.name}.tmp")
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
        os.replace(tmp_path, path)

    def process_actions(self):
        """Execute every approved action once, in sorted filename order."""
        approved_dir = self.actions_dir / "approved"
        for action_file in sorted(approved_dir.glob("*.json")):
            self._execute_action(action_file)

    def _execute_action(self, action_file):
        """Claim, run and finalise the action stored in *action_file*."""
        action_file = Path(action_file)
        action_id = action_file.stem
        logger.info(f"Executing action: {action_id}")

        data = self._claim(action_file, action_id)
        if data is None:
            return
        success, error_msg = self._run(data)
        self._finish(action_id, data, success, error_msg)

    def _quarantine(self, action_file, action_id, reason, raw_payload):
        """File an unusable approved action under ``failed``.

        Without this, a malformed file would stay in ``approved`` and be
        retried on every poll. The original content is kept under
        ``raw_payload`` for inspection.
        """
        logger.error(f"Failed to move {action_id} to running: {reason}")
        record = {
            "status": "failed",
            "finished_at": time.time(),
            "error": reason,
            "raw_payload": raw_payload,
        }
        try:
            self._write_json(self.actions_dir / "failed" / f"{action_id}.json", record)
            action_file.unlink()
        except OSError as e:
            logger.error(f"Failed to move {action_id} to failed: {e}")

    def _claim(self, action_file, action_id):
        """Move an approved action to ``running`` and return its payload.

        Returns:
            The action dict with ``status`` and ``started_at`` set, or
            ``None`` when the action could not be claimed.
        """
        try:
            raw = action_file.read_bytes()
        except OSError as e:
            # Possibly transient (e.g. permissions); leave the file for a retry.
            logger.error(f"Failed to move {action_id} to running: {e}")
            return None

        try:
            data = json.loads(raw)
        except ValueError as e:  # JSONDecodeError and UnicodeDecodeError
            self._quarantine(
                action_file,
                action_id,
                f"Invalid action JSON: {e}",
                raw.decode("utf-8", errors="replace"),
            )
            return None
        if not isinstance(data, dict):
            self._quarantine(
                action_file, action_id, "Action payload must be a JSON object", data
            )
            return None

        data["status"] = "running"
        data["started_at"] = time.time()
        running_path = self.actions_dir / "running" / f"{action_id}.json"
        try:
            self._write_json(running_path, data)
            action_file.unlink()
        except OSError as e:
            logger.error(f"Failed to move {action_id} to running: {e}")
            # Do not leave a stale running copy next to the approved file.
            try:
                running_path.unlink(missing_ok=True)
            except OSError:
                pass
            return None
        return data

    def _run(self, data):
        """Run the action described by *data*.

        Returns:
            ``(success, error_msg)``; ``error_msg`` is empty on success.
        """
        action_type = data.get("type")
        if action_type != "redeploy":
            return False, f"Unknown action type: {action_type}"

        node = data.get("node")
        service = data.get("service")
        # Validate first: a missing field would otherwise surface as an
        # obscure TypeError when the command line is assembled.
        for field, value in (("node", node), ("service", service)):
            if not isinstance(value, str) or not value:
                return False, f"Invalid or missing '{field}' for redeploy action"

        # Call deploy-node.sh; arguments are passed as a list, never via a shell.
        cmd = [
            str(self.repo_root / "scripts" / "deploy" / "deploy-node.sh"),
            node,
            service,
        ]
        logger.info(f"Running command: {' '.join(cmd)}")
        try:
            result = subprocess.run(
                cmd, capture_output=True, text=True, cwd=str(self.repo_root)
            )
        except (OSError, ValueError, subprocess.SubprocessError) as e:
            return False, str(e)

        if result.returncode == 0:
            return True, ""
        return False, result.stderr or result.stdout

    def _finish(self, action_id, data, success, error_msg):
        """Move a running action to ``completed`` or ``failed``."""
        target_status = "completed" if success else "failed"
        target_path = self.actions_dir / target_status / f"{action_id}.json"
        running_path = self.actions_dir / "running" / f"{action_id}.json"
        try:
            data["status"] = target_status
            data["finished_at"] = time.time()
            if not success:
                data["error"] = error_msg
            self._write_json(target_path, data)
            running_path.unlink()
            logger.info(f"Action {action_id} {target_status}")
        except OSError as e:
            logger.error(f"Failed to move {action_id} to {target_status}: {e}")

    def loop(self, interval=10):
        """Poll for approved actions forever, sleeping *interval* seconds between polls."""
        logger.info("Starting executor loop")
        while True:
            try:
                self.process_actions()
            except Exception:
                # Top-level boundary: log and keep polling.
                logger.exception("Unexpected error while processing actions")
            time.sleep(interval)


if __name__ == "__main__":
    executor = Executor()
    executor.loop()
b/services/control-plane/src/index.html @@ -0,0 +1,701 @@ + + +
+ + +