"""Action executor for the homelab runtime.

Polls ``<RUNTIME_PATH>/actions/approved`` for JSON action files, executes each
one according to its ``type`` field, and files the result under
``actions/completed`` or ``actions/failed``.
"""

import json
import logging
import os
import shlex
import subprocess
import time
from pathlib import Path

# Constants and Paths
RUNTIME_PATH = os.getenv("RUNTIME_PATH", "/opt/homelab")
ACTIONS_DIR = Path(RUNTIME_PATH) / "actions"
REPO_ROOT = Path(os.getenv("REPO_ROOT", "/repo"))

# Sub-directories of the actions directory, one per lifecycle state.
ACTION_STATES = ("approved", "running", "completed", "failed", "rejected")

# SSH configuration
# SSH_USER can be overridden per-deployment environment.
SSH_USER = os.getenv("SSH_USER", "oskar")
# NOTE(review): StrictHostKeyChecking=no disables host-key verification, so a
# spoofed node would be trusted. Kept as-is; confirm this is acceptable.
SSH_OPTIONS = [
    "-o", "StrictHostKeyChecking=no",
    "-o", "ConnectTimeout=10",
    "-o", "BatchMode=yes",
]

# Commands used for disk_cleanup when the payload has no "commands" key.
DEFAULT_CLEANUP_COMMANDS = (
    "docker image prune -a -f",
    "docker volume prune -f",
)

# Substrings that cause a disk_cleanup command to be rejected outright.
# This is a plain substring match: it is a best-effort guard, not a parser,
# and can be evaded by equivalent spellings (e.g. "rm -fr /").
FORBIDDEN_PATTERNS = (
    "/opt/homelab/data",
    "/opt/homelab/config",
    "/opt/homelab/state",
    "rm -rf /",
)

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("executor")


def _missing_field_error(**fields):
    """Return an error message for the first field that is not a non-empty
    string, or "" when every field is usable."""
    for name, value in fields.items():
        if not isinstance(value, str) or not value.strip():
            return f"Missing or invalid '{name}' in action"
    return ""


class Executor:
    """Runs approved actions and records their outcome on disk.

    Each action is a JSON object stored as ``<action_id>.json``. The file
    moves approved -> running -> completed/failed as it is processed.
    """

    def __init__(self, actions_dir=None):
        """Create the executor and make sure the state directories exist.

        Args:
            actions_dir: Root directory holding the per-state sub-directories.
                Defaults to the module-level ``ACTIONS_DIR``.
        """
        self.actions_dir = ACTIONS_DIR if actions_dir is None else Path(actions_dir)
        self._ensure_dirs()

    def _ensure_dirs(self):
        """Create one sub-directory per action state if it is missing."""
        for state in ACTION_STATES:
            (self.actions_dir / state).mkdir(parents=True, exist_ok=True)

    def process_actions(self):
        """Refresh the heartbeat file, then execute every approved action.

        Approved files are processed in sorted filename order. A heartbeat
        failure is logged and does not stop action processing.
        """
        # Update heartbeat
        heartbeat_file = self.actions_dir.parent / "state" / "executor.heartbeat"
        try:
            # The state directory is not one of ACTION_STATES, so create it
            # here; otherwise touch() fails on every poll on a fresh install.
            heartbeat_file.parent.mkdir(parents=True, exist_ok=True)
            heartbeat_file.touch()
        except Exception as e:
            logger.error(f"Failed to touch heartbeat file: {e}")

        approved_dir = self.actions_dir / "approved"
        for action_file in sorted(approved_dir.glob("*.json")):
            self._execute_action(action_file)

    def _execute_action(self, action_file):
        """Run one approved action file through its full lifecycle.

        The file is rewritten into ``running`` (and removed from
        ``approved``), dispatched by its ``type``, then written to
        ``completed`` or ``failed`` with ``status``, ``finished_at`` and, on
        failure, ``error``. If the file cannot be read or moved to
        ``running`` the action is left where it is and retried on the next
        poll.
        """
        action_id = action_file.stem
        logger.info(f"Executing action: {action_id}")

        # Move to running
        running_path = self.actions_dir / "running" / f"{action_id}.json"
        try:
            with open(action_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            if not isinstance(data, dict):
                raise ValueError("action file must contain a JSON object")
            data["status"] = "running"
            data["started_at"] = time.time()
            with open(running_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            action_file.unlink()
        except Exception as e:
            logger.error(f"Failed to move {action_id} to running: {e}")
            return

        # Dispatch by action type; any unexpected exception fails the action
        # instead of killing the executor loop.
        try:
            success, error_msg = self._dispatch(data)
        except Exception as e:
            success, error_msg = False, str(e)

        # Move to completed/failed
        target_status = "completed" if success else "failed"
        target_path = self.actions_dir / target_status / f"{action_id}.json"
        try:
            data["status"] = target_status
            data["finished_at"] = time.time()
            if not success:
                data["error"] = error_msg
            with open(target_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            running_path.unlink()
            logger.info(f"Action {action_id} {target_status}")
        except Exception as e:
            logger.error(f"Failed to move {action_id} to {target_status}: {e}")

    def _dispatch(self, data):
        """Route an action dict to its handler; return (success, error_msg)."""
        action_type = data.get("type")
        node = data.get("node")
        service = data.get("service")

        if action_type == "redeploy":
            return self._execute_redeploy(node, service)

        if action_type == "container_restart":
            # Lightweight restart: SSH to node and docker restart the container.
            # container_name falls back to the service name when absent.
            container_name = data.get("container_name") or service
            return self._execute_container_restart(node, container_name)

        if action_type == "disk_cleanup":
            # Commands come from the action payload; the handler applies a
            # safety check before anything is run.
            return self._execute_disk_cleanup(node, data.get("payload", {}))

        if action_type == "alert_only":
            # Operator acknowledged the alert; no automated execution needed.
            return True, ""

        return False, f"Unknown action type: {action_type}"

    def _execute_redeploy(self, node, service):
        """Run the repo's ``scripts/deploy/deploy-node.sh <node> <service>``.

        The script is executed locally with REPO_ROOT as working directory.

        Returns (success: bool, error_msg: str). On a non-zero exit the error
        is the script's stderr, or stdout when stderr is empty.
        """
        problem = _missing_field_error(node=node, service=service)
        if problem:
            return False, problem

        cmd = [
            str(REPO_ROOT / "scripts" / "deploy" / "deploy-node.sh"),
            node,
            service,
        ]
        logger.info(f"Running command: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, cwd=str(REPO_ROOT))
        if result.returncode == 0:
            return True, ""
        return False, result.stderr or result.stdout

    @staticmethod
    def _build_restart_command(node, container_name):
        """Build the ssh argv that restarts ``container_name`` on ``node``."""
        return [
            "ssh",
            *SSH_OPTIONS,
            f"{SSH_USER}@{node}",
            # The last argument is interpreted by the remote shell, so the
            # name must be quoted to prevent command injection.
            f"docker restart {shlex.quote(container_name)}",
        ]

    def _execute_container_restart(self, node, container_name, retry_delay=10):
        """
        SSH to the target node and run `docker restart <container_name>`.

        Attempts the restart up to 2 times (initial + 1 retry). If the first
        attempt fails, waits retry_delay seconds then tries once more before
        declaring the action failed.

        Returns (success: bool, error_msg: str). A missing node or container
        name fails immediately without contacting any host.
        """
        problem = _missing_field_error(node=node, container_name=container_name)
        if problem:
            return False, problem

        cmd = self._build_restart_command(node, container_name)
        logger.info(f"SSH container restart: {' '.join(cmd)}")

        max_attempts = 2
        last_error = ""

        for attempt in range(1, max_attempts + 1):
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                logger.info(
                    f"Container '{container_name}' on {node} restarted successfully "
                    f"(attempt {attempt}/{max_attempts})"
                )
                return True, ""

            last_error = (result.stderr or result.stdout).strip()
            logger.warning(
                f"container_restart attempt {attempt}/{max_attempts} failed "
                f"for '{container_name}' on {node}: {last_error}"
            )
            if attempt < max_attempts:
                logger.info(f"Retrying in {retry_delay}s...")
                time.sleep(retry_delay)

        logger.error(
            f"container_restart exhausted all {max_attempts} attempts "
            f"for '{container_name}' on {node}"
        )
        return False, last_error

    def _execute_disk_cleanup(self, node: str, payload: dict):
        """
        SSH to the target node and run the disk cleanup commands from the
        action payload, joined with ``&&`` into one remote command line.

        When the payload has no "commands" key, DEFAULT_CLEANUP_COMMANDS
        (docker image prune -a + volume prune) is used.

        Safety checks enforced before anything is run:
          - payload must be a dict and "commands" a non-empty list of
            non-empty strings.
          - No command may contain any FORBIDDEN_PATTERNS substring
            (/opt/homelab/data, /opt/homelab/config, /opt/homelab/state,
            "rm -rf /").

        If any check fails the entire action is rejected (not run at all) and
        the rejection reason is returned.

        Returns (success: bool, error_msg: str).
        """
        problem = _missing_field_error(node=node)
        if problem:
            return False, problem

        if not isinstance(payload, dict):
            msg = "Rejected: payload must be a JSON object"
            logger.error(msg)
            return False, msg

        commands = payload.get("commands", list(DEFAULT_CLEANUP_COMMANDS))

        # A bare string would be iterated per character, and an empty list
        # would make ssh request an interactive shell; reject both.
        well_formed = (
            isinstance(commands, (list, tuple))
            and len(commands) > 0
            and all(isinstance(c, str) and c.strip() for c in commands)
        )
        if not well_formed:
            msg = "Rejected: 'commands' must be a non-empty list of non-empty strings"
            logger.error(msg)
            return False, msg

        # Safety gate: reject commands that touch protected paths
        for command in commands:
            for forbidden in FORBIDDEN_PATTERNS:
                if forbidden in command:
                    msg = f"Rejected: command contains forbidden pattern '{forbidden}': {command}"
                    logger.error(msg)
                    return False, msg

        full_command = " && ".join(commands)
        cmd = [
            "ssh",
            *SSH_OPTIONS,
            f"{SSH_USER}@{node}",
            full_command,
        ]
        logger.info(f"Disk cleanup on {node}: {full_command}")

        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode == 0:
            logger.info(f"Disk cleanup on {node} succeeded")
            return True, ""

        error_msg = (result.stderr or result.stdout).strip()
        logger.error(f"Disk cleanup on {node} failed: {error_msg}")
        return False, error_msg

    def loop(self, interval=10):
        """Call process_actions() forever, sleeping ``interval`` seconds
        between polls."""
        logger.info("Starting executor loop")
        while True:
            self.process_actions()
            time.sleep(interval)


if __name__ == "__main__":
    executor = Executor()
    executor.loop()