# homelab-codex-ws/services/control-plane/src/executor.py

import json
import logging
import os
import shlex
import subprocess
import time
from pathlib import Path

# Filesystem layout
# RUNTIME_PATH and REPO_ROOT can both be overridden through the environment.
RUNTIME_PATH = os.environ.get("RUNTIME_PATH", "/opt/homelab")
ACTIONS_DIR = Path(RUNTIME_PATH, "actions")
REPO_ROOT = Path(os.environ.get("REPO_ROOT", "/repo"))

# SSH settings
# SSH_USER can be overridden per-deployment environment.
SSH_USER = os.environ.get("SSH_USER", "oskar")
# Each setting is expanded to an ("-o", "<setting>") pair on the ssh command line.
# NOTE: StrictHostKeyChecking=no turns off host key verification, and
# BatchMode=yes makes ssh fail instead of prompting for a password.
SSH_OPTIONS = [
    arg
    for setting in (
        "StrictHostKeyChecking=no",
        "ConnectTimeout=10",
        "BatchMode=yes",
    )
    for arg in ("-o", setting)
]

# Logging: root logger at INFO with a timestamped format, plus a named logger
# used by everything in this module.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("executor")


class Executor:
    """
    File-queue action executor.

    Picks up ``*.json`` action files from ``ACTIONS_DIR/approved``, runs the
    action they describe, and moves each file through ``running`` into
    either ``completed`` or ``failed``. Supported action types are
    ``redeploy``, ``container_restart``, ``disk_cleanup`` and ``alert_only``.
    """

    # Substrings that cause a disk_cleanup command to be rejected outright.
    _FORBIDDEN_CLEANUP_PATTERNS = (
        "/opt/homelab/data",
        "/opt/homelab/config",
        "/opt/homelab/state",
        "rm -rf /",
    )

    def __init__(self):
        self._ensure_dirs()

    def _ensure_dirs(self):
        """Create every action-state directory under ACTIONS_DIR if missing."""
        for s in ["approved", "running", "completed", "failed", "rejected"]:
            (ACTIONS_DIR / s).mkdir(parents=True, exist_ok=True)

    def process_actions(self):
        """
        Refresh the heartbeat file, then execute every approved action.

        Actions are processed one at a time in sorted filename order. A
        heartbeat failure is logged but never prevents action processing.
        """
        # Update heartbeat
        heartbeat_file = ACTIONS_DIR.parent / "state" / "executor.heartbeat"
        try:
            # Create the state directory on demand; otherwise touch() fails
            # on every single loop iteration when it does not exist yet.
            heartbeat_file.parent.mkdir(parents=True, exist_ok=True)
            heartbeat_file.touch()
        except Exception as e:
            logger.error(f"Failed to touch heartbeat file: {e}")

        approved_dir = ACTIONS_DIR / "approved"
        for action_file in sorted(approved_dir.glob("*.json")):
            self._execute_action(action_file)

    def _execute_action(self, action_file):
        """
        Run a single approved action file through its full lifecycle.

        The file is first rewritten into ``running/`` (status and start time
        added) and removed from ``approved/``. If that step fails the action
        is left where it is and nothing is executed. After dispatch, the
        record is written to ``completed/`` or ``failed/`` (with an ``error``
        field on failure) and the ``running/`` copy is removed.

        Exceptions raised while executing the action are caught and turned
        into a failed result; this method does not raise for them.
        """
        action_id = action_file.stem
        logger.info(f"Executing action: {action_id}")

        # Move to running
        running_path = ACTIONS_DIR / "running" / f"{action_id}.json"
        try:
            with open(action_file, "r") as f:
                data = json.load(f)
            data["status"] = "running"
            data["started_at"] = time.time()
            with open(running_path, "w") as f:
                json.dump(data, f, indent=2)
            action_file.unlink()
        except Exception as e:
            logger.error(f"Failed to move {action_id} to running: {e}")
            return

        # Dispatch by action type; any exception marks the action as failed.
        try:
            success, error_msg = self._dispatch(data)
        except Exception as e:
            success = False
            error_msg = str(e)

        # Move to completed/failed
        target_status = "completed" if success else "failed"
        target_path = ACTIONS_DIR / target_status / f"{action_id}.json"
        try:
            data["status"] = target_status
            data["finished_at"] = time.time()
            if not success:
                data["error"] = error_msg
            with open(target_path, "w") as f:
                json.dump(data, f, indent=2)
            running_path.unlink()
            logger.info(f"Action {action_id} {target_status}")
        except Exception as e:
            logger.error(f"Failed to move {action_id} to {target_status}: {e}")

    def _dispatch(self, data):
        """
        Route an action record to the handler for its ``type`` field.

        Returns (success: bool, error_msg: str).
        """
        action_type = data.get("type")
        node = data.get("node")
        service = data.get("service")

        if action_type == "redeploy":
            return self._execute_redeploy(node, service)

        if action_type == "container_restart":
            # Lightweight restart: SSH to node and docker restart the container.
            # container_name falls back to the service name when absent/empty.
            container_name = data.get("container_name") or service
            return self._execute_container_restart(node, container_name)

        if action_type == "disk_cleanup":
            # Commands come from the action payload; the handler applies a
            # safety check before anything is sent to the node.
            payload = data.get("payload", {})
            return self._execute_disk_cleanup(node, payload)

        if action_type == "alert_only":
            # Nothing to execute for this type; it always succeeds.
            return True, ""

        return False, f"Unknown action type: {action_type}"

    def _execute_redeploy(self, node, service):
        """
        Run ``scripts/deploy/deploy-node.sh <node> <service>`` from REPO_ROOT.

        Returns (success: bool, error_msg: str). On a non-zero exit status
        the error message is the script's stderr, or stdout if stderr is
        empty.
        """
        # Non-string arguments cannot be passed to the script; report that
        # directly instead of surfacing an opaque TypeError message.
        if not isinstance(node, str) or not isinstance(service, str):
            msg = "redeploy requires string 'node' and 'service' fields"
            logger.error(msg)
            return False, msg

        cmd = [
            str(REPO_ROOT / "scripts" / "deploy" / "deploy-node.sh"),
            node,
            service,
        ]
        logger.info(f"Running command: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, cwd=str(REPO_ROOT))
        if result.returncode == 0:
            return True, ""
        return False, result.stderr or result.stdout

    def _execute_container_restart(self, node, container_name, retry_delay=10):
        """
        SSH to the target node and run `docker restart <container_name>`.

        Attempts the restart up to 2 times (initial + 1 retry). If the first
        attempt fails, waits retry_delay seconds then tries once more before
        declaring the action failed.

        A missing/non-string node or container name, or a container name
        starting with "-", is rejected before any SSH connection is made.
        The container name is shell-quoted because the remote side passes
        the command string through a shell.

        Returns (success: bool, error_msg: str).
        """
        if not isinstance(node, str) or not node:
            msg = f"container_restart requires a non-empty 'node', got {node!r}"
            logger.error(msg)
            return False, msg
        if not isinstance(container_name, str) or not container_name:
            msg = (
                "container_restart requires a container or service name, "
                f"got {container_name!r}"
            )
            logger.error(msg)
            return False, msg
        if container_name.startswith("-"):
            # Would be parsed by docker as an option rather than a name.
            msg = f"container_restart rejected container name {container_name!r}"
            logger.error(msg)
            return False, msg

        cmd = [
            "ssh",
            *SSH_OPTIONS,
            f"{SSH_USER}@{node}",
            # quote() leaves ordinary container names unchanged and
            # neutralises shell metacharacters in anything else.
            f"docker restart {shlex.quote(container_name)}",
        ]
        logger.info(f"SSH container restart: {' '.join(cmd)}")

        max_attempts = 2
        last_error = ""

        for attempt in range(1, max_attempts + 1):
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                logger.info(
                    f"Container '{container_name}' on {node} restarted successfully "
                    f"(attempt {attempt}/{max_attempts})"
                )
                return True, ""

            last_error = (result.stderr or result.stdout).strip()
            logger.warning(
                f"container_restart attempt {attempt}/{max_attempts} failed "
                f"for '{container_name}' on {node}: {last_error}"
            )

            if attempt < max_attempts:
                logger.info(f"Retrying in {retry_delay}s...")
                time.sleep(retry_delay)

        logger.error(
            f"container_restart exhausted all {max_attempts} attempts "
            f"for '{container_name}' on {node}"
        )
        return False, last_error

    @classmethod
    def _check_cleanup_commands(cls, commands):
        """
        Validate the command list of a disk_cleanup action.

        Returns an empty string when the commands may run, otherwise a
        human-readable rejection message. Pure function: no logging, no I/O.

        Rejected inputs:
          - anything that is not a list/tuple (a bare string would otherwise
            be iterated character by character and joined with " && "),
          - an empty list (would send an empty remote command to ssh),
          - entries that are not non-empty strings,
          - entries containing a forbidden pattern, checked both verbatim
            and after collapsing whitespace runs and repeated slashes.

        This is a substring check, not a shell parser; it is a best-effort
        guard and does not catch every way of reaching a protected path.
        """
        if not isinstance(commands, (list, tuple)):
            return "Rejected: 'commands' must be a list of command strings"
        if not commands:
            return "Rejected: 'commands' is empty; nothing to run"

        for cmd in commands:
            if not isinstance(cmd, str) or not cmd.strip():
                return f"Rejected: command is not a non-empty string: {cmd!r}"

            # Normalise so that "rm   -rf  /" and "/opt/homelab//data" are
            # matched the same way as their canonical spellings.
            normalized = " ".join(cmd.split())
            while "//" in normalized:
                normalized = normalized.replace("//", "/")

            for forbidden in cls._FORBIDDEN_CLEANUP_PATTERNS:
                if forbidden in cmd or forbidden in normalized:
                    return f"Rejected: command contains forbidden pattern '{forbidden}': {cmd}"

        return ""

    def _execute_disk_cleanup(self, node: str, payload: dict):
        """
        SSH to the target node and run the disk cleanup commands from the
        action payload (``payload["commands"]``), joined with " && ". When
        the payload has no "commands" key, a default image prune plus volume
        prune is used.

        Safety invariants enforced here regardless of payload content:
          - No command may reference /opt/homelab/data, /opt/homelab/config,
            or /opt/homelab/state.
          - No command may contain "rm -rf /".
          - The payload must be a dict and "commands" a non-empty list of
            non-empty strings.
        If any check fails the entire action is rejected (nothing is run)
        and the rejection reason is returned as the error message.

        Returns (success: bool, error_msg: str).
        """
        if not isinstance(node, str) or not node:
            msg = f"Rejected: disk_cleanup requires a non-empty 'node', got {node!r}"
            logger.error(msg)
            return False, msg
        if not isinstance(payload, dict):
            msg = f"Rejected: disk_cleanup payload must be an object, got {type(payload).__name__}"
            logger.error(msg)
            return False, msg

        commands = payload.get("commands", [
            "docker image prune -a -f",
            "docker volume prune -f",
        ])

        # Safety gate: reject malformed command lists and protected paths.
        rejection = self._check_cleanup_commands(commands)
        if rejection:
            logger.error(rejection)
            return False, rejection

        full_command = " && ".join(commands)
        cmd = [
            "ssh",
            *SSH_OPTIONS,
            f"{SSH_USER}@{node}",
            full_command,
        ]
        logger.info(f"Disk cleanup on {node}: {full_command}")

        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode == 0:
            logger.info(f"Disk cleanup on {node} succeeded")
            return True, ""

        error_msg = (result.stderr or result.stdout).strip()
        logger.error(f"Disk cleanup on {node} failed: {error_msg}")
        return False, error_msg

    def loop(self, interval=10):
        """Process approved actions forever, sleeping ``interval`` seconds between passes."""
        logger.info("Starting executor loop")
        while True:
            self.process_actions()
            time.sleep(interval)


if __name__ == "__main__":
    # Script entry point: build the executor (which creates the action
    # directories) and poll with the default interval until interrupted.
    Executor().loop()