homelab-codex-ws/services/control-plane/src/executor.py

import json
import logging
import os
import shlex
import subprocess
import time
from pathlib import Path

# Filesystem locations; each root can be overridden through the environment.
RUNTIME_PATH = os.environ.get("RUNTIME_PATH", "/opt/homelab")
ACTIONS_DIR = Path(RUNTIME_PATH, "actions")
REPO_ROOT = Path(os.environ.get("REPO_ROOT", "/repo"))

# SSH settings shared by every remote command the executor issues.
# SSH_USER can be overridden per-deployment environment.
# NOTE(review): StrictHostKeyChecking=no turns off host-key verification for
# these connections -- confirm this is intended for the target network.
SSH_USER = os.environ.get("SSH_USER", "oskar")
SSH_OPTIONS = [
    arg
    for option in (
        "StrictHostKeyChecking=no",
        "ConnectTimeout=10",
        "BatchMode=yes",
    )
    for arg in ("-o", option)
]

# Root logging configuration plus the module's named logger.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("executor")


class Executor:
    """File-queue worker that runs operator-approved remediation actions.

    Each action is a JSON file that travels through sub-directories of
    ACTIONS_DIR: ``approved`` -> ``running`` -> ``completed`` or ``failed``.
    Supported action types are ``redeploy``, ``container_restart`` and
    ``disk_cleanup``; any other type is recorded as failed.
    """

    # Substrings that make the disk_cleanup safety gate reject a command.
    FORBIDDEN_PATTERNS = (
        "/opt/homelab/data",
        "/opt/homelab/config",
        "/opt/homelab/state",
        "rm -rf /",
    )

    # Commands used when a disk_cleanup payload has no "commands" key.
    DEFAULT_CLEANUP_COMMANDS = (
        "docker image prune -a -f",
        "docker volume prune -f",
    )

    def __init__(self):
        self._ensure_dirs()

    def _ensure_dirs(self):
        """Create every action queue directory under ACTIONS_DIR."""
        for s in ["approved", "running", "completed", "failed", "rejected"]:
            (ACTIONS_DIR / s).mkdir(parents=True, exist_ok=True)

    def process_actions(self):
        """Refresh the heartbeat file, then run every approved action.

        Approved actions are processed in sorted filename order. A heartbeat
        failure is logged and does not stop action processing.
        """
        # Update heartbeat
        heartbeat_file = ACTIONS_DIR.parent / "state" / "executor.heartbeat"
        try:
            # _ensure_dirs() only creates the action queues, so make sure the
            # state directory exists before touching the heartbeat file.
            heartbeat_file.parent.mkdir(parents=True, exist_ok=True)
            heartbeat_file.touch()
        except Exception as e:
            logger.error(f"Failed to touch heartbeat file: {e}")

        approved_dir = ACTIONS_DIR / "approved"
        action_files = sorted(approved_dir.glob("*.json"))

        for action_file in action_files:
            self._execute_action(action_file)

    def _execute_action(self, action_file):
        """Run one approved action file and record its outcome.

        The action is first rewritten into ``running`` (status and start time
        added) and removed from ``approved``; if that fails the action is
        left where it is and nothing is executed. After dispatch the record
        is written to ``completed`` or ``failed`` with a finish time and, on
        failure, the error message.
        """
        action_id = action_file.stem
        logger.info(f"Executing action: {action_id}")

        # Move to running
        running_path = ACTIONS_DIR / "running" / f"{action_id}.json"
        try:
            with open(action_file, "r") as f:
                data = json.load(f)
            data["status"] = "running"
            data["started_at"] = time.time()
            with open(running_path, "w") as f:
                json.dump(data, f, indent=2)
            action_file.unlink()
        except Exception as e:
            logger.error(f"Failed to move {action_id} to running: {e}")
            return

        # Dispatch by action type; any unexpected exception marks the action
        # as failed instead of crashing the executor loop.
        try:
            success, error_msg = self._dispatch(data)
        except Exception as e:
            success = False
            error_msg = str(e)

        # Move to completed/failed
        target_status = "completed" if success else "failed"
        target_path = ACTIONS_DIR / target_status / f"{action_id}.json"
        try:
            data["status"] = target_status
            data["finished_at"] = time.time()
            if not success:
                data["error"] = error_msg
            with open(target_path, "w") as f:
                json.dump(data, f, indent=2)
            running_path.unlink()
            logger.info(f"Action {action_id} {target_status}")
        except Exception as e:
            logger.error(f"Failed to move {action_id} to {target_status}: {e}")

    def _dispatch(self, data):
        """Route an action record to its handler based on ``data["type"]``.

        Returns (success: bool, error_msg: str).
        """
        action_type = data.get("type")
        node = data.get("node")
        service = data.get("service")

        if action_type == "redeploy":
            return self._execute_redeploy(node, service)

        if action_type == "container_restart":
            # Lightweight restart: SSH to node and docker restart the container.
            # container_name is set by the supervisor; falls back to service name.
            container_name = data.get("container_name") or service
            return self._execute_container_restart(node, container_name)

        if action_type == "disk_cleanup":
            # Operator-approved aggressive Docker cleanup (image prune -a +
            # volume prune). Commands come from the action payload so the
            # supervisor controls exactly what runs; the executor adds a
            # safety check to reject anything touching protected paths.
            return self._execute_disk_cleanup(node, data.get("payload", {}))

        return False, f"Unknown action type: {action_type}"

    def _execute_redeploy(self, node, service):
        """Run the repo deploy script for ``service`` on ``node``.

        Returns (success: bool, error_msg: str). On a non-zero exit the error
        message is the script's stderr, or its stdout when stderr is empty.
        """
        if not isinstance(node, str) or not isinstance(service, str):
            # A missing field used to surface as an opaque TypeError raised by
            # str.join(); report the actual problem instead.
            return False, "redeploy requires string 'node' and 'service' fields"

        cmd = [
            str(REPO_ROOT / "scripts" / "deploy" / "deploy-node.sh"),
            node,
            service,
        ]
        logger.info(f"Running command: {' '.join(cmd)}")
        # NOTE(review): no timeout is passed, so a hung deploy script blocks
        # the executor loop until it exits.
        result = subprocess.run(cmd, capture_output=True, text=True, cwd=str(REPO_ROOT))
        if result.returncode == 0:
            return True, ""
        return False, result.stderr or result.stdout

    @staticmethod
    def _remote_restart_command(container_name):
        """Build the remote shell command that restarts ``container_name``.

        The command string is interpreted by a shell on the remote host, so
        the name is quoted to stop shell metacharacters in it from being
        executed. Ordinary container names are left unchanged by the quoting.
        """
        return f"docker restart {shlex.quote(container_name)}"

    def _execute_container_restart(self, node, container_name, retry_delay=10):
        """
        SSH to the target node and run `docker restart <container_name>`.

        Attempts the restart up to 2 times (initial + 1 retry). If the first
        attempt fails, waits retry_delay seconds then tries once more before
        declaring the action failed. A missing or non-string node or
        container name is rejected without contacting the node.

        Returns (success: bool, error_msg: str).
        """
        for label, value in (("node", node), ("container_name", container_name)):
            if not isinstance(value, str) or not value:
                msg = f"Rejected: container_restart requires a non-empty '{label}'"
                logger.error(msg)
                return False, msg

        cmd = [
            "ssh",
            *SSH_OPTIONS,
            f"{SSH_USER}@{node}",
            self._remote_restart_command(container_name),
        ]
        logger.info(f"SSH container restart: {' '.join(cmd)}")

        max_attempts = 2
        last_error = ""

        for attempt in range(1, max_attempts + 1):
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                logger.info(
                    f"Container '{container_name}' on {node} restarted successfully "
                    f"(attempt {attempt}/{max_attempts})"
                )
                return True, ""

            last_error = (result.stderr or result.stdout).strip()
            logger.warning(
                f"container_restart attempt {attempt}/{max_attempts} failed "
                f"for '{container_name}' on {node}: {last_error}"
            )

            if attempt < max_attempts:
                logger.info(f"Retrying in {retry_delay}s...")
                time.sleep(retry_delay)

        logger.error(
            f"container_restart exhausted all {max_attempts} attempts "
            f"for '{container_name}' on {node}"
        )
        return False, last_error

    @classmethod
    def _rejection_reason(cls, commands):
        """Return why ``commands`` must not run, or "" when they pass the gate.

        ``commands`` must be a non-empty list (or tuple) of non-blank strings.
        Each command is matched against FORBIDDEN_PATTERNS after collapsing
        runs of whitespace and repeated slashes, so spellings such as
        ``rm  -rf /`` or ``/opt/homelab//data`` are caught as well.

        NOTE: this is a best-effort substring denylist, not a shell parser;
        it does not recognise every destructive spelling (e.g. ``rm -fr``).
        """
        if not isinstance(commands, (list, tuple)) or not commands:
            return "Rejected: 'commands' must be a non-empty list of strings"

        for cmd in commands:
            if not isinstance(cmd, str) or not cmd.strip():
                return f"Rejected: command is not a non-empty string: {cmd!r}"

            normalized = " ".join(cmd.split())
            while "//" in normalized:
                normalized = normalized.replace("//", "/")

            for forbidden in cls.FORBIDDEN_PATTERNS:
                if forbidden in normalized:
                    return f"Rejected: command contains forbidden pattern '{forbidden}': {cmd}"

        return ""

    def _execute_disk_cleanup(self, node: str, payload: dict):
        """
        SSH to the target node and run the operator-approved disk cleanup
        commands from the action payload.

        Safety invariants enforced here regardless of payload content:
          - No command may reference /opt/homelab/data/, /opt/homelab/config/,
            or /opt/homelab/state/ (application data and configuration).
          - No command may contain rm -rf / or similar destructive patterns.
          - The command list must be a non-empty list of strings, so the
            executor never invokes ssh with an empty remote command.
        If any command fails the safety check the entire action is rejected
        (not run at all) and the rejection reason is recorded.

        Returns (success: bool, error_msg: str).
        """
        if not isinstance(payload, dict):
            msg = "Rejected: disk_cleanup payload must be a JSON object"
            logger.error(msg)
            return False, msg

        if not isinstance(node, str) or not node:
            msg = "Rejected: disk_cleanup requires a non-empty 'node'"
            logger.error(msg)
            return False, msg

        commands = payload.get("commands", list(self.DEFAULT_CLEANUP_COMMANDS))

        # Safety gate: reject malformed command lists and protected paths
        reason = self._rejection_reason(commands)
        if reason:
            logger.error(reason)
            return False, reason

        full_command = " && ".join(commands)
        cmd = [
            "ssh",
            *SSH_OPTIONS,
            f"{SSH_USER}@{node}",
            full_command,
        ]
        logger.info(f"Disk cleanup on {node}: {full_command}")

        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode == 0:
            logger.info(f"Disk cleanup on {node} succeeded")
            return True, ""

        error_msg = (result.stderr or result.stdout).strip()
        logger.error(f"Disk cleanup on {node} failed: {error_msg}")
        return False, error_msg

    def loop(self, interval=10):
        """Call process_actions() forever, sleeping ``interval`` seconds between runs."""
        logger.info("Starting executor loop")
        while True:
            self.process_actions()
            time.sleep(interval)


if __name__ == "__main__":
    # Script entry point: build the executor and poll with the default interval.
    Executor().loop()
Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`import os`
			`import json`
			`import time`
			`import logging`
			`import subprocess`
			`from pathlib import Path`

			`# Constants and Paths`
			`RUNTIME_PATH = os.getenv("RUNTIME_PATH", "/opt/homelab")`
			`ACTIONS_DIR = Path(RUNTIME_PATH) / "actions"`
			`REPO_ROOT = Path(os.getenv("REPO_ROOT", "/repo"))`

feat(control-plane): add container_restart remediation - observer: store trigger_type on incidents for supervisor routing - supervisor: route containers_not_running/mqtt_unreachable to container_restart instead of redeploy - supervisor: fix node alias normalization via NODE_ALIAS_MAP - supervisor: fix pending action dedup (scan by content not filename) - executor: implement container_restart via SSH docker restart with retry - control-plane override: configure NODE_ALIAS_MAP for production Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 12:42:03 +02:00			`# SSH configuration`
			`# SSH_USER can be overridden per-deployment environment.`
			`SSH_USER = os.getenv("SSH_USER", "oskar")`
			`SSH_OPTIONS = [`
			`"-o", "StrictHostKeyChecking=no",`
			`"-o", "ConnectTimeout=10",`
			`"-o", "BatchMode=yes",`
			`]`

Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`# Logging setup`
			`logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')`
			`logger = logging.getLogger("executor")`

feat(control-plane): add container_restart remediation - observer: store trigger_type on incidents for supervisor routing - supervisor: route containers_not_running/mqtt_unreachable to container_restart instead of redeploy - supervisor: fix node alias normalization via NODE_ALIAS_MAP - supervisor: fix pending action dedup (scan by content not filename) - executor: implement container_restart via SSH docker restart with retry - control-plane override: configure NODE_ALIAS_MAP for production Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 12:42:03 +02:00
Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`class Executor:`
			`def __init__(self):`
			`self._ensure_dirs()`

			`def _ensure_dirs(self):`
			`for s in ["approved", "running", "completed", "failed", "rejected"]:`
			`(ACTIONS_DIR / s).mkdir(parents=True, exist_ok=True)`

			`def process_actions(self):`
Add heartbeat updates and improve health checks in control-plane components 2026-05-12 20:59:46 +02:00			`# Update heartbeat`
			`heartbeat_file = ACTIONS_DIR.parent / "state" / "executor.heartbeat"`
			`try:`
			`heartbeat_file.touch()`
			`except Exception as e:`
			`logger.error(f"Failed to touch heartbeat file: {e}")`

Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`approved_dir = ACTIONS_DIR / "approved"`
			`action_files = sorted(approved_dir.glob("*.json"))`
feat(control-plane): add container_restart remediation - observer: store trigger_type on incidents for supervisor routing - supervisor: route containers_not_running/mqtt_unreachable to container_restart instead of redeploy - supervisor: fix node alias normalization via NODE_ALIAS_MAP - supervisor: fix pending action dedup (scan by content not filename) - executor: implement container_restart via SSH docker restart with retry - control-plane override: configure NODE_ALIAS_MAP for production Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 12:42:03 +02:00
Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`for action_file in action_files:`
			`self._execute_action(action_file)`

			`def _execute_action(self, action_file):`
			`action_id = action_file.stem`
			`logger.info(f"Executing action: {action_id}")`
feat(control-plane): add container_restart remediation - observer: store trigger_type on incidents for supervisor routing - supervisor: route containers_not_running/mqtt_unreachable to container_restart instead of redeploy - supervisor: fix node alias normalization via NODE_ALIAS_MAP - supervisor: fix pending action dedup (scan by content not filename) - executor: implement container_restart via SSH docker restart with retry - control-plane override: configure NODE_ALIAS_MAP for production Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 12:42:03 +02:00
Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`# Move to running`
			`running_path = ACTIONS_DIR / "running" / f"{action_id}.json"`
			`try:`
			`with open(action_file, "r") as f:`
			`data = json.load(f)`
			`data["status"] = "running"`
			`data["started_at"] = time.time()`
			`with open(running_path, "w") as f:`
			`json.dump(data, f, indent=2)`
			`action_file.unlink()`
			`except Exception as e:`
			`logger.error(f"Failed to move {action_id} to running: {e}")`
			`return`

feat(control-plane): add container_restart remediation - observer: store trigger_type on incidents for supervisor routing - supervisor: route containers_not_running/mqtt_unreachable to container_restart instead of redeploy - supervisor: fix node alias normalization via NODE_ALIAS_MAP - supervisor: fix pending action dedup (scan by content not filename) - executor: implement container_restart via SSH docker restart with retry - control-plane override: configure NODE_ALIAS_MAP for production Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 12:42:03 +02:00			`# Dispatch by action type`
Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`success = False`
			`error_msg = ""`
			`try:`
			`action_type = data.get("type")`
			`node = data.get("node")`
			`service = data.get("service")`
feat(control-plane): add container_restart remediation - observer: store trigger_type on incidents for supervisor routing - supervisor: route containers_not_running/mqtt_unreachable to container_restart instead of redeploy - supervisor: fix node alias normalization via NODE_ALIAS_MAP - supervisor: fix pending action dedup (scan by content not filename) - executor: implement container_restart via SSH docker restart with retry - control-plane override: configure NODE_ALIAS_MAP for production Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 12:42:03 +02:00
Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`if action_type == "redeploy":`
feat(control-plane): add container_restart remediation - observer: store trigger_type on incidents for supervisor routing - supervisor: route containers_not_running/mqtt_unreachable to container_restart instead of redeploy - supervisor: fix node alias normalization via NODE_ALIAS_MAP - supervisor: fix pending action dedup (scan by content not filename) - executor: implement container_restart via SSH docker restart with retry - control-plane override: configure NODE_ALIAS_MAP for production Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 12:42:03 +02:00			`# Full service redeploy via the repo deploy script`
Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`cmd = [`
			`str(REPO_ROOT / "scripts" / "deploy" / "deploy-node.sh"),`
			`node,`
			`service`
			`]`
			`logger.info(f"Running command: {' '.join(cmd)}")`
			`result = subprocess.run(cmd, capture_output=True, text=True, cwd=str(REPO_ROOT))`
			`if result.returncode == 0:`
			`success = True`
			`else:`
			`success = False`
			`error_msg = result.stderr or result.stdout`
feat(control-plane): add container_restart remediation - observer: store trigger_type on incidents for supervisor routing - supervisor: route containers_not_running/mqtt_unreachable to container_restart instead of redeploy - supervisor: fix node alias normalization via NODE_ALIAS_MAP - supervisor: fix pending action dedup (scan by content not filename) - executor: implement container_restart via SSH docker restart with retry - control-plane override: configure NODE_ALIAS_MAP for production Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 12:42:03 +02:00
			`elif action_type == "container_restart":`
			`# Lightweight restart: SSH to node and docker restart the container.`
			`# container_name is set by the supervisor; falls back to service name.`
			`container_name = data.get("container_name") or service`
			`success, error_msg = self._execute_container_restart(node, container_name)`

feat(node-agent): implement health monitor and safe cleanup policy scripts/monitor/health-monitor.sh (new): - Standalone bash health monitor: disk/RAM/CPU checks + docker container health - Per-node-type cleanup policy enforced: lte_node (chelsty-infra, chelsty-ha): NO cleanup, no docker ops sd_card (piha, saturn): dangling images + containers, rate-limited once/24h ai_node (solaria): dangling + containers + build cache, NEVER -a standard (vps): dangling + containers + build cache + CP filesystem rotation - VPS filesystem rotation: completed/failed actions >7d, deploy logs >30d, events >3d AND past observer checkpoint - Emits structured JSON events (node_health, disk_pressure, high_memory, high_cpu, containers_not_running, healthcheck_failed) services/node-agent/ (new): - Python daemon (node_agent.py): same policy as bash script, Docker SDK for container checks and cleanup, /proc for system metrics - Optional event shipping to VPS via rsync+SSH (VPS_EVENTS_HOST env var) - Dockerfile: python:3.11-slim + openssh-client + rsync + docker>=6.0 - docker-compose.yml: mounts docker socket, /opt/homelab, repo read-only observer.py: - Handle node_health: update node status + disk/mem/cpu metrics, clear disk_pressure - Handle disk_pressure: record severity on node, clear when healthy - Handle high_memory / high_cpu: record pressure level for correlation supervisor.py: - Add NO_DISK_CLEANUP_NODES = {chelsty-infra, chelsty-ha} - reconcile() step 3: generate disk_cleanup actions for nodes with high disk pressure - _generate_disk_cleanup_recommendation(): stable ID disk-cleanup-{node}, checks all active states, risk=guarded (operator approval required) executor.py: - Handle disk_cleanup action type via _execute_disk_cleanup() - Commands come from action payload; safety gate rejects any command touching /opt/homelab/data/, /opt/homelab/config/, /opt/homelab/state/, or rm -rf / hosts/*/services.yaml: - Rename stability-agent -> node-agent on piha, vps, solaria, chelsty-infra - Add 
node-agent to chelsty-ha (previously missing) - Add cleanup policy notes to LTE node comments Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 13:15:06 +02:00			`elif action_type == "disk_cleanup":`
			`# Operator-approved aggressive Docker cleanup (image prune -a +`
			`# volume prune). Commands come from the action payload so the`
			`# supervisor controls exactly what runs; the executor adds a`
			`# safety check to reject anything touching protected paths.`
			`payload = data.get("payload", {})`
			`success, error_msg = self._execute_disk_cleanup(node, payload)`

Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`else:`
			`success = False`
			`error_msg = f"Unknown action type: {action_type}"`
feat(control-plane): add container_restart remediation - observer: store trigger_type on incidents for supervisor routing - supervisor: route containers_not_running/mqtt_unreachable to container_restart instead of redeploy - supervisor: fix node alias normalization via NODE_ALIAS_MAP - supervisor: fix pending action dedup (scan by content not filename) - executor: implement container_restart via SSH docker restart with retry - control-plane override: configure NODE_ALIAS_MAP for production Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 12:42:03 +02:00
Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`except Exception as e:`
			`success = False`
			`error_msg = str(e)`

			`# Move to completed/failed`
			`target_status = "completed" if success else "failed"`
			`target_path = ACTIONS_DIR / target_status / f"{action_id}.json"`
			`try:`
			`data["status"] = target_status`
			`data["finished_at"] = time.time()`
			`if not success:`
			`data["error"] = error_msg`
			`with open(target_path, "w") as f:`
			`json.dump(data, f, indent=2)`
			`running_path.unlink()`
			`logger.info(f"Action {action_id} {target_status}")`
			`except Exception as e:`
			`logger.error(f"Failed to move {action_id} to {target_status}: {e}")`

feat(control-plane): add container_restart remediation - observer: store trigger_type on incidents for supervisor routing - supervisor: route containers_not_running/mqtt_unreachable to container_restart instead of redeploy - supervisor: fix node alias normalization via NODE_ALIAS_MAP - supervisor: fix pending action dedup (scan by content not filename) - executor: implement container_restart via SSH docker restart with retry - control-plane override: configure NODE_ALIAS_MAP for production Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 12:42:03 +02:00			`def _execute_container_restart(self, node, container_name, retry_delay=10):`
			`"""`
			SSH to the target node and run `docker restart <container_name>`.

			`Attempts the restart up to 2 times (initial + 1 retry). If the first`
			`attempt fails, waits retry_delay seconds then tries once more before`
			`declaring the action failed.`

			`Returns (success: bool, error_msg: str).`
			`"""`
			`cmd = [`
			`"ssh",`
			`*SSH_OPTIONS,`
			`f"{SSH_USER}@{node}",`
			`f"docker restart {container_name}",`
			`]`
			`logger.info(f"SSH container restart: {' '.join(cmd)}")`

			`max_attempts = 2`
			`last_error = ""`

			`for attempt in range(1, max_attempts + 1):`
			`result = subprocess.run(cmd, capture_output=True, text=True)`

			`if result.returncode == 0:`
			`logger.info(`
			`f"Container '{container_name}' on {node} restarted successfully "`
			`f"(attempt {attempt}/{max_attempts})"`
			`)`
			`return True, ""`

			`last_error = (result.stderr or result.stdout).strip()`
			`logger.warning(`
			`f"container_restart attempt {attempt}/{max_attempts} failed "`
			`f"for '{container_name}' on {node}: {last_error}"`
			`)`

			`if attempt < max_attempts:`
			`logger.info(f"Retrying in {retry_delay}s...")`
			`time.sleep(retry_delay)`

			`logger.error(`
			`f"container_restart exhausted all {max_attempts} attempts "`
			`f"for '{container_name}' on {node}"`
			`)`
			`return False, last_error`

feat(node-agent): implement health monitor and safe cleanup policy scripts/monitor/health-monitor.sh (new): - Standalone bash health monitor: disk/RAM/CPU checks + docker container health - Per-node-type cleanup policy enforced: lte_node (chelsty-infra, chelsty-ha): NO cleanup, no docker ops sd_card (piha, saturn): dangling images + containers, rate-limited once/24h ai_node (solaria): dangling + containers + build cache, NEVER -a standard (vps): dangling + containers + build cache + CP filesystem rotation - VPS filesystem rotation: completed/failed actions >7d, deploy logs >30d, events >3d AND past observer checkpoint - Emits structured JSON events (node_health, disk_pressure, high_memory, high_cpu, containers_not_running, healthcheck_failed) services/node-agent/ (new): - Python daemon (node_agent.py): same policy as bash script, Docker SDK for container checks and cleanup, /proc for system metrics - Optional event shipping to VPS via rsync+SSH (VPS_EVENTS_HOST env var) - Dockerfile: python:3.11-slim + openssh-client + rsync + docker>=6.0 - docker-compose.yml: mounts docker socket, /opt/homelab, repo read-only observer.py: - Handle node_health: update node status + disk/mem/cpu metrics, clear disk_pressure - Handle disk_pressure: record severity on node, clear when healthy - Handle high_memory / high_cpu: record pressure level for correlation supervisor.py: - Add NO_DISK_CLEANUP_NODES = {chelsty-infra, chelsty-ha} - reconcile() step 3: generate disk_cleanup actions for nodes with high disk pressure - _generate_disk_cleanup_recommendation(): stable ID disk-cleanup-{node}, checks all active states, risk=guarded (operator approval required) executor.py: - Handle disk_cleanup action type via _execute_disk_cleanup() - Commands come from action payload; safety gate rejects any command touching /opt/homelab/data/, /opt/homelab/config/, /opt/homelab/state/, or rm -rf / hosts/*/services.yaml: - Rename stability-agent -> node-agent on piha, vps, solaria, chelsty-infra - Add 
node-agent to chelsty-ha (previously missing) - Add cleanup policy notes to LTE node comments Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 13:15:06 +02:00			`def _execute_disk_cleanup(self, node: str, payload: dict):`
			`"""`
			`SSH to the target node and run the operator-approved disk cleanup`
			`commands from the action payload.`

			`Safety invariants enforced here regardless of payload content:`
			`- No command may reference /opt/homelab/data/, /opt/homelab/config/,`
			`or /opt/homelab/state/ (application data and configuration).`
			`- No command may contain rm -rf / or similar destructive patterns.`
			`If any command fails the safety check the entire action is rejected`
			`(not run at all) and the rejection reason is recorded.`

			`Returns (success: bool, error_msg: str).`
			`"""`
			`commands = payload.get("commands", [`
			`"docker image prune -a -f",`
			`"docker volume prune -f",`
			`])`

			`# Safety gate: reject commands that touch protected paths`
			`FORBIDDEN = [`
			`"/opt/homelab/data",`
			`"/opt/homelab/config",`
			`"/opt/homelab/state",`
			`"rm -rf /",`
			`]`
			`for cmd in commands:`
			`for forbidden in FORBIDDEN:`
			`if forbidden in cmd:`
			`msg = f"Rejected: command contains forbidden pattern '{forbidden}': {cmd}"`
			`logger.error(msg)`
			`return False, msg`

			`full_command = " && ".join(commands)`
			`cmd = [`
			`"ssh",`
			`*SSH_OPTIONS,`
			`f"{SSH_USER}@{node}",`
			`full_command,`
			`]`
			`logger.info(f"Disk cleanup on {node}: {full_command}")`

			`result = subprocess.run(cmd, capture_output=True, text=True)`
			`if result.returncode == 0:`
			`logger.info(f"Disk cleanup on {node} succeeded")`
			`return True, ""`

			`error_msg = (result.stderr or result.stdout).strip()`
			`logger.error(f"Disk cleanup on {node} failed: {error_msg}")`
			`return False, error_msg`

Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`def loop(self, interval=10):`
			`logger.info("Starting executor loop")`
			`while True:`
			`self.process_actions()`
			`time.sleep(interval)`

feat(control-plane): add container_restart remediation - observer: store trigger_type on incidents for supervisor routing - supervisor: route containers_not_running/mqtt_unreachable to container_restart instead of redeploy - supervisor: fix node alias normalization via NODE_ALIAS_MAP - supervisor: fix pending action dedup (scan by content not filename) - executor: implement container_restart via SSH docker restart with retry - control-plane override: configure NODE_ALIAS_MAP for production Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-05-27 12:42:03 +02:00
Implement VPS control-plane deployment profile 2026-05-12 20:19:05 +02:00			`if __name__ == "__main__":`
			`executor = Executor()`
			`executor.loop()`