# homelab-codex-ws/services/control-plane/src/executor.py

import json
import logging
import os
import shlex
import subprocess
import time
from pathlib import Path
# --- Filesystem layout ------------------------------------------------------
# Root of the runtime tree; overridable through the RUNTIME_PATH env var.
RUNTIME_PATH = os.getenv("RUNTIME_PATH", "/opt/homelab")
# Directory holding the action files.
ACTIONS_DIR = Path(RUNTIME_PATH, "actions")
# Repository root; overridable through the REPO_ROOT env var.
REPO_ROOT = Path(os.getenv("REPO_ROOT", "/repo"))

# --- SSH configuration ------------------------------------------------------
# Remote login user; each deployment environment may override it via SSH_USER.
SSH_USER = os.getenv("SSH_USER", "oskar")
# Every setting is handed to ssh as its own "-o <setting>" argument pair.
# NOTE(review): StrictHostKeyChecking=no turns off host-key verification —
# confirm this is acceptable for the networks these nodes live on.
SSH_OPTIONS = [
    token
    for setting in (
        "StrictHostKeyChecking=no",
        "ConnectTimeout=10",
        "BatchMode=yes",
    )
    for token in ("-o", setting)
]

# --- Logging ----------------------------------------------------------------
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("executor")
class Executor:
    """Filesystem-queue worker that runs approved actions.

    Action files are JSON documents stored in per-status subdirectories of
    ``ACTIONS_DIR``. Each file found in ``approved/`` is rewritten into
    ``running/``, executed, and finally rewritten into ``completed/`` or
    ``failed/``.
    """

    # Action types this executor knows how to run.
    _KNOWN_ACTION_TYPES = ("redeploy", "container_restart", "disk_cleanup")

    # Substrings that must never appear in a disk-cleanup command.
    _FORBIDDEN_PATTERNS = (
        "/opt/homelab/data",
        "/opt/homelab/config",
        "/opt/homelab/state",
        "rm -rf /",
    )

    # Commands used when a disk_cleanup payload carries no "commands" key.
    _DEFAULT_CLEANUP_COMMANDS = (
        "docker image prune -a -f",
        "docker volume prune -f",
    )

    def __init__(self):
        self._ensure_dirs()

    def _ensure_dirs(self):
        """Create every per-status action directory that is missing."""
        for status in ("approved", "running", "completed", "failed", "rejected"):
            (ACTIONS_DIR / status).mkdir(parents=True, exist_ok=True)

    def process_actions(self):
        """Refresh the heartbeat file, then execute every approved action.

        Approved ``*.json`` files are handled one at a time in sorted
        filename order. A heartbeat failure is logged and does not stop
        action processing.
        """
        heartbeat_file = ACTIONS_DIR.parent / "state" / "executor.heartbeat"
        try:
            # _ensure_dirs() only creates the action directories, so make
            # sure the heartbeat's own directory exists before touching it.
            heartbeat_file.parent.mkdir(parents=True, exist_ok=True)
            heartbeat_file.touch()
        except Exception as e:
            logger.error(f"Failed to touch heartbeat file: {e}")

        approved_dir = ACTIONS_DIR / "approved"
        for action_file in sorted(approved_dir.glob("*.json")):
            self._execute_action(action_file)

    def _execute_action(self, action_file):
        """Run one approved action file through its whole lifecycle.

        Steps: copy the document into ``running/`` (stamping ``status`` and
        ``started_at``) and remove it from ``approved/``; dispatch on the
        action type; write the document into ``completed/`` or ``failed/``
        (stamping ``status``, ``finished_at`` and, on failure, ``error``)
        and remove it from ``running/``.

        Nothing is raised to the caller: every failure is logged and, where
        possible, recorded in the action document.
        """
        action_id = action_file.stem
        logger.info(f"Executing action: {action_id}")

        # Move to running
        running_path = ACTIONS_DIR / "running" / f"{action_id}.json"
        try:
            with open(action_file, "r") as f:
                data = json.load(f)
            data["status"] = "running"
            data["started_at"] = time.time()
            with open(running_path, "w") as f:
                json.dump(data, f, indent=2)
            action_file.unlink()
        except Exception as e:
            # The file stays in approved/ and is picked up again on the
            # next pass.
            logger.error(f"Failed to move {action_id} to running: {e}")
            return

        # Dispatch by action type; any unexpected exception marks the
        # action as failed rather than escaping.
        try:
            success, error_msg = self._dispatch(data)
        except Exception as e:
            success, error_msg = False, str(e)

        # Move to completed/failed
        target_status = "completed" if success else "failed"
        target_path = ACTIONS_DIR / target_status / f"{action_id}.json"
        try:
            data["status"] = target_status
            data["finished_at"] = time.time()
            if not success:
                data["error"] = error_msg
            with open(target_path, "w") as f:
                json.dump(data, f, indent=2)
            running_path.unlink()
            logger.info(f"Action {action_id} {target_status}")
        except Exception as e:
            logger.error(f"Failed to move {action_id} to {target_status}: {e}")

    def _dispatch(self, data):
        """Validate the action document and run the matching handler.

        Returns (success: bool, error_msg: str).
        """
        action_type = data.get("type")
        node = data.get("node")
        service = data.get("service")

        if action_type not in self._KNOWN_ACTION_TYPES:
            return False, f"Unknown action type: {action_type}"

        # Every known action targets a node; fail fast with a clear message
        # instead of a TypeError or a doomed SSH attempt.
        node_error = self._check_node(node)
        if node_error:
            return False, node_error

        if action_type == "redeploy":
            if not isinstance(service, str):
                return False, f"redeploy requires a 'service' string, got {service!r}"
            return self._execute_redeploy(node, service)

        if action_type == "container_restart":
            # Lightweight restart: SSH to node and docker restart the container.
            # container_name is set by the supervisor; falls back to service name.
            container_name = data.get("container_name") or service
            if not isinstance(container_name, str) or not container_name:
                return False, (
                    "container_restart requires a container name, "
                    f"got {container_name!r}"
                )
            return self._execute_container_restart(node, container_name)

        # disk_cleanup: operator-approved aggressive Docker cleanup (image
        # prune -a + volume prune). Commands come from the action payload so
        # the supervisor controls exactly what runs; the executor adds a
        # safety check to reject anything touching protected paths.
        return self._execute_disk_cleanup(node, data.get("payload", {}))

    @staticmethod
    def _check_node(node):
        """Return an error message when node is not a non-empty string, else ''."""
        if not isinstance(node, str) or not node.strip():
            return f"Invalid or missing node: {node!r}"
        return ""

    @staticmethod
    def _restart_command(container_name):
        """Build the remote `docker restart` command with the name shell-quoted.

        The string is interpreted by a shell on the remote host, so the
        container name is quoted to stop payload-supplied metacharacters
        from being executed.
        """
        return f"docker restart {shlex.quote(str(container_name))}"

    @classmethod
    def _check_cleanup_commands(cls, commands):
        """Return a rejection message for unsafe cleanup commands, or ''.

        Commands must be a non-empty list (or tuple) of non-empty strings,
        none of which contains a forbidden pattern. Each command is checked
        both verbatim and with whitespace runs collapsed, so padding such as
        ``rm   -rf   /`` cannot slip past the substring match.
        """
        if not isinstance(commands, (list, tuple)) or not commands:
            return (
                "Rejected: 'commands' must be a non-empty list of strings, "
                f"got {commands!r}"
            )
        for cmd in commands:
            if not isinstance(cmd, str) or not cmd.strip():
                return f"Rejected: cleanup command is not a non-empty string: {cmd!r}"
            normalized = " ".join(cmd.split())
            for forbidden in cls._FORBIDDEN_PATTERNS:
                if forbidden in cmd or forbidden in normalized:
                    return (
                        f"Rejected: command contains forbidden pattern "
                        f"'{forbidden}': {cmd}"
                    )
        return ""

    def _execute_redeploy(self, node, service):
        """Run the repo deploy script for a full redeploy of service on node.

        Returns (success: bool, error_msg: str); on failure error_msg is the
        script's stderr, or its stdout when stderr is empty.
        """
        cmd = [
            str(REPO_ROOT / "scripts" / "deploy" / "deploy-node.sh"),
            node,
            service,
        ]
        logger.info(f"Running command: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, cwd=str(REPO_ROOT))
        if result.returncode == 0:
            return True, ""
        return False, result.stderr or result.stdout

    def _execute_container_restart(self, node, container_name, retry_delay=10):
        """
        SSH to the target node and run `docker restart <container_name>`.

        Attempts the restart up to 2 times (initial + 1 retry). If the first
        attempt fails, waits retry_delay seconds then tries once more before
        declaring the action failed.

        Returns (success: bool, error_msg: str).
        """
        cmd = [
            "ssh",
            *SSH_OPTIONS,
            f"{SSH_USER}@{node}",
            self._restart_command(container_name),
        ]
        logger.info(f"SSH container restart: {' '.join(cmd)}")

        max_attempts = 2
        last_error = ""
        for attempt in range(1, max_attempts + 1):
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                logger.info(
                    f"Container '{container_name}' on {node} restarted successfully "
                    f"(attempt {attempt}/{max_attempts})"
                )
                return True, ""

            last_error = (result.stderr or result.stdout).strip()
            logger.warning(
                f"container_restart attempt {attempt}/{max_attempts} failed "
                f"for '{container_name}' on {node}: {last_error}"
            )
            if attempt < max_attempts:
                logger.info(f"Retrying in {retry_delay}s...")
                time.sleep(retry_delay)

        logger.error(
            f"container_restart exhausted all {max_attempts} attempts "
            f"for '{container_name}' on {node}"
        )
        return False, last_error

    def _execute_disk_cleanup(self, node: str, payload: dict):
        """
        SSH to the target node and run the operator-approved disk cleanup
        commands from the action payload.

        Safety invariants enforced here regardless of payload content:
          - No command may reference /opt/homelab/data/, /opt/homelab/config/,
            or /opt/homelab/state/ (application data and configuration).
          - No command may contain rm -rf / or similar destructive patterns.
          - "commands" must be a non-empty list of non-empty strings.

        If any command fails the safety check the entire action is rejected
        (not run at all) and the rejection reason is recorded.

        Returns (success: bool, error_msg: str).
        """
        # A JSON null payload arrives as None; treat it like a missing one.
        if payload is None:
            payload = {}
        if not isinstance(payload, dict):
            msg = f"Rejected: disk_cleanup payload must be an object, got {payload!r}"
            logger.error(msg)
            return False, msg

        commands = payload.get("commands", list(self._DEFAULT_CLEANUP_COMMANDS))

        # Safety gate: reject commands that touch protected paths
        rejection = self._check_cleanup_commands(commands)
        if rejection:
            logger.error(rejection)
            return False, rejection

        full_command = " && ".join(commands)
        cmd = [
            "ssh",
            *SSH_OPTIONS,
            f"{SSH_USER}@{node}",
            full_command,
        ]
        logger.info(f"Disk cleanup on {node}: {full_command}")
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode == 0:
            logger.info(f"Disk cleanup on {node} succeeded")
            return True, ""

        error_msg = (result.stderr or result.stdout).strip()
        logger.error(f"Disk cleanup on {node} failed: {error_msg}")
        return False, error_msg

    def loop(self, interval=10):
        """Call process_actions() forever, sleeping interval seconds between passes.

        An unexpected exception from one pass is logged with its traceback
        and the loop carries on, so a single bad pass cannot stop the daemon.
        """
        logger.info("Starting executor loop")
        while True:
            try:
                self.process_actions()
            except Exception:
                logger.exception("Unhandled error while processing actions")
            time.sleep(interval)
if __name__ == "__main__":
    # Script entry point: constructing the executor creates the action
    # directories; loop() then polls forever at its default interval.
    executor = Executor()
    executor.loop()