2026-05-12 20:19:05 +02:00
|
|
|
import os
|
|
|
|
|
import json
|
|
|
|
|
import time
|
|
|
|
|
import logging
|
|
|
|
|
import subprocess
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
# Constants and Paths
# RUNTIME_PATH and REPO_ROOT are read from the environment variables of the
# same name, falling back to the defaults below when they are not set.
RUNTIME_PATH = os.environ.get("RUNTIME_PATH", "/opt/homelab")
# Root of the action queue; lives directly under RUNTIME_PATH.
ACTIONS_DIR = Path(RUNTIME_PATH, "actions")
REPO_ROOT = Path(os.environ.get("REPO_ROOT", "/repo"))
|
|
|
|
|
|
2026-05-27 12:42:03 +02:00
|
|
|
# SSH configuration
# SSH_USER can be overridden per-deployment environment.
SSH_USER = os.getenv("SSH_USER", "oskar")

# Host-key policy and connect timeout are also overridable per deployment.
# The defaults reproduce the previously hard-coded values; an empty variable
# falls back to the default so ssh never receives a malformed "Key=" option.
# NOTE(security): "no" disables host-key verification, which leaves the
# connection open to man-in-the-middle attacks. Deployments with a managed
# known_hosts file should set SSH_STRICT_HOST_KEY_CHECKING to "yes" or
# "accept-new".
SSH_STRICT_HOST_KEY_CHECKING = os.getenv("SSH_STRICT_HOST_KEY_CHECKING") or "no"
SSH_CONNECT_TIMEOUT = os.getenv("SSH_CONNECT_TIMEOUT") or "10"

SSH_OPTIONS = [
    "-o", f"StrictHostKeyChecking={SSH_STRICT_HOST_KEY_CHECKING}",
    "-o", f"ConnectTimeout={SSH_CONNECT_TIMEOUT}",
    # BatchMode makes ssh fail instead of prompting for a password/passphrase.
    "-o", "BatchMode=yes",
]
|
|
|
|
|
|
2026-05-12 20:19:05 +02:00
|
|
|
# Logging setup
# Configure the root logger at INFO level with timestamped records, then
# expose the named logger used throughout this module.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("executor")
|
|
|
|
|
|
2026-05-27 12:42:03 +02:00
|
|
|
|
2026-05-12 20:19:05 +02:00
|
|
|
class Executor:
    """Poll the ``approved`` action queue and execute each action file.

    Actions are JSON files under ``ACTIONS_DIR``. An action is picked up from
    ``approved/``, rewritten into ``running/`` while it executes, and finally
    written to ``completed/`` or ``failed/`` depending on the outcome.

    Relies on the module-level names ``ACTIONS_DIR``, ``REPO_ROOT``,
    ``SSH_USER``, ``SSH_OPTIONS`` and ``logger``.
    """

    # Sub-directories of ACTIONS_DIR, one per action lifecycle state.
    _STATE_DIRS = ("approved", "running", "completed", "failed", "rejected")

    def __init__(self):
        """Create the queue directories so later file moves cannot fail on a missing folder."""
        self._ensure_dirs()

    def _ensure_dirs(self):
        """Create every lifecycle directory under ``ACTIONS_DIR`` if it is missing."""
        for state in self._STATE_DIRS:
            (ACTIONS_DIR / state).mkdir(parents=True, exist_ok=True)

    def process_actions(self):
        """Touch the heartbeat file, then execute all approved actions.

        Action files are processed in sorted filename order. A heartbeat
        failure is logged but never prevents the actions from running.
        """
        # Update heartbeat
        heartbeat_file = ACTIONS_DIR.parent / "state" / "executor.heartbeat"
        try:
            # The "state" directory is not one of the queue directories, so it
            # may not exist yet; without this, touch() fails on every poll.
            heartbeat_file.parent.mkdir(parents=True, exist_ok=True)
            heartbeat_file.touch()
        except Exception as e:
            logger.error(f"Failed to touch heartbeat file: {e}")

        approved_dir = ACTIONS_DIR / "approved"
        for action_file in sorted(approved_dir.glob("*.json")):
            self._execute_action(action_file)

    def _execute_action(self, action_file):
        """Run one approved action through running -> completed/failed.

        Args:
            action_file: Path of the JSON action file inside ``approved/``.

        If the action cannot be moved to ``running/`` (unreadable file,
        invalid JSON, write error) the error is logged and the file is left
        in ``approved/`` so it is picked up again on the next poll.
        """
        action_id = action_file.stem
        logger.info(f"Executing action: {action_id}")

        # Move to running
        running_path = ACTIONS_DIR / "running" / f"{action_id}.json"
        try:
            data = self._claim_action(action_file, running_path)
        except Exception as e:
            logger.error(f"Failed to move {action_id} to running: {e}")
            return

        # Dispatch by action type; any exception marks the action as failed.
        try:
            success, error_msg = self._dispatch(data)
        except Exception as e:
            success = False
            error_msg = str(e)

        self._finish_action(action_id, data, running_path, success, error_msg)

    def _claim_action(self, action_file, running_path):
        """Load the action, write it to ``running_path`` and delete the approved file.

        Returns the parsed action dict with ``status`` and ``started_at`` set.
        Raises on I/O errors, invalid JSON, or a payload that is not an object.
        """
        with open(action_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, dict):
            # Later steps use data.get()/data[...]; reject other JSON values
            # here with a readable message instead of a bare TypeError.
            raise ValueError(
                f"action file must contain a JSON object, got {type(data).__name__}"
            )
        data["status"] = "running"
        data["started_at"] = time.time()
        with open(running_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        # Only remove the approved copy once the running copy is on disk.
        action_file.unlink()
        return data

    def _dispatch(self, data):
        """Execute the action described by ``data``; return ``(success, error_msg)``."""
        action_type = data.get("type")
        node = data.get("node")
        service = data.get("service")

        if action_type == "redeploy":
            # Full service redeploy via the repo deploy script
            return self._execute_redeploy(node, service)

        if action_type == "container_restart":
            # Lightweight restart: SSH to node and docker restart the container.
            # container_name is set by the supervisor; falls back to service name.
            container_name = data.get("container_name") or service
            return self._execute_container_restart(node, container_name)

        return False, f"Unknown action type: {action_type}"

    def _execute_redeploy(self, node, service):
        """Run ``scripts/deploy/deploy-node.sh <node> <service>`` from ``REPO_ROOT``.

        Returns ``(success, error_msg)``; success means the script exited 0.
        """
        missing = [
            name
            for name, value in (("node", node), ("service", service))
            if not isinstance(value, str)
        ]
        if missing:
            # Non-string arguments cannot be passed to the script; report the
            # missing field by name rather than letting the command build fail.
            return False, f"redeploy action requires string field(s): {', '.join(missing)}"

        cmd = [
            str(REPO_ROOT / "scripts" / "deploy" / "deploy-node.sh"),
            node,
            service,
        ]
        logger.info(f"Running command: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, cwd=str(REPO_ROOT))
        if result.returncode == 0:
            return True, ""
        # Never record an empty error for a failed run.
        error_msg = (
            result.stderr
            or result.stdout
            or f"deploy script exited with status {result.returncode}"
        )
        return False, error_msg

    def _finish_action(self, action_id, data, running_path, success, error_msg):
        """Write the final action record to ``completed/`` or ``failed/`` and drop the running copy."""
        # Move to completed/failed
        target_status = "completed" if success else "failed"
        target_path = ACTIONS_DIR / target_status / f"{action_id}.json"
        try:
            data["status"] = target_status
            data["finished_at"] = time.time()
            if not success:
                data["error"] = error_msg
            with open(target_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            running_path.unlink()
            logger.info(f"Action {action_id} {target_status}")
        except Exception as e:
            logger.error(f"Failed to move {action_id} to {target_status}: {e}")

    def _execute_container_restart(self, node, container_name, retry_delay=10):
        """
        SSH to the target node and run `docker restart <container_name>`.

        Attempts the restart up to 2 times (initial + 1 retry). If the first
        attempt fails, waits retry_delay seconds then tries once more before
        declaring the action failed.

        Args:
            node: Host to connect to as ``SSH_USER``.
            container_name: Name of the container to restart.
            retry_delay: Seconds to wait between the two attempts.

        Returns (success: bool, error_msg: str).
        """
        import shlex  # local import keeps this block self-contained

        if not node or not container_name:
            # Without both values the command would target "user@None" or
            # restart "None"; fail immediately instead of burning retries.
            return False, "container_restart requires both 'node' and a container name"

        # ssh hands the command string to the remote shell, so the container
        # name is quoted to stop shell metacharacters from being interpreted.
        remote_command = f"docker restart {shlex.quote(str(container_name))}"
        cmd = [
            "ssh",
            *SSH_OPTIONS,
            f"{SSH_USER}@{node}",
            remote_command,
        ]
        logger.info(f"SSH container restart: {' '.join(cmd)}")

        max_attempts = 2
        last_error = ""

        for attempt in range(1, max_attempts + 1):
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                logger.info(
                    f"Container '{container_name}' on {node} restarted successfully "
                    f"(attempt {attempt}/{max_attempts})"
                )
                return True, ""

            # Fall back to the exit status so a silent failure is still explained.
            last_error = (
                (result.stderr or result.stdout).strip()
                or f"ssh exited with status {result.returncode}"
            )
            logger.warning(
                f"container_restart attempt {attempt}/{max_attempts} failed "
                f"for '{container_name}' on {node}: {last_error}"
            )

            if attempt < max_attempts:
                logger.info(f"Retrying in {retry_delay}s...")
                time.sleep(retry_delay)

        logger.error(
            f"container_restart exhausted all {max_attempts} attempts "
            f"for '{container_name}' on {node}"
        )
        return False, last_error

    def loop(self, interval=10):
        """Call ``process_actions`` forever, sleeping ``interval`` seconds between polls.

        An unexpected exception from one poll is logged with its traceback and
        the loop continues, so a transient error does not stop the executor.
        """
        logger.info("Starting executor loop")
        while True:
            try:
                self.process_actions()
            except Exception:
                logger.exception("Unexpected error while processing actions")
            time.sleep(interval)
|
|
|
|
|
|
2026-05-27 12:42:03 +02:00
|
|
|
|
2026-05-12 20:19:05 +02:00
|
|
|
if __name__ == "__main__":
    # Script entry point: build the executor and poll with the default interval.
    Executor().loop()
|