Add heartbeat updates and improve health checks in control-plane components

This commit is contained in:
Oskar Kapala 2026-05-12 20:59:46 +02:00
parent f4e6871d76
commit 533b8e846d
6 changed files with 61 additions and 11 deletions

View file

@ -31,7 +31,7 @@ class Observer:
"deployments": {},
"incidents": {},
"summary": {
"last_update": None,
"last_update": datetime.now(timezone.utc).isoformat(),
"status": "initializing",
"active_incidents_count": 0
}
@ -132,6 +132,7 @@ class Observer:
"services.json": self.world_state["services"],
"deployments.json": self.world_state["deployments"],
"incidents.json": self.world_state["incidents"],
"recommendations.json": [], # Placeholder to satisfy requirements
"runtime-summary.json": self.world_state["summary"]
}
for filename, data in files.items():
@ -263,8 +264,15 @@ class Observer:
self.world_state["incidents"][incident_id]["last_error"] = payload["error"]
def run_once(self):
# Update heartbeat
heartbeat_file = STATE_DIR / "observer.heartbeat"
try:
heartbeat_file.touch()
except Exception as e:
logger.error(f"Failed to touch heartbeat file: {e}")
# Find all event files
event_files = sorted(glob.glob(str(EVENTS_DIR / "*" / "*" / "*.json")))
event_files = sorted(glob.glob(str(EVENTS_DIR / "**" / "*.json"), recursive=True))
new_files = []
if self.last_processed_file:
@ -278,6 +286,8 @@ class Observer:
new_files = event_files
if not new_files:
# Even if no new events, we update freshness of summary
self._save_world()
return
logger.info(f"Processing {len(new_files)} new events")

View file

@ -2,9 +2,6 @@ FROM python:3.11-slim
WORKDIR /app
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/summary')"
RUN pip install --no-cache-dir pyyaml
# Create homelab user

View file

@ -27,6 +27,12 @@ services:
environment:
- REPO_ROOT=/repo
- RUNTIME_PATH=/opt/homelab
healthcheck:
  # Pass only while the heartbeat file is *fresh*. A plain `test -f` keeps
  # succeeding after the process hangs, because the file is touched but never
  # removed, so mere existence says nothing about liveness.
  # NOTE(review): the 2-minute window assumes the observer loop touches the
  # file at least that often — confirm against the loop interval.
  test: ["CMD-SHELL", "find /opt/homelab/state/observer.heartbeat -mmin -2 2>/dev/null | grep -q ."]
  interval: 30s
  timeout: 5s
  retries: 3
  start_period: 5s
supervisor:
build: .
@ -40,6 +46,12 @@ services:
environment:
- REPO_ROOT=/repo
- RUNTIME_PATH=/opt/homelab
healthcheck:
  # Pass only while the heartbeat file is *fresh*. A plain `test -f` keeps
  # succeeding after the process hangs, because the file is touched but never
  # removed, so mere existence says nothing about liveness.
  # NOTE(review): the 5-minute window assumes the supervisor reconcile loop
  # touches the file at least that often — confirm against the loop interval.
  test: ["CMD-SHELL", "find /opt/homelab/state/supervisor.heartbeat -mmin -5 2>/dev/null | grep -q ."]
  interval: 60s
  timeout: 5s
  retries: 3
  start_period: 10s
executor:
build: .
@ -53,3 +65,9 @@ services:
environment:
- REPO_ROOT=/repo
- RUNTIME_PATH=/opt/homelab
healthcheck:
  # Pass only while the heartbeat file is *fresh*. A plain `test -f` keeps
  # succeeding after the process hangs, because the file is touched but never
  # removed, so mere existence says nothing about liveness.
  # NOTE(review): the 2-minute window assumes the executor loop touches the
  # file at least that often — confirm against the loop interval.
  test: ["CMD-SHELL", "find /opt/homelab/state/executor.heartbeat -mmin -2 2>/dev/null | grep -q ."]
  interval: 30s
  timeout: 5s
  retries: 3
  start_period: 5s

View file

@ -23,6 +23,13 @@ class Executor:
(ACTIONS_DIR / s).mkdir(parents=True, exist_ok=True)
def process_actions(self):
# Update heartbeat
heartbeat_file = ACTIONS_DIR.parent / "state" / "executor.heartbeat"
try:
heartbeat_file.touch()
except Exception as e:
logger.error(f"Failed to touch heartbeat file: {e}")
approved_dir = ACTIONS_DIR / "approved"
action_files = sorted(approved_dir.glob("*.json"))

View file

@ -1,6 +1,7 @@
import json
import os
import time
from datetime import datetime
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from pathlib import Path
@ -69,23 +70,33 @@ def current_recommendations():
def current_summary():
    """Return the runtime summary annotated with freshness information.

    Reads ``runtime-summary.json`` from ``WORLD_DIR``. When a non-empty
    summary is read, ``last_update`` is rewritten as a POSIX timestamp in
    seconds and ``stale`` is set to ``True`` when that timestamp is more
    than 60 seconds old. The summary's own ``last_update`` string is
    preferred; the file's modification time is used when the value is
    missing or cannot be parsed.

    Returns:
        The summary with ``last_update`` and ``stale`` filled in, or the
        empty result of ``read_json_file`` unchanged.
    """
    summary_path = WORLD_DIR / "runtime-summary.json"
    summary = read_json_file(summary_path, default={})
    if not summary:
        return summary

    last_update = _parse_summary_timestamp(summary.get("last_update"))
    if last_update is None:
        # No usable timestamp inside the document: fall back to file mtime.
        last_update = os.path.getmtime(summary_path)

    summary["last_update"] = last_update
    summary["stale"] = (time.time() - last_update) > 60  # Stale if older than 60s
    return summary


def _parse_summary_timestamp(value):
    """Convert an ISO-8601 string to a POSIX timestamp, or return ``None``."""
    from datetime import timezone

    if not value or not isinstance(value, str):
        return None
    try:
        # Older Python versions reject a trailing "Z"; normalise it first.
        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
    except ValueError:
        return None
    if parsed.tzinfo is None:
        # The summary writer in this commit emits UTC; a naive value is
        # assumed to be UTC as well rather than the server's local time.
        parsed = parsed.replace(tzinfo=timezone.utc)
    try:
        return parsed.timestamp()
    except (OverflowError, OSError):
        return None
def current_events():
    """Collect all event documents under ``EVENTS_DIR``, newest first.

    Every ``*.json`` file below ``EVENTS_DIR`` (searched recursively) is
    loaded with ``read_json_file``. Non-empty JSON objects are tagged with
    ``_source`` (the file's base name) for traceability. Payloads that are
    empty or are not JSON objects are skipped.

    Returns:
        A list of event dicts sorted by ``timestamp`` in descending order;
        an empty list when ``EVENTS_DIR`` does not exist.
    """
    if not EVENTS_DIR.exists():
        return []

    events = []
    for event_file in EVENTS_DIR.glob("**/*.json"):
        data = read_json_file(event_file)
        # Only JSON objects can carry the "_source" key; a list or scalar
        # payload would raise on the assignment below.
        if not data or not isinstance(data, dict):
            continue
        # Add source file for traceability
        data["_source"] = event_file.name
        events.append(data)

    # Compare timestamps as strings so that a mix of string and numeric
    # values cannot raise TypeError during sorting.
    return sorted(
        events,
        key=lambda event: str(event.get("timestamp", "")),
        reverse=True,
    )
def current_actions():

View file

@ -66,6 +66,13 @@ class Supervisor:
logger.error(f"Failed to load {key} actual state: {e}")
def reconcile(self):
# Update heartbeat
heartbeat_file = WORLD_DIR.parent / "state" / "supervisor.heartbeat"
try:
heartbeat_file.touch()
except Exception as e:
logger.error(f"Failed to touch heartbeat file: {e}")
self._load_desired_state()
self._load_actual_state()