Add heartbeat updates and improve health checks in control-plane components

This commit is contained in:
Oskar Kapala 2026-05-12 20:59:46 +02:00
parent f4e6871d76
commit 533b8e846d
6 changed files with 61 additions and 11 deletions

View file

@ -31,7 +31,7 @@ class Observer:
"deployments": {}, "deployments": {},
"incidents": {}, "incidents": {},
"summary": { "summary": {
"last_update": None, "last_update": datetime.now(timezone.utc).isoformat(),
"status": "initializing", "status": "initializing",
"active_incidents_count": 0 "active_incidents_count": 0
} }
@ -132,6 +132,7 @@ class Observer:
"services.json": self.world_state["services"], "services.json": self.world_state["services"],
"deployments.json": self.world_state["deployments"], "deployments.json": self.world_state["deployments"],
"incidents.json": self.world_state["incidents"], "incidents.json": self.world_state["incidents"],
"recommendations.json": [], # Placeholder to satisfy requirements
"runtime-summary.json": self.world_state["summary"] "runtime-summary.json": self.world_state["summary"]
} }
for filename, data in files.items(): for filename, data in files.items():
@ -263,8 +264,15 @@ class Observer:
self.world_state["incidents"][incident_id]["last_error"] = payload["error"] self.world_state["incidents"][incident_id]["last_error"] = payload["error"]
def run_once(self): def run_once(self):
# Update heartbeat
heartbeat_file = STATE_DIR / "observer.heartbeat"
try:
heartbeat_file.touch()
except Exception as e:
logger.error(f"Failed to touch heartbeat file: {e}")
# Find all event files # Find all event files
event_files = sorted(glob.glob(str(EVENTS_DIR / "*" / "*" / "*.json"))) event_files = sorted(glob.glob(str(EVENTS_DIR / "**" / "*.json"), recursive=True))
new_files = [] new_files = []
if self.last_processed_file: if self.last_processed_file:
@ -278,6 +286,8 @@ class Observer:
new_files = event_files new_files = event_files
if not new_files: if not new_files:
# Even if no new events, we update freshness of summary
self._save_world()
return return
logger.info(f"Processing {len(new_files)} new events") logger.info(f"Processing {len(new_files)} new events")

View file

@ -2,9 +2,6 @@ FROM python:3.11-slim
WORKDIR /app WORKDIR /app
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/summary')"
RUN pip install --no-cache-dir pyyaml RUN pip install --no-cache-dir pyyaml
# Create homelab user # Create homelab user

View file

@ -27,6 +27,12 @@ services:
environment: environment:
- REPO_ROOT=/repo - REPO_ROOT=/repo
- RUNTIME_PATH=/opt/homelab - RUNTIME_PATH=/opt/homelab
healthcheck:
test: ["CMD", "test", "-f", "/opt/homelab/state/observer.heartbeat"]
interval: 30s
timeout: 5s
retries: 3
start_period: 5s
supervisor: supervisor:
build: . build: .
@ -40,6 +46,12 @@ services:
environment: environment:
- REPO_ROOT=/repo - REPO_ROOT=/repo
- RUNTIME_PATH=/opt/homelab - RUNTIME_PATH=/opt/homelab
healthcheck:
test: ["CMD", "test", "-f", "/opt/homelab/state/supervisor.heartbeat"]
interval: 60s
timeout: 5s
retries: 3
start_period: 10s
executor: executor:
build: . build: .
@ -53,3 +65,9 @@ services:
environment: environment:
- REPO_ROOT=/repo - REPO_ROOT=/repo
- RUNTIME_PATH=/opt/homelab - RUNTIME_PATH=/opt/homelab
healthcheck:
test: ["CMD", "test", "-f", "/opt/homelab/state/executor.heartbeat"]
interval: 30s
timeout: 5s
retries: 3
start_period: 5s

View file

@ -23,6 +23,13 @@ class Executor:
(ACTIONS_DIR / s).mkdir(parents=True, exist_ok=True) (ACTIONS_DIR / s).mkdir(parents=True, exist_ok=True)
def process_actions(self): def process_actions(self):
# Update heartbeat
heartbeat_file = ACTIONS_DIR.parent / "state" / "executor.heartbeat"
try:
heartbeat_file.touch()
except Exception as e:
logger.error(f"Failed to touch heartbeat file: {e}")
approved_dir = ACTIONS_DIR / "approved" approved_dir = ACTIONS_DIR / "approved"
action_files = sorted(approved_dir.glob("*.json")) action_files = sorted(approved_dir.glob("*.json"))

View file

@ -1,6 +1,7 @@
import json import json
import os import os
import time import time
from datetime import datetime
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from pathlib import Path from pathlib import Path
@ -69,23 +70,33 @@ def current_recommendations():
def current_summary(): def current_summary():
summary = read_json_file(WORLD_DIR / "runtime-summary.json", default={}) summary = read_json_file(WORLD_DIR / "runtime-summary.json", default={})
if summary: if summary:
# Check for staleness # Check for staleness from the summary's own timestamp if available
mtime = os.path.getmtime(WORLD_DIR / "runtime-summary.json") # otherwise use file mtime
summary["last_update"] = mtime last_update_str = summary.get("last_update")
summary["stale"] = (time.time() - mtime) > 60 # Stale if older than 60s if last_update_str:
try:
# Assuming ISO format from observer.py
last_update = datetime.fromisoformat(last_update_str.replace('Z', '+00:00')).timestamp()
except Exception:
last_update = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
else:
last_update = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
summary["last_update"] = last_update
summary["stale"] = (time.time() - last_update) > 60 # Stale if older than 60s
return summary return summary
def current_events(): def current_events():
events = [] events = []
if EVENTS_DIR.exists(): if EVENTS_DIR.exists():
for f in EVENTS_DIR.glob("*.json"): for f in EVENTS_DIR.glob("**/*.json"):
data = read_json_file(f) data = read_json_file(f)
if data: if data:
# Add source file for traceability # Add source file for traceability
data["_source"] = f.name data["_source"] = f.name
events.append(data) events.append(data)
return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True) return sorted(events, key=lambda x: x.get("timestamp", ""), reverse=True)
def current_actions(): def current_actions():

View file

@ -66,6 +66,13 @@ class Supervisor:
logger.error(f"Failed to load {key} actual state: {e}") logger.error(f"Failed to load {key} actual state: {e}")
def reconcile(self): def reconcile(self):
# Update heartbeat
heartbeat_file = WORLD_DIR.parent / "state" / "supervisor.heartbeat"
try:
heartbeat_file.touch()
except Exception as e:
logger.error(f"Failed to touch heartbeat file: {e}")
self._load_desired_state() self._load_desired_state()
self._load_actual_state() self._load_actual_state()