Add heartbeat updates and improve health checks in control-plane components
This commit is contained in:
parent
f4e6871d76
commit
533b8e846d
|
|
@ -31,7 +31,7 @@ class Observer:
|
|||
"deployments": {},
|
||||
"incidents": {},
|
||||
"summary": {
|
||||
"last_update": None,
|
||||
"last_update": datetime.now(timezone.utc).isoformat(),
|
||||
"status": "initializing",
|
||||
"active_incidents_count": 0
|
||||
}
|
||||
|
|
@ -132,6 +132,7 @@ class Observer:
|
|||
"services.json": self.world_state["services"],
|
||||
"deployments.json": self.world_state["deployments"],
|
||||
"incidents.json": self.world_state["incidents"],
|
||||
"recommendations.json": [], # Placeholder to satisfy requirements
|
||||
"runtime-summary.json": self.world_state["summary"]
|
||||
}
|
||||
for filename, data in files.items():
|
||||
|
|
@ -263,8 +264,15 @@ class Observer:
|
|||
self.world_state["incidents"][incident_id]["last_error"] = payload["error"]
|
||||
|
||||
def run_once(self):
|
||||
# Update heartbeat
|
||||
heartbeat_file = STATE_DIR / "observer.heartbeat"
|
||||
try:
|
||||
heartbeat_file.touch()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to touch heartbeat file: {e}")
|
||||
|
||||
# Find all event files
|
||||
event_files = sorted(glob.glob(str(EVENTS_DIR / "*" / "*" / "*.json")))
|
||||
event_files = sorted(glob.glob(str(EVENTS_DIR / "**" / "*.json"), recursive=True))
|
||||
|
||||
new_files = []
|
||||
if self.last_processed_file:
|
||||
|
|
@ -278,6 +286,8 @@ class Observer:
|
|||
new_files = event_files
|
||||
|
||||
if not new_files:
|
||||
# Even if no new events, we update freshness of summary
|
||||
self._save_world()
|
||||
return
|
||||
|
||||
logger.info(f"Processing {len(new_files)} new events")
|
||||
|
|
|
|||
|
|
@ -2,9 +2,6 @@ FROM python:3.11-slim
|
|||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
|
||||
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/summary')"
|
||||
|
||||
RUN pip install --no-cache-dir pyyaml
|
||||
|
||||
# Create homelab user
|
||||
|
|
|
|||
|
|
@ -27,6 +27,12 @@ services:
|
|||
environment:
|
||||
- REPO_ROOT=/repo
|
||||
- RUNTIME_PATH=/opt/homelab
|
||||
healthcheck:
|
||||
test: ["CMD", "test", "-f", "/opt/homelab/state/observer.heartbeat"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 5s
|
||||
|
||||
supervisor:
|
||||
build: .
|
||||
|
|
@ -40,6 +46,12 @@ services:
|
|||
environment:
|
||||
- REPO_ROOT=/repo
|
||||
- RUNTIME_PATH=/opt/homelab
|
||||
healthcheck:
|
||||
test: ["CMD", "test", "-f", "/opt/homelab/state/supervisor.heartbeat"]
|
||||
interval: 60s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
|
||||
executor:
|
||||
build: .
|
||||
|
|
@ -53,3 +65,9 @@ services:
|
|||
environment:
|
||||
- REPO_ROOT=/repo
|
||||
- RUNTIME_PATH=/opt/homelab
|
||||
healthcheck:
|
||||
test: ["CMD", "test", "-f", "/opt/homelab/state/executor.heartbeat"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 5s
|
||||
|
|
|
|||
|
|
@ -23,6 +23,13 @@ class Executor:
|
|||
(ACTIONS_DIR / s).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def process_actions(self):
|
||||
# Update heartbeat
|
||||
heartbeat_file = ACTIONS_DIR.parent / "state" / "executor.heartbeat"
|
||||
try:
|
||||
heartbeat_file.touch()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to touch heartbeat file: {e}")
|
||||
|
||||
approved_dir = ACTIONS_DIR / "approved"
|
||||
action_files = sorted(approved_dir.glob("*.json"))
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import json
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
||||
from pathlib import Path
|
||||
|
||||
|
|
@ -69,23 +70,33 @@ def current_recommendations():
|
|||
def current_summary():
|
||||
summary = read_json_file(WORLD_DIR / "runtime-summary.json", default={})
|
||||
if summary:
|
||||
# Check for staleness
|
||||
mtime = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
|
||||
summary["last_update"] = mtime
|
||||
summary["stale"] = (time.time() - mtime) > 60 # Stale if older than 60s
|
||||
# Check for staleness from the summary's own timestamp if available
|
||||
# otherwise use file mtime
|
||||
last_update_str = summary.get("last_update")
|
||||
if last_update_str:
|
||||
try:
|
||||
# Assuming ISO format from observer.py
|
||||
last_update = datetime.fromisoformat(last_update_str.replace('Z', '+00:00')).timestamp()
|
||||
except Exception:
|
||||
last_update = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
|
||||
else:
|
||||
last_update = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
|
||||
|
||||
summary["last_update"] = last_update
|
||||
summary["stale"] = (time.time() - last_update) > 60 # Stale if older than 60s
|
||||
return summary
|
||||
|
||||
|
||||
def current_events():
|
||||
events = []
|
||||
if EVENTS_DIR.exists():
|
||||
for f in EVENTS_DIR.glob("*.json"):
|
||||
for f in EVENTS_DIR.glob("**/*.json"):
|
||||
data = read_json_file(f)
|
||||
if data:
|
||||
# Add source file for traceability
|
||||
data["_source"] = f.name
|
||||
events.append(data)
|
||||
return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True)
|
||||
return sorted(events, key=lambda x: x.get("timestamp", ""), reverse=True)
|
||||
|
||||
|
||||
def current_actions():
|
||||
|
|
|
|||
|
|
@ -66,6 +66,13 @@ class Supervisor:
|
|||
logger.error(f"Failed to load {key} actual state: {e}")
|
||||
|
||||
def reconcile(self):
|
||||
# Update heartbeat
|
||||
heartbeat_file = WORLD_DIR.parent / "state" / "supervisor.heartbeat"
|
||||
try:
|
||||
heartbeat_file.touch()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to touch heartbeat file: {e}")
|
||||
|
||||
self._load_desired_state()
|
||||
self._load_actual_state()
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue