Add heartbeat updates and improve health checks in control-plane components
This commit is contained in:
parent
f4e6871d76
commit
533b8e846d
|
|
@ -31,7 +31,7 @@ class Observer:
|
||||||
"deployments": {},
|
"deployments": {},
|
||||||
"incidents": {},
|
"incidents": {},
|
||||||
"summary": {
|
"summary": {
|
||||||
"last_update": None,
|
"last_update": datetime.now(timezone.utc).isoformat(),
|
||||||
"status": "initializing",
|
"status": "initializing",
|
||||||
"active_incidents_count": 0
|
"active_incidents_count": 0
|
||||||
}
|
}
|
||||||
|
|
@ -132,6 +132,7 @@ class Observer:
|
||||||
"services.json": self.world_state["services"],
|
"services.json": self.world_state["services"],
|
||||||
"deployments.json": self.world_state["deployments"],
|
"deployments.json": self.world_state["deployments"],
|
||||||
"incidents.json": self.world_state["incidents"],
|
"incidents.json": self.world_state["incidents"],
|
||||||
|
"recommendations.json": [], # Placeholder to satisfy requirements
|
||||||
"runtime-summary.json": self.world_state["summary"]
|
"runtime-summary.json": self.world_state["summary"]
|
||||||
}
|
}
|
||||||
for filename, data in files.items():
|
for filename, data in files.items():
|
||||||
|
|
@ -263,8 +264,15 @@ class Observer:
|
||||||
self.world_state["incidents"][incident_id]["last_error"] = payload["error"]
|
self.world_state["incidents"][incident_id]["last_error"] = payload["error"]
|
||||||
|
|
||||||
def run_once(self):
|
def run_once(self):
|
||||||
|
# Update heartbeat
|
||||||
|
heartbeat_file = STATE_DIR / "observer.heartbeat"
|
||||||
|
try:
|
||||||
|
heartbeat_file.touch()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to touch heartbeat file: {e}")
|
||||||
|
|
||||||
# Find all event files
|
# Find all event files
|
||||||
event_files = sorted(glob.glob(str(EVENTS_DIR / "*" / "*" / "*.json")))
|
event_files = sorted(glob.glob(str(EVENTS_DIR / "**" / "*.json"), recursive=True))
|
||||||
|
|
||||||
new_files = []
|
new_files = []
|
||||||
if self.last_processed_file:
|
if self.last_processed_file:
|
||||||
|
|
@ -278,6 +286,8 @@ class Observer:
|
||||||
new_files = event_files
|
new_files = event_files
|
||||||
|
|
||||||
if not new_files:
|
if not new_files:
|
||||||
|
# Even if no new events, we update freshness of summary
|
||||||
|
self._save_world()
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.info(f"Processing {len(new_files)} new events")
|
logger.info(f"Processing {len(new_files)} new events")
|
||||||
|
|
|
||||||
|
|
@ -2,9 +2,6 @@ FROM python:3.11-slim
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
|
|
||||||
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/summary')"
|
|
||||||
|
|
||||||
RUN pip install --no-cache-dir pyyaml
|
RUN pip install --no-cache-dir pyyaml
|
||||||
|
|
||||||
# Create homelab user
|
# Create homelab user
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,12 @@ services:
|
||||||
environment:
|
environment:
|
||||||
- REPO_ROOT=/repo
|
- REPO_ROOT=/repo
|
||||||
- RUNTIME_PATH=/opt/homelab
|
- RUNTIME_PATH=/opt/homelab
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "test", "-f", "/opt/homelab/state/observer.heartbeat"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
start_period: 5s
|
||||||
|
|
||||||
supervisor:
|
supervisor:
|
||||||
build: .
|
build: .
|
||||||
|
|
@ -40,6 +46,12 @@ services:
|
||||||
environment:
|
environment:
|
||||||
- REPO_ROOT=/repo
|
- REPO_ROOT=/repo
|
||||||
- RUNTIME_PATH=/opt/homelab
|
- RUNTIME_PATH=/opt/homelab
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "test", "-f", "/opt/homelab/state/supervisor.heartbeat"]
|
||||||
|
interval: 60s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
start_period: 10s
|
||||||
|
|
||||||
executor:
|
executor:
|
||||||
build: .
|
build: .
|
||||||
|
|
@ -53,3 +65,9 @@ services:
|
||||||
environment:
|
environment:
|
||||||
- REPO_ROOT=/repo
|
- REPO_ROOT=/repo
|
||||||
- RUNTIME_PATH=/opt/homelab
|
- RUNTIME_PATH=/opt/homelab
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "test", "-f", "/opt/homelab/state/executor.heartbeat"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
start_period: 5s
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,13 @@ class Executor:
|
||||||
(ACTIONS_DIR / s).mkdir(parents=True, exist_ok=True)
|
(ACTIONS_DIR / s).mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
def process_actions(self):
|
def process_actions(self):
|
||||||
|
# Update heartbeat
|
||||||
|
heartbeat_file = ACTIONS_DIR.parent / "state" / "executor.heartbeat"
|
||||||
|
try:
|
||||||
|
heartbeat_file.touch()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to touch heartbeat file: {e}")
|
||||||
|
|
||||||
approved_dir = ACTIONS_DIR / "approved"
|
approved_dir = ACTIONS_DIR / "approved"
|
||||||
action_files = sorted(approved_dir.glob("*.json"))
|
action_files = sorted(approved_dir.glob("*.json"))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
from datetime import datetime
|
||||||
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
@ -69,23 +70,33 @@ def current_recommendations():
|
||||||
def current_summary():
|
def current_summary():
|
||||||
summary = read_json_file(WORLD_DIR / "runtime-summary.json", default={})
|
summary = read_json_file(WORLD_DIR / "runtime-summary.json", default={})
|
||||||
if summary:
|
if summary:
|
||||||
# Check for staleness
|
# Check for staleness from the summary's own timestamp if available
|
||||||
mtime = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
|
# otherwise use file mtime
|
||||||
summary["last_update"] = mtime
|
last_update_str = summary.get("last_update")
|
||||||
summary["stale"] = (time.time() - mtime) > 60 # Stale if older than 60s
|
if last_update_str:
|
||||||
|
try:
|
||||||
|
# Assuming ISO format from observer.py
|
||||||
|
last_update = datetime.fromisoformat(last_update_str.replace('Z', '+00:00')).timestamp()
|
||||||
|
except Exception:
|
||||||
|
last_update = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
|
||||||
|
else:
|
||||||
|
last_update = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
|
||||||
|
|
||||||
|
summary["last_update"] = last_update
|
||||||
|
summary["stale"] = (time.time() - last_update) > 60 # Stale if older than 60s
|
||||||
return summary
|
return summary
|
||||||
|
|
||||||
|
|
||||||
def current_events():
|
def current_events():
|
||||||
events = []
|
events = []
|
||||||
if EVENTS_DIR.exists():
|
if EVENTS_DIR.exists():
|
||||||
for f in EVENTS_DIR.glob("*.json"):
|
for f in EVENTS_DIR.glob("**/*.json"):
|
||||||
data = read_json_file(f)
|
data = read_json_file(f)
|
||||||
if data:
|
if data:
|
||||||
# Add source file for traceability
|
# Add source file for traceability
|
||||||
data["_source"] = f.name
|
data["_source"] = f.name
|
||||||
events.append(data)
|
events.append(data)
|
||||||
return sorted(events, key=lambda x: x.get("timestamp", 0), reverse=True)
|
return sorted(events, key=lambda x: x.get("timestamp", ""), reverse=True)
|
||||||
|
|
||||||
|
|
||||||
def current_actions():
|
def current_actions():
|
||||||
|
|
|
||||||
|
|
@ -66,6 +66,13 @@ class Supervisor:
|
||||||
logger.error(f"Failed to load {key} actual state: {e}")
|
logger.error(f"Failed to load {key} actual state: {e}")
|
||||||
|
|
||||||
def reconcile(self):
|
def reconcile(self):
|
||||||
|
# Update heartbeat
|
||||||
|
heartbeat_file = WORLD_DIR.parent / "state" / "supervisor.heartbeat"
|
||||||
|
try:
|
||||||
|
heartbeat_file.touch()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to touch heartbeat file: {e}")
|
||||||
|
|
||||||
self._load_desired_state()
|
self._load_desired_state()
|
||||||
self._load_actual_state()
|
self._load_actual_state()
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue