operator-ui: /events bez ladowania calego katalogu + daemon threads; epoch z regexa (fix chelsty-infra)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Oskar Kapala 2026-06-01 16:34:52 +02:00
parent 43c5d45353
commit 495741e7ac

View file

@ -1,5 +1,7 @@
import heapq
import json import json
import os import os
import re
import time import time
from datetime import datetime from datetime import datetime
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
@ -14,6 +16,8 @@ CONFIG_DIR = Path(os.getenv("HOMELAB_CONFIG_ROOT", "/opt/homelab/config"))
STATIC_DIR = Path(__file__).parent STATIC_DIR = Path(__file__).parent
_EVENT_TS_RE = re.compile(r"-(\d{9,11})-")
DEFAULT_CONFIG = { DEFAULT_CONFIG = {
"operator_mode": "approval", "operator_mode": "approval",
"auto_mode": True, "auto_mode": True,
@ -48,6 +52,7 @@ def save_config(config):
EVENTS_MAX_AGE_HOURS = int(os.getenv("EVENTS_MAX_AGE_HOURS", "24")) EVENTS_MAX_AGE_HOURS = int(os.getenv("EVENTS_MAX_AGE_HOURS", "24"))
EVENTS_MAX_COUNT = int(os.getenv("EVENTS_MAX_COUNT", "200"))
def _node_health(info): def _node_health(info):
@ -183,22 +188,43 @@ def current_summary():
return summary return summary
def _event_file_ts(p: Path) -> int:
    """Return the epoch timestamp embedded in an event file name.

    Event files are expected to be named
    ``evt-<node>-<ts>-<type>-<svc>.json``. The value is taken from the
    first match of ``_EVENT_TS_RE`` against the file stem, so the file
    itself is never opened.

    Args:
        p: Path of the event file; only its stem is inspected.

    Returns:
        The matched digits as an int, or 0 when the stem contains no
        match (such files therefore rank as the oldest).
    """
    found = _EVENT_TS_RE.search(p.stem)
    if found is None:
        return 0
    return int(found.group(1))
def current_events():
    """Return at most EVENTS_MAX_COUNT recent events, newest first.

    Event files are named ``evt-<node>-<epoch>-<type>-<svc>.json`` and the
    events directory can hold hundreds of thousands of them (one file per
    event, written by node-agent). Loading every file on each request caused
    catastrophic RSS growth: 242 k files were observed to cost roughly
    420 MB of Python objects plus about 100 MB of JSON serialisation.

    To avoid that, the glob iterator is fed straight into
    ``heapq.nlargest``, ranked by the epoch parsed from each file *name*, so
    no file is opened during selection and only the EVENTS_MAX_COUNT
    best-ranked paths are kept. Only those files are then read.

    Note that selection uses the file-name epoch, while the age filter
    (EVENTS_MAX_AGE_HOURS) and the final ordering use the ``timestamp``
    field stored inside each event.

    Returns:
        A list of event dicts, each tagged with ``_source`` (the file name),
        sorted by ``timestamp`` descending. Empty when EVENTS_DIR is missing.
    """
    if not EVENTS_DIR.exists():
        return []

    oldest_allowed = time.time() - EVENTS_MAX_AGE_HOURS * 3600

    # Rank paths by file-name epoch; the glob generator is consumed lazily.
    newest_paths = heapq.nlargest(
        EVENTS_MAX_COUNT,
        EVENTS_DIR.glob("**/*.json"),
        key=_event_file_ts,
    )

    recent = []
    for path in newest_paths:
        payload = read_json_file(path)
        if not payload:
            # Unreadable or empty event file: nothing to report.
            continue
        if not ((payload.get("timestamp") or 0) > oldest_allowed):
            # Older than the retention window (or missing a timestamp).
            continue
        payload["_source"] = path.name
        recent.append(payload)

    recent.sort(key=lambda evt: evt.get("timestamp") or 0, reverse=True)
    return recent
@ -373,14 +399,22 @@ class Handler(BaseHTTPRequestHandler):
return return
class OperatorHTTPServer(ThreadingHTTPServer):
    """Threading HTTP server that handles each request on a daemon thread.

    With daemon threads, finished request threads do not accumulate in the
    server's internal ``_threads`` list: ThreadingMixIn only tracks
    non-daemon threads (so it can join them at ``server_close``). Keeping
    that list empty prevents unbounded growth of dead Thread objects.

    NOTE(review): the stdlib ThreadingHTTPServer appears to set
    ``daemon_threads = True`` already; confirm for the deployed Python
    version. The explicit attribute here pins the behaviour either way.
    """

    daemon_threads = True
if __name__ == "__main__":
    # Create every runtime directory up front so request handlers can
    # assume they exist.
    for directory in (STATE_DIR, EVENTS_DIR, WORLD_DIR, ACTIONS_DIR, CONFIG_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # One sub-directory per action lifecycle state.
    action_states = ("pending", "approved", "running", "completed", "failed", "rejected")
    for state in action_states:
        (ACTIONS_DIR / state).mkdir(parents=True, exist_ok=True)

    # Listen on all interfaces; the port is taken from $PORT (default 8080).
    port = int(os.getenv("PORT", "8080"))
    print(f"Operator Control Plane starting on 0.0.0.0:{port}")
    server = OperatorHTTPServer(("0.0.0.0", port), Handler)
    server.serve_forever()