fix(observer): per-node-directory checkpoints replace single global checkpoint
The old mechanism tracked a single 'last_processed_file' and used sorted
filename order to find new events. Remote nodes ship events into
subdirectories (events/piha/, events/chelsty-infra/) that sort
alphabetically BEFORE the VPS directory (events/vps/). Once the
checkpoint pointed to a vps/ file, all piha/ and chelsty-infra/ events
were silently skipped forever.
New mechanism:
- node_checkpoints: {node_dir: last_processed_path}
- Each node directory has its own independent cursor
- New events = files whose path > that node's checkpoint
- Backward-compatible: old 'last_processed_file' is migrated by extracting
the node dir from the path on first load
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
a5a3e223dc
commit
f4a8db93e4
|
|
@ -24,7 +24,10 @@ logger = logging.getLogger("observer")
|
||||||
|
|
||||||
class Observer:
|
class Observer:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.last_processed_file = None
|
# Per-node-directory checkpoint: {"vps": "last/file/path", "piha": "last/file/path"}
|
||||||
|
# Replaces the old single last_processed_file which silently skipped event dirs
|
||||||
|
# that sort alphabetically before the checkpoint (e.g. piha/ < vps/).
|
||||||
|
self.node_checkpoints: dict = {}
|
||||||
self.world_state = {
|
self.world_state = {
|
||||||
"nodes": {},
|
"nodes": {},
|
||||||
"services": {},
|
"services": {},
|
||||||
|
|
@ -83,11 +86,22 @@ class Observer:
|
||||||
try:
|
try:
|
||||||
with open(OBSERVER_STATE_FILE, "r") as f:
|
with open(OBSERVER_STATE_FILE, "r") as f:
|
||||||
checkpoint = json.load(f)
|
checkpoint = json.load(f)
|
||||||
self.last_processed_file = checkpoint.get("last_processed_file")
|
|
||||||
# We might want to persist partial world state,
|
if "node_checkpoints" in checkpoint:
|
||||||
# but for now we rebuild from events (idempotent)
|
# New format: per-directory checkpoints.
|
||||||
# or we can load existing world state files.
|
self.node_checkpoints = checkpoint["node_checkpoints"]
|
||||||
self._load_world_from_disk()
|
elif "last_processed_file" in checkpoint:
|
||||||
|
# Migrate old single-file checkpoint: extract node dir from path.
|
||||||
|
old = checkpoint["last_processed_file"]
|
||||||
|
if old:
|
||||||
|
try:
|
||||||
|
node_dir = Path(old).relative_to(EVENTS_DIR).parts[0]
|
||||||
|
self.node_checkpoints = {node_dir: old}
|
||||||
|
logger.info(f"Migrated old checkpoint → node_checkpoints: {self.node_checkpoints}")
|
||||||
|
except Exception:
|
||||||
|
pass # Bad path — start fresh
|
||||||
|
|
||||||
|
self._load_world_from_disk()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to load checkpoint: {e}")
|
logger.error(f"Failed to load checkpoint: {e}")
|
||||||
|
|
||||||
|
|
@ -111,7 +125,7 @@ class Observer:
|
||||||
def _save_checkpoint(self):
|
def _save_checkpoint(self):
|
||||||
try:
|
try:
|
||||||
with open(OBSERVER_STATE_FILE, "w") as f:
|
with open(OBSERVER_STATE_FILE, "w") as f:
|
||||||
json.dump({"last_processed_file": self.last_processed_file}, f)
|
json.dump({"node_checkpoints": self.node_checkpoints}, f, indent=2)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to save checkpoint: {e}")
|
logger.error(f"Failed to save checkpoint: {e}")
|
||||||
|
|
||||||
|
|
@ -345,19 +359,21 @@ class Observer:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to touch heartbeat file: {e}")
|
logger.error(f"Failed to touch heartbeat file: {e}")
|
||||||
|
|
||||||
# Find all event files
|
# Collect all event files grouped by node directory.
|
||||||
event_files = sorted(glob.glob(str(EVENTS_DIR / "**" / "*.json"), recursive=True))
|
# Per-node checkpoints are compared within each directory independently,
|
||||||
|
# so late-arriving events from remote nodes (sorted earlier in the path)
|
||||||
|
# are never skipped just because another node's checkpoint is further ahead.
|
||||||
|
all_files = sorted(glob.glob(str(EVENTS_DIR / "**" / "*.json"), recursive=True))
|
||||||
|
|
||||||
new_files = []
|
new_files = []
|
||||||
if self.last_processed_file:
|
for file_path in all_files:
|
||||||
try:
|
try:
|
||||||
idx = event_files.index(self.last_processed_file)
|
node_dir = str(Path(file_path).relative_to(EVENTS_DIR).parts[0])
|
||||||
new_files = event_files[idx+1:]
|
except (IndexError, ValueError):
|
||||||
except ValueError:
|
node_dir = "__unknown__"
|
||||||
# If last_processed_file is gone or not in list, process all
|
last_for_node = self.node_checkpoints.get(node_dir, "")
|
||||||
new_files = event_files
|
if file_path > last_for_node:
|
||||||
else:
|
new_files.append((node_dir, file_path))
|
||||||
new_files = event_files
|
|
||||||
|
|
||||||
if not new_files:
|
if not new_files:
|
||||||
# Even if no new events, prune stale entries and refresh summary freshness.
|
# Even if no new events, prune stale entries and refresh summary freshness.
|
||||||
|
|
@ -365,13 +381,16 @@ class Observer:
|
||||||
self._save_world()
|
self._save_world()
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.info(f"Processing {len(new_files)} new events")
|
logger.info(f"Processing {len(new_files)} new events across "
|
||||||
for file_path in new_files:
|
f"{len({n for n, _ in new_files})} node(s)")
|
||||||
|
for node_dir, file_path in new_files:
|
||||||
try:
|
try:
|
||||||
with open(file_path, "r") as f:
|
with open(file_path, "r") as f:
|
||||||
event = json.load(f)
|
event = json.load(f)
|
||||||
self.process_event(event)
|
self.process_event(event)
|
||||||
self.last_processed_file = file_path
|
# Advance per-node checkpoint (only forward — no regression).
|
||||||
|
if file_path > self.node_checkpoints.get(node_dir, ""):
|
||||||
|
self.node_checkpoints[node_dir] = file_path
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error processing {file_path}: {e}")
|
logger.error(f"Error processing {file_path}: {e}")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue