fix(observer): atomic writes for world state files
All JSON state writes (services.json, nodes.json, incidents.json, deployments.json, runtime-summary.json, observer_checkpoint.json) now use _atomic_write_json: write to a .tmp sibling, fsync, then os.replace. This eliminates the truncated-write window that caused supervisors reading mid-write files to see empty/partial JSON. Also adds auto-resolution of phantom active incidents: if a service reports status=healthy and its incident's last_occurrence is >30 min old, the incident is resolved in _prune_stale_world. This clears false active incidents accumulated from previous race-condition reads. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f381023206
commit
ffb0608b9a
|
|
@ -7,6 +7,16 @@ import yaml
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def _atomic_write_json(path: Path, data) -> None:
|
||||||
|
"""Write JSON atomically: write to a sibling .tmp, fsync, then os.replace."""
|
||||||
|
tmp = path.with_suffix(".tmp")
|
||||||
|
with open(tmp, "w") as f:
|
||||||
|
json.dump(data, f, indent=2)
|
||||||
|
f.flush()
|
||||||
|
os.fsync(f.fileno())
|
||||||
|
os.replace(tmp, path)
|
||||||
|
|
||||||
# Constants and Paths
|
# Constants and Paths
|
||||||
RUNTIME_PATH = os.getenv("RUNTIME_PATH", "/opt/homelab")
|
RUNTIME_PATH = os.getenv("RUNTIME_PATH", "/opt/homelab")
|
||||||
EVENTS_DIR = Path(RUNTIME_PATH) / "events"
|
EVENTS_DIR = Path(RUNTIME_PATH) / "events"
|
||||||
|
|
@ -124,8 +134,7 @@ class Observer:
|
||||||
|
|
||||||
def _save_checkpoint(self):
|
def _save_checkpoint(self):
|
||||||
try:
|
try:
|
||||||
with open(OBSERVER_STATE_FILE, "w") as f:
|
_atomic_write_json(OBSERVER_STATE_FILE, {"node_checkpoints": self.node_checkpoints})
|
||||||
json.dump({"node_checkpoints": self.node_checkpoints}, f, indent=2)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to save checkpoint: {e}")
|
logger.error(f"Failed to save checkpoint: {e}")
|
||||||
|
|
||||||
|
|
@ -173,8 +182,30 @@ class Observer:
|
||||||
logger.info(f"Pruning ghost (hash-prefixed) service key from world state: {k}")
|
logger.info(f"Pruning ghost (hash-prefixed) service key from world state: {k}")
|
||||||
del self.world_state["services"][k]
|
del self.world_state["services"][k]
|
||||||
|
|
||||||
# Remove resolved incidents older than 7 days.
|
|
||||||
now = time.time()
|
now = time.time()
|
||||||
|
|
||||||
|
# Auto-resolve active incidents for services that are currently healthy
|
||||||
|
# and whose last_occurrence is older than 30 minutes. These are phantom
|
||||||
|
# incidents created by race-condition reads of truncated state files; they
|
||||||
|
# never receive a service_recovered event because the service was healthy
|
||||||
|
# all along.
|
||||||
|
for svc_key, svc in self.world_state["services"].items():
|
||||||
|
if svc.get("status") == "healthy":
|
||||||
|
inc_id = svc.get("incident_id")
|
||||||
|
if inc_id and inc_id in self.world_state["incidents"]:
|
||||||
|
inc = self.world_state["incidents"][inc_id]
|
||||||
|
last_occ = inc.get("last_occurrence") or 0
|
||||||
|
if (inc.get("status") == "active"
|
||||||
|
and (now - last_occ) > 1800):
|
||||||
|
logger.info(
|
||||||
|
f"Auto-resolving stale incident {inc_id} for {svc_key}: "
|
||||||
|
f"service healthy, last_occurrence >{int((now - last_occ) / 60)}min ago"
|
||||||
|
)
|
||||||
|
inc["status"] = "resolved"
|
||||||
|
inc["resolved_at"] = now
|
||||||
|
svc["incident_id"] = None
|
||||||
|
|
||||||
|
# Remove resolved incidents older than 7 days.
|
||||||
stale_incidents = [
|
stale_incidents = [
|
||||||
k for k, v in self.world_state["incidents"].items()
|
k for k, v in self.world_state["incidents"].items()
|
||||||
if v.get("status") == "resolved"
|
if v.get("status") == "resolved"
|
||||||
|
|
@ -202,13 +233,12 @@ class Observer:
|
||||||
"services.json": self.world_state["services"],
|
"services.json": self.world_state["services"],
|
||||||
"deployments.json": self.world_state["deployments"],
|
"deployments.json": self.world_state["deployments"],
|
||||||
"incidents.json": self.world_state["incidents"],
|
"incidents.json": self.world_state["incidents"],
|
||||||
"recommendations.json": [], # Placeholder to satisfy requirements
|
"recommendations.json": [],
|
||||||
"runtime-summary.json": self.world_state["summary"]
|
"runtime-summary.json": self.world_state["summary"]
|
||||||
}
|
}
|
||||||
for filename, data in files.items():
|
for filename, data in files.items():
|
||||||
try:
|
try:
|
||||||
with open(WORLD_DIR / filename, "w") as f:
|
_atomic_write_json(WORLD_DIR / filename, data)
|
||||||
json.dump(data, f, indent=2)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to save {filename}: {e}")
|
logger.error(f"Failed to save {filename}: {e}")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue