Recovery from bad merge of task/observer-poison-quarantine (c255a02)
which carried false deletes from a stale branch base. Re-applies only
the genuine observer changes on top of correct master state.
When an event file fails to parse (malformed JSON, truncated, corrupted),
the observer previously kept retrying it on every cycle while the node's
checkpoint stayed pinned, so all subsequent good events for that node were lost.
Now, on the first parse failure the file is moved with an atomic os.replace to
STATE_DIR/observer_failed_events/<node>/, with collision handling. The checkpoint
advances and downstream events flow. Move failures are logged but don't crash the loop.
Complementary to the atomic_write_json fix on state files; this addresses
the same race-pattern on event files instead.
Regression test asserts: bad event quarantined to failed_events dir,
removed from hot path, subsequent good event processed (node online),
checkpoint moves to good event.
380 lines
13 KiB
Python
380 lines
13 KiB
Python
"""Tests for incident lifecycle: auto-resolve, orphan detection, timestamp parsing."""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# Observer lives outside the control-plane package; add scripts/ to path.
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "scripts"))
|
|
from observer.observer import Observer, _parse_ts, _atomic_write_json
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _make_observer(tmp_path: Path) -> Observer:
    """Return an Observer built while the module's paths point into ``tmp_path``.

    The path constants of ``observer.observer`` are overridden only for the
    duration of ``Observer()`` construction. They are restored afterwards even
    when construction raises, so a failing test cannot leak temporary paths
    into the tests that run after it.

    Args:
        tmp_path: Scratch directory (typically pytest's ``tmp_path`` fixture)
            under which the world/state/events/logs/repo trees are created.

    Returns:
        The freshly constructed ``Observer``.
    """
    import observer.observer as obs_mod

    world = tmp_path / "world"
    state = tmp_path / "state"
    events = tmp_path / "events"
    logs = tmp_path / "logs"
    repo = tmp_path / "repo"

    for d in (world, state, events, logs, repo / "inventory", repo / "hosts"):
        d.mkdir(parents=True, exist_ok=True)

    # Minimal topology so inventory isn't empty (avoids prune-guard early-return)
    topology = repo / "inventory" / "topology.yaml"
    topology.write_text(
        "nodes:\n vps:\n roles: [control-plane]\n connectivity: {}\n"
    )

    overrides = {
        "WORLD_DIR": world,
        "STATE_DIR": state,
        "EVENTS_DIR": events,
        "LOGS_DIR": logs,
        "INVENTORY_TOPOLOGY": topology,
        "REPO_ROOT": repo,
        "FAILED_EVENTS_DIR": state / "observer_failed_events",
    }
    # Snapshot every original value before touching anything, so a missing
    # attribute fails fast without leaving the module half-patched.
    originals = {name: getattr(obs_mod, name) for name in overrides}

    try:
        for name, value in overrides.items():
            setattr(obs_mod, name, value)
        # Per the original author's note, patching the module globals is
        # sufficient because the Observer picks its paths up at construction
        # time (NOTE(review): confirm against Observer.__init__).
        return Observer()
    finally:
        # Runs on success and on failure alike: always hand the module back
        # in its original state.
        for name, value in originals.items():
            setattr(obs_mod, name, value)
|
|
|
|
|
|
def _make_observer_simple(tmp_path: Path):
    """Redirect the observer module's path globals into ``tmp_path`` and build an Observer.

    Unlike ``_make_observer``, the overridden module globals are NOT restored:
    they stay pointed at ``tmp_path`` after this function returns, so callers
    can keep reading them from ``observer.observer``.
    """
    import observer.observer as obs_mod

    world_dir = tmp_path / "world"
    state_dir = tmp_path / "state"
    events_dir = tmp_path / "events"
    logs_dir = tmp_path / "logs"
    repo_root = tmp_path / "repo"
    inventory_dir = repo_root / "inventory"

    needed_dirs = (world_dir, state_dir, events_dir, logs_dir, inventory_dir, repo_root / "hosts")
    for directory in needed_dirs:
        directory.mkdir(parents=True, exist_ok=True)

    topology_file = inventory_dir / "topology.yaml"
    topology_file.write_text(
        "nodes:\n vps:\n roles: [control-plane]\n connectivity: {}\n"
    )

    # Patch before construction
    patches = (
        ("WORLD_DIR", world_dir),
        ("STATE_DIR", state_dir),
        ("EVENTS_DIR", events_dir),
        ("LOGS_DIR", logs_dir),
        ("INVENTORY_TOPOLOGY", topology_file),
        ("REPO_ROOT", repo_root),
        ("FAILED_EVENTS_DIR", state_dir / "observer_failed_events"),
    )
    for attr_name, attr_value in patches:
        setattr(obs_mod, attr_name, attr_value)

    return Observer()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 1. _parse_ts — timestamp normalisation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_parse_ts_int():
    """An integer epoch value comes back numerically unchanged (within one second)."""
    one_hour_ago = int(time.time()) - 3600
    parsed = _parse_ts(one_hour_ago)
    assert abs(parsed - one_hour_ago) < 1
|
|
|
|
|
|
def test_parse_ts_float():
    """A float epoch value comes back numerically unchanged (within 10 ms)."""
    fractional_ts = time.time() - 100.5
    parsed = _parse_ts(fractional_ts)
    assert abs(parsed - fractional_ts) < 0.01
|
|
|
|
|
|
def test_parse_ts_iso_string():
    """A ``...Z`` ISO-8601 string parses to the matching UTC epoch as a positive float."""
    from datetime import datetime, timezone

    # ISO format as emitted by events.py / stability-agent
    iso_text = "2026-06-01T00:03:22Z"
    want = datetime(2026, 6, 1, 0, 3, 22, tzinfo=timezone.utc).timestamp()

    got = _parse_ts(iso_text)

    assert got > 0
    assert isinstance(got, float)
    assert abs(got - want) < 1
|
|
|
|
|
|
def test_parse_ts_none_returns_zero():
    """``None`` must be parsed as 0.0."""
    parsed = _parse_ts(None)
    assert parsed == 0.0
|
|
|
|
|
|
def test_parse_ts_garbage_returns_zero():
    """A string that is not a timestamp must be parsed as 0.0."""
    parsed = _parse_ts("not-a-date")
    assert parsed == 0.0
|
|
|
|
|
|
def test_parse_ts_zero_int():
    """Integer zero must be parsed as 0.0."""
    parsed = _parse_ts(0)
    assert parsed == 0.0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 2. Lifecycle: service_healthy event resolves linked incident
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_service_healthy_resolves_active_incident(tmp_path):
    """A service_healthy event marks the service healthy, unlinks and resolves its incident."""
    obs = _make_observer_simple(tmp_path)
    now = int(time.time())
    service_key = "vps/outline"
    inc_id = "inc-111-vps-outline"

    # Seed an unhealthy service that points at an active incident.
    unhealthy_service = {
        "node": "vps",
        "service": "outline",
        "status": "unhealthy",
        "last_check": None,
        "incident_id": inc_id,
    }
    active_incident = {
        "id": inc_id,
        "node": "vps",
        "service": "outline",
        "status": "active",
        "trigger_type": "service_unhealthy",
        "started_at": now - 600,
        "last_occurrence": now - 600,
        "occurrence_count": 1,
        "events": [],
    }
    obs.world_state["services"][service_key] = unhealthy_service
    obs.world_state["incidents"][inc_id] = active_incident

    healthy_event = {
        "type": "service_healthy",
        "node": "vps",
        "service": "outline",
        "severity": "info",
        "timestamp": int(time.time()),
        "payload": {},
    }
    obs.process_event(healthy_event)

    # Re-read from world_state after the event in case entries were replaced.
    service_after = obs.world_state["services"][service_key]
    assert service_after["status"] == "healthy"
    assert service_after["incident_id"] is None
    assert obs.world_state["incidents"][inc_id]["status"] == "resolved"
|
|
|
|
|
|
def test_service_healthy_does_not_resolve_other_incidents(tmp_path):
    """A service_healthy event for one service leaves another service's incident active."""
    obs = _make_observer_simple(tmp_path)
    inc_b = "inc-222-vps-supervisor"

    # Service B (supervisor) is unhealthy and owns the incident.
    supervisor_entry = {
        "node": "vps",
        "service": "supervisor",
        "status": "unhealthy",
        "last_check": None,
        "incident_id": inc_b,
    }
    supervisor_incident = {
        "id": inc_b,
        "status": "active",
        "last_occurrence": int(time.time()) - 300,
    }
    obs.world_state["services"]["vps/supervisor"] = supervisor_entry
    obs.world_state["incidents"][inc_b] = supervisor_incident

    # The healthy report is for service A (outline), a different service.
    outline_healthy = {
        "type": "service_healthy",
        "node": "vps",
        "service": "outline",
        "severity": "info",
        "timestamp": int(time.time()),
        "payload": {},
    }
    obs.process_event(outline_healthy)

    status_after = obs.world_state["incidents"][inc_b]["status"]
    assert status_after == "active"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 3. _prune_stale_world: healthy-service-linked incident → immediate resolve
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_prune_resolves_healthy_linked_incident(tmp_path):
    """Pruning resolves an active incident that a healthy service still points at."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-333-vps-outline"
    two_hours_ago = int(time.time()) - 7200

    # The service already reports healthy, yet its incident link was never cleared.
    stale_link = {
        "node": "vps",
        "service": "outline",
        "status": "healthy",
        "last_check": None,
        "incident_id": inc_id,
    }
    lingering_incident = {
        "id": inc_id,
        "status": "active",
        "started_at": two_hours_ago,
        "last_occurrence": two_hours_ago,
    }
    obs.world_state["services"]["vps/outline"] = stale_link
    obs.world_state["incidents"][inc_id] = lingering_incident

    obs._prune_stale_world()

    # Re-read from world_state after pruning in case containers were rebuilt.
    assert obs.world_state["services"]["vps/outline"]["incident_id"] is None
    assert obs.world_state["incidents"][inc_id]["status"] == "resolved"
|
|
|
|
|
|
def test_prune_resolves_healthy_linked_incident_iso_timestamp(tmp_path):
    """A healthy-linked incident resolves even when last_occurrence is an ISO string."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-444-vps-outline"

    healthy_service = {
        "node": "vps",
        "service": "outline",
        "status": "healthy",
        "last_check": None,
        "incident_id": inc_id,
    }
    iso_incident = {
        "id": inc_id,
        "status": "active",
        # ISO string from events.py
        "last_occurrence": "2026-06-01T00:03:22Z",
    }
    obs.world_state["services"]["vps/outline"] = healthy_service
    obs.world_state["incidents"][inc_id] = iso_incident

    # must not raise TypeError
    obs._prune_stale_world()

    status_after = obs.world_state["incidents"][inc_id]["status"]
    assert status_after == "resolved"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 4. _prune_stale_world: orphaned incident (no service link) → resolve after 5 min
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_prune_resolves_orphaned_incident_old_enough(tmp_path):
    """An orphaned active incident older than 5 minutes is auto-resolved by pruning."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-555-vps-supervisor"

    # Deliberately no entry under world_state["services"] links to this incident.
    orphan = {
        "id": inc_id,
        "status": "active",
        "node": "vps",
        "service": "supervisor",
        # 400 s, roughly 6.7 minutes ago
        "last_occurrence": int(time.time()) - 400,
    }
    obs.world_state["incidents"][inc_id] = orphan

    obs._prune_stale_world()

    status_after = obs.world_state["incidents"][inc_id]["status"]
    assert status_after == "resolved"
|
|
|
|
|
|
def test_prune_does_not_resolve_orphaned_incident_too_recent(tmp_path):
    """An orphaned incident younger than 5 minutes stays active (guard against race)."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-666-vps-supervisor"

    fresh_orphan = {
        "id": inc_id,
        "status": "active",
        # one minute ago, still inside the guard window
        "last_occurrence": int(time.time()) - 60,
    }
    obs.world_state["incidents"][inc_id] = fresh_orphan

    obs._prune_stale_world()

    status_after = obs.world_state["incidents"][inc_id]["status"]
    assert status_after == "active"
|
|
|
|
|
|
def test_prune_resolves_orphaned_incident_iso_timestamp(tmp_path):
    """Orphaned incident with ISO-string last_occurrence must resolve correctly.

    The ISO timestamp is derived from the current clock (one day back) instead
    of being a fixed calendar date, so the incident is always older than the
    5-minute orphan guard no matter when the suite runs.
    """
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-777-vps-outline"

    # Same "YYYY-MM-DDTHH:MM:SSZ" shape as the fixed literal used elsewhere in
    # this file, but always well in the past relative to "now".
    stale_iso = time.strftime(
        "%Y-%m-%dT%H:%M:%SZ", time.gmtime(time.time() - 86400)
    )
    obs.world_state["incidents"][inc_id] = {
        "id": inc_id,
        "status": "active",
        "last_occurrence": stale_iso,
    }

    obs._prune_stale_world()  # must not raise TypeError

    assert obs.world_state["incidents"][inc_id]["status"] == "resolved"
|
|
|
|
|
|
def test_prune_does_not_touch_linked_incident(tmp_path):
    """Pruning leaves an active incident alone while a non-healthy service links to it."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-888-vps-outline"

    # The owning service is still unhealthy, so the link is legitimate.
    failing_service = {
        "node": "vps",
        "service": "outline",
        "status": "unhealthy",
        "last_check": None,
        "incident_id": inc_id,
    }
    ongoing_incident = {
        "id": inc_id,
        "status": "active",
        "last_occurrence": int(time.time()) - 3600,
    }
    obs.world_state["services"]["vps/outline"] = failing_service
    obs.world_state["incidents"][inc_id] = ongoing_incident

    obs._prune_stale_world()

    status_after = obs.world_state["incidents"][inc_id]["status"]
    assert status_after == "active"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# 5. 7-day stale incident prune with ISO resolved_at
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_prune_removes_old_resolved_incident_iso_resolved_at(tmp_path):
    """Resolved incidents with ISO-string resolved_at older than 7 days must be pruned.

    ``resolved_at`` is computed as 30 days before the current time instead of
    being a fixed calendar date, so the incident is always past the 7-day
    retention window regardless of when the suite runs.
    """
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-old-resolved"

    # "YYYY-MM-DDTHH:MM:SSZ" in UTC, 30 days back: comfortably > 7 days old.
    old_resolved_at = time.strftime(
        "%Y-%m-%dT%H:%M:%SZ", time.gmtime(time.time() - 30 * 86400)
    )
    obs.world_state["incidents"][inc_id] = {
        "id": inc_id,
        "status": "resolved",
        "resolved_at": old_resolved_at,
    }

    obs._prune_stale_world()

    assert inc_id not in obs.world_state["incidents"]
|
|
|
|
|
|
def test_prune_keeps_recently_resolved_incident(tmp_path):
    """A resolved incident that is only one day old survives pruning (7-day retention)."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-recent-resolved"

    one_day_ago = time.time() - 86400
    recent_record = {
        "id": inc_id,
        "status": "resolved",
        "resolved_at": one_day_ago,
    }
    obs.world_state["incidents"][inc_id] = recent_record

    obs._prune_stale_world()

    # Re-read the container after pruning in case it was rebuilt.
    remaining = obs.world_state["incidents"]
    assert inc_id in remaining
|
|
|
|
|
|
def test_run_once_quarantines_bad_event_and_processes_next_for_same_node(tmp_path):
    """A malformed event file must not wedge a node forever.

    After one ``run_once`` cycle the unparseable file sits in the failed-events
    directory, is gone from the events directory, the following good event has
    been applied (node online), and the node checkpoint names the good event.
    """
    obs = _make_observer_simple(tmp_path)

    import observer.observer as obs_mod

    # Swap the topology for one that knows the "lustro" node, then reload it.
    obs_mod.INVENTORY_TOPOLOGY.write_text(
        "nodes:\n"
        " lustro:\n"
        " roles: [edge]\n"
        " connectivity: {}\n"
    )
    obs.inventory = obs._load_inventory()

    node_events_dir = obs_mod.EVENTS_DIR / "lustro"
    node_events_dir.mkdir(parents=True, exist_ok=True)

    # First event on disk: not valid JSON.
    bad_event = node_events_dir / "evt-lustro-1-bad.json"
    bad_event.write_text("{not-json")

    # Second event for the same node: well-formed.
    good_payload = {
        "id": "evt-lustro-2-good",
        "timestamp": int(time.time()),
        "date": "2026-06-10T00:00:00Z",
        "type": "node_health",
        "severity": "info",
        "node": "lustro",
        "service": "",
        "message": "ok",
        "payload": {"disk_pct": 1, "mem_pct": 2, "cpu_pct": 3},
    }
    good_event = node_events_dir / "evt-lustro-2-good.json"
    good_event.write_text(json.dumps(good_payload))

    obs.run_once()

    quarantined = obs_mod.FAILED_EVENTS_DIR / "lustro" / bad_event.name
    assert quarantined.exists()
    assert not bad_event.exists()
    assert obs.world_state["nodes"]["lustro"]["status"] == "online"
    assert obs.node_checkpoints["lustro"] == str(good_event)
|