homelab-codex-ws/services/control-plane/tests/test_incident_lifecycle.py
Oskar Kapala c255a021d1 fix(observer): quarantine malformed event files to prevent processing wedge
Was: malformed event (bad JSON / truncated / corrupted bytes) wedged the
node's checkpoint forever — every cycle re-tried, logged, never advanced
past the bad file; all subsequent good events for that node lost.

Now: first parse failure -> atomic os.replace to STATE_DIR/observer_failed_events/<node>/
with collision handling. Checkpoint advances, downstream events flow.
Move failures themselves are logged but don't crash the loop.

Complementary to yesterday's atomic_write_json fix (state files);
this addresses the same race-pattern on event files instead.

Regression test asserts: bad event quarantined to failed_events dir,
removed from hot path, subsequent good event processed (node online),
checkpoint moves to good event.
2026-06-12 11:22:56 +02:00

380 lines
13 KiB
Python

"""Tests for incident lifecycle: auto-resolve, orphan detection, timestamp parsing."""
from __future__ import annotations
import json
import sys
import time
from pathlib import Path
import pytest
# Observer lives outside the control-plane package; add scripts/ to path.
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "scripts"))
from observer.observer import Observer, _parse_ts, _atomic_write_json
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_observer(tmp_path: Path) -> Observer:
    """Return an Observer built while the module's runtime paths point into tmp_path.

    The observer module's path constants are redirected to a scratch tree under
    ``tmp_path`` only for the duration of ``Observer()`` and are always put
    back afterwards -- including when construction raises -- so a failing
    test cannot leak patched globals into the tests that run after it.

    Args:
        tmp_path: Scratch directory, typically pytest's ``tmp_path`` fixture.

    Returns:
        The newly constructed Observer.
    """
    import observer.observer as obs_mod

    world = tmp_path / "world"
    state = tmp_path / "state"
    events = tmp_path / "events"
    logs = tmp_path / "logs"
    repo = tmp_path / "repo"
    for d in (world, state, events, logs, repo / "inventory", repo / "hosts"):
        d.mkdir(parents=True, exist_ok=True)

    # Minimal topology so inventory isn't empty (avoids prune-guard early-return)
    topology = repo / "inventory" / "topology.yaml"
    topology.write_text(
        "nodes:\n vps:\n roles: [control-plane]\n connectivity: {}\n"
    )

    overrides = {
        "WORLD_DIR": world,
        "STATE_DIR": state,
        "EVENTS_DIR": events,
        "LOGS_DIR": logs,
        "INVENTORY_TOPOLOGY": topology,
        "REPO_ROOT": repo,
        "FAILED_EVENTS_DIR": state / "observer_failed_events",
    }
    # Snapshot the real values first so the finally-clause can always undo the patch.
    originals = {name: getattr(obs_mod, name) for name in overrides}
    try:
        for name, value in overrides.items():
            setattr(obs_mod, name, value)
        # NOTE(review): this relies on Observer capturing the paths at
        # construction time -- confirm; anything that reads the module globals
        # later will see the original values again.
        return Observer()
    finally:
        for name, value in originals.items():
            setattr(obs_mod, name, value)
def _make_observer_simple(tmp_path: Path):
    """Build an Observer after pointing the observer module's paths at tmp_path.

    The module-level path constants are overwritten and NOT restored, so after
    this returns they still refer to the scratch tree under ``tmp_path``.
    """
    import observer.observer as obs_mod

    repo_root = tmp_path / "repo"
    inventory_dir = repo_root / "inventory"
    scratch = {name: tmp_path / name for name in ("world", "state", "events", "logs")}
    for directory in (*scratch.values(), inventory_dir, repo_root / "hosts"):
        directory.mkdir(parents=True, exist_ok=True)

    topology_file = inventory_dir / "topology.yaml"
    topology_file.write_text(
        "nodes:\n vps:\n roles: [control-plane]\n connectivity: {}\n"
    )

    # Redirect the module globals before the Observer is constructed.
    obs_mod.WORLD_DIR = scratch["world"]
    obs_mod.STATE_DIR = scratch["state"]
    obs_mod.EVENTS_DIR = scratch["events"]
    obs_mod.LOGS_DIR = scratch["logs"]
    obs_mod.INVENTORY_TOPOLOGY = topology_file
    obs_mod.REPO_ROOT = repo_root
    obs_mod.FAILED_EVENTS_DIR = scratch["state"] / "observer_failed_events"
    return Observer()
# ---------------------------------------------------------------------------
# 1. _parse_ts — timestamp normalisation
# ---------------------------------------------------------------------------
def test_parse_ts_int():
    """An integer epoch value is returned (to within a second) unchanged."""
    an_hour_ago = int(time.time()) - 3600
    drift = _parse_ts(an_hour_ago) - an_hour_ago
    assert abs(drift) < 1
def test_parse_ts_float():
    """A float epoch value is returned with sub-second precision preserved."""
    recent = time.time() - 100.5
    drift = _parse_ts(recent) - recent
    assert abs(drift) < 0.01
def test_parse_ts_iso_string():
    """A ``Z``-suffixed ISO-8601 string is converted to the matching UTC epoch float."""
    from datetime import datetime, timezone

    # ISO format as emitted by events.py / stability-agent
    want = datetime(2026, 6, 1, 0, 3, 22, tzinfo=timezone.utc).timestamp()
    got = _parse_ts("2026-06-01T00:03:22Z")
    assert got > 0
    assert isinstance(got, float)
    assert abs(got - want) < 1
def test_parse_ts_none_returns_zero():
    """``None`` is normalised to 0.0."""
    parsed = _parse_ts(None)
    assert parsed == 0.0
def test_parse_ts_garbage_returns_zero():
    """A string that is not a timestamp is normalised to 0.0."""
    parsed = _parse_ts("not-a-date")
    assert parsed == 0.0
def test_parse_ts_zero_int():
    """Integer zero is returned as 0.0."""
    parsed = _parse_ts(0)
    assert parsed == 0.0
# ---------------------------------------------------------------------------
# 2. Lifecycle: service_healthy event resolves linked incident
# ---------------------------------------------------------------------------
def test_service_healthy_resolves_active_incident(tmp_path):
    """A service_healthy event marks the service healthy and resolves its linked incident."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-111-vps-outline"
    ten_minutes_ago = int(time.time()) - 600

    # Seed: an unhealthy service pointing at an active incident.
    unhealthy_service = {
        "node": "vps", "service": "outline",
        "status": "unhealthy", "last_check": None,
        "incident_id": inc_id,
    }
    open_incident = {
        "id": inc_id, "node": "vps", "service": "outline",
        "status": "active", "trigger_type": "service_unhealthy",
        "started_at": ten_minutes_ago,
        "last_occurrence": ten_minutes_ago,
        "occurrence_count": 1, "events": [],
    }
    obs.world_state["services"]["vps/outline"] = unhealthy_service
    obs.world_state["incidents"][inc_id] = open_incident

    recovery_event = {
        "type": "service_healthy",
        "node": "vps",
        "service": "outline",
        "severity": "info",
        "timestamp": int(time.time()),
        "payload": {},
    }
    obs.process_event(recovery_event)

    service_after = obs.world_state["services"]["vps/outline"]
    assert service_after["status"] == "healthy"
    assert service_after["incident_id"] is None
    assert obs.world_state["incidents"][inc_id]["status"] == "resolved"
def test_service_healthy_does_not_resolve_other_incidents(tmp_path):
    """A recovery event for one service leaves another service's incident active."""
    obs = _make_observer_simple(tmp_path)
    inc_b = "inc-222-vps-supervisor"

    supervisor_entry = {
        "node": "vps", "service": "supervisor",
        "status": "unhealthy", "last_check": None,
        "incident_id": inc_b,
    }
    supervisor_incident = {
        "id": inc_b, "status": "active",
        "last_occurrence": int(time.time()) - 300,
    }
    obs.world_state["services"]["vps/supervisor"] = supervisor_entry
    obs.world_state["incidents"][inc_b] = supervisor_incident

    unrelated_recovery = {
        "type": "service_healthy",
        "node": "vps",
        "service": "outline",  # different service
        "severity": "info",
        "timestamp": int(time.time()),
        "payload": {},
    }
    obs.process_event(unrelated_recovery)

    status_after = obs.world_state["incidents"][inc_b]["status"]
    assert status_after == "active"
# ---------------------------------------------------------------------------
# 3. _prune_stale_world: healthy-service-linked incident → immediate resolve
# ---------------------------------------------------------------------------
def test_prune_resolves_healthy_linked_incident(tmp_path):
    """Pruning resolves an active incident that a healthy service still references."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-333-vps-outline"
    two_hours_ago = int(time.time()) - 7200

    healthy_but_linked = {
        "node": "vps", "service": "outline",
        "status": "healthy",  # healthy, yet incident_id is still set
        "last_check": None,
        "incident_id": inc_id,
    }
    lingering_incident = {
        "id": inc_id, "status": "active",
        "started_at": two_hours_ago,
        "last_occurrence": two_hours_ago,
    }
    obs.world_state["services"]["vps/outline"] = healthy_but_linked
    obs.world_state["incidents"][inc_id] = lingering_incident

    obs._prune_stale_world()

    assert obs.world_state["services"]["vps/outline"]["incident_id"] is None
    assert obs.world_state["incidents"][inc_id]["status"] == "resolved"
def test_prune_resolves_healthy_linked_incident_iso_timestamp(tmp_path):
    """An ISO-string last_occurrence must not stop a healthy-linked incident resolving."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-444-vps-outline"

    linked_service = {
        "node": "vps", "service": "outline",
        "status": "healthy", "last_check": None, "incident_id": inc_id,
    }
    iso_incident = {
        "id": inc_id, "status": "active",
        "last_occurrence": "2026-06-01T00:03:22Z",  # ISO string from events.py
    }
    obs.world_state["services"]["vps/outline"] = linked_service
    obs.world_state["incidents"][inc_id] = iso_incident

    # The call itself is part of the check: it must not raise TypeError.
    obs._prune_stale_world()

    status_after = obs.world_state["incidents"][inc_id]["status"]
    assert status_after == "resolved"
# ---------------------------------------------------------------------------
# 4. _prune_stale_world: orphaned incident (no service link) → resolve after 5 min
# ---------------------------------------------------------------------------
def test_prune_resolves_orphaned_incident_old_enough(tmp_path):
    """An active incident with no linking service, older than 5 min, gets resolved."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-555-vps-supervisor"
    last_seen = int(time.time()) - 400  # 6.7 min ago

    # Deliberately no entry under "services" that points at this incident.
    orphan = {
        "id": inc_id, "status": "active", "node": "vps", "service": "supervisor",
        "last_occurrence": last_seen,
    }
    obs.world_state["incidents"][inc_id] = orphan

    obs._prune_stale_world()

    status_after = obs.world_state["incidents"][inc_id]["status"]
    assert status_after == "resolved"
def test_prune_does_not_resolve_orphaned_incident_too_recent(tmp_path):
    """An orphaned incident inside the 5-minute guard window is left active."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-666-vps-supervisor"
    last_seen = int(time.time()) - 60  # 1 min ago, within guard

    fresh_orphan = {
        "id": inc_id, "status": "active",
        "last_occurrence": last_seen,
    }
    obs.world_state["incidents"][inc_id] = fresh_orphan

    obs._prune_stale_world()

    status_after = obs.world_state["incidents"][inc_id]["status"]
    assert status_after == "active"
def test_prune_resolves_orphaned_incident_iso_timestamp(tmp_path):
    """Orphaned incident with ISO-string last_occurrence must resolve correctly.

    The ISO timestamp is derived from the current clock instead of being a
    fixed calendar date, so the incident is older than the 5-minute orphan
    guard no matter when (or under which clock) the suite is run.
    """
    from datetime import datetime, timezone

    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-777-vps-outline"

    # One hour ago, rendered in the same "YYYY-MM-DDTHH:MM:SSZ" shape the
    # other ISO fixtures in this file use.
    an_hour_ago = datetime.fromtimestamp(time.time() - 3600, tz=timezone.utc)
    stale_iso = an_hour_ago.strftime("%Y-%m-%dT%H:%M:%SZ")

    obs.world_state["incidents"][inc_id] = {
        "id": inc_id, "status": "active",
        "last_occurrence": stale_iso,
    }
    obs._prune_stale_world()  # must not raise TypeError
    assert obs.world_state["incidents"][inc_id]["status"] == "resolved"
def test_prune_does_not_touch_linked_incident(tmp_path):
    """Pruning leaves an active incident alone while an unhealthy service still links to it."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-888-vps-outline"

    still_failing = {
        "node": "vps", "service": "outline",
        "status": "unhealthy",  # the service has not recovered
        "last_check": None,
        "incident_id": inc_id,
    }
    ongoing_incident = {
        "id": inc_id, "status": "active",
        "last_occurrence": int(time.time()) - 3600,
    }
    obs.world_state["services"]["vps/outline"] = still_failing
    obs.world_state["incidents"][inc_id] = ongoing_incident

    obs._prune_stale_world()

    status_after = obs.world_state["incidents"][inc_id]["status"]
    assert status_after == "active"
# ---------------------------------------------------------------------------
# 5. 7-day stale incident prune with ISO resolved_at
# ---------------------------------------------------------------------------
def test_prune_removes_old_resolved_incident_iso_resolved_at(tmp_path):
    """Resolved incidents with ISO-string resolved_at older than 7 days must be pruned.

    ``resolved_at`` is computed relative to the current clock rather than
    hard-coded, so it is always past the 7-day retention window regardless of
    the date on which the suite runs.
    """
    from datetime import datetime, timezone

    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-old-resolved"

    # 30 days ago, comfortably beyond the 7-day cut-off, in the
    # "YYYY-MM-DDTHH:MM:SSZ" shape used by the other ISO fixtures here.
    thirty_days_ago = datetime.fromtimestamp(
        time.time() - 30 * 86400, tz=timezone.utc
    )
    resolved_iso = thirty_days_ago.strftime("%Y-%m-%dT%H:%M:%SZ")

    obs.world_state["incidents"][inc_id] = {
        "id": inc_id, "status": "resolved",
        "resolved_at": resolved_iso,
    }
    obs._prune_stale_world()
    assert inc_id not in obs.world_state["incidents"]
def test_prune_keeps_recently_resolved_incident(tmp_path):
    """An incident resolved less than 7 days ago survives pruning."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-recent-resolved"
    one_day_ago = time.time() - 86400

    recent = {
        "id": inc_id, "status": "resolved",
        "resolved_at": one_day_ago,
    }
    obs.world_state["incidents"][inc_id] = recent

    obs._prune_stale_world()

    remaining = obs.world_state["incidents"]
    assert inc_id in remaining
def test_run_once_quarantines_bad_event_and_processes_next_for_same_node(tmp_path):
    """One unparseable event file is moved aside and the node's next event is still handled."""
    obs = _make_observer_simple(tmp_path)
    import observer.observer as obs_mod

    # Swap the topology for one that declares the node under test, then reload it.
    obs_mod.INVENTORY_TOPOLOGY.write_text(
        "nodes:\n"
        " lustro:\n"
        " roles: [edge]\n"
        " connectivity: {}\n"
    )
    obs.inventory = obs._load_inventory()

    node_events_dir = obs_mod.EVENTS_DIR / "lustro"
    node_events_dir.mkdir(parents=True, exist_ok=True)

    # First file: not valid JSON.
    malformed = node_events_dir / "evt-lustro-1-bad.json"
    malformed.write_text("{not-json")

    # Second file: a well-formed node_health event for the same node.
    valid = node_events_dir / "evt-lustro-2-good.json"
    health_event = {
        "id": "evt-lustro-2-good",
        "timestamp": int(time.time()),
        "date": "2026-06-10T00:00:00Z",
        "type": "node_health",
        "severity": "info",
        "node": "lustro",
        "service": "",
        "message": "ok",
        "payload": {"disk_pct": 1, "mem_pct": 2, "cpu_pct": 3},
    }
    valid.write_text(json.dumps(health_event))

    obs.run_once()

    # The bad file now lives in the failed-events area, not in the events dir.
    moved_to = obs_mod.FAILED_EVENTS_DIR / "lustro" / malformed.name
    assert moved_to.exists()
    assert not malformed.exists()
    # The good event was applied and the checkpoint points at it.
    assert obs.world_state["nodes"]["lustro"]["status"] == "online"
    assert obs.node_checkpoints["lustro"] == str(valid)