"""Tests for incident lifecycle: auto-resolve, orphan detection, timestamp parsing.""" from __future__ import annotations import json import sys import time from pathlib import Path import pytest # Observer lives outside the control-plane package; add scripts/ to path. sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "scripts")) from observer.observer import Observer, _parse_ts, _atomic_write_json # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _make_observer(tmp_path: Path) -> Observer: """Return an Observer with all runtime paths redirected to tmp_path.""" import observer.observer as obs_mod world = tmp_path / "world" state = tmp_path / "state" events = tmp_path / "events" logs = tmp_path / "logs" repo = tmp_path / "repo" for d in (world, state, events, logs, repo / "inventory", repo / "hosts"): d.mkdir(parents=True, exist_ok=True) # Minimal topology so inventory isn't empty (avoids prune-guard early-return) (repo / "inventory" / "topology.yaml").write_text( "nodes:\n vps:\n roles: [control-plane]\n connectivity: {}\n" ) original_world = obs_mod.WORLD_DIR original_state = obs_mod.STATE_DIR original_events = obs_mod.EVENTS_DIR original_logs = obs_mod.LOGS_DIR original_inventory = obs_mod.INVENTORY_TOPOLOGY original_repo = obs_mod.REPO_ROOT obs_mod.WORLD_DIR = world obs_mod.STATE_DIR = state obs_mod.EVENTS_DIR = events obs_mod.LOGS_DIR = logs obs_mod.INVENTORY_TOPOLOGY = repo / "inventory" / "topology.yaml" obs_mod.REPO_ROOT = repo obs = Observer() # Restore module-level constants (monkeypatching at module level is sufficient # for the Observer instance which captures paths at construction time via globals) obs_mod.WORLD_DIR = original_world obs_mod.STATE_DIR = original_state obs_mod.EVENTS_DIR = original_events obs_mod.LOGS_DIR = original_logs obs_mod.INVENTORY_TOPOLOGY = original_inventory obs_mod.REPO_ROOT = original_repo 
return obs def _make_observer_simple(tmp_path: Path): """Return an Observer instance and patch its world_state in-place.""" import observer.observer as obs_mod world = tmp_path / "world" state = tmp_path / "state" events = tmp_path / "events" logs = tmp_path / "logs" repo = tmp_path / "repo" for d in (world, state, events, logs, repo / "inventory", repo / "hosts"): d.mkdir(parents=True, exist_ok=True) (repo / "inventory" / "topology.yaml").write_text( "nodes:\n vps:\n roles: [control-plane]\n connectivity: {}\n" ) # Patch before construction obs_mod.WORLD_DIR = world obs_mod.STATE_DIR = state obs_mod.EVENTS_DIR = events obs_mod.LOGS_DIR = logs obs_mod.INVENTORY_TOPOLOGY = repo / "inventory" / "topology.yaml" obs_mod.REPO_ROOT = repo obs = Observer() return obs # --------------------------------------------------------------------------- # 1. _parse_ts — timestamp normalisation # --------------------------------------------------------------------------- def test_parse_ts_int(): ts = int(time.time()) - 3600 assert abs(_parse_ts(ts) - ts) < 1 def test_parse_ts_float(): ts = time.time() - 100.5 assert abs(_parse_ts(ts) - ts) < 0.01 def test_parse_ts_iso_string(): # ISO format as emitted by events.py / stability-agent from datetime import datetime, timezone iso = "2026-06-01T00:03:22Z" expected = datetime(2026, 6, 1, 0, 3, 22, tzinfo=timezone.utc).timestamp() result = _parse_ts(iso) assert result > 0 assert isinstance(result, float) assert abs(result - expected) < 1 def test_parse_ts_none_returns_zero(): assert _parse_ts(None) == 0.0 def test_parse_ts_garbage_returns_zero(): assert _parse_ts("not-a-date") == 0.0 def test_parse_ts_zero_int(): assert _parse_ts(0) == 0.0 # --------------------------------------------------------------------------- # 2. 
# Lifecycle: service_healthy event resolves linked incident
# ---------------------------------------------------------------------------

def test_service_healthy_resolves_active_incident(tmp_path):
    """A service_healthy event marks the service healthy and resolves its incident."""
    observer = _make_observer_simple(tmp_path)
    incident_key = "inc-111-vps-outline"

    unhealthy_service = {
        "node": "vps",
        "service": "outline",
        "status": "unhealthy",
        "last_check": None,
        "incident_id": incident_key,
    }
    observer.world_state["services"]["vps/outline"] = unhealthy_service

    active_incident = {
        "id": incident_key,
        "node": "vps",
        "service": "outline",
        "status": "active",
        "trigger_type": "service_unhealthy",
        "started_at": int(time.time()) - 600,
        "last_occurrence": int(time.time()) - 600,
        "occurrence_count": 1,
        "events": [],
    }
    observer.world_state["incidents"][incident_key] = active_incident

    healthy_event = {
        "type": "service_healthy",
        "node": "vps",
        "service": "outline",
        "severity": "info",
        "timestamp": int(time.time()),
        "payload": {},
    }
    observer.process_event(healthy_event)

    # Re-read the world state after the event rather than reusing the dicts above.
    service_after = observer.world_state["services"]["vps/outline"]
    assert service_after["status"] == "healthy"
    assert service_after["incident_id"] is None
    assert observer.world_state["incidents"][incident_key]["status"] == "resolved"


def test_service_healthy_does_not_resolve_other_incidents(tmp_path):
    """service_healthy for service A must not touch incident for service B."""
    observer = _make_observer_simple(tmp_path)
    other_incident = "inc-222-vps-supervisor"

    supervisor_service = {
        "node": "vps",
        "service": "supervisor",
        "status": "unhealthy",
        "last_check": None,
        "incident_id": other_incident,
    }
    observer.world_state["services"]["vps/supervisor"] = supervisor_service

    supervisor_incident = {
        "id": other_incident,
        "status": "active",
        "last_occurrence": int(time.time()) - 300,
    }
    observer.world_state["incidents"][other_incident] = supervisor_incident

    # The event targets "outline", not the "supervisor" service set up above.
    outline_event = {
        "type": "service_healthy",
        "node": "vps",
        "service": "outline",
        "severity": "info",
        "timestamp": int(time.time()),
        "payload": {},
    }
    observer.process_event(outline_event)

    assert observer.world_state["incidents"][other_incident]["status"] == "active"


# ---------------------------------------------------------------------------
# 3.
# _prune_stale_world: healthy-service-linked incident → immediate resolve
# ---------------------------------------------------------------------------

def test_prune_resolves_healthy_linked_incident(tmp_path):
    """If a service is healthy but still points at an active incident, resolve it."""
    observer = _make_observer_simple(tmp_path)
    incident_key = "inc-333-vps-outline"

    # The service is already healthy, yet its incident_id is still set.
    stale_link = {
        "node": "vps",
        "service": "outline",
        "status": "healthy",
        "last_check": None,
        "incident_id": incident_key,
    }
    observer.world_state["services"]["vps/outline"] = stale_link

    lingering_incident = {
        "id": incident_key,
        "status": "active",
        "started_at": int(time.time()) - 7200,
        "last_occurrence": int(time.time()) - 7200,
    }
    observer.world_state["incidents"][incident_key] = lingering_incident

    observer._prune_stale_world()

    assert observer.world_state["services"]["vps/outline"]["incident_id"] is None
    assert observer.world_state["incidents"][incident_key]["status"] == "resolved"


def test_prune_resolves_healthy_linked_incident_iso_timestamp(tmp_path):
    """Healthy-linked incident with ISO-string last_occurrence must still resolve."""
    observer = _make_observer_simple(tmp_path)
    incident_key = "inc-444-vps-outline"

    stale_link = {
        "node": "vps",
        "service": "outline",
        "status": "healthy",
        "last_check": None,
        "incident_id": incident_key,
    }
    observer.world_state["services"]["vps/outline"] = stale_link

    # last_occurrence is an ISO string, as written by events.py.
    iso_incident = {
        "id": incident_key,
        "status": "active",
        "last_occurrence": "2026-06-01T00:03:22Z",
    }
    observer.world_state["incidents"][incident_key] = iso_incident

    # The prune pass must cope with the string timestamp (no TypeError).
    observer._prune_stale_world()

    assert observer.world_state["incidents"][incident_key]["status"] == "resolved"


# ---------------------------------------------------------------------------
# 4.
# _prune_stale_world: orphaned incident (no service link) → resolve after 5 min
# ---------------------------------------------------------------------------

def _iso_utc_seconds_ago(seconds: float) -> str:
    """Return a ``YYYY-MM-DDTHH:MM:SSZ`` UTC timestamp *seconds* before now.

    Keeps the ISO-string tests independent of the wall-clock date on which
    the suite runs (a hard-coded date is only "old" after that date).
    """
    from datetime import datetime, timezone

    moment = datetime.fromtimestamp(time.time() - seconds, tz=timezone.utc)
    return moment.strftime("%Y-%m-%dT%H:%M:%SZ")


def test_prune_resolves_orphaned_incident_old_enough(tmp_path):
    """Orphaned active incident older than 5 min must be auto-resolved."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-555-vps-supervisor"
    # No service entry links to this incident
    obs.world_state["incidents"][inc_id] = {
        "id": inc_id,
        "status": "active",
        "node": "vps",
        "service": "supervisor",
        "last_occurrence": int(time.time()) - 400,  # ~6.7 min ago
    }

    obs._prune_stale_world()

    assert obs.world_state["incidents"][inc_id]["status"] == "resolved"


def test_prune_does_not_resolve_orphaned_incident_too_recent(tmp_path):
    """Orphaned incident younger than 5 min must stay active (guard against race)."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-666-vps-supervisor"
    obs.world_state["incidents"][inc_id] = {
        "id": inc_id,
        "status": "active",
        "last_occurrence": int(time.time()) - 60,  # 1 min ago — within guard
    }

    obs._prune_stale_world()

    assert obs.world_state["incidents"][inc_id]["status"] == "active"


def test_prune_resolves_orphaned_incident_iso_timestamp(tmp_path):
    """Orphaned incident with ISO-string last_occurrence must resolve correctly."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-777-vps-outline"
    # ISO timestamp one day in the past, computed relative to now so the test
    # does not depend on the calendar date it runs on.
    obs.world_state["incidents"][inc_id] = {
        "id": inc_id,
        "status": "active",
        "last_occurrence": _iso_utc_seconds_ago(86400),
    }

    obs._prune_stale_world()  # must not raise TypeError

    assert obs.world_state["incidents"][inc_id]["status"] == "resolved"


def test_prune_does_not_touch_linked_incident(tmp_path):
    """An active incident still linked from a non-healthy service must stay active."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-888-vps-outline"
    obs.world_state["services"]["vps/outline"] = {
        "node": "vps",
        "service": "outline",
        "status": "unhealthy",  # <-- still unhealthy
        "last_check": None,
        "incident_id": inc_id,
    }
    obs.world_state["incidents"][inc_id] = {
        "id": inc_id,
        "status": "active",
        "last_occurrence": int(time.time()) - 3600,
    }

    obs._prune_stale_world()

    assert obs.world_state["incidents"][inc_id]["status"] == "active"


# ---------------------------------------------------------------------------
# 5. 7-day stale incident prune with ISO resolved_at
# ---------------------------------------------------------------------------

def test_prune_removes_old_resolved_incident_iso_resolved_at(tmp_path):
    """Resolved incidents with ISO-string resolved_at older than 7 days must be pruned."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-old-resolved"
    obs.world_state["incidents"][inc_id] = {
        "id": inc_id,
        "status": "resolved",
        # 30 days ago: comfortably past the 7-day retention window.
        "resolved_at": _iso_utc_seconds_ago(30 * 86400),
    }

    obs._prune_stale_world()

    assert inc_id not in obs.world_state["incidents"]


def test_prune_keeps_recently_resolved_incident(tmp_path):
    """Resolved incidents within 7 days must be kept."""
    obs = _make_observer_simple(tmp_path)
    inc_id = "inc-recent-resolved"
    obs.world_state["incidents"][inc_id] = {
        "id": inc_id,
        "status": "resolved",
        "resolved_at": time.time() - 86400,  # 1 day ago
    }

    obs._prune_stale_world()

    assert inc_id in obs.world_state["incidents"]