From 28e953476546409dc5136221003ad86cc2e70123 Mon Sep 17 00:00:00 2001 From: Oskar Kapala Date: Wed, 27 May 2026 15:20:19 +0200 Subject: [PATCH] observer: service_healthy resolves active incidents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit service_healthy is a positive health confirmation — if the service had an active incident (e.g. from earlier service_unhealthy events), that incident should be resolved when the service is confirmed healthy. Previously only service_recovered resolved incidents; service_healthy set status=healthy but left incidents open, keeping status='degraded'. Co-Authored-By: Claude Sonnet 4.6 --- scripts/observer/observer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/observer/observer.py b/scripts/observer/observer.py index 4165abb..00f8bc0 100644 --- a/scripts/observer/observer.py +++ b/scripts/observer/observer.py @@ -268,8 +268,11 @@ class Observer: # Positive confirmation from node-agent that a managed container # is running. This keeps services.json populated so the supervisor # can correctly detect drift (absent entry = never reported = unknown, - # not the same as confirmed missing). No incident resolution needed. + # not the same as confirmed missing). + # Also resolve any active incident — if a service that had been + # unhealthy/crashing is now confirmed healthy, the incident is over. self.world_state["services"][svc_key]["status"] = "healthy" + self._resolve_incident(svc_key, timestamp) elif etype in ["service_unhealthy", "healthcheck_failed"]: self.world_state["services"][svc_key]["status"] = "unhealthy" self._handle_incident(svc_key, event)