From 28e953476546409dc5136221003ad86cc2e70123 Mon Sep 17 00:00:00 2001
From: Oskar Kapala <oskar.kapala@gmail.com>
Date: Wed, 27 May 2026 15:20:19 +0200
Subject: [PATCH] observer: service_healthy resolves active incidents
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

service_healthy is a positive health confirmation — if the service had
an active incident (e.g. from earlier service_unhealthy events), that
incident should be resolved when the service is confirmed healthy.

Previously only service_recovered resolved incidents; service_healthy
set the service's status to healthy but left any active incident open,
and that still-open incident kept status='degraded'.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 scripts/observer/observer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/observer/observer.py b/scripts/observer/observer.py
index 4165abb..00f8bc0 100644
--- a/scripts/observer/observer.py
+++ b/scripts/observer/observer.py
@@ -268,8 +268,11 @@ class Observer:
                 # Positive confirmation from node-agent that a managed container
                 # is running. This keeps services.json populated so the supervisor
                 # can correctly detect drift (absent entry = never reported = unknown,
-                # not the same as confirmed missing). No incident resolution needed.
+                # not the same as confirmed missing).
+                # Also resolve any active incident — if a service that had been
+                # unhealthy/crashing is now confirmed healthy, the incident is over.
                 self.world_state["services"][svc_key]["status"] = "healthy"
+                self._resolve_incident(svc_key, timestamp)
             elif etype in ["service_unhealthy", "healthcheck_failed"]:
                 self.world_state["services"][svc_key]["status"] = "unhealthy"
                 self._handle_incident(svc_key, event)