diff --git a/scripts/observer/observer.py b/scripts/observer/observer.py index 00f8bc0..ab2432f 100644 --- a/scripts/observer/observer.py +++ b/scripts/observer/observer.py @@ -157,6 +157,22 @@ class Observer: logger.info(f"Pruning stale service from world state: {k}") del self.world_state["services"][k] + # Prune ghost service keys whose service-name portion is a hash-prefixed + # Docker stale-state artifact (e.g. "9e36297651e7_control-plane-observer"). + # These are created when node-agent incorrectly uses c.name instead of the + # compose label, and accumulate on every container rebuild. + # Pattern: /<12hexchars>_ + ghost_svcs = [ + k for k in list(self.world_state["services"].keys()) + if len(k.split("/", 1)) == 2 + and len(k.split("/", 1)[1]) > 13 + and k.split("/", 1)[1][12] == "_" + and all(ch in "0123456789abcdef" for ch in k.split("/", 1)[1][:12]) + ] + for k in ghost_svcs: + logger.info(f"Pruning ghost (hash-prefixed) service key from world state: {k}") + del self.world_state["services"][k] + # Remove resolved incidents older than 7 days. now = time.time() stale_incidents = [ diff --git a/services/node-agent/src/node_agent.py b/services/node-agent/src/node_agent.py index d49c006..2cbb3e2 100644 --- a/services/node-agent/src/node_agent.py +++ b/services/node-agent/src/node_agent.py @@ -274,6 +274,34 @@ class NodeAgent: # Docker container health # ------------------------------------------------------------------ + def _canonical_container_name(self, c) -> str: + """Return a stable, human-readable service name for a container. + + Priority: + 1. com.docker.compose.service label — always the clean compose-file key + (e.g. "mosquitto", "zigbee2mqtt"), immune to the hash-prefix corruption + Docker uses for stale project-state tracking entries. + 2. c.name with hash prefix stripped — fallback for non-Compose containers. + When a container is removed outside of compose and then recreated, Docker + stores the old container record as "<12-char-hex-id>_". + c.name returns that corrupted form; we strip the prefix here. + + Using c.name directly is the source of ghost service keys like + "vps/9e36297651e7_control-plane-observer" that accumulate in services.json + every time containers are rebuilt. + """ + labels = c.attrs.get("Config", {}).get("Labels", {}) or {} + compose_svc = labels.get("com.docker.compose.service", "").strip() + if compose_svc: + return compose_svc + # Strip Docker-internal stale-state prefix: "<12-char hex>_" + name = c.name + if (len(name) > 13 + and name[12] == "_" + and all(ch in "0123456789abcdef" for ch in name[:12])): + return name[13:] + return name + def check_containers(self): if not self.docker_client: return @@ -285,7 +313,7 @@ class NodeAgent: for c in containers: try: - name = c.name + name = self._canonical_container_name(c) status = c.status host_config = c.attrs.get("HostConfig", {}) restart_policy = host_config.get("RestartPolicy", {}).get("Name", "") @@ -293,6 +321,12 @@ class NodeAgent: .get("Health", {}) .get("Status", "")) + # Skip containers in "created" state — these are Docker Compose + # internal tracking artifacts (never started, often hash-prefixed) + # that appear when a container is rebuilt outside of compose. + if status == "created": + continue + # Only track containers with a restart policy (long-running services) is_managed = restart_policy in ("unless-stopped", "always", "on-failure") if not is_managed: