Fix ghost service keys from hash-prefixed Docker container names

node-agent: use com.docker.compose.service label as canonical name
- Add _canonical_container_name() method: prefers compose label,
  falls back to hash-prefix-stripped c.name
- Replace bare c.name usage in check_containers()
- Skip 'created'-state containers (Docker stale-state artifacts)

observer: prune hash-prefixed ghost keys in _prune_stale_world()
- Each reconcile cycle removes service keys matching <node>/<12hex>_<name>
- Acts as a safety net for entries already in services.json and for any future slippage

control-plane/docker-compose.yml already has explicit container_name on
all four services — no change needed there.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Oskar Kapala 2026-05-27 15:41:13 +02:00
parent 28e9534765
commit b40b832159
2 changed files with 51 additions and 1 deletions

View file

@ -157,6 +157,22 @@ class Observer:
logger.info(f"Pruning stale service from world state: {k}")
del self.world_state["services"][k]
# Prune ghost service keys whose service-name portion is a hash-prefixed
# Docker stale-state artifact (e.g. "9e36297651e7_control-plane-observer").
# These are created when node-agent incorrectly uses c.name instead of the
# compose label, and accumulate on every container rebuild.
# Pattern: <node>/<12hexchars>_<real-name>
ghost_svcs = [
k for k in list(self.world_state["services"].keys())
if len(k.split("/", 1)) == 2
and len(k.split("/", 1)[1]) > 13
and k.split("/", 1)[1][12] == "_"
and all(ch in "0123456789abcdef" for ch in k.split("/", 1)[1][:12])
]
for k in ghost_svcs:
logger.info(f"Pruning ghost (hash-prefixed) service key from world state: {k}")
del self.world_state["services"][k]
# Remove resolved incidents older than 7 days.
now = time.time()
stale_incidents = [

View file

@ -274,6 +274,34 @@ class NodeAgent:
# Docker container health
# ------------------------------------------------------------------
def _canonical_container_name(self, c) -> str:
    """Return a stable, human-readable service name for a container.

    Priority:
      1. The ``com.docker.compose.service`` label: always the clean
         compose-file key (e.g. "mosquitto", "zigbee2mqtt"), immune to the
         hash-prefix corruption Docker uses for stale project-state
         tracking entries.
      2. ``c.name`` with the hash prefix stripped: fallback for
         non-Compose containers. When a container is removed outside of
         compose and then recreated, Docker stores the old container
         record as "<12-char-hex-id>_<original-name>". ``c.name`` returns
         that corrupted form, so the prefix is stripped here.

    Using ``c.name`` directly is the source of ghost service keys like
    "vps/9e36297651e7_control-plane-observer" that accumulate in
    services.json every time containers are rebuilt.

    Args:
        c: Container object exposing ``attrs`` (the inspect dict) and
            ``name``.

    Returns:
        The whitespace-stripped compose service label when it is non-empty;
        otherwise ``c.name`` with a leading "<12 lowercase hex chars>_"
        prefix removed if one is present, else ``c.name`` unchanged.
    """
    # Any level of the inspect data may be absent *or* explicitly None;
    # ``dict.get(key, {})`` only covers the absent case, so use ``or {}``.
    attrs = c.attrs or {}
    config = attrs.get("Config") or {}
    labels = config.get("Labels") or {}
    compose_svc = (labels.get("com.docker.compose.service") or "").strip()
    if compose_svc:
        return compose_svc

    # Strip Docker-internal stale-state prefix: "<12-char hex>_<real-name>".
    # Slicing never raises on short names; ``rest`` must be non-empty so a
    # bare "<hex>_" is left untouched rather than collapsing to "".
    name = c.name
    prefix, sep, rest = name[:12], name[12:13], name[13:]
    if sep == "_" and rest and all(ch in "0123456789abcdef" for ch in prefix):
        return rest
    return name
def check_containers(self):
if not self.docker_client:
return
@ -285,7 +313,7 @@ class NodeAgent:
for c in containers:
try:
name = c.name
name = self._canonical_container_name(c)
status = c.status
host_config = c.attrs.get("HostConfig", {})
restart_policy = host_config.get("RestartPolicy", {}).get("Name", "")
@ -293,6 +321,12 @@ class NodeAgent:
.get("Health", {})
.get("Status", ""))
# Skip containers in "created" state — these are Docker Compose
# internal tracking artifacts (never started, often hash-prefixed)
# that appear when a container is rebuilt outside of compose.
if status == "created":
continue
# Only track containers with a restart policy (long-running services)
is_managed = restart_policy in ("unless-stopped", "always", "on-failure")
if not is_managed: