Fix ghost service keys from hash-prefixed Docker container names
node-agent: use the com.docker.compose.service label as the canonical name
- Add _canonical_container_name() method: prefers the compose label, falls back to c.name with the hash prefix stripped
- Replace bare c.name usage in check_containers()
- Skip 'created'-state containers (Docker stale-state artifacts)

observer: prune hash-prefixed ghost keys in _prune_stale_world()
- Each reconcile cycle removes service keys matching <node>/<12hex>_<name>
- Acts as a safety net for entries already in services.json and for future slippage

control-plane/docker-compose.yml already has an explicit container_name on all four services — no change needed there.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
28e9534765
commit
b40b832159
|
|
@ -157,6 +157,22 @@ class Observer:
|
||||||
logger.info(f"Pruning stale service from world state: {k}")
|
logger.info(f"Pruning stale service from world state: {k}")
|
||||||
del self.world_state["services"][k]
|
del self.world_state["services"][k]
|
||||||
|
|
||||||
|
# Prune ghost service keys whose service-name portion is a hash-prefixed
|
||||||
|
# Docker stale-state artifact (e.g. "9e36297651e7_control-plane-observer").
|
||||||
|
# These are created when node-agent incorrectly uses c.name instead of the
|
||||||
|
# compose label, and accumulate on every container rebuild.
|
||||||
|
# Pattern: <node>/<12hexchars>_<real-name>
|
||||||
|
ghost_svcs = [
|
||||||
|
k for k in list(self.world_state["services"].keys())
|
||||||
|
if len(k.split("/", 1)) == 2
|
||||||
|
and len(k.split("/", 1)[1]) > 13
|
||||||
|
and k.split("/", 1)[1][12] == "_"
|
||||||
|
and all(ch in "0123456789abcdef" for ch in k.split("/", 1)[1][:12])
|
||||||
|
]
|
||||||
|
for k in ghost_svcs:
|
||||||
|
logger.info(f"Pruning ghost (hash-prefixed) service key from world state: {k}")
|
||||||
|
del self.world_state["services"][k]
|
||||||
|
|
||||||
# Remove resolved incidents older than 7 days.
|
# Remove resolved incidents older than 7 days.
|
||||||
now = time.time()
|
now = time.time()
|
||||||
stale_incidents = [
|
stale_incidents = [
|
||||||
|
|
|
||||||
|
|
@ -274,6 +274,34 @@ class NodeAgent:
|
||||||
# Docker container health
|
# Docker container health
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _canonical_container_name(self, c) -> str:
|
||||||
|
"""Return a stable, human-readable service name for a container.
|
||||||
|
|
||||||
|
Priority:
|
||||||
|
1. com.docker.compose.service label — always the clean compose-file key
|
||||||
|
(e.g. "mosquitto", "zigbee2mqtt"), immune to the hash-prefix corruption
|
||||||
|
Docker uses for stale project-state tracking entries.
|
||||||
|
2. c.name with hash prefix stripped — fallback for non-Compose containers.
|
||||||
|
When a container is removed outside of compose and then recreated, Docker
|
||||||
|
stores the old container record as "<12-char-hex-id>_<original-name>".
|
||||||
|
c.name returns that corrupted form; we strip the prefix here.
|
||||||
|
|
||||||
|
Using c.name directly is the source of ghost service keys like
|
||||||
|
"vps/9e36297651e7_control-plane-observer" that accumulate in services.json
|
||||||
|
every time containers are rebuilt.
|
||||||
|
"""
|
||||||
|
labels = c.attrs.get("Config", {}).get("Labels", {}) or {}
|
||||||
|
compose_svc = labels.get("com.docker.compose.service", "").strip()
|
||||||
|
if compose_svc:
|
||||||
|
return compose_svc
|
||||||
|
# Strip Docker-internal stale-state prefix: "<12-char hex>_<real-name>"
|
||||||
|
name = c.name
|
||||||
|
if (len(name) > 13
|
||||||
|
and name[12] == "_"
|
||||||
|
and all(ch in "0123456789abcdef" for ch in name[:12])):
|
||||||
|
return name[13:]
|
||||||
|
return name
|
||||||
|
|
||||||
def check_containers(self):
|
def check_containers(self):
|
||||||
if not self.docker_client:
|
if not self.docker_client:
|
||||||
return
|
return
|
||||||
|
|
@ -285,7 +313,7 @@ class NodeAgent:
|
||||||
|
|
||||||
for c in containers:
|
for c in containers:
|
||||||
try:
|
try:
|
||||||
name = c.name
|
name = self._canonical_container_name(c)
|
||||||
status = c.status
|
status = c.status
|
||||||
host_config = c.attrs.get("HostConfig", {})
|
host_config = c.attrs.get("HostConfig", {})
|
||||||
restart_policy = host_config.get("RestartPolicy", {}).get("Name", "")
|
restart_policy = host_config.get("RestartPolicy", {}).get("Name", "")
|
||||||
|
|
@ -293,6 +321,12 @@ class NodeAgent:
|
||||||
.get("Health", {})
|
.get("Health", {})
|
||||||
.get("Status", ""))
|
.get("Status", ""))
|
||||||
|
|
||||||
|
# Skip containers in "created" state — these are Docker Compose
|
||||||
|
# internal tracking artifacts (never started, often hash-prefixed)
|
||||||
|
# that appear when a container is rebuilt outside of compose.
|
||||||
|
if status == "created":
|
||||||
|
continue
|
||||||
|
|
||||||
# Only track containers with a restart policy (long-running services)
|
# Only track containers with a restart policy (long-running services)
|
||||||
is_managed = restart_policy in ("unless-stopped", "always", "on-failure")
|
is_managed = restart_policy in ("unless-stopped", "always", "on-failure")
|
||||||
if not is_managed:
|
if not is_managed:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue