Fix ghost service keys from hash-prefixed Docker container names
node-agent: use the com.docker.compose.service label as the canonical name
- Add _canonical_container_name() method: prefers the compose label, falls back to c.name with the hash prefix stripped
- Replace bare c.name usage in check_containers()
- Skip 'created'-state containers (Docker stale-state artifacts)

observer: prune hash-prefixed ghost keys in _prune_stale_world()
- Each reconcile cycle removes service keys matching <node>/<12hex>_<name>
- Acts as a safety net for entries already in services.json and for any future slippage

control-plane/docker-compose.yml already has an explicit container_name on all four services — no change needed there.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
28e9534765
commit
b40b832159
|
|
@ -157,6 +157,22 @@ class Observer:
|
|||
logger.info(f"Pruning stale service from world state: {k}")
|
||||
del self.world_state["services"][k]
|
||||
|
||||
# Prune ghost service keys whose service-name portion is a hash-prefixed
|
||||
# Docker stale-state artifact (e.g. "9e36297651e7_control-plane-observer").
|
||||
# These are created when node-agent incorrectly uses c.name instead of the
|
||||
# compose label, and accumulate on every container rebuild.
|
||||
# Pattern: <node>/<12hexchars>_<real-name>
|
||||
ghost_svcs = [
|
||||
k for k in list(self.world_state["services"].keys())
|
||||
if len(k.split("/", 1)) == 2
|
||||
and len(k.split("/", 1)[1]) > 13
|
||||
and k.split("/", 1)[1][12] == "_"
|
||||
and all(ch in "0123456789abcdef" for ch in k.split("/", 1)[1][:12])
|
||||
]
|
||||
for k in ghost_svcs:
|
||||
logger.info(f"Pruning ghost (hash-prefixed) service key from world state: {k}")
|
||||
del self.world_state["services"][k]
|
||||
|
||||
# Remove resolved incidents older than 7 days.
|
||||
now = time.time()
|
||||
stale_incidents = [
|
||||
|
|
|
|||
|
|
@ -274,6 +274,34 @@ class NodeAgent:
|
|||
# Docker container health
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _canonical_container_name(self, c) -> str:
    """Return a stable, human-readable service name for a container.

    Priority:
      1. The ``com.docker.compose.service`` label, i.e. the clean
         compose-file key (e.g. "mosquitto", "zigbee2mqtt"). It does not
         carry the hash prefix that can appear in the container name.
      2. ``c.name`` with a leading "<12 lowercase hex chars>_" prefix
         stripped; used for containers without the compose label.

    Using ``c.name`` directly is the source of ghost service keys such as
    "vps/9e36297651e7_control-plane-observer" that accumulate in
    services.json when containers are rebuilt.

    Args:
        c: Container-like object exposing ``attrs`` (a dict shaped like
            inspect output) and ``name`` (str). Presumably a Docker SDK
            ``Container``; confirm against the caller.

    Returns:
        The stripped compose service label when it is non-empty;
        otherwise the container name, minus the hash prefix if present.
    """
    # Coalesce each level with `or`: .get(key, default) only applies the
    # default when the key is absent, not when it is present with a null
    # value, and chaining .get() on None would raise AttributeError.
    config = c.attrs.get("Config") or {}
    labels = config.get("Labels") or {}
    compose_svc = (labels.get("com.docker.compose.service") or "").strip()
    if compose_svc:
        return compose_svc

    # Strip the stale-state prefix: "<12-char hex>_<real-name>".
    # The length must exceed 13 so that a non-empty real name remains.
    name = c.name
    if (len(name) > 13
            and name[12] == "_"
            and all(ch in "0123456789abcdef" for ch in name[:12])):
        return name[13:]
    return name
|
||||
|
||||
def check_containers(self):
|
||||
if not self.docker_client:
|
||||
return
|
||||
|
|
@ -285,7 +313,7 @@ class NodeAgent:
|
|||
|
||||
for c in containers:
|
||||
try:
|
||||
name = c.name
|
||||
name = self._canonical_container_name(c)
|
||||
status = c.status
|
||||
host_config = c.attrs.get("HostConfig", {})
|
||||
restart_policy = host_config.get("RestartPolicy", {}).get("Name", "")
|
||||
|
|
@ -293,6 +321,12 @@ class NodeAgent:
|
|||
.get("Health", {})
|
||||
.get("Status", ""))
|
||||
|
||||
# Skip containers in "created" state — these are Docker Compose
|
||||
# internal tracking artifacts (never started, often hash-prefixed)
|
||||
# that appear when a container is rebuilt outside of compose.
|
||||
if status == "created":
|
||||
continue
|
||||
|
||||
# Only track containers with a restart policy (long-running services)
|
||||
is_managed = restart_policy in ("unless-stopped", "always", "on-failure")
|
||||
if not is_managed:
|
||||
|
|
|
|||
Loading…
Reference in a new issue