From 51002d45028400b74ff962e3f48617962d23393b Mon Sep 17 00:00:00 2001 From: Oskar Kapala Date: Wed, 27 May 2026 15:10:48 +0200 Subject: [PATCH] Fix pending actions: node_exporter, zigbee2mqtt, chelsty-ha monitoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit node_exporter (new service): - Add services/node_exporter/docker-compose.yml matching solaria deployment (network_mode: host, pid: host, /:/host:ro,rslave mount) - Add services/node_exporter/service.yaml zigbee2mqtt chelsty-infra override: - Fix network_mode: host (mosquitto runs on host network, port 1883 on localhost) - Fix volume mount: ./configuration.yaml → absolute /opt/homelab/config/zigbee2mqtt/ (secrets stay in runtime config dir, never in Git) - Remove MQTT_USER/MQTT_PASSWORD (mosquitto uses allow_anonymous true) - Healthcheck: probe with wget (CMD-SHELL) instead of curl, relax interval/timeout to 30s/10s, and add start_period 60s (z2m takes time on first start) - Set TZ=Europe/Warsaw in the container environment chelsty-ha/services.yaml: - Remove node-agent entry entirely (never deployed, no plans to bootstrap now) - Keep homeassistant with monitor: false (no node-agent = no health events) supervisor: respect monitor: false in services.yaml - Skip action generation for services where monitor=false - Cleans up chelsty-ha entries from action queue without removing desired-state docs Co-Authored-By: Claude Sonnet 4.6 --- hosts/chelsty-ha/services.yaml | 15 ++++++--------- .../zigbee2mqtt/docker-compose.override.yml | 18 +++++++++++------- services/control-plane/src/supervisor.py | 10 ++++++++++ services/node_exporter/docker-compose.yml | 11 +++++++++++ services/node_exporter/service.yaml | 17 +++++++++++++++++ 5 files changed, 55 insertions(+), 16 deletions(-) create mode 100644 services/node_exporter/docker-compose.yml create mode 100644 services/node_exporter/service.yaml diff --git a/hosts/chelsty-ha/services.yaml b/hosts/chelsty-ha/services.yaml index 59a9cca..335161c 100644 --- a/hosts/chelsty-ha/services.yaml +++ b/hosts/chelsty-ha/services.yaml @@ -2,14 +2,11 @@ host: chelsty-ha 
site: chelsty services: - node-agent: - role: node-stability-monitor - # LTE node: monitors homeassistant container health and emits events only. - # No disk cleanup — HA database size is managed by recorder purge_keep_days - # in HA configuration, not by Docker or filesystem operations. - deployment_model: docker-compose - exposure: local-only - offline_required: true - homeassistant: role: home-automation-controller + offline_required: true + # monitor: false — chelsty-ha has no node-agent deployed, so there are no + # container-health events for the observer to track. HA is monitored + # indirectly via the chelsty-infra MQTT broker (if MQTT goes silent, HA + # is likely down). Re-enable once node-agent is bootstrapped on this VM. + monitor: false diff --git a/hosts/chelsty-infra/runtime/zigbee2mqtt/docker-compose.override.yml b/hosts/chelsty-infra/runtime/zigbee2mqtt/docker-compose.override.yml index 7f4d641..4210d5d 100644 --- a/hosts/chelsty-infra/runtime/zigbee2mqtt/docker-compose.override.yml +++ b/hosts/chelsty-infra/runtime/zigbee2mqtt/docker-compose.override.yml @@ -1,13 +1,17 @@ services: zigbee2mqtt: + # host network: mosquitto runs with network_mode: host on chelsty-infra, + # so zigbee2mqtt must also use host networking to reach localhost:1883. + network_mode: host volumes: - - ./configuration.yaml:/app/data/configuration.yaml:ro + # configuration.yaml lives in the runtime config dir (not in Git). 
+ # On chelsty-infra: /opt/homelab/config/zigbee2mqtt/configuration.yaml + - /opt/homelab/config/zigbee2mqtt/configuration.yaml:/app/data/configuration.yaml:ro environment: - - MQTT_USER=${MQTT_USER} - - MQTT_PASSWORD=${MQTT_PASSWORD} - # Healthcheck is already defined in base service, but we ensure compatibility + - TZ=Europe/Warsaw healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8080"] - interval: 10s - timeout: 5s + test: ["CMD-SHELL", "wget -qO- http://localhost:8080 > /dev/null 2>&1 || exit 1"] + interval: 30s + timeout: 10s retries: 3 + start_period: 60s diff --git a/services/control-plane/src/supervisor.py b/services/control-plane/src/supervisor.py index a303743..c403dee 100644 --- a/services/control-plane/src/supervisor.py +++ b/services/control-plane/src/supervisor.py @@ -99,6 +99,16 @@ class Supervisor: data = yaml.safe_load(f) host_name = data.get("host") for svc_name, svc_info in data.get("services", {}).items(): + svc_info = svc_info or {} + # monitor: false — service is documented as desired but + # intentionally excluded from supervisor action generation. + # Use this when a service is not yet bootstrapped on an + # offline/LTE node so the queue stays clean until it is. 
+ if svc_info.get("monitor") is False: + logger.debug( + f"Skipping {host_name}/{svc_name}: monitor=false" + ) + continue svc_key = f"{host_name}/{svc_name}" services[svc_key] = { "node": host_name, diff --git a/services/node_exporter/docker-compose.yml b/services/node_exporter/docker-compose.yml new file mode 100644 index 0000000..b19df6f --- /dev/null +++ b/services/node_exporter/docker-compose.yml @@ -0,0 +1,11 @@ +services: + node_exporter: + image: quay.io/prometheus/node-exporter:latest + container_name: node_exporter + command: + - '--path.rootfs=/host' + network_mode: host + pid: host + restart: unless-stopped + volumes: + - '/:/host:ro,rslave' diff --git a/services/node_exporter/service.yaml b/services/node_exporter/service.yaml new file mode 100644 index 0000000..f2743a6 --- /dev/null +++ b/services/node_exporter/service.yaml @@ -0,0 +1,17 @@ +name: node_exporter +owner_node: vps +role: metrics-exporter +description: > + Prometheus Node Exporter — exposes host-level metrics (CPU, memory, disk, + network) for scraping by a Prometheus instance. +exposure: local-only +dependencies: [] +restart_policy: unless-stopped +healthcheck: + type: http + endpoint: http://localhost:9100/metrics + interval: 30s +persistence: + paths: [] +runtime: + env_vars: []