Fix pending actions: node_exporter, zigbee2mqtt, chelsty-ha monitoring

node_exporter (new service):
- Add services/node_exporter/docker-compose.yml matching solaria deployment
  (network_mode: host, pid: host, /:/host:ro,rslave mount)
- Add services/node_exporter/service.yaml

zigbee2mqtt chelsty-infra override:
- Fix network_mode: host (mosquitto runs on host network, port 1883 on localhost)
- Fix volume mount: relative ./configuration.yaml → absolute path
  /opt/homelab/config/zigbee2mqtt/configuration.yaml
  (secrets stay in runtime config dir, never in Git)
- Remove MQTT_USER/MQTT_PASSWORD (mosquitto uses allow_anonymous true)
- Extend healthcheck start_period to 60s (z2m takes time on first start)

chelsty-ha/services.yaml:
- Remove node-agent entry entirely (never deployed, no plans to bootstrap now)
- Keep homeassistant with monitor: false (no node-agent = no health events)

supervisor: respect monitor: false in services.yaml
- Skip action generation for services where monitor=false
- Cleans up chelsty-ha entries from action queue without removing desired-state docs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Oskar Kapala 2026-05-27 15:10:48 +02:00
parent fb7828b52b
commit 51002d4502
5 changed files with 55 additions and 16 deletions

View file

@ -2,14 +2,11 @@ host: chelsty-ha
site: chelsty
services:
node-agent:
role: node-stability-monitor
# LTE node: monitors homeassistant container health and emits events only.
# No disk cleanup — HA database size is managed by recorder purge_keep_days
# in HA configuration, not by Docker or filesystem operations.
deployment_model: docker-compose
exposure: local-only
offline_required: true
homeassistant:
role: home-automation-controller
offline_required: true
# monitor: false — chelsty-ha has no node-agent deployed, so there are no
# container-health events for the observer to track. HA is monitored
# indirectly via the chelsty-infra MQTT broker (if MQTT goes silent, HA
# is likely down). Re-enable once node-agent is bootstrapped on this VM.
monitor: false

View file

@ -1,13 +1,17 @@
services:
zigbee2mqtt:
# host network: mosquitto runs with network_mode: host on chelsty-infra,
# so zigbee2mqtt must also use host networking to reach localhost:1883.
network_mode: host
volumes:
- ./configuration.yaml:/app/data/configuration.yaml:ro
# configuration.yaml lives in the runtime config dir (not in Git).
# On chelsty-infra: /opt/homelab/config/zigbee2mqtt/configuration.yaml
- /opt/homelab/config/zigbee2mqtt/configuration.yaml:/app/data/configuration.yaml:ro
environment:
- MQTT_USER=${MQTT_USER}
- MQTT_PASSWORD=${MQTT_PASSWORD}
# Healthcheck is already defined in base service, but we ensure compatibility
- TZ=Europe/Warsaw
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080"]
interval: 10s
timeout: 5s
test: ["CMD-SHELL", "wget -qO- http://localhost:8080 > /dev/null 2>&1 || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s

View file

@ -99,6 +99,16 @@ class Supervisor:
data = yaml.safe_load(f)
host_name = data.get("host")
for svc_name, svc_info in data.get("services", {}).items():
svc_info = svc_info or {}
# monitor: false — service is documented as desired but
# intentionally excluded from supervisor action generation.
# Use this when a service is not yet bootstrapped on an
# offline/LTE node so the queue stays clean until it is.
if svc_info.get("monitor") is False:
logger.debug(
f"Skipping {host_name}/{svc_name}: monitor=false"
)
continue
svc_key = f"{host_name}/{svc_name}"
services[svc_key] = {
"node": host_name,

View file

@ -0,0 +1,11 @@
---
# Docker Compose definition for the Prometheus Node Exporter.
#
# The exporter reports metrics about the *host*, not the container, so it
# shares the host's network and PID namespaces and sees the host root
# filesystem through a read-only bind mount at /host.
services:
  node_exporter:
    # NOTE(review): `latest` is an unpinned tag — consider pinning a specific
    # release so redeployments are reproducible.
    image: quay.io/prometheus/node-exporter:latest
    container_name: node_exporter
    command:
      # Must match the container-side path of the root bind mount below.
      - '--path.rootfs=/host'
    network_mode: host
    pid: host
    restart: unless-stopped
    volumes:
      # Host root, read-only; `rslave` lets mounts made on the host after
      # container start propagate into /host.
      - '/:/host:ro,rslave'

View file

@ -0,0 +1,17 @@
---
# Service descriptor for the Prometheus Node Exporter.
name: node_exporter
owner_node: vps
role: metrics-exporter
description: >
  Prometheus Node Exporter — exposes host-level metrics (CPU, memory, disk,
  network) for scraping by a Prometheus instance.
exposure: local-only
dependencies: []
restart_policy: unless-stopped
healthcheck:
  type: http
  # 9100 is node_exporter's default listen port.
  # NOTE(review): reachable on localhost presumably because the container
  # uses host networking — confirm against the compose file.
  endpoint: http://localhost:9100/metrics
  interval: 30s
# No persistent paths and no environment variables are declared.
persistence:
  paths: []
runtime:
  env_vars: []