diff --git a/hosts/chelsty-ha/services.yaml b/hosts/chelsty-ha/services.yaml index 59a9cca..335161c 100644 --- a/hosts/chelsty-ha/services.yaml +++ b/hosts/chelsty-ha/services.yaml @@ -2,14 +2,11 @@ host: chelsty-ha site: chelsty services: - node-agent: - role: node-stability-monitor - # LTE node: monitors homeassistant container health and emits events only. - # No disk cleanup — HA database size is managed by recorder purge_keep_days - # in HA configuration, not by Docker or filesystem operations. - deployment_model: docker-compose - exposure: local-only - offline_required: true - homeassistant: role: home-automation-controller + offline_required: true + # monitor: false — chelsty-ha has no node-agent deployed, so there are no + # container-health events for the observer to track. HA is monitored + # indirectly via the chelsty-infra MQTT broker (if MQTT goes silent, HA + # is likely down). Remove this flag once node-agent is bootstrapped on this VM. + monitor: false diff --git a/hosts/chelsty-infra/runtime/zigbee2mqtt/docker-compose.override.yml b/hosts/chelsty-infra/runtime/zigbee2mqtt/docker-compose.override.yml index 7f4d641..4210d5d 100644 --- a/hosts/chelsty-infra/runtime/zigbee2mqtt/docker-compose.override.yml +++ b/hosts/chelsty-infra/runtime/zigbee2mqtt/docker-compose.override.yml @@ -1,13 +1,17 @@ services: zigbee2mqtt: + # host network: mosquitto runs with network_mode: host on chelsty-infra, + # so zigbee2mqtt must also use host networking to reach localhost:1883. + network_mode: host volumes: - - ./configuration.yaml:/app/data/configuration.yaml:ro + # configuration.yaml lives in the runtime config dir (not in Git). 
+ # On chelsty-infra: /opt/homelab/config/zigbee2mqtt/configuration.yaml + - /opt/homelab/config/zigbee2mqtt/configuration.yaml:/app/data/configuration.yaml:ro environment: - - MQTT_USER=${MQTT_USER} - - MQTT_PASSWORD=${MQTT_PASSWORD} - # Healthcheck is already defined in base service, but we ensure compatibility + - TZ=Europe/Warsaw healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8080"] - interval: 10s - timeout: 5s + test: ["CMD-SHELL", "wget -qO- http://localhost:8080 > /dev/null 2>&1 || exit 1"] + interval: 30s + timeout: 10s retries: 3 + start_period: 60s diff --git a/services/control-plane/src/supervisor.py b/services/control-plane/src/supervisor.py index a303743..c403dee 100644 --- a/services/control-plane/src/supervisor.py +++ b/services/control-plane/src/supervisor.py @@ -99,6 +99,16 @@ class Supervisor: data = yaml.safe_load(f) host_name = data.get("host") for svc_name, svc_info in data.get("services", {}).items(): + svc_info = svc_info or {} + # monitor: false (a real boolean) — service is documented + # as desired but excluded from supervisor action generation. + # Use this when a service is not yet bootstrapped on an + # offline/LTE node so the queue stays clean until it is. 
+ if svc_info.get("monitor") is False: + logger.debug( + f"Skipping {host_name}/{svc_name}: monitor=false" + ) + continue svc_key = f"{host_name}/{svc_name}" services[svc_key] = { "node": host_name, diff --git a/services/node_exporter/docker-compose.yml b/services/node_exporter/docker-compose.yml new file mode 100644 index 0000000..b19df6f --- /dev/null +++ b/services/node_exporter/docker-compose.yml @@ -0,0 +1,16 @@ +services: + node_exporter: + # Pinned release tag (not ':latest') so redeploys are reproducible; bump deliberately. + image: quay.io/prometheus/node-exporter:v1.8.2 + container_name: node_exporter + command: + - '--path.rootfs=/host' + # NOTE(review): with host networking the exporter's default listen address + # is :9100 on every interface, while service.yaml declares + # exposure: local-only — confirm a firewall or --web.listen-address + # restricts access. + network_mode: host + pid: host + restart: unless-stopped + volumes: + - '/:/host:ro,rslave' diff --git a/services/node_exporter/service.yaml b/services/node_exporter/service.yaml new file mode 100644 index 0000000..f2743a6 --- /dev/null +++ b/services/node_exporter/service.yaml @@ -0,0 +1,17 @@ +name: node_exporter +owner_node: vps +role: metrics-exporter +description: > + Prometheus Node Exporter — exposes host-level metrics (CPU, memory, disk, + network) for scraping by a Prometheus instance. +exposure: local-only +dependencies: [] +restart_policy: unless-stopped +healthcheck: + type: http + endpoint: http://localhost:9100/metrics + interval: 30s +persistence: + paths: [] +runtime: + env_vars: []