diff --git a/docs/chelsty-stability-agent.md b/docs/chelsty-stability-agent.md
new file mode 100644
index 0000000..e6f63d3
--- /dev/null
+++ b/docs/chelsty-stability-agent.md
@@ -0,0 +1,42 @@
+### CHELSTY Stability Agent
+
+The stability-agent on CHELSTY provides local observability and health monitoring for the node's services and infrastructure.
+
+#### Purpose
+
+It acts as a filesystem-first watchdog that detects anomalies in the local runtime environment without taking autonomous destructive actions (like restarts). It serves as the primary data source for node-level stability metrics.
+
+#### Monitoring Scope
+
+* **Docker Containers**: Monitors all local containers. If a container is not in the `running` state, a `containers_not_running` event is generated.
+* **Disk Usage**: Monitors the root filesystem. Generates `disk_usage_high` events if usage exceeds the configured threshold.
+* **Connectivity**:
+ * Checks if the Tailscale socket or interface is available.
+ * Checks reachability of the local Mosquitto MQTT broker.
+* **Zigbee2MQTT**: Specifically tracks the presence and status of the Zigbee2MQTT service.
+
+#### Storage and Integration
+
+* **Heartbeat**: Updated every cycle at `/opt/homelab/state/stability-agent.heartbeat`.
+* **State Summary**: A JSON summary of all latest checks at `/opt/homelab/state/stability-agent.json`.
+* **Events**: Append-only JSON lines at `/opt/homelab/events/YYYY-MM-DD/chelsty/events.jsonl`.
+
+#### Deployment
+
+The service is deployed via Docker Compose on CHELSTY.
+
+```bash
+cd services/stability-agent
+docker compose up -d
+```
+
+#### Configuration
+
+Configuration is managed via environment variables in `docker-compose.override.yml` on the host.
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `STABILITY_CHECK_INTERVAL` | Seconds between checks | `60` |
+| `DISK_THRESHOLD_PCT` | Disk usage alert threshold | `90` |
+| `MQTT_HOST` | MQTT broker hostname | `mosquitto` |
+| `MQTT_PORT` | MQTT broker port | `1883` |
diff --git a/hosts/chelsty/runtime/stability-agent/docker-compose.override.yml b/hosts/chelsty/runtime/stability-agent/docker-compose.override.yml
new file mode 100644
index 0000000..c8ed0e0
--- /dev/null
+++ b/hosts/chelsty/runtime/stability-agent/docker-compose.override.yml
@@ -0,0 +1,7 @@
+services:
+ stability-agent:
+ environment:
+ - STABILITY_CHECK_INTERVAL=60
+ - DISK_THRESHOLD_PCT=85
+ - MQTT_HOST=mosquitto
+ - MQTT_PORT=1883
diff --git a/hosts/chelsty/services.yaml b/hosts/chelsty/services.yaml
index b7ee27e..9e7091a 100644
--- a/hosts/chelsty/services.yaml
+++ b/hosts/chelsty/services.yaml
@@ -106,3 +106,21 @@ services:
- /opt/homelab/data/mosquitto
notes:
- Retain ACL, password, persistence, and bridge configuration if enabled.
+
+ stability-agent:
+ role: node-stability-monitor
+ deployment_model: docker-compose
+ exposure: local-only
+ offline_required: true
+ depends_on:
+ local:
+ - mosquitto
+ external: []
+ runtime:
+ config_path: null
+ data_path: /opt/homelab/state
+ logs_path: /opt/homelab/events
+ backup:
+ recommended: false
+ notes:
+ - Events and state are transient or can be reconstructed; high-frequency writes.
diff --git a/services/agent-system/README.md b/services/agent-system/README.md
new file mode 100644
index 0000000..c811dd6
--- /dev/null
+++ b/services/agent-system/README.md
@@ -0,0 +1,37 @@
+### Agent System
+Central runtime materializer and Operator Control Plane UI.
+
+#### Components
+- **Redis**: Central state store (on PIHA).
+- **Runtime Materializer**: Converts Redis state to JSON files in `/opt/homelab/world`.
+- **Web UI**: Exposes API endpoints and serving the Operator UI.
+
+#### Deployment (on PIHA)
+```bash
+cd services/agent-system
+./deploy.sh
+```
+
+#### Deployment (on CHELSTY)
+```bash
+cd services/stability-agent
+docker compose up -d --build
+```
+
+#### Verification
+The `deploy.sh` script automatically verifies the local endpoints.
+You can also manually check:
+```bash
+# Check runtime summary
+curl http://localhost:18180/summary
+
+# Check discovered nodes
+curl http://localhost:18180/nodes
+
+# Check discovered services
+curl http://localhost:18180/services
+```
+
+#### Directory Structure
+- `/opt/homelab/world`: Contains materialized JSON state.
+- `/opt/homelab/state`: Contains operator configuration and local heartbeats.
diff --git a/services/agent-system/deploy.sh b/services/agent-system/deploy.sh
new file mode 100755
index 0000000..3d6c015
--- /dev/null
+++ b/services/agent-system/deploy.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -e
+
+echo ">>> Validating docker-compose configuration..."
+docker compose config
+
+echo ">>> Building and starting Agent System services..."
+docker compose up -d --build
+
+echo ">>> Services status:"
+docker ps --filter "name=agent-system"
+
+echo ">>> Verifying API endpoints..."
+sleep 5 # Give it a moment to start
+
+endpoints=("summary" "nodes" "services")
+for ep in "${endpoints[@]}"; do
+ echo "Checking /$ep..."
+ curl -s -f http://localhost:18180/$ep > /dev/null && echo " OK" || echo " FAILED"
+done
+
+echo ">>> Deployment complete."
diff --git a/services/agent-system/docker-compose.yml b/services/agent-system/docker-compose.yml
new file mode 100644
index 0000000..c2e13e4
--- /dev/null
+++ b/services/agent-system/docker-compose.yml
@@ -0,0 +1,33 @@
+services:
+ redis:
+ image: redis:7
+ container_name: agent-system-redis
+ ports:
+ - "6379:6379"
+ restart: unless-stopped
+
+ webui:
+ build: ./webui
+ container_name: agent-system-webui
+ ports:
+ - "18180:8080"
+ volumes:
+ - /opt/homelab:/opt/homelab
+ depends_on:
+ - redis
+ restart: unless-stopped
+
+ runtime-materializer:
+ build: ./runtime-materializer
+ container_name: agent-system-runtime-materializer
+ environment:
+ REDIS_HOST: redis
+ REDIS_PORT: "6379"
+ HOMELAB_WORLD_ROOT: /opt/homelab/world
+ WORLD_DIR: /opt/homelab/world
+ MATERIALIZE_INTERVAL: "10"
+ volumes:
+ - /opt/homelab:/opt/homelab
+ depends_on:
+ - redis
+ restart: unless-stopped
diff --git a/services/agent-system/runtime-materializer/Dockerfile b/services/agent-system/runtime-materializer/Dockerfile
new file mode 100644
index 0000000..5510bc3
--- /dev/null
+++ b/services/agent-system/runtime-materializer/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install redis python package as requested
+RUN pip install --no-cache-dir redis
+
+COPY materializer.py .
+
+# Ensure the world directory exists in the container (though it will likely be a volume)
+RUN mkdir -p /opt/homelab/world
+
+# Use unbuffered output to see logs in docker
+ENV PYTHONUNBUFFERED=1
+
+CMD ["python", "materializer.py"]
diff --git a/services/agent-system/runtime-materializer/materializer.py b/services/agent-system/runtime-materializer/materializer.py
new file mode 100644
index 0000000..cbf22ea
--- /dev/null
+++ b/services/agent-system/runtime-materializer/materializer.py
@@ -0,0 +1,181 @@
+import redis
+import json
+import os
+import time
+import argparse
+from datetime import datetime
+
+# Configuration from environment variables
+REDIS_HOST = os.environ.get("REDIS_HOST", "redis")
+REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379))
+WORLD_DIR = os.environ.get("WORLD_DIR", "/opt/homelab/world")
+
+def get_redis_client():
+ """Returns a Redis client with decoding enabled."""
+ return redis.Redis(
+ host=REDIS_HOST,
+ port=REDIS_PORT,
+ decode_responses=True,
+ socket_timeout=5
+ )
+
+def safe_json_loads(data, default=None):
+ """Safely loads JSON from a string."""
+ if not data:
+ return default
+ try:
+ if isinstance(data, (dict, list)):
+ return data
+ return json.loads(data)
+ except (json.JSONDecodeError, TypeError):
+ return data
+
+def normalize_health(health):
+ """Normalizes health values for the UI."""
+ if not health:
+ return "nominal"
+ h = str(health).lower()
+ if h in ["healthy", "ok", "running", "nominal"]:
+ return "nominal"
+ if h in ["degraded", "warning"]:
+ return "degraded"
+ return "error"
+
+def materialize():
+ """Reads state from Redis and writes JSON files to the world directory."""
+ print(f"[{datetime.now().isoformat()}] Materializing world state...")
+ try:
+ r = get_redis_client()
+
+ # 1. Nodes
+ nodes = []
+ node_keys = r.keys("homelab:nodes:*")
+ for key in node_keys:
+ node_data = r.hgetall(key)
+ if node_data:
+ # Normalize health
+ if "health" in node_data:
+ node_data["health"] = normalize_health(node_data["health"])
+ # Parse JSON fields if they exist
+ if "capabilities" in node_data:
+ node_data["capabilities"] = safe_json_loads(node_data["capabilities"], [])
+ if "checks" in node_data:
+ node_data["checks"] = safe_json_loads(node_data["checks"], {})
+ nodes.append(node_data)
+
+ # 2. Services
+ services = []
+ service_keys = r.keys("homelab:services:*")
+ for key in service_keys:
+ svc_data = r.hgetall(key)
+ if svc_data:
+ # Normalize health
+ if "health" in svc_data:
+ svc_data["health"] = normalize_health(svc_data["health"])
+ if "dependencies" in svc_data:
+ svc_data["dependencies"] = safe_json_loads(svc_data["dependencies"], [])
+ if "recommendations" in svc_data:
+ svc_data["recommendations"] = safe_json_loads(svc_data["recommendations"], [])
+ services.append(svc_data)
+
+ # 3. Events (Stream)
+ events = []
+ try:
+ # Get last 100 events from the stream
+ raw_events = r.xrevrange("homelab:events", count=100)
+ for event_id, data in raw_events:
+ event = data.copy()
+ event["id"] = event_id
+ if "details" in event:
+ event["details"] = safe_json_loads(event["details"], {})
+ events.append(event)
+ except redis.exceptions.ResponseError:
+ # homelab:events might not be a stream or doesn't exist
+ pass
+
+ # 4. Incidents (Hash)
+ incidents = []
+ incident_keys = r.keys("homelab:incidents:*")
+ for key in incident_keys:
+ incident_data = r.hgetall(key)
+ if incident_data:
+ # Normalize health if present
+ if "health" in incident_data:
+ incident_data["health"] = normalize_health(incident_data["health"])
+ incidents.append(incident_data)
+
+ # 5. Deployments (Hash)
+ deployments = []
+ deployment_keys = r.keys("homelab:deployments:*")
+ for key in deployment_keys:
+ dep_data = r.hgetall(key)
+ if dep_data:
+ deployments.append(dep_data)
+
+ # 6. Recommendations (Hash)
+ recommendations = []
+ recommendation_keys = r.keys("homelab:recommendations:*")
+ for key in recommendation_keys:
+ rec_data = r.hgetall(key)
+ if rec_data:
+ recommendations.append(rec_data)
+
+ # 7. Runtime Summary
+ unhealthy_services = [s for s in services if s.get("health") != "nominal"]
+ active_incidents = [i for i in incidents if i.get("status") not in ["resolved", "closed"]]
+
+ status = "nominal"
+ if len(active_incidents) > 0 or len(unhealthy_services) > 5:
+ status = "error"
+ elif len(unhealthy_services) > 0:
+ status = "degraded"
+
+ summary = {
+ "status": status,
+ "timestamp": datetime.utcnow().isoformat() + "Z",
+ "last_update": int(time.time()),
+ "node_count": len(nodes),
+ "service_count": len(services),
+ "active_incidents_count": len(active_incidents),
+ "unhealthy_services_count": len(unhealthy_services),
+ "incident_count": len(incidents),
+ "recent_events_count": len(events),
+ "stale": False
+ }
+
+ # Ensure directory exists
+ os.makedirs(WORLD_DIR, exist_ok=True)
+
+ def write_json(filename, data):
+ path = os.path.join(WORLD_DIR, filename)
+ with open(path, "w") as f:
+ json.dump(data, f, indent=2)
+
+ write_json("runtime-summary.json", summary)
+ write_json("nodes.json", nodes)
+ write_json("services.json", services)
+ write_json("incidents.json", incidents)
+ write_json("events.json", events)
+ write_json("deployments.json", deployments)
+ write_json("recommendations.json", recommendations)
+
+ print(f"[{datetime.now().isoformat()}] Successfully materialized to {WORLD_DIR}")
+
+ except redis.exceptions.ConnectionError as e:
+ print(f"Redis connection error: {e}")
+ except Exception as e:
+ print(f"Unexpected error during materialization: {e}")
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Homelab Runtime Materializer")
+ parser.add_argument("--once", action="store_true", help="Run once and exit")
+ parser.add_argument("--interval", type=int, default=30, help="Sleep interval between runs (seconds)")
+ args = parser.parse_args()
+
+ if args.once:
+ materialize()
+ else:
+ print(f"Starting materializer loop (interval: {args.interval}s)...")
+ while True:
+ materialize()
+ time.sleep(args.interval)
diff --git a/services/agent-system/webui/Dockerfile b/services/agent-system/webui/Dockerfile
new file mode 100644
index 0000000..771fcda
--- /dev/null
+++ b/services/agent-system/webui/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+COPY web.py index.html ./
+
+EXPOSE 8080
+CMD ["python", "web.py"]
diff --git a/services/agent-system/webui/index.html b/services/agent-system/webui/index.html
new file mode 100644
index 0000000..d20843a
--- /dev/null
+++ b/services/agent-system/webui/index.html
@@ -0,0 +1,701 @@
+
+
+
+
+
+ Operator Control Plane
+
+
+
+
+
+
+
+ RUNTIME STATE IS STALE
+
+
+
+
Dashboard
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/services/agent-system/webui/web.py b/services/agent-system/webui/web.py
new file mode 100644
index 0000000..49fd021
--- /dev/null
+++ b/services/agent-system/webui/web.py
@@ -0,0 +1,264 @@
+import json
+import os
+import time
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from pathlib import Path
+
+
+STATE_DIR = Path(os.getenv("HOMELAB_STATE_ROOT", "/opt/homelab/state"))
+EVENTS_DIR = Path(os.getenv("HOMELAB_EVENTS_ROOT", "/opt/homelab/events"))
+WORLD_DIR = Path(os.getenv("HOMELAB_WORLD_ROOT", "/opt/homelab/world"))
+ACTIONS_DIR = Path(os.getenv("HOMELAB_ACTIONS_ROOT", "/opt/homelab/actions"))
+CONFIG_DIR = Path(os.getenv("HOMELAB_CONFIG_ROOT", "/opt/homelab/config"))
+
+STATIC_DIR = Path(__file__).parent
+
+DEFAULT_CONFIG = {
+ "operator_mode": "approval",
+ "auto_mode": True,
+ "action_thresholds": {
+ "restart_ha": 0.8,
+ "check_network": 0.9,
+ },
+ "default_threshold": 0.9,
+ "allowed_auto_actions": ["restart_ha"],
+}
+
+
+def read_json_file(path, default=None):
+ if not path.exists():
+ return default if default is not None else []
+ try:
+ return json.loads(path.read_text())
+ except Exception:
+ return default if default is not None else []
+
+
+def get_config():
+ config_path = STATE_DIR / "operator-config.json"
+ if config_path.exists():
+ return read_json_file(config_path, DEFAULT_CONFIG)
+ return DEFAULT_CONFIG
+
+
+def save_config(config):
+ STATE_DIR.mkdir(parents=True, exist_ok=True)
+ (STATE_DIR / "operator-config.json").write_text(json.dumps(config, indent=2))
+
+
+def current_nodes():
+ return read_json_file(WORLD_DIR / "nodes.json")
+
+
+def current_services():
+ return read_json_file(WORLD_DIR / "services.json")
+
+
+def current_deployments():
+ return read_json_file(WORLD_DIR / "deployments.json")
+
+
+def current_incidents():
+ return read_json_file(WORLD_DIR / "incidents.json")
+
+
+def current_recommendations():
+ return read_json_file(WORLD_DIR / "recommendations.json")
+
+
+def current_summary():
+ summary = read_json_file(WORLD_DIR / "runtime-summary.json", default={})
+ if summary:
+ # Check for staleness
+ mtime = os.path.getmtime(WORLD_DIR / "runtime-summary.json")
+ summary["last_update"] = mtime
+ summary["stale"] = (time.time() - mtime) > 60 # Stale if older than 60s
+ return summary
+
+
+def current_events():
+ return read_json_file(WORLD_DIR / "events.json", default=[])
+
+
+def current_actions():
+ actions = {}
+ statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
+ for status in statuses:
+ actions[status] = []
+ status_dir = ACTIONS_DIR / status
+ if status_dir.exists():
+ for f in status_dir.glob("*.json"):
+ data = read_json_file(f)
+ if data:
+ # Injects some metadata for UI
+ data["id"] = data.get("action_id") or f.stem
+ data["status"] = status
+ actions[status].append(data)
+ return actions
+
+
+def mutate_action(action_id, target_status):
+ statuses = ["pending", "approved", "running", "completed", "failed", "rejected"]
+ if target_status not in statuses:
+ return False, f"Invalid target status: {target_status}"
+
+ # Find where the action is
+ source_path = None
+ current_status = None
+ for status in statuses:
+ p = ACTIONS_DIR / status / f"{action_id}.json"
+ if p.exists():
+ source_path = p
+ current_status = status
+ break
+
+ if not source_path:
+ return False, f"Action {action_id} not found"
+
+ target_dir = ACTIONS_DIR / target_status
+ target_dir.mkdir(parents=True, exist_ok=True)
+ target_path = target_dir / f"{action_id}.json"
+
+ try:
+ data = json.loads(source_path.read_text())
+ data["status"] = target_status
+ data["updated_at"] = time.time()
+
+ # Keep history of transitions
+ history = data.get("transition_history", [])
+ history.append({
+ "from": current_status,
+ "to": target_status,
+ "timestamp": time.time()
+ })
+ data["transition_history"] = history
+
+ target_path.write_text(json.dumps(data, indent=2))
+ if source_path != target_path:
+ source_path.unlink()
+ return True, "Success"
+ except Exception as e:
+ return False, str(e)
+
+
+def send_json(status, payload, handler):
+ body = (json.dumps(payload) + "\n").encode("utf-8")
+ handler.send_response(status)
+ handler.send_header("Content-Type", "application/json")
+ handler.send_header("Content-Length", str(len(body)))
+ handler.end_headers()
+ handler.wfile.write(body)
+
+
+class Handler(BaseHTTPRequestHandler):
+ def do_GET(self):
+ if self.path == "/config":
+ send_json(200, get_config(), self)
+ return
+
+ if self.path == "/nodes":
+ send_json(200, current_nodes(), self)
+ return
+
+ if self.path == "/services":
+ send_json(200, current_services(), self)
+ return
+
+ if self.path == "/deployments":
+ send_json(200, current_deployments(), self)
+ return
+
+ if self.path == "/incidents":
+ send_json(200, current_incidents(), self)
+ return
+
+ if self.path == "/recommendations":
+ send_json(200, current_recommendations(), self)
+ return
+
+ if self.path == "/summary":
+ send_json(200, current_summary(), self)
+ return
+
+ if self.path == "/events":
+ send_json(200, current_events(), self)
+ return
+
+ if self.path == "/actions":
+ send_json(200, current_actions(), self)
+ return
+
+ if self.path in ("/", "/index.html"):
+ body = (STATIC_DIR / "index.html").read_bytes()
+ self.send_response(200)
+ self.send_header("Content-Type", "text/html; charset=utf-8")
+ self.send_header("Content-Length", str(len(body)))
+ self.end_headers()
+ self.wfile.write(body)
+ return
+
+ self.send_error(404)
+
+ def do_POST(self):
+ if self.path not in (
+ "/config",
+ "/action/mutate",
+ "/mode",
+ ):
+ self.send_error(404)
+ return
+
+ length = int(self.headers.get("Content-Length", "0"))
+ raw_body = self.rfile.read(length).decode("utf-8")
+ try:
+ payload = json.loads(raw_body)
+ except json.JSONDecodeError:
+ self.send_error(400, "Invalid JSON")
+ return
+
+ if self.path == "/config":
+ config = get_config()
+ config.update(payload)
+ save_config(config)
+ send_json(200, {"status": "ok"}, self)
+ return
+
+ if self.path == "/mode":
+ mode = payload.get("mode")
+ if not mode:
+ self.send_error(400, "mode is required")
+ return
+ config = get_config()
+ config["operator_mode"] = mode
+ save_config(config)
+ send_json(200, {"status": "ok"}, self)
+ return
+
+ if self.path == "/action/mutate":
+ action_id = payload.get("id")
+ target = payload.get("status")
+ if not action_id or not target:
+ self.send_error(400, "id and status are required")
+ return
+ success, msg = mutate_action(action_id, target)
+ if success:
+ send_json(200, {"status": "ok"}, self)
+ else:
+ self.send_error(500, msg)
+ return
+
+ def log_message(self, format, *args):
+ return
+
+
+if __name__ == "__main__":
+ # Ensure directories exist
+ for d in [STATE_DIR, EVENTS_DIR, WORLD_DIR, ACTIONS_DIR, CONFIG_DIR]:
+ d.mkdir(parents=True, exist_ok=True)
+ for s in ["pending", "approved", "running", "completed", "failed", "rejected"]:
+ (ACTIONS_DIR / s).mkdir(parents=True, exist_ok=True)
+
+ port = int(os.getenv("PORT", "8080"))
+ print(f"Operator Control Plane starting on 0.0.0.0:{port}")
+ server = ThreadingHTTPServer(("0.0.0.0", port), Handler)
+ server.serve_forever()
diff --git a/services/stability-agent/Dockerfile b/services/stability-agent/Dockerfile
new file mode 100644
index 0000000..403964d
--- /dev/null
+++ b/services/stability-agent/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# No extra dependencies needed beyond standard library for the current script
+# But we might need them if we decide to use libraries later.
+
+COPY src/stability_agent.py .
+COPY healthcheck.sh .
+RUN chmod +x healthcheck.sh
+
+# Create the expected directories
+RUN mkdir -p /opt/homelab/state /opt/homelab/events
+
+# Run the agent
+CMD ["python", "stability_agent.py"]
diff --git a/services/stability-agent/README.md b/services/stability-agent/README.md
new file mode 100644
index 0000000..8fdbb6c
--- /dev/null
+++ b/services/stability-agent/README.md
@@ -0,0 +1,63 @@
+### Stability Agent
+
+A lightweight filesystem-first watchdog and observer agent for CHELSTY.
+
+#### Features
+
+* **Continuous Monitoring**: Runs as a background service.
+* **Docker Inspection**: Checks container status via read-only Docker socket.
+* **Disk Usage**: Monitors local disk utilization.
+* **Tailscale Check**: Verifies Tailscale availability.
+* **MQTT Reachability**: Checks connectivity to the local MQTT broker.
+* **Zigbee2MQTT Monitoring**: Specifically monitors the Zigbee2MQTT container.
+* **Redis Publishing**: (Optional) Publishes runtime state and events to a central Redis server.
+* **Event Logging**: Writes append-only JSON events to `/opt/homelab/events/YYYY-MM-DD/chelsty/`.
+* **State Reporting**: Writes heartbeat and status summary to `/opt/homelab/state/`.
+
+#### Configuration
+
+Environment variables:
+
+* `STABILITY_CHECK_INTERVAL`: Interval between checks in seconds (default: 60).
+* `DISK_THRESHOLD_PCT`: Disk usage percentage to trigger warning (default: 90).
+* `MQTT_HOST`: Hostname or IP of the MQTT broker to check.
+* `MQTT_PORT`: Port of the MQTT broker (default: 1883).
+* `REDIS_HOST`: Hostname or IP of the Redis server (e.g., PIHA at 100.108.208.3).
+* `REDIS_PORT`: Port of the Redis server (default: 6379).
+* `REDIS_ENABLED`: Whether to enable Redis publishing (default: true if REDIS_HOST is set).
+* `NODE_NAME`: Name of the current node (default: chelsty).
+
+#### Verification
+
+You can verify the Redis publishing using `redis-cli`:
+
+```bash
+# Check node state
+redis-cli -h 100.108.208.3 HGETALL homelab:nodes:chelsty
+
+# Check service discovery
+redis-cli -h 100.108.208.3 HGETALL homelab:services:chelsty:stability-agent
+
+# Check event stream
+redis-cli -h 100.108.208.3 XRANGE homelab:events - +
+```
+
+#### Safety
+
+* No automatic restarts are performed.
+* Read-only access to Docker socket.
+* No configuration mutation.
+* No secrets stored in the repository.
+
+#### Event Schema
+
+Events are written as JSON lines with the following fields:
+
+* `id`: Unique event UUID.
+* `timestamp`: ISO 8601 timestamp (UTC).
+* `node`: `chelsty`.
+* `source`: `stability-agent`.
+* `type`: Type of event (e.g., `disk_usage_high`, `containers_not_running`).
+* `severity`: `info`, `warning`, or `error`.
+* `message`: Human-readable description.
+* `details`: Object containing specific check results.
diff --git a/services/stability-agent/docker-compose.yml b/services/stability-agent/docker-compose.yml
new file mode 100644
index 0000000..4a09e98
--- /dev/null
+++ b/services/stability-agent/docker-compose.yml
@@ -0,0 +1,28 @@
+services:
+ stability-agent:
+ build: .
+ container_name: stability-agent
+ restart: unless-stopped
+ volumes:
+ - /opt/homelab:/opt/homelab
+ - /var/run/docker.sock:/var/run/docker.sock:ro
+ - /var/run/tailscale/tailscaled.sock:/var/run/tailscale/tailscaled.sock:ro
+ environment:
+ - STABILITY_CHECK_INTERVAL=${STABILITY_CHECK_INTERVAL:-60}
+ - DISK_THRESHOLD_PCT=${DISK_THRESHOLD_PCT:-90}
+ - MQTT_HOST=${MQTT_HOST}
+ - MQTT_PORT=${MQTT_PORT:-1883}
+ - REDIS_HOST=${REDIS_HOST:-100.108.208.3}
+ - REDIS_PORT=${REDIS_PORT:-6379}
+ - REDIS_ENABLED=${REDIS_ENABLED:-true}
+ - NODE_NAME=${NODE_NAME:-chelsty}
+ healthcheck:
+ test: ["CMD", "/bin/sh", "/app/healthcheck.sh"]
+ interval: 1m
+ timeout: 10s
+ retries: 3
+
+volumes:
+ opt_homelab:
+ external: true
+ name: homelab_data # This might vary, but /opt/homelab mount is preferred as direct path.
diff --git a/services/stability-agent/env.example b/services/stability-agent/env.example
new file mode 100644
index 0000000..787559b
--- /dev/null
+++ b/services/stability-agent/env.example
@@ -0,0 +1,8 @@
+STABILITY_CHECK_INTERVAL=60
+DISK_THRESHOLD_PCT=90
+MQTT_HOST=mosquitto
+MQTT_PORT=1883
+REDIS_HOST=100.108.208.3
+REDIS_PORT=6379
+REDIS_ENABLED=true
+NODE_NAME=chelsty
diff --git a/services/stability-agent/healthcheck.sh b/services/stability-agent/healthcheck.sh
new file mode 100644
index 0000000..d717c8f
--- /dev/null
+++ b/services/stability-agent/healthcheck.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+HEARTBEAT_FILE="/opt/homelab/state/stability-agent.heartbeat"
+MAX_AGE_SECONDS=300 # 5 minutes
+
+if [ ! -f "$HEARTBEAT_FILE" ]; then
+ echo "Heartbeat file missing"
+ exit 1
+fi
+
+# Get current time in seconds
+NOW=$(date +%s)
+
+# Get file modification time in seconds
+# Busybox stat (standard in alpine/slim) uses -c %Y
+FILE_TIME=$(stat -c %Y "$HEARTBEAT_FILE")
+
+AGE=$((NOW - FILE_TIME))
+
+if [ "$AGE" -gt "$MAX_AGE_SECONDS" ]; then
+ echo "Heartbeat is too old: ${AGE}s"
+ exit 1
+fi
+
+exit 0
diff --git a/services/stability-agent/service.yaml b/services/stability-agent/service.yaml
new file mode 100644
index 0000000..e75e6ca
--- /dev/null
+++ b/services/stability-agent/service.yaml
@@ -0,0 +1,24 @@
+service:
+ name: stability-agent
+ owner_node: chelsty
+ exposure: private
+ dependencies: []
+ healthcheck:
+ type: custom
+ interval: 60s
+ timeout: 10s
+ retries: 3
+ restart_policy: unless-stopped
+ persistence:
+ paths:
+ - /opt/homelab/state
+ - /opt/homelab/events
+ runtime:
+ directories:
+ - /opt/homelab/state
+ - /opt/homelab/events
+ env_vars:
+ - STABILITY_CHECK_INTERVAL
+ - DISK_THRESHOLD_PCT
+ - MQTT_HOST
+ - MQTT_PORT
diff --git a/services/stability-agent/src/stability_agent.py b/services/stability-agent/src/stability_agent.py
new file mode 100644
index 0000000..12baef8
--- /dev/null
+++ b/services/stability-agent/src/stability_agent.py
@@ -0,0 +1,333 @@
+import os
+import time
+import json
+import datetime
+import uuid
+import socket
+import shutil
+import http.client
+
+# Configuration from environment
+CHECK_INTERVAL = int(os.environ.get("STABILITY_CHECK_INTERVAL", "60"))
+DISK_THRESHOLD_PCT = float(os.environ.get("DISK_THRESHOLD_PCT", "90.0"))
+MQTT_HOST = os.environ.get("MQTT_HOST")
+MQTT_PORT = int(os.environ.get("MQTT_PORT", "1883"))
+NODE_NAME = os.environ.get("NODE_NAME", "chelsty")
+REDIS_HOST = os.environ.get("REDIS_HOST")
+REDIS_PORT = int(os.environ.get("REDIS_PORT", "6379"))
+REDIS_ENABLED = os.environ.get("REDIS_ENABLED", "true").lower() == "true" if REDIS_HOST else False
+SOURCE = "stability-agent"
+
+STATE_DIR = "/opt/homelab/state"
+EVENTS_BASE_DIR = "/opt/homelab/events"
+HEARTBEAT_FILE = os.path.join(STATE_DIR, "stability-agent.heartbeat")
+STATUS_FILE = os.path.join(STATE_DIR, "stability-agent.json")
+
+def get_timestamp():
+ return datetime.datetime.utcnow().isoformat() + "Z"
+
+def get_datestamp():
+ return datetime.datetime.utcnow().strftime("%Y-%m-%d")
+
+def emit_event(event_type, severity, message, service=None, details=None):
+ timestamp = get_timestamp()
+ event = {
+ "id": str(uuid.uuid4()),
+ "timestamp": timestamp,
+ "node": NODE_NAME,
+ "source": SOURCE,
+ "type": event_type,
+ "severity": severity,
+ "message": message,
+ "details": details or {}
+ }
+ if service:
+ event["service"] = service
+
+ date_str = get_datestamp()
+ event_dir = os.path.join(EVENTS_BASE_DIR, date_str, NODE_NAME)
+ try:
+ os.makedirs(event_dir, exist_ok=True)
+ event_file = os.path.join(event_dir, "events.jsonl")
+ with open(event_file, "a") as f:
+ f.write(json.dumps(event) + "\n")
+ except Exception as e:
+ print(f"Failed to write event to filesystem: {e}")
+
+ # Redis publishing
+ if REDIS_ENABLED and redis_client:
+ try:
+ redis_client.xadd("homelab:events", {
+ "node": NODE_NAME,
+ "type": event_type,
+ "severity": severity,
+ "timestamp": str(int(time.time())),
+ "message": message,
+ "details": json.dumps(details or {})
+ })
+ except Exception as e:
+ print(f"Failed to publish event to Redis: {e}")
+ # Do not crash, already logged to filesystem
+
+ print(f"[{severity}] {message}")
+
+def check_disk():
+ total, used, free = shutil.disk_usage("/")
+ percent = (used / total) * 100
+ details = {
+ "total_gb": total // (2**30),
+ "used_gb": used // (2**30),
+ "free_gb": free // (2**30),
+ "percent": round(percent, 2)
+ }
+
+ if percent > DISK_THRESHOLD_PCT:
+ emit_event("disk_usage_high", "warning", f"Disk usage is high: {details['percent']}%", details=details)
+
+ return details
+
+class DockerClient:
+ def __init__(self, socket_path="/var/run/docker.sock"):
+ self.socket_path = socket_path
+
+ def _request(self, path):
+ class UnixHTTPConnection(http.client.HTTPConnection):
+ def __init__(self, socket_path):
+ super().__init__("localhost")
+ self.socket_path = socket_path
+ def connect(self):
+ self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ self.sock.settimeout(5.0)
+ self.sock.connect(self.socket_path)
+
+ if not os.path.exists(self.socket_path):
+ return None
+
+ conn = UnixHTTPConnection(self.socket_path)
+ try:
+ conn.request("GET", path)
+ res = conn.getresponse()
+ if res.status == 200:
+ return json.loads(res.read().decode())
+ return None
+ except Exception as e:
+ print(f"Docker API error: {e}")
+ return None
+ finally:
+ conn.close()
+
+ def get_containers(self):
+ return self._request("/containers/json?all=1")
+
+def check_docker():
+ client = DockerClient()
+ containers = client.get_containers()
+ if containers is None:
+ emit_event("docker_socket_error", "error", "Could not connect to Docker socket or socket missing")
+ return {"status": "error", "error": "Could not connect to Docker socket"}
+
+ summary = []
+ unhealthy_containers = []
+ for c in containers:
+ state = c.get("State", "")
+ status = c.get("Status", "")
+ name = c.get("Names", ["unknown"])[0].lstrip("/")
+
+ container_info = {
+ "name": name,
+ "state": state,
+ "status": status
+ }
+ summary.append(container_info)
+
+ if state != "running":
+ unhealthy_containers.append(container_info)
+
+ if unhealthy_containers:
+ names = [c["name"] for c in unhealthy_containers]
+ # Only emit warning for containers that should be running?
+ # For now, we report any non-running container found by Docker.
+ emit_event("containers_not_running", "warning", f"Some containers are not running: {', '.join(names)}", details={"containers": unhealthy_containers})
+
+ return {"status": "ok", "containers": summary}
+
+def check_tailscale():
+ # Check for tailscale socket or interface
+ socket_path = "/var/run/tailscale/tailscaled.sock"
+ socket_available = os.path.exists(socket_path)
+ interface_available = os.path.exists("/sys/class/net/tailscale0")
+
+ return {
+ "available": socket_available or interface_available,
+ "details": {
+ "socket": socket_available,
+ "interface": interface_available
+ }
+ }
+
+def check_mqtt():
+ if not MQTT_HOST:
+ return {"configured": False}
+
+ try:
+ with socket.create_connection((MQTT_HOST, MQTT_PORT), timeout=5):
+ return {"configured": True, "reachable": True}
+ except Exception as e:
+ emit_event("mqtt_unreachable", "error", f"MQTT broker at {MQTT_HOST}:{MQTT_PORT} is unreachable", details={"error": str(e)})
+ return {"configured": True, "reachable": False, "error": str(e)}
+
+class RedisClient:
+ def __init__(self, host, port=6379):
+ self.host = host
+ self.port = port
+ self.sock = None
+
+ def _connect(self):
+ if self.sock:
+ try:
+ # Check if socket is still alive
+ self.sock.send(b"", socket.MSG_DONTWAIT)
+ return True
+ except (socket.error, AttributeError):
+ self.sock = None
+
+ try:
+ self.sock = socket.create_connection((self.host, self.port), timeout=2)
+ self.sock.settimeout(2.0)
+ return True
+ except Exception as e:
+ self.sock = None
+ print(f"Redis connection error: {e}")
+ return False
+
+ def _send_command(self, *args):
+ if not self._connect():
+ return False
+
+ # RESP array
+ cmd = f"*{len(args)}\r\n"
+ for arg in args:
+ s_arg = str(arg)
+ cmd += f"${len(s_arg.encode('utf-8'))}\r\n{s_arg}\r\n"
+
+ try:
+ self.sock.sendall(cmd.encode('utf-8'))
+ # Basic response reading
+ resp = self.sock.recv(4096)
+ if resp.startswith(b"-"):
+ print(f"Redis error response: {resp.decode().strip()}")
+ return False
+ return True
+ except Exception as e:
+ print(f"Redis send error: {e}")
+ if self.sock:
+ self.sock.close()
+ self.sock = None
+ return False
+
+ def hset(self, key, mapping):
+ args = ["HSET", key]
+ for k, v in mapping.items():
+ args.extend([k, v])
+ return self._send_command(*args)
+
+ def xadd(self, key, fields):
+ args = ["XADD", key, "*"]
+ for k, v in fields.items():
+ args.extend([k, v])
+ return self._send_command(*args)
+
+redis_client = RedisClient(REDIS_HOST, REDIS_PORT) if REDIS_ENABLED else None
+
+def main():
+ print(f"Starting stability-agent on {NODE_NAME}...")
+
+ # Ensure directories exist
+ os.makedirs(STATE_DIR, exist_ok=True)
+ os.makedirs(EVENTS_BASE_DIR, exist_ok=True)
+
+ while True:
+ try:
+ status = {
+ "timestamp": get_timestamp(),
+ "node": NODE_NAME,
+ "checks": {}
+ }
+
+ status["checks"]["disk"] = check_disk()
+ status["checks"]["docker"] = check_docker()
+ status["checks"]["tailscale"] = check_tailscale()
+ status["checks"]["mqtt"] = check_mqtt()
+
+ # Zigbee2MQTT container check
+ z2m_present = False
+ z2m_running = False
+ if status["checks"]["docker"]["status"] == "ok":
+ for c in status["checks"]["docker"]["containers"]:
+ if "zigbee2mqtt" in c["name"]:
+ z2m_present = True
+ if c["state"] == "running":
+ z2m_running = True
+
+ status["checks"]["zigbee2mqtt"] = {
+ "present": z2m_present,
+ "running": z2m_running
+ }
+
+ # Write heartbeat
+ with open(HEARTBEAT_FILE, "w") as f:
+ f.write(get_timestamp())
+
+ # Write status summary
+ with open(STATUS_FILE, "w") as f:
+ json.dump(status, f, indent=2)
+
+ # Redis publishing
+ if REDIS_ENABLED and redis_client:
+ try:
+ # Node state
+ node_health = "healthy"
+ for check in status["checks"].values():
+ if isinstance(check, dict) and check.get("status") == "error":
+ node_health = "unhealthy"
+
+ redis_client.hset(f"homelab:nodes:{NODE_NAME}", {
+ "id": NODE_NAME,
+ "hostname": NODE_NAME,
+ "health": node_health,
+ "status": "online",
+ "last_seen": status["timestamp"],
+ "capabilities": json.dumps(["docker", "tailscale", "mqtt", "disk"]),
+ "checks": json.dumps(status["checks"])
+ })
+
+ # Services discovered from Docker
+ if status["checks"]["docker"]["status"] == "ok":
+ for c in status["checks"]["docker"]["containers"]:
+ service_name = c["name"]
+ service_health = "healthy" if c["state"] == "running" else "unhealthy"
+
+ redis_client.hset(f"homelab:services:{NODE_NAME}:{service_name}", {
+ "name": service_name,
+ "node": NODE_NAME,
+ "health": service_health,
+ "desired_state": "running",
+ "actual_state": c["state"],
+ "deployment_state": "deployed",
+ "updated_at": status["timestamp"],
+ "dependencies": json.dumps([]),
+ "recommendations": json.dumps([])
+ })
+ except Exception as e:
+ print(f"Failed to publish to Redis: {e}")
+ # Local event for Redis error
+ emit_event("redis_publish_error", "warning", f"Failed to publish to Redis: {e}", details={"error": str(e)})
+
+ except Exception as e:
+ print(f"Error in main loop: {e}")
+ emit_event("agent_error", "error", f"Internal agent error: {e}", details={"error": str(e)})
+
+ time.sleep(CHECK_INTERVAL)
+
+if __name__ == "__main__":
+ main()