feat(planner-agent): main loop with LLM routing and HITL action proposals
services/planner-agent/src/planner.py: - PlannerAgent: async Redis pub/sub on health_events + world_updates - Pipeline: receive event → cooldown gate → LLMRouter → write pending action → emit remediation_started filesystem event - CooldownTracker: 5-min suppression per svc_key (configurable via env) - parse_event(): accepts node-agent shape A and world_updates shape B - PROPOSAL_SCHEMA: jsonschema enforced by LLMRouter before accepting response - SYSTEM_PROMPT: homelab topology + action rules (chelsty always requires_human, disk_pressure always notify, confidence<0.7 → requires_human) - write_pending_action(): atomic tmp→rename write, executor-compatible format - emit_event(): async wrapper around filesystem event write (no control-plane import) - _emit_event_sync() reads NODE_NAME at call time (not import) for testability - Benign events (service_healthy, node_online, ...) silently skipped - LLM chain failure: no cooldown recorded so next event can retry services/planner-agent/tests/test_planner.py (49 tests, 0 network): - TestCooldownTracker: 7 tests (ready/not-ready/elapsed/reset/independence) - TestHealthEvent, TestActionProposal, TestMapActionToExecutorType - TestParseEvent: both event shapes, missing fields, timestamp formats - TestBuildMessages: system prompt rules, payload inclusion - TestPlannerHandleEvent: benign skip, cooldown block, ignore/restart/redeploy/ notify proposals, remediation event emission, LLM failure isolation, requires_human propagation, cooldown recording, model name in proposal - TestPlannerDispatch: valid JSON, invalid JSON, non-string data, missing node - TestWritePendingAction, TestEmitEvent: filesystem integration with tmp_path services/planner-agent/service.yaml: owner_node: solaria, dependencies: [redis, ollama] services/planner-agent/docker-compose.yml: env + healthcheck services/planner-agent/Dockerfile: python:3.11-slim services/planner-agent/healthcheck.sh: heartbeat file age check (300s) 
services/planner-agent/requirements.txt: litellm, redis, jsonschema, structlog Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
1bbc511bb7
commit
ca37fca5ce
17
services/planner-agent/Dockerfile
Normal file
17
services/planner-agent/Dockerfile
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
FROM python:3.11-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
# Copy source
|
||||||
|
COPY src/ /app/src/
|
||||||
|
|
||||||
|
COPY healthcheck.sh /app/healthcheck.sh
|
||||||
|
RUN chmod +x /app/healthcheck.sh
|
||||||
|
|
||||||
|
ENV PYTHONUNBUFFERED=1
|
||||||
|
|
||||||
|
CMD ["python", "src/planner.py"]
|
||||||
21
services/planner-agent/docker-compose.yml
Normal file
21
services/planner-agent/docker-compose.yml
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
services:
|
||||||
|
planner-agent:
|
||||||
|
build: .
|
||||||
|
container_name: planner-agent
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- /opt/homelab:/opt/homelab
|
||||||
|
environment:
|
||||||
|
- REDIS_URL=${REDIS_URL:-redis://100.108.208.3:6379}
|
||||||
|
- OLLAMA_HOST=${OLLAMA_HOST:-http://100.108.208.3:11434}
|
||||||
|
- OLLAMA_MODEL=${OLLAMA_MODEL:-qwen2.5:7b}
|
||||||
|
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
|
||||||
|
- NODE_NAME=${NODE_NAME:-solaria}
|
||||||
|
- COOLDOWN_SECONDS=${COOLDOWN_SECONDS:-300}
|
||||||
|
- RUNTIME_PATH=${RUNTIME_PATH:-/opt/homelab}
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "/bin/sh", "/app/healthcheck.sh"]
|
||||||
|
interval: 60s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 30s
|
||||||
28
services/planner-agent/healthcheck.sh
Normal file
28
services/planner-agent/healthcheck.sh
Normal file
|
|
@ -0,0 +1,28 @@
|
||||||
|
#!/bin/sh
|
||||||
|
# Healthcheck: verify the planner-agent heartbeat is fresh.
|
||||||
|
# The planner touches /opt/homelab/state/planner-agent.heartbeat
|
||||||
|
# at the top of every poll cycle (≤5 s intervals).
|
||||||
|
# We fail if it is older than 300 s (5 min = one full cooldown window).
|
||||||
|
|
||||||
|
HEARTBEAT_FILE="${RUNTIME_PATH:-/opt/homelab}/state/planner-agent.heartbeat"
|
||||||
|
MAX_AGE_SECONDS=300
|
||||||
|
|
||||||
|
if [ ! -f "$HEARTBEAT_FILE" ]; then
|
||||||
|
echo "FAIL: heartbeat file missing: $HEARTBEAT_FILE"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
NOW=$(date +%s)
|
||||||
|
FILE_TIME=$(stat -c %Y "$HEARTBEAT_FILE" 2>/dev/null) || {
|
||||||
|
echo "FAIL: cannot stat heartbeat file"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
AGE=$((NOW - FILE_TIME))
|
||||||
|
|
||||||
|
if [ "$AGE" -gt "$MAX_AGE_SECONDS" ]; then
|
||||||
|
echo "FAIL: heartbeat stale (${AGE}s > ${MAX_AGE_SECONDS}s)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "OK: heartbeat age ${AGE}s"
|
||||||
|
exit 0
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
litellm>=1.40.0
|
litellm>=1.40.0
|
||||||
redis[asyncio]>=5.0.0
|
redis>=5.0.0
|
||||||
jsonschema>=4.21.0
|
jsonschema>=4.21.0
|
||||||
|
structlog>=24.1.0
|
||||||
|
|
|
||||||
45
services/planner-agent/service.yaml
Normal file
45
services/planner-agent/service.yaml
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
service:
|
||||||
|
name: planner-agent
|
||||||
|
owner_node: solaria
|
||||||
|
exposure: private
|
||||||
|
dependencies:
|
||||||
|
- redis
|
||||||
|
- ollama
|
||||||
|
|
||||||
|
ports: [] # no external port; communicates via Redis pub/sub
|
||||||
|
|
||||||
|
healthcheck:
|
||||||
|
type: file
|
||||||
|
path: /opt/homelab/state/planner-agent.heartbeat
|
||||||
|
max_age_seconds: 300 # 5 minutes — matches COOLDOWN_SECONDS
|
||||||
|
interval: 60s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
restart_policy: unless-stopped
|
||||||
|
|
||||||
|
persistence:
|
||||||
|
paths:
|
||||||
|
- /opt/homelab/actions
|
||||||
|
- /opt/homelab/events
|
||||||
|
- /opt/homelab/state
|
||||||
|
|
||||||
|
runtime:
|
||||||
|
directories:
|
||||||
|
- /opt/homelab/actions/pending
|
||||||
|
- /opt/homelab/actions/approved
|
||||||
|
- /opt/homelab/actions/running
|
||||||
|
- /opt/homelab/actions/completed
|
||||||
|
- /opt/homelab/actions/failed
|
||||||
|
- /opt/homelab/actions/rejected
|
||||||
|
- /opt/homelab/actions/cancelled
|
||||||
|
- /opt/homelab/events
|
||||||
|
- /opt/homelab/state
|
||||||
|
env_vars:
|
||||||
|
- REDIS_URL # redis://100.108.208.3:6379
|
||||||
|
- OLLAMA_HOST # http://100.108.208.3:11434
|
||||||
|
- OLLAMA_MODEL # qwen2.5:7b
|
||||||
|
- ANTHROPIC_API_KEY # for claude-haiku/sonnet fallback
|
||||||
|
- NODE_NAME # solaria
|
||||||
|
- COOLDOWN_SECONDS # default 300
|
||||||
|
- RUNTIME_PATH # default /opt/homelab
|
||||||
709
services/planner-agent/src/planner.py
Normal file
709
services/planner-agent/src/planner.py
Normal file
|
|
@ -0,0 +1,709 @@
|
||||||
|
"""
|
||||||
|
planner.py — planner-agent main loop.
|
||||||
|
|
||||||
|
Listens to Redis pub/sub channels:
|
||||||
|
- health_events: node-agent / stability-agent health notifications
|
||||||
|
- world_updates: observer world-state change notifications
|
||||||
|
|
||||||
|
For each event that clears the cooldown gate:
|
||||||
|
1. Ask LLMRouter to diagnose and produce a structured action proposal.
|
||||||
|
2. Write proposal to /opt/homelab/actions/pending/<id>.json.
|
||||||
|
3. Emit a remediation_started filesystem event.
|
||||||
|
|
||||||
|
Human-in-the-loop invariant
|
||||||
|
----------------------------
|
||||||
|
The planner ONLY writes to actions/pending/. Execution requires an
|
||||||
|
operator-approved action file in actions/approved/ — the planner
|
||||||
|
never touches that directory.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from dataclasses import asdict, dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
import redis.asyncio as aioredis
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
# Allow running from src/ directory without installation
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
from llm_router import LLMRouter, RouteResult # noqa: E402
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Structured logging — JSON to stdout
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
structlog.configure(
|
||||||
|
processors=[
|
||||||
|
structlog.stdlib.add_log_level, # adds "level" key
|
||||||
|
structlog.processors.TimeStamper(fmt="iso", utc=True),
|
||||||
|
structlog.processors.StackInfoRenderer(),
|
||||||
|
structlog.processors.format_exc_info,
|
||||||
|
structlog.processors.JSONRenderer(),
|
||||||
|
],
|
||||||
|
wrapper_class=structlog.make_filtering_bound_logger(20), # INFO+
|
||||||
|
logger_factory=structlog.PrintLoggerFactory(),
|
||||||
|
# add_logger_name is intentionally excluded: it requires a stdlib
|
||||||
|
# logger with a .name attribute; PrintLogger does not have one.
|
||||||
|
)
|
||||||
|
log = structlog.get_logger("planner")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Runtime paths
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
RUNTIME_PATH = Path(os.getenv("RUNTIME_PATH", "/opt/homelab"))
|
||||||
|
ACTIONS_DIR = RUNTIME_PATH / "actions"
|
||||||
|
EVENTS_DIR = RUNTIME_PATH / "events"
|
||||||
|
STATE_DIR = RUNTIME_PATH / "state"
|
||||||
|
HEARTBEAT = STATE_DIR / "planner-agent.heartbeat"
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Configuration (from env)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
REDIS_URL = os.getenv("REDIS_URL", "redis://100.108.208.3:6379")
|
||||||
|
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://100.108.208.3:11434")
|
||||||
|
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen2.5:7b")
|
||||||
|
NODE_NAME = os.getenv("NODE_NAME", "solaria")
|
||||||
|
COOLDOWN_SECONDS = int(os.getenv("COOLDOWN_SECONDS", "300")) # 5 min
|
||||||
|
SUBSCRIBE_CHANNELS = ["health_events", "world_updates"]
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# JSON Schema — validated by LLMRouter (jsonschema) before accepting response
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
PROPOSAL_SCHEMA: dict = {
|
||||||
|
"type": "object",
|
||||||
|
"required": ["action", "service", "node", "reason", "confidence", "requires_human"],
|
||||||
|
"additionalProperties": False,
|
||||||
|
"properties": {
|
||||||
|
"action": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["restart", "redeploy", "notify", "ignore"],
|
||||||
|
},
|
||||||
|
"service": {"type": "string"},
|
||||||
|
"node": {"type": "string"},
|
||||||
|
"reason": {"type": "string", "minLength": 10},
|
||||||
|
"confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
|
||||||
|
"requires_human": {"type": "boolean"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# LLM system prompt
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
SYSTEM_PROMPT = """You are the planner agent for a distributed homelab orchestration system.
|
||||||
|
Your job is to diagnose infrastructure health events and propose a remediation action.
|
||||||
|
|
||||||
|
Homelab topology:
|
||||||
|
vps — Hetzner VPS; public ingress, control plane
|
||||||
|
piha — Raspberry Pi 5; infra, monitoring, Redis, Ollama
|
||||||
|
solaria — GPU workstation; AI / compute workloads
|
||||||
|
chelsty-infra — LTE edge; Zigbee2MQTT, Mosquitto — offline-first
|
||||||
|
chelsty-ha — LTE edge; Home Assistant — offline-first
|
||||||
|
|
||||||
|
Action selection rules:
|
||||||
|
restart — container exists but is stopped/unhealthy; docker restart suffices (low risk)
|
||||||
|
redeploy — container is broken beyond a simple restart; full docker compose up (guarded)
|
||||||
|
notify — human decision required; do not attempt automated fix
|
||||||
|
ignore — transient / one-off glitch; monitoring will catch a repeat
|
||||||
|
|
||||||
|
Risk rules (enforce strictly):
|
||||||
|
- For any chelsty-* node: always set requires_human: true
|
||||||
|
- For disk_pressure events: always use "notify"
|
||||||
|
- If confidence < 0.7: set requires_human: true
|
||||||
|
- Unknown/novel failure patterns: prefer "notify" over guessing
|
||||||
|
|
||||||
|
Respond with ONLY a single JSON object, no markdown, no commentary:
|
||||||
|
{
|
||||||
|
"action": "restart|redeploy|notify|ignore",
|
||||||
|
"service": "<service name>",
|
||||||
|
"node": "<node name>",
|
||||||
|
"reason": "<concise explanation, minimum 10 characters>",
|
||||||
|
"confidence": <0.0–1.0>,
|
||||||
|
"requires_human": <true|false>
|
||||||
|
}"""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Data models
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@dataclass
class HealthEvent:
    """A health notification from a Redis channel, reduced to one common shape.

    ``payload`` holds whatever extra detail accompanied the event and
    ``raw_channel`` records the channel name the message was read from.
    """

    node: str
    service: str
    event_type: str  # e.g. "service_unhealthy", "disk_pressure_high"
    severity: str  # "info" | "warning" | "error" | "critical"
    payload: dict = field(default_factory=dict)
    timestamp: float = field(default_factory=time.time)
    raw_channel: str = ""

    @property
    def svc_key(self) -> str:
        """Return the ``<node>/<service>`` identifier of this event."""
        return "{}/{}".format(self.node, self.service)

    def __str__(self) -> str:
        """Return a one-line summary: ``[type] node/service (severity)``."""
        return "[{}] {} ({})".format(self.event_type, self.svc_key, self.severity)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ActionProposal:
    """Planner's structured output, written to actions/pending/<id>.json.

    ``action`` is the action name chosen by the LLM, while ``type`` is the
    executor-side type that action was mapped to.
    """

    action_id: str
    type: str  # executor type: "container_restart"|"redeploy"|"notify"|"ignore"
    action: str  # LLM's action: "restart"|"redeploy"|"notify"|"ignore"
    service: str
    node: str
    reason: str
    confidence: float
    requires_human: bool
    risk_level: str
    status: str = "pending"
    timestamp: float = field(default_factory=time.time)
    source_event: str = ""
    description: str = ""
    llm_model: str = ""
    llm_attempts: int = 0

    def to_action_file(self) -> dict:
        """Build the dict that gets serialised as the executor's action file.

        ``description`` falls back to ``reason`` when it is empty; the LLM's
        action name and reason are nested under the ``payload`` key.
        """
        summary = self.description if self.description else self.reason
        llm_decision = {"action": self.action, "reason": self.reason}

        # Insertion order is kept so the serialised JSON lists keys in a
        # stable sequence.
        document: dict = {}
        document["action_id"] = self.action_id
        document["type"] = self.type
        document["node"] = self.node
        document["service"] = self.service
        document["risk_level"] = self.risk_level
        document["confidence"] = self.confidence
        document["requires_human"] = self.requires_human
        document["description"] = summary
        document["status"] = self.status
        document["timestamp"] = self.timestamp
        document["source_event"] = self.source_event
        document["llm_model"] = self.llm_model
        document["llm_attempts"] = self.llm_attempts
        document["payload"] = llm_decision
        return document
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Cooldown tracker
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class CooldownTracker:
    """Per-key rate gate used to avoid repeating proposals for one service.

    Each ``svc_key`` remembers the wall-clock time of its most recent
    ``record()`` call. A key counts as ready again once at least
    ``cooldown_seconds`` have passed since then; a key that was never
    recorded is treated as last seen at time 0.0.
    """

    def __init__(self, cooldown_seconds: float = COOLDOWN_SECONDS) -> None:
        self._cooldown = cooldown_seconds
        self._last: dict[str, float] = {}

    def _elapsed(self, svc_key: str) -> float:
        """Seconds since the last record() for svc_key (since 0.0 if none)."""
        return time.time() - self._last.get(svc_key, 0.0)

    def is_ready(self, svc_key: str) -> bool:
        """Return True once the cooldown window for svc_key has passed."""
        return self._elapsed(svc_key) >= self._cooldown

    def record(self, svc_key: str) -> None:
        """Start a new cooldown window for svc_key at the current time."""
        self._last[svc_key] = time.time()

    def remaining_seconds(self, svc_key: str) -> float:
        """Return the seconds left in the window, never less than 0.0."""
        left = self._cooldown - self._elapsed(svc_key)
        return max(0.0, left)

    def reset(self, svc_key: str) -> None:
        """Forget svc_key so it is ready immediately (tests, manual override)."""
        if svc_key in self._last:
            del self._last[svc_key]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Event emission (filesystem — no control-plane import)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _emit_event_sync(
|
||||||
|
event_type: str,
|
||||||
|
severity: str,
|
||||||
|
service: str,
|
||||||
|
correlation_id: str,
|
||||||
|
payload: Optional[dict] = None,
|
||||||
|
node: Optional[str] = None,
|
||||||
|
) -> None:
|
||||||
|
# Read NODE_NAME at call time (not import time) so monkeypatching works in tests.
|
||||||
|
if node is None:
|
||||||
|
node = NODE_NAME
|
||||||
|
"""Write a normalized JSON event file to the filesystem event store.
|
||||||
|
|
||||||
|
Mirrors scripts/lib/events.py behaviour — keeping planner fully
|
||||||
|
independent of the control-plane package.
|
||||||
|
"""
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
timestamp = now.strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
date_dir = now.strftime("%Y-%m-%d")
|
||||||
|
svc_slug = (service or "planner").replace("/", "-").replace(" ", "-")
|
||||||
|
fname = f"evt-{node}-{int(time.time())}-{event_type}-{svc_slug}.json"
|
||||||
|
event_dir = EVENTS_DIR / date_dir / node
|
||||||
|
|
||||||
|
try:
|
||||||
|
event_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
(event_dir / fname).write_text(json.dumps({
|
||||||
|
"timestamp": timestamp,
|
||||||
|
"node": node,
|
||||||
|
"type": event_type,
|
||||||
|
"severity": severity,
|
||||||
|
"source": "planner-agent",
|
||||||
|
"service": service,
|
||||||
|
"correlation_id": correlation_id,
|
||||||
|
"payload": payload or {},
|
||||||
|
}, indent=2))
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("event_write_failed", path=str(event_dir / fname), error=str(exc))
|
||||||
|
|
||||||
|
|
||||||
|
async def emit_event(
    event_type: str,
    severity: str,
    service: str,
    correlation_id: str,
    payload: Optional[dict] = None,
) -> None:
    """Run _emit_event_sync in a worker thread and wait for it to finish.

    The five arguments are forwarded positionally and unchanged; no ``node``
    is passed, so the synchronous writer applies its own default.
    """
    forwarded = (event_type, severity, service, correlation_id, payload)
    await asyncio.to_thread(_emit_event_sync, *forwarded)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Action file I/O
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def write_pending_action(proposal: ActionProposal) -> Path:
    """Atomically write proposal JSON to actions/pending/<action_id>.json.

    The document is first written to a sibling ``.tmp`` file and then renamed
    over the final path, so readers never see a partial file. All filesystem
    work (including creating the pending directory) runs in a worker thread
    so the event loop is not blocked.

    Args:
        proposal: The proposal to persist; ``to_action_file()`` supplies the
            JSON document and ``action_id`` the file name.

    Returns:
        The path of the written ``.json`` file.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written or renamed. The temporary file is removed in that case.
    """
    pending_dir = ACTIONS_DIR / "pending"
    path = pending_dir / f"{proposal.action_id}.json"
    # Serialise up front: a proposal that cannot be encoded leaves no file.
    document = json.dumps(proposal.to_action_file(), indent=2)

    def _write() -> None:
        pending_dir.mkdir(parents=True, exist_ok=True)
        tmp = path.with_suffix(".tmp")
        try:
            tmp.write_text(document)
            tmp.replace(path)
        except BaseException:
            # Do not leave a stray .tmp behind when the write or rename fails.
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass
            raise

    await asyncio.to_thread(_write)
    return path
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# LLM prompt helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def build_messages(event: HealthEvent) -> list[dict]:
    """Build the two-message (system + user) chat prompt for one event.

    The user message lists the event's node, service, type, severity and its
    timestamp rendered as a UTC ISO-8601 string, followed by the payload as
    indented JSON when the payload is non-empty.
    """
    when = datetime.fromtimestamp(event.timestamp, tz=timezone.utc).isoformat()
    pieces = [
        "Health event received:\n",
        f" node: {event.node}\n",
        f" service: {event.service}\n",
        f" type: {event.event_type}\n",
        f" severity: {event.severity}\n",
        f" timestamp: {when}\n",
    ]
    if event.payload:
        rendered = json.dumps(event.payload, indent=4)
        pieces.append(f" payload:\n{rendered}\n")
    pieces.append("\nRespond with ONLY the JSON object as specified.")

    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user", "content": "".join(pieces)}
    return [system_message, user_message]
|
||||||
|
|
||||||
|
|
||||||
|
def map_action_to_executor_type(action: str) -> tuple[str, str]:
    """Translate an LLM action name into ``(executor type, risk_level)``.

    Any name outside the four known actions is treated like "notify".
    """
    fallback = ("notify", "low")
    known = dict(
        restart=("container_restart", "low"),
        redeploy=("redeploy", "guarded"),
        notify=fallback,
        ignore=("ignore", "none"),
    )
    try:
        return known[action]
    except KeyError:
        return fallback
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Event parsing
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def parse_event(raw: dict, channel: str) -> Optional[HealthEvent]:
    """Normalise a raw Redis pub/sub payload into a HealthEvent.

    Accepts two common shapes:

    Shape A — node-agent / stability-agent filesystem event format:
        {"type": "service_unhealthy", "node": "piha", "service": "mosquitto",
         "severity": "error", "payload": {...}}

    Shape B — control-plane world_updates format:
        {"event_type": "...", "node": "...", "service": "...", ...}

    Publishers are not trusted to send well-typed data: a field that should
    be text but is not a string is treated as missing instead of raising,
    and a timestamp that cannot be represented as a UTC datetime (NaN, inf,
    out of range — e.g. epoch milliseconds) is replaced by the current time.

    Args:
        raw: The decoded JSON object from the message.
        channel: Name of the channel the message arrived on; stored as
            ``raw_channel``.

    Returns:
        A HealthEvent, or ``None`` when the event type or node is missing.
    """

    def _text(value: Any) -> str:
        # Only real strings count; anything else is handled as "absent".
        return value.strip() if isinstance(value, str) else ""

    # First non-empty string among "type" (shape A) and "event_type" (shape B).
    event_type = ""
    for key in ("type", "event_type"):
        candidate = raw.get(key)
        if isinstance(candidate, str) and candidate:
            event_type = candidate
            break

    node = _text(raw.get("node"))
    if not event_type or not node:
        return None

    # For node-level events (e.g. node_offline) without a service field,
    # look inside details/payload, then fall back to the node name.
    service = _text(raw.get("service"))
    if not service:
        details = raw.get("details") or raw.get("payload") or {}
        if isinstance(details, dict):
            service = _text(details.get("service"))
    if not service:
        service = node

    severity = _text(raw.get("severity")) or "info"

    # Timestamp: ISO-8601 string or numeric epoch seconds.
    ts_raw = raw.get("timestamp")
    ts: Optional[float] = None
    try:
        if isinstance(ts_raw, str):
            ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp()
        elif ts_raw is not None:
            ts = float(ts_raw)
        if ts is not None:
            # Later stages format this value as a UTC datetime and convert it
            # to int; verify here that both operations are possible.
            datetime.fromtimestamp(ts, tz=timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        ts = None
    if ts is None:
        ts = time.time()

    payload = raw.get("payload") or raw.get("details") or {}
    if not isinstance(payload, dict):
        payload = {}

    return HealthEvent(
        node=node,
        service=service,
        event_type=event_type,
        severity=severity,
        payload=payload,
        timestamp=ts,
        raw_channel=channel,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Planner agent
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Event types that require no action (healthy signals, completions)
|
||||||
|
_BENIGN_EVENTS = frozenset({
|
||||||
|
"service_healthy",
|
||||||
|
"service_recovered",
|
||||||
|
"node_online",
|
||||||
|
"deployment_completed",
|
||||||
|
"deployment_started",
|
||||||
|
"remediation_started",
|
||||||
|
"remediation_completed",
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
class PlannerAgent:
|
||||||
|
"""Async agent: subscribe → receive → diagnose → propose action.
|
||||||
|
|
||||||
|
Designed for testability: all I/O (Redis, filesystem, LLM) is
|
||||||
|
injected or mockable. The ``router`` parameter accepts a pre-built
|
||||||
|
LLMRouter so tests can substitute it without network calls.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    redis_url: str = REDIS_URL,
    ollama_host: str = OLLAMA_HOST,
    ollama_model: str = OLLAMA_MODEL,
    router: Optional[LLMRouter] = None,
    cooldown: Optional[CooldownTracker] = None,
) -> None:
    """Store configuration and collaborators; no connection is opened here.

    Args:
        redis_url: Redis URL used by start() and by a default-built router.
        ollama_host: Passed to the default-built router.
        ollama_model: Passed to the default-built router.
        router: Pre-built LLMRouter; one is constructed when not supplied.
        cooldown: Pre-built CooldownTracker; a default one is constructed
            when not supplied.
    """
    self._running = False
    self._redis_url = redis_url
    # Both handles are filled in by start().
    self._redis: Optional[aioredis.Redis] = None
    self._pubsub: Optional[aioredis.client.PubSub] = None

    if router:
        self.router = router
    else:
        self.router = LLMRouter(
            redis_url=redis_url,
            ollama_host=ollama_host,
            ollama_model=ollama_model,
        )

    if cooldown:
        self.cooldown = cooldown
    else:
        self.cooldown = CooldownTracker()
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Lifecycle
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def start(self) -> None:
    """Create the Redis client and subscribe to SUBSCRIBE_CHANNELS.

    The client decodes responses to ``str`` and uses a 5 s connect timeout
    and a 10 s socket timeout. Subscribe confirmations are filtered out by
    the pub/sub object.
    """
    client = aioredis.from_url(
        self._redis_url,
        decode_responses=True,
        socket_connect_timeout=5,
        socket_timeout=10,
    )
    self._redis = client

    subscription = client.pubsub(ignore_subscribe_messages=True)
    self._pubsub = subscription
    await subscription.subscribe(*SUBSCRIBE_CHANNELS)

    log.info("planner_started", channels=SUBSCRIBE_CHANNELS, node=NODE_NAME)
|
||||||
|
|
||||||
|
async def stop(self) -> None:
    """Stop the loop and release the pub/sub, Redis and router resources.

    Shutdown is best-effort: each resource is closed independently, and a
    failure while closing one is logged as ``planner_stop_cleanup_failed``
    and does not prevent the others from being closed. Never raises for
    ordinary exceptions from the close calls.
    """
    self._running = False

    if self._pubsub:
        try:
            await self._pubsub.unsubscribe()
            await self._pubsub.aclose()
        except Exception as exc:
            log.warning("planner_stop_cleanup_failed", resource="pubsub", error=str(exc))

    if self._redis:
        try:
            await self._redis.aclose()
        except Exception as exc:
            log.warning("planner_stop_cleanup_failed", resource="redis", error=str(exc))

    try:
        await self.router.close()
    except Exception as exc:
        log.warning("planner_stop_cleanup_failed", resource="router", error=str(exc))

    log.info("planner_stopped")
|
||||||
|
|
||||||
|
async def run(self) -> None:
    """Main event loop. Runs until cancelled or until ``_running`` is cleared.

    Each cycle touches the heartbeat file, then waits up to five seconds for
    one pub/sub message and dispatches it. The wait is done by passing
    ``timeout`` to ``get_message`` itself: without it the call returns
    immediately when nothing is queued, which turns the loop into a rapid
    poll that rewrites the heartbeat many times per second.

    An exception raised while handling a single message is logged as
    ``event_dispatch_failed`` and the loop continues, so one bad event does
    not take the agent down. Errors from the Redis read itself remain fatal:
    they are logged as ``planner_fatal_error`` and re-raised. Cancellation
    is logged and absorbed. ``stop()`` always runs on the way out.
    """
    await self.start()
    self._running = True
    _ensure_dirs()

    try:
        while self._running:
            self._touch_heartbeat()
            try:
                msg = await self._pubsub.get_message(
                    ignore_subscribe_messages=True,
                    timeout=5.0,
                )
            except asyncio.TimeoutError:
                continue

            if msg is None:
                # get_message may return None before the timeout elapses
                # (e.g. for a filtered message); the short sleep keeps the
                # loop from spinning and yields to other tasks.
                await asyncio.sleep(0.05)
                continue

            try:
                await self._dispatch(msg)
            except Exception as exc:
                log.exception("event_dispatch_failed", error=str(exc))

    except asyncio.CancelledError:
        log.info("planner_cancelled")
    except Exception as exc:
        log.exception("planner_fatal_error", error=str(exc))
        raise
    finally:
        await self.stop()
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Message dispatch
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def _dispatch(self, msg: dict) -> None:
|
||||||
|
"""Deserialise one Redis pub/sub message and hand off to _handle_event."""
|
||||||
|
channel = msg.get("channel", "")
|
||||||
|
data = msg.get("data", "")
|
||||||
|
|
||||||
|
if not isinstance(data, str):
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
raw = json.loads(data)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
log.warning("malformed_message", channel=channel, preview=data[:120])
|
||||||
|
return
|
||||||
|
|
||||||
|
if not isinstance(raw, dict):
|
||||||
|
return
|
||||||
|
|
||||||
|
event = parse_event(raw, channel)
|
||||||
|
if event is None:
|
||||||
|
log.debug("unparseable_event", channel=channel, keys=list(raw.keys()))
|
||||||
|
return
|
||||||
|
|
||||||
|
log.info(
|
||||||
|
"event_received",
|
||||||
|
channel = channel,
|
||||||
|
svc_key = event.svc_key,
|
||||||
|
type = event.event_type,
|
||||||
|
severity = event.severity,
|
||||||
|
)
|
||||||
|
await self._handle_event(event)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Core pipeline
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def _handle_event(self, event: HealthEvent) -> None:
    """Run one event through: benign filter → cooldown → LLM → pending file.

    Nothing is written for benign event types, for keys still in cooldown,
    when no proposal could be produced, or when the proposal is "ignore".
    The cooldown is recorded as soon as a proposal exists (including
    "ignore"), but not when the LLM produced none, so a later event can
    retry. After a successful write a ``remediation_started`` event is
    emitted with the action id as correlation id.
    """
    # Healthy / completion signals need no remediation.
    if event.event_type in _BENIGN_EVENTS:
        log.debug("benign_event_skipped", type=event.event_type, svc_key=event.svc_key)
        return

    key = event.svc_key
    if not self.cooldown.is_ready(key):
        seconds_left = round(self.cooldown.remaining_seconds(key))
        log.info("cooldown_active", svc_key=key, remaining_seconds=seconds_left)
        return

    proposal = await self._propose_action(event)
    if proposal is None:
        # No proposal: leave the cooldown untouched so the next event retries.
        return

    self.cooldown.record(key)

    if proposal.action == "ignore":
        log.info(
            "proposal_ignored",
            svc_key=key,
            reason=proposal.reason,
            confidence=proposal.confidence,
            llm_model=proposal.llm_model,
        )
        return

    # Only actions/pending/ is written; approval happens elsewhere.
    try:
        written_to = await write_pending_action(proposal)
    except Exception as exc:
        log.error("action_write_failed", svc_key=key, error=str(exc))
        return

    log.info(
        "action_proposed",
        action_id=proposal.action_id,
        action=proposal.action,
        executor_type=proposal.type,
        svc_key=key,
        requires_human=proposal.requires_human,
        confidence=proposal.confidence,
        risk_level=proposal.risk_level,
        llm_model=proposal.llm_model,
        path=str(written_to),
    )

    announcement = {
        "action": proposal.action,
        "executor_type": proposal.type,
        "node": event.node,
        "action_id": proposal.action_id,
        "requires_human": proposal.requires_human,
        "confidence": proposal.confidence,
        "llm_model": proposal.llm_model,
    }
    await emit_event(
        event_type="remediation_started",
        severity="info",
        service=event.service,
        correlation_id=proposal.action_id,
        payload=announcement,
    )
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# LLM call
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def _propose_action(self, event: HealthEvent) -> Optional[ActionProposal]:
    """Invoke LLMRouter and map the validated response to an ActionProposal.

    Args:
        event: The health event to plan a remediation for.

    Returns:
        The proposal built from the router's response, or ``None`` when the
        router raised ``RuntimeError`` (logged as ``llm_chain_exhausted``).
    """
    messages = build_messages(event)
    # The id is derived from the event only: node, service (slashes
    # flattened to dashes) and the event timestamp truncated to seconds.
    action_id = (
        f"plan-{event.node}-{event.service.replace('/', '-')}"
        f"-{int(event.timestamp)}"
    )

    try:
        result: RouteResult = await self.router.route(
            messages=messages,
            schema=PROPOSAL_SCHEMA,
            context=f"planner.{event.svc_key}",
        )
    except RuntimeError as exc:
        log.error(
            "llm_chain_exhausted",
            svc_key=event.svc_key,
            error=str(exc)[:400],
        )
        return None

    raw = result.content  # already parsed + schema-validated by LLMRouter
    action = raw["action"]
    reason = raw["reason"]
    ex_type, risk = map_action_to_executor_type(action)

    # Resolve the target once so the structured fields and the
    # human-readable description always agree. A missing, null, or empty
    # value from the LLM falls back to the triggering event's value.
    service = raw.get("service") or event.service
    node = raw.get("node") or event.node

    return ActionProposal(
        action_id=action_id,
        type=ex_type,
        action=action,
        service=service,
        node=node,
        reason=reason,
        confidence=float(raw["confidence"]),
        requires_human=bool(raw["requires_human"]),
        risk_level=risk,
        timestamp=event.timestamp,
        source_event=event.event_type,
        description=f"{action.upper()} {service} on {node}: {reason}",
        llm_model=result.model_used,
        llm_attempts=len(result.attempts),
    )
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Utilities
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _touch_heartbeat(self) -> None:
    """Touch the heartbeat file, creating the state directory if needed.

    Best-effort: any failure is logged as ``heartbeat_failed`` at warning
    level and is not re-raised.
    """
    try:
        STATE_DIR.mkdir(exist_ok=True, parents=True)
        HEARTBEAT.touch()
    except Exception as err:
        log.warning("heartbeat_failed", error=str(err))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _ensure_dirs() -> None:
    """Create one directory per action state under ACTIONS_DIR, plus STATE_DIR."""
    action_states = (
        "pending",
        "approved",
        "running",
        "completed",
        "failed",
        "rejected",
        "cancelled",
    )
    for state in action_states:
        state_dir = ACTIONS_DIR / state
        state_dir.mkdir(exist_ok=True, parents=True)
    STATE_DIR.mkdir(exist_ok=True, parents=True)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Entry point
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def _main() -> None:
    """Create the agent, wire SIGINT/SIGTERM to a stop request, and run it.

    The signal handlers only log the signal and clear the agent's
    ``_running`` flag.
    """
    agent = PlannerAgent()

    def _request_stop(signame: str) -> None:
        log.info("shutdown_signal", signal=signame)
        agent._running = False

    running_loop = asyncio.get_running_loop()
    for signum in (signal.SIGINT, signal.SIGTERM):
        running_loop.add_signal_handler(signum, _request_stop, signum.name)

    await agent.run()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the async main coroutine to completion.
    asyncio.run(_main())
|
||||||
604
services/planner-agent/tests/test_planner.py
Normal file
604
services/planner-agent/tests/test_planner.py
Normal file
|
|
@ -0,0 +1,604 @@
|
||||||
|
"""
Unit tests for planner.py.

Redis and LLMRouter are mocked, so no network access is required.
Filesystem writes are either mocked or redirected to pytest's ``tmp_path``.

Run:
    pytest services/planner-agent/tests/test_planner.py -v
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch, call
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# Allow importing from src/ without installation
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||||
|
|
||||||
|
from planner import (
|
||||||
|
ActionProposal,
|
||||||
|
CooldownTracker,
|
||||||
|
HealthEvent,
|
||||||
|
PlannerAgent,
|
||||||
|
build_messages,
|
||||||
|
map_action_to_executor_type,
|
||||||
|
parse_event,
|
||||||
|
write_pending_action,
|
||||||
|
emit_event,
|
||||||
|
PROPOSAL_SCHEMA,
|
||||||
|
)
|
||||||
|
from llm_router import AttemptRecord, RouteResult
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _make_route_result(
    action: str = "restart",
    service: str = "mosquitto",
    node: str = "piha",
    reason: str = "Container is stopped",
    confidence: float = 0.9,
    requires_human: bool = False,
    model: str = "ollama/qwen2.5:7b",
) -> RouteResult:
    """Build a RouteResult whose content is a proposal dict from the arguments.

    The result carries a single "success" attempt for ``model`` and uses the
    JSON dump of the content as its raw text.
    """
    proposal = dict(
        action=action,
        service=service,
        node=node,
        reason=reason,
        confidence=confidence,
        requires_human=requires_human,
    )
    only_attempt = AttemptRecord(model, "success", None, 120)
    return RouteResult(
        content=proposal,
        raw_text=json.dumps(proposal),
        model_used=model,
        attempts=[only_attempt],
        latency_ms=120,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _health_event(
    node: str = "piha",
    service: str = "mosquitto",
    event_type: str = "service_unhealthy",
    severity: str = "error",
    payload: dict | None = None,
) -> HealthEvent:
    """Build a HealthEvent stamped with the current time.

    Args:
        node: Node name for the event.
        service: Service name for the event.
        event_type: Event type string.
        severity: Severity string.
        payload: Optional payload; ``None`` (or an empty dict) becomes ``{}``.

    Returns:
        A HealthEvent whose ``timestamp`` is ``time.time()`` at call time.
    """
    return HealthEvent(
        node=node,
        service=service,
        event_type=event_type,
        severity=severity,
        payload=payload or {},
        timestamp=time.time(),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_router(result: RouteResult) -> MagicMock:
    """Return a router double whose async ``route`` resolves to ``result``.

    ``close`` is an AsyncMock as well, so it can be awaited.
    """
    fake = MagicMock()
    fake.close = AsyncMock()
    fake.route = AsyncMock(return_value=result)
    return fake
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CooldownTracker
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestCooldownTracker:
    """CooldownTracker readiness, remaining time, reset, and key isolation."""

    KEY = "piha/mosquitto"

    def test_initially_ready(self):
        tracker = CooldownTracker(cooldown_seconds=60)
        assert tracker.is_ready(self.KEY) is True

    def test_not_ready_after_record(self):
        tracker = CooldownTracker(cooldown_seconds=300)
        tracker.record(self.KEY)
        assert tracker.is_ready(self.KEY) is False

    def test_ready_after_elapsed(self):
        tracker = CooldownTracker(cooldown_seconds=1)
        tracker.record(self.KEY)
        time.sleep(1.1)  # wait out the one-second cooldown
        assert tracker.is_ready(self.KEY) is True

    def test_remaining_seconds_decreases(self):
        tracker = CooldownTracker(cooldown_seconds=60)
        tracker.record(self.KEY)
        remaining = tracker.remaining_seconds(self.KEY)
        assert remaining > 0
        assert remaining <= 60

    def test_remaining_zero_when_never_recorded(self):
        tracker = CooldownTracker()
        assert tracker.remaining_seconds("ghost/svc") == 0.0

    def test_reset_clears_cooldown(self):
        tracker = CooldownTracker(cooldown_seconds=300)
        tracker.record(self.KEY)
        assert tracker.is_ready(self.KEY) is False
        tracker.reset(self.KEY)
        assert tracker.is_ready(self.KEY) is True

    def test_independent_keys(self):
        tracker = CooldownTracker(cooldown_seconds=300)
        tracker.record(self.KEY)
        assert tracker.is_ready(self.KEY) is False
        # A key that was never recorded is unaffected.
        assert tracker.is_ready("solaria/ollama") is True
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# HealthEvent
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestHealthEvent:
    """HealthEvent's derived service key and string rendering."""

    def test_svc_key(self):
        event = _health_event("piha", "mosquitto")
        assert event.svc_key == "piha/mosquitto"

    def test_str_repr(self):
        event = _health_event("vps", "observer", "service_unhealthy", "error")
        rendered = str(event)
        assert "service_unhealthy" in rendered
        assert "vps/observer" in rendered
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# ActionProposal.to_action_file
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestActionProposal:
    """Shape of the dict produced by ActionProposal.to_action_file()."""

    def _sample(self, **kwargs) -> ActionProposal:
        """Return a restart proposal; keyword arguments override the defaults."""
        fields = {
            "action_id": "plan-piha-mosquitto-123",
            "type": "container_restart",
            "action": "restart",
            "service": "mosquitto",
            "node": "piha",
            "reason": "Container stopped unexpectedly",
            "confidence": 0.9,
            "requires_human": False,
            "risk_level": "low",
        }
        return ActionProposal(**{**fields, **kwargs})

    def test_to_action_file_keys(self):
        action_file = self._sample().to_action_file()
        required = (
            "action_id", "type", "node", "service", "risk_level",
            "confidence", "requires_human", "status", "timestamp",
            "source_event", "llm_model", "llm_attempts", "payload",
        )
        for key in required:
            assert key in action_file, f"missing key: {key}"

    def test_status_pending(self):
        action_file = self._sample().to_action_file()
        assert action_file["status"] == "pending"

    def test_payload_contains_action_and_reason(self):
        payload = self._sample().to_action_file()["payload"]
        assert payload["action"] == "restart"
        assert "Container stopped" in payload["reason"]

    def test_description_fallback_to_reason(self):
        proposal = self._sample(description="")
        action_file = proposal.to_action_file()
        assert action_file["description"] == proposal.reason
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# map_action_to_executor_type
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestMapActionToExecutorType:
    """Action name -> (executor type, risk level) mapping."""

    @pytest.mark.parametrize(
        "action,expected_type,expected_risk",
        [
            ("restart", "container_restart", "low"),
            ("redeploy", "redeploy", "guarded"),
            ("notify", "notify", "low"),
            ("ignore", "ignore", "none"),
            ("unknown", "notify", "low"),  # safe fallback
        ],
    )
    def test_mapping(self, action, expected_type, expected_risk):
        executor_type, risk_level = map_action_to_executor_type(action)
        assert executor_type == expected_type
        assert risk_level == expected_risk
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# parse_event
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestParseEvent:
    """parse_event() on both message shapes, missing fields, and timestamps."""

    # Minimal valid node-agent style message shared by several tests.
    BASE = {"type": "service_unhealthy", "node": "piha", "service": "mosquitto"}

    def test_shape_a_node_agent(self):
        message = {**self.BASE, "severity": "error", "payload": {"status": "exited"}}
        event = parse_event(message, "health_events")
        assert event is not None
        assert event.node == "piha"
        assert event.service == "mosquitto"
        assert event.event_type == "service_unhealthy"
        assert event.severity == "error"
        assert event.payload == {"status": "exited"}

    def test_shape_b_world_updates(self):
        message = dict(
            event_type="node_offline",
            node="chelsty-infra",
            service="mosquitto",
            severity="critical",
        )
        event = parse_event(message, "world_updates")
        assert event is not None
        assert event.event_type == "node_offline"
        assert event.node == "chelsty-infra"

    def test_missing_node_returns_none(self):
        message = {"type": "service_unhealthy", "service": "mosquitto"}
        assert parse_event(message, "health_events") is None

    def test_missing_type_returns_none(self):
        message = {"node": "piha", "service": "mosquitto"}
        assert parse_event(message, "health_events") is None

    def test_service_falls_back_to_node(self):
        event = parse_event({"type": "node_offline", "node": "piha"}, "health_events")
        assert event is not None
        assert event.service == "piha"

    def test_timestamp_iso_parsed(self):
        message = {**self.BASE, "timestamp": "2026-05-27T12:00:00Z"}
        event = parse_event(message, "health_events")
        assert event is not None
        assert event.timestamp > 1_700_000_000  # sanity: recent epoch

    def test_timestamp_numeric_accepted(self):
        now = time.time()
        event = parse_event({**self.BASE, "timestamp": now}, "health_events")
        assert abs(event.timestamp - now) < 1

    def test_channel_stored(self):
        event = parse_event(dict(self.BASE), "world_updates")
        assert event.raw_channel == "world_updates"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# build_messages
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestBuildMessages:
    """Structure and content of the prompt built by build_messages()."""

    def test_returns_two_messages(self):
        messages = build_messages(_health_event())
        assert len(messages) == 2
        system_msg, user_msg = messages
        assert system_msg["role"] == "system"
        assert user_msg["role"] == "user"

    def test_user_message_contains_event_fields(self):
        event = _health_event(
            "vps", "observer", "service_unhealthy", "error",
            payload={"exit_code": 1},
        )
        user_text = build_messages(event)[1]["content"]
        assert "vps" in user_text
        assert "observer" in user_text
        assert "service_unhealthy" in user_text

    def test_payload_included_when_present(self):
        event = _health_event(payload={"disk_pct": 95})
        user_text = build_messages(event)[1]["content"]
        assert "disk_pct" in user_text

    def test_system_prompt_contains_homelab_rules(self):
        system_text = build_messages(_health_event())[0]["content"]
        assert "chelsty" in system_text
        assert "requires_human" in system_text
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# PlannerAgent._handle_event
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
class TestPlannerHandleEvent:
    """PlannerAgent._handle_event driven through a mocked LLM router."""

    def _agent(self, result: RouteResult) -> PlannerAgent:
        """Build an agent with no Redis URL whose router returns ``result``."""
        router = _mock_router(result)
        return PlannerAgent(redis_url=None, router=router)

    async def _handle_and_capture(
        self, agent: PlannerAgent, ev: HealthEvent, tmp_path: Path
    ) -> list[ActionProposal]:
        """Run ``_handle_event`` and return the proposals passed to the writer.

        ``write_pending_action`` is replaced by a recorder that returns a
        path under ``tmp_path`` without creating a file; ``emit_event`` is
        replaced by a no-op AsyncMock.
        """
        captured: list[ActionProposal] = []

        async def fake_write(p: ActionProposal) -> Path:
            captured.append(p)
            return tmp_path / f"{p.action_id}.json"

        with patch("planner.write_pending_action", new=fake_write), \
             patch("planner.emit_event", new=AsyncMock()):
            await agent._handle_event(ev)
        return captured

    async def test_benign_event_no_proposal(self):
        agent = self._agent(_make_route_result())
        ev = _health_event(event_type="service_healthy")
        with patch("planner.write_pending_action", new=AsyncMock()) as mock_write:
            await agent._handle_event(ev)
        mock_write.assert_not_called()

    async def test_cooldown_blocks_duplicate(self):
        agent = self._agent(_make_route_result())
        ev = _health_event()
        agent.cooldown.record(ev.svc_key)  # simulate recent proposal
        with patch("planner.write_pending_action", new=AsyncMock()) as mock_write:
            await agent._handle_event(ev)
        mock_write.assert_not_called()
        agent.router.route.assert_not_called()

    async def test_ignore_action_no_file_written(self):
        agent = self._agent(_make_route_result(action="ignore", reason="Transient glitch"))
        ev = _health_event()
        with patch("planner.write_pending_action", new=AsyncMock()) as mock_write:
            await agent._handle_event(ev)
        mock_write.assert_not_called()

    async def test_ignore_records_cooldown(self):
        agent = self._agent(_make_route_result(action="ignore", reason="Transient glitch"))
        ev = _health_event()
        with patch("planner.write_pending_action", new=AsyncMock()):
            await agent._handle_event(ev)
        assert not agent.cooldown.is_ready(ev.svc_key)

    async def test_restart_action_writes_pending_file(self, tmp_path):
        agent = self._agent(_make_route_result(action="restart"))
        captured = await self._handle_and_capture(agent, _health_event(), tmp_path)

        assert len(captured) == 1
        assert captured[0].action == "restart"
        assert captured[0].type == "container_restart"

    async def test_redeploy_action_risk_guarded(self, tmp_path):
        agent = self._agent(_make_route_result(action="redeploy"))
        captured = await self._handle_and_capture(agent, _health_event(), tmp_path)

        assert captured[0].risk_level == "guarded"
        assert captured[0].type == "redeploy"

    async def test_remediation_started_event_emitted(self, tmp_path):
        agent = self._agent(_make_route_result(action="restart"))
        ev = _health_event()

        emitted: list[tuple] = []

        async def fake_emit(event_type, severity, service, correlation_id, payload=None):
            emitted.append((event_type, service, correlation_id))

        with patch("planner.write_pending_action",
                   new=AsyncMock(return_value=tmp_path / "x.json")), \
             patch("planner.emit_event", new=fake_emit):
            await agent._handle_event(ev)

        assert len(emitted) == 1
        event_type, service, correlation_id = emitted[0]
        assert event_type == "remediation_started"
        assert service == ev.service
        # The correlation id is the proposal's action id, which the planner
        # derives from the triggering event's node, service, and timestamp.
        assert correlation_id == f"plan-{ev.node}-{ev.service}-{int(ev.timestamp)}"

    async def test_llm_failure_no_file_no_cooldown(self):
        router = MagicMock()
        router.route = AsyncMock(side_effect=RuntimeError("all models failed"))
        router.close = AsyncMock()
        agent = PlannerAgent(redis_url=None, router=router)
        ev = _health_event()

        with patch("planner.write_pending_action", new=AsyncMock()) as mock_write:
            await agent._handle_event(ev)

        mock_write.assert_not_called()
        # Cooldown NOT recorded — next event should be able to retry
        assert agent.cooldown.is_ready(ev.svc_key) is True

    async def test_requires_human_preserved_in_proposal(self, tmp_path):
        agent = self._agent(
            _make_route_result(action="restart", requires_human=True, confidence=0.6)
        )
        captured = await self._handle_and_capture(agent, _health_event(), tmp_path)

        assert captured[0].requires_human is True

    async def test_cooldown_recorded_after_success(self, tmp_path):
        agent = self._agent(_make_route_result(action="restart"))
        ev = _health_event()

        with patch("planner.write_pending_action",
                   new=AsyncMock(return_value=tmp_path / "x.json")), \
             patch("planner.emit_event", new=AsyncMock()):
            await agent._handle_event(ev)

        assert not agent.cooldown.is_ready(ev.svc_key)

    async def test_llm_model_recorded_in_proposal(self, tmp_path):
        agent = self._agent(
            _make_route_result(action="restart", model="claude-haiku-4-5-20251001")
        )
        captured = await self._handle_and_capture(agent, _health_event(), tmp_path)

        assert captured[0].llm_model == "claude-haiku-4-5-20251001"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# PlannerAgent._dispatch
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
class TestPlannerDispatch:
    """PlannerAgent._dispatch: which raw pub/sub messages reach _handle_event."""

    def _agent(self) -> PlannerAgent:
        """Build an agent with no Redis URL and a default mocked router."""
        return PlannerAgent(redis_url=None, router=_mock_router(_make_route_result()))

    async def _dispatch_with_spy(self, message: dict) -> AsyncMock:
        """Dispatch ``message`` with ``_handle_event`` replaced; return the spy."""
        agent = self._agent()
        with patch.object(agent, "_handle_event", new=AsyncMock()) as spy:
            await agent._dispatch(message)
        return spy

    async def test_valid_json_dispatched(self):
        body = {
            "type": "service_unhealthy",
            "node": "piha",
            "service": "mosquitto",
            "severity": "error",
        }
        spy = await self._dispatch_with_spy(
            {"channel": "health_events", "data": json.dumps(body)}
        )
        spy.assert_awaited_once()

    async def test_invalid_json_skipped(self):
        spy = await self._dispatch_with_spy(
            {"channel": "health_events", "data": "{not valid json"}
        )
        spy.assert_not_called()

    async def test_non_string_data_skipped(self):
        spy = await self._dispatch_with_spy({"channel": "health_events", "data": 42})
        spy.assert_not_called()

    async def test_missing_node_skipped(self):
        body = {"type": "service_unhealthy", "service": "mosquitto"}
        spy = await self._dispatch_with_spy(
            {"channel": "health_events", "data": json.dumps(body)}
        )
        spy.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# write_pending_action (integration-style with tmp_path)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
class TestWritePendingAction:
    """write_pending_action() against a real directory (ACTIONS_DIR -> tmp_path)."""

    async def test_file_created_with_correct_content(self, tmp_path):
        restart = ActionProposal(
            action_id="plan-piha-mosquitto-1000",
            type="container_restart",
            action="restart",
            service="mosquitto",
            node="piha",
            reason="Container stopped unexpectedly",
            confidence=0.95,
            requires_human=False,
            risk_level="low",
        )
        with patch("planner.ACTIONS_DIR", tmp_path):
            written = await write_pending_action(restart)

        assert written.exists()
        on_disk = json.loads(written.read_text())
        assert on_disk["action_id"] == "plan-piha-mosquitto-1000"
        assert on_disk["status"] == "pending"
        assert on_disk["type"] == "container_restart"
        assert on_disk["confidence"] == 0.95
        assert on_disk["requires_human"] is False

    async def test_file_is_valid_json(self, tmp_path):
        redeploy = ActionProposal(
            action_id="x",
            type="redeploy",
            action="redeploy",
            service="ollama",
            node="solaria",
            reason="Service is broken beyond a simple restart",
            confidence=0.8,
            requires_human=True,
            risk_level="guarded",
        )
        with patch("planner.ACTIONS_DIR", tmp_path):
            written = await write_pending_action(redeploy)
        # Should not raise
        json.loads(written.read_text())
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# emit_event (filesystem write)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
class TestEmitEvent:
    """emit_event() writing real files with EVENTS_DIR redirected to tmp_path."""

    async def test_event_file_created(self, tmp_path):
        with patch("planner.EVENTS_DIR", tmp_path), \
             patch("planner.NODE_NAME", "test-node"):
            await emit_event(
                event_type="remediation_started",
                severity="info",
                service="mosquitto",
                correlation_id="plan-abc-123",
                payload={"action": "restart"},
            )

        files = list(tmp_path.rglob("*.json"))
        assert len(files) == 1
        data = json.loads(files[0].read_text())
        assert data["type"] == "remediation_started"
        assert data["service"] == "mosquitto"
        assert data["correlation_id"] == "plan-abc-123"
        assert data["payload"]["action"] == "restart"

    async def test_event_dir_structure(self, tmp_path):
        """Events must be stored under YYYY-MM-DD/<node>/."""
        # patch() restores NODE_NAME on exit, same as the sibling test above,
        # so no manual save/restore of the module attribute is needed.
        with patch("planner.EVENTS_DIR", tmp_path), \
             patch("planner.NODE_NAME", "piha"):
            await emit_event("test_event", "info", "svc", "cid-1")

        files = list(tmp_path.rglob("*.json"))
        assert len(files) == 1
        # Path: <date>/<node>/<filename>
        parts = files[0].relative_to(tmp_path).parts
        assert len(parts) == 3  # date / node / file.json
        assert parts[1] == "piha"
|
||||||
Loading…
Reference in a new issue