Refactor Telegram bot to use control plane API

2026-05-17 23:42:52 +02:00 · 2026-05-17 23:42:52 +02:00 · 5754994f8e
parent c299a2cb85
commit 5754994f8e
5 changed files with 271 additions and 19 deletions
--- a/services/agent-system/README.md
+++ b/services/agent-system/README.md
@ -5,6 +5,24 @@ Central runtime materializer and Operator Control Plane UI.
 - **Redis**: Central state store (on PIHA).
 - **Runtime Materializer**: Converts Redis state to JSON files in `/opt/homelab/world`.
 - **Web UI**: Exposes API endpoints and serving the Operator UI.
 - **Telegram Bot**: Provides operator commands and action approvals via Telegram.
 #### Configuration
 Environment variables should be set in `.env` (see `env.example`).
 Key variables for the Telegram Bot:
 - `TELEGRAM_BOT_TOKEN`: Your bot token from @BotFather.
 - `TELEGRAM_ALLOWED_USER_IDS`: Comma-separated list of authorized Telegram User IDs.
 - `CONTROL_PLANE_URL`: URL to the `agent-system-webui` (default: `http://webui:8080`).
 #### Telegram Commands
 - `/status`: Check bot and API connectivity.
 - `/summary`: System health overview.
 - `/nodes`: List homelab nodes and their status.
 - `/services`: Summary of services across nodes.
 - `/unhealthy`: List all unhealthy components.
 - `/incidents`: View active incidents.
 - `/actions`: Summary of operator actions.
 - `/help`: List all commands.
 #### Deployment (on PIHA)
 ```bash
--- a/services/agent-system/deploy.sh
+++ b/services/agent-system/deploy.sh
@ -11,7 +11,9 @@ echo ">>> Services status:"
 docker ps --filter "name=agent-system" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
 if [ -z "$TELEGRAM_BOT_TOKEN" ]; then
-  echo "NOTE: TELEGRAM_BOT_TOKEN is not set. Telegram approval bot will be disabled."
+  echo ">>> Telegram bot status: DISABLED (token missing)"
 else
  echo ">>> Telegram bot status: ENABLED"
 fi
 echo ">>> Verifying API endpoints..."
--- a/services/agent-system/docker-compose.yml
+++ b/services/agent-system/docker-compose.yml
@ -38,7 +38,10 @@ services:
    environment:
      TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
      TELEGRAM_ALLOWED_USER_IDS: ${TELEGRAM_ALLOWED_USER_IDS}
      CONTROL_PLANE_URL: ${CONTROL_PLANE_URL:-http://webui:8080}
      ENABLE_LLM_FALLBACK: ${ENABLE_LLM_FALLBACK:-false}
      OPENCLAW_BASE_URL: ${OPENCLAW_BASE_URL}
      ACTIONS_ROOT: /opt/homelab/actions
    volumes:
      - /opt/homelab:/opt/homelab
-    restart: unless-stopped
+    restart: on-failure
--- a/services/agent-system/env.example
+++ b/services/agent-system/env.example
@ -3,6 +3,11 @@
 TELEGRAM_BOT_TOKEN=123456789:ABCdefGHIjklMNOpqrsTUVwxyz
 # Comma-separated list of Telegram User IDs
 TELEGRAM_ALLOWED_USER_IDS=12345678,87654321
 # Local control-plane API (default is internal compose address)
 CONTROL_PLANE_URL=http://webui:8080
 # Optional LLM fallback logic
 ENABLE_LLM_FALLBACK=false
 OPENCLAW_BASE_URL=http://openclaw.internal
 # Runtime Materializer Configuration
 REDIS_HOST=100.108.208.3
--- a/services/agent-system/telegram-bot/bot.py
+++ b/services/agent-system/telegram-bot/bot.py
@ -3,9 +3,11 @@ import json
 import time
 import asyncio
 import logging
 import urllib.request
 import urllib.error
 from pathlib import Path
 from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
-from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, CallbackQueryHandler
+from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, CallbackQueryHandler, MessageHandler, filters
 # Setup logging
 logging.basicConfig(
@ -18,6 +20,39 @@ logger = logging.getLogger(__name__)
 TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")
 ALLOWED_IDS = [int(i.strip()) for i in os.getenv("TELEGRAM_ALLOWED_USER_IDS", "").split(",") if i.strip()]
 ACTIONS_ROOT = Path(os.getenv("ACTIONS_ROOT", "/opt/homelab/actions"))
 CONTROL_PLANE_URL = os.getenv("CONTROL_PLANE_URL", "http://webui:8080")
 ENABLE_LLM_FALLBACK = os.getenv("ENABLE_LLM_FALLBACK", "false").lower() == "true"
 OPENCLAW_BASE_URL = os.getenv("OPENCLAW_BASE_URL")
 async def fetch_api(path):
    """Helper to fetch JSON from the Control Plane API."""
    url = f"{CONTROL_PLANE_URL.rstrip('/')}/{path.lstrip('/')}"
    try:
        def do_request():
            req = urllib.request.Request(url)
            with urllib.request.urlopen(req, timeout=5) as response:
                if response.status != 200:
                    return None
                return json.loads(response.read().decode())
        return await asyncio.to_thread(do_request)
    except Exception as e:
        logger.error(f"Error fetching {url}: {e}")
        return None
 async def post_api(path, data):
    """Helper to POST JSON to the Control Plane API."""
    url = f"{CONTROL_PLANE_URL.rstrip('/')}/{path.lstrip('/')}"
    try:
        body = json.dumps(data).encode("utf-8")
        def do_request():
            req = urllib.request.Request(url, data=body, method="POST")
            req.add_header("Content-Type", "application/json")
            with urllib.request.urlopen(req, timeout=5) as response:
                return response.status == 200
        return await asyncio.to_thread(do_request)
    except Exception as e:
        logger.error(f"Error posting to {url}: {e}")
        return False
 class ApprovalBot:
    def __init__(self):
@ -59,7 +94,7 @@ class ApprovalBot:
            f"Node: `{data.get('node', 'unknown')}`\n"
            f"Risk: *{data.get('risk', 'unknown')}*\n"
        )
-        
+
        if "details" in data:
            details_str = json.dumps(data['details'], indent=2)
            if len(details_str) > 1000:
@ -96,16 +131,22 @@ class ApprovalBot:
            return
        await query.answer()
-        
+
        cb_data = query.data
        if ":" not in cb_data:
            return
-            
+
        action, action_id = cb_data.split(":", 1)
        target_status = "approved" if action == "approve" else "rejected"
-        success, msg = self.move_action(action_id, target_status, user_id, query.from_user.username or str(user_id))
+        # Use API for mutation if available, fallback to local disk move
-        
+        success = await post_api("/action/mutate", {"id": action_id, "status": target_status})
        msg = "Success" if success else "API call failed"
        if not success:
            # Fallback to direct disk manipulation (original behavior)
            success, msg = self.move_action(action_id, target_status, user_id, query.from_user.username or str(user_id))
        if success:
            status_text = "✅ Approved" if target_status == "approved" else "❌ Rejected"
            await query.edit_message_text(
@ -131,11 +172,11 @@ class ApprovalBot:
        try:
            data = json.loads(source_path.read_text())
            current_status = data.get("status", "pending")
-            
+
            # Update data
            data["status"] = target_status
            data["updated_at"] = time.time()
-            
+
            history = data.get("transition_history", [])
            history.append({
                "from": current_status,
@ -158,16 +199,189 @@ async def start_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Simple start command to help users find their ID."""
    user = update.effective_user
    message = (
-        f"Hello {user.first_name}!\n"
+        f"Hello {user.first_name}! 🤖\n"
        f"Your Telegram User ID is: `{user.id}`\n\n"
    )
    if user.id in ALLOWED_IDS:
-        message += "✅ You are authorized to approve actions."
+        message += "✅ You are authorized to manage the homelab.\n\n"
        message += "Use /help to see available commands."
    else:
-        message += "❌ You are NOT authorized. Add your ID to TELEGRAM_ALLOWED_USER_IDS."
+        message += "❌ You are NOT authorized. Add your ID to `TELEGRAM_ALLOWED_USER_IDS`."
-    
+
    await update.message.reply_text(message, parse_mode="Markdown")
 async def status_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    if update.effective_user.id not in ALLOWED_IDS: return
    res = await fetch_api("/summary")
    status = "✅ Online" if res else "❌ Unreachable"
    message = (
        f"🤖 *Telegram Bot Status*\n"
        f"Control Plane API: {status}\n"
        f"Target URL: `{CONTROL_PLANE_URL}`\n"
    )
    await update.message.reply_text(message, parse_mode="Markdown")
 async def summary_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    if update.effective_user.id not in ALLOWED_IDS: return
    data = await fetch_api("/summary")
    if not data:
        await update.message.reply_text("❌ Failed to fetch summary from Control Plane.")
        return
    msg = "📊 *System Summary*\n"
    msg += f"Status: `{data.get('status', 'unknown')}`\n"
    msg += f"Nodes: {data.get('node_count', 0)}\n"
    msg += f"Services: {data.get('service_count', 0)}\n"
    msg += f"Active Incidents: {data.get('active_incidents_count', 0)}\n"
    if data.get('stale'):
        msg += "\n⚠️ *Warning: Data is stale!*"
    await update.message.reply_text(msg, parse_mode="Markdown")
 async def nodes_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    if update.effective_user.id not in ALLOWED_IDS: return
    nodes = await fetch_api("/nodes")
    if nodes is None:
        await update.message.reply_text("❌ Failed to fetch nodes.")
        return
    if not nodes:
        await update.message.reply_text("No nodes discovered in the fleet.")
        return
    msg = "🖥️ *Nodes Status*\n"
    for node in nodes:
        health_icon = "✅" if node.get('health') == 'nominal' else "⚠️" if node.get('health') == 'degraded' else "❌"
        msg += f"{health_icon} *{node.get('hostname')}*: `{node.get('status', 'unknown')}`\n"
        msg += f"   Last seen: {node.get('last_seen', 'N/A')}\n"
    await update.message.reply_text(msg, parse_mode="Markdown")
 async def services_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    if update.effective_user.id not in ALLOWED_IDS: return
    services = await fetch_api("/services")
    if services is None:
        await update.message.reply_text("❌ Failed to fetch services.")
        return
    # Summarize by node
    nodes = {}
    for s in services:
        node = s.get("node", "unknown")
        if node not in nodes: nodes[node] = []
        nodes[node].append(s)
    msg = "⚙️ *Services Summary*\n"
    if not nodes:
        msg += "No services discovered."
    else:
        for node, svc_list in sorted(nodes.items()):
            nominal = len([s for s in svc_list if s.get("health") == "nominal"])
            msg += f"• *{node}*: {nominal}/{len(svc_list)} nominal\n"
    msg += "\nUse /unhealthy to see issues."
    await update.message.reply_text(msg, parse_mode="Markdown")
 async def unhealthy_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    if update.effective_user.id not in ALLOWED_IDS: return
    services = await fetch_api("/services")
    nodes = await fetch_api("/nodes")
    msg = "⚠️ *Unhealthy Components*\n"
    found = False
    if services:
        for s in services:
            health = s.get("health", "").lower()
            if health != "nominal":
                msg += f"• Service *{s.get('name')}* on *{s.get('node')}*: `{health}`\n"
                found = True
    if nodes:
        for n in nodes:
            checks = n.get("checks", {})
            if isinstance(checks, str):
                try: checks = json.loads(checks)
                except: checks = {}
            docker = checks.get("docker", {})
            if docker.get("status") == "ok":
                for c in docker.get("containers", []):
                    if c.get("state") != "running":
                        msg += f"• Container *{c.get('name')}* on *{n.get('hostname')}*: `{c.get('state')}`\n"
                        found = True
    if not found:
        msg += "All systems nominal. ✅"
    await update.message.reply_text(msg, parse_mode="Markdown")
 async def incidents_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    if update.effective_user.id not in ALLOWED_IDS: return
    incidents = await fetch_api("/incidents")
    if incidents is None:
        await update.message.reply_text("❌ Failed to fetch incidents.")
        return
    active = [i for i in incidents if i.get("status") not in ("resolved", "closed")]
    if not active:
        await update.message.reply_text("No active incidents. ✅")
        return
    msg = "🚨 *Active Incidents*\n"
    for inc in active:
        severity = inc.get('severity', 'info').upper()
        msg += f"• [{severity}] *{inc.get('type')}*: {inc.get('message')}\n"
    await update.message.reply_text(msg, parse_mode="Markdown")
 async def actions_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    if update.effective_user.id not in ALLOWED_IDS: return
    actions = await fetch_api("/actions")
    if actions is None:
        await update.message.reply_text("❌ Actions endpoint unavailable.")
        return
    msg = "⚡ *Actions Summary*\n"
    total = 0
    for status, act_list in actions.items():
        if act_list:
            msg += f"• {status.capitalize()}: {len(act_list)}\n"
            total += len(act_list)
    if total == 0:
        msg = "No actions recorded."
    await update.message.reply_text(msg, parse_mode="Markdown")
 async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    msg = (
        "📖 *Supported Commands*\n\n"
        "/status - Check bot and API connectivity\n"
        "/summary - System health overview\n"
        "/nodes - List homelab nodes and their status\n"
        "/services - Summary of services across nodes\n"
        "/unhealthy - List all unhealthy components\n"
        "/incidents - View active incidents\n"
        "/actions - Summary of operator actions\n"
        "/help - Show this help message\n\n"
        "Free text will be handled by the guidance system."
    )
    await update.message.reply_text(msg, parse_mode="Markdown")
 async def handle_fallback(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handles non-command messages."""
    if update.effective_user.id not in ALLOWED_IDS: return
    if ENABLE_LLM_FALLBACK and OPENCLAW_BASE_URL:
        # Placeholder for OpenClaw LLM fallback
        # In a real scenario, this would call the LLM API
        logger.info(f"LLM fallback requested for: {update.message.text}")
    await update.message.reply_text(
        "Use /summary, /nodes, /services, /unhealthy, /incidents, /actions."
    )
 async def run_bot():
    if not TOKEN:
        print("CRITICAL: TELEGRAM_BOT_TOKEN is not set. Telegram bot will not start.")
@ -176,21 +390,31 @@ async def run_bot():
        return
    bot_logic = ApprovalBot()
-    
+
    application = ApplicationBuilder().token(TOKEN).build()
-    
+
    application.add_handler(CommandHandler("start", start_command))
    application.add_handler(CommandHandler("status", status_command))
    application.add_handler(CommandHandler("summary", summary_command))
    application.add_handler(CommandHandler("nodes", nodes_command))
    application.add_handler(CommandHandler("services", services_command))
    application.add_handler(CommandHandler("unhealthy", unhealthy_command))
    application.add_handler(CommandHandler("incidents", incidents_command))
    application.add_handler(CommandHandler("actions", actions_command))
    application.add_handler(CommandHandler("help", help_command))
    application.add_handler(MessageHandler(filters.TEXT & (~filters.COMMAND), handle_fallback))
    application.add_handler(CallbackQueryHandler(bot_logic.handle_callback))
-    
+
    # Schedule the pending actions check
    job_queue = application.job_queue
    job_queue.run_repeating(bot_logic.check_pending_actions, interval=10, first=5)
-    
+
    logger.info("Starting Telegram Approval Bot...")
    await application.initialize()
    await application.start()
    await application.updater.start_polling()
-    
+
    # Run until the application is stopped
    stop_event = asyncio.Event()
    try: