Refactor Telegram bot to use control plane API
This commit is contained in:
parent
c299a2cb85
commit
5754994f8e
|
|
@ -5,6 +5,24 @@ Central runtime materializer and Operator Control Plane UI.
|
||||||
- **Redis**: Central state store (on PIHA).
|
- **Redis**: Central state store (on PIHA).
|
||||||
- **Runtime Materializer**: Converts Redis state to JSON files in `/opt/homelab/world`.
|
- **Runtime Materializer**: Converts Redis state to JSON files in `/opt/homelab/world`.
|
||||||
- **Web UI**: Exposes API endpoints and serves the Operator UI.
|
- **Web UI**: Exposes API endpoints and serves the Operator UI.
|
||||||
|
- **Telegram Bot**: Provides operator commands and action approvals via Telegram.
|
||||||
|
|
||||||
|
#### Configuration
|
||||||
|
Environment variables should be set in `.env` (see `env.example`).
|
||||||
|
Key variables for the Telegram Bot:
|
||||||
|
- `TELEGRAM_BOT_TOKEN`: Your bot token from @BotFather.
|
||||||
|
- `TELEGRAM_ALLOWED_USER_IDS`: Comma-separated list of authorized Telegram User IDs.
|
||||||
|
- `CONTROL_PLANE_URL`: URL to the `agent-system-webui` (default: `http://webui:8080`).
|
||||||
|
|
||||||
|
#### Telegram Commands
|
||||||
|
- `/status`: Check bot and API connectivity.
|
||||||
|
- `/summary`: System health overview.
|
||||||
|
- `/nodes`: List homelab nodes and their status.
|
||||||
|
- `/services`: Summary of services across nodes.
|
||||||
|
- `/unhealthy`: List all unhealthy components.
|
||||||
|
- `/incidents`: View active incidents.
|
||||||
|
- `/actions`: Summary of operator actions.
|
||||||
|
- `/help`: List all commands.
|
||||||
|
|
||||||
#### Deployment (on PIHA)
|
#### Deployment (on PIHA)
|
||||||
```bash
|
```bash
|
||||||
|
|
|
||||||
|
|
@ -11,7 +11,9 @@ echo ">>> Services status:"
|
||||||
docker ps --filter "name=agent-system" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
|
docker ps --filter "name=agent-system" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
|
||||||
|
|
||||||
if [ -z "$TELEGRAM_BOT_TOKEN" ]; then
|
if [ -z "$TELEGRAM_BOT_TOKEN" ]; then
|
||||||
echo "NOTE: TELEGRAM_BOT_TOKEN is not set. Telegram approval bot will be disabled."
|
echo ">>> Telegram bot status: DISABLED (token missing)"
|
||||||
|
else
|
||||||
|
echo ">>> Telegram bot status: ENABLED"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo ">>> Verifying API endpoints..."
|
echo ">>> Verifying API endpoints..."
|
||||||
|
|
|
||||||
|
|
@ -38,7 +38,10 @@ services:
|
||||||
environment:
|
environment:
|
||||||
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
|
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
|
||||||
TELEGRAM_ALLOWED_USER_IDS: ${TELEGRAM_ALLOWED_USER_IDS}
|
TELEGRAM_ALLOWED_USER_IDS: ${TELEGRAM_ALLOWED_USER_IDS}
|
||||||
|
CONTROL_PLANE_URL: ${CONTROL_PLANE_URL:-http://webui:8080}
|
||||||
|
ENABLE_LLM_FALLBACK: ${ENABLE_LLM_FALLBACK:-false}
|
||||||
|
OPENCLAW_BASE_URL: ${OPENCLAW_BASE_URL}
|
||||||
ACTIONS_ROOT: /opt/homelab/actions
|
ACTIONS_ROOT: /opt/homelab/actions
|
||||||
volumes:
|
volumes:
|
||||||
- /opt/homelab:/opt/homelab
|
- /opt/homelab:/opt/homelab
|
||||||
restart: unless-stopped
|
restart: on-failure
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,11 @@
|
||||||
TELEGRAM_BOT_TOKEN=123456789:ABCdefGHIjklMNOpqrsTUVwxyz
|
TELEGRAM_BOT_TOKEN=123456789:ABCdefGHIjklMNOpqrsTUVwxyz
|
||||||
# Comma-separated list of Telegram User IDs
|
# Comma-separated list of Telegram User IDs
|
||||||
TELEGRAM_ALLOWED_USER_IDS=12345678,87654321
|
TELEGRAM_ALLOWED_USER_IDS=12345678,87654321
|
||||||
|
# Local control-plane API (default is internal compose address)
|
||||||
|
CONTROL_PLANE_URL=http://webui:8080
|
||||||
|
# Optional LLM fallback logic
|
||||||
|
ENABLE_LLM_FALLBACK=false
|
||||||
|
OPENCLAW_BASE_URL=http://openclaw.internal
|
||||||
|
|
||||||
# Runtime Materializer Configuration
|
# Runtime Materializer Configuration
|
||||||
REDIS_HOST=100.108.208.3
|
REDIS_HOST=100.108.208.3
|
||||||
|
|
|
||||||
|
|
@ -3,9 +3,11 @@ import json
|
||||||
import time
|
import time
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
|
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
|
||||||
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, CallbackQueryHandler
|
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, CallbackQueryHandler, MessageHandler, filters
|
||||||
|
|
||||||
# Setup logging
|
# Setup logging
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
|
|
@ -18,6 +20,39 @@ logger = logging.getLogger(__name__)
|
||||||
TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")
|
TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")
|
||||||
ALLOWED_IDS = [int(i.strip()) for i in os.getenv("TELEGRAM_ALLOWED_USER_IDS", "").split(",") if i.strip()]
|
ALLOWED_IDS = [int(i.strip()) for i in os.getenv("TELEGRAM_ALLOWED_USER_IDS", "").split(",") if i.strip()]
|
||||||
ACTIONS_ROOT = Path(os.getenv("ACTIONS_ROOT", "/opt/homelab/actions"))
|
ACTIONS_ROOT = Path(os.getenv("ACTIONS_ROOT", "/opt/homelab/actions"))
|
||||||
|
CONTROL_PLANE_URL = os.getenv("CONTROL_PLANE_URL", "http://webui:8080")
|
||||||
|
ENABLE_LLM_FALLBACK = os.getenv("ENABLE_LLM_FALLBACK", "false").lower() == "true"
|
||||||
|
OPENCLAW_BASE_URL = os.getenv("OPENCLAW_BASE_URL")
|
||||||
|
|
||||||
|
async def fetch_api(path):
    """Fetch a JSON document from the Control Plane API.

    The blocking ``urllib`` request runs in a worker thread through
    ``asyncio.to_thread`` so the event loop is not stalled.

    Args:
        path: Endpoint path; leading slashes are stripped before it is joined
            to ``CONTROL_PLANE_URL``.

    Returns:
        The decoded JSON payload, or ``None`` when the response status is not
        200 or any exception occurs (the exception is logged).
    """
    base = CONTROL_PLANE_URL.rstrip('/')
    url = f"{base}/{path.lstrip('/')}"

    def _blocking_get():
        request = urllib.request.Request(url)
        with urllib.request.urlopen(request, timeout=5) as resp:
            if resp.status == 200:
                return json.loads(resp.read().decode())
            return None

    try:
        payload = await asyncio.to_thread(_blocking_get)
    except Exception as exc:
        # Best-effort helper: callers treat None as "API unavailable".
        logger.error(f"Error fetching {url}: {exc}")
        payload = None
    return payload
|
||||||
|
|
||||||
|
async def post_api(path, data):
    """POST a JSON body to the Control Plane API.

    The blocking ``urllib`` request runs in a worker thread through
    ``asyncio.to_thread``.

    Args:
        path: Endpoint path; leading slashes are stripped before it is joined
            to ``CONTROL_PLANE_URL``.
        data: JSON-serializable object sent as the request body.

    Returns:
        ``True`` only when the response status is exactly 200. ``False`` for
        any other status or when serialization or the request raises (the
        exception is logged).
    """
    base = CONTROL_PLANE_URL.rstrip('/')
    url = f"{base}/{path.lstrip('/')}"
    try:
        # Serialization stays inside the try so a non-serializable payload
        # is reported as a failed call rather than an exception.
        payload = json.dumps(data).encode("utf-8")

        def _blocking_post():
            request = urllib.request.Request(
                url,
                data=payload,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with urllib.request.urlopen(request, timeout=5) as resp:
                return resp.status == 200

        outcome = await asyncio.to_thread(_blocking_post)
    except Exception as exc:
        logger.error(f"Error posting to {url}: {exc}")
        outcome = False
    return outcome
|
||||||
|
|
||||||
class ApprovalBot:
|
class ApprovalBot:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
@ -104,6 +139,12 @@ class ApprovalBot:
|
||||||
action, action_id = cb_data.split(":", 1)
|
action, action_id = cb_data.split(":", 1)
|
||||||
target_status = "approved" if action == "approve" else "rejected"
|
target_status = "approved" if action == "approve" else "rejected"
|
||||||
|
|
||||||
|
# Use API for mutation if available, fallback to local disk move
|
||||||
|
success = await post_api("/action/mutate", {"id": action_id, "status": target_status})
|
||||||
|
msg = "Success" if success else "API call failed"
|
||||||
|
|
||||||
|
if not success:
|
||||||
|
# Fallback to direct disk manipulation (original behavior)
|
||||||
success, msg = self.move_action(action_id, target_status, user_id, query.from_user.username or str(user_id))
|
success, msg = self.move_action(action_id, target_status, user_id, query.from_user.username or str(user_id))
|
||||||
|
|
||||||
if success:
|
if success:
|
||||||
|
|
@ -158,16 +199,189 @@ async def start_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||||
"""Simple start command to help users find their ID."""
|
"""Simple start command to help users find their ID."""
|
||||||
user = update.effective_user
|
user = update.effective_user
|
||||||
message = (
|
message = (
|
||||||
f"Hello {user.first_name}!\n"
|
f"Hello {user.first_name}! 🤖\n"
|
||||||
f"Your Telegram User ID is: `{user.id}`\n\n"
|
f"Your Telegram User ID is: `{user.id}`\n\n"
|
||||||
)
|
)
|
||||||
if user.id in ALLOWED_IDS:
|
if user.id in ALLOWED_IDS:
|
||||||
message += "✅ You are authorized to approve actions."
|
message += "✅ You are authorized to manage the homelab.\n\n"
|
||||||
|
message += "Use /help to see available commands."
|
||||||
else:
|
else:
|
||||||
message += "❌ You are NOT authorized. Add your ID to TELEGRAM_ALLOWED_USER_IDS."
|
message += "❌ You are NOT authorized. Add your ID to `TELEGRAM_ALLOWED_USER_IDS`."
|
||||||
|
|
||||||
await update.message.reply_text(message, parse_mode="Markdown")
|
await update.message.reply_text(message, parse_mode="Markdown")
|
||||||
|
|
||||||
|
async def status_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply with the bot's view of Control Plane API connectivity.

    Users whose ID is not in ``ALLOWED_IDS`` are ignored silently. The API is
    probed by fetching ``/summary``.
    """
    if update.effective_user.id not in ALLOWED_IDS:
        return

    res = await fetch_api("/summary")
    # fetch_api signals failure with None. An empty but valid JSON body still
    # means the API answered, so test identity rather than truthiness.
    status = "✅ Online" if res is not None else "❌ Unreachable"
    message = (
        f"🤖 *Telegram Bot Status*\n"
        f"Control Plane API: {status}\n"
        f"Target URL: `{CONTROL_PLANE_URL}`\n"
    )
    await update.message.reply_text(message, parse_mode="Markdown")
|
||||||
|
|
||||||
|
async def summary_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply with the system summary reported by the ``/summary`` endpoint.

    Users whose ID is not in ``ALLOWED_IDS`` are ignored silently. A missing
    or empty payload is reported as a fetch failure.
    """
    if update.effective_user.id not in ALLOWED_IDS:
        return

    summary = await fetch_api("/summary")
    if not summary:
        await update.message.reply_text("❌ Failed to fetch summary from Control Plane.")
        return

    parts = [
        "📊 *System Summary*\n",
        f"Status: `{summary.get('status', 'unknown')}`\n",
        f"Nodes: {summary.get('node_count', 0)}\n",
        f"Services: {summary.get('service_count', 0)}\n",
        f"Active Incidents: {summary.get('active_incidents_count', 0)}\n",
    ]
    if summary.get('stale'):
        parts.append("\n⚠️ *Warning: Data is stale!*")

    await update.message.reply_text("".join(parts), parse_mode="Markdown")
|
||||||
|
|
||||||
|
async def nodes_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply with one status entry per node returned by ``/nodes``.

    Users whose ID is not in ``ALLOWED_IDS`` are ignored silently. A ``None``
    result is reported as a fetch failure; an empty result as "no nodes".
    """
    if update.effective_user.id not in ALLOWED_IDS:
        return

    fleet = await fetch_api("/nodes")
    if fleet is None:
        await update.message.reply_text("❌ Failed to fetch nodes.")
        return
    if not fleet:
        await update.message.reply_text("No nodes discovered in the fleet.")
        return

    chunks = ["🖥️ *Nodes Status*\n"]
    for entry in fleet:
        # Anything other than nominal/degraded is shown as failed.
        health = entry.get('health')
        if health == 'nominal':
            icon = "✅"
        elif health == 'degraded':
            icon = "⚠️"
        else:
            icon = "❌"
        chunks.append(f"{icon} *{entry.get('hostname')}*: `{entry.get('status', 'unknown')}`\n")
        chunks.append(f" Last seen: {entry.get('last_seen', 'N/A')}\n")

    await update.message.reply_text("".join(chunks), parse_mode="Markdown")
|
||||||
|
|
||||||
|
async def services_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply with a per-node count of nominal services from ``/services``.

    Users whose ID is not in ``ALLOWED_IDS`` are ignored silently. A ``None``
    result is reported as a fetch failure.
    """
    if update.effective_user.id not in ALLOWED_IDS:
        return

    services = await fetch_api("/services")
    if services is None:
        await update.message.reply_text("❌ Failed to fetch services.")
        return

    # Group services under the node they run on; entries without a node
    # are collected under "unknown".
    by_node = {}
    for svc in services:
        by_node.setdefault(svc.get("node", "unknown"), []).append(svc)

    text = "⚙️ *Services Summary*\n"
    if by_node:
        for node_name, members in sorted(by_node.items()):
            healthy = sum(1 for svc in members if svc.get("health") == "nominal")
            text += f"• *{node_name}*: {healthy}/{len(members)} nominal\n"
    else:
        text += "No services discovered."

    text += "\nUse /unhealthy to see issues."
    await update.message.reply_text(text, parse_mode="Markdown")
|
||||||
|
|
||||||
|
async def unhealthy_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply with every service and container that is not healthy.

    Users whose ID is not in ``ALLOWED_IDS`` are ignored silently. Two
    sources are inspected:

    * ``/services``: any service whose ``health`` is not ``nominal``
      (compared case-insensitively).
    * ``/nodes``: for nodes whose docker check reports ``status == "ok"``,
      any container whose ``state`` is not ``running``. ``checks`` may
      arrive as a dict or as a JSON-encoded string.

    "All systems nominal" is only reported when both sources were fetched
    successfully and nothing unhealthy was found.
    """
    if update.effective_user.id not in ALLOWED_IDS:
        return

    services = await fetch_api("/services")
    nodes = await fetch_api("/nodes")

    # fetch_api returns None on failure. Without this guard an unreachable
    # API would be reported as "All systems nominal".
    missing = [
        label
        for label, payload in (("services", services), ("nodes", nodes))
        if payload is None
    ]
    if len(missing) == 2:
        await update.message.reply_text("❌ Failed to fetch services and nodes.")
        return

    findings = []

    for s in services or []:
        # `health` may be absent or null; normalise before lowercasing.
        health = str(s.get("health") or "").lower()
        if health != "nominal":
            findings.append(
                f"• Service *{s.get('name')}* on *{s.get('node')}*: `{health}`\n"
            )

    for n in nodes or []:
        checks = n.get("checks", {})
        if isinstance(checks, str):
            try:
                checks = json.loads(checks)
            except ValueError:  # json.JSONDecodeError is a ValueError
                checks = {}
        if not isinstance(checks, dict):
            checks = {}

        docker = checks.get("docker", {})
        if not isinstance(docker, dict) or docker.get("status") != "ok":
            continue
        for c in docker.get("containers") or []:
            if c.get("state") != "running":
                findings.append(
                    f"• Container *{c.get('name')}* on *{n.get('hostname')}*: `{c.get('state')}`\n"
                )

    msg = "⚠️ *Unhealthy Components*\n" + "".join(findings)
    if not findings and not missing:
        msg += "All systems nominal. ✅"
    elif not findings:
        msg += "No issues found in the data that could be fetched.\n"
    if missing:
        # Partial failure: never imply a clean bill of health.
        msg += f"\n⚠️ Could not fetch {missing[0]}; results may be incomplete."

    await update.message.reply_text(msg, parse_mode="Markdown")
|
||||||
|
|
||||||
|
async def incidents_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply with the incidents from ``/incidents`` that are still open.

    Users whose ID is not in ``ALLOWED_IDS`` are ignored silently. Incidents
    whose ``status`` is ``resolved`` or ``closed`` are left out.
    """
    if update.effective_user.id not in ALLOWED_IDS:
        return

    incidents = await fetch_api("/incidents")
    if incidents is None:
        await update.message.reply_text("❌ Failed to fetch incidents.")
        return

    finished_states = ("resolved", "closed")
    open_incidents = []
    for item in incidents:
        if item.get("status") not in finished_states:
            open_incidents.append(item)

    if not open_incidents:
        await update.message.reply_text("No active incidents. ✅")
        return

    rows = ["🚨 *Active Incidents*\n"]
    for item in open_incidents:
        level = item.get('severity', 'info').upper()
        rows.append(f"• [{level}] *{item.get('type')}*: {item.get('message')}\n")

    await update.message.reply_text("".join(rows), parse_mode="Markdown")
|
||||||
|
|
||||||
|
async def actions_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply with the number of operator actions per status from ``/actions``.

    Users whose ID is not in ``ALLOWED_IDS`` are ignored silently. The payload
    is iterated as a mapping of status to a collection of actions; statuses
    with no actions are left out.
    """
    if update.effective_user.id not in ALLOWED_IDS:
        return

    actions = await fetch_api("/actions")
    if actions is None:
        await update.message.reply_text("❌ Actions endpoint unavailable.")
        return

    rows = []
    grand_total = 0
    for state, entries in actions.items():
        if not entries:
            continue
        rows.append(f"• {state.capitalize()}: {len(entries)}\n")
        grand_total += len(entries)

    if grand_total == 0:
        text = "No actions recorded."
    else:
        text = "⚡ *Actions Summary*\n" + "".join(rows)

    await update.message.reply_text(text, parse_mode="Markdown")
|
||||||
|
|
||||||
|
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply with the list of supported commands.

    No authorization check is performed here; the text is static.
    """
    command_lines = (
        "/status - Check bot and API connectivity",
        "/summary - System health overview",
        "/nodes - List homelab nodes and their status",
        "/services - Summary of services across nodes",
        "/unhealthy - List all unhealthy components",
        "/incidents - View active incidents",
        "/actions - Summary of operator actions",
        "/help - Show this help message",
    )
    text = (
        "📖 *Supported Commands*\n\n"
        + "\n".join(command_lines)
        + "\n\n"
        + "Free text will be handled by the guidance system."
    )
    await update.message.reply_text(text, parse_mode="Markdown")
|
||||||
|
|
||||||
|
async def handle_fallback(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Answer non-command text with a pointer to the supported commands.

    Users whose ID is not in ``ALLOWED_IDS`` are ignored silently. When the
    LLM fallback is enabled and a base URL is configured, the request is only
    logged; no LLM call is made yet.
    """
    sender_id = update.effective_user.id
    if sender_id not in ALLOWED_IDS:
        return

    llm_configured = ENABLE_LLM_FALLBACK and OPENCLAW_BASE_URL
    if llm_configured:
        # Placeholder for the OpenClaw LLM fallback; the user still gets
        # the static hint below.
        logger.info(f"LLM fallback requested for: {update.message.text}")

    hint = "Use /summary, /nodes, /services, /unhealthy, /incidents, /actions."
    await update.message.reply_text(hint)
|
||||||
|
|
||||||
async def run_bot():
|
async def run_bot():
|
||||||
if not TOKEN:
|
if not TOKEN:
|
||||||
print("CRITICAL: TELEGRAM_BOT_TOKEN is not set. Telegram bot will not start.")
|
print("CRITICAL: TELEGRAM_BOT_TOKEN is not set. Telegram bot will not start.")
|
||||||
|
|
@ -180,6 +394,16 @@ async def run_bot():
|
||||||
application = ApplicationBuilder().token(TOKEN).build()
|
application = ApplicationBuilder().token(TOKEN).build()
|
||||||
|
|
||||||
application.add_handler(CommandHandler("start", start_command))
|
application.add_handler(CommandHandler("start", start_command))
|
||||||
|
application.add_handler(CommandHandler("status", status_command))
|
||||||
|
application.add_handler(CommandHandler("summary", summary_command))
|
||||||
|
application.add_handler(CommandHandler("nodes", nodes_command))
|
||||||
|
application.add_handler(CommandHandler("services", services_command))
|
||||||
|
application.add_handler(CommandHandler("unhealthy", unhealthy_command))
|
||||||
|
application.add_handler(CommandHandler("incidents", incidents_command))
|
||||||
|
application.add_handler(CommandHandler("actions", actions_command))
|
||||||
|
application.add_handler(CommandHandler("help", help_command))
|
||||||
|
|
||||||
|
application.add_handler(MessageHandler(filters.TEXT & (~filters.COMMAND), handle_fallback))
|
||||||
application.add_handler(CallbackQueryHandler(bot_logic.handle_callback))
|
application.add_handler(CallbackQueryHandler(bot_logic.handle_callback))
|
||||||
|
|
||||||
# Schedule the pending actions check
|
# Schedule the pending actions check
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue