Refactor Telegram bot to use control plane API

This commit is contained in:
oskar 2026-05-17 23:42:52 +02:00
parent c299a2cb85
commit 5754994f8e
5 changed files with 271 additions and 19 deletions

View file

@ -5,6 +5,24 @@ Central runtime materializer and Operator Control Plane UI.
- **Redis**: Central state store (on PIHA). - **Redis**: Central state store (on PIHA).
- **Runtime Materializer**: Converts Redis state to JSON files in `/opt/homelab/world`. - **Runtime Materializer**: Converts Redis state to JSON files in `/opt/homelab/world`.
- **Web UI**: Exposes API endpoints and serving the Operator UI. - **Web UI**: Exposes API endpoints and serving the Operator UI.
- **Telegram Bot**: Provides operator commands and action approvals via Telegram.
#### Configuration
Environment variables should be set in `.env` (see `env.example`).
Key variables for the Telegram Bot:
- `TELEGRAM_BOT_TOKEN`: Your bot token from @BotFather.
- `TELEGRAM_ALLOWED_USER_IDS`: Comma-separated list of authorized Telegram User IDs.
- `CONTROL_PLANE_URL`: URL to the `agent-system-webui` (default: `http://webui:8080`).
#### Telegram Commands
- `/status`: Check bot and API connectivity.
- `/summary`: System health overview.
- `/nodes`: List homelab nodes and their status.
- `/services`: Summary of services across nodes.
- `/unhealthy`: List all unhealthy components.
- `/incidents`: View active incidents.
- `/actions`: Summary of operator actions.
- `/help`: List all commands.
#### Deployment (on PIHA) #### Deployment (on PIHA)
```bash ```bash

View file

@ -11,7 +11,9 @@ echo ">>> Services status:"
docker ps --filter "name=agent-system" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" docker ps --filter "name=agent-system" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
if [ -z "$TELEGRAM_BOT_TOKEN" ]; then if [ -z "$TELEGRAM_BOT_TOKEN" ]; then
echo "NOTE: TELEGRAM_BOT_TOKEN is not set. Telegram approval bot will be disabled." echo ">>> Telegram bot status: DISABLED (token missing)"
else
echo ">>> Telegram bot status: ENABLED"
fi fi
echo ">>> Verifying API endpoints..." echo ">>> Verifying API endpoints..."

View file

@ -38,7 +38,10 @@ services:
environment: environment:
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN} TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
TELEGRAM_ALLOWED_USER_IDS: ${TELEGRAM_ALLOWED_USER_IDS} TELEGRAM_ALLOWED_USER_IDS: ${TELEGRAM_ALLOWED_USER_IDS}
CONTROL_PLANE_URL: ${CONTROL_PLANE_URL:-http://webui:8080}
ENABLE_LLM_FALLBACK: ${ENABLE_LLM_FALLBACK:-false}
OPENCLAW_BASE_URL: ${OPENCLAW_BASE_URL}
ACTIONS_ROOT: /opt/homelab/actions ACTIONS_ROOT: /opt/homelab/actions
volumes: volumes:
- /opt/homelab:/opt/homelab - /opt/homelab:/opt/homelab
restart: unless-stopped restart: on-failure

View file

@ -3,6 +3,11 @@
TELEGRAM_BOT_TOKEN=123456789:ABCdefGHIjklMNOpqrsTUVwxyz TELEGRAM_BOT_TOKEN=123456789:ABCdefGHIjklMNOpqrsTUVwxyz
# Comma-separated list of Telegram User IDs # Comma-separated list of Telegram User IDs
TELEGRAM_ALLOWED_USER_IDS=12345678,87654321 TELEGRAM_ALLOWED_USER_IDS=12345678,87654321
# Local control-plane API (default is internal compose address)
CONTROL_PLANE_URL=http://webui:8080
# Optional LLM fallback logic
ENABLE_LLM_FALLBACK=false
OPENCLAW_BASE_URL=http://openclaw.internal
# Runtime Materializer Configuration # Runtime Materializer Configuration
REDIS_HOST=100.108.208.3 REDIS_HOST=100.108.208.3

View file

@ -3,9 +3,11 @@ import json
import time import time
import asyncio import asyncio
import logging import logging
import urllib.request
import urllib.error
from pathlib import Path from pathlib import Path
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, CallbackQueryHandler from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, CallbackQueryHandler, MessageHandler, filters
# Setup logging # Setup logging
logging.basicConfig( logging.basicConfig(
@ -18,6 +20,39 @@ logger = logging.getLogger(__name__)
TOKEN = os.getenv("TELEGRAM_BOT_TOKEN") TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")
ALLOWED_IDS = [int(i.strip()) for i in os.getenv("TELEGRAM_ALLOWED_USER_IDS", "").split(",") if i.strip()] ALLOWED_IDS = [int(i.strip()) for i in os.getenv("TELEGRAM_ALLOWED_USER_IDS", "").split(",") if i.strip()]
ACTIONS_ROOT = Path(os.getenv("ACTIONS_ROOT", "/opt/homelab/actions")) ACTIONS_ROOT = Path(os.getenv("ACTIONS_ROOT", "/opt/homelab/actions"))
CONTROL_PLANE_URL = os.getenv("CONTROL_PLANE_URL", "http://webui:8080")
ENABLE_LLM_FALLBACK = os.getenv("ENABLE_LLM_FALLBACK", "false").lower() == "true"
OPENCLAW_BASE_URL = os.getenv("OPENCLAW_BASE_URL")
async def fetch_api(path):
"""Helper to fetch JSON from the Control Plane API."""
url = f"{CONTROL_PLANE_URL.rstrip('/')}/{path.lstrip('/')}"
try:
def do_request():
req = urllib.request.Request(url)
with urllib.request.urlopen(req, timeout=5) as response:
if response.status != 200:
return None
return json.loads(response.read().decode())
return await asyncio.to_thread(do_request)
except Exception as e:
logger.error(f"Error fetching {url}: {e}")
return None
async def post_api(path, data):
"""Helper to POST JSON to the Control Plane API."""
url = f"{CONTROL_PLANE_URL.rstrip('/')}/{path.lstrip('/')}"
try:
body = json.dumps(data).encode("utf-8")
def do_request():
req = urllib.request.Request(url, data=body, method="POST")
req.add_header("Content-Type", "application/json")
with urllib.request.urlopen(req, timeout=5) as response:
return response.status == 200
return await asyncio.to_thread(do_request)
except Exception as e:
logger.error(f"Error posting to {url}: {e}")
return False
class ApprovalBot: class ApprovalBot:
def __init__(self): def __init__(self):
@ -59,7 +94,7 @@ class ApprovalBot:
f"Node: `{data.get('node', 'unknown')}`\n" f"Node: `{data.get('node', 'unknown')}`\n"
f"Risk: *{data.get('risk', 'unknown')}*\n" f"Risk: *{data.get('risk', 'unknown')}*\n"
) )
if "details" in data: if "details" in data:
details_str = json.dumps(data['details'], indent=2) details_str = json.dumps(data['details'], indent=2)
if len(details_str) > 1000: if len(details_str) > 1000:
@ -96,16 +131,22 @@ class ApprovalBot:
return return
await query.answer() await query.answer()
cb_data = query.data cb_data = query.data
if ":" not in cb_data: if ":" not in cb_data:
return return
action, action_id = cb_data.split(":", 1) action, action_id = cb_data.split(":", 1)
target_status = "approved" if action == "approve" else "rejected" target_status = "approved" if action == "approve" else "rejected"
success, msg = self.move_action(action_id, target_status, user_id, query.from_user.username or str(user_id)) # Use API for mutation if available, fallback to local disk move
success = await post_api("/action/mutate", {"id": action_id, "status": target_status})
msg = "Success" if success else "API call failed"
if not success:
# Fallback to direct disk manipulation (original behavior)
success, msg = self.move_action(action_id, target_status, user_id, query.from_user.username or str(user_id))
if success: if success:
status_text = "✅ Approved" if target_status == "approved" else "❌ Rejected" status_text = "✅ Approved" if target_status == "approved" else "❌ Rejected"
await query.edit_message_text( await query.edit_message_text(
@ -131,11 +172,11 @@ class ApprovalBot:
try: try:
data = json.loads(source_path.read_text()) data = json.loads(source_path.read_text())
current_status = data.get("status", "pending") current_status = data.get("status", "pending")
# Update data # Update data
data["status"] = target_status data["status"] = target_status
data["updated_at"] = time.time() data["updated_at"] = time.time()
history = data.get("transition_history", []) history = data.get("transition_history", [])
history.append({ history.append({
"from": current_status, "from": current_status,
@ -158,16 +199,189 @@ async def start_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
"""Simple start command to help users find their ID.""" """Simple start command to help users find their ID."""
user = update.effective_user user = update.effective_user
message = ( message = (
f"Hello {user.first_name}!\n" f"Hello {user.first_name}! 🤖\n"
f"Your Telegram User ID is: `{user.id}`\n\n" f"Your Telegram User ID is: `{user.id}`\n\n"
) )
if user.id in ALLOWED_IDS: if user.id in ALLOWED_IDS:
message += "✅ You are authorized to approve actions." message += "✅ You are authorized to manage the homelab.\n\n"
message += "Use /help to see available commands."
else: else:
message += "❌ You are NOT authorized. Add your ID to TELEGRAM_ALLOWED_USER_IDS." message += "❌ You are NOT authorized. Add your ID to `TELEGRAM_ALLOWED_USER_IDS`."
await update.message.reply_text(message, parse_mode="Markdown") await update.message.reply_text(message, parse_mode="Markdown")
async def status_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
if update.effective_user.id not in ALLOWED_IDS: return
res = await fetch_api("/summary")
status = "✅ Online" if res else "❌ Unreachable"
message = (
f"🤖 *Telegram Bot Status*\n"
f"Control Plane API: {status}\n"
f"Target URL: `{CONTROL_PLANE_URL}`\n"
)
await update.message.reply_text(message, parse_mode="Markdown")
async def summary_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
if update.effective_user.id not in ALLOWED_IDS: return
data = await fetch_api("/summary")
if not data:
await update.message.reply_text("❌ Failed to fetch summary from Control Plane.")
return
msg = "📊 *System Summary*\n"
msg += f"Status: `{data.get('status', 'unknown')}`\n"
msg += f"Nodes: {data.get('node_count', 0)}\n"
msg += f"Services: {data.get('service_count', 0)}\n"
msg += f"Active Incidents: {data.get('active_incidents_count', 0)}\n"
if data.get('stale'):
msg += "\n⚠️ *Warning: Data is stale!*"
await update.message.reply_text(msg, parse_mode="Markdown")
async def nodes_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
if update.effective_user.id not in ALLOWED_IDS: return
nodes = await fetch_api("/nodes")
if nodes is None:
await update.message.reply_text("❌ Failed to fetch nodes.")
return
if not nodes:
await update.message.reply_text("No nodes discovered in the fleet.")
return
msg = "🖥️ *Nodes Status*\n"
for node in nodes:
health_icon = "" if node.get('health') == 'nominal' else "⚠️" if node.get('health') == 'degraded' else ""
msg += f"{health_icon} *{node.get('hostname')}*: `{node.get('status', 'unknown')}`\n"
msg += f" Last seen: {node.get('last_seen', 'N/A')}\n"
await update.message.reply_text(msg, parse_mode="Markdown")
async def services_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
if update.effective_user.id not in ALLOWED_IDS: return
services = await fetch_api("/services")
if services is None:
await update.message.reply_text("❌ Failed to fetch services.")
return
# Summarize by node
nodes = {}
for s in services:
node = s.get("node", "unknown")
if node not in nodes: nodes[node] = []
nodes[node].append(s)
msg = "⚙️ *Services Summary*\n"
if not nodes:
msg += "No services discovered."
else:
for node, svc_list in sorted(nodes.items()):
nominal = len([s for s in svc_list if s.get("health") == "nominal"])
msg += f"• *{node}*: {nominal}/{len(svc_list)} nominal\n"
msg += "\nUse /unhealthy to see issues."
await update.message.reply_text(msg, parse_mode="Markdown")
async def unhealthy_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
if update.effective_user.id not in ALLOWED_IDS: return
services = await fetch_api("/services")
nodes = await fetch_api("/nodes")
msg = "⚠️ *Unhealthy Components*\n"
found = False
if services:
for s in services:
health = s.get("health", "").lower()
if health != "nominal":
msg += f"• Service *{s.get('name')}* on *{s.get('node')}*: `{health}`\n"
found = True
if nodes:
for n in nodes:
checks = n.get("checks", {})
if isinstance(checks, str):
try: checks = json.loads(checks)
except: checks = {}
docker = checks.get("docker", {})
if docker.get("status") == "ok":
for c in docker.get("containers", []):
if c.get("state") != "running":
msg += f"• Container *{c.get('name')}* on *{n.get('hostname')}*: `{c.get('state')}`\n"
found = True
if not found:
msg += "All systems nominal. ✅"
await update.message.reply_text(msg, parse_mode="Markdown")
async def incidents_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
if update.effective_user.id not in ALLOWED_IDS: return
incidents = await fetch_api("/incidents")
if incidents is None:
await update.message.reply_text("❌ Failed to fetch incidents.")
return
active = [i for i in incidents if i.get("status") not in ("resolved", "closed")]
if not active:
await update.message.reply_text("No active incidents. ✅")
return
msg = "🚨 *Active Incidents*\n"
for inc in active:
severity = inc.get('severity', 'info').upper()
msg += f"• [{severity}] *{inc.get('type')}*: {inc.get('message')}\n"
await update.message.reply_text(msg, parse_mode="Markdown")
async def actions_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
if update.effective_user.id not in ALLOWED_IDS: return
actions = await fetch_api("/actions")
if actions is None:
await update.message.reply_text("❌ Actions endpoint unavailable.")
return
msg = "⚡ *Actions Summary*\n"
total = 0
for status, act_list in actions.items():
if act_list:
msg += f"{status.capitalize()}: {len(act_list)}\n"
total += len(act_list)
if total == 0:
msg = "No actions recorded."
await update.message.reply_text(msg, parse_mode="Markdown")
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
msg = (
"📖 *Supported Commands*\n\n"
"/status - Check bot and API connectivity\n"
"/summary - System health overview\n"
"/nodes - List homelab nodes and their status\n"
"/services - Summary of services across nodes\n"
"/unhealthy - List all unhealthy components\n"
"/incidents - View active incidents\n"
"/actions - Summary of operator actions\n"
"/help - Show this help message\n\n"
"Free text will be handled by the guidance system."
)
await update.message.reply_text(msg, parse_mode="Markdown")
async def handle_fallback(update: Update, context: ContextTypes.DEFAULT_TYPE):
"""Handles non-command messages."""
if update.effective_user.id not in ALLOWED_IDS: return
if ENABLE_LLM_FALLBACK and OPENCLAW_BASE_URL:
# Placeholder for OpenClaw LLM fallback
# In a real scenario, this would call the LLM API
logger.info(f"LLM fallback requested for: {update.message.text}")
await update.message.reply_text(
"Use /summary, /nodes, /services, /unhealthy, /incidents, /actions."
)
async def run_bot(): async def run_bot():
if not TOKEN: if not TOKEN:
print("CRITICAL: TELEGRAM_BOT_TOKEN is not set. Telegram bot will not start.") print("CRITICAL: TELEGRAM_BOT_TOKEN is not set. Telegram bot will not start.")
@ -176,21 +390,31 @@ async def run_bot():
return return
bot_logic = ApprovalBot() bot_logic = ApprovalBot()
application = ApplicationBuilder().token(TOKEN).build() application = ApplicationBuilder().token(TOKEN).build()
application.add_handler(CommandHandler("start", start_command)) application.add_handler(CommandHandler("start", start_command))
application.add_handler(CommandHandler("status", status_command))
application.add_handler(CommandHandler("summary", summary_command))
application.add_handler(CommandHandler("nodes", nodes_command))
application.add_handler(CommandHandler("services", services_command))
application.add_handler(CommandHandler("unhealthy", unhealthy_command))
application.add_handler(CommandHandler("incidents", incidents_command))
application.add_handler(CommandHandler("actions", actions_command))
application.add_handler(CommandHandler("help", help_command))
application.add_handler(MessageHandler(filters.TEXT & (~filters.COMMAND), handle_fallback))
application.add_handler(CallbackQueryHandler(bot_logic.handle_callback)) application.add_handler(CallbackQueryHandler(bot_logic.handle_callback))
# Schedule the pending actions check # Schedule the pending actions check
job_queue = application.job_queue job_queue = application.job_queue
job_queue.run_repeating(bot_logic.check_pending_actions, interval=10, first=5) job_queue.run_repeating(bot_logic.check_pending_actions, interval=10, first=5)
logger.info("Starting Telegram Approval Bot...") logger.info("Starting Telegram Approval Bot...")
await application.initialize() await application.initialize()
await application.start() await application.start()
await application.updater.start_polling() await application.updater.start_polling()
# Run until the application is stopped # Run until the application is stopped
stop_event = asyncio.Event() stop_event = asyncio.Event()
try: try: