# agent-system/monitor-agent/main.py
#
# Web file-viewer header captured together with the source (not program code):
# 116 lines | 3.2 KiB | Python | Raw / Normal View / History

import json
import os
import socket
import time
from http.client import HTTPException
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
# Runtime configuration, read once from the environment at import time.

# Node name attached to every event; falls back to the hostname when the
# NODE_NAME variable is unset or empty.
NODE_NAME = os.environ.get("NODE_NAME") or socket.gethostname()

# URL that events are POSTed to (None when the variable is unset).
ORCHESTRATOR_URL = os.environ.get("ORCHESTRATOR_URL")

# Comma-separated service names to monitor. Names are whitespace-stripped
# and empty items are dropped, so an unset variable gives an empty set.
SERVICES_TO_CHECK = set(
    filter(None, map(str.strip, os.environ.get("SERVICES_TO_CHECK", "").split(",")))
)

# Length of one monitoring cycle in seconds (default 30).
INTERVAL_SECONDS = int(os.environ.get("INTERVAL_SECONDS", "30"))

# Every service this agent knows how to probe. "http" entries carry a
# "url"; "tcp" entries carry a "host" and a "port".
SERVICE_CATALOG = [
    dict(name="homeassistant", type="http", url="http://homeassistant:8123"),
    dict(name="lms", type="tcp", host="192.168.31.6", port=9000),
    dict(name="forgejo", type="http", url="http://forgejo:3000"),
    dict(name="nginx", type="http", url="http://nginx"),
    dict(name="mosquitto", type="tcp", host="mosquitto", port=1883),
]
def services_to_check():
    """Return the catalog entries this agent should monitor.

    Returns:
        The SERVICE_CATALOG list itself (not a copy) when SERVICES_TO_CHECK
        is empty; otherwise a new list holding only the catalog entries
        whose "name" is in SERVICES_TO_CHECK, in catalog order. Requested
        names that are not in the catalog are ignored.
    """
    if not SERVICES_TO_CHECK:
        return SERVICE_CATALOG
    return [
        service
        for service in SERVICE_CATALOG
        if service["name"] in SERVICES_TO_CHECK
    ]
def check_http(url):
    """Probe an HTTP endpoint and report "ok" or "error".

    Issues a GET request with a 5-second timeout. Only a final response
    status of exactly 200 counts as healthy.

    Args:
        url: Absolute URL to fetch.

    Returns:
        "ok" when the response status is 200, otherwise "error". A
        malformed URL, a connection failure, a timeout, an HTTP error
        status and an unparseable reply are all reported as "error"
        rather than raised.
    """
    try:
        # Request() validates the URL and raises ValueError on a malformed
        # one, so it has to sit inside the try as well.
        request = Request(url, headers={"User-Agent": "monitor-agent/1.0"})
        with urlopen(request, timeout=5) as response:
            return "ok" if response.status == 200 else "error"
    except (
        HTTPError,
        URLError,
        TimeoutError,
        OSError,
        # Protocol-level failures (e.g. a non-HTTP listener answering with
        # garbage) are not OSError subclasses.
        HTTPException,
        ValueError,
    ):
        return "error"
def check_tcp(host, port):
    """Probe a TCP endpoint and report "ok" or "error".

    Opens a connection with a 5-second timeout and closes it immediately.

    Args:
        host: Host name or IP address to connect to.
        port: Port number; anything int() accepts.

    Returns:
        "ok" when the connection is established, otherwise "error". A
        port that cannot be converted to an int, or that is out of range,
        is reported as "error" rather than raised.
    """
    try:
        with socket.create_connection((host, int(port)), timeout=5):
            return "ok"
    except (OSError, ValueError, TypeError, OverflowError):
        # ValueError/TypeError come from int(port); OverflowError from a
        # port outside 0-65535.
        return "error"
def check_service(service):
    """Run the probe that matches a catalog entry's "type".

    Args:
        service: Catalog entry (dict). "http" entries need a "url";
            "tcp" entries need a "host" and a "port".

    Returns:
        The result of check_http or check_tcp, or "error" when the type
        is unknown or the entry lacks the fields its probe needs.
    """
    service_type = service.get("type")
    if service_type == "http":
        url = service.get("url")
        # An incomplete entry is reported as unhealthy instead of raising
        # KeyError and taking the caller down.
        return check_http(url) if url else "error"
    if service_type == "tcp":
        host = service.get("host")
        port = service.get("port")
        if host is None or port is None:
            return "error"
        return check_tcp(host, port)
    return "error"
def build_event(service, status):
    """Build the health event payload for one probe result.

    Args:
        service: Catalog entry; only its "name" is used.
        status: Probe result string to report.

    Returns:
        A dict with the keys "type" (always "health"), "service",
        "status", "timestamp" (current time.time() value), "run_id"
        (always None) and "node" (NODE_NAME).
    """
    return {
        "type": "health",
        "service": service["name"],
        "status": status,
        "timestamp": time.time(),
        "run_id": None,
        "node": NODE_NAME,
    }
def send_event(event):
    """POST one event to ORCHESTRATOR_URL as UTF-8 encoded JSON.

    Args:
        event: JSON-serializable event payload.

    Raises:
        RuntimeError: If the response status is 300 or higher.
        Exception: Whatever urlopen raises when the request fails (the
            request uses a 5-second timeout); nothing is caught here.
    """
    body = json.dumps(event).encode("utf-8")
    request = Request(
        ORCHESTRATOR_URL,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urlopen(request, timeout=5) as response:
        if response.status >= 300:
            raise RuntimeError(f"event endpoint returned {response.status}")
def main():
    """Run the monitoring loop until the process is stopped.

    Each cycle probes every selected service, sends one health event per
    service to the orchestrator, prints the event as JSON, and then sleeps
    for whatever remains of INTERVAL_SECONDS.

    Raises:
        SystemExit: If ORCHESTRATOR_URL is not set.
    """
    if not ORCHESTRATOR_URL:
        raise SystemExit("ORCHESTRATOR_URL is required")

    selected_services = services_to_check()
    print(
        (
            f"[monitor-agent] ready node={NODE_NAME} "
            f"url={ORCHESTRATOR_URL} "
            f"services={[service['name'] for service in selected_services]}"
        ),
        flush=True,
    )

    while True:
        # Monotonic clock: a wall-clock adjustment during a cycle must not
        # shorten or stretch the sleep below.
        started = time.monotonic()
        for service in selected_services:
            try:
                status = check_service(service)
            except Exception as exc:
                # One misbehaving probe must not stop the whole agent;
                # log it and report the service as unhealthy.
                print(f"[monitor-agent] check failed: {exc}", flush=True)
                status = "error"
            event = build_event(service, status)
            try:
                send_event(event)
            except Exception as exc:
                # Delivery is best-effort; the loop continues.
                print(f"[monitor-agent] send failed: {exc}", flush=True)
            # NOTE(review): indentation was lost in the reviewed copy.
            # This print is assumed to run for every event, not only after
            # a failed send - confirm against the original file.
            print(json.dumps(event), flush=True)
        elapsed = time.monotonic() - started
        time.sleep(max(0, INTERVAL_SECONDS - elapsed))
# Start the agent only when the file is executed as a script.
if __name__ == "__main__":
    main()