116 lines
3.2 KiB
Python
116 lines
3.2 KiB
Python
|
|
import json
|
||
|
|
import os
|
||
|
|
import socket
|
||
|
|
import time
|
||
|
|
from urllib.error import HTTPError, URLError
|
||
|
|
from urllib.request import Request, urlopen
|
||
|
|
|
||
|
|
|
||
|
|
# Runtime configuration, read from the environment once at import time.

# Node name stamped on every event; an unset or empty NODE_NAME falls back
# to this machine's hostname.
NODE_NAME = os.environ.get("NODE_NAME") or socket.gethostname()

# Endpoint that receives the health events (None when the variable is unset).
ORCHESTRATOR_URL = os.environ.get("ORCHESTRATOR_URL")

# Comma-separated filter of service names. Surrounding whitespace is stripped
# and empty entries are dropped; an empty set means no filter was configured.
SERVICES_TO_CHECK = set(
    filter(
        None,
        map(str.strip, os.environ.get("SERVICES_TO_CHECK", "").split(",")),
    )
)

# Target length of one polling cycle, in seconds (defaults to 30).
INTERVAL_SECONDS = int(os.environ.get("INTERVAL_SECONDS", "30"))
|
||
|
|
# Every service this agent knows how to probe.
# "http" entries carry a "url"; "tcp" entries carry a "host" and a "port".
SERVICE_CATALOG = [
    {
        "name": "homeassistant",
        "type": "http",
        "url": "http://homeassistant:8123",
    },
    {
        # Addressed by a fixed IP address rather than a hostname.
        "name": "lms",
        "type": "tcp",
        "host": "192.168.31.6",
        "port": 9000,
    },
    {
        "name": "forgejo",
        "type": "http",
        "url": "http://forgejo:3000",
    },
    {
        "name": "nginx",
        "type": "http",
        "url": "http://nginx",
    },
    {
        "name": "mosquitto",
        "type": "tcp",
        "host": "mosquitto",
        "port": 1883,
    },
]
|
||
|
|
|
||
|
|
|
||
|
|
def services_to_check():
    """Return the catalog entries this agent should probe.

    When SERVICES_TO_CHECK is empty, the SERVICE_CATALOG list itself is
    returned. Otherwise a new list is built with only the entries whose
    "name" is in SERVICES_TO_CHECK, in catalog order.
    """
    if SERVICES_TO_CHECK:
        return [
            entry
            for entry in SERVICE_CATALOG
            if entry["name"] in SERVICES_TO_CHECK
        ]
    return SERVICE_CATALOG
|
||
|
|
|
||
|
|
|
||
|
|
def check_http(url):
    """Probe an HTTP endpoint and report whether it answered with status 200.

    Args:
        url: Absolute URL to request.

    Returns:
        "ok" when the response status is exactly 200. "error" for any other
        status and for every failure to obtain a response: malformed URL,
        connection or name-resolution failure, timeout (5 seconds), an error
        raised by urlopen, or a reply that is not valid HTTP.
    """
    # Imported locally so this function does not rely on an extra
    # module-level import being present.
    from http.client import HTTPException

    try:
        # Request() raises ValueError for a URL without a usable scheme, so
        # it is built inside the try: a bad URL is a failed check, not a
        # crash of the caller.
        request = Request(url, headers={"User-Agent": "monitor-agent/1.0"})
        with urlopen(request, timeout=5) as response:
            return "ok" if response.status == 200 else "error"
    except (
        HTTPError,
        URLError,
        TimeoutError,
        OSError,
        ValueError,
        # Malformed replies (e.g. a bad status line) are HTTPException
        # subclasses and are not covered by OSError.
        HTTPException,
    ):
        return "error"
|
||
|
|
|
||
|
|
|
||
|
|
def check_tcp(host, port):
    """Probe a TCP endpoint by opening a connection and closing it again.

    Args:
        host: Hostname or IP address to connect to.
        port: Port number; any value int() accepts, e.g. 1883 or "1883".

    Returns:
        "ok" if the connection is established within 5 seconds, otherwise
        "error". A port value that cannot be converted to an integer is also
        reported as "error" instead of raising.
    """
    try:
        port_number = int(port)
    except (TypeError, ValueError):
        # A malformed port is a failed check, not a reason to raise.
        return "error"

    try:
        with socket.create_connection((host, port_number), timeout=5):
            return "ok"
    except (OSError, OverflowError):
        # OSError covers refused connections, timeouts and name-resolution
        # failures. OverflowError is caught defensively in case the socket
        # layer rejects the port number as out of range.
        return "error"
|
||
|
|
|
||
|
|
|
||
|
|
def check_service(service):
    """Run the probe that matches a catalog entry's "type".

    Args:
        service: Mapping with a "type" key. "http" entries must also have
            "url"; "tcp" entries must also have "host" and "port".

    Returns:
        The result of check_http() or check_tcp() for the entry, or "error"
        when the type is missing or not one of "http" / "tcp".
    """
    kind = service.get("type")
    if kind == "http":
        outcome = check_http(service["url"])
    elif kind == "tcp":
        outcome = check_tcp(service["host"], service["port"])
    else:
        # Unknown or missing type: nothing to probe.
        outcome = "error"
    return outcome
|
||
|
|
|
||
|
|
|
||
|
|
def build_event(service, status):
    """Assemble the health event describing one probe result.

    Args:
        service: Catalog entry; only its "name" is read.
        status: Probe result to report.

    Returns:
        A dict with the keys "type", "service", "status", "timestamp",
        "run_id" and "node", in that order. "timestamp" is the current
        time.time() value, "run_id" is always None and "node" is NODE_NAME.
    """
    event = {"type": "health", "service": service["name"], "status": status}
    event["timestamp"] = time.time()
    event["run_id"] = None
    event["node"] = NODE_NAME
    return event
|
||
|
|
|
||
|
|
|
||
|
|
def send_event(event):
    """POST one event to ORCHESTRATOR_URL as a UTF-8 encoded JSON body.

    Args:
        event: JSON-serializable object, e.g. the dict from build_event().

    Raises:
        RuntimeError: If the endpoint answers with a status of 300 or above.

    Errors raised by json.dumps() or urlopen() are not caught here; the
    request uses a 5 second timeout.
    """
    payload = json.dumps(event).encode("utf-8")
    post = Request(ORCHESTRATOR_URL, data=payload, method="POST")
    post.add_header("Content-Type", "application/json")

    with urlopen(post, timeout=5) as reply:
        status = reply.status
        if status >= 300:
            raise RuntimeError(f"event endpoint returned {status}")
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Run the monitoring loop until the process is stopped.

    After printing a one-line "ready" banner, each cycle probes every
    selected service, builds a health event for it, tries to send the event
    to the orchestrator and prints the event as JSON on stdout. A failing
    probe or a failing send is logged and does not stop the loop. The cycle
    then sleeps for whatever is left of INTERVAL_SECONDS.

    Raises:
        SystemExit: If ORCHESTRATOR_URL is not configured.
    """
    if not ORCHESTRATOR_URL:
        raise SystemExit("ORCHESTRATOR_URL is required")

    selected_services = services_to_check()
    print(
        (
            f"[monitor-agent] ready node={NODE_NAME} "
            f"url={ORCHESTRATOR_URL} "
            f"services={[service['name'] for service in selected_services]}"
        ),
        flush=True,
    )

    while True:
        # Monotonic clock: the cycle duration must not be distorted by
        # wall-clock adjustments (NTP step, manual change) during a cycle.
        started = time.monotonic()
        for service in selected_services:
            try:
                status = check_service(service)
            except Exception as exc:
                # One misbehaving probe must not take the whole agent down;
                # report the service as failed and carry on.
                print(
                    f"[monitor-agent] check failed for "
                    f"{service.get('name')}: {exc}",
                    flush=True,
                )
                status = "error"

            event = build_event(service, status)
            try:
                send_event(event)
            except Exception as exc:
                print(f"[monitor-agent] send failed: {exc}", flush=True)
            # The event is printed whether or not the send succeeded.
            print(json.dumps(event), flush=True)

        elapsed = time.monotonic() - started
        # Never pass a negative value to sleep() when a cycle overruns.
        time.sleep(max(0, INTERVAL_SECONDS - elapsed))
|
||
|
|
|
||
|
|
|
||
|
|
# Start the agent only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|