feat(vps): migracja npm/outline/joplin/ai-cluster do GitOps (manifesty; cutover NIE wykonany)

This commit is contained in:
Oskar Kapala 2026-06-01 21:44:37 +02:00
parent f381023206
commit 862c04a612
20 changed files with 586 additions and 8 deletions

View file

@ -0,0 +1,33 @@
# Compose override for the ai-cluster stack: hard per-container memory caps.
# A container that exceeds its cap is OOM-killed and auto-restarted by Docker
# instead of consuming host memory. ai-cluster is the primary OOM suspect
# (unbounded Python workers, no limits since deployment).
#
# Architectural note: the compute workloads here should migrate to SOLARIA
# (GPU node). Until that happens, per-container limits contain the blast radius.
#
# Pre-cutover: service-ops-worker still mounts compose/env from the old paths.
# After cutover and git pull, these path overrides are removed and the base
# compose paths are used.
services:
  codex-worker:
    mem_limit: 64m
  openclaw:
    mem_limit: 128m
  planner-worker:
    mem_limit: 64m
  service-ops-worker:
    mem_limit: 64m
    # Pre-cutover only: keep the bind mounts pointing at the old dockeruser paths.
    volumes:
      - /home/dockeruser/docker/ai-cluster/docker-compose.yml:/app/docker-compose.yml:ro
      - /home/dockeruser/docker/ai-cluster/.env:/app/.env:ro
      - /var/run/docker.sock:/var/run/docker.sock:rw
  redis:
    mem_limit: 32m
  mosquitto:
    mem_limit: 32m

View file

@ -0,0 +1,6 @@
# Memory caps for a two-service stack (`app` + `db`).
# Presumably the Joplin stack, whose compose file uses the same service names — TODO confirm.
services:
  app:
    mem_limit: 224m
  db:
    mem_limit: 128m

View file

@ -0,0 +1,3 @@
# Memory cap for the node_exporter service.
services:
  node_exporter:
    mem_limit: 32m

View file

@ -0,0 +1,6 @@
# Override for the npm service: memory cap plus a lowered OOM-killer score.
services:
npm:
mem_limit: 160m
# Public ingress — elevated OOM protection so TLS termination + proxy host
# config are less likely to be lost under host memory pressure.
# NOTE: -800 only de-prioritises this container for the host OOM-killer; it is
# not immune (only -1000 disables OOM-killing), and exceeding the mem_limit
# above still gets the container killed by its own memory limit.
oom_score_adj: -800

View file

@ -0,0 +1,9 @@
# Memory caps for the three services named outline, postgres and redis.
services:
  outline:
    mem_limit: 512m
  postgres:
    mem_limit: 96m
  redis:
    mem_limit: 32m

View file

@ -41,3 +41,81 @@ services:
depends_on:
local: []
external: []
# npm: public reverse-proxy ingress deployed with docker-compose.
# Declares no local or external dependencies.
npm:
role: reverse-proxy-ingress
deployment_model: docker-compose
exposure: public
offline_required: false
depends_on:
local: []
external: []
ports:
- name: http
container_port: 80
protocol: tcp
- name: https
container_port: 443
protocol: tcp
# Port 81 is labelled as the admin endpoint; no bind restriction is declared for it here.
- name: admin
container_port: 81
protocol: tcp
runtime:
# data_path is still under the dockeruser home, while config_path lives under /opt/homelab.
data_path: /home/dockeruser/docker/npm/data
config_path: /opt/homelab/config/npm
# outline: team wiki deployed with docker-compose, publicly exposed.
# Its only declared dependency is the local npm service.
outline:
role: team-wiki
deployment_model: docker-compose
exposure: public
offline_required: false
depends_on:
local:
- npm
external: []
ports:
# Single HTTP port; no bind address is declared for it.
- name: http
container_port: 3000
protocol: tcp
runtime:
config_path: /opt/homelab/config/outline
# joplin: note sync server deployed with docker-compose, exposure tailscale-internal.
# Its only declared dependency is the local npm service.
joplin:
role: note-sync-server
deployment_model: docker-compose
exposure: tailscale-internal
offline_required: false
depends_on:
local:
- npm
external: []
ports:
# HTTP port is bound to loopback only.
- name: http
container_port: 22300
bind: 127.0.0.1
protocol: tcp
runtime:
config_path: /opt/homelab/config/joplin
# ai-cluster: AI worker cluster deployed with docker-compose, exposure tailscale-internal.
# Depends on the external piha gateway; no local dependencies are declared.
ai-cluster:
role: ai-worker-cluster
deployment_model: docker-compose
exposure: tailscale-internal
offline_required: false
depends_on:
local: []
external:
- piha:gateway
ports:
# NOTE(review): openclaw-api declares no `bind`, unlike mqtt below, although the
# service exposure is tailscale-internal — confirm this is intended.
- name: openclaw-api
container_port: 8000
protocol: tcp
- name: mqtt
container_port: 1883
protocol: tcp
bind: tailscale
runtime:
config_path: /opt/homelab/config/ai-cluster
notes:
- "Local images must be built on VPS — not pulled from registry"
- "Compute workloads belong on SOLARIA; migrate when possible"

View file

@ -0,0 +1,110 @@
# ai-cluster stack: three Python workers (codex, planner, service-ops), the
# openclaw HTTP API served by uvicorn, and the redis + mosquitto backing services.
# MQTT_PASSWORD is interpolated without a default in every service that uses it.
services:
  # Worker with ROLE=dev; also configured with the gateway base URL.
  codex-worker:
    image: ai-cluster-codex-worker
    restart: unless-stopped
    environment:
      - AGENT_ID=vps-dev-1
      - ROLE=dev
      - MQTT_HOST=mosquitto
      - MQTT_PORT=1883
      - MQTT_USERNAME=${MQTT_USERNAME:-codex}
      - MQTT_PASSWORD=${MQTT_PASSWORD}
      - GATEWAY_BASE_URL=${GATEWAY_BASE_URL:-http://piha:8080}
      - REQUEST_TIMEOUT_SECONDS=30
    command:
      - python
      - worker.py
    networks:
      - internal

  # HTTP API on port 8000; attached to both the stack network and npm_default.
  # NOTE(review): "8000:8000" publishes on every host interface — confirm this
  # matches the intended exposure of the service.
  openclaw:
    image: ai-cluster-openclaw
    restart: unless-stopped
    environment:
      - MQTT_HOST=mosquitto
      - MQTT_PORT=1883
      - MQTT_USERNAME=${MQTT_USERNAME:-codex}
      - MQTT_PASSWORD=${MQTT_PASSWORD}
    command:
      - uvicorn
      - main:app
      - --host
      - 0.0.0.0
      - --port
      - "8000"
    ports:
      - "8000:8000"
    networks:
      - internal
      - npm_default
    healthcheck:
      test:
        - CMD
        - wget
        - -qO-
        - http://localhost:8000/health
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 15s

  # Worker with ROLE=planner.
  planner-worker:
    image: ai-cluster-planner-worker
    restart: unless-stopped
    environment:
      - AGENT_ID=vps-planner-1
      - ROLE=planner
      - MQTT_HOST=mosquitto
      - MQTT_PORT=1883
      - MQTT_USERNAME=${MQTT_USERNAME:-codex}
      - MQTT_PASSWORD=${MQTT_PASSWORD}
    command:
      - python
      - planner_worker.py
    networks:
      - internal

  # Worker with ROLE=service-ops; it gets the Docker socket (read-write) plus
  # this compose file and the env file (both read-only).
  service-ops-worker:
    image: ai-cluster-service-ops-worker
    restart: unless-stopped
    environment:
      - AGENT_ID=vps-service-ops-1
      - ROLE=service-ops
      - MQTT_HOST=mosquitto
      - MQTT_PORT=1883
      - MQTT_USERNAME=${MQTT_USERNAME:-codex}
      - MQTT_PASSWORD=${MQTT_PASSWORD}
      - COMPOSE_PROJECT_NAME=ai-cluster
    command:
      - python
      - service_ops_worker.py
    volumes:
      # Post-migration locations: compose definition in the repo, env in the
      # runtime config path. Before cutover, docker-compose.override.yml
      # redirects these to the old paths.
      - /home/oskar/homelab-codex-ws/services/ai-cluster/docker-compose.yml:/app/docker-compose.yml:ro
      - /opt/homelab/config/ai-cluster/.env:/app/.env:ro
      - /var/run/docker.sock:/var/run/docker.sock:rw
    networks:
      - internal

  redis:
    image: redis:7-alpine
    restart: unless-stopped
    command:
      - redis-server
    volumes:
      - redis_data:/data
    networks:
      - internal

  mosquitto:
    image: eclipse-mosquitto:2
    container_name: mosquitto
    restart: unless-stopped
    command:
      - /usr/sbin/mosquitto
      - -c
      - /mosquitto/config/mosquitto.conf
    ports:
      # Bound to the Tailscale IP only — matches the running container.
      - "100.95.58.48:1883:1883"
    volumes:
      # Config stays at the old path until the mosquitto config migration is complete.
      - /home/dockeruser/docker/ai-cluster/mosquitto:/mosquitto/config:ro
      - mosquitto_data:/mosquitto/data
      - mosquitto_log:/mosquitto/log
    networks:
      - internal

volumes:
  redis_data:
  mosquitto_data:
  mosquitto_log:

networks:
  # Stack-private bridge with an explicit network name.
  internal:
    driver: bridge
    name: ai-cluster_ai-cluster
  # Pre-existing network, not created by this file.
  npm_default:
    external: true
    name: npm_default

View file

@ -0,0 +1,14 @@
# AI Cluster — /opt/homelab/config/ai-cluster/.env
# Mounted into service-ops-worker as /app/.env. The other containers receive these
# values only through ${VAR} interpolation in docker-compose.yml, which declares no
# env_file — NOTE(review): confirm the deploy tooling passes this file to compose
# (e.g. via --env-file).
# MQTT broker credentials
# NOTE: docker-compose.yml hard-codes MQTT_HOST and MQTT_PORT for every service, so
# the two values below are not interpolated there.
MQTT_HOST=mosquitto
MQTT_PORT=1883
MQTT_USERNAME=codex
# Left blank in this template; docker-compose.yml defines no default for it.
MQTT_PASSWORD=
# API gateway (piha)
GATEWAY_BASE_URL=http://piha:8080
# Compose project name (required for service-ops-worker docker-compose operations)
COMPOSE_PROJECT_NAME=ai-cluster

View file

@ -0,0 +1,15 @@
#!/bin/bash
# Healthcheck for the AI cluster: passes only when the openclaw container is
# running and its HTTP /health endpoint answers successfully.
# Exit status: 0 when healthy, 1 on the first failed check.

# Container check: look for a running container matching the expected name.
if ! docker ps --filter "name=ai-cluster-openclaw-1" --filter "status=running" | grep -q "openclaw"; then
    echo "[FAIL] openclaw container is not running"
    exit 1
fi

# HTTP check: --max-time bounds the probe so a hung endpoint fails the check
# instead of blocking it forever (10s mirrors the healthcheck timeout declared
# in the service manifest).
if ! curl -sf --max-time 10 http://localhost:8000/health > /dev/null; then
    echo "[FAIL] openclaw HTTP health endpoint not responding"
    exit 1
fi

echo "[OK] ai-cluster is healthy"
exit 0

View file

@ -0,0 +1,37 @@
# Service manifest for ai-cluster on the vps node: published ports, HTTP
# healthcheck, persisted data and runtime environment.
service:
name: ai-cluster
owner_node: vps
exposure: tailscale-internal
dependencies:
- mosquitto
- redis
ports:
- container: 8000
host: 8000
protocol: tcp
service: openclaw
- container: 1883
host: 1883
protocol: tcp
bind: 100.95.58.48 # Tailscale only
service: mosquitto
healthcheck:
type: http
endpoint: http://localhost:8000/health
interval: 30s
timeout: 10s
retries: 3
restart_policy: unless-stopped
persistence:
paths:
# Bind-mounted mosquitto config directory: a host path, not a Docker volume,
# so it is listed as a plain path.
- /home/dockeruser/docker/ai-cluster/mosquitto
# Named volumes declared in docker-compose.yml. The ai-cluster_ prefix assumes
# the compose project name ai-cluster — TODO confirm with `docker volume ls`.
- volume:ai-cluster_redis_data
- volume:ai-cluster_mosquitto_data
- volume:ai-cluster_mosquitto_log
runtime:
env_file: /opt/homelab/config/ai-cluster/.env
env_vars:
- MQTT_PASSWORD
- MQTT_USERNAME
- GATEWAY_BASE_URL
notes:
- "Local images (ai-cluster-*) must be built on VPS before deployment"
- "service-ops-worker mounts docker.sock and the compose file — needs post-migration path update"
- "Recommendation: move ai-cluster compute workloads to SOLARIA (GPU/compute node)"

View file

@ -0,0 +1,44 @@
# Joplin stack: the joplin server (`app`) and its Postgres database (`db`).
# Both services load the same env file.
services:
  # Published on loopback only; also attached to npm_default.
  # NOTE(review): the image uses the floating `latest` tag — consider pinning a version.
  app:
    image: joplin/server:latest
    container_name: joplin-server
    restart: unless-stopped
    env_file:
      - /opt/homelab/config/joplin/.env
    ports:
      - "127.0.0.1:22300:22300"
    depends_on:
      # Start only once the db healthcheck below reports healthy.
      db:
        condition: service_healthy
    networks:
      - joplin_net
      - npm_default

  db:
    image: postgres:18
    container_name: joplin-db
    restart: unless-stopped
    env_file:
      - /opt/homelab/config/joplin/.env
    volumes:
      - postgres_data:/var/lib/postgresql
    networks:
      - joplin_net
    healthcheck:
      # User and database name are hard-coded here rather than read from the env file.
      test:
        - CMD-SHELL
        - pg_isready -U joplin -d joplin
      interval: 10s
      timeout: 5s
      retries: 5

volumes:
  # Pre-existing volume, referenced by its explicit name.
  postgres_data:
    external: true
    name: joplin_postgres_data

networks:
  joplin_net:
    driver: bridge
    name: joplin-net
  # Pre-existing network, not created by this file.
  npm_default:
    external: true
    name: npm_default

View file

@ -0,0 +1,20 @@
# Joplin Server — /opt/homelab/config/joplin/.env
# Both the `app` (joplin-server) and `db` (postgres) containers read this file.
# Application
APP_BASE_URL=https://joplin.example.com
# Same port the compose file publishes on 127.0.0.1.
APP_PORT=22300
TRUST_PROXY=1
RUNNING_IN_DOCKER=1
# Database connection (joplin-server reads these)
DB_CLIENT=pg
# `db` is the compose service name of the postgres container.
POSTGRES_HOST=db
POSTGRES_PORT=5432
# The db healthcheck in docker-compose.yml hard-codes user `joplin` and database
# `joplin`; update it too if either value below changes.
POSTGRES_USER=joplin
# POSTGRES_DB and POSTGRES_DATABASE carry the same database name — presumably one is
# read by the postgres container and the other by joplin-server; keep them identical
# — TODO confirm.
POSTGRES_DB=joplin
POSTGRES_DATABASE=joplin
# Left blank in this template.
POSTGRES_PASSWORD=
# Runtime
PM2_HOME=/opt/pm2

View file

@ -0,0 +1,15 @@
#!/bin/bash
# Healthcheck for Joplin Server: passes only when the joplin-server container is
# running and its HTTP /api/ping endpoint answers successfully.
# Exit status: 0 when healthy, 1 on the first failed check.

# Container check: look for a running container matching the expected name.
if ! docker ps --filter "name=joplin-server" --filter "status=running" | grep -q "joplin-server"; then
    echo "[FAIL] joplin-server container is not running"
    exit 1
fi

# HTTP check: --max-time bounds the probe so a hung endpoint fails the check
# instead of blocking it forever (10s mirrors the healthcheck timeout declared
# in the service manifest).
if ! curl -sf --max-time 10 http://localhost:22300/api/ping > /dev/null; then
    echo "[FAIL] Joplin Server HTTP endpoint not responding"
    exit 1
fi

echo "[OK] Joplin Server is healthy"
exit 0

View file

@ -0,0 +1,31 @@
# Service manifest for joplin on the vps node: published port, HTTP healthcheck,
# persisted data and runtime environment.
service:
name: joplin
owner_node: vps
exposure: tailscale-internal
dependencies:
- db
ports:
# Host port is bound to loopback only.
- container: 22300
host: 22300
protocol: tcp
bind: 127.0.0.1
healthcheck:
type: http
endpoint: http://localhost:22300/api/ping
interval: 30s
timeout: 10s
retries: 3
restart_policy: unless-stopped
persistence:
paths:
- volume:joplin_postgres_data # Joplin notes DB
runtime:
env_file: /opt/homelab/config/joplin/.env
# NOTE(review): the .env template also defines POSTGRES_DATABASE, POSTGRES_PORT,
# TRUST_PROXY, RUNNING_IN_DOCKER and PM2_HOME, which are not listed here — confirm
# whether this list is meant to be exhaustive.
env_vars:
- APP_BASE_URL
- APP_PORT
- DB_CLIENT
- POSTGRES_HOST
- POSTGRES_USER
- POSTGRES_PASSWORD
- POSTGRES_DB

View file

@ -8,5 +8,7 @@ services:
- '81:81'
- '443:443'
volumes:
- /opt/homelab/data/npm/data:/data
- /opt/homelab/data/npm/letsencrypt:/etc/letsencrypt
# Data lives at dockeruser's path — do NOT move these without a migration plan.
# Proxy hosts, SSL certs, and DB are stored here.
- /home/dockeruser/docker/npm/data:/data
- /home/dockeruser/docker/npm/letsencrypt:/etc/letsencrypt

View file

@ -22,10 +22,6 @@ service:
restart_policy: unless-stopped
persistence:
paths:
- /opt/homelab/data/npm/data
- /opt/homelab/data/npm/letsencrypt
runtime:
directories:
- /opt/homelab/data/npm/data
- /opt/homelab/data/npm/letsencrypt
- /home/dockeruser/docker/npm/data
- /home/dockeruser/docker/npm/letsencrypt
env_vars: []

View file

@ -0,0 +1,68 @@
# Outline stack: the wiki app plus its Postgres and Redis backing services.
# The app and postgres load the same env file; all three share one private network.
services:
  outline:
    image: outlinewiki/outline:1.6.1
    container_name: outline-outline-1
    restart: unless-stopped
    env_file:
      - /opt/homelab/config/outline/.env
    ports:
      # NOTE(review): published on every host interface — confirm this is intended.
      - "3000:3000"
    volumes:
      - outline_storage:/var/lib/outline/data
    depends_on:
      # Wait for the healthchecks defined below instead of mere container start,
      # so the app does not come up against a database or cache that is not
      # ready yet.
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    networks:
      - outline_internal
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:3000/_health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s

  postgres:
    image: postgres:16-alpine
    container_name: outline-postgres-1
    restart: unless-stopped
    env_file:
      - /opt/homelab/config/outline/.env
    volumes:
      - postgres_data:/var/lib/postgresql/data
    networks:
      - outline_internal
    healthcheck:
      # User and database name are hard-coded here rather than read from the env file.
      test: ["CMD-SHELL", "pg_isready -U outline -d outline"]
      interval: 10s
      timeout: 5s
      retries: 5

  redis:
    image: redis:7-alpine
    container_name: outline-redis-1
    restart: unless-stopped
    volumes:
      - redis_data:/data
    networks:
      - outline_internal
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 3

# All three volumes already exist and are referenced by their explicit names.
volumes:
  outline_storage:
    external: true
    name: outline_outline_storage
  postgres_data:
    external: true
    name: outline_postgres_data
  redis_data:
    external: true
    name: outline_redis_data

networks:
  outline_internal:
    driver: bridge
    name: outline_outline_internal

View file

@ -0,0 +1,40 @@
# Outline Wiki — /opt/homelab/config/outline/.env
# Both the `outline` and `postgres` containers read this file.
# Application
URL=https://outline.example.com
NODE_ENV=production
# Same port the compose file publishes and probes in its healthcheck.
PORT=3000
FILE_STORAGE=local
# Same container path where the compose file mounts the outline_storage volume.
FILE_STORAGE_LOCAL_ROOT_DIR=/var/lib/outline/data
FORCE_HTTPS=true
# Secrets — generate with: openssl rand -hex 32
SECRET_KEY=
UTILS_SECRET=
# Database
# Replace <password> with the POSTGRES_PASSWORD value set below; user and database
# name in the URL are the same as POSTGRES_USER / POSTGRES_DB. `postgres` is the
# compose service name of the database container.
DATABASE_URL=postgres://outline:<password>@postgres:5432/outline
PGSSLMODE=disable
# Redis
# `redis` is the compose service name of the cache container.
REDIS_URL=redis://redis:6379
# Postgres sidecar vars (read by the postgres container)
# The postgres healthcheck in docker-compose.yml hard-codes user `outline` and
# database `outline`; update it too if either value below changes.
POSTGRES_USER=outline
POSTGRES_DB=outline
# Left blank in this template.
POSTGRES_PASSWORD=
# Google OAuth (optional)
GOOGLE_CLIENT_ID=
GOOGLE_CLIENT_SECRET=
# SMTP
SMTP_HOST=
SMTP_PORT=587
SMTP_USERNAME=
SMTP_PASSWORD=
SMTP_FROM_EMAIL=outline@example.com
SMTP_REPLY_EMAIL=outline@example.com
SMTP_SECURE=false
ALLOWED_DOMAINS=

View file

@ -0,0 +1,15 @@
#!/bin/bash
# Healthcheck for the Outline Wiki stack: passes only when the outline container
# is running and its HTTP /_health endpoint answers successfully.
# Exit status: 0 when healthy, 1 on the first failed check.

# Container check: look for a running container matching the expected name.
if ! docker ps --filter "name=outline-outline-1" --filter "status=running" | grep -q "outline-outline-1"; then
    echo "[FAIL] outline container is not running"
    exit 1
fi

# HTTP check: --max-time bounds the probe so a hung endpoint fails the check
# instead of blocking it forever (10s mirrors the healthcheck timeout declared
# in the service manifest).
if ! curl -sf --max-time 10 http://localhost:3000/_health > /dev/null; then
    echo "[FAIL] Outline HTTP health endpoint not responding"
    exit 1
fi

echo "[OK] Outline is healthy"
exit 0

View file

@ -0,0 +1,36 @@
# Service manifest for outline on the vps node: published port, HTTP healthcheck,
# persisted data and runtime environment.
service:
name: outline
owner_node: vps
exposure: public
dependencies:
- postgres
- redis
ports:
# No bind address is declared for this port.
- container: 3000
host: 3000
protocol: tcp
healthcheck:
type: http
endpoint: http://localhost:3000/_health
interval: 30s
timeout: 10s
retries: 3
restart_policy: unless-stopped
persistence:
paths:
# Docker named volumes — data stays at Docker volume paths
- volume:outline_outline_storage # /var/lib/outline/data inside container
- volume:outline_postgres_data # Postgres data directory
- volume:outline_redis_data # Redis persistence
runtime:
env_file: /opt/homelab/config/outline/.env
# Subset of the variables defined in the env file above.
env_vars:
- URL
- DATABASE_URL
- REDIS_URL
- SECRET_KEY
- UTILS_SECRET
- FILE_STORAGE
- POSTGRES_USER
- POSTGRES_PASSWORD
- POSTGRES_DB