From bbdbdb832158f75ae8e30db1236b578ea416ef05 Mon Sep 17 00:00:00 2001 From: Oskar Kapala Date: Mon, 11 May 2026 20:46:50 +0200 Subject: [PATCH] Add node capability model --- docs/capabilities.md | 85 ++++++++++++++ docs/deployment.md | 44 +++++-- docs/lifecycle.md | 51 ++++++++ docs/service-model.md | 75 ++++++++++++ docs/standards.md | 29 +++-- hosts/chelsty/capabilities.yaml | 40 +++++++ hosts/chelsty/networking.yaml | 5 + hosts/piha/capabilities.yaml | 39 +++++++ hosts/saturn/capabilities.yaml | 40 +++++++ hosts/solaria/capabilities.yaml | 41 +++++++ hosts/vps/capabilities.yaml | 40 +++++++ inventory/templates/how_to_add_new_node.yaml | 29 +++++ .../templates/node-bootstrap-checklist.yaml | 29 +++++ .../templates/node-discovery-commands.yaml | 18 +++ inventory/templates/prepare-node.yaml | 13 +++ inventory/templates/prompts/create-node | 13 +++ inventory/templates/prompts/deploy-node | 16 +++ inventory/templates/prompts/recover-node | 17 +++ scripts/deploy/deploy.sh | 110 ++++++++++++++++++ services/forgejo/README.md | 9 ++ services/forgejo/docker-compose.yml | 15 +++ services/forgejo/env.example | 3 + services/forgejo/healthcheck.sh | 17 +++ services/forgejo/service.yaml | 28 +++++ services/mosquitto/README.md | 9 ++ services/mosquitto/docker-compose.yml | 12 ++ services/mosquitto/env.example | 2 + services/mosquitto/healthcheck.sh | 17 +++ services/mosquitto/service.yaml | 29 +++++ services/npm/README.md | 13 +++ services/npm/env.example | 2 + services/npm/healthcheck.sh | 17 +++ services/npm/service.yaml | 31 +++++ services/ollama/README.md | 13 +++ services/ollama/docker-compose.yml | 16 +++ services/ollama/env.example | 2 + services/ollama/healthcheck.sh | 17 +++ services/ollama/service.yaml | 23 ++++ services/zigbee2mqtt/README.md | 10 ++ services/zigbee2mqtt/docker-compose.yml | 14 +++ services/zigbee2mqtt/env.example | 3 + services/zigbee2mqtt/healthcheck.sh | 17 +++ services/zigbee2mqtt/service.yaml | 25 ++++ 43 files changed, 1059 insertions(+), 19 
deletions(-) create mode 100644 docs/capabilities.md create mode 100644 docs/lifecycle.md create mode 100644 docs/service-model.md create mode 100644 hosts/chelsty/capabilities.yaml create mode 100644 hosts/chelsty/networking.yaml create mode 100644 hosts/piha/capabilities.yaml create mode 100644 hosts/saturn/capabilities.yaml create mode 100644 hosts/solaria/capabilities.yaml create mode 100644 hosts/vps/capabilities.yaml create mode 100644 inventory/templates/how_to_add_new_node.yaml create mode 100644 inventory/templates/node-bootstrap-checklist.yaml create mode 100644 inventory/templates/node-discovery-commands.yaml create mode 100644 inventory/templates/prepare-node.yaml create mode 100644 inventory/templates/prompts/create-node create mode 100644 inventory/templates/prompts/deploy-node create mode 100644 inventory/templates/prompts/recover-node create mode 100755 scripts/deploy/deploy.sh create mode 100644 services/forgejo/README.md create mode 100644 services/forgejo/docker-compose.yml create mode 100644 services/forgejo/env.example create mode 100644 services/forgejo/healthcheck.sh create mode 100644 services/forgejo/service.yaml create mode 100644 services/mosquitto/README.md create mode 100644 services/mosquitto/docker-compose.yml create mode 100644 services/mosquitto/env.example create mode 100644 services/mosquitto/healthcheck.sh create mode 100644 services/mosquitto/service.yaml create mode 100644 services/npm/README.md create mode 100644 services/npm/env.example create mode 100644 services/npm/healthcheck.sh create mode 100644 services/npm/service.yaml create mode 100644 services/ollama/README.md create mode 100644 services/ollama/docker-compose.yml create mode 100644 services/ollama/env.example create mode 100644 services/ollama/healthcheck.sh create mode 100644 services/ollama/service.yaml create mode 100644 services/zigbee2mqtt/README.md create mode 100644 services/zigbee2mqtt/docker-compose.yml create mode 100644 services/zigbee2mqtt/env.example 
create mode 100644 services/zigbee2mqtt/healthcheck.sh create mode 100644 services/zigbee2mqtt/service.yaml diff --git a/docs/capabilities.md b/docs/capabilities.md new file mode 100644 index 0000000..46fb283 --- /dev/null +++ b/docs/capabilities.md @@ -0,0 +1,85 @@ +# Node Capability Model + +This document defines the capability model for the homelab infrastructure. The goal is to provide a declarative way to describe what each node can do, its constraints, and its suitability for various workloads. + +## Overview + +Capabilities are defined per host in `hosts/<hostname>/capabilities.yaml`. This metadata allows infrastructure tooling and future AI agents to reason about workload placement, recovery, and compatibility without hardcoding logic into the orchestration system. + +## Schema Definition + +The `capabilities.yaml` file follows this structure: + +```yaml +capabilities: + hardware: + cpu: + arch: <string> # e.g., x86_64, arm64 + cores: <integer> + threads: <integer> + memory: + total_gb: <integer> + acceleration: + type: <string> # e.g., none, cuda, tpu, vaapi + model: <string> # e.g., "NVIDIA RTX 3060", "Coral Edge TPU" + + virtualization: + supported: <boolean> + type: <string> # e.g., kvm, docker-only + + storage: + persistence: <string> # ephemeral, persistent, redundant + type: <string> # ssd, hdd, nvme, sd-card + capacity_gb: <integer> + + networking: + reachability: <string> # public, tailscale-only, lan-only + ingress_suitability: <boolean> + bandwidth: <string> # e.g., "1Gbps", "100Mbps", "LTE" + + runtime: + container_engine: <string> # docker, podman, containerd + os: <string> # debian, ubuntu, alpine, nixos + + operational: + power_constraint: <string> # low-power, mains, battery-backed + connectivity: <string> # stable, intermittent + availability_target: <string> # high, medium, best-effort + + deployment: + suitability: [] # list of workload types (e.g., ai, database, edge, web) + restricted: <boolean> # if true, only specific workloads are allowed +``` + +## Placement Reasoning Examples + +### AI Workloads +A service requiring `cuda` acceleration will be matched against nodes where `capabilities.hardware.acceleration.type == "cuda"`.
+* **Target:** `solaria` + +### Public Ingress +A service requiring public exposure will look for `capabilities.networking.ingress_suitability == true`. +* **Target:** `vps` + +### Low-Power Staging +Staging workloads that should not consume significant power, or that tolerate intermittent connectivity, are matched against nodes where `capabilities.operational.power_constraint == "low-power"`. +* **Target:** `chelsty` + +## Recovery Reasoning Examples + +### Failover Strategy +If `saturn` (the primary orchestrator) fails: +1. Identify nodes with `roles: [control]` or `roles: [infra]`. +2. Among those, prefer the highest `capabilities.operational.availability_target` and skip nodes with `capabilities.deployment.restricted == true`. +3. Propose migration of critical infra services to `piha` (note: its `availability_target` is `medium`, so the operator must accept the reduced target). + +### Storage-Bound Services +If a node with `persistence: persistent` fails, the agent must check if there are other nodes with `persistence: persistent` and compatible `storage.type` before attempting recovery, or warn about potential data loss if moved to an `ephemeral` node. + +## Future Usage by AI Agents + +Future autonomous agents will use this metadata to: +1. **Evaluate Suitability:** Match service requirements (from `service.yaml`) against node capabilities. +2. **Generate Plans:** Create step-by-step deployment or migration plans based on hardware compatibility. +3. **Validate Topology:** Ensure that a proposed multi-node setup doesn't violate networking or operational constraints (e.g., don't put a DB on an intermittent node). +4. **Propose Failover:** Automatically suggest the best alternative node during an outage. diff --git a/docs/deployment.md b/docs/deployment.md index 6f28206..a509225 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -9,22 +9,44 @@ This document describes the GitOps-lite deployment process for the homelab. 3. **Lightweight**: No complex orchestrators (no Kubernetes). Use `docker compose` and simple shell scripts. 4. **Tailscale Mesh**: All hosts are connected via Tailscale, allowing secure communication without public port exposure. -## Deployment Process +## Staged Deployment Framework -### 1.
Preparation (on SATURN) +The homelab uses a staged deployment framework located at `scripts/deploy/deploy.sh`. This script is designed to be resumable, stage-aware, and observable. -- Modify or create service definitions in `services/`. -- Assign services to hosts by creating/updating `hosts//services.txt` (or similar mapping). -- Commit and push changes to the Forgejo instance. +### Deployment Stages -### 2. Deployment (on Execution Node) +1. **prepare**: Pulls the latest changes from Git, validates inventory, and prepares the local environment. +2. **deploy**: Executes `docker compose` commands for all assigned services (the current script is a skeleton that only logs this step). +3. **verify**: Checks the health and connectivity of deployed services. +4. **diagnose**: Performs deep checks and resource analysis if something goes wrong. +5. **rollback**: Reverts to a previous known-good state (the current script is a skeleton that only logs this step; no revert is performed yet). +6. **resume**: Automatically continues from the last successful stage. -Execution nodes run a deployment script (e.g., via cron or manual trigger) that: +### State Tracking and Logging -1. Performs a `git pull` from the source of truth. -2. Identifies services assigned to this host. -3. Symlinks or copies `services//docker-compose.yml` to `/opt/homelab/services/`. -4. Runs `docker compose up -d --remove-orphans`. +- **State**: Local node state is tracked in `/opt/homelab/state/deploy/current_stage`. +- **Logs**: Detailed execution logs are stored in `/opt/homelab/logs/deploy/deploy_<YYYYMMDD_HHMMSS>.log`. + +### Operational Semantics + +Deployment is **hybrid**: +- **SATURN** acts as the orchestrator and source of truth. +- **Nodes** execute the deployment locally using the `deploy.sh` script. +- Human-in-the-loop is required for triggering and confirming deployments. + +### Recovery Workflow + +If a deployment fails: +1. Run `deploy.sh diagnose` to identify the issue. +2. Use the `recover-node` AI prompt to analyze logs and get recommendations. +3. Either fix the issue and run `deploy.sh resume`, or use `deploy.sh rollback`.
+ +## Onboarding New Nodes + +Refer to `inventory/templates/how_to_add_new_node.yaml` for a detailed guide on adding new hardware to the mesh. The general flow is: +1. Define node in `hosts/<hostname>/` and `inventory/topology.yaml` on SATURN. +2. Bootstrap the node (Docker, Tailscale, Git). +3. Run the staged deployment framework starting with `prepare`. ## Host-Local Overrides diff --git a/docs/lifecycle.md b/docs/lifecycle.md new file mode 100644 index 0000000..0ef8128 --- /dev/null +++ b/docs/lifecycle.md @@ -0,0 +1,51 @@ +# Service Lifecycle and Recovery + +This document defines the lifecycle of a service in the homelab and the procedures for operational recovery. + +## Service Lifecycle + +1. **Onboarding**: + - Create `services/<service-name>/` directory. + - Define `docker-compose.yml`, `service.yaml`, `README.md`, `env.example`, and `healthcheck.sh`. + - Register service in `inventory/topology.yaml` or relevant host configs. +2. **Provisioning**: + - Ensure `/opt/homelab/data/<service-name>/` exists. + - Ensure `/opt/homelab/config/<service-name>/` exists and contains required secrets/configs. + - Set up environment variables from `env.example` into `/opt/homelab/config/<service-name>/.env`. +3. **Deployment**: + - `docker compose pull` + - `docker compose up -d` +4. **Verification**: + - Run `healthcheck.sh`. + - Verify ports are reachable according to `service.yaml`. +5. **Maintenance**: + - Periodic updates via `docker compose pull`. + - Log monitoring via `docker compose logs -f`. +6. **Decommissioning**: + - `docker compose down`. + - Archive `/opt/homelab/data/<service-name>/` if necessary. + +## Operational Recovery + +### 1. Container Failure +If a service is unhealthy: +- Check `docker compose logs`. +- Restart: `docker compose restart`. +- Recreate: `docker compose up -d --force-recreate`. + +### 2. Node Failure +If a host node fails: +- Services with `owner_node` matching the failed node must be recovered on a backup node or the node must be restored. +- Persistent data must be restored from backups to `/opt/homelab/data/`.
+ +### 3. Dependency Recovery +If a dependency fails: +- Services depending on it might report unhealthy status. +- Recover the dependency first. +- Re-verify dependent services. + +## Persistent Data Conventions + +- **Data**: `/opt/homelab/data/` - Primary persistent state. +- **Config**: `/opt/homelab/config/` - Local overrides and secrets. +- **Backups**: Standard backup routines should target `/opt/homelab/data/`. diff --git a/docs/service-model.md b/docs/service-model.md new file mode 100644 index 0000000..ec3d4ad --- /dev/null +++ b/docs/service-model.md @@ -0,0 +1,75 @@ +# Service Model and Healthchecks + +This document defines the normalized service model for the homelab. + +## Service Layout + +Each service must reside in its own directory under `services/`: + +```text +services/<service-name>/ +├── docker-compose.yml # Docker Compose definition +├── service.yaml # Service metadata and orchestration contract +├── README.md # Service documentation +├── env.example # Template for required environment variables +└── healthcheck.sh # Standardized healthcheck script +``` + +## Service Metadata (`service.yaml`) + +The `service.yaml` file provides a machine-readable contract for deployment and orchestration. + +### Schema + +```yaml +service: + name: <string> # Canonical service name (kebab-case) + owner_node: <string> # Preferred host node + exposure: <string> # public, private, or local-only + dependencies: [] # List of required services + ports: + - container: <integer> + host: <integer> + protocol: <string> + healthcheck: + type: <string> # local-only, container, http, mqtt + endpoint: <string> # URL or topic if applicable + interval: <duration> + timeout: <duration> + retries: <integer> + restart_policy: <string> # unless-stopped, always, etc. + persistence: + paths: + - /opt/homelab/data/<service-name>/... + runtime: + directories: [] # Required host directories to be created + env_vars: [] # List of required environment variables (keys only) +``` + +## Healthcheck Semantics + +The `healthcheck.sh` script should return `0` for healthy and `1` for unhealthy.
It should support different modes based on `service.yaml` definitions. + +### 1. Local-only +Checks if the container is running and the process is alive within the host. + +### 2. Container-level +Uses `docker inspect` or `docker exec` to check internal container health. + +### 3. HTTP +Performs a `curl` against a specific endpoint (e.g., `/health` or `/`). + +### 4. MQTT +Verifies that a specific topic is being updated or responds to a ping. + +### 5. Dependency-aware +The healthcheck script may optionally check if its dependencies are healthy before reporting its own status. + +## Runtime Authority + +`/opt/homelab/config/` is the source of truth for: +- Secrets (not in Git) +- Host-local overrides +- Mutable configuration + +Services should mount files from this directory as needed. diff --git a/docs/standards.md b/docs/standards.md index fd10174..5cd8304 100644 --- a/docs/standards.md +++ b/docs/standards.md @@ -19,11 +19,14 @@ This document defines the standards and conventions for the homelab GitOps-lite / ├── docs/ # Infrastructure documentation ├── hosts/ # Host-specific configurations -│ ├── saturn/ -│ ├── solaria/ -│ ├── piha/ -│ └── vps/ -├── services/ # Reusable service definitions (Docker Compose) +├── inventory/ # Topology and templates +├── services/ # Normalized service definitions +│ └── / +│ ├── docker-compose.yml +│ ├── service.yaml +│ ├── README.md +│ ├── env.example +│ └── healthcheck.sh ├── scripts/ # Management and deployment scripts └── README.md ``` @@ -37,18 +40,28 @@ Runtime state must live outside the repository to keep it immutable and clean. ├── services/ # Active docker-compose files (deployed from git) ├── data/ # Persistent volume data (backed up) ├── config/ # Host-local overrides and secrets (not in git) +│ └── / +│ ├── .env # Merged environment variables +│ └── overrides/ # Local configuration overrides └── logs/ # Service logs ``` +## Service Standards + +1. **Normalization**: Every service MUST follow the `services//` layout. 
+2. **Metadata**: Every service MUST have a `service.yaml` defining its operational contract. +3. **Healthchecks**: Every service MUST have a `healthcheck.sh` for verification. +4. **Secrets**: NEVER commit secrets to Git. Use `env.example` as a template and populate `/opt/homelab/config//.env` on the host. + ## Docker Compose Standards 1. **File Naming**: Use `docker-compose.yml`. -2. **Container Naming**: `service-name`. -3. **Restarts**: Always use `restart: unless-stopped`. +2. **Container Naming**: Match the service name. +3. **Restarts**: Always use `restart: unless-stopped` unless specified otherwise in `service.yaml`. 4. **Networking**: - Use `tailscale` internal mesh for inter-host communication. - Expose ports only when necessary. -5. **Volumes**: Use named volumes or absolute paths to `/opt/homelab/data/service-name`. +5. **Volumes**: Use absolute paths to `/opt/homelab/data/`. ## Environment Variables diff --git a/hosts/chelsty/capabilities.yaml b/hosts/chelsty/capabilities.yaml new file mode 100644 index 0000000..1821f13 --- /dev/null +++ b/hosts/chelsty/capabilities.yaml @@ -0,0 +1,40 @@ +capabilities: + hardware: + cpu: + arch: x86_64 + cores: 4 + threads: 4 + memory: + total_gb: 16 + acceleration: + type: none + + virtualization: + supported: true + type: kvm + + storage: + persistence: persistent + type: ssd + capacity_gb: 250 + + networking: + reachability: tailscale-only + ingress_suitability: false + bandwidth: LTE + + runtime: + container_engine: docker + os: debian + + operational: + power_constraint: low-power + connectivity: intermittent + availability_target: best-effort + + deployment: + suitability: + - staging + - homeassistant + - edge + restricted: false diff --git a/hosts/chelsty/networking.yaml b/hosts/chelsty/networking.yaml new file mode 100644 index 0000000..a60557f --- /dev/null +++ b/hosts/chelsty/networking.yaml @@ -0,0 +1,5 @@ +zigbee: + coordinator: + type: slzb-06u + transport: ethernet + ip: 192.168.x.x \ No newline at end 
of file diff --git a/hosts/piha/capabilities.yaml b/hosts/piha/capabilities.yaml new file mode 100644 index 0000000..92b2d1f --- /dev/null +++ b/hosts/piha/capabilities.yaml @@ -0,0 +1,39 @@ +capabilities: + hardware: + cpu: + arch: arm64 + cores: 4 + threads: 4 + memory: + total_gb: 4 + acceleration: + type: none + + virtualization: + supported: false + type: docker-only + + storage: + persistence: persistent + type: sd-card + capacity_gb: 32 + + networking: + reachability: tailscale-only + ingress_suitability: false + bandwidth: 1Gbps + + runtime: + container_engine: docker + os: debian + + operational: + power_constraint: mains + connectivity: stable + availability_target: medium + + deployment: + suitability: + - infra + - monitoring + restricted: false diff --git a/hosts/saturn/capabilities.yaml b/hosts/saturn/capabilities.yaml new file mode 100644 index 0000000..f410d8a --- /dev/null +++ b/hosts/saturn/capabilities.yaml @@ -0,0 +1,40 @@ +capabilities: + hardware: + cpu: + arch: arm64 + cores: 8 + threads: 8 + memory: + total_gb: 8 + acceleration: + type: none + + virtualization: + supported: false + type: docker-only + + storage: + persistence: persistent + type: sd-card + capacity_gb: 64 + + networking: + reachability: tailscale-only + ingress_suitability: false + bandwidth: 1Gbps + + runtime: + container_engine: docker + os: debian + + operational: + power_constraint: mains + connectivity: stable + availability_target: high + + deployment: + suitability: + - control + - development + - infra + restricted: false diff --git a/hosts/solaria/capabilities.yaml b/hosts/solaria/capabilities.yaml new file mode 100644 index 0000000..b5c73fe --- /dev/null +++ b/hosts/solaria/capabilities.yaml @@ -0,0 +1,41 @@ +capabilities: + hardware: + cpu: + arch: x86_64 + cores: 12 + threads: 24 + memory: + total_gb: 64 + acceleration: + type: cuda + model: "NVIDIA RTX 4070" + + virtualization: + supported: true + type: kvm + + storage: + persistence: redundant + type: nvme + 
capacity_gb: 2000 + + networking: + reachability: tailscale-only + ingress_suitability: false + bandwidth: 1Gbps + + runtime: + container_engine: docker + os: ubuntu + + operational: + power_constraint: mains + connectivity: stable + availability_target: medium + + deployment: + suitability: + - ai + - compute + - database + restricted: false diff --git a/hosts/vps/capabilities.yaml b/hosts/vps/capabilities.yaml new file mode 100644 index 0000000..79178b0 --- /dev/null +++ b/hosts/vps/capabilities.yaml @@ -0,0 +1,40 @@ +capabilities: + hardware: + cpu: + arch: x86_64 + cores: 2 + threads: 2 + memory: + total_gb: 4 + acceleration: + type: none + + virtualization: + supported: false + type: docker-only + + storage: + persistence: persistent + type: ssd + capacity_gb: 80 + + networking: + reachability: public + ingress_suitability: true + bandwidth: 1Gbps + + runtime: + container_engine: docker + os: debian + + operational: + power_constraint: mains + connectivity: stable + availability_target: high + + deployment: + suitability: + - edge + - ingress + - web + restricted: true diff --git a/inventory/templates/how_to_add_new_node.yaml b/inventory/templates/how_to_add_new_node.yaml new file mode 100644 index 0000000..e62524f --- /dev/null +++ b/inventory/templates/how_to_add_new_node.yaml @@ -0,0 +1,29 @@ +--- +title: How to Add a New Node to the Homelab +description: This guide outlines the process for onboarding a new execution node into the GitOps-lite environment. + +phases: + - phase: 1. Preparation (on SATURN) + steps: + - "Define Node Inventory: Create hosts// directory" + - "Add host.yaml with hardware metadata" + - "Add networking.yaml with IP and Tailscale info" + - "Add capabilities.yaml with node capability description" + - "Add services.txt listing assigned services" + - "Update inventory/topology.yaml" + - "Commit and push changes to Forgejo" + + - phase: 2. 
Bootstrapping (on the New Node) + steps: + - "Install OS (Debian/Ubuntu recommended)" + - "Configure SSH and user access" + - "Install Docker, Docker Compose, Tailscale, Git" + - "Join the tailnet" + - "Clone repository: git clone <forgejo-url>/homelab-codex.git ~/homelab-codex-ws" + - "Setup runtime: sudo mkdir -p /opt/homelab/{services,data,config,state,logs} && sudo chown -R $USER:$USER /opt/homelab" + + - phase: 3. Initial Deployment + steps: + - "Run prepare: ~/homelab-codex-ws/scripts/deploy/deploy.sh prepare" + - "Run deploy: ~/homelab-codex-ws/scripts/deploy/deploy.sh deploy" + - "Run verify: ~/homelab-codex-ws/scripts/deploy/deploy.sh verify" diff --git a/inventory/templates/node-bootstrap-checklist.yaml b/inventory/templates/node-bootstrap-checklist.yaml new file mode 100644 index 0000000..7a32598 --- /dev/null +++ b/inventory/templates/node-bootstrap-checklist.yaml @@ -0,0 +1,29 @@ +--- +bootstrap_checklist: + pre_flight: + - task: "Hardware connected and powered" + done: false + - task: "Base OS installed (Debian/Ubuntu)" + done: false + - task: "Network connectivity established" + done: false + - task: "SSH access configured" + done: false + onboarding: + - task: "Tailscale installed and authenticated" + done: false + - task: "Docker and Compose V2 installed" + done: false + - task: "Git installed" + done: false + - task: "Repository cloned to ~/homelab-codex-ws" + done: false + - task: "/opt/homelab runtime directory structure created" + done: false + initial_run: + - task: "deploy.sh prepare successful" + done: false + - task: "deploy.sh deploy successful" + done: false + - task: "deploy.sh verify successful" + done: false diff --git a/inventory/templates/node-discovery-commands.yaml b/inventory/templates/node-discovery-commands.yaml new file mode 100644 index 0000000..e38cc57 --- /dev/null +++ b/inventory/templates/node-discovery-commands.yaml @@ -0,0 +1,18 @@ +--- +discovery_commands: + cpu: + - "lscpu" + - "cat /proc/cpuinfo" + memory: + - "free -h" + storage: + - "lsblk" + - "df -h"
+ network: + - "ip addr" + - "tailscale status" + gpu: + - "nvidia-smi" + - "lspci | grep -i vga" + usb: + - "lsusb" diff --git a/inventory/templates/prepare-node.yaml b/inventory/templates/prepare-node.yaml new file mode 100644 index 0000000..a31bf17 --- /dev/null +++ b/inventory/templates/prepare-node.yaml @@ -0,0 +1,13 @@ +--- +node_preparation: + actions: + - name: update_system + command: "sudo apt update && sudo apt upgrade -y" + - name: install_dependencies + command: "sudo apt install -y curl git docker.io docker-compose-v2 tailscale" + - name: configure_docker_permissions + command: "sudo usermod -aG docker $USER" + - name: create_runtime_directories + command: "sudo mkdir -p /opt/homelab/{services,data,config,state,logs} && sudo chown -R $USER:$USER /opt/homelab" + - name: initialize_repo + command: "git clone <forgejo-url>/homelab-codex.git ~/homelab-codex-ws" diff --git a/inventory/templates/prompts/create-node b/inventory/templates/prompts/create-node new file mode 100644 index 0000000..65ecf85 --- /dev/null +++ b/inventory/templates/prompts/create-node @@ -0,0 +1,13 @@ +### System Prompt Addendum: Create Node + +**Context**: You are assisting in adding a new node to the homelab. +**Task**: Generate the necessary inventory files for a new node. + +**Requirements**: +1. Ask for: hostname, IP address, Tailscale IP, hardware specs (CPU/RAM/Storage), and intended role/services. +2. Generate `hosts/<hostname>/host.yaml`, `hosts/<hostname>/networking.yaml`, and `hosts/<hostname>/capabilities.yaml`. +3. Provide a snippet for `inventory/topology.yaml`. +4. Recommend services based on hardware (e.g., if GPU is present, suggest inference services). + +**Output Format**: YAML blocks for each file. +**Restriction**: Do NOT execute any shell commands. Only provide the configuration.
diff --git a/inventory/templates/prompts/deploy-node b/inventory/templates/prompts/deploy-node new file mode 100644 index 0000000..8d10695 --- /dev/null +++ b/inventory/templates/prompts/deploy-node @@ -0,0 +1,16 @@ +### System Prompt Addendum: Deploy Node + +**Context**: Orchestrating a deployment across one or more nodes. +**Task**: Generate the deployment plan and verification checklist. + +**Requirements**: +1. Identify which nodes need updates based on git changes. +2. Recommend the sequence of stages (e.g., `prepare` on all, then `deploy` on edge nodes first). +3. Generate a human-readable checklist for the operator. +4. Define verification criteria for the `verify` stage. + +**Output Format**: +- Deployment Plan (sequence of commands). +- Verification Checklist. + +**Restriction**: Do NOT mutate infrastructure autonomously. diff --git a/inventory/templates/prompts/recover-node b/inventory/templates/prompts/recover-node new file mode 100644 index 0000000..5ca63eb --- /dev/null +++ b/inventory/templates/prompts/recover-node @@ -0,0 +1,17 @@ +### System Prompt Addendum: Recover Node + +**Context**: A homelab node is unresponsive or has suffered data loss. +**Task**: Analyze logs and state to recommend recovery steps. + +**Requirements**: +1. Request the content of `/opt/homelab/logs/deploy/` (latest log) and `/opt/homelab/state/deploy/current_stage`. +2. Analyze the last failed stage. +3. Recommend specific `deploy.sh` commands (e.g., `rollback` or `resume`). +4. Provide manual recovery steps if automated stages fail. + +**Output Format**: +- Analysis of the failure. +- Recommended action. +- Documentation of the recovery process. + +**Restriction**: Do NOT auto-execute deployment. diff --git a/scripts/deploy/deploy.sh b/scripts/deploy/deploy.sh new file mode 100755 index 0000000..da462ef --- /dev/null +++ b/scripts/deploy/deploy.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +# deploy.sh - Staged deployment framework for homelab nodes. 
+# Usage: ./deploy.sh [prepare|deploy|verify|diagnose|rollback|resume] (default: resume) + +set -e + +# --- Configuration --- +RUNTIME_PATH="/opt/homelab" +STATE_DIR="${RUNTIME_PATH}/state/deploy" +LOG_DIR="${RUNTIME_PATH}/logs/deploy" +REPO_PATH="${HOME}/homelab-codex-ws" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +LOG_FILE="${LOG_DIR}/deploy_${TIMESTAMP}.log" + +# --- Initialization --- +mkdir -p "$STATE_DIR" "$LOG_DIR" + +# Mirror stdout and stderr to the terminal and append them to this run's log file +exec > >(tee -a "$LOG_FILE") 2>&1 + +log() { + echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1" +} + +set_state() { + echo "$1" > "${STATE_DIR}/current_stage" + log "State set to: $1" +} + +get_state() { + if [ -f "${STATE_DIR}/current_stage" ]; then + cat "${STATE_DIR}/current_stage" + else + echo "none" + fi +} + +# --- Stages (a stage records itself in current_stage only after its body succeeded) --- + +stage_prepare() { + log "Stage: PREPARE" + # Skeleton: Pull latest changes, check dependencies, validate inventory. + log "Checking repository at $REPO_PATH..." + git -C "$REPO_PATH" pull + log "Preparation complete." + set_state "prepare" +} + +stage_deploy() { + log "Stage: DEPLOY" + # Skeleton: Iterate through services and run docker compose + log "Deploying services defined for $(hostname)..." + # Implementation detail: loop through services/ and run compose + log "Deployment complete." + set_state "deploy" +} + +stage_verify() { + log "Stage: VERIFY" + # Skeleton: Check container status, healthchecks, connectivity + log "Verifying service health..." + docker ps + log "Verification complete." + set_state "verify" +} + +stage_diagnose() { + log "Stage: DIAGNOSE" + # Skeleton: Check logs, resource usage, networking + log "Running diagnostics..." + docker stats --no-stream + log "Diagnostics complete." +} + +stage_rollback() { + log "Stage: ROLLBACK" + # Skeleton: Revert to previous git commit or previous state + log "Rolling back changes..." + log "Rollback complete."
+} + +stage_resume() { + log "Stage: RESUME" + CURRENT=$(get_state) + log "Resuming from state: $CURRENT" + case "$CURRENT" in + "prepare") stage_deploy ;; + "deploy") stage_verify ;; + "verify") log "Last deployment was verified. Nothing to resume." ;; + *) log "Unknown state or nothing to resume. Starting from prepare..."; stage_prepare ;; + esac +} + +# --- Main --- + +COMMAND=${1:-resume} + +log "--- Homelab Deployment Started (Command: $COMMAND) ---" + +case "$COMMAND" in + prepare) stage_prepare ;; + deploy) stage_deploy ;; + verify) stage_verify ;; + diagnose) stage_diagnose ;; + rollback) stage_rollback ;; + resume) stage_resume ;; + *) echo "Usage: $0 {prepare|deploy|verify|diagnose|rollback|resume}"; exit 1 ;; +esac + +log "--- Homelab Deployment Finished ---" diff --git a/services/forgejo/README.md b/services/forgejo/README.md new file mode 100644 index 0000000..f2a46f0 --- /dev/null +++ b/services/forgejo/README.md @@ -0,0 +1,9 @@ +# Forgejo + +Forgejo is a self-hosted lightweight software forge. Easy to install and low maintenance. + +## Usage +Deployed on the `saturn` node as the git source of truth. + +Web UI is available on port 3000. +SSH for git is available on port 222. 
diff --git a/services/forgejo/docker-compose.yml b/services/forgejo/docker-compose.yml new file mode 100644 index 0000000..a0ba029 --- /dev/null +++ b/services/forgejo/docker-compose.yml @@ -0,0 +1,15 @@ +services: + forgejo: + image: codeberg.org/forgejo/forgejo:latest + container_name: forgejo + restart: unless-stopped + environment: + - USER_UID=1000 + - USER_GID=1000 + volumes: + - /opt/homelab/data/forgejo/data:/data + - /etc/timezone:/etc/timezone:ro + - /etc/localtime:/etc/localtime:ro + ports: + - '3000:3000' + - '222:22' diff --git a/services/forgejo/env.example b/services/forgejo/env.example new file mode 100644 index 0000000..06e75c7 --- /dev/null +++ b/services/forgejo/env.example @@ -0,0 +1,3 @@ +USER_UID=1000 +USER_GID=1000 +# FORGEJO__database__DB_TYPE=sqlite3 diff --git a/services/forgejo/healthcheck.sh b/services/forgejo/healthcheck.sh new file mode 100644 index 0000000..eee1359 --- /dev/null +++ b/services/forgejo/healthcheck.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Healthcheck for Forgejo + +# Check if the container is running +if ! docker ps --filter "name=forgejo" --filter "status=running" | grep -q "forgejo"; then + echo "[FAIL] Forgejo container is not running" + exit 1 +fi + +# Check API health endpoint +if ! 
curl -sf http://localhost:3000/api/healthz > /dev/null; then + echo "[FAIL] Forgejo API is not responding" + exit 1 +fi + +echo "[OK] Forgejo is healthy" +exit 0 diff --git a/services/forgejo/service.yaml b/services/forgejo/service.yaml new file mode 100644 index 0000000..d79ceaf --- /dev/null +++ b/services/forgejo/service.yaml @@ -0,0 +1,28 @@ +service: + name: forgejo + owner_node: saturn + exposure: private + dependencies: [] + ports: + - container: 3000 + host: 3000 + protocol: tcp + - container: 22 + host: 222 + protocol: tcp + healthcheck: + type: http + endpoint: http://localhost:3000/api/healthz + interval: 1m + timeout: 10s + retries: 5 + restart_policy: unless-stopped + persistence: + paths: + - /opt/homelab/data/forgejo/data + runtime: + directories: + - /opt/homelab/data/forgejo/data + env_vars: + - USER_UID + - USER_GID diff --git a/services/mosquitto/README.md b/services/mosquitto/README.md new file mode 100644 index 0000000..d249893 --- /dev/null +++ b/services/mosquitto/README.md @@ -0,0 +1,9 @@ +# Mosquitto MQTT Broker + +Eclipse Mosquitto is an open source (EPL/EDL licensed) message broker that implements the MQTT protocol versions 5.0, 3.1.1 and 3.1. + +## Usage +Deployed on the `piha` node. + +Port 1883 for standard MQTT. +Port 9001 for WebSockets. 
diff --git a/services/mosquitto/docker-compose.yml b/services/mosquitto/docker-compose.yml
new file mode 100644
index 0000000..bc1992d
--- /dev/null
+++ b/services/mosquitto/docker-compose.yml
@@ -0,0 +1,12 @@
+services:
+  mosquitto:
+    image: eclipse-mosquitto:latest
+    container_name: mosquitto
+    restart: unless-stopped
+    ports:
+      - '1883:1883'
+      - '9001:9001'
+    volumes:
+      - /opt/homelab/data/mosquitto/config:/mosquitto/config
+      - /opt/homelab/data/mosquitto/data:/mosquitto/data
+      - /opt/homelab/data/mosquitto/log:/mosquitto/log
diff --git a/services/mosquitto/env.example b/services/mosquitto/env.example
new file mode 100644
index 0000000..de517be
--- /dev/null
+++ b/services/mosquitto/env.example
@@ -0,0 +1,2 @@
+# No specific environment variables required by default.
+# Mosquitto is mainly configured via /opt/homelab/data/mosquitto/config/mosquitto.conf
diff --git a/services/mosquitto/healthcheck.sh b/services/mosquitto/healthcheck.sh
new file mode 100644
index 0000000..1ea89cb
--- /dev/null
+++ b/services/mosquitto/healthcheck.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Healthcheck for Mosquitto: exits 0 when healthy, 1 after printing a [FAIL] line.
+
+# Container must be running; match the exact name (docker's name filter is a substring match)
+if ! docker ps --filter 'name=^mosquitto$' --filter "status=running" --format '{{.Names}}' | grep -qx "mosquitto"; then
+  echo "[FAIL] Mosquitto container is not running"
+  exit 1
+fi
+
+# Basic port check for 1883 (bash /dev/tcp redirection; only proves the listener accepts connections)
+if ! (echo > /dev/tcp/localhost/1883) >/dev/null 2>&1; then
+  echo "[FAIL] Mosquitto port 1883 is not reachable"
+  exit 1
+fi
+
+echo "[OK] Mosquitto is healthy"
+exit 0
diff --git a/services/mosquitto/service.yaml b/services/mosquitto/service.yaml
new file mode 100644
index 0000000..b97ade4
--- /dev/null
+++ b/services/mosquitto/service.yaml
@@ -0,0 +1,29 @@
+service:
+  name: mosquitto
+  owner_node: piha
+  exposure: private
+  dependencies: []
+  ports:
+    - container: 1883
+      host: 1883
+      protocol: tcp
+    - container: 9001
+      host: 9001
+      protocol: tcp
+  healthcheck:
+    type: container
+    interval: 30s
+    timeout: 10s
+    retries: 3
+  restart_policy: unless-stopped
+  persistence:
+    paths:
+      - /opt/homelab/data/mosquitto/config
+      - /opt/homelab/data/mosquitto/data
+      - /opt/homelab/data/mosquitto/log
+  runtime:
+    directories:
+      - /opt/homelab/data/mosquitto/config
+      - /opt/homelab/data/mosquitto/data
+      - /opt/homelab/data/mosquitto/log
+    env_vars: []
diff --git a/services/npm/README.md b/services/npm/README.md
new file mode 100644
index 0000000..0c35cc9
--- /dev/null
+++ b/services/npm/README.md
@@ -0,0 +1,13 @@
+# Nginx Proxy Manager (NPM)
+
+Expose your services easily and securely with Nginx Proxy Manager.
+
+## Features
+- Secure HTTPS via Let's Encrypt
+- Easy to use Web UI
+- Advanced configuration for power users
+
+## Usage
+Deployed on the `vps` node for public ingress.
+
+Web UI is available on port 81.
diff --git a/services/npm/env.example b/services/npm/env.example
new file mode 100644
index 0000000..dc4cf2d
--- /dev/null
+++ b/services/npm/env.example
@@ -0,0 +1,2 @@
+# No environment variables required for standard NPM deployment.
+# Local overrides can be placed in /opt/homelab/config/npm/.env
diff --git a/services/npm/healthcheck.sh b/services/npm/healthcheck.sh
new file mode 100644
index 0000000..b08f6a9
--- /dev/null
+++ b/services/npm/healthcheck.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Healthcheck for Nginx Proxy Manager: exits 0 when healthy, 1 after printing a [FAIL] line.
+
+# Check if a running container has "npm" in its name (substring match; exact name is not defined in this change)
+if ! docker ps --filter "name=npm" --filter "status=running" | grep -q "npm"; then
+  echo "[FAIL] NPM container is not running"
+  exit 1
+fi
+
+# Check Web UI responsiveness (port 81); --max-time mirrors the 10s timeout declared in service.yaml
+if ! curl -sf --max-time 10 http://localhost:81 > /dev/null; then
+  echo "[FAIL] NPM Web UI is not responding"
+  exit 1
+fi
+
+echo "[OK] NPM is healthy"
+exit 0
diff --git a/services/npm/service.yaml b/services/npm/service.yaml
new file mode 100644
index 0000000..b8e71f8
--- /dev/null
+++ b/services/npm/service.yaml
@@ -0,0 +1,31 @@
+service:
+  name: npm
+  owner_node: vps
+  exposure: public
+  dependencies: []
+  ports:
+    - container: 80
+      host: 80
+      protocol: tcp
+    - container: 81
+      host: 81
+      protocol: tcp
+    - container: 443
+      host: 443
+      protocol: tcp
+  healthcheck:
+    type: http
+    endpoint: http://localhost:81
+    interval: 30s
+    timeout: 10s
+    retries: 3
+  restart_policy: unless-stopped
+  persistence:
+    paths:
+      - /opt/homelab/data/npm/data
+      - /opt/homelab/data/npm/letsencrypt
+  runtime:
+    directories:
+      - /opt/homelab/data/npm/data
+      - /opt/homelab/data/npm/letsencrypt
+    env_vars: []
diff --git a/services/ollama/README.md b/services/ollama/README.md
new file mode 100644
index 0000000..0ccf627
--- /dev/null
+++ b/services/ollama/README.md
@@ -0,0 +1,13 @@
+# Ollama
+
+Get up and running with large language models locally.
+
+## Usage
+Deployed on the `solaria` node for GPU acceleration.
+
+API is available on port 11434.
+
+Example check:
+```bash
+curl http://localhost:11434/api/tags
+```
diff --git a/services/ollama/docker-compose.yml b/services/ollama/docker-compose.yml
new file mode 100644
index 0000000..2bf73d9
--- /dev/null
+++ b/services/ollama/docker-compose.yml
@@ -0,0 +1,16 @@
+services:
+  ollama:
+    image: ollama/ollama:latest
+    container_name: ollama
+    restart: unless-stopped
+    ports:
+      - '11434:11434'
+    volumes:
+      - /opt/homelab/data/ollama:/root/.ollama
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
diff --git a/services/ollama/env.example b/services/ollama/env.example
new file mode 100644
index 0000000..29f3edf
--- /dev/null
+++ b/services/ollama/env.example
@@ -0,0 +1,2 @@
+# No specific environment variables required by default.
+# CUDA_VISIBLE_DEVICES=0
diff --git a/services/ollama/healthcheck.sh b/services/ollama/healthcheck.sh
new file mode 100644
index 0000000..5ac9a31
--- /dev/null
+++ b/services/ollama/healthcheck.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Healthcheck for Ollama: exits 0 when healthy, 1 after printing a [FAIL] line.
+
+# Container must be running; match the exact name (docker's name filter is a substring match)
+if ! docker ps --filter 'name=^ollama$' --filter "status=running" --format '{{.Names}}' | grep -qx "ollama"; then
+  echo "[FAIL] Ollama container is not running"
+  exit 1
+fi
+
+# Check API responsiveness; --max-time mirrors the 10s timeout declared in service.yaml
+if ! curl -sf --max-time 10 http://localhost:11434/api/tags > /dev/null; then
+  echo "[FAIL] Ollama API is not responding"
+  exit 1
+fi
+
+echo "[OK] Ollama is healthy"
+exit 0
diff --git a/services/ollama/service.yaml b/services/ollama/service.yaml
new file mode 100644
index 0000000..58037a4
--- /dev/null
+++ b/services/ollama/service.yaml
@@ -0,0 +1,23 @@
+service:
+  name: ollama
+  owner_node: solaria
+  exposure: private
+  dependencies: []
+  ports:
+    - container: 11434
+      host: 11434
+      protocol: tcp
+  healthcheck:
+    type: http
+    endpoint: http://localhost:11434/api/tags
+    interval: 1m
+    timeout: 10s
+    retries: 3
+  restart_policy: unless-stopped
+  persistence:
+    paths:
+      - /opt/homelab/data/ollama
+  runtime:
+    directories:
+      - /opt/homelab/data/ollama
+    env_vars: []
diff --git a/services/zigbee2mqtt/README.md b/services/zigbee2mqtt/README.md
new file mode 100644
index 0000000..2dc017d
--- /dev/null
+++ b/services/zigbee2mqtt/README.md
@@ -0,0 +1,10 @@
+# Zigbee2MQTT
+
+Zigbee to MQTT bridge, get rid of your proprietary Zigbee bridges.
+
+## Usage
+Deployed on the `piha` node.
+
+Requires a Zigbee adapter (e.g., Sonoff ZBDongle-E) mapped to `/dev/ttyACM0`.
+
+Frontend is available on port 8080.
diff --git a/services/zigbee2mqtt/docker-compose.yml b/services/zigbee2mqtt/docker-compose.yml
new file mode 100644
index 0000000..9046958
--- /dev/null
+++ b/services/zigbee2mqtt/docker-compose.yml
@@ -0,0 +1,14 @@
+services:
+  zigbee2mqtt:
+    container_name: zigbee2mqtt
+    image: koenkk/zigbee2mqtt:latest
+    restart: unless-stopped
+    volumes:
+      - /opt/homelab/data/zigbee2mqtt/data:/app/data
+      - /run/udev:/run/udev:ro
+    ports:
+      - '8080:8080'  # quoted like the other compose files so it is always read as a string
+    devices:
+      - /dev/ttyACM0:/dev/ttyACM0
+    environment:
+      - TZ=${TZ:-Europe/Stockholm}  # overridable via the environment; see env.example
diff --git a/services/zigbee2mqtt/env.example b/services/zigbee2mqtt/env.example
new file mode 100644
index 0000000..1ec2cb3
--- /dev/null
+++ b/services/zigbee2mqtt/env.example
@@ -0,0 +1,3 @@
+TZ=Europe/Stockholm
+# MQTT broker override, if applicable (the hostname must be resolvable from inside the container)
+# ZIGBEE2MQTT_CONFIG_MQTT_SERVER=mqtt://mosquitto:1883
diff --git a/services/zigbee2mqtt/healthcheck.sh b/services/zigbee2mqtt/healthcheck.sh
new file mode 100644
index 0000000..c2f3979
--- /dev/null
+++ b/services/zigbee2mqtt/healthcheck.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Healthcheck for Zigbee2MQTT: exits 0 when healthy, 1 after printing a [FAIL] line.
+
+# Container must be running; match the exact name (docker's name filter is a substring match)
+if ! docker ps --filter 'name=^zigbee2mqtt$' --filter "status=running" --format '{{.Names}}' | grep -qx "zigbee2mqtt"; then
+  echo "[FAIL] Zigbee2MQTT container is not running"
+  exit 1
+fi
+
+# Check frontend responsiveness; --max-time mirrors the 10s timeout declared in service.yaml
+if ! curl -sf --max-time 10 http://localhost:8080 > /dev/null; then
+  echo "[FAIL] Zigbee2MQTT frontend is not responding"
+  exit 1
+fi
+
+echo "[OK] Zigbee2MQTT is healthy"
+exit 0
diff --git a/services/zigbee2mqtt/service.yaml b/services/zigbee2mqtt/service.yaml
new file mode 100644
index 0000000..7f24ea6
--- /dev/null
+++ b/services/zigbee2mqtt/service.yaml
@@ -0,0 +1,25 @@
+service:
+  name: zigbee2mqtt
+  owner_node: piha
+  exposure: private
+  dependencies:
+    - mosquitto
+  ports:
+    - container: 8080
+      host: 8080
+      protocol: tcp
+  healthcheck:
+    type: http
+    endpoint: http://localhost:8080
+    interval: 30s
+    timeout: 10s
+    retries: 3
+  restart_policy: unless-stopped
+  persistence:
+    paths:
+      - /opt/homelab/data/zigbee2mqtt/data
+  runtime:
+    directories:
+      - /opt/homelab/data/zigbee2mqtt/data
+    env_vars:
+      - TZ