From 62ca125a684ea745473a4e036bead7d3880c26a5 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Wed, 20 May 2026 07:47:08 +0300 Subject: [PATCH] chore: keep models.example.toml generic; deploy.sh syncs local models.toml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts the previous commit's naming of specific helexa neuron hosts in the shipped example catalogue (`models.example.toml`) — the example is supposed to be a generic starting point that any operator copies and adapts, not a record of one particular fleet's layout. - `pinned_on` in the tensor-parallel (TP) example uses the placeholder `"your-multi-gpu-neuron"`. Other entries keep the model ids (since those are HuggingFace-canonical, not fleet-specific). - New `models.toml` at repo root holds the helexa-fleet catalogue (beast / benjy / quadbrat). Added to `.gitignore` alongside `cortex.toml` — both are operator-owned, gitignored, RPM-marked `%config(noreplace)`, and synced by `deploy.sh`. - `deploy.sh` now rsyncs `models.toml` to `/etc/cortex/models.toml` on the gateway host on the same lifecycle as `cortex.toml`. Skips cleanly when no local file exists, so users without a catalogue aren't surprised by silent overwrites. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + models.example.toml | 13 ++++++------- script/deploy.sh | 19 +++++++++++++++++++ 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index f1ed483..90d4ed2 100644 --- a/.gitignore +++ b/.gitignore @@ -4,5 +4,6 @@ .idea/ .vscode/ cortex.toml +models.toml doc/plan/* /target-cuda/ diff --git a/models.example.toml b/models.example.toml index cd9e3d5..5f73c1f 100644 --- a/models.example.toml +++ b/models.example.toml @@ -20,20 +20,19 @@ # pinned_on - optional whitelist of neuron names. Non-empty # narrows feasibility to just those neurons and # protects the model from LRU eviction there. 
-# -# The examples below match the canonical helexa fleet -# (beast = 2x RTX 5090, benjy = RTX 4090, quadbrat = RTX 3060). -# Tensor-parallel target — only beast has two big GPUs. +# Tensor-parallel target — needs a neuron with at least 2 large GPUs. +# The example pins to a specific neuron name; adjust or remove the +# pinned_on entry for your own fleet. [[models]] id = "Qwen/Qwen3.6-27B" harness = "candle" vram_mb = 54000 min_devices = 2 min_device_vram_mb = 24000 -pinned_on = ["beast"] +pinned_on = ["your-multi-gpu-neuron"] -# Mid-size dense model — fits on benjy or beast. +# Mid-size dense model — fits on any single GPU with ≥16 GB VRAM. [[models]] id = "Qwen/Qwen3-8B" harness = "candle" @@ -41,7 +40,7 @@ vram_mb = 18000 min_devices = 1 min_device_vram_mb = 16000 -# Small GGUF quantised — runs on the smallest neuron (quadbrat). +# Small GGUF quantised — runs on any small GPU. [[models]] id = "unsloth/Qwen3-0.6B-GGUF" harness = "candle" diff --git a/script/deploy.sh b/script/deploy.sh index 4bebc1c..9ce61da 100755 --- a/script/deploy.sh +++ b/script/deploy.sh @@ -198,6 +198,25 @@ else echo "[${cortex_host}] failed to sync cortex.toml" fi +# Sync models.toml on the same lifecycle as cortex.toml — operator-owned, +# gitignored, drives /v1/models catalogue × topology resolution. +if [[ -f "${REPO_DIR}/models.toml" ]]; then + if rsync \ + --archive \ + --compress \ + --rsync-path 'sudo rsync' \ + --chown root:root \ + --chmod 644 \ + "${REPO_DIR}/models.toml" \ + "${cortex_host}:/etc/cortex/models.toml"; then + echo "[${cortex_host}] sync'd models.toml" + else + echo "[${cortex_host}] failed to sync models.toml" + fi +else + echo "[${cortex_host}] no local models.toml — leaving /etc/cortex/models.toml untouched" +fi + ssh "${cortex_host}" sudo systemctl daemon-reload if ssh "${cortex_host}" systemctl is-active --quiet cortex.service; then echo "[${cortex_host}] cortex service is active"