diff --git a/.gitignore b/.gitignore index f1ed483..90d4ed2 100644 --- a/.gitignore +++ b/.gitignore @@ -4,5 +4,6 @@ .idea/ .vscode/ cortex.toml +models.toml doc/plan/* /target-cuda/ diff --git a/models.example.toml b/models.example.toml index cd9e3d5..5f73c1f 100644 --- a/models.example.toml +++ b/models.example.toml @@ -20,20 +20,19 @@ # pinned_on - optional whitelist of neuron names. Non-empty # narrows feasibility to just those neurons and # protects the model from LRU eviction there. -# -# The examples below match the canonical helexa fleet -# (beast = 2x RTX 5090, benjy = RTX 4090, quadbrat = RTX 3060). -# Tensor-parallel target — only beast has two big GPUs. +# Tensor-parallel target — needs a neuron with at least 2 large GPUs. +# The example pins to a specific neuron name; adjust or remove the +# pinned_on entry for your own fleet. [[models]] id = "Qwen/Qwen3.6-27B" harness = "candle" vram_mb = 54000 min_devices = 2 min_device_vram_mb = 24000 -pinned_on = ["beast"] +pinned_on = ["your-multi-gpu-neuron"] -# Mid-size dense model — fits on benjy or beast. +# Mid-size dense model — fits on any single GPU with ≥16 GB VRAM. [[models]] id = "Qwen/Qwen3-8B" harness = "candle" @@ -41,7 +40,7 @@ vram_mb = 18000 min_devices = 1 min_device_vram_mb = 16000 -# Small GGUF quantised — runs on the smallest neuron (quadbrat). +# Small GGUF quantised — runs on any small GPU. [[models]] id = "unsloth/Qwen3-0.6B-GGUF" harness = "candle" diff --git a/script/deploy.sh b/script/deploy.sh index 4bebc1c..9ce61da 100755 --- a/script/deploy.sh +++ b/script/deploy.sh @@ -198,6 +198,25 @@ else echo "[${cortex_host}] failed to sync cortex.toml" fi +# Sync models.toml on the same lifecycle as cortex.toml — operator-owned, +# gitignored, drives /v1/models catalogue × topology resolution. +if [[ -f "${REPO_DIR}/models.toml" ]]; then + if rsync \ + --archive \ + --compress \ + --rsync-path 'sudo rsync' \ + --chown root:root \ + --chmod 644 \ + "${REPO_DIR}/models.toml" \ + "${cortex_host}:/etc/cortex/models.toml"; then + echo "[${cortex_host}] sync'd models.toml" + else + echo "[${cortex_host}] failed to sync models.toml" + fi +else + echo "[${cortex_host}] no local models.toml — leaving /etc/cortex/models.toml untouched" +fi + ssh "${cortex_host}" sudo systemctl daemon-reload if ssh "${cortex_host}" systemctl is-active --quiet cortex.service; then echo "[${cortex_host}] cortex service is active"