# cortex/models.example.toml

# models.example.toml — model catalogue
#
# Copy to /etc/cortex/models.toml and adjust for your environment.
# Describes how to serve each model. Cortex matches these profiles
# against discovered neuron topologies for placement decisions; the
# resulting `(catalogue × topology)` set is what `GET /v1/models`
# returns and what the router can cold-load on demand.
#
# Field reference:
#   id                 - repo id in the source registry (e.g. "Qwen/Qwen3.6-27B").
#                        Matched exactly.
#   harness            - which engine handles inference (currently "candle").
#   quant              - GGUF quantisation tag for the file in the HF repo
#                        (e.g. "Q4_K_M"). Omit or leave empty for the dense
#                        safetensors path. Tensor parallelism (TP) requires
#                        the dense path.
#   vram_mb            - rough estimate of the model's VRAM use, in MB;
#                        advisory only, not enforced.
#   min_devices        - GPU count this profile needs. TP profiles use
#                        the same value as the tensor-parallel size.
#   min_device_vram_mb - each device must meet this VRAM floor for the
#                        neuron to be considered "feasible".
#   pinned_on          - optional whitelist of neuron names. Non-empty
#                        narrows feasibility to just those neurons and
#                        protects the model from LRU eviction there.
#   source             - optional source scheme ("huggingface", "helexa",
#                        operator mirror tag). When set, cortex forwards
#                        the load to neuron as `scheme:id` so the daemon
#                        fetches from the right registry. Omit to let
#                        neuron substitute its own `default_source`.

# Tensor-parallel target — needs a neuron with at least 2 large GPUs.
# No `quant` is set, so this entry uses the dense safetensors path,
# which tensor parallelism requires. `min_devices` doubles as the
# tensor-parallel size.
#
# The example pins to a specific neuron name; adjust or remove the
# pinned_on entry for your own fleet. While pinned_on is non-empty,
# only the listed neurons are feasible and the model is protected from
# LRU eviction there.
#
# NOTE(review): vram_mb (54000) is larger than min_devices ×
# min_device_vram_mb (2 × 24000 = 48000). vram_mb is advisory only, but
# confirm the per-device floor is high enough for your hardware.
[[models]]
id = "Qwen/Qwen3.6-27B"
harness = "candle"
vram_mb = 54000
min_devices = 2
min_device_vram_mb = 24000
pinned_on = ["your-multi-gpu-neuron"]

# Mid-size dense model — single-GPU profile (no `quant`, so the dense
# safetensors path is used).
#
# The per-device floor is set to the vram_mb estimate: vram_mb itself
# is advisory and not enforced, so min_device_vram_mb is what keeps
# GPUs smaller than the estimated footprint (e.g. 16 GB cards) from
# being considered feasible for this model.
[[models]]
id = "Qwen/Qwen3-8B"
harness = "candle"
vram_mb = 18000
min_devices = 1
min_device_vram_mb = 18000

# Small GGUF quantised — runs on any single GPU that meets the
# min_device_vram_mb floor below (4000).
# `quant` names the GGUF quantisation tag of the file to load from the
# repo; entries without it use the dense safetensors path instead.
[[models]]
id = "unsloth/Qwen3-0.6B-GGUF"
harness = "candle"
quant = "Q4_K_M"
vram_mb = 500
min_devices = 1
min_device_vram_mb = 4000

# Helexa registry model — `source` pins this entry to the helexa
# scheme so cortex forwards `helexa:Helexa/Qwen3.6-27B-Uncensored` to
# neuron's /models/load. Requires the neuron config to declare a
# matching [harness.candle.sources.helexa] entry pointing at the
# helexa registry endpoint (see neuron.example.toml).
#
# [[models]]
# id = "Helexa/Qwen3.6-27B-Uncensored"
# harness = "candle"
# source = "helexa"
# vram_mb = 54000
# min_devices = 2
# min_device_vram_mb = 24000

# -- Tier aliases ------------------------------------------------------------
# Optional. Clients can request inference against an alias (e.g.
# `model: "helexa/small"` in /v1/chat/completions) and cortex
# transparently routes to the concrete model id below — including
# rewriting the body's model field so neuron sees a name that matches
# its loaded handle. Both the alias and the target appear in
# /v1/models so clients can discover either. Operators can swap
# targets here without changing client code.
#
# [aliases]
# "helexa/small" = "unsloth/Qwen3-0.6B-GGUF"
# "helexa/balanced" = "Qwen/Qwen3-8B"
# "helexa/large" = "Qwen/Qwen3.6-27B"