# models.example.toml — model catalogue
#
# Copy to /etc/cortex/models.toml and adjust for your environment.
# Describes how to serve each model. Cortex matches these profiles
# against discovered neuron topologies for placement decisions; the
# resulting `(catalogue × topology)` set is what `GET /v1/models`
# returns and what the router can cold-load on demand.
#
# Field reference:
#   id                 - Repo id in the source registry (e.g. "Qwen/Qwen3.6-27B").
#                        Exact match.
#   harness            - which engine handles inference (currently "candle").
#   quant              - GGUF quantisation tag for the file in the HF repo
#                        (e.g. "Q4_K_M"). Omit/empty for the dense
#                        safetensors path. Tensor-parallel (TP) profiles
#                        require dense.
#   vram_mb            - rough estimate; advisory only, not enforced.
#   min_devices        - GPU count this profile needs. TP profiles use
#                        the same value as the tensor-parallel size.
#   min_device_vram_mb - each device must meet this VRAM floor for the
#                        neuron to be considered "feasible".
#   pinned_on          - optional whitelist of neuron names. Non-empty
#                        narrows feasibility to just those neurons and
#                        protects the model from LRU eviction there.
#   source             - optional source scheme ("huggingface", "helexa",
#                        operator mirror tag). When set, cortex forwards
#                        the load to neuron as `scheme:id` so the daemon
#                        fetches from the right registry. Omit to let
#                        neuron substitute its own `default_source`.

# Tensor-parallel target — needs a neuron with at least 2 large GPUs.
# The example pins to a specific neuron name; adjust or remove the
# pinned_on entry for your own fleet.
# NOTE(review): vram_mb (54000) is larger than
# min_devices × min_device_vram_mb (2 × 24000 = 48000), so two devices
# sitting exactly at the floor cannot hold the estimate. vram_mb is
# advisory only; confirm the floor is intended before removing pinned_on.
[[models]]
id = "Qwen/Qwen3.6-27B"
harness = "candle"
vram_mb = 54000
min_devices = 2
min_device_vram_mb = 24000
pinned_on = ["your-multi-gpu-neuron"]

# Mid-size dense model — single-GPU profile; any device with ≥16 GB
# VRAM is considered feasible.
# NOTE(review): vram_mb (18000) is larger than min_device_vram_mb
# (16000), so a device at the floor is below the advisory estimate.
# Confirm which of the two values should change.
[[models]]
id = "Qwen/Qwen3-8B"
harness = "candle"
vram_mb = 18000
min_devices = 1
min_device_vram_mb = 16000

# Small GGUF quantised — runs on any small GPU.
[[models]]
id = "unsloth/Qwen3-0.6B-GGUF"
harness = "candle"
quant = "Q4_K_M"
vram_mb = 500
min_devices = 1
min_device_vram_mb = 4000

# Helexa registry model — `source` pins this entry to the helexa
# scheme so cortex forwards `helexa:Helexa/Qwen3.6-27B-Uncensored` to
# neuron's /models/load. Requires the neuron config to declare a
# matching [harness.candle.sources.helexa] entry pointing at the
# helexa registry endpoint (see neuron.example.toml).
#
# [[models]]
# id = "Helexa/Qwen3.6-27B-Uncensored"
# harness = "candle"
# source = "helexa"
# vram_mb = 54000
# min_devices = 2
# min_device_vram_mb = 24000

# -- Tier aliases ------------------------------------------------------------
# Optional. Clients can request inference against an alias (e.g.
# `model: "helexa/small"` in /v1/chat/completions) and cortex
# transparently routes to the concrete model id it maps to — including
# rewriting the body's model field so neuron sees a name that matches
# its loaded handle. Both the alias and the target appear in
# /v1/models so clients can discover either. Operators can swap
# targets here without changing client code.
#
# Alias names contain "/", so they must be written as quoted keys.
# Every target in this example is an `id` declared by a [[models]]
# entry in this file.
# NOTE(review): presumably a target without a matching [[models]]
# entry cannot be served — confirm how cortex treats unknown targets.
#
# [aliases]
# "helexa/small" = "unsloth/Qwen3-0.6B-GGUF"
# "helexa/balanced" = "Qwen/Qwen3-8B"
# "helexa/large" = "Qwen/Qwen3.6-27B"