# neuron.example.toml — example configuration
#
# Copy to /etc/neuron/neuron.toml and adjust for your environment.
#
# Environment variable overrides use NEURON_ prefix with __ separators:
#   NEURON_PORT=13131

port = 13131

# -- Harnesses ---------------------------------------------------------------
# Each [[harnesses]] entry enables an inference engine. Currently only
# "candle" is supported — it runs in-process and uses huggingface/candle
# for inference on local CUDA devices (or CPU when CUDA is unavailable).

[[harnesses]]
name = "candle"

# -- Candle harness settings -------------------------------------------------
# Optional tuning for the candle harness.

[harness.candle]
# HuggingFace cache directory for model weights.
#
# Resolution order (first hit wins):
#   1. `hf_cache` here in this file (applies to the synth `huggingface`
#      source only — see [harness.candle.sources.*] below for explicit
#      per-source paths).
#   2. `HF_HUB_CACHE` env var — same convention as the Python
#      `huggingface_hub` library, so an existing cache directory shared
#      with other tooling can be reused without per-tool config.
#   3. `HF_HOME` env var (cache appended as `$HF_HOME/hub`).
#   4. hf-hub's default (`~/.cache/huggingface/hub`).
#
# For per-host overrides (e.g. one neuron has an SSD with prefetched
# weights), prefer a systemd drop-in over editing this file:
#   /etc/systemd/system/neuron.service.d/local.conf:
#     [Service]
#     Environment=HF_HUB_CACHE=/archive/hf-cache
# hf_cache = "/var/lib/neuron/hf-cache"

# Default scheme applied to bare `org/name` model ids (those without a
# `scheme:` prefix). Defaults to "huggingface" when unset. Set to
# "helexa" to make `default_models = [{ model_id = "Helexa/Foo" }]`
# resolve via the helexa registry without prefixing every entry.
# default_source = "huggingface"

# Per-scheme source endpoints. Each scheme maps to an HF-compatible
# registry. The `huggingface` source is auto-synthesised pointing at
# `https://huggingface.co` when omitted; declare it explicitly here to
# override the endpoint, auth env, or cache dir.
#
# [harness.candle.sources.huggingface]
# endpoint = "https://huggingface.co"
# auth_env = "HF_TOKEN"  # optional bearer token via env var
# cache_dir = "/archive3/llm-cache/huggingface"
#
# Add helexa (or any operator-run mirror speaking the HF-compatible
# wire format) by adding another sources entry. Caches are
# disambiguated per scheme so a mirror serving the same `org/name` as
# HF cannot collide on disk.
#
# [harness.candle.sources.helexa]
# endpoint = "https://registry.helexa.ai"
# auth_env = "HELEXA_TOKEN"
# cache_dir = "/archive3/llm-cache/helexa"

# -- Default models ----------------------------------------------------------
# Models listed here are loaded automatically when the neuron service
# activates. Loading is sequential — a slow or failing entry doesn't
# block the rest of the fleet, but it does push out the time before
# neuron starts serving HTTP, so keep the list short. Operators can
# load additional models on demand via POST /models/load.
#
# Make sure data/neuron.service's TimeoutStartSec is generous enough to
# cover the slowest entry's first-time download + materialisation.
#
# [[default_models]]
# model_id = "Qwen/Qwen3-0.6B-GGUF"
# harness = "candle"
# quant = "Q4_K_M"
# devices = [0]