# neuron.example.toml — example configuration
#
# Copy to /etc/neuron/neuron.toml and adjust for your environment.
#
# Environment variable overrides use NEURON_ prefix with __ separators:
#   NEURON_PORT=13131

port = 13131

# -- Harnesses ---------------------------------------------------------------
# Each [[harnesses]] entry enables an inference engine. Currently only
# "candle" is supported — it runs in-process and uses huggingface/candle
# for inference on local CUDA devices (or CPU when CUDA is unavailable).

[[harnesses]]
name = "candle"

# -- Candle harness settings -------------------------------------------------
# Optional tuning for the candle harness.
# NOTE(review): this table is `harness.candle` (singular), separate from the
# `[[harnesses]]` array above — confirm the key name against the config loader.

[harness.candle]
# HuggingFace cache directory for model weights. When unset, hf-hub's
# default (~/.cache/huggingface) is used.
# hf_cache = "/var/lib/neuron/hf-cache"

# -- Default models ----------------------------------------------------------
# Models listed here are loaded automatically when the neuron service
# activates. Loading is sequential — a slow or failing entry doesn't
# block the rest of the fleet, but it does push out the time before
# neuron starts serving HTTP, so keep the list short. Operators can
# load additional models on demand via POST /models/load.
#
# Make sure data/neuron.service's TimeoutStartSec is generous enough to
# cover the slowest entry's first-time download + materialisation.

# [[default_models]]
# model_id = "Qwen/Qwen3-0.6B-GGUF"
# harness = "candle"
# quant = "Q4_K_M"
# devices = [0]