All checks were successful
CI / Format (push) Successful in 38s
CI / CUDA type-check (push) Successful in 1m49s
CI / Clippy (push) Successful in 2m16s
CI / Test (push) Successful in 4m28s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
Refs #67. The correct limit{context,input,output} for a deployment is a computed function of model architecture + live free VRAM + a coherence/throughput trade-off, not an operator-declared static fact that goes stale on model swap.

This lands the arch-agnostic derivation core; later phases capture per-model physics at load, measure throughput, and advertise/enforce the computed limit.

- crates/neuron/src/harness/context_limit.rs (new):
  - kv_bytes_per_token(): shared per-card KV cost (counts only full-attention layers; sharded by TP world size). The TP load paths' inline math folds onto this in phase 2.
  - ContextProfile: per-model physics snapshot (max_position_embeddings, kv_bytes_per_token_per_card, world_size).
  - derive_limit(): context = min(max_pos, vram_ceiling, throughput_ceiling) clamped by an optional backstop; input = context − output; rounded to 1024. 6 unit tests.
- config.rs: [harness.candle.context_limit] block (mirrors prefix_cache): target_prefill_latency_secs, bootstrap_prefill_tok_per_sec, activation_headroom_mb, min_free_floor_mb, output_reserve_tokens.
- neuron.example.toml: documented the new block.

No runtime behaviour change yet. fmt/clippy/test green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
113 lines
4.9 KiB
TOML
113 lines
4.9 KiB
TOML
# neuron.example.toml — example configuration
#
# Copy to /etc/neuron/neuron.toml and adjust for your environment.
#
# Environment variable overrides use the NEURON_ prefix with __ separators:
#   NEURON_PORT=13131

port = 13131

# -- Harnesses ---------------------------------------------------------------
# Each [[harnesses]] entry enables an inference engine. Currently only
# "candle" is supported — it runs in-process and uses huggingface/candle
# for inference on local CUDA devices (or CPU when CUDA is unavailable).

[[harnesses]]
name = "candle"

# -- Candle harness settings -------------------------------------------------
# Optional tuning for the candle harness.

[harness.candle]
# HuggingFace cache directory for model weights.
#
# Resolution order (first hit wins):
#   1. `hf_cache` here in this file (applies to the auto-synthesised
#      `huggingface` source only — see [harness.candle.sources.*] below
#      for explicit per-source paths).
#   2. `HF_HUB_CACHE` env var — same convention as the Python
#      `huggingface_hub` library, so an existing cache directory shared
#      with other tooling can be reused without per-tool config.
#   3. `HF_HOME` env var (cache resolved as `$HF_HOME/hub`).
#   4. hf-hub's default (`~/.cache/huggingface/hub`).
#
# For per-host overrides (e.g. one neuron has an SSD with prefetched
# weights), prefer a systemd drop-in over editing this file:
#   /etc/systemd/system/neuron.service.d/local.conf:
#     [Service]
#     Environment=HF_HUB_CACHE=/archive/hf-cache
# hf_cache = "/var/lib/neuron/hf-cache"

# Default scheme applied to bare `org/name` model ids (those without a
# `scheme:` prefix). Defaults to "huggingface" when unset. Set to
# "helexa" to make `default_models = [{ model_id = "Helexa/Foo" }]`
# resolve via the helexa registry without prefixing every entry.
# default_source = "huggingface"

# Per-scheme source endpoints. Each scheme maps to an HF-compatible
# registry. The `huggingface` source is auto-synthesised pointing at
# `https://huggingface.co` when omitted; declare it explicitly here to
# override the endpoint, auth env, or cache dir.
#
# [harness.candle.sources.huggingface]
# endpoint = "https://huggingface.co"
# auth_env = "HF_TOKEN"  # optional bearer token via env var
# cache_dir = "/archive3/llm-cache/huggingface"
#
# Add helexa (or any operator-run mirror speaking the HF-compatible
# wire format) by adding another sources entry. Caches are
# disambiguated per scheme so a mirror serving the same `org/name` as
# HF cannot collide on disk.
#
# [harness.candle.sources.helexa]
# endpoint = "https://registry.helexa.ai"
# auth_env = "HELEXA_TOKEN"
# cache_dir = "/archive3/llm-cache/helexa"

# -- Prefix KV cache ----------------------------------------------------------
# Reuse cache state across requests when a new prompt starts with the
# exact token sequence of a previous one (chat/agent workloads), so
# prefill only runs on the new suffix. Applies per loaded model, on
# architectures that expose their cache state (qwen3_5). Snapshots
# live in device memory: budget_mb is per loaded model and comes out
# of the same VRAM that serves inference.
#
# [harness.candle.prefix_cache]
# enabled = true
# budget_mb = 1024
# max_entries = 8

# -- Self-derived context limits (#67) ---------------------------------------
# neuron computes the most-efficient limit{context,input,output} that still
# allows coherent agentic performance, from model architecture + live free
# VRAM + a self-measured prefill throughput ceiling, advertises it on
# /models, and enforces it. The advertised limit tracks the resident model
# and rises automatically as efficiency work (e.g. prefix caching, #11)
# frees headroom or speeds prefill — no operator action. These defaults
# rarely need changing; raise target_prefill_latency_secs once prefix
# caching makes long-context re-prefill cheap.
#
# [harness.candle.context_limit]
# enabled = true
# target_prefill_latency_secs = 120.0    # coherence wall (longest prefill/turn)
# bootstrap_prefill_tok_per_sec = 800.0  # cold-start estimate until measured
# activation_headroom_mb = 2048          # per-card prefill activation reserve
# min_free_floor_mb = 1500               # per-card free-VRAM floor to keep
# output_reserve_tokens = 8192           # generation reserve below the wall

# -- Default models ----------------------------------------------------------
# Models listed here are loaded automatically when the neuron service
# activates. Loading is sequential — a slow or failing entry doesn't
# block the rest of the fleet, but it does push out the time before
# neuron starts serving HTTP, so keep the list short. Operators can
# load additional models on demand via POST /models/load.
#
# Make sure data/neuron.service's TimeoutStartSec is generous enough to
# cover the slowest entry's first-time download + materialisation.

# [[default_models]]
# model_id = "Qwen/Qwen3-0.6B-GGUF"
# harness = "candle"
# quant = "Q4_K_M"
# devices = [0]