2 Commits

Author SHA1 Message Date
6779b7526a feat(neuron): load default_models on service activation
All checks were successful
CI / Format (push) Successful in 34s
CI / Clippy (push) Successful in 2m13s
CI / Test (push) Successful in 4m6s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
Stage 5 of the candle-native pivot. Adds first-class support for
auto-loading a configured set of models when the neuron service
activates.

Config:
- NeuronConfig.default_models: Vec<ModelSpec> (defaults to []).
- neuron.example.toml ships a commented [[default_models]] example.

Activation flow (crates/neuron/src/startup.rs::load_default_models):
- Sequential — VRAM contention makes parallel loads risky.
- Per-entry timing logged at info level on success.
- Failures logged as warnings; the next entry is still attempted.
- An empty list short-circuits without log noise.

Called from main.rs after the registry is built and before the axum
listener binds, so /models reflects the loaded state from the very
first request.

data/neuron.service gains TimeoutStartSec=1800s. With activation
blocked on potentially slow first-time HF downloads + GGUF
materialisation, systemd's default 90s would kill larger model loads
mid-flight.

Two non-gated tests in tests/activation.rs cover the
continues-past-failure and empty-list paths using a synthetically
unknown harness name to fail loads fast without touching the network.
The cuda-integration test from earlier stages still exercises the
real load/unload lifecycle.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 17:56:08 +03:00
84f5662df1 feat(neuron): OpenAI-compatible SSE streaming chat completions
Stage 4 of the candle-native pivot. /v1/chat/completions now switches
to text/event-stream when the request sets stream: true, emitting one
chat.completion.chunk per generated token followed by the OpenAI
[DONE] terminator.

Pipeline:
- chat_completion_stream creates a bounded mpsc::channel<ChatCompletionChunk>(32),
  sends the leading role chunk, then spawns a blocking task that
  acquires the per-model arch lock and runs the streaming generation
  loop.
- run_inference_streaming tracks a cumulative decoded prefix so each
  chunk's delta.content is the substring added since the last chunk —
  safe across BPE byte-fallback boundaries that would otherwise split
  multi-byte UTF-8 chars.
- The blocking task aborts cleanly if blocking_send fails (client
  disconnected), so generation stops when the SSE consumer hangs up.
- Final chunk carries finish_reason ("stop" on EOS, "length" on
  max_tokens). The handler appends data: [DONE] after the channel
  closes.

The Stage 3 streaming 501 placeholder test is repurposed: with the
streaming path live, an unloaded model now hits the same 404 surface
as the non-streaming path (the model lookup happens first).

cortex-gateway's existing proxy is unchanged — it already forwards
SSE bytes verbatim from Phase 2 work, so the candle SSE format passes
through unmodified.

Neuron Cargo.toml gains futures + tokio-stream (both already in
workspace deps) for ReceiverStream and stream combinators.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 17:53:14 +03:00
12 changed files with 413 additions and 31 deletions

2
Cargo.lock generated
View File

@@ -2114,6 +2114,7 @@ dependencies = [
"clap", "clap",
"cortex-core", "cortex-core",
"figment", "figment",
"futures",
"hf-hub", "hf-hub",
"reqwest", "reqwest",
"serde", "serde",
@@ -2121,6 +2122,7 @@ dependencies = [
"thiserror 2.0.18", "thiserror 2.0.18",
"tokenizers", "tokenizers",
"tokio", "tokio",
"tokio-stream",
"toml", "toml",
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",

View File

@@ -49,6 +49,8 @@ anyhow.workspace = true
async-trait.workspace = true async-trait.workspace = true
clap.workspace = true clap.workspace = true
thiserror.workspace = true thiserror.workspace = true
futures.workspace = true
tokio-stream.workspace = true
figment.workspace = true figment.workspace = true
toml.workspace = true toml.workspace = true

View File

@@ -6,14 +6,18 @@ use crate::health::HealthCache;
use axum::Router; use axum::Router;
use axum::extract::{Path, State}; use axum::extract::{Path, State};
use axum::http::StatusCode; use axum::http::StatusCode;
use axum::response::sse::{Event, KeepAlive, Sse};
use axum::response::{IntoResponse, Json}; use axum::response::{IntoResponse, Json};
use axum::routing::{get, post}; use axum::routing::{get, post};
use cortex_core::discovery::{DiscoveryResponse, HealthResponse}; use cortex_core::discovery::{DiscoveryResponse, HealthResponse};
use cortex_core::harness::ModelSpec; use cortex_core::harness::ModelSpec;
use cortex_core::openai::ChatCompletionRequest; use cortex_core::openai::ChatCompletionRequest;
use futures::stream::{self, StreamExt};
use serde_json::{Value, json}; use serde_json::{Value, json};
use std::convert::Infallible;
use std::sync::Arc; use std::sync::Arc;
use tokio::sync::RwLock; use tokio::sync::RwLock;
use tokio_stream::wrappers::ReceiverStream;
/// Shared state for the neuron HTTP server. /// Shared state for the neuron HTTP server.
pub struct NeuronState { pub struct NeuronState {
@@ -110,8 +114,9 @@ async fn model_endpoint(
} }
} }
/// OpenAI-compatible chat completions. Non-streaming for Stage 3; the /// OpenAI-compatible chat completions. Dispatches to streaming SSE when
/// streaming path is added in Stage 4. /// `stream: true` is set on the request; otherwise returns a single
/// `ChatCompletionResponse`.
async fn chat_completions( async fn chat_completions(
State(state): State<Arc<NeuronState>>, State(state): State<Arc<NeuronState>>,
Json(req): Json<ChatCompletionRequest>, Json(req): Json<ChatCompletionRequest>,
@@ -125,24 +130,44 @@ async fn chat_completions(
}; };
if req.stream.unwrap_or(false) { if req.stream.unwrap_or(false) {
return ( match candle.chat_completion_stream(req).await {
StatusCode::NOT_IMPLEMENTED, Ok(rx) => {
Json(json!({"error": "streaming responses arrive in Stage 4"})), // Each chunk → one SSE `data: {json}` line. After the
) // channel closes, append the OpenAI [DONE] terminator.
.into_response(); let body_stream = ReceiverStream::new(rx).map(|chunk| {
} let body = serde_json::to_string(&chunk).unwrap_or_default();
Ok::<_, Infallible>(Event::default().data(body))
match candle.chat_completion(req).await { });
Ok(resp) => Json(resp).into_response(), let done_stream =
Err(InferenceError::ModelNotLoaded(id)) => ( stream::once(async { Ok::<_, Infallible>(Event::default().data("[DONE]")) });
StatusCode::NOT_FOUND, Sse::new(body_stream.chain(done_stream))
Json(json!({"error": format!("model '{id}' not loaded on this neuron")})), .keep_alive(KeepAlive::default())
) .into_response()
.into_response(), }
Err(InferenceError::Other(e)) => ( Err(InferenceError::ModelNotLoaded(id)) => (
StatusCode::INTERNAL_SERVER_ERROR, StatusCode::NOT_FOUND,
Json(json!({"error": e.to_string()})), Json(json!({"error": format!("model '{id}' not loaded on this neuron")})),
) )
.into_response(), .into_response(),
Err(InferenceError::Other(e)) => (
StatusCode::INTERNAL_SERVER_ERROR,
Json(json!({"error": e.to_string()})),
)
.into_response(),
}
} else {
match candle.chat_completion(req).await {
Ok(resp) => Json(resp).into_response(),
Err(InferenceError::ModelNotLoaded(id)) => (
StatusCode::NOT_FOUND,
Json(json!({"error": format!("model '{id}' not loaded on this neuron")})),
)
.into_response(),
Err(InferenceError::Other(e)) => (
StatusCode::INTERNAL_SERVER_ERROR,
Json(json!({"error": e.to_string()})),
)
.into_response(),
}
} }
} }

View File

@@ -1,6 +1,6 @@
//! Neuron configuration loaded from neuron.toml. //! Neuron configuration loaded from neuron.toml.
use cortex_core::harness::HarnessConfig; use cortex_core::harness::{HarnessConfig, ModelSpec};
use figment::{ use figment::{
Figment, Figment,
providers::{Env, Format, Toml}, providers::{Env, Format, Toml},
@@ -17,6 +17,12 @@ pub struct NeuronConfig {
/// Per-harness configuration. Currently only `candle` is recognised. /// Per-harness configuration. Currently only `candle` is recognised.
#[serde(default)] #[serde(default)]
pub harness: HarnessSettings, pub harness: HarnessSettings,
/// Models to auto-load when the neuron service activates. Each entry
/// is loaded sequentially before the HTTP listener binds. A failure
/// on any single entry logs a warning and proceeds — broken entries
/// don't prevent the rest of the fleet from starting.
#[serde(default)]
pub default_models: Vec<ModelSpec>,
} }
/// Settings for individual harness implementations. Each harness owns /// Settings for individual harness implementations. Each harness owns
@@ -55,6 +61,7 @@ impl Default for NeuronConfig {
port: 13131, port: 13131,
harnesses: vec![], harnesses: vec![],
harness: HarnessSettings::default(), harness: HarnessSettings::default(),
default_models: vec![],
} }
} }
} }

View File

@@ -16,15 +16,16 @@ use candle_transformers::generation::{LogitsProcessor, Sampling};
use candle_transformers::models::quantized_qwen3::ModelWeights as QuantizedQwen3Weights; use candle_transformers::models::quantized_qwen3::ModelWeights as QuantizedQwen3Weights;
use cortex_core::harness::{Harness, HarnessHealth, ModelInfo, ModelSpec}; use cortex_core::harness::{Harness, HarnessHealth, ModelInfo, ModelSpec};
use cortex_core::openai::{ use cortex_core::openai::{
ChatCompletionChoice, ChatCompletionRequest, ChatCompletionResponse, ChatMessage, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionRequest, ChatCompletionResponse,
MessageContent, Usage, ChatMessage, ChunkChoice, MessageContent, Usage,
}; };
use serde_json::json;
use std::collections::HashMap; use std::collections::HashMap;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH}; use std::time::{SystemTime, UNIX_EPOCH};
use tokenizers::Tokenizer; use tokenizers::Tokenizer;
use tokio::sync::{Mutex, RwLock}; use tokio::sync::{Mutex, RwLock, mpsc};
/// In-process candle harness. Owns the loaded model registry. /// In-process candle harness. Owns the loaded model registry.
pub struct CandleHarness { pub struct CandleHarness {
@@ -212,6 +213,104 @@ impl CandleHarness {
extra: serde_json::Value::Object(Default::default()), extra: serde_json::Value::Object(Default::default()),
}) })
} }
/// Run a streaming chat completion against a loaded model.
///
/// Returns an `mpsc::Receiver` that yields `ChatCompletionChunk`s in
/// OpenAI SSE format. The first chunk carries the assistant role;
/// subsequent chunks carry incremental `content` deltas; the final
/// chunk carries `finish_reason`. The handler is responsible for
/// wrapping these into an SSE response and appending the `[DONE]`
/// terminator.
///
/// Generation runs on a blocking task that serialises on the per-model
/// arch lock. If the receiver has been dropped by the time the lock is
/// acquired, the task exits without touching the model.
///
/// # Errors
///
/// - `InferenceError::ModelNotLoaded` when `request.model` is not in the
///   loaded-model registry.
/// - `InferenceError::Other` when the prompt cannot be tokenized or the
///   leading role chunk cannot be queued.
///
/// Failures inside the blocking generation task are logged as warnings
/// and end the stream early; they are not returned from this function.
pub async fn chat_completion_stream(
    &self,
    request: ChatCompletionRequest,
) -> Result<mpsc::Receiver<ChatCompletionChunk>, InferenceError> {
    // Clone the entry out of the registry so the read lock is released
    // before any slow work starts.
    let loaded = {
        let models = self.models.read().await;
        models.get(&request.model).cloned()
    };
    let loaded = loaded.ok_or_else(|| InferenceError::ModelNotLoaded(request.model.clone()))?;

    let prompt = format_qwen3_prompt(&request.messages);
    let encoding = loaded
        .tokenizer
        .encode(prompt.as_str(), true)
        .map_err(|e| InferenceError::Other(anyhow::anyhow!("tokenize: {e}")))?;
    let prompt_tokens: Vec<u32> = encoding.get_ids().to_vec();

    let temperature = request.temperature.unwrap_or(0.7);
    let top_p = request.top_p;
    let max_new = request.max_tokens.unwrap_or(512) as usize;
    let seed = unix_subsec_nanos();
    let eos_id = loaded
        .tokenizer
        .token_to_id("<|im_end|>")
        .or_else(|| loaded.tokenizer.token_to_id("<|endoftext|>"));

    // Everything the blocking task needs must be owned: it outlives
    // this async fn's borrows.
    let arch_arc = Arc::clone(&loaded.arch);
    let device = loaded.device.clone();
    let tokenizer = loaded.tokenizer.clone();
    let model_id = request.model.clone();
    let id = format!("chatcmpl-{:x}", unix_subsec_nanos());
    let created = unix_now_secs();

    // Bounded channel so the producer (blocking inference) is back-
    // pressured by the consumer (SSE writer). 32 is generous —
    // tokens arrive one at a time and the SSE writer is async.
    let (tx, rx) = mpsc::channel::<ChatCompletionChunk>(32);

    // Lead chunk: announce the assistant role per OpenAI streaming
    // conventions. Tools that auto-detect a streaming reply expect
    // this before any content delta.
    let role_chunk = ChatCompletionChunk {
        id: id.clone(),
        object: "chat.completion.chunk".into(),
        created,
        model: model_id.clone(),
        choices: vec![ChunkChoice {
            index: 0,
            delta: json!({"role": "assistant"}),
            finish_reason: None,
            extra: serde_json::Value::Object(Default::default()),
        }],
        usage: None,
        extra: serde_json::Value::Object(Default::default()),
    };
    // If sending the role chunk fails the receiver is already gone;
    // bail before kicking off the heavy blocking work.
    tx.send(role_chunk)
        .await
        .map_err(|_| InferenceError::Other(anyhow::anyhow!("client disconnected")))?;

    tokio::task::spawn_blocking(move || {
        let mut guard = arch_arc.blocking_lock();
        // Waiting for the per-model lock can take as long as another
        // request's whole generation. If the consumer hung up in the
        // meantime, skip the prompt prefill entirely instead of only
        // noticing on the first failed send.
        if tx.is_closed() {
            return;
        }
        if let Err(e) = run_inference_streaming(
            &mut guard,
            &device,
            &tokenizer,
            &prompt_tokens,
            max_new,
            temperature,
            top_p,
            seed,
            eos_id,
            &id,
            created,
            &model_id,
            &tx,
        ) {
            tracing::warn!(model = %model_id, error = %e, "streaming inference failed");
        }
    });

    Ok(rx)
}
} }
#[async_trait] #[async_trait]
@@ -426,6 +525,130 @@ fn run_inference(
Ok((generated, "length".into())) Ok((generated, "length".into()))
} }
/// Return the part of `full` that has not been streamed yet, given the
/// text already sent (`sent`).
///
/// Returns `None` when there is nothing new to send, or when `full`
/// ends in U+FFFD and `flush` is false. A trailing replacement char is
/// how decoders typically render a multi-byte character whose remaining
/// bytes have not been generated yet, so the text is held back until it
/// completes. Slicing is checked, so a rewritten prefix can never cause
/// a char-boundary panic.
fn unsent_suffix<'a>(sent: &str, full: &'a str, flush: bool) -> Option<&'a str> {
    if !flush && full.ends_with('\u{FFFD}') {
        return None;
    }
    let rest = full
        .strip_prefix(sent)
        // The decoder rewrote earlier text; fall back to a checked
        // positional slice (None when off a char boundary or too short).
        .or_else(|| full.get(sent.len()..))?;
    if rest.is_empty() { None } else { Some(rest) }
}

/// Streaming counterpart to `run_inference`. Emits chunks via `tx` as
/// tokens are generated and exits on EOS, max_new, or receiver drop.
///
/// Detokenization re-decodes the cumulative token list on each step and
/// sends only the text appended since the last chunk. Text whose tail is
/// an incomplete multi-byte character is held back until the character
/// completes, and anything still pending is flushed just before the
/// final chunk.
///
/// The final chunk carries `finish_reason`: `"stop"` when the EOS token
/// was sampled, `"length"` otherwise. It is not sent if the receiver has
/// already gone away.
///
/// # Errors
///
/// Propagates tensor/model forward errors, sampling errors, and
/// tokenizer decode errors. A dropped receiver is not an error; the
/// function returns `Ok(())` early.
#[allow(clippy::too_many_arguments)]
fn run_inference_streaming(
    arch: &mut ModelArch,
    device: &Device,
    tokenizer: &Tokenizer,
    prompt_tokens: &[u32],
    max_new: usize,
    temperature: f64,
    top_p: Option<f64>,
    seed: u64,
    eos_id: Option<u32>,
    id: &str,
    created: u64,
    model_id: &str,
    tx: &mpsc::Sender<ChatCompletionChunk>,
) -> Result<()> {
    let mut logits_processor = {
        // Non-positive temperature means greedy decoding.
        let sampling = if temperature <= 0.0 {
            Sampling::ArgMax
        } else {
            match top_p {
                Some(p) => Sampling::TopP { p, temperature },
                None => Sampling::All { temperature },
            }
        };
        LogitsProcessor::from_sampling(seed, sampling)
    };

    let mut all_tokens: Vec<u32> = Vec::new();
    let mut decoded_prefix = String::new();
    let mut finish_reason = "length".to_string();

    // Prefill: run the whole prompt once and sample the first token.
    let mut next_token = match arch {
        ModelArch::Qwen3Quantized(model) => {
            model.clear_kv_cache();
            let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
            let logits = model.forward(&input, 0)?;
            let logits = logits.squeeze(0)?;
            logits_processor.sample(&logits)?
        }
    };

    // Decodes everything generated so far and sends whatever text is
    // new. Returns Ok(false) when the consumer hung up, Ok(true)
    // otherwise (including when nothing was ready to send).
    let emit_token =
        |all_tokens: &[u32], decoded_prefix: &mut String, flush: bool| -> Result<bool> {
            let full = tokenizer
                .decode(all_tokens, true)
                .map_err(|e| anyhow::anyhow!("decode: {e}"))?;
            let delta = match unsent_suffix(decoded_prefix.as_str(), &full, flush) {
                Some(text) => text.to_owned(),
                None => return Ok(true),
            };
            *decoded_prefix = full;
            let chunk = ChatCompletionChunk {
                id: id.into(),
                object: "chat.completion.chunk".into(),
                created,
                model: model_id.into(),
                choices: vec![ChunkChoice {
                    index: 0,
                    delta: json!({ "content": delta }),
                    finish_reason: None,
                    extra: serde_json::Value::Object(Default::default()),
                }],
                usage: None,
                extra: serde_json::Value::Object(Default::default()),
            };
            // blocking_send returns Err if the consumer hung up — signal
            // the caller to stop generating.
            Ok(tx.blocking_send(chunk).is_ok())
        };

    if Some(next_token) == eos_id {
        finish_reason = "stop".into();
    } else {
        all_tokens.push(next_token);
        if !emit_token(&all_tokens, &mut decoded_prefix, false)? {
            return Ok(());
        }

        // One token was already produced by the prefill step above.
        for index in 0..max_new.saturating_sub(1) {
            next_token = match arch {
                ModelArch::Qwen3Quantized(model) => {
                    let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
                    // Position offset = prompt length + tokens fed so far.
                    let logits = model.forward(&input, prompt_tokens.len() + index)?;
                    let logits = logits.squeeze(0)?;
                    logits_processor.sample(&logits)?
                }
            };
            if Some(next_token) == eos_id {
                finish_reason = "stop".into();
                break;
            }
            all_tokens.push(next_token);
            if !emit_token(&all_tokens, &mut decoded_prefix, false)? {
                return Ok(());
            }
        }
    }

    // Generation may have stopped mid-character; send whatever was held
    // back so the streamed text matches a full decode of the tokens.
    if !all_tokens.is_empty() && !emit_token(&all_tokens, &mut decoded_prefix, true)? {
        return Ok(());
    }

    let final_chunk = ChatCompletionChunk {
        id: id.into(),
        object: "chat.completion.chunk".into(),
        created,
        model: model_id.into(),
        choices: vec![ChunkChoice {
            index: 0,
            delta: serde_json::Value::Object(Default::default()),
            finish_reason: Some(finish_reason),
            extra: serde_json::Value::Object(Default::default()),
        }],
        usage: None,
        extra: serde_json::Value::Object(Default::default()),
    };
    // Best effort: the consumer may already be gone.
    let _ = tx.blocking_send(final_chunk);
    Ok(())
}
fn unix_now_secs() -> u64 { fn unix_now_secs() -> u64 {
SystemTime::now() SystemTime::now()
.duration_since(UNIX_EPOCH) .duration_since(UNIX_EPOCH)

View File

@@ -3,3 +3,4 @@ pub mod config;
pub mod discovery; pub mod discovery;
pub mod harness; pub mod harness;
pub mod health; pub mod health;
pub mod startup;

View File

@@ -1,6 +1,6 @@
use anyhow::Result; use anyhow::Result;
use clap::Parser; use clap::Parser;
use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health}; use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health, startup};
use std::sync::Arc; use std::sync::Arc;
use std::time::Instant; use std::time::Instant;
use tokio::sync::RwLock; use tokio::sync::RwLock;
@@ -55,6 +55,12 @@ async fn main() -> Result<()> {
discovery_result.harnesses = registry.names(); discovery_result.harnesses = registry.names();
let candle = registry.candle(); let candle = registry.candle();
// Activation: load default models before binding the listener.
// Each load may take tens of seconds to several minutes depending
// on model size and HF cache state — keep TimeoutStartSec in the
// systemd unit generous enough to cover the slowest entry.
startup::load_default_models(&registry, &cfg.default_models).await;
let health_cache = Arc::new(health::HealthCache::new()); let health_cache = Arc::new(health::HealthCache::new());
health_cache health_cache
.set_has_gpus(!discovery_result.devices.is_empty()) .set_has_gpus(!discovery_result.devices.is_empty())

View File

@@ -0,0 +1,38 @@
//! Activation-time orchestration.
//!
//! Wired from `main.rs` after the harness registry is built and before
//! the HTTP listener binds. Kept in its own module so the logic is
//! unit-testable without spinning up a full neuron process.
use crate::harness::HarnessRegistry;
use cortex_core::harness::ModelSpec;
use std::time::Instant;
/// Load every entry of `specs` through `registry`, one at a time.
///
/// A failed load is reported as a warning and the loop moves on to the
/// next entry; nothing is returned to the caller either way. Loads are
/// deliberately sequential because parallel loads would contend for
/// VRAM. Each entry's elapsed time is logged so an operator can see
/// which model dominates activation. An empty `specs` slice returns
/// immediately without logging.
pub async fn load_default_models(registry: &HarnessRegistry, specs: &[ModelSpec]) {
    if specs.is_empty() {
        return;
    }

    tracing::info!(count = specs.len(), "loading default models");

    for spec in specs {
        let started = Instant::now();
        let outcome = registry.load_model(spec).await;
        let elapsed_ms = started.elapsed().as_millis() as u64;

        match outcome {
            Err(e) => tracing::warn!(
                model = %spec.model_id,
                error = %e,
                elapsed_ms,
                "failed to load default model, continuing"
            ),
            Ok(()) => tracing::info!(
                model = %spec.model_id,
                elapsed_ms,
                "loaded default model"
            ),
        }
    }
}

View File

@@ -0,0 +1,56 @@
//! Activation-time behaviour: load_default_models continues past
//! individual failures so a single broken catalogue entry doesn't
//! prevent the rest of the fleet from starting.
use cortex_core::harness::{HarnessConfig, ModelSpec};
use neuron::config::HarnessSettings;
use neuron::harness::HarnessRegistry;
use neuron::startup;
#[tokio::test]
async fn test_load_default_models_skips_unknown_harness() {
let registry = HarnessRegistry::from_configs(
&[HarnessConfig {
name: "candle".into(),
}],
"http://localhost:0",
&HarnessSettings::default(),
);
// Both entries fail synchronously inside the registry — no network
// call escapes (the harness lookup mismatches before hf-hub is
// touched). The function should still return cleanly.
let specs = vec![
ModelSpec {
model_id: "model-a".into(),
harness: "no-such-harness".into(),
quant: None,
tensor_parallel: None,
devices: None,
},
ModelSpec {
model_id: "model-b".into(),
harness: "no-such-harness".into(),
quant: None,
tensor_parallel: None,
devices: None,
},
];
startup::load_default_models(&registry, &specs).await;
let listed = registry
.list_all_models()
.await
.expect("list_all_models should succeed");
assert!(
listed.is_empty(),
"no models should be loaded after failed entries"
);
}
#[tokio::test]
async fn test_load_default_models_empty_is_noop() {
let registry = HarnessRegistry::new();
startup::load_default_models(&registry, &[]).await;
}

View File

@@ -273,10 +273,11 @@ async fn test_chat_completions_model_not_loaded() {
assert_eq!(resp.status(), 404); assert_eq!(resp.status(), 404);
} }
/// `/v1/chat/completions` with `stream: true` returns 501 until Stage 4 /// `/v1/chat/completions` with `stream: true` returns 404 when the
/// wires up SSE. /// model isn't loaded — same surface as the non-streaming path. The
/// streaming code only kicks in once the model lookup succeeds.
#[tokio::test] #[tokio::test]
async fn test_chat_completions_streaming_not_yet_implemented() { async fn test_chat_completions_streaming_model_not_loaded() {
use cortex_core::harness::HarnessConfig; use cortex_core::harness::HarnessConfig;
use neuron::config::HarnessSettings; use neuron::config::HarnessSettings;
@@ -306,12 +307,12 @@ async fn test_chat_completions_streaming_not_yet_implemented() {
let resp = reqwest::Client::new() let resp = reqwest::Client::new()
.post(format!("{url}/v1/chat/completions")) .post(format!("{url}/v1/chat/completions"))
.json(&json!({ .json(&json!({
"model": "anything", "model": "definitely/not-loaded",
"messages": [{"role": "user", "content": "hi"}], "messages": [{"role": "user", "content": "hi"}],
"stream": true "stream": true
})) }))
.send() .send()
.await .await
.unwrap(); .unwrap();
assert_eq!(resp.status(), 501); assert_eq!(resp.status(), 404);
} }

View File

@@ -10,6 +10,11 @@ Restart=on-failure
RestartSec=5 RestartSec=5
User=neuron User=neuron
Group=neuron Group=neuron
# Loading default_models from neuron.toml happens before the HTTP
# listener binds; large models can take many minutes to download and
# materialise on first activation. systemd's default TimeoutStartSec
# (90s) is far too short; allow 30 minutes.
TimeoutStartSec=1800s
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View File

@@ -22,3 +22,19 @@ name = "candle"
# HuggingFace cache directory for model weights. When unset, hf-hub's # HuggingFace cache directory for model weights. When unset, hf-hub's
# default (~/.cache/huggingface) is used. # default (~/.cache/huggingface) is used.
# hf_cache = "/var/lib/neuron/hf-cache" # hf_cache = "/var/lib/neuron/hf-cache"
# -- Default models ----------------------------------------------------------
# Models listed here are loaded automatically when the neuron service
# activates. Loading is sequential: a failing entry is logged and
# skipped so the remaining entries still load, but every entry (slow or
# failing) delays the moment neuron starts serving HTTP, so keep the
# list short. Operators can load additional models on demand via
# POST /models/load.
#
# Make sure data/neuron.service's TimeoutStartSec is generous enough to
# cover the slowest entry's first-time download + materialisation.
# [[default_models]]
# model_id = "Qwen/Qwen3-0.6B-GGUF"
# harness = "candle"
# quant = "Q4_K_M"
# devices = [0]