Compare commits
2 Commits
249c9442e8
...
6779b7526a
| Author | SHA1 | Date | |
|---|---|---|---|
|
6779b7526a
|
|||
|
84f5662df1
|
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -2114,6 +2114,7 @@ dependencies = [
|
||||
"clap",
|
||||
"cortex-core",
|
||||
"figment",
|
||||
"futures",
|
||||
"hf-hub",
|
||||
"reqwest",
|
||||
"serde",
|
||||
@@ -2121,6 +2122,7 @@ dependencies = [
|
||||
"thiserror 2.0.18",
|
||||
"tokenizers",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"toml",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
|
||||
@@ -49,6 +49,8 @@ anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
clap.workspace = true
|
||||
thiserror.workspace = true
|
||||
futures.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
figment.workspace = true
|
||||
toml.workspace = true
|
||||
|
||||
|
||||
@@ -6,14 +6,18 @@ use crate::health::HealthCache;
|
||||
use axum::Router;
|
||||
use axum::extract::{Path, State};
|
||||
use axum::http::StatusCode;
|
||||
use axum::response::sse::{Event, KeepAlive, Sse};
|
||||
use axum::response::{IntoResponse, Json};
|
||||
use axum::routing::{get, post};
|
||||
use cortex_core::discovery::{DiscoveryResponse, HealthResponse};
|
||||
use cortex_core::harness::ModelSpec;
|
||||
use cortex_core::openai::ChatCompletionRequest;
|
||||
use futures::stream::{self, StreamExt};
|
||||
use serde_json::{Value, json};
|
||||
use std::convert::Infallible;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
/// Shared state for the neuron HTTP server.
|
||||
pub struct NeuronState {
|
||||
@@ -110,8 +114,9 @@ async fn model_endpoint(
|
||||
}
|
||||
}
|
||||
|
||||
/// OpenAI-compatible chat completions. Non-streaming for Stage 3; the
|
||||
/// streaming path is added in Stage 4.
|
||||
/// OpenAI-compatible chat completions. Dispatches to streaming SSE when
|
||||
/// `stream: true` is set on the request; otherwise returns a single
|
||||
/// `ChatCompletionResponse`.
|
||||
async fn chat_completions(
|
||||
State(state): State<Arc<NeuronState>>,
|
||||
Json(req): Json<ChatCompletionRequest>,
|
||||
@@ -125,13 +130,32 @@ async fn chat_completions(
|
||||
};
|
||||
|
||||
if req.stream.unwrap_or(false) {
|
||||
return (
|
||||
StatusCode::NOT_IMPLEMENTED,
|
||||
Json(json!({"error": "streaming responses arrive in Stage 4"})),
|
||||
)
|
||||
.into_response();
|
||||
match candle.chat_completion_stream(req).await {
|
||||
Ok(rx) => {
|
||||
// Each chunk → one SSE `data: {json}` line. After the
|
||||
// channel closes, append the OpenAI [DONE] terminator.
|
||||
let body_stream = ReceiverStream::new(rx).map(|chunk| {
|
||||
let body = serde_json::to_string(&chunk).unwrap_or_default();
|
||||
Ok::<_, Infallible>(Event::default().data(body))
|
||||
});
|
||||
let done_stream =
|
||||
stream::once(async { Ok::<_, Infallible>(Event::default().data("[DONE]")) });
|
||||
Sse::new(body_stream.chain(done_stream))
|
||||
.keep_alive(KeepAlive::default())
|
||||
.into_response()
|
||||
}
|
||||
|
||||
Err(InferenceError::ModelNotLoaded(id)) => (
|
||||
StatusCode::NOT_FOUND,
|
||||
Json(json!({"error": format!("model '{id}' not loaded on this neuron")})),
|
||||
)
|
||||
.into_response(),
|
||||
Err(InferenceError::Other(e)) => (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({"error": e.to_string()})),
|
||||
)
|
||||
.into_response(),
|
||||
}
|
||||
} else {
|
||||
match candle.chat_completion(req).await {
|
||||
Ok(resp) => Json(resp).into_response(),
|
||||
Err(InferenceError::ModelNotLoaded(id)) => (
|
||||
@@ -146,3 +170,4 @@ async fn chat_completions(
|
||||
.into_response(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//! Neuron configuration loaded from neuron.toml.
|
||||
|
||||
use cortex_core::harness::HarnessConfig;
|
||||
use cortex_core::harness::{HarnessConfig, ModelSpec};
|
||||
use figment::{
|
||||
Figment,
|
||||
providers::{Env, Format, Toml},
|
||||
@@ -17,6 +17,12 @@ pub struct NeuronConfig {
|
||||
/// Per-harness configuration. Currently only `candle` is recognised.
|
||||
#[serde(default)]
|
||||
pub harness: HarnessSettings,
|
||||
/// Models to auto-load when the neuron service activates. Each entry
|
||||
/// is loaded sequentially before the HTTP listener binds. A failure
|
||||
/// on any single entry logs a warning and proceeds — broken entries
|
||||
/// don't prevent the rest of the fleet from starting.
|
||||
#[serde(default)]
|
||||
pub default_models: Vec<ModelSpec>,
|
||||
}
|
||||
|
||||
/// Settings for individual harness implementations. Each harness owns
|
||||
@@ -55,6 +61,7 @@ impl Default for NeuronConfig {
|
||||
port: 13131,
|
||||
harnesses: vec![],
|
||||
harness: HarnessSettings::default(),
|
||||
default_models: vec![],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,15 +16,16 @@ use candle_transformers::generation::{LogitsProcessor, Sampling};
|
||||
use candle_transformers::models::quantized_qwen3::ModelWeights as QuantizedQwen3Weights;
|
||||
use cortex_core::harness::{Harness, HarnessHealth, ModelInfo, ModelSpec};
|
||||
use cortex_core::openai::{
|
||||
ChatCompletionChoice, ChatCompletionRequest, ChatCompletionResponse, ChatMessage,
|
||||
MessageContent, Usage,
|
||||
ChatCompletionChoice, ChatCompletionChunk, ChatCompletionRequest, ChatCompletionResponse,
|
||||
ChatMessage, ChunkChoice, MessageContent, Usage,
|
||||
};
|
||||
use serde_json::json;
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
use tokenizers::Tokenizer;
|
||||
use tokio::sync::{Mutex, RwLock};
|
||||
use tokio::sync::{Mutex, RwLock, mpsc};
|
||||
|
||||
/// In-process candle harness. Owns the loaded model registry.
|
||||
pub struct CandleHarness {
|
||||
@@ -212,6 +213,104 @@ impl CandleHarness {
|
||||
extra: serde_json::Value::Object(Default::default()),
|
||||
})
|
||||
}
|
||||
|
||||
/// Run a streaming chat completion against a loaded model.
|
||||
///
|
||||
/// Returns an `mpsc::Receiver` that yields `ChatCompletionChunk`s in
|
||||
/// OpenAI SSE format. The first chunk carries the assistant role;
|
||||
/// subsequent chunks carry incremental `content` deltas; the final
|
||||
/// chunk carries `finish_reason`. The handler is responsible for
|
||||
/// wrapping these into an SSE response and appending the `[DONE]`
|
||||
/// terminator.
|
||||
///
|
||||
/// Token-by-token decoding tracks the cumulative decoded prefix so
|
||||
/// BPE byte-fallback boundaries don't split a UTF-8 char across
|
||||
/// chunks.
|
||||
pub async fn chat_completion_stream(
|
||||
&self,
|
||||
request: ChatCompletionRequest,
|
||||
) -> Result<mpsc::Receiver<ChatCompletionChunk>, InferenceError> {
|
||||
let loaded = {
|
||||
let models = self.models.read().await;
|
||||
models.get(&request.model).cloned()
|
||||
};
|
||||
let loaded = loaded.ok_or_else(|| InferenceError::ModelNotLoaded(request.model.clone()))?;
|
||||
|
||||
let prompt = format_qwen3_prompt(&request.messages);
|
||||
let encoding = loaded
|
||||
.tokenizer
|
||||
.encode(prompt.as_str(), true)
|
||||
.map_err(|e| InferenceError::Other(anyhow::anyhow!("tokenize: {e}")))?;
|
||||
let prompt_tokens: Vec<u32> = encoding.get_ids().to_vec();
|
||||
|
||||
let temperature = request.temperature.unwrap_or(0.7);
|
||||
let top_p = request.top_p;
|
||||
let max_new = request.max_tokens.unwrap_or(512) as usize;
|
||||
let seed = unix_subsec_nanos();
|
||||
|
||||
let eos_id = loaded
|
||||
.tokenizer
|
||||
.token_to_id("<|im_end|>")
|
||||
.or_else(|| loaded.tokenizer.token_to_id("<|endoftext|>"));
|
||||
|
||||
let arch_arc = Arc::clone(&loaded.arch);
|
||||
let device = loaded.device.clone();
|
||||
let tokenizer = loaded.tokenizer.clone();
|
||||
let model_id = request.model.clone();
|
||||
let id = format!("chatcmpl-{:x}", unix_subsec_nanos());
|
||||
let created = unix_now_secs();
|
||||
|
||||
// Bounded channel so the producer (blocking inference) is back-
|
||||
// pressured by the consumer (SSE writer). 32 is generous —
|
||||
// tokens arrive one at a time and the SSE writer is async.
|
||||
let (tx, rx) = mpsc::channel::<ChatCompletionChunk>(32);
|
||||
|
||||
// Lead chunk: announce the assistant role per OpenAI streaming
|
||||
// conventions. Tools that auto-detect a streaming reply expect
|
||||
// this before any content delta.
|
||||
let role_chunk = ChatCompletionChunk {
|
||||
id: id.clone(),
|
||||
object: "chat.completion.chunk".into(),
|
||||
created,
|
||||
model: model_id.clone(),
|
||||
choices: vec![ChunkChoice {
|
||||
index: 0,
|
||||
delta: json!({"role": "assistant"}),
|
||||
finish_reason: None,
|
||||
extra: serde_json::Value::Object(Default::default()),
|
||||
}],
|
||||
usage: None,
|
||||
extra: serde_json::Value::Object(Default::default()),
|
||||
};
|
||||
// If sending the role chunk fails the receiver is already gone;
|
||||
// bail before kicking off the heavy blocking work.
|
||||
tx.send(role_chunk)
|
||||
.await
|
||||
.map_err(|_| InferenceError::Other(anyhow::anyhow!("client disconnected")))?;
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let mut guard = arch_arc.blocking_lock();
|
||||
if let Err(e) = run_inference_streaming(
|
||||
&mut guard,
|
||||
&device,
|
||||
&tokenizer,
|
||||
&prompt_tokens,
|
||||
max_new,
|
||||
temperature,
|
||||
top_p,
|
||||
seed,
|
||||
eos_id,
|
||||
&id,
|
||||
created,
|
||||
&model_id,
|
||||
&tx,
|
||||
) {
|
||||
tracing::warn!(model = %model_id, error = %e, "streaming inference failed");
|
||||
}
|
||||
});
|
||||
|
||||
Ok(rx)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -426,6 +525,130 @@ fn run_inference(
|
||||
Ok((generated, "length".into()))
|
||||
}
|
||||
|
||||
/// Streaming counterpart to `run_inference`. Emits chunks via `tx` as
|
||||
/// tokens are generated and exits on EOS, max_new, or receiver drop.
|
||||
///
|
||||
/// Detokenization tracks the cumulative decoded prefix so each chunk's
|
||||
/// `content` delta is the substring appended since the last chunk —
|
||||
/// safe across BPE byte-fallback boundaries.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn run_inference_streaming(
|
||||
arch: &mut ModelArch,
|
||||
device: &Device,
|
||||
tokenizer: &Tokenizer,
|
||||
prompt_tokens: &[u32],
|
||||
max_new: usize,
|
||||
temperature: f64,
|
||||
top_p: Option<f64>,
|
||||
seed: u64,
|
||||
eos_id: Option<u32>,
|
||||
id: &str,
|
||||
created: u64,
|
||||
model_id: &str,
|
||||
tx: &mpsc::Sender<ChatCompletionChunk>,
|
||||
) -> Result<()> {
|
||||
let mut logits_processor = {
|
||||
let sampling = if temperature <= 0.0 {
|
||||
Sampling::ArgMax
|
||||
} else {
|
||||
match top_p {
|
||||
Some(p) => Sampling::TopP { p, temperature },
|
||||
None => Sampling::All { temperature },
|
||||
}
|
||||
};
|
||||
LogitsProcessor::from_sampling(seed, sampling)
|
||||
};
|
||||
|
||||
let mut all_tokens: Vec<u32> = Vec::new();
|
||||
let mut decoded_prefix = String::new();
|
||||
let mut finish_reason = "length".to_string();
|
||||
|
||||
let mut next_token = match arch {
|
||||
ModelArch::Qwen3Quantized(model) => {
|
||||
model.clear_kv_cache();
|
||||
let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
|
||||
let logits = model.forward(&input, 0)?;
|
||||
let logits = logits.squeeze(0)?;
|
||||
logits_processor.sample(&logits)?
|
||||
}
|
||||
};
|
||||
|
||||
let emit_token = |all_tokens: &[u32], decoded_prefix: &mut String| -> Result<bool> {
|
||||
let full = tokenizer
|
||||
.decode(all_tokens, true)
|
||||
.map_err(|e| anyhow::anyhow!("decode: {e}"))?;
|
||||
if full.len() > decoded_prefix.len() {
|
||||
let delta = full[decoded_prefix.len()..].to_string();
|
||||
*decoded_prefix = full;
|
||||
let chunk = ChatCompletionChunk {
|
||||
id: id.into(),
|
||||
object: "chat.completion.chunk".into(),
|
||||
created,
|
||||
model: model_id.into(),
|
||||
choices: vec![ChunkChoice {
|
||||
index: 0,
|
||||
delta: json!({ "content": delta }),
|
||||
finish_reason: None,
|
||||
extra: serde_json::Value::Object(Default::default()),
|
||||
}],
|
||||
usage: None,
|
||||
extra: serde_json::Value::Object(Default::default()),
|
||||
};
|
||||
// blocking_send returns Err if the consumer hung up — signal
|
||||
// the caller to stop generating.
|
||||
if tx.blocking_send(chunk).is_err() {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
Ok(true)
|
||||
};
|
||||
|
||||
if Some(next_token) == eos_id {
|
||||
finish_reason = "stop".into();
|
||||
} else {
|
||||
all_tokens.push(next_token);
|
||||
if !emit_token(&all_tokens, &mut decoded_prefix)? {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
for index in 0..max_new.saturating_sub(1) {
|
||||
next_token = match arch {
|
||||
ModelArch::Qwen3Quantized(model) => {
|
||||
let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
|
||||
let logits = model.forward(&input, prompt_tokens.len() + index)?;
|
||||
let logits = logits.squeeze(0)?;
|
||||
logits_processor.sample(&logits)?
|
||||
}
|
||||
};
|
||||
if Some(next_token) == eos_id {
|
||||
finish_reason = "stop".into();
|
||||
break;
|
||||
}
|
||||
all_tokens.push(next_token);
|
||||
if !emit_token(&all_tokens, &mut decoded_prefix)? {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let final_chunk = ChatCompletionChunk {
|
||||
id: id.into(),
|
||||
object: "chat.completion.chunk".into(),
|
||||
created,
|
||||
model: model_id.into(),
|
||||
choices: vec![ChunkChoice {
|
||||
index: 0,
|
||||
delta: serde_json::Value::Object(Default::default()),
|
||||
finish_reason: Some(finish_reason),
|
||||
extra: serde_json::Value::Object(Default::default()),
|
||||
}],
|
||||
usage: None,
|
||||
extra: serde_json::Value::Object(Default::default()),
|
||||
};
|
||||
let _ = tx.blocking_send(final_chunk);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn unix_now_secs() -> u64 {
|
||||
SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
|
||||
@@ -3,3 +3,4 @@ pub mod config;
|
||||
pub mod discovery;
|
||||
pub mod harness;
|
||||
pub mod health;
|
||||
pub mod startup;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health};
|
||||
use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health, startup};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use tokio::sync::RwLock;
|
||||
@@ -55,6 +55,12 @@ async fn main() -> Result<()> {
|
||||
discovery_result.harnesses = registry.names();
|
||||
let candle = registry.candle();
|
||||
|
||||
// Activation: load default models before binding the listener.
|
||||
// Each load may take tens of seconds to several minutes depending
|
||||
// on model size and HF cache state — keep TimeoutStartSec in the
|
||||
// systemd unit generous enough to cover the slowest entry.
|
||||
startup::load_default_models(®istry, &cfg.default_models).await;
|
||||
|
||||
let health_cache = Arc::new(health::HealthCache::new());
|
||||
health_cache
|
||||
.set_has_gpus(!discovery_result.devices.is_empty())
|
||||
|
||||
38
crates/neuron/src/startup.rs
Normal file
38
crates/neuron/src/startup.rs
Normal file
@@ -0,0 +1,38 @@
|
||||
//! Activation-time orchestration.
|
||||
//!
|
||||
//! Wired from `main.rs` after the harness registry is built and before
|
||||
//! the HTTP listener binds. Kept in its own module so the logic is
|
||||
//! unit-testable without spinning up a full neuron process.
|
||||
|
||||
use crate::harness::HarnessRegistry;
|
||||
use cortex_core::harness::ModelSpec;
|
||||
use std::time::Instant;
|
||||
|
||||
/// Load each spec sequentially against the registry, treating
|
||||
/// individual failures as warnings rather than fatal errors.
|
||||
///
|
||||
/// VRAM contention makes parallel loads risky; the sequential path is
|
||||
/// boring but correct. The function logs elapsed time per load so an
|
||||
/// operator can see which model is hogging activation.
|
||||
pub async fn load_default_models(registry: &HarnessRegistry, specs: &[ModelSpec]) {
|
||||
if specs.is_empty() {
|
||||
return;
|
||||
}
|
||||
tracing::info!(count = specs.len(), "loading default models");
|
||||
for spec in specs {
|
||||
let start = Instant::now();
|
||||
match registry.load_model(spec).await {
|
||||
Ok(()) => tracing::info!(
|
||||
model = %spec.model_id,
|
||||
elapsed_ms = start.elapsed().as_millis() as u64,
|
||||
"loaded default model"
|
||||
),
|
||||
Err(e) => tracing::warn!(
|
||||
model = %spec.model_id,
|
||||
error = %e,
|
||||
elapsed_ms = start.elapsed().as_millis() as u64,
|
||||
"failed to load default model, continuing"
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
56
crates/neuron/tests/activation.rs
Normal file
56
crates/neuron/tests/activation.rs
Normal file
@@ -0,0 +1,56 @@
|
||||
//! Activation-time behaviour: load_default_models continues past
|
||||
//! individual failures so a single broken catalogue entry doesn't
|
||||
//! prevent the rest of the fleet from starting.
|
||||
|
||||
use cortex_core::harness::{HarnessConfig, ModelSpec};
|
||||
use neuron::config::HarnessSettings;
|
||||
use neuron::harness::HarnessRegistry;
|
||||
use neuron::startup;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_load_default_models_skips_unknown_harness() {
|
||||
let registry = HarnessRegistry::from_configs(
|
||||
&[HarnessConfig {
|
||||
name: "candle".into(),
|
||||
}],
|
||||
"http://localhost:0",
|
||||
&HarnessSettings::default(),
|
||||
);
|
||||
|
||||
// Both entries fail synchronously inside the registry — no network
|
||||
// call escapes (the harness lookup mismatches before hf-hub is
|
||||
// touched). The function should still return cleanly.
|
||||
let specs = vec![
|
||||
ModelSpec {
|
||||
model_id: "model-a".into(),
|
||||
harness: "no-such-harness".into(),
|
||||
quant: None,
|
||||
tensor_parallel: None,
|
||||
devices: None,
|
||||
},
|
||||
ModelSpec {
|
||||
model_id: "model-b".into(),
|
||||
harness: "no-such-harness".into(),
|
||||
quant: None,
|
||||
tensor_parallel: None,
|
||||
devices: None,
|
||||
},
|
||||
];
|
||||
|
||||
startup::load_default_models(®istry, &specs).await;
|
||||
|
||||
let listed = registry
|
||||
.list_all_models()
|
||||
.await
|
||||
.expect("list_all_models should succeed");
|
||||
assert!(
|
||||
listed.is_empty(),
|
||||
"no models should be loaded after failed entries"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_load_default_models_empty_is_noop() {
|
||||
let registry = HarnessRegistry::new();
|
||||
startup::load_default_models(®istry, &[]).await;
|
||||
}
|
||||
@@ -273,10 +273,11 @@ async fn test_chat_completions_model_not_loaded() {
|
||||
assert_eq!(resp.status(), 404);
|
||||
}
|
||||
|
||||
/// `/v1/chat/completions` with `stream: true` returns 501 until Stage 4
|
||||
/// wires up SSE.
|
||||
/// `/v1/chat/completions` with `stream: true` returns 404 when the
|
||||
/// model isn't loaded — same surface as the non-streaming path. The
|
||||
/// streaming code only kicks in once the model lookup succeeds.
|
||||
#[tokio::test]
|
||||
async fn test_chat_completions_streaming_not_yet_implemented() {
|
||||
async fn test_chat_completions_streaming_model_not_loaded() {
|
||||
use cortex_core::harness::HarnessConfig;
|
||||
use neuron::config::HarnessSettings;
|
||||
|
||||
@@ -306,12 +307,12 @@ async fn test_chat_completions_streaming_not_yet_implemented() {
|
||||
let resp = reqwest::Client::new()
|
||||
.post(format!("{url}/v1/chat/completions"))
|
||||
.json(&json!({
|
||||
"model": "anything",
|
||||
"model": "definitely/not-loaded",
|
||||
"messages": [{"role": "user", "content": "hi"}],
|
||||
"stream": true
|
||||
}))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), 501);
|
||||
assert_eq!(resp.status(), 404);
|
||||
}
|
||||
|
||||
@@ -10,6 +10,11 @@ Restart=on-failure
|
||||
RestartSec=5
|
||||
User=neuron
|
||||
Group=neuron
|
||||
# Loading default_models from neuron.toml happens before the HTTP
|
||||
# listener binds; large models can take many minutes to download and
|
||||
# materialise on first activation. systemd's default TimeoutStartSec
|
||||
# (90s) is far too short; allow 30 minutes.
|
||||
TimeoutStartSec=1800s
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
@@ -22,3 +22,19 @@ name = "candle"
|
||||
# HuggingFace cache directory for model weights. When unset, hf-hub's
|
||||
# default (~/.cache/huggingface) is used.
|
||||
# hf_cache = "/var/lib/neuron/hf-cache"
|
||||
|
||||
# -- Default models ----------------------------------------------------------
|
||||
# Models listed here are loaded automatically when the neuron service
|
||||
# activates. Loading is sequential — a slow or failing entry doesn't
|
||||
# block the rest of the fleet, but it does push out the time before
|
||||
# neuron starts serving HTTP, so keep the list short. Operators can
|
||||
# load additional models on demand via POST /models/load.
|
||||
#
|
||||
# Make sure data/neuron.service's TimeoutStartSec is generous enough to
|
||||
# cover the slowest entry's first-time download + materialisation.
|
||||
|
||||
# [[default_models]]
|
||||
# model_id = "Qwen/Qwen3-0.6B-GGUF"
|
||||
# harness = "candle"
|
||||
# quant = "Q4_K_M"
|
||||
# devices = [0]
|
||||
|
||||
Reference in New Issue
Block a user