feat(neuron): load default_models on service activation

Stage 5 of the candle-native pivot. Adds first-class support for auto-loading a configured set of models when the neuron service activates. Config: - NeuronConfig.default_models: Vec<ModelSpec> (defaults to []). - neuron.example.toml ships a commented [[default_models]] example. Activation flow (crates/neuron/src/startup.rs::load_default_models): - Sequential — VRAM contention makes parallel loads risky. - Per-entry timing logged at info level on success. - Failures logged as warnings; the next entry is still attempted. - An empty list short-circuits without log noise. Called from main.rs after the registry is built and before the axum listener binds, so /models reflects the loaded state from the very first request. data/neuron.service gains TimeoutStartSec=1800s. With activation blocked on potentially slow first-time HF downloads + GGUF materialisation, systemd's default 90s would kill larger model loads mid-flight. Two non-gated tests in tests/activation.rs cover the continues-past-failure and empty-list paths using a synthetically unknown harness name to fail loads fast without touching the network. The cuda-integration test from earlier stages still exercises the real load/unload lifecycle. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
feat(neuron): OpenAI-compatible SSE streaming chat completions
2026-05-18 17:56:08 +03:00 · 2026-05-18 17:53:14 +03:00
12 changed files with 413 additions and 31 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2114,6 +2114,7 @@ dependencies = [
 "clap",
 "cortex-core",
 "figment",
 "futures",
 "hf-hub",
 "reqwest",
 "serde",
@@ -2121,6 +2122,7 @@ dependencies = [
 "thiserror 2.0.18",
 "tokenizers",
 "tokio",
 "tokio-stream",
 "toml",
 "tracing",
 "tracing-subscriber",
--- a/crates/neuron/Cargo.toml
+++ b/crates/neuron/Cargo.toml
@@ -49,6 +49,8 @@ anyhow.workspace = true
 async-trait.workspace = true
 clap.workspace = true
 thiserror.workspace = true
 futures.workspace = true
 tokio-stream.workspace = true
 figment.workspace = true
 toml.workspace = true
--- a/crates/neuron/src/api.rs
+++ b/crates/neuron/src/api.rs
@@ -6,14 +6,18 @@ use crate::health::HealthCache;
 use axum::Router;
 use axum::extract::{Path, State};
 use axum::http::StatusCode;
 use axum::response::sse::{Event, KeepAlive, Sse};
 use axum::response::{IntoResponse, Json};
 use axum::routing::{get, post};
 use cortex_core::discovery::{DiscoveryResponse, HealthResponse};
 use cortex_core::harness::ModelSpec;
 use cortex_core::openai::ChatCompletionRequest;
 use futures::stream::{self, StreamExt};
 use serde_json::{Value, json};
 use std::convert::Infallible;
 use std::sync::Arc;
 use tokio::sync::RwLock;
 use tokio_stream::wrappers::ReceiverStream;
 /// Shared state for the neuron HTTP server.
 pub struct NeuronState {
@@ -110,8 +114,9 @@ async fn model_endpoint(
    }
 }
-/// OpenAI-compatible chat completions. Non-streaming for Stage 3; the
+/// OpenAI-compatible chat completions. Dispatches to streaming SSE when
-/// streaming path is added in Stage 4.
+/// `stream: true` is set on the request; otherwise returns a single
 /// `ChatCompletionResponse`.
 async fn chat_completions(
    State(state): State<Arc<NeuronState>>,
    Json(req): Json<ChatCompletionRequest>,
@@ -125,24 +130,44 @@ async fn chat_completions(
    };
    if req.stream.unwrap_or(false) {
-        return (
+        match candle.chat_completion_stream(req).await {
-            StatusCode::NOT_IMPLEMENTED,
+            Ok(rx) => {
-            Json(json!({"error": "streaming responses arrive in Stage 4"})),
+                // Each chunk → one SSE `data: {json}` line. After the
-        )
+                // channel closes, append the OpenAI [DONE] terminator.
-            .into_response();
+                let body_stream = ReceiverStream::new(rx).map(|chunk| {
-    }
+                    let body = serde_json::to_string(&chunk).unwrap_or_default();
-
+                    Ok::<_, Infallible>(Event::default().data(body))
-    match candle.chat_completion(req).await {
+                });
-        Ok(resp) => Json(resp).into_response(),
+                let done_stream =
-        Err(InferenceError::ModelNotLoaded(id)) => (
+                    stream::once(async { Ok::<_, Infallible>(Event::default().data("[DONE]")) });
-            StatusCode::NOT_FOUND,
+                Sse::new(body_stream.chain(done_stream))
-            Json(json!({"error": format!("model '{id}' not loaded on this neuron")})),
+                    .keep_alive(KeepAlive::default())
-        )
+                    .into_response()
-            .into_response(),
+            }
-        Err(InferenceError::Other(e)) => (
+            Err(InferenceError::ModelNotLoaded(id)) => (
-            StatusCode::INTERNAL_SERVER_ERROR,
+                StatusCode::NOT_FOUND,
-            Json(json!({"error": e.to_string()})),
+                Json(json!({"error": format!("model '{id}' not loaded on this neuron")})),
-        )
+            )
-            .into_response(),
+                .into_response(),
            Err(InferenceError::Other(e)) => (
                StatusCode::INTERNAL_SERVER_ERROR,
                Json(json!({"error": e.to_string()})),
            )
                .into_response(),
        }
    } else {
        match candle.chat_completion(req).await {
            Ok(resp) => Json(resp).into_response(),
            Err(InferenceError::ModelNotLoaded(id)) => (
                StatusCode::NOT_FOUND,
                Json(json!({"error": format!("model '{id}' not loaded on this neuron")})),
            )
                .into_response(),
            Err(InferenceError::Other(e)) => (
                StatusCode::INTERNAL_SERVER_ERROR,
                Json(json!({"error": e.to_string()})),
            )
                .into_response(),
        }
    }
 }
--- a/crates/neuron/src/config.rs
+++ b/crates/neuron/src/config.rs
@@ -1,6 +1,6 @@
 //! Neuron configuration loaded from neuron.toml.
-use cortex_core::harness::HarnessConfig;
+use cortex_core::harness::{HarnessConfig, ModelSpec};
 use figment::{
    Figment,
    providers::{Env, Format, Toml},
@@ -17,6 +17,12 @@ pub struct NeuronConfig {
    /// Per-harness configuration. Currently only `candle` is recognised.
    #[serde(default)]
    pub harness: HarnessSettings,
    /// Models to auto-load when the neuron service activates. Each entry
    /// is loaded sequentially before the HTTP listener binds. A failure
    /// on any single entry logs a warning and proceeds — broken entries
    /// don't prevent the rest of the fleet from starting.
    #[serde(default)]
    pub default_models: Vec<ModelSpec>,
 }
 /// Settings for individual harness implementations. Each harness owns
@@ -55,6 +61,7 @@ impl Default for NeuronConfig {
            port: 13131,
            harnesses: vec![],
            harness: HarnessSettings::default(),
            default_models: vec![],
        }
    }
 }
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -16,15 +16,16 @@ use candle_transformers::generation::{LogitsProcessor, Sampling};
 use candle_transformers::models::quantized_qwen3::ModelWeights as QuantizedQwen3Weights;
 use cortex_core::harness::{Harness, HarnessHealth, ModelInfo, ModelSpec};
 use cortex_core::openai::{
-    ChatCompletionChoice, ChatCompletionRequest, ChatCompletionResponse, ChatMessage,
+    ChatCompletionChoice, ChatCompletionChunk, ChatCompletionRequest, ChatCompletionResponse,
-    MessageContent, Usage,
+    ChatMessage, ChunkChoice, MessageContent, Usage,
 };
 use serde_json::json;
 use std::collections::HashMap;
 use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::{SystemTime, UNIX_EPOCH};
 use tokenizers::Tokenizer;
-use tokio::sync::{Mutex, RwLock};
+use tokio::sync::{Mutex, RwLock, mpsc};
 /// In-process candle harness. Owns the loaded model registry.
 pub struct CandleHarness {
@@ -212,6 +213,104 @@ impl CandleHarness {
            extra: serde_json::Value::Object(Default::default()),
        })
    }
    /// Start a streaming chat completion on an already-loaded model.
    ///
    /// On success the returned `mpsc::Receiver` yields
    /// `ChatCompletionChunk`s: first a chunk whose delta is
    /// `{"role": "assistant"}`, followed by whatever
    /// `run_inference_streaming` sends from its blocking task. The
    /// `[DONE]` terminator is not produced here; the HTTP handler is
    /// expected to append it when wrapping the receiver into SSE.
    ///
    /// # Errors
    ///
    /// - `InferenceError::ModelNotLoaded` when `request.model` is not in
    ///   the model registry.
    /// - `InferenceError::Other` when tokenizing the prompt fails or the
    ///   role chunk cannot be queued.
    ///
    /// Failures inside the blocking inference task are only logged as
    /// warnings: by then the receiver has already been handed out.
    pub async fn chat_completion_stream(
        &self,
        request: ChatCompletionRequest,
    ) -> Result<mpsc::Receiver<ChatCompletionChunk>, InferenceError> {
        // Clone the entry out of the registry; the read guard is a
        // temporary and is released at the end of this statement.
        let entry = self.models.read().await.get(&request.model).cloned();
        let loaded = match entry {
            Some(found) => found,
            None => return Err(InferenceError::ModelNotLoaded(request.model.clone())),
        };
        let prompt_text = format_qwen3_prompt(&request.messages);
        let prompt_tokens: Vec<u32> = match loaded.tokenizer.encode(prompt_text.as_str(), true) {
            Ok(encoded) => encoded.get_ids().to_vec(),
            Err(e) => return Err(InferenceError::Other(anyhow::anyhow!("tokenize: {e}"))),
        };
        // Sampling parameters, with defaults for anything the request
        // left unset.
        let max_new = request.max_tokens.unwrap_or(512) as usize;
        let temperature = request.temperature.unwrap_or(0.7);
        let top_p = request.top_p;
        let seed = unix_subsec_nanos();
        // First end-of-sequence marker the tokenizer knows, in
        // preference order.
        let eos_id = ["<|im_end|>", "<|endoftext|>"]
            .into_iter()
            .find_map(|marker| loaded.tokenizer.token_to_id(marker));
        // Owned handles that move into the blocking task below.
        let arch_handle = Arc::clone(&loaded.arch);
        let device = loaded.device.clone();
        let tokenizer = loaded.tokenizer.clone();
        let model_id = request.model.clone();
        let completion_id = format!("chatcmpl-{:x}", unix_subsec_nanos());
        let created = unix_now_secs();
        // Bounded channel: a slow SSE consumer back-pressures the
        // blocking producer instead of letting chunks pile up.
        let (tx, rx) = mpsc::channel::<ChatCompletionChunk>(32);
        // OpenAI streaming convention: the assistant role is announced
        // in its own chunk ahead of any content delta.
        let opening = ChatCompletionChunk {
            id: completion_id.clone(),
            object: "chat.completion.chunk".into(),
            created,
            model: model_id.clone(),
            choices: vec![ChunkChoice {
                index: 0,
                delta: json!({"role": "assistant"}),
                finish_reason: None,
                extra: serde_json::Value::Object(Default::default()),
            }],
            usage: None,
            extra: serde_json::Value::Object(Default::default()),
        };
        // Queue the role chunk before starting the heavy blocking work;
        // a send error is reported to the caller instead.
        if tx.send(opening).await.is_err() {
            return Err(InferenceError::Other(anyhow::anyhow!(
                "client disconnected"
            )));
        }
        tokio::task::spawn_blocking(move || {
            // The model lock is held for the whole generation.
            let mut guard = arch_handle.blocking_lock();
            let outcome = run_inference_streaming(
                &mut guard,
                &device,
                &tokenizer,
                &prompt_tokens,
                max_new,
                temperature,
                top_p,
                seed,
                eos_id,
                &completion_id,
                created,
                &model_id,
                &tx,
            );
            if let Err(e) = outcome {
                tracing::warn!(model = %model_id, error = %e, "streaming inference failed");
            }
        });
        Ok(rx)
    }
 }
 #[async_trait]
@@ -426,6 +525,130 @@ fn run_inference(
    Ok((generated, "length".into()))
 }
 /// Streaming counterpart to `run_inference`. Emits chunks via `tx` as
 /// tokens are generated and exits on EOS, after the token budget is
 /// spent, or when the receiver is dropped.
 ///
 /// The first token is sampled from the prompt forward pass and the
 /// loop then runs `max_new - 1` more steps (saturating), so one token
 /// may still be produced when `max_new` is 0.
 ///
 /// Detokenization re-decodes the whole generated sequence after every
 /// token and sends only the text appended since the previous chunk.
 /// Two guards keep that safe across byte-fallback boundaries:
 ///
 /// - While the decoded text ends in U+FFFD (the decoder's stand-in for
 ///   an incomplete multi-byte character) nothing is sent; the text is
 ///   held back until a later token completes the character.
 /// - The delta is taken with `strip_prefix`; if the new text does not
 ///   extend the old one, the cut point is moved back to the nearest
 ///   char boundary instead of slicing mid-character (which panics).
 ///
 /// Any text still held back when generation stops is flushed before
 /// the final chunk, which carries `finish_reason` (`"stop"` on EOS,
 /// `"length"` otherwise) and an empty delta.
 ///
 /// Uses `blocking_send`, so it is meant to run on a blocking thread
 /// (the caller uses `spawn_blocking`), not inside an async task.
 ///
 /// # Errors
 ///
 /// Returns an error when a forward pass, sampling, or decoding fails.
 /// A dropped receiver is not an error: the function returns `Ok(())`
 /// early without sending the final chunk.
 #[allow(clippy::too_many_arguments)]
 fn run_inference_streaming(
     arch: &mut ModelArch,
     device: &Device,
     tokenizer: &Tokenizer,
     prompt_tokens: &[u32],
     max_new: usize,
     temperature: f64,
     top_p: Option<f64>,
     seed: u64,
     eos_id: Option<u32>,
     id: &str,
     created: u64,
     model_id: &str,
     tx: &mpsc::Sender<ChatCompletionChunk>,
 ) -> Result<()> {
     let mut logits_processor = {
         // Non-positive temperature means greedy decoding.
         let sampling = if temperature <= 0.0 {
             Sampling::ArgMax
         } else {
             match top_p {
                 Some(p) => Sampling::TopP { p, temperature },
                 None => Sampling::All { temperature },
             }
         };
         LogitsProcessor::from_sampling(seed, sampling)
     };
     let mut all_tokens: Vec<u32> = Vec::new();
     let mut decoded_prefix = String::new();
     let mut finish_reason = "length".to_string();
     // Prompt pass: feed the whole prompt at position 0 and sample the
     // first generated token.
     let mut next_token = match arch {
         ModelArch::Qwen3Quantized(model) => {
             model.clear_kv_cache();
             let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
             let logits = model.forward(&input, 0)?;
             let logits = logits.squeeze(0)?;
             logits_processor.sample(&logits)?
         }
     };
     // Every chunk of one completion shares id / created / model; only
     // the delta and finish_reason vary.
     let make_chunk = |delta: serde_json::Value, finish: Option<String>| ChatCompletionChunk {
         id: id.into(),
         object: "chat.completion.chunk".into(),
         created,
         model: model_id.into(),
         choices: vec![ChunkChoice {
             index: 0,
             delta,
             finish_reason: finish,
             extra: serde_json::Value::Object(Default::default()),
         }],
         usage: None,
         extra: serde_json::Value::Object(Default::default()),
     };
     // Returns Ok(false) when the consumer hung up and generation
     // should stop. `flush` disables the U+FFFD hold-back so trailing
     // text is not lost at the end of generation.
     let emit_token =
         |all_tokens: &[u32], decoded_prefix: &mut String, flush: bool| -> Result<bool> {
             let full = tokenizer
                 .decode(all_tokens, true)
                 .map_err(|e| anyhow::anyhow!("decode: {e}"))?;
             // A trailing replacement character means the last token
             // ended mid-character; wait for the bytes that complete it.
             if !flush && full.ends_with(char::REPLACEMENT_CHARACTER) {
                 return Ok(true);
             }
             let delta = match full.strip_prefix(decoded_prefix.as_str()) {
                 Some(rest) => rest,
                 None => {
                     // The new text does not extend what was already
                     // sent. Back up to a char boundary so the slice
                     // below cannot panic.
                     let mut cut = decoded_prefix.len().min(full.len());
                     while !full.is_char_boundary(cut) {
                         cut -= 1;
                     }
                     &full[cut..]
                 }
             };
             if delta.is_empty() {
                 return Ok(true);
             }
             let chunk = make_chunk(json!({ "content": delta }), None);
             *decoded_prefix = full;
             // blocking_send returns Err if the consumer hung up — signal
             // the caller to stop generating.
             Ok(tx.blocking_send(chunk).is_ok())
         };
     if Some(next_token) == eos_id {
         finish_reason = "stop".into();
     } else {
         all_tokens.push(next_token);
         if !emit_token(&all_tokens, &mut decoded_prefix, false)? {
             return Ok(());
         }
         for index in 0..max_new.saturating_sub(1) {
             // Feed the previously sampled token at its absolute
             // position: prompt length plus tokens generated so far.
             next_token = match arch {
                 ModelArch::Qwen3Quantized(model) => {
                     let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
                     let logits = model.forward(&input, prompt_tokens.len() + index)?;
                     let logits = logits.squeeze(0)?;
                     logits_processor.sample(&logits)?
                 }
             };
             if Some(next_token) == eos_id {
                 finish_reason = "stop".into();
                 break;
             }
             all_tokens.push(next_token);
             if !emit_token(&all_tokens, &mut decoded_prefix, false)? {
                 return Ok(());
             }
         }
     }
     // Send anything that was held back waiting for a character to
     // complete before announcing the finish reason.
     if !emit_token(&all_tokens, &mut decoded_prefix, true)? {
         return Ok(());
     }
     let final_chunk = make_chunk(
         serde_json::Value::Object(Default::default()),
         Some(finish_reason),
     );
     // The consumer may already be gone; nothing left to do either way.
     let _ = tx.blocking_send(final_chunk);
     Ok(())
 }
 fn unix_now_secs() -> u64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
--- a/crates/neuron/src/lib.rs
+++ b/crates/neuron/src/lib.rs
@@ -3,3 +3,4 @@ pub mod config;
 pub mod discovery;
 pub mod harness;
 pub mod health;
 pub mod startup;
--- a/crates/neuron/src/main.rs
+++ b/crates/neuron/src/main.rs
@@ -1,6 +1,6 @@
 use anyhow::Result;
 use clap::Parser;
-use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health};
+use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health, startup};
 use std::sync::Arc;
 use std::time::Instant;
 use tokio::sync::RwLock;
@@ -55,6 +55,12 @@ async fn main() -> Result<()> {
    discovery_result.harnesses = registry.names();
    let candle = registry.candle();
    // Activation: load default models before binding the listener.
    // Each load may take tens of seconds to several minutes depending
    // on model size and HF cache state — keep TimeoutStartSec in the
    // systemd unit generous enough to cover the slowest entry.
    startup::load_default_models(&registry, &cfg.default_models).await;
    let health_cache = Arc::new(health::HealthCache::new());
    health_cache
        .set_has_gpus(!discovery_result.devices.is_empty())
--- a/crates/neuron/src/startup.rs
+++ b/crates/neuron/src/startup.rs
@@ -0,0 +1,38 @@
 //! Activation-time orchestration.
 //!
 //! Wired from `main.rs` after the harness registry is built and before
 //! the HTTP listener binds. Kept in its own module so the logic is
 //! unit-testable without spinning up a full neuron process.
 use crate::harness::HarnessRegistry;
 use cortex_core::harness::ModelSpec;
 use std::time::Instant;
 /// Load every entry of `specs` through `registry`, one after another.
 ///
 /// A failed load is logged as a warning and the next entry is still
 /// attempted, so this function never reports an error to its caller.
 /// Each outcome is logged together with the time that load took, so
 /// an operator can tell which model dominates activation. An empty
 /// `specs` slice returns immediately without logging anything.
 ///
 /// Loads are deliberately not run in parallel: concurrent loads would
 /// compete for VRAM.
 pub async fn load_default_models(registry: &HarnessRegistry, specs: &[ModelSpec]) {
     let total = specs.len();
     if total == 0 {
         return;
     }
     tracing::info!(count = total, "loading default models");
     for spec in specs {
         let started = Instant::now();
         let outcome = registry.load_model(spec).await;
         let elapsed_ms = started.elapsed().as_millis() as u64;
         if let Err(e) = outcome {
             tracing::warn!(
                 model = %spec.model_id,
                 error = %e,
                 elapsed_ms,
                 "failed to load default model, continuing"
             );
         } else {
             tracing::info!(
                 model = %spec.model_id,
                 elapsed_ms,
                 "loaded default model"
             );
         }
     }
 }
--- a/crates/neuron/tests/activation.rs
+++ b/crates/neuron/tests/activation.rs
@@ -0,0 +1,56 @@
 //! Activation-time behaviour: load_default_models continues past
 //! individual failures so a single broken catalogue entry doesn't
 //! prevent the rest of the fleet from starting.
 use cortex_core::harness::{HarnessConfig, ModelSpec};
 use neuron::config::HarnessSettings;
 use neuron::harness::HarnessRegistry;
 use neuron::startup;
 #[tokio::test]
 async fn test_load_default_models_skips_unknown_harness() {
    let registry = HarnessRegistry::from_configs(
        &[HarnessConfig {
            name: "candle".into(),
        }],
        "http://localhost:0",
        &HarnessSettings::default(),
    );
    // Both entries fail synchronously inside the registry — no network
    // call escapes (the harness lookup mismatches before hf-hub is
    // touched). The function should still return cleanly.
    let specs = vec![
        ModelSpec {
            model_id: "model-a".into(),
            harness: "no-such-harness".into(),
            quant: None,
            tensor_parallel: None,
            devices: None,
        },
        ModelSpec {
            model_id: "model-b".into(),
            harness: "no-such-harness".into(),
            quant: None,
            tensor_parallel: None,
            devices: None,
        },
    ];
    startup::load_default_models(&registry, &specs).await;
    let listed = registry
        .list_all_models()
        .await
        .expect("list_all_models should succeed");
    assert!(
        listed.is_empty(),
        "no models should be loaded after failed entries"
    );
 }
 #[tokio::test]
 async fn test_load_default_models_empty_is_noop() {
    let registry = HarnessRegistry::new();
    startup::load_default_models(&registry, &[]).await;
 }
--- a/crates/neuron/tests/api.rs
+++ b/crates/neuron/tests/api.rs
@@ -273,10 +273,11 @@ async fn test_chat_completions_model_not_loaded() {
    assert_eq!(resp.status(), 404);
 }
-/// `/v1/chat/completions` with `stream: true` returns 501 until Stage 4
+/// `/v1/chat/completions` with `stream: true` returns 404 when the
-/// wires up SSE.
+/// model isn't loaded — same surface as the non-streaming path. The
 /// streaming code only kicks in once the model lookup succeeds.
 #[tokio::test]
-async fn test_chat_completions_streaming_not_yet_implemented() {
+async fn test_chat_completions_streaming_model_not_loaded() {
    use cortex_core::harness::HarnessConfig;
    use neuron::config::HarnessSettings;
@@ -306,12 +307,12 @@ async fn test_chat_completions_streaming_not_yet_implemented() {
    let resp = reqwest::Client::new()
        .post(format!("{url}/v1/chat/completions"))
        .json(&json!({
-            "model": "anything",
+            "model": "definitely/not-loaded",
            "messages": [{"role": "user", "content": "hi"}],
            "stream": true
        }))
        .send()
        .await
        .unwrap();
-    assert_eq!(resp.status(), 501);
+    assert_eq!(resp.status(), 404);
 }
--- a/data/neuron.service
+++ b/data/neuron.service
@@ -10,6 +10,11 @@ Restart=on-failure
 RestartSec=5
 User=neuron
 Group=neuron
 # Loading default_models from neuron.toml happens before the HTTP
 # listener binds; large models can take many minutes to download and
 # materialise on first activation. systemd's default TimeoutStartSec
 # (90s) is far too short; allow 30 minutes.
 TimeoutStartSec=1800s
 [Install]
 WantedBy=multi-user.target
--- a/neuron.example.toml
+++ b/neuron.example.toml
@@ -22,3 +22,19 @@ name = "candle"
 # HuggingFace cache directory for model weights. When unset, hf-hub's
 # default (~/.cache/huggingface) is used.
 # hf_cache = "/var/lib/neuron/hf-cache"
 # -- Default models ----------------------------------------------------------
 # Models listed here are loaded automatically when the neuron service
 # activates. Loading is sequential: a failing entry is logged and
 # skipped so the remaining entries still load, but every entry (slow
 # or failing) delays the point at which neuron starts serving HTTP,
 # so keep the list short. Operators can load additional models on
 # demand via POST /models/load.
 #
 # Make sure data/neuron.service's TimeoutStartSec is generous enough to
 # cover the slowest entry's first-time download + materialisation.
 # [[default_models]]
 # model_id = "Qwen/Qwen3-0.6B-GGUF"
 # harness = "candle"
 # quant = "Q4_K_M"
 # devices = [0]