feat(neuron): OpenAI-compatible non-streaming chat completion

Stage 3 of the candle-native pivot. neuron now serves POST /v1/chat/completions backed by candle's quantized_qwen3 forward pass on a per-model serialised generation loop, returning the standard OpenAI ChatCompletionResponse envelope. Pipeline per request: - Look up the LoadedModel by request.model (404 if absent). - Apply the Qwen3 chat template across all messages. - Tokenize, then spawn_blocking onto tokio's blocking pool to acquire the per-model arch lock and run prefill + greedy/temperature/top-p sampling via LogitsProcessor. - Stop on <|im_end|>/<|endoftext|> EOS or max_tokens (finish_reason "stop" vs "length"). - Decode with skip_special_tokens=true, build OpenAI response with prompt/completion/total usage counts. Supporting changes: - HarnessRegistry now stores Arc<dyn Harness> and caches a typed Arc<CandleHarness> so inference routes bypass dyn-Trait dispatch. - LoadedModel.arch becomes Arc<Mutex<ModelArch>> so the lock guard can be moved into spawn_blocking. - NeuronState gains an Option<Arc<CandleHarness>> field for the new inference route. - Typed InferenceError lets the handler map ModelNotLoaded → 404 and other failures → 500 without string-matching anyhow messages. - stream=true returns 501 until Stage 4 wires up SSE. - Two leftover mistral.rs string references in proxy.rs and cortex-cli (missed during the Stage 1 sweep) are corrected here. Three new default-feature tests cover the no-candle 503, model-not- loaded 404, and stream=true 501 paths. The cuda-integration test from Stage 2 still covers real load/unload; a streaming-feature gated test exercising actual generation will arrive with Stage 4. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 16:47:58 +03:00
parent 5c2bd1a1da
commit 729317d1ef
10 changed files with 412 additions and 22 deletions
--- a/crates/neuron/src/api.rs
+++ b/crates/neuron/src/api.rs
@@ -1,6 +1,7 @@
 //! HTTP API handlers for the neuron daemon.

 use crate::harness::HarnessRegistry;
+use crate::harness::candle::{CandleHarness, InferenceError};
 use crate::health::HealthCache;
 use axum::Router;
 use axum::extract::{Path, State};
@@ -9,6 +10,7 @@ use axum::response::{IntoResponse, Json};
 use axum::routing::{get, post};
 use cortex_core::discovery::{DiscoveryResponse, HealthResponse};
 use cortex_core::harness::ModelSpec;
+use cortex_core::openai::ChatCompletionRequest;
 use serde_json::{Value, json};
 use std::sync::Arc;
 use tokio::sync::RwLock;
@@ -18,6 +20,10 @@ pub struct NeuronState {
    pub discovery: DiscoveryResponse,
    pub health_cache: Arc<HealthCache>,
    pub registry: RwLock<HarnessRegistry>,
+    /// Typed handle to the candle harness for inference routes. Cached at
+    /// startup so `/v1/chat/completions` doesn't have to hold the registry
+    /// read lock or perform dyn-Trait dispatch per request.
+    pub candle: Option<Arc<CandleHarness>>,
 }

 /// Build the neuron API router.
@@ -29,6 +35,7 @@ pub fn neuron_routes() -> Router<Arc<NeuronState>> {
        .route("/models/load", post(load_model))
        .route("/models/unload", post(unload_model))
        .route("/models/{model_id}/endpoint", get(model_endpoint))
+        .route("/v1/chat/completions", post(chat_completions))
 }

 async fn discovery_handler(State(state): State<Arc<NeuronState>>) -> Json<DiscoveryResponse> {
@@ -102,3 +109,40 @@ async fn model_endpoint(
            .into_response(),
    }
 }
+
+/// OpenAI-compatible chat completions. Non-streaming for Stage 3; the
+/// streaming path is added in Stage 4.
+async fn chat_completions(
+    State(state): State<Arc<NeuronState>>,
+    Json(req): Json<ChatCompletionRequest>,
+) -> impl IntoResponse {
+    let Some(candle) = state.candle.as_ref().map(Arc::clone) else {
+        return (
+            StatusCode::SERVICE_UNAVAILABLE,
+            Json(json!({"error": "candle harness not enabled on this neuron"})),
+        )
+            .into_response();
+    };
+
+    if req.stream.unwrap_or(false) {
+        return (
+            StatusCode::NOT_IMPLEMENTED,
+            Json(json!({"error": "streaming responses arrive in Stage 4"})),
+        )
+            .into_response();
+    }
+
+    match candle.chat_completion(req).await {
+        Ok(resp) => Json(resp).into_response(),
+        Err(InferenceError::ModelNotLoaded(id)) => (
+            StatusCode::NOT_FOUND,
+            Json(json!({"error": format!("model '{id}' not loaded on this neuron")})),
+        )
+            .into_response(),
+        Err(InferenceError::Other(e)) => (
+            StatusCode::INTERNAL_SERVER_ERROR,
+            Json(json!({"error": e.to_string()})),
+        )
+            .into_response(),
+    }
+}