feat(neuron): OpenAI-compatible non-streaming chat completion
Stage 3 of the candle-native pivot. neuron now serves POST /v1/chat/completions backed by candle's quantized_qwen3 forward pass on a per-model serialised generation loop, returning the standard OpenAI ChatCompletionResponse envelope.

Pipeline per request:
- Look up the LoadedModel by request.model (404 if absent).
- Apply the Qwen3 chat template across all messages.
- Tokenize, then spawn_blocking onto tokio's blocking pool to acquire the per-model arch lock and run prefill + greedy/temperature/top-p sampling via LogitsProcessor.
- Stop on <|im_end|>/<|endoftext|> EOS or max_tokens (finish_reason "stop" vs "length").
- Decode with skip_special_tokens=true, build OpenAI response with prompt/completion/total usage counts.

Supporting changes:
- HarnessRegistry now stores Arc<dyn Harness> and caches a typed Arc<CandleHarness> so inference routes bypass dyn-Trait dispatch.
- LoadedModel.arch becomes Arc<Mutex<ModelArch>> so the lock guard can be moved into spawn_blocking.
- NeuronState gains an Option<Arc<CandleHarness>> field for the new inference route.
- Typed InferenceError lets the handler map ModelNotLoaded → 404 and other failures → 500 without string-matching anyhow messages.
- stream=true returns 501 until Stage 4 wires up SSE.
- Two leftover mistral.rs string references in proxy.rs and cortex-cli (missed during the Stage 1 sweep) are corrected here.

Three new default-feature tests cover the no-candle 503, model-not-loaded 404, and stream=true 501 paths. The cuda-integration test from Stage 2 still covers real load/unload; a streaming-feature-gated test exercising actual generation will arrive with Stage 4.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -14,6 +14,7 @@ async fn spawn_neuron(discovery: DiscoveryResponse) -> String {
|
||||
discovery,
|
||||
health_cache,
|
||||
registry: RwLock::new(registry),
|
||||
candle: None,
|
||||
});
|
||||
|
||||
let app = api::neuron_routes().with_state(state);
|
||||
@@ -152,11 +153,13 @@ async fn test_candle_harness_registers_and_rejects_bogus_model() {
|
||||
&HarnessSettings::default(),
|
||||
);
|
||||
|
||||
let candle = registry.candle();
|
||||
let health_cache = Arc::new(HealthCache::new());
|
||||
let state = Arc::new(NeuronState {
|
||||
discovery: fake_discovery(),
|
||||
health_cache,
|
||||
registry: RwLock::new(registry),
|
||||
candle,
|
||||
});
|
||||
|
||||
let app = api::neuron_routes().with_state(state);
|
||||
@@ -197,3 +200,118 @@ async fn test_candle_harness_registers_and_rejects_bogus_model() {
|
||||
let models: Vec<serde_json::Value> = resp.json().await.unwrap();
|
||||
assert!(models.is_empty());
|
||||
}
|
||||
|
||||
/// `/v1/chat/completions` returns 503 when no candle harness is registered.
|
||||
#[tokio::test]
|
||||
async fn test_chat_completions_no_candle_harness() {
|
||||
let registry = HarnessRegistry::new();
|
||||
let health_cache = Arc::new(HealthCache::new());
|
||||
let state = Arc::new(NeuronState {
|
||||
discovery: fake_discovery(),
|
||||
health_cache,
|
||||
registry: RwLock::new(registry),
|
||||
candle: None,
|
||||
});
|
||||
let app = api::neuron_routes().with_state(state);
|
||||
let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||
let addr = listener.local_addr().unwrap();
|
||||
tokio::spawn(async move {
|
||||
axum::serve(listener, app).await.unwrap();
|
||||
});
|
||||
let url = format!("http://{addr}");
|
||||
|
||||
let resp = reqwest::Client::new()
|
||||
.post(format!("{url}/v1/chat/completions"))
|
||||
.json(&json!({
|
||||
"model": "anything",
|
||||
"messages": [{"role": "user", "content": "hi"}]
|
||||
}))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), 503);
|
||||
}
|
||||
|
||||
/// `/v1/chat/completions` returns 404 when the requested model isn't loaded.
|
||||
#[tokio::test]
|
||||
async fn test_chat_completions_model_not_loaded() {
|
||||
use cortex_core::harness::HarnessConfig;
|
||||
use neuron::config::HarnessSettings;
|
||||
|
||||
let registry = HarnessRegistry::from_configs(
|
||||
&[HarnessConfig {
|
||||
name: "candle".into(),
|
||||
}],
|
||||
"http://localhost:0",
|
||||
&HarnessSettings::default(),
|
||||
);
|
||||
let candle = registry.candle();
|
||||
let health_cache = Arc::new(HealthCache::new());
|
||||
let state = Arc::new(NeuronState {
|
||||
discovery: fake_discovery(),
|
||||
health_cache,
|
||||
registry: RwLock::new(registry),
|
||||
candle,
|
||||
});
|
||||
let app = api::neuron_routes().with_state(state);
|
||||
let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||
let addr = listener.local_addr().unwrap();
|
||||
tokio::spawn(async move {
|
||||
axum::serve(listener, app).await.unwrap();
|
||||
});
|
||||
let url = format!("http://{addr}");
|
||||
|
||||
let resp = reqwest::Client::new()
|
||||
.post(format!("{url}/v1/chat/completions"))
|
||||
.json(&json!({
|
||||
"model": "definitely/not-loaded",
|
||||
"messages": [{"role": "user", "content": "hi"}]
|
||||
}))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), 404);
|
||||
}
|
||||
|
||||
/// `/v1/chat/completions` with `stream: true` returns 501 until Stage 4
|
||||
/// wires up SSE.
|
||||
#[tokio::test]
|
||||
async fn test_chat_completions_streaming_not_yet_implemented() {
|
||||
use cortex_core::harness::HarnessConfig;
|
||||
use neuron::config::HarnessSettings;
|
||||
|
||||
let registry = HarnessRegistry::from_configs(
|
||||
&[HarnessConfig {
|
||||
name: "candle".into(),
|
||||
}],
|
||||
"http://localhost:0",
|
||||
&HarnessSettings::default(),
|
||||
);
|
||||
let candle = registry.candle();
|
||||
let health_cache = Arc::new(HealthCache::new());
|
||||
let state = Arc::new(NeuronState {
|
||||
discovery: fake_discovery(),
|
||||
health_cache,
|
||||
registry: RwLock::new(registry),
|
||||
candle,
|
||||
});
|
||||
let app = api::neuron_routes().with_state(state);
|
||||
let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||
let addr = listener.local_addr().unwrap();
|
||||
tokio::spawn(async move {
|
||||
axum::serve(listener, app).await.unwrap();
|
||||
});
|
||||
let url = format!("http://{addr}");
|
||||
|
||||
let resp = reqwest::Client::new()
|
||||
.post(format!("{url}/v1/chat/completions"))
|
||||
.json(&json!({
|
||||
"model": "anything",
|
||||
"messages": [{"role": "user", "content": "hi"}],
|
||||
"stream": true
|
||||
}))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), 501);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user