feat(neuron): wire candle harness load/unload via GGUF
Stage 2 of the candle-native pivot. Fleshes out CandleHarness with a LoadedModel registry keyed by model_id, hf-hub-backed GGUF download, and Qwen3 quantized weight construction via candle-transformers' quantized_qwen3 module. unload_model drops the entry; Drop on the candle ModelWeights frees device memory. Device selection prefers CUDA (gated behind the new `cuda` feature), falling back to CPU when CUDA is unavailable so default builds work on non-GPU hosts. The candle CUDA toolchain isn't pulled in unless `--features cuda` is passed, keeping CI green on CPU runners. Config gains a [harness.candle] block with an optional hf_cache path. HarnessRegistry::from_configs now takes HarnessSettings so per-harness config flows through. A gated tests/candle_lifecycle.rs exercises real load → list → unload → list-empty when run with `--features cuda-integration` against a host with HF network access. The default-feature test in tests/api.rs covers the wrong-harness rejection path without needing the network. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
1587
Cargo.lock
generated
1587
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -12,6 +12,18 @@ path = "src/lib.rs"
|
|||||||
name = "neuron"
|
name = "neuron"
|
||||||
path = "src/main.rs"
|
path = "src/main.rs"
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = []
|
||||||
|
# Enables CUDA acceleration in candle. Without this feature, candle
|
||||||
|
# compiles for CPU only and the candle harness always selects Device::Cpu.
|
||||||
|
cuda = [
|
||||||
|
"candle-core/cuda",
|
||||||
|
"candle-nn/cuda",
|
||||||
|
"candle-transformers/cuda",
|
||||||
|
]
|
||||||
|
# Gates the real load/unload integration test in tests/candle_lifecycle.rs.
|
||||||
|
cuda-integration = ["cuda"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
cortex-core.workspace = true
|
cortex-core.workspace = true
|
||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
@@ -27,6 +39,15 @@ clap.workspace = true
|
|||||||
figment.workspace = true
|
figment.workspace = true
|
||||||
toml.workspace = true
|
toml.workspace = true
|
||||||
|
|
||||||
|
# candle for in-process inference. CUDA support is gated behind the
|
||||||
|
# crate's `cuda` feature (default off) so the workspace builds on
|
||||||
|
# non-CUDA hosts and CI runners.
|
||||||
|
candle-core = "0.10.2"
|
||||||
|
candle-nn = "0.10.2"
|
||||||
|
candle-transformers = "0.10.2"
|
||||||
|
tokenizers = { version = "0.22", default-features = false, features = ["onig"] }
|
||||||
|
hf-hub = { version = "0.4", features = ["tokio"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
tokio = { workspace = true, features = ["test-util"] }
|
tokio = { workspace = true, features = ["test-util"] }
|
||||||
reqwest.workspace = true
|
reqwest.workspace = true
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ use figment::{
|
|||||||
providers::{Env, Format, Toml},
|
providers::{Env, Format, Toml},
|
||||||
};
|
};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::path::Path;
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct NeuronConfig {
|
pub struct NeuronConfig {
|
||||||
@@ -14,6 +14,25 @@ pub struct NeuronConfig {
|
|||||||
pub port: u16,
|
pub port: u16,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub harnesses: Vec<HarnessConfig>,
|
pub harnesses: Vec<HarnessConfig>,
|
||||||
|
/// Per-harness configuration. Currently only `candle` is recognised.
|
||||||
|
#[serde(default)]
|
||||||
|
pub harness: HarnessSettings,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Settings for individual harness implementations. Each harness owns
|
||||||
|
/// its own sub-table so users only configure the harnesses they enable.
|
||||||
|
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||||
|
pub struct HarnessSettings {
|
||||||
|
#[serde(default)]
|
||||||
|
pub candle: CandleHarnessConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||||
|
pub struct CandleHarnessConfig {
|
||||||
|
/// HuggingFace cache directory for model weights.
|
||||||
|
/// When unset, defers to hf-hub's default (~/.cache/huggingface/hub).
|
||||||
|
#[serde(default)]
|
||||||
|
pub hf_cache: Option<PathBuf>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn default_port() -> u16 {
|
fn default_port() -> u16 {
|
||||||
@@ -35,6 +54,7 @@ impl Default for NeuronConfig {
|
|||||||
Self {
|
Self {
|
||||||
port: 13131,
|
port: 13131,
|
||||||
harnesses: vec![],
|
harnesses: vec![],
|
||||||
|
harness: HarnessSettings::default(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,24 +1,121 @@
|
|||||||
//! Candle harness — in-process inference using huggingface/candle.
|
//! Candle harness — in-process inference using huggingface/candle.
|
||||||
//!
|
//!
|
||||||
//! This is the sole `Harness` implementation. Unlike the previous
|
//! This is the sole `Harness` implementation. Inference runs inside
|
||||||
//! mistralrs/llamacpp harnesses, candle inference runs inside the neuron
|
//! the neuron process; there is no external subprocess. Stage 2 wires
|
||||||
//! process itself — no external subprocess, no systemd indirection.
|
//! up GGUF (currently Qwen3 only) model load/unload via
|
||||||
//!
|
//! `candle-transformers::models::quantized_qwen3`. Stage 3 adds the
|
||||||
//! Stage 1 ships this as an inert skeleton; Stage 2 wires up actual
|
//! inference endpoint.
|
||||||
//! model load/unload via `candle-transformers`.
|
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::{Context, Result};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
use candle_core::Device;
|
||||||
|
use candle_core::quantized::gguf_file;
|
||||||
|
use candle_transformers::models::quantized_qwen3::ModelWeights as QuantizedQwen3Weights;
|
||||||
use cortex_core::harness::{Harness, HarnessHealth, ModelInfo, ModelSpec};
|
use cortex_core::harness::{Harness, HarnessHealth, ModelInfo, ModelSpec};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokenizers::Tokenizer;
|
||||||
|
use tokio::sync::{Mutex, RwLock};
|
||||||
|
|
||||||
|
/// In-process candle harness. Owns the loaded model registry.
|
||||||
pub struct CandleHarness {
|
pub struct CandleHarness {
|
||||||
/// URL where this neuron serves inference (its own bind address).
|
models: Arc<RwLock<HashMap<String, Arc<LoadedModel>>>>,
|
||||||
|
hf_cache: Option<PathBuf>,
|
||||||
bind_url: String,
|
bind_url: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A loaded model with its tokenizer, device placement, and architecture-
|
||||||
|
/// specific weights. The `arch` field is mutexed because future inference
|
||||||
|
/// calls take `&mut self` on the underlying ModelWeights (KV cache state).
|
||||||
|
pub struct LoadedModel {
|
||||||
|
pub model_id: String,
|
||||||
|
pub arch: Mutex<ModelArch>,
|
||||||
|
pub tokenizer: Tokenizer,
|
||||||
|
pub device: Device,
|
||||||
|
pub quant: Option<String>,
|
||||||
|
pub devices: Vec<u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Architecture-specific weights. Stage 2 supports only Qwen3 quantized;
|
||||||
|
/// Stage 8 broadens this to additional families and non-quantized variants.
|
||||||
|
pub enum ModelArch {
|
||||||
|
Qwen3Quantized(QuantizedQwen3Weights),
|
||||||
|
}
|
||||||
|
|
||||||
impl CandleHarness {
|
impl CandleHarness {
|
||||||
pub fn new(bind_url: String) -> Self {
|
pub fn new(bind_url: String, hf_cache: Option<PathBuf>) -> Self {
|
||||||
Self { bind_url }
|
Self {
|
||||||
|
models: Arc::new(RwLock::new(HashMap::new())),
|
||||||
|
hf_cache,
|
||||||
|
bind_url,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pick a candle `Device` for the requested indices. Without the
|
||||||
|
/// `cuda` feature, or if CUDA initialisation fails, falls back to CPU.
|
||||||
|
fn pick_device(devices: &[u32]) -> Result<Device> {
|
||||||
|
let _idx = devices.first().copied().unwrap_or(0) as usize;
|
||||||
|
#[cfg(feature = "cuda")]
|
||||||
|
{
|
||||||
|
match Device::new_cuda(_idx) {
|
||||||
|
Ok(d) => return Ok(d),
|
||||||
|
Err(e) => tracing::warn!(
|
||||||
|
device = _idx,
|
||||||
|
error = %e,
|
||||||
|
"CUDA device unavailable, falling back to CPU"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Device::Cpu)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Resolve a model spec to local GGUF and tokenizer file paths via
|
||||||
|
/// hf-hub. Files are cached after first download; repo info is always fetched.
|
||||||
|
async fn resolve_files(&self, spec: &ModelSpec) -> Result<(PathBuf, PathBuf)> {
|
||||||
|
let mut builder = hf_hub::api::tokio::ApiBuilder::new();
|
||||||
|
if let Some(cache) = &self.hf_cache {
|
||||||
|
builder = builder.with_cache_dir(cache.clone());
|
||||||
|
}
|
||||||
|
let api = builder.build().context("build hf-hub API")?;
|
||||||
|
let repo = api.model(spec.model_id.clone());
|
||||||
|
|
||||||
|
let info = repo
|
||||||
|
.info()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("fetch HF repo info for {}", spec.model_id))?;
|
||||||
|
|
||||||
|
let quant = spec.quant.as_deref().unwrap_or("");
|
||||||
|
let quant_lc = quant.to_lowercase();
|
||||||
|
let gguf_filename = info
|
||||||
|
.siblings
|
||||||
|
.iter()
|
||||||
|
.map(|s| s.rfilename.as_str())
|
||||||
|
.filter(|name| name.to_lowercase().ends_with(".gguf"))
|
||||||
|
.find(|name| quant_lc.is_empty() || name.to_lowercase().contains(&quant_lc))
|
||||||
|
.ok_or_else(|| {
|
||||||
|
anyhow::anyhow!(
|
||||||
|
"no GGUF file matching quant {:?} in repo {}",
|
||||||
|
spec.quant,
|
||||||
|
spec.model_id
|
||||||
|
)
|
||||||
|
})?
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
model = %spec.model_id,
|
||||||
|
file = %gguf_filename,
|
||||||
|
"resolving GGUF (may be cached)"
|
||||||
|
);
|
||||||
|
let gguf_path = repo
|
||||||
|
.get(&gguf_filename)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("fetch GGUF {gguf_filename}"))?;
|
||||||
|
let tokenizer_path = repo
|
||||||
|
.get("tokenizer.json")
|
||||||
|
.await
|
||||||
|
.context("fetch tokenizer.json")?;
|
||||||
|
Ok((gguf_path, tokenizer_path))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -37,18 +134,98 @@ impl Harness for CandleHarness {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn list_models(&self) -> Result<Vec<ModelInfo>> {
|
async fn list_models(&self) -> Result<Vec<ModelInfo>> {
|
||||||
Ok(Vec::new())
|
let models = self.models.read().await;
|
||||||
|
Ok(models
|
||||||
|
.values()
|
||||||
|
.map(|m| ModelInfo {
|
||||||
|
id: m.model_id.clone(),
|
||||||
|
harness: "candle".into(),
|
||||||
|
status: "loaded".into(),
|
||||||
|
devices: m.devices.clone(),
|
||||||
|
vram_used_mb: None,
|
||||||
|
})
|
||||||
|
.collect())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn load_model(&self, _spec: &ModelSpec) -> Result<()> {
|
async fn load_model(&self, spec: &ModelSpec) -> Result<()> {
|
||||||
anyhow::bail!("candle harness load_model not implemented yet (Stage 2)")
|
if spec.harness != "candle" {
|
||||||
|
anyhow::bail!("expected harness=candle, got harness={}", spec.harness);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
let models = self.models.read().await;
|
||||||
|
if models.contains_key(&spec.model_id) {
|
||||||
|
anyhow::bail!("model '{}' already loaded", spec.model_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let devices = spec.devices.clone().unwrap_or_else(|| vec![0]);
|
||||||
|
let device = Self::pick_device(&devices)?;
|
||||||
|
|
||||||
|
let (gguf_path, tokenizer_path) = self.resolve_files(spec).await?;
|
||||||
|
|
||||||
|
let tokenizer = Tokenizer::from_file(&tokenizer_path)
|
||||||
|
.map_err(|e| anyhow::anyhow!("load tokenizer: {e}"))?;
|
||||||
|
|
||||||
|
// File I/O + GGUF parsing + tensor materialisation are blocking work,
|
||||||
|
// so run them on a blocking task to avoid stalling the runtime.
|
||||||
|
let device_for_load = device.clone();
|
||||||
|
let gguf_path_for_load = gguf_path.clone();
|
||||||
|
let model_id_for_log = spec.model_id.clone();
|
||||||
|
let arch = tokio::task::spawn_blocking(move || -> Result<ModelArch> {
|
||||||
|
tracing::info!(model = %model_id_for_log, path = ?gguf_path_for_load, "loading GGUF");
|
||||||
|
let mut file = std::fs::File::open(&gguf_path_for_load).context("open GGUF file")?;
|
||||||
|
let content = gguf_file::Content::read(&mut file)
|
||||||
|
.map_err(|e| anyhow::anyhow!("parse GGUF: {e}"))?;
|
||||||
|
|
||||||
|
let architecture = content
|
||||||
|
.metadata
|
||||||
|
.get("general.architecture")
|
||||||
|
.and_then(|v| v.to_string().ok().cloned())
|
||||||
|
.unwrap_or_default();
|
||||||
|
tracing::info!(architecture = %architecture, "GGUF architecture");
|
||||||
|
|
||||||
|
match architecture.as_str() {
|
||||||
|
"qwen3" => {
|
||||||
|
let weights =
|
||||||
|
QuantizedQwen3Weights::from_gguf(content, &mut file, &device_for_load)
|
||||||
|
.map_err(|e| anyhow::anyhow!("from_gguf qwen3: {e}"))?;
|
||||||
|
Ok(ModelArch::Qwen3Quantized(weights))
|
||||||
|
}
|
||||||
|
other => anyhow::bail!(
|
||||||
|
"unsupported GGUF architecture '{other}'; Stage 2 only supports qwen3"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.context("blocking load task panicked")??;
|
||||||
|
|
||||||
|
let loaded = Arc::new(LoadedModel {
|
||||||
|
model_id: spec.model_id.clone(),
|
||||||
|
arch: Mutex::new(arch),
|
||||||
|
tokenizer,
|
||||||
|
device,
|
||||||
|
quant: spec.quant.clone(),
|
||||||
|
devices,
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut models = self.models.write().await;
|
||||||
|
models.insert(spec.model_id.clone(), loaded);
|
||||||
|
tracing::info!(model = %spec.model_id, "model loaded");
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn unload_model(&self, _model_id: &str) -> Result<()> {
|
async fn unload_model(&self, model_id: &str) -> Result<()> {
|
||||||
anyhow::bail!("candle harness unload_model not implemented yet (Stage 2)")
|
let mut models = self.models.write().await;
|
||||||
|
if models.remove(model_id).is_none() {
|
||||||
|
anyhow::bail!("model '{model_id}' not loaded");
|
||||||
|
}
|
||||||
|
tracing::info!(model = %model_id, "model unloaded");
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn inference_endpoint(&self, _model_id: &str) -> Option<String> {
|
async fn inference_endpoint(&self, model_id: &str) -> Option<String> {
|
||||||
Some(self.bind_url.clone())
|
let models = self.models.read().await;
|
||||||
|
models.contains_key(model_id).then(|| self.bind_url.clone())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -84,12 +84,19 @@ impl HarnessRegistry {
|
|||||||
/// `bind_url` is the URL where this neuron serves inference (its own
|
/// `bind_url` is the URL where this neuron serves inference (its own
|
||||||
/// listen address). In-process harnesses (currently the only kind)
|
/// listen address). In-process harnesses (currently the only kind)
|
||||||
/// return this URL from `inference_endpoint`.
|
/// return this URL from `inference_endpoint`.
|
||||||
pub fn from_configs(configs: &[HarnessConfig], bind_url: &str) -> Self {
|
pub fn from_configs(
|
||||||
|
configs: &[HarnessConfig],
|
||||||
|
bind_url: &str,
|
||||||
|
settings: &crate::config::HarnessSettings,
|
||||||
|
) -> Self {
|
||||||
let mut registry = Self::new();
|
let mut registry = Self::new();
|
||||||
for config in configs {
|
for config in configs {
|
||||||
match config.name.as_str() {
|
match config.name.as_str() {
|
||||||
"candle" => {
|
"candle" => {
|
||||||
registry.register(Box::new(candle::CandleHarness::new(bind_url.to_string())));
|
registry.register(Box::new(candle::CandleHarness::new(
|
||||||
|
bind_url.to_string(),
|
||||||
|
settings.candle.hf_cache.clone(),
|
||||||
|
)));
|
||||||
}
|
}
|
||||||
other => {
|
other => {
|
||||||
tracing::warn!(harness = other, "unknown harness type, skipping");
|
tracing::warn!(harness = other, "unknown harness type, skipping");
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ async fn main() -> Result<()> {
|
|||||||
// Build harness registry from config. In-process harnesses (candle)
|
// Build harness registry from config. In-process harnesses (candle)
|
||||||
// need to know neuron's own bind URL so they can return it from
|
// need to know neuron's own bind URL so they can return it from
|
||||||
// inference_endpoint.
|
// inference_endpoint.
|
||||||
let registry = HarnessRegistry::from_configs(&cfg.harnesses, &bind_url);
|
let registry = HarnessRegistry::from_configs(&cfg.harnesses, &bind_url, &cfg.harness);
|
||||||
discovery_result.harnesses = registry.names();
|
discovery_result.harnesses = registry.names();
|
||||||
|
|
||||||
let health_cache = Arc::new(health::HealthCache::new());
|
let health_cache = Arc::new(health::HealthCache::new());
|
||||||
|
|||||||
@@ -135,17 +135,21 @@ async fn test_models_empty_registry() {
|
|||||||
assert!(body.as_array().unwrap().is_empty());
|
assert!(body.as_array().unwrap().is_empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Verify the candle harness registers and the load endpoint returns a
|
/// Verify the candle harness registers, list is empty by default, and a
|
||||||
/// "not implemented" error in Stage 1 (Stage 2 wires up actual loading).
|
/// load request naming the wrong harness returns a 400 error
|
||||||
|
/// without crashing the daemon. Real load/unload exercising actual GGUF
|
||||||
|
/// download is covered by `tests/candle_lifecycle.rs` (cuda-integration).
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_candle_harness_registers_but_load_unimplemented() {
|
async fn test_candle_harness_registers_and_rejects_bogus_model() {
|
||||||
use cortex_core::harness::HarnessConfig;
|
use cortex_core::harness::HarnessConfig;
|
||||||
|
use neuron::config::HarnessSettings;
|
||||||
|
|
||||||
let registry = HarnessRegistry::from_configs(
|
let registry = HarnessRegistry::from_configs(
|
||||||
&[HarnessConfig {
|
&[HarnessConfig {
|
||||||
name: "candle".into(),
|
name: "candle".into(),
|
||||||
}],
|
}],
|
||||||
"http://localhost:13131",
|
"http://localhost:13131",
|
||||||
|
&HarnessSettings::default(),
|
||||||
);
|
);
|
||||||
|
|
||||||
let health_cache = Arc::new(HealthCache::new());
|
let health_cache = Arc::new(HealthCache::new());
|
||||||
@@ -165,7 +169,6 @@ async fn test_candle_harness_registers_but_load_unimplemented() {
|
|||||||
|
|
||||||
let client = reqwest::Client::new();
|
let client = reqwest::Client::new();
|
||||||
|
|
||||||
// GET /models — candle harness has no models loaded yet.
|
|
||||||
let resp = client
|
let resp = client
|
||||||
.get(format!("{neuron_url}/models"))
|
.get(format!("{neuron_url}/models"))
|
||||||
.send()
|
.send()
|
||||||
@@ -175,12 +178,22 @@ async fn test_candle_harness_registers_but_load_unimplemented() {
|
|||||||
let models: Vec<serde_json::Value> = resp.json().await.unwrap();
|
let models: Vec<serde_json::Value> = resp.json().await.unwrap();
|
||||||
assert!(models.is_empty());
|
assert!(models.is_empty());
|
||||||
|
|
||||||
// POST /models/load — Stage 1 skeleton returns an error.
|
// Sending a wrong-harness spec should be rejected synchronously
|
||||||
|
// without touching the network or the model registry.
|
||||||
let resp = client
|
let resp = client
|
||||||
.post(format!("{neuron_url}/models/load"))
|
.post(format!("{neuron_url}/models/load"))
|
||||||
.json(&json!({"model_id": "some-model", "harness": "candle"}))
|
.json(&json!({"model_id": "definitely/not-real", "harness": "not-candle"}))
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(resp.status(), 400);
|
assert_eq!(resp.status(), 400);
|
||||||
|
|
||||||
|
// Registry still empty.
|
||||||
|
let resp = client
|
||||||
|
.get(format!("{neuron_url}/models"))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let models: Vec<serde_json::Value> = resp.json().await.unwrap();
|
||||||
|
assert!(models.is_empty());
|
||||||
}
|
}
|
||||||
|
|||||||
90
crates/neuron/tests/candle_lifecycle.rs
Normal file
90
crates/neuron/tests/candle_lifecycle.rs
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
//! Real model load/unload lifecycle through the candle harness.
|
||||||
|
//!
|
||||||
|
//! Gated behind the `cuda-integration` feature because it downloads a
|
||||||
|
//! real (small) GGUF from HuggingFace and materialises tensors on the
|
||||||
|
//! configured device. Run on a host with network access and either a
|
||||||
|
//! CUDA GPU (when built with `--features cuda`) or enough CPU RAM to
|
||||||
|
//! hold the model.
|
||||||
|
//!
|
||||||
|
//! Usage:
|
||||||
|
//! cargo test -p neuron --features cuda-integration --test candle_lifecycle
|
||||||
|
//!
|
||||||
|
//! Optional environment variables:
|
||||||
|
//! NEURON_TEST_MODEL_ID — HuggingFace repo to load (default: a small
|
||||||
|
//! public Qwen3 GGUF repo).
|
||||||
|
//! NEURON_TEST_QUANT — quant substring matched against GGUF
|
||||||
|
//! filenames (default: "Q4_K_M").
|
||||||
|
//! HF_HOME — HuggingFace cache directory.
|
||||||
|
|
||||||
|
#![cfg(feature = "cuda-integration")]
|
||||||
|
|
||||||
|
use cortex_core::harness::{HarnessConfig, ModelSpec};
|
||||||
|
use neuron::config::HarnessSettings;
|
||||||
|
use neuron::harness::HarnessRegistry;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_candle_qwen3_load_unload_lifecycle() {
|
||||||
|
let _ = tracing_subscriber::fmt()
|
||||||
|
.with_test_writer()
|
||||||
|
.with_env_filter("info,neuron=debug")
|
||||||
|
.try_init();
|
||||||
|
|
||||||
|
let model_id = std::env::var("NEURON_TEST_MODEL_ID")
|
||||||
|
.unwrap_or_else(|_| "Qwen/Qwen3-0.6B-GGUF".to_string());
|
||||||
|
let quant = std::env::var("NEURON_TEST_QUANT").unwrap_or_else(|_| "Q4_K_M".to_string());
|
||||||
|
|
||||||
|
let mut settings = HarnessSettings::default();
|
||||||
|
if let Ok(home) = std::env::var("HF_HOME") {
|
||||||
|
settings.candle.hf_cache = Some(PathBuf::from(home));
|
||||||
|
}
|
||||||
|
|
||||||
|
let registry = HarnessRegistry::from_configs(
|
||||||
|
&[HarnessConfig {
|
||||||
|
name: "candle".into(),
|
||||||
|
}],
|
||||||
|
"http://localhost:13131",
|
||||||
|
&settings,
|
||||||
|
);
|
||||||
|
|
||||||
|
let spec = ModelSpec {
|
||||||
|
model_id: model_id.clone(),
|
||||||
|
harness: "candle".into(),
|
||||||
|
quant: Some(quant),
|
||||||
|
tensor_parallel: None,
|
||||||
|
devices: Some(vec![0]),
|
||||||
|
};
|
||||||
|
|
||||||
|
registry
|
||||||
|
.load_model(&spec)
|
||||||
|
.await
|
||||||
|
.expect("load_model should succeed");
|
||||||
|
|
||||||
|
let models = registry
|
||||||
|
.list_all_models()
|
||||||
|
.await
|
||||||
|
.expect("list_all_models");
|
||||||
|
assert_eq!(models.len(), 1, "expected exactly one loaded model");
|
||||||
|
assert_eq!(models[0].id, model_id);
|
||||||
|
assert_eq!(models[0].harness, "candle");
|
||||||
|
assert_eq!(models[0].status, "loaded");
|
||||||
|
|
||||||
|
let url = registry.inference_endpoint(&model_id).await;
|
||||||
|
assert_eq!(url, Some("http://localhost:13131".into()));
|
||||||
|
|
||||||
|
// Re-loading the same model should be rejected.
|
||||||
|
let again = registry.load_model(&spec).await;
|
||||||
|
assert!(again.is_err(), "second load should error");
|
||||||
|
|
||||||
|
registry
|
||||||
|
.unload_model(&model_id)
|
||||||
|
.await
|
||||||
|
.expect("unload_model should succeed");
|
||||||
|
|
||||||
|
let models = registry.list_all_models().await.expect("list_all_models");
|
||||||
|
assert!(models.is_empty(), "registry should be empty after unload");
|
||||||
|
|
||||||
|
// Unloading a model that isn't loaded should error.
|
||||||
|
let err = registry.unload_model(&model_id).await;
|
||||||
|
assert!(err.is_err(), "unload of missing model should error");
|
||||||
|
}
|
||||||
@@ -8,9 +8,17 @@
|
|||||||
port = 13131
|
port = 13131
|
||||||
|
|
||||||
# -- Harnesses ---------------------------------------------------------------
|
# -- Harnesses ---------------------------------------------------------------
|
||||||
# Each [[harnesses]] entry declares an inference engine. Currently only
|
# Each [[harnesses]] entry enables an inference engine. Currently only
|
||||||
# "candle" is supported — it runs in-process and uses huggingface/candle
|
# "candle" is supported — it runs in-process and uses huggingface/candle
|
||||||
# for inference on local CUDA devices.
|
# for inference on local CUDA devices (or CPU when CUDA is unavailable).
|
||||||
|
|
||||||
[[harnesses]]
|
[[harnesses]]
|
||||||
name = "candle"
|
name = "candle"
|
||||||
|
|
||||||
|
# -- Candle harness settings -------------------------------------------------
|
||||||
|
# Optional tuning for the candle harness.
|
||||||
|
|
||||||
|
[harness.candle]
|
||||||
|
# HuggingFace cache directory for model weights. When unset, hf-hub's
|
||||||
|
# default (~/.cache/huggingface/hub) is used.
|
||||||
|
# hf_cache = "/var/lib/neuron/hf-cache"
|
||||||
|
|||||||
Reference in New Issue
Block a user