feat(neuron): wire candle harness load/unload via GGUF

Stage 2 of the candle-native pivot. Fleshes out CandleHarness with a
LoadedModel registry keyed by model_id, hf-hub-backed GGUF download,
and Qwen3 quantized weight construction via candle-transformers'
quantized_qwen3 module. unload_model drops the entry; Drop on the
candle ModelWeights frees device memory.

Device selection prefers CUDA (gated behind the new `cuda` feature),
falling back to CPU when CUDA is unavailable so default builds work
on non-GPU hosts. The candle CUDA toolchain isn't pulled in unless
`--features cuda` is passed, keeping CI green on CPU runners.

Config gains a [harness.candle] block with an optional hf_cache path.
HarnessRegistry::from_configs now takes HarnessSettings so per-harness
config flows through.

A gated tests/candle_lifecycle.rs exercises real load → list → unload
→ list-empty when run with `--features cuda-integration` against a
host with HF network access. The default-feature test in tests/api.rs
covers the wrong-harness rejection path without needing the network.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-18 16:02:49 +03:00
parent 3cccc2c56b
commit 5c2bd1a1da
9 changed files with 1934 additions and 47 deletions

1587
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -12,6 +12,18 @@ path = "src/lib.rs"
name = "neuron" name = "neuron"
path = "src/main.rs" path = "src/main.rs"
[features]
default = []
# Enables CUDA acceleration in candle. Without this feature, candle
# compiles for CPU only and the candle harness always runs on the CPU.
cuda = [
"candle-core/cuda",
"candle-nn/cuda",
"candle-transformers/cuda",
]
# Gates the integration tests that need a real model download and a
# CUDA build (currently tests/candle_lifecycle.rs). Implies `cuda`.
cuda-integration = ["cuda"]
[dependencies] [dependencies]
cortex-core.workspace = true cortex-core.workspace = true
tokio.workspace = true tokio.workspace = true
@@ -27,6 +39,15 @@ clap.workspace = true
figment.workspace = true figment.workspace = true
toml.workspace = true toml.workspace = true
# candle for in-process inference. CUDA support is gated behind the
# crate's `cuda` feature (default off) so the workspace builds on
# non-CUDA hosts and CI runners.
candle-core = "0.10.2"
candle-nn = "0.10.2"
candle-transformers = "0.10.2"
tokenizers = { version = "0.22", default-features = false, features = ["onig"] }
hf-hub = { version = "0.4", features = ["tokio"] }
[dev-dependencies] [dev-dependencies]
tokio = { workspace = true, features = ["test-util"] } tokio = { workspace = true, features = ["test-util"] }
reqwest.workspace = true reqwest.workspace = true

View File

@@ -6,7 +6,7 @@ use figment::{
providers::{Env, Format, Toml}, providers::{Env, Format, Toml},
}; };
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::path::Path; use std::path::{Path, PathBuf};
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NeuronConfig { pub struct NeuronConfig {
@@ -14,6 +14,25 @@ pub struct NeuronConfig {
pub port: u16, pub port: u16,
#[serde(default)] #[serde(default)]
pub harnesses: Vec<HarnessConfig>, pub harnesses: Vec<HarnessConfig>,
/// Per-harness configuration. Currently only `candle` is recognised.
#[serde(default)]
pub harness: HarnessSettings,
}
/// Settings for individual harness implementations. Each harness owns
/// its own sub-table so users only configure the harnesses they enable.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct HarnessSettings {
/// Settings for the candle harness, read from the `candle` sub-table
/// (`[harness.candle]` in the config file). Falls back to
/// `CandleHarnessConfig::default()` when the table is absent.
#[serde(default)]
pub candle: CandleHarnessConfig,
}
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CandleHarnessConfig {
/// HuggingFace cache directory for model weights.
/// When unset, defers to hf-hub's default (~/.cache/huggingface).
#[serde(default)]
pub hf_cache: Option<PathBuf>,
} }
fn default_port() -> u16 { fn default_port() -> u16 {
@@ -35,6 +54,7 @@ impl Default for NeuronConfig {
Self { Self {
port: 13131, port: 13131,
harnesses: vec![], harnesses: vec![],
harness: HarnessSettings::default(),
} }
} }
} }

View File

@@ -1,24 +1,121 @@
//! Candle harness — in-process inference using huggingface/candle. //! Candle harness — in-process inference using huggingface/candle.
//! //!
//! This is the sole `Harness` implementation. Unlike the previous //! This is the sole `Harness` implementation. Inference runs inside
//! mistralrs/llamacpp harnesses, candle inference runs inside the neuron //! the neuron process; there is no external subprocess. Stage 2 wires
//! process itself — no external subprocess, no systemd indirection. //! up GGUF (currently Qwen3 only) model load/unload via
//! //! `candle-transformers::models::quantized_qwen3`. Stage 3 adds the
//! Stage 1 ships this as an inert skeleton; Stage 2 wires up actual //! inference endpoint.
//! model load/unload via `candle-transformers`.
use anyhow::Result; use anyhow::{Context, Result};
use async_trait::async_trait; use async_trait::async_trait;
use candle_core::Device;
use candle_core::quantized::gguf_file;
use candle_transformers::models::quantized_qwen3::ModelWeights as QuantizedQwen3Weights;
use cortex_core::harness::{Harness, HarnessHealth, ModelInfo, ModelSpec}; use cortex_core::harness::{Harness, HarnessHealth, ModelInfo, ModelSpec};
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use tokenizers::Tokenizer;
use tokio::sync::{Mutex, RwLock};
/// In-process candle harness. Owns the loaded model registry.
pub struct CandleHarness { pub struct CandleHarness {
/// URL where this neuron serves inference (its own bind address). models: Arc<RwLock<HashMap<String, Arc<LoadedModel>>>>,
hf_cache: Option<PathBuf>,
bind_url: String, bind_url: String,
} }
/// A loaded model with its tokenizer, device placement, and architecture-
/// specific weights. The `arch` field is mutexed because future inference
/// calls take `&mut self` on the underlying ModelWeights (KV cache state).
pub struct LoadedModel {
/// Model identifier copied from the load request; also the key under
/// which this entry is stored in the harness registry.
pub model_id: String,
/// Architecture-specific weights behind an async mutex.
pub arch: Mutex<ModelArch>,
/// Tokenizer loaded from the repo's `tokenizer.json`.
pub tokenizer: Tokenizer,
/// Device the weights were built on, as chosen by `pick_device`.
pub device: Device,
/// Quant hint from the load request, if any. At load time it is matched
/// case-insensitively as a substring of the GGUF filename.
pub quant: Option<String>,
/// Device indices from the load request (`[0]` when none were given).
/// Only the first entry is used to choose `device`.
pub devices: Vec<u32>,
}
/// Architecture-specific weights. Stage 2 supports only Qwen3 quantized;
/// Stage 8 broadens this to additional families and non-quantized variants.
pub enum ModelArch {
/// Quantized Qwen3 weights built from a GGUF file by
/// `QuantizedQwen3Weights::from_gguf`.
Qwen3Quantized(QuantizedQwen3Weights),
}
impl CandleHarness { impl CandleHarness {
pub fn new(bind_url: String) -> Self { pub fn new(bind_url: String, hf_cache: Option<PathBuf>) -> Self {
Self { bind_url } Self {
models: Arc::new(RwLock::new(HashMap::new())),
hf_cache,
bind_url,
}
}
/// Choose the candle `Device` on which a model's weights are placed.
///
/// Only the first entry of `devices` is consulted (index 0 when the
/// slice is empty). When the crate is built with the `cuda` feature the
/// function tries that CUDA ordinal and, if initialisation fails, logs a
/// warning and uses the CPU instead. Without the feature it always
/// returns the CPU device. As written, no path returns `Err`.
fn pick_device(devices: &[u32]) -> Result<Device> {
    // The leading underscore keeps non-CUDA builds free of an
    // unused-variable warning: there the ordinal is computed but never read.
    let _ordinal = devices.first().map_or(0, |&d| d as usize);

    #[cfg(feature = "cuda")]
    {
        let cuda_err = match Device::new_cuda(_ordinal) {
            Ok(gpu) => return Ok(gpu),
            Err(e) => e,
        };
        tracing::warn!(
            device = _ordinal,
            error = %cuda_err,
            "CUDA device unavailable, falling back to CPU"
        );
    }

    Ok(Device::Cpu)
}
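/// Resolve a model spec to local GGUF and tokenizer file paths via
/// hf-hub. The repo file listing is requested on every call; the GGUF
/// and tokenizer files are downloaded on first use and served from the
/// hf-hub cache afterwards.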
async fn resolve_files(&self, spec: &ModelSpec) -> Result<(PathBuf, PathBuf)> {
let mut builder = hf_hub::api::tokio::ApiBuilder::new();
if let Some(cache) = &self.hf_cache {
builder = builder.with_cache_dir(cache.clone());
}
let api = builder.build().context("build hf-hub API")?;
let repo = api.model(spec.model_id.clone());
let info = repo
.info()
.await
.with_context(|| format!("fetch HF repo info for {}", spec.model_id))?;
let quant = spec.quant.as_deref().unwrap_or("");
let quant_lc = quant.to_lowercase();
let gguf_filename = info
.siblings
.iter()
.map(|s| s.rfilename.as_str())
.filter(|name| name.to_lowercase().ends_with(".gguf"))
.find(|name| quant_lc.is_empty() || name.to_lowercase().contains(&quant_lc))
.ok_or_else(|| {
anyhow::anyhow!(
"no GGUF file matching quant {:?} in repo {}",
spec.quant,
spec.model_id
)
})?
.to_string();
tracing::info!(
model = %spec.model_id,
file = %gguf_filename,
"resolving GGUF (may be cached)"
);
let gguf_path = repo
.get(&gguf_filename)
.await
.with_context(|| format!("fetch GGUF {gguf_filename}"))?;
let tokenizer_path = repo
.get("tokenizer.json")
.await
.context("fetch tokenizer.json")?;
Ok((gguf_path, tokenizer_path))
} }
} }
@@ -37,18 +134,98 @@ impl Harness for CandleHarness {
} }
async fn list_models(&self) -> Result<Vec<ModelInfo>> { async fn list_models(&self) -> Result<Vec<ModelInfo>> {
Ok(Vec::new()) let models = self.models.read().await;
Ok(models
.values()
.map(|m| ModelInfo {
id: m.model_id.clone(),
harness: "candle".into(),
status: "loaded".into(),
devices: m.devices.clone(),
vram_used_mb: None,
})
.collect())
} }
async fn load_model(&self, _spec: &ModelSpec) -> Result<()> { async fn load_model(&self, spec: &ModelSpec) -> Result<()> {
anyhow::bail!("candle harness load_model not implemented yet (Stage 2)") if spec.harness != "candle" {
anyhow::bail!("expected harness=candle, got harness={}", spec.harness);
}
{
let models = self.models.read().await;
if models.contains_key(&spec.model_id) {
anyhow::bail!("model '{}' already loaded", spec.model_id);
}
}
let devices = spec.devices.clone().unwrap_or_else(|| vec![0]);
let device = Self::pick_device(&devices)?;
let (gguf_path, tokenizer_path) = self.resolve_files(spec).await?;
let tokenizer = Tokenizer::from_file(&tokenizer_path)
.map_err(|e| anyhow::anyhow!("load tokenizer: {e}"))?;
// File I/O, GGUF parsing and tensor materialisation block the calling
// thread, so run them on a blocking task to avoid stalling the runtime.
let device_for_load = device.clone();
let gguf_path_for_load = gguf_path.clone();
let model_id_for_log = spec.model_id.clone();
let arch = tokio::task::spawn_blocking(move || -> Result<ModelArch> {
tracing::info!(model = %model_id_for_log, path = ?gguf_path_for_load, "loading GGUF");
let mut file = std::fs::File::open(&gguf_path_for_load).context("open GGUF file")?;
let content = gguf_file::Content::read(&mut file)
.map_err(|e| anyhow::anyhow!("parse GGUF: {e}"))?;
let architecture = content
.metadata
.get("general.architecture")
.and_then(|v| v.to_string().ok().cloned())
.unwrap_or_default();
tracing::info!(architecture = %architecture, "GGUF architecture");
match architecture.as_str() {
"qwen3" => {
let weights =
QuantizedQwen3Weights::from_gguf(content, &mut file, &device_for_load)
.map_err(|e| anyhow::anyhow!("from_gguf qwen3: {e}"))?;
Ok(ModelArch::Qwen3Quantized(weights))
}
other => anyhow::bail!(
"unsupported GGUF architecture '{other}'; Stage 2 only supports qwen3"
),
}
})
.await
.context("blocking load task panicked")??;
let loaded = Arc::new(LoadedModel {
model_id: spec.model_id.clone(),
arch: Mutex::new(arch),
tokenizer,
device,
quant: spec.quant.clone(),
devices,
});
let mut models = self.models.write().await;
models.insert(spec.model_id.clone(), loaded);
tracing::info!(model = %spec.model_id, "model loaded");
Ok(())
} }
async fn unload_model(&self, _model_id: &str) -> Result<()> { async fn unload_model(&self, model_id: &str) -> Result<()> {
anyhow::bail!("candle harness unload_model not implemented yet (Stage 2)") let mut models = self.models.write().await;
if models.remove(model_id).is_none() {
anyhow::bail!("model '{model_id}' not loaded");
}
tracing::info!(model = %model_id, "model unloaded");
Ok(())
} }
async fn inference_endpoint(&self, _model_id: &str) -> Option<String> { async fn inference_endpoint(&self, model_id: &str) -> Option<String> {
Some(self.bind_url.clone()) let models = self.models.read().await;
models.contains_key(model_id).then(|| self.bind_url.clone())
} }
} }

View File

@@ -84,12 +84,19 @@ impl HarnessRegistry {
/// `bind_url` is the URL where this neuron serves inference (its own /// `bind_url` is the URL where this neuron serves inference (its own
/// listen address). In-process harnesses (currently the only kind) /// listen address). In-process harnesses (currently the only kind)
/// return this URL from `inference_endpoint`. /// return this URL from `inference_endpoint`.
pub fn from_configs(configs: &[HarnessConfig], bind_url: &str) -> Self { pub fn from_configs(
configs: &[HarnessConfig],
bind_url: &str,
settings: &crate::config::HarnessSettings,
) -> Self {
let mut registry = Self::new(); let mut registry = Self::new();
for config in configs { for config in configs {
match config.name.as_str() { match config.name.as_str() {
"candle" => { "candle" => {
registry.register(Box::new(candle::CandleHarness::new(bind_url.to_string()))); registry.register(Box::new(candle::CandleHarness::new(
bind_url.to_string(),
settings.candle.hf_cache.clone(),
)));
} }
other => { other => {
tracing::warn!(harness = other, "unknown harness type, skipping"); tracing::warn!(harness = other, "unknown harness type, skipping");

View File

@@ -51,7 +51,7 @@ async fn main() -> Result<()> {
// Build harness registry from config. In-process harnesses (candle) // Build harness registry from config. In-process harnesses (candle)
// need to know neuron's own bind URL so they can return it from // need to know neuron's own bind URL so they can return it from
// inference_endpoint. // inference_endpoint.
let registry = HarnessRegistry::from_configs(&cfg.harnesses, &bind_url); let registry = HarnessRegistry::from_configs(&cfg.harnesses, &bind_url, &cfg.harness);
discovery_result.harnesses = registry.names(); discovery_result.harnesses = registry.names();
let health_cache = Arc::new(health::HealthCache::new()); let health_cache = Arc::new(health::HealthCache::new());

View File

@@ -135,17 +135,21 @@ async fn test_models_empty_registry() {
assert!(body.as_array().unwrap().is_empty()); assert!(body.as_array().unwrap().is_empty());
} }
/// Verify the candle harness registers and the load endpoint returns a /// Verify the candle harness registers, list is empty by default, and a
/// "not implemented" error in Stage 1 (Stage 2 wires up actual loading). /// load attempt for an obviously-bogus model id returns a 4xx error
/// without crashing the daemon. Real load/unload exercising actual GGUF
/// download is covered by `tests/candle_lifecycle.rs` (cuda-integration).
#[tokio::test] #[tokio::test]
async fn test_candle_harness_registers_but_load_unimplemented() { async fn test_candle_harness_registers_and_rejects_bogus_model() {
use cortex_core::harness::HarnessConfig; use cortex_core::harness::HarnessConfig;
use neuron::config::HarnessSettings;
let registry = HarnessRegistry::from_configs( let registry = HarnessRegistry::from_configs(
&[HarnessConfig { &[HarnessConfig {
name: "candle".into(), name: "candle".into(),
}], }],
"http://localhost:13131", "http://localhost:13131",
&HarnessSettings::default(),
); );
let health_cache = Arc::new(HealthCache::new()); let health_cache = Arc::new(HealthCache::new());
@@ -165,7 +169,6 @@ async fn test_candle_harness_registers_but_load_unimplemented() {
let client = reqwest::Client::new(); let client = reqwest::Client::new();
// GET /models — candle harness has no models loaded yet.
let resp = client let resp = client
.get(format!("{neuron_url}/models")) .get(format!("{neuron_url}/models"))
.send() .send()
@@ -175,12 +178,22 @@ async fn test_candle_harness_registers_but_load_unimplemented() {
let models: Vec<serde_json::Value> = resp.json().await.unwrap(); let models: Vec<serde_json::Value> = resp.json().await.unwrap();
assert!(models.is_empty()); assert!(models.is_empty());
// POST /models/load — Stage 1 skeleton returns an error. // Sending a wrong-harness spec should be rejected synchronously
// without touching the network or the model registry.
let resp = client let resp = client
.post(format!("{neuron_url}/models/load")) .post(format!("{neuron_url}/models/load"))
.json(&json!({"model_id": "some-model", "harness": "candle"})) .json(&json!({"model_id": "definitely/not-real", "harness": "not-candle"}))
.send() .send()
.await .await
.unwrap(); .unwrap();
assert_eq!(resp.status(), 400); assert_eq!(resp.status(), 400);
// Registry still empty.
let resp = client
.get(format!("{neuron_url}/models"))
.send()
.await
.unwrap();
let models: Vec<serde_json::Value> = resp.json().await.unwrap();
assert!(models.is_empty());
} }

View File

@@ -0,0 +1,90 @@
//! Real model load/unload lifecycle through the candle harness.
//!
//! Gated behind the `cuda-integration` feature because it downloads a
//! real (small) GGUF from HuggingFace and materialises tensors on the
//! selected device. `cuda-integration` implies `cuda`, so the build
//! needs the CUDA toolchain. Run on a host with network access; the
//! harness uses CUDA device 0 when it initialises and otherwise falls
//! back to CPU, which then needs enough RAM to hold the model.
//!
//! Usage:
//! cargo test -p neuron --features cuda-integration --test candle_lifecycle
//!
//! Optional environment variables:
//! NEURON_TEST_MODEL_ID — HuggingFace repo to load (default: a small
//! public Qwen3 GGUF repo).
//! NEURON_TEST_QUANT — quant substring matched against GGUF
//! filenames (default: "Q4_K_M").
//! HF_HOME — HuggingFace cache directory.
#![cfg(feature = "cuda-integration")]
use cortex_core::harness::{HarnessConfig, ModelSpec};
use neuron::config::HarnessSettings;
use neuron::harness::HarnessRegistry;
use std::path::PathBuf;
/// End-to-end lifecycle check against a real model: load it, confirm it
/// is listed and has an inference endpoint, confirm a duplicate load is
/// rejected, unload it, confirm the registry is empty, and confirm a
/// second unload is rejected.
#[tokio::test]
async fn test_candle_qwen3_load_unload_lifecycle() {
// The `try_init` result is deliberately discarded — presumably because a
// subscriber may already be installed in this process.
let _ = tracing_subscriber::fmt()
.with_test_writer()
.with_env_filter("info,neuron=debug")
.try_init();
// Model repo and quant are overridable from the environment; the
// defaults select the Q4_K_M file of Qwen/Qwen3-0.6B-GGUF.
let model_id = std::env::var("NEURON_TEST_MODEL_ID")
.unwrap_or_else(|_| "Qwen/Qwen3-0.6B-GGUF".to_string());
let quant = std::env::var("NEURON_TEST_QUANT").unwrap_or_else(|_| "Q4_K_M".to_string());
// Point the harness's hf_cache at HF_HOME when that variable is set.
let mut settings = HarnessSettings::default();
if let Ok(home) = std::env::var("HF_HOME") {
settings.candle.hf_cache = Some(PathBuf::from(home));
}
let registry = HarnessRegistry::from_configs(
&[HarnessConfig {
name: "candle".into(),
}],
"http://localhost:13131",
&settings,
);
// Device 0 is requested; the harness may still fall back to CPU if
// CUDA initialisation fails.
let spec = ModelSpec {
model_id: model_id.clone(),
harness: "candle".into(),
quant: Some(quant),
tensor_parallel: None,
devices: Some(vec![0]),
};
registry
.load_model(&spec)
.await
.expect("load_model should succeed");
let models = registry
.list_all_models()
.await
.expect("list_all_models");
assert_eq!(models.len(), 1, "expected exactly one loaded model");
assert_eq!(models[0].id, model_id);
assert_eq!(models[0].harness, "candle");
assert_eq!(models[0].status, "loaded");
// A loaded model's endpoint is the bind URL handed to `from_configs`.
let url = registry.inference_endpoint(&model_id).await;
assert_eq!(url, Some("http://localhost:13131".into()));
// Re-loading the same model should be rejected.
let again = registry.load_model(&spec).await;
assert!(again.is_err(), "second load should error");
registry
.unload_model(&model_id)
.await
.expect("unload_model should succeed");
let models = registry.list_all_models().await.expect("list_all_models");
assert!(models.is_empty(), "registry should be empty after unload");
// Unloading a model that isn't loaded should error.
let err = registry.unload_model(&model_id).await;
assert!(err.is_err(), "unload of missing model should error");
}

View File

@@ -8,9 +8,17 @@
port = 13131 port = 13131
# -- Harnesses --------------------------------------------------------------- # -- Harnesses ---------------------------------------------------------------
# Each [[harnesses]] entry declares an inference engine. Currently only # Each [[harnesses]] entry enables an inference engine. Currently only
# "candle" is supported — it runs in-process and uses huggingface/candle # "candle" is supported — it runs in-process and uses huggingface/candle
# for inference on local CUDA devices. # for inference on local CUDA devices (or CPU when CUDA is unavailable).
[[harnesses]] [[harnesses]]
name = "candle" name = "candle"
# -- Candle harness settings -------------------------------------------------
# Optional tuning for the candle harness.
[harness.candle]
# HuggingFace cache directory for model weights. When unset, hf-hub's
# default (~/.cache/huggingface) is used.
# hf_cache = "/var/lib/neuron/hf-cache"