feat(neuron): wire candle harness load/unload via GGUF

Stage 2 of the candle-native pivot. Fleshes out CandleHarness with a LoadedModel registry keyed by model_id, hf-hub-backed GGUF download, and Qwen3 quantized weight construction via candle-transformers' quantized_qwen3 module. unload_model drops the entry; Drop on the candle ModelWeights frees device memory. Device selection prefers CUDA (gated behind the new `cuda` feature), falling back to CPU when CUDA is unavailable so default builds work on non-GPU hosts. The candle CUDA toolchain isn't pulled in unless `--features cuda` is passed, keeping CI green on CPU runners. Config gains a [harness.candle] block with an optional hf_cache path. HarnessRegistry::from_configs now takes HarnessSettings so per-harness config flows through. A gated tests/candle_lifecycle.rs exercises real load → list → unload → list-empty when run with `--features cuda-integration` against a host with HF network access. The default-feature test in tests/api.rs covers the wrong-harness rejection path without needing the network. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 16:02:49 +03:00
parent 3cccc2c56b
commit 5c2bd1a1da
9 changed files with 1934 additions and 47 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/crates/neuron/Cargo.toml
+++ b/crates/neuron/Cargo.toml
@@ -12,6 +12,18 @@ path = "src/lib.rs"
 name = "neuron"
 path = "src/main.rs"
 [features]
 default = []
 # Enables CUDA acceleration in candle. Without this feature, candle
 # compiles for CPU only and the harness never attempts CUDA device
 # initialisation; every model is placed on the CPU device.
 cuda = [
    "candle-core/cuda",
    "candle-nn/cuda",
    "candle-transformers/cuda",
 ]
 # Gates the integration tests that download and load a real model
 # (tests/candle_lifecycle.rs). Implies `cuda`.
 cuda-integration = ["cuda"]
 [dependencies]
 cortex-core.workspace = true
 tokio.workspace = true
@@ -27,6 +39,15 @@ clap.workspace = true
 figment.workspace = true
 toml.workspace = true
 # candle for in-process inference. CUDA support is gated behind the
 # crate's `cuda` feature (default off) so the workspace builds on
 # non-CUDA hosts and CI runners.
 candle-core = "0.10.2"
 candle-nn = "0.10.2"
 candle-transformers = "0.10.2"
 tokenizers = { version = "0.22", default-features = false, features = ["onig"] }
 hf-hub = { version = "0.4", features = ["tokio"] }
 [dev-dependencies]
 tokio = { workspace = true, features = ["test-util"] }
 reqwest.workspace = true
--- a/crates/neuron/src/config.rs
+++ b/crates/neuron/src/config.rs
@@ -6,7 +6,7 @@ use figment::{
    providers::{Env, Format, Toml},
 };
 use serde::{Deserialize, Serialize};
-use std::path::Path;
+use std::path::{Path, PathBuf};
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct NeuronConfig {
@@ -14,6 +14,25 @@ pub struct NeuronConfig {
    pub port: u16,
    #[serde(default)]
    pub harnesses: Vec<HarnessConfig>,
    /// Per-harness configuration. Currently only `candle` is recognised.
    #[serde(default)]
    pub harness: HarnessSettings,
 }
 /// Settings for individual harness implementations. Each harness owns
 /// its own sub-table so users only configure the harnesses they enable.
 ///
 /// The type derives `Default` and its field is `#[serde(default)]`, so a
 /// configuration that omits the sub-table still deserialises.
 #[derive(Debug, Clone, Default, Serialize, Deserialize)]
 pub struct HarnessSettings {
    /// Settings for the candle harness (the `[harness.candle]` table in
    /// the example configuration).
    #[serde(default)]
    pub candle: CandleHarnessConfig,
 }
 /// Configuration for the candle harness. All fields are optional.
 #[derive(Debug, Clone, Default, Serialize, Deserialize)]
 pub struct CandleHarnessConfig {
    /// HuggingFace cache directory for model weights.
    /// When unset, defers to hf-hub's default (~/.cache/huggingface).
    #[serde(default)]
    pub hf_cache: Option<PathBuf>,
 }
 fn default_port() -> u16 {
@@ -35,6 +54,7 @@ impl Default for NeuronConfig {
        Self {
            port: 13131,
            harnesses: vec![],
            harness: HarnessSettings::default(),
        }
    }
 }
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -1,24 +1,121 @@
 //! Candle harness — in-process inference using huggingface/candle.
 //!
-//! This is the sole `Harness` implementation. Unlike the previous
+//! This is the sole `Harness` implementation. Inference runs inside
-//! mistralrs/llamacpp harnesses, candle inference runs inside the neuron
+//! the neuron process; there is no external subprocess. Stage 2 wires
-//! process itself — no external subprocess, no systemd indirection.
+//! up GGUF (currently Qwen3 only) model load/unload via
-//!
+//! `candle-transformers::models::quantized_qwen3`. Stage 3 adds the
-//! Stage 1 ships this as an inert skeleton; Stage 2 wires up actual
+//! inference endpoint.
 //! Weights are constructed in-process by `candle-transformers`.
-use anyhow::Result;
+use anyhow::{Context, Result};
 use async_trait::async_trait;
 use candle_core::Device;
 use candle_core::quantized::gguf_file;
 use candle_transformers::models::quantized_qwen3::ModelWeights as QuantizedQwen3Weights;
 use cortex_core::harness::{Harness, HarnessHealth, ModelInfo, ModelSpec};
 use std::collections::HashMap;
 use std::path::PathBuf;
 use std::sync::Arc;
 use tokenizers::Tokenizer;
 use tokio::sync::{Mutex, RwLock};
 /// In-process candle harness. Owns the loaded model registry.
 pub struct CandleHarness {
-    /// URL where this neuron serves inference (its own bind address).
    /// Loaded models keyed by model id, shared behind an async `RwLock`.
+    models: Arc<RwLock<HashMap<String, Arc<LoadedModel>>>>,
    /// Optional HuggingFace cache directory handed to hf-hub; `None`
    /// leaves hf-hub's own default in place.
    hf_cache: Option<PathBuf>,
    /// URL returned from `inference_endpoint` for models that are loaded.
    bind_url: String,
 }
 /// A loaded model with its tokenizer, device placement, and architecture-
 /// specific weights. The `arch` field is mutexed because future inference
 /// calls take `&mut self` on the underlying ModelWeights (KV cache state).
 pub struct LoadedModel {
    /// Identifier the model was loaded under; also its registry key.
    pub model_id: String,
    /// Architecture-specific weights, guarded by an async mutex.
    pub arch: Mutex<ModelArch>,
    /// Tokenizer read from the repo's `tokenizer.json`.
    pub tokenizer: Tokenizer,
    /// Device the weights were materialised on (CUDA or the CPU fallback).
    pub device: Device,
    /// Quantisation string copied from the load spec, if one was given.
    pub quant: Option<String>,
    /// Device indices from the load spec (`[0]` when the spec had none).
    /// These record the request: they are stored unchanged even when
    /// device selection fell back to CPU.
    pub devices: Vec<u32>,
 }
 /// Architecture-specific weights. Stage 2 supports only Qwen3 quantized;
 /// Stage 8 broadens this to additional families and non-quantized variants.
 pub enum ModelArch {
    /// Quantized Qwen3 weights built from a GGUF file.
    Qwen3Quantized(QuantizedQwen3Weights),
 }
 impl CandleHarness {
-    pub fn new(bind_url: String) -> Self {
    /// Create a harness with an empty model registry.
    ///
    /// `bind_url` is the URL later returned by `inference_endpoint` for
    /// loaded models. `hf_cache`, when `Some`, overrides the cache
    /// directory used for hf-hub downloads.
+    pub fn new(bind_url: String, hf_cache: Option<PathBuf>) -> Self {
-        Self { bind_url }
+        Self {
            models: Arc::new(RwLock::new(HashMap::new())),
            hf_cache,
            bind_url,
        }
    }
    /// Choose the candle `Device` on which a model's tensors will live.
    ///
    /// Only the first entry of `devices` is consulted (ordinal 0 when the
    /// slice is empty). With the `cuda` feature enabled, that ordinal is
    /// tried as a CUDA device; a failure there is logged as a warning
    /// instead of being returned. In every other case the CPU device is
    /// used, so this function currently never yields `Err`.
    fn pick_device(devices: &[u32]) -> Result<Device> {
        // Underscore-prefixed because it is unused when `cuda` is off.
        let _ordinal = devices.first().map_or(0, |&d| d as usize);
        #[cfg(feature = "cuda")]
        {
            match Device::new_cuda(_ordinal) {
                Err(e) => tracing::warn!(
                    device = _ordinal,
                    error = %e,
                    "CUDA device unavailable, falling back to CPU"
                ),
                Ok(cuda) => return Ok(cuda),
            }
        }
        Ok(Device::Cpu)
    }
    /// Locate the GGUF weights and `tokenizer.json` for `spec` via hf-hub.
    ///
    /// The repo's file listing is fetched first. The first `.gguf` entry
    /// whose name contains `spec.quant` (compared case-insensitively) is
    /// selected; when no quant is given, the first `.gguf` entry wins. Both
    /// files are then fetched through hf-hub, which downloads on first use
    /// and serves later calls from its cache. `hf_cache`, when set,
    /// replaces hf-hub's default cache directory.
    ///
    /// Returns `(gguf_path, tokenizer_path)`.
    ///
    /// # Errors
    /// Fails if the hf-hub client cannot be built, the repo info cannot be
    /// fetched, no GGUF file matches the requested quant, or either file
    /// cannot be fetched.
    async fn resolve_files(&self, spec: &ModelSpec) -> Result<(PathBuf, PathBuf)> {
        let builder = match &self.hf_cache {
            Some(cache) => hf_hub::api::tokio::ApiBuilder::new().with_cache_dir(cache.clone()),
            None => hf_hub::api::tokio::ApiBuilder::new(),
        };
        let api = builder.build().context("build hf-hub API")?;
        let repo = api.model(spec.model_id.clone());
        let info = repo
            .info()
            .await
            .with_context(|| format!("fetch HF repo info for {}", spec.model_id))?;

        // An empty needle means "no quant requested": any GGUF file matches.
        let wanted = spec.quant.as_deref().unwrap_or("").to_lowercase();
        let first_match = info
            .siblings
            .iter()
            .map(|sibling| sibling.rfilename.as_str())
            .find(|name| {
                let lower = name.to_lowercase();
                lower.ends_with(".gguf") && (wanted.is_empty() || lower.contains(&wanted))
            });
        let gguf_filename = match first_match {
            Some(name) => name.to_owned(),
            None => anyhow::bail!(
                "no GGUF file matching quant {:?} in repo {}",
                spec.quant,
                spec.model_id
            ),
        };

        tracing::info!(
            model = %spec.model_id,
            file = %gguf_filename,
            "resolving GGUF (may be cached)"
        );

        // Weights first, tokenizer second — same fetch order as before.
        let gguf_path = repo
            .get(&gguf_filename)
            .await
            .with_context(|| format!("fetch GGUF {gguf_filename}"))?;
        let tokenizer_path = repo
            .get("tokenizer.json")
            .await
            .context("fetch tokenizer.json")?;
        Ok((gguf_path, tokenizer_path))
    }
 }
@@ -37,18 +134,98 @@ impl Harness for CandleHarness {
    }
    /// List every model currently held in the registry.
    ///
    /// Each entry is reported with harness `"candle"` and status
    /// `"loaded"`; `vram_used_mb` is always `None`. Entries come straight
    /// from `HashMap::values`, so their order is unspecified.
    async fn list_models(&self) -> Result<Vec<ModelInfo>> {
-        Ok(Vec::new())
+        let models = self.models.read().await;
        Ok(models
            .values()
            .map(|m| ModelInfo {
                id: m.model_id.clone(),
                harness: "candle".into(),
                status: "loaded".into(),
                devices: m.devices.clone(),
                vram_used_mb: None,
            })
            .collect())
    }
-    async fn load_model(&self, _spec: &ModelSpec) -> Result<()> {
    /// Load the model described by `spec` and add it to the registry.
    ///
    /// Steps: validate the harness name, reject ids that are already
    /// loaded, pick a device, resolve the GGUF and tokenizer files via
    /// hf-hub, parse the GGUF on a blocking task, then register the result.
    ///
    /// The registry lock is not held during the download or the GGUF
    /// parse, so two concurrent loads of the same id can both pass the
    /// early check. The id is therefore checked again under the write lock
    /// just before insertion: the slower load fails with "already loaded"
    /// instead of silently replacing the entry registered by the faster one.
    ///
    /// # Errors
    /// Fails if `spec.harness` is not `"candle"`, the id is already loaded,
    /// file resolution or tokenizer loading fails, the GGUF cannot be
    /// opened or parsed, its architecture is not `qwen3`, or the blocking
    /// task does not complete.
+    async fn load_model(&self, spec: &ModelSpec) -> Result<()> {
-        anyhow::bail!("candle harness load_model not implemented yet (Stage 2)")
+        if spec.harness != "candle" {
            anyhow::bail!("expected harness=candle, got harness={}", spec.harness);
        }
        // Fast path: reject duplicates before any download work. The read
        // guard is dropped at the end of this scope.
        {
            let models = self.models.read().await;
            if models.contains_key(&spec.model_id) {
                anyhow::bail!("model '{}' already loaded", spec.model_id);
            }
        }
        let devices = spec.devices.clone().unwrap_or_else(|| vec![0]);
        let device = Self::pick_device(&devices)?;
        let (gguf_path, tokenizer_path) = self.resolve_files(spec).await?;
        let tokenizer = Tokenizer::from_file(&tokenizer_path)
            .map_err(|e| anyhow::anyhow!("load tokenizer: {e}"))?;
        // File I/O + GGUF parsing + tensor materialisation are CPU-bound,
        // so run them on a blocking task to avoid stalling the runtime.
        let device_for_load = device.clone();
        let gguf_path_for_load = gguf_path.clone();
        let model_id_for_log = spec.model_id.clone();
        let arch = tokio::task::spawn_blocking(move || -> Result<ModelArch> {
            tracing::info!(model = %model_id_for_log, path = ?gguf_path_for_load, "loading GGUF");
            let mut file = std::fs::File::open(&gguf_path_for_load).context("open GGUF file")?;
            let content = gguf_file::Content::read(&mut file)
                .map_err(|e| anyhow::anyhow!("parse GGUF: {e}"))?;
            // Missing or non-string metadata yields "" and is rejected by
            // the catch-all arm below.
            let architecture = content
                .metadata
                .get("general.architecture")
                .and_then(|v| v.to_string().ok().cloned())
                .unwrap_or_default();
            tracing::info!(architecture = %architecture, "GGUF architecture");
            match architecture.as_str() {
                "qwen3" => {
                    let weights =
                        QuantizedQwen3Weights::from_gguf(content, &mut file, &device_for_load)
                            .map_err(|e| anyhow::anyhow!("from_gguf qwen3: {e}"))?;
                    Ok(ModelArch::Qwen3Quantized(weights))
                }
                other => anyhow::bail!(
                    "unsupported GGUF architecture '{other}'; Stage 2 only supports qwen3"
                ),
            }
        })
        .await
        .context("blocking load task panicked")??;
        let loaded = Arc::new(LoadedModel {
            model_id: spec.model_id.clone(),
            arch: Mutex::new(arch),
            tokenizer,
            device,
            quant: spec.quant.clone(),
            devices,
        });
        let mut models = self.models.write().await;
        // Re-check under the write lock: another load of the same id may
        // have finished while this one was downloading or parsing. Bailing
        // here drops `loaded`, releasing the weights just built.
        if models.contains_key(&spec.model_id) {
            anyhow::bail!("model '{}' already loaded", spec.model_id);
        }
        models.insert(spec.model_id.clone(), loaded);
        tracing::info!(model = %spec.model_id, "model loaded");
        Ok(())
    }
-    async fn unload_model(&self, _model_id: &str) -> Result<()> {
    /// Remove `model_id` from the registry.
    ///
    /// Returns an error if no model is registered under that id. Removal
    /// drops the registry's `Arc<LoadedModel>` for the entry.
+    async fn unload_model(&self, model_id: &str) -> Result<()> {
-        anyhow::bail!("candle harness unload_model not implemented yet (Stage 2)")
+        let mut models = self.models.write().await;
        if models.remove(model_id).is_none() {
            anyhow::bail!("model '{model_id}' not loaded");
        }
        tracing::info!(model = %model_id, "model unloaded");
        Ok(())
    }
-    async fn inference_endpoint(&self, _model_id: &str) -> Option<String> {
    /// Return this neuron's bind URL when `model_id` is currently loaded,
    /// and `None` otherwise.
+    async fn inference_endpoint(&self, model_id: &str) -> Option<String> {
-        Some(self.bind_url.clone())
+        let models = self.models.read().await;
        models.contains_key(model_id).then(|| self.bind_url.clone())
    }
 }
--- a/crates/neuron/src/harness/mod.rs
+++ b/crates/neuron/src/harness/mod.rs
@@ -84,12 +84,19 @@ impl HarnessRegistry {
    /// `bind_url` is the URL where this neuron serves inference (its own
    /// listen address). In-process harnesses (currently the only kind)
    /// return this URL from `inference_endpoint`.
-    pub fn from_configs(configs: &[HarnessConfig], bind_url: &str) -> Self {
+    pub fn from_configs(
        configs: &[HarnessConfig],
        bind_url: &str,
        settings: &crate::config::HarnessSettings,
    ) -> Self {
        let mut registry = Self::new();
        for config in configs {
            match config.name.as_str() {
                "candle" => {
-                    registry.register(Box::new(candle::CandleHarness::new(bind_url.to_string())));
+                    registry.register(Box::new(candle::CandleHarness::new(
                        bind_url.to_string(),
                        settings.candle.hf_cache.clone(),
                    )));
                }
                other => {
                    tracing::warn!(harness = other, "unknown harness type, skipping");
--- a/crates/neuron/src/main.rs
+++ b/crates/neuron/src/main.rs
@@ -51,7 +51,7 @@ async fn main() -> Result<()> {
    // Build harness registry from config. In-process harnesses (candle)
    // need to know neuron's own bind URL so they can return it from
    // inference_endpoint.
-    let registry = HarnessRegistry::from_configs(&cfg.harnesses, &bind_url);
+    let registry = HarnessRegistry::from_configs(&cfg.harnesses, &bind_url, &cfg.harness);
    discovery_result.harnesses = registry.names();
    let health_cache = Arc::new(health::HealthCache::new());
--- a/crates/neuron/tests/api.rs
+++ b/crates/neuron/tests/api.rs
@@ -135,17 +135,21 @@ async fn test_models_empty_registry() {
    assert!(body.as_array().unwrap().is_empty());
 }
-/// Verify the candle harness registers and the load endpoint returns a
+/// Verify the candle harness registers, list is empty by default, and a
-/// "not implemented" error in Stage 1 (Stage 2 wires up actual loading).
+/// load attempt naming the wrong harness is rejected with HTTP 400
 /// without crashing the daemon. Real load/unload exercising actual GGUF
 /// download is covered by `tests/candle_lifecycle.rs` (cuda-integration).
 #[tokio::test]
-async fn test_candle_harness_registers_but_load_unimplemented() {
+async fn test_candle_harness_registers_and_rejects_bogus_model() {
    use cortex_core::harness::HarnessConfig;
    use neuron::config::HarnessSettings;
    let registry = HarnessRegistry::from_configs(
        &[HarnessConfig {
            name: "candle".into(),
        }],
        "http://localhost:13131",
        &HarnessSettings::default(),
    );
    let health_cache = Arc::new(HealthCache::new());
@@ -165,7 +169,6 @@ async fn test_candle_harness_registers_but_load_unimplemented() {
    let client = reqwest::Client::new();
    // GET /models — candle harness has no models loaded yet.
    let resp = client
        .get(format!("{neuron_url}/models"))
        .send()
@@ -175,12 +178,22 @@ async fn test_candle_harness_registers_but_load_unimplemented() {
    let models: Vec<serde_json::Value> = resp.json().await.unwrap();
    assert!(models.is_empty());
-    // POST /models/load — Stage 1 skeleton returns an error.
+    // Sending a wrong-harness spec should be rejected synchronously
    // without touching the network or the model registry.
    let resp = client
        .post(format!("{neuron_url}/models/load"))
-        .json(&json!({"model_id": "some-model", "harness": "candle"}))
+        .json(&json!({"model_id": "definitely/not-real", "harness": "not-candle"}))
        .send()
        .await
        .unwrap();
    assert_eq!(resp.status(), 400);
    // Registry still empty.
    let resp = client
        .get(format!("{neuron_url}/models"))
        .send()
        .await
        .unwrap();
    let models: Vec<serde_json::Value> = resp.json().await.unwrap();
    assert!(models.is_empty());
 }
--- a/crates/neuron/tests/candle_lifecycle.rs
+++ b/crates/neuron/tests/candle_lifecycle.rs
@@ -0,0 +1,90 @@
 //! Real model load/unload lifecycle through the candle harness.
 //!
 //! Gated behind the `cuda-integration` feature because it downloads a
 //! real (small) GGUF from HuggingFace and materialises tensors on the
 //! configured device. `cuda-integration` implies the `cuda` feature, so
 //! the test binary is always built with CUDA support. Run on a host with
 //! network access; if CUDA device 0 fails to initialise the harness
 //! falls back to CPU, which then needs enough RAM to hold the model.
 //!
 //! Usage:
 //!   cargo test -p neuron --features cuda-integration --test candle_lifecycle
 //!
 //! Optional environment variables:
 //!   NEURON_TEST_MODEL_ID — HuggingFace repo to load (default: a small
 //!     public Qwen3 GGUF repo).
 //!   NEURON_TEST_QUANT    — quant substring matched against GGUF
 //!     filenames (default: "Q4_K_M").
 //!   HF_HOME              — HuggingFace cache directory.
 #![cfg(feature = "cuda-integration")]
 use cortex_core::harness::{HarnessConfig, ModelSpec};
 use neuron::config::HarnessSettings;
 use neuron::harness::HarnessRegistry;
 use std::path::PathBuf;
 /// End-to-end lifecycle against a real GGUF: load, list, resolve the
 /// endpoint, reject a duplicate load, unload, then reject a second unload.
 #[tokio::test]
 async fn test_candle_qwen3_load_unload_lifecycle() {
    // The init result is deliberately discarded, so a failed init does
    // not fail the test.
    let _ = tracing_subscriber::fmt()
        .with_test_writer()
        .with_env_filter("info,neuron=debug")
        .try_init();

    // Model selection, overridable through the environment.
    let model_id = match std::env::var("NEURON_TEST_MODEL_ID") {
        Ok(id) => id,
        Err(_) => String::from("Qwen/Qwen3-0.6B-GGUF"),
    };
    let quant = match std::env::var("NEURON_TEST_QUANT") {
        Ok(q) => q,
        Err(_) => String::from("Q4_K_M"),
    };

    // HF_HOME, when set, becomes the harness's cache directory.
    let mut settings = HarnessSettings::default();
    settings.candle.hf_cache = std::env::var("HF_HOME").ok().map(PathBuf::from);

    let bind_url = "http://localhost:13131";
    let harness_configs = [HarnessConfig {
        name: "candle".into(),
    }];
    let registry = HarnessRegistry::from_configs(&harness_configs, bind_url, &settings);

    let spec = ModelSpec {
        harness: "candle".into(),
        model_id: model_id.clone(),
        quant: Some(quant),
        devices: Some(vec![0]),
        tensor_parallel: None,
    };

    // Load, then confirm the registry reports exactly this model.
    registry
        .load_model(&spec)
        .await
        .expect("load_model should succeed");

    let listed = registry
        .list_all_models()
        .await
        .expect("list_all_models");
    assert_eq!(listed.len(), 1, "expected exactly one loaded model");
    let only = &listed[0];
    assert_eq!(only.id, model_id);
    assert_eq!(only.harness, "candle");
    assert_eq!(only.status, "loaded");

    let endpoint = registry.inference_endpoint(&model_id).await;
    assert_eq!(endpoint, Some(bind_url.into()));

    // Re-loading the same model should be rejected.
    let second_load = registry.load_model(&spec).await;
    assert!(second_load.is_err(), "second load should error");

    // Unload, then confirm the registry is empty again.
    registry
        .unload_model(&model_id)
        .await
        .expect("unload_model should succeed");

    let remaining = registry.list_all_models().await.expect("list_all_models");
    assert!(remaining.is_empty(), "registry should be empty after unload");

    // Unloading a model that isn't loaded should error.
    let missing_unload = registry.unload_model(&model_id).await;
    assert!(missing_unload.is_err(), "unload of missing model should error");
 }
--- a/neuron.example.toml
+++ b/neuron.example.toml
@@ -8,9 +8,17 @@
 port = 13131
 # -- Harnesses ---------------------------------------------------------------
-# Each [[harnesses]] entry declares an inference engine. Currently only
+# Each [[harnesses]] entry enables an inference engine. Currently only
 # "candle" is supported — it runs in-process and uses huggingface/candle
-# for inference on local CUDA devices.
+# for inference on local CUDA devices (or CPU when CUDA is unavailable).
 [[harnesses]]
 name = "candle"
 # -- Candle harness settings -------------------------------------------------
 # Optional tuning for the candle harness.
 [harness.candle]
 # HuggingFace cache directory for model weights. When unset, hf-hub's
 # default (~/.cache/huggingface) is used.
 # hf_cache = "/var/lib/neuron/hf-cache"