refactor(neuron): phase 4 — model loads move onto the device worker

Final structural slice of the per-device CUDA context-ownership refactor. The four remaining spawn_blocking sites that did CUDA work on the leader are gone:

- Single-GPU GGUF load (`load_arch_gguf` spawn_blocking) → `Job::LoadGguf` dispatched on the worker.
- Single-GPU dense load (`load_arch_dense` spawn_blocking) → `Job::LoadDense` on the worker.
- TP shard load (`WorkerPool::load_dense_shard` spawn_blocking) → `Job::TpLoadShard`. The dispatch handler reads `state.nccl.comm()` directly — no cross-thread `Arc<Comm>` transfer, no `SendComm` wrapper for this path.

The Phase 2 / Phase 3 bridges that moved freshly-built models across the channel boundary (`Job::TransferIn`, `Job::TransferInTp`, `Job::CloneLeaderComm`) are removed. Models are now constructed on the worker thread directly; the slab gets populated by `insert_arch` / the inline `tp_models.insert` in dispatch handlers.

What this phase preserves:

- CPU loads still use `tokio::task::spawn_blocking` against `Arc<Mutex<ModelArch>>`. There's no CUDA context to own on CPU and channel overhead would only add latency. Four `spawn_blocking` references remain in `candle.rs` (load_arch_gguf, load_arch_dense, chat_completion, chat_completion_stream) and all are deliberate CPU-only fallbacks.
- Public API unchanged. `Harness::load_model`, `chat_completion`, and the HTTP routes all keep identical signatures.

What this phase removes:

- `SendComm` wrapper is no longer used in the load path (the Phase 3 bridge that justified it). It remains in `nccl_state.rs` for the Phase 1–3 era and any future cross-thread Comm move; consider deleting in a follow-up.
- `Job::TransferIn`, `Job::TransferInTp`, `Job::CloneLeaderComm` and their handle convenience methods deleted.
- The leader_device parameter on `load_dense_shard` is now `_` — unused since the worker has its own bound device. Removing the arg outright is a public-API change; keeping the underscore prefix preserves the signature and signals deadness without churn.
Helper relocation:

- `LlamaDense::from_parts` is a new pub(crate) constructor so the worker-thread loader can build a `LlamaDense` without going through the original `load_arch_dense` async function.
- `check_dense_config_supported` is bumped to `pub(crate)` for the same reason.

Sweep verified: `grep -rn spawn_blocking crates/neuron/src/harness/` returns only CPU-fallback hits in `candle.rs` + doc-comment references to the old design. All four leader-side CUDA `spawn_blocking` sites are gone. fmt + clippy clean; 37 lib tests + all integration tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 10:24:38 +03:00
parent 76ab24d98c
commit b4f3576d82
5 changed files with 475 additions and 225 deletions
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -307,6 +307,26 @@ pub struct LlamaDense {
 }

 impl LlamaDense {
+    /// Constructor used by the dispatch-side loader. Keeps the field
+    /// names private while letting the worker thread build a
+    /// `LlamaDense` from already-loaded weights without going through
+    /// async candle code.
+    pub(crate) fn from_parts(
+        model: llama_dense::Llama,
+        cache: llama_dense::Cache,
+        config: llama_dense::Config,
+        dtype: DType,
+        device: Device,
+    ) -> Self {
+        Self {
+            model,
+            cache,
+            config,
+            dtype,
+            device,
+        }
+    }
+
    pub fn forward(&mut self, input: &Tensor, offset: usize) -> Result<Tensor> {
        Ok(self.model.forward(input, offset, &mut self.cache)?)
    }
@@ -348,7 +368,7 @@ const DENSE_SUPPORTED_MODEL_TYPES: &[&str] = &["llama", "qwen3", "qwen3_5", "qwe
 /// The result message names the model_type we saw, the supported set,
 /// and points at the files an operator (or future contributor) needs
 /// to touch to grow the supported set.
-fn check_dense_config_supported(config_json: &str, model_id: &str) -> Result<()> {
+pub(crate) fn check_dense_config_supported(config_json: &str, model_id: &str) -> Result<()> {
    let v: serde_json::Value = serde_json::from_str(config_json)
        .with_context(|| format!("parse config.json for '{model_id}' as JSON"))?;
    let model_type = v.get("model_type").and_then(|x| x.as_str()).unwrap_or("");
@@ -1547,42 +1567,47 @@ impl Harness for CandleHarness {
        let devices = spec.devices.clone().unwrap_or_else(|| vec![0]);
        let device = Self::pick_device(&devices)?;

-        // Dispatch by source format: GGUF (pre-quantized, single-GPU
-        // only path) vs safetensors dense (bf16/fp16; the path that
-        // grows TP support). `spec.quant` is the signal — Some means
-        // the operator picked a quantized GGUF; None means dense.
-        let (tokenizer_path, arch) = if spec.quant.is_some() {
-            self.load_arch_gguf(spec, &device).await?
-        } else {
-            self.load_arch_dense(spec, &device).await?
-        };
-
-        let tokenizer = Tokenizer::from_file(&tokenizer_path)
-            .map_err(|e| anyhow::anyhow!("load tokenizer: {e}"))?;
-
-        // Worker thread for the chosen device. CPU loads (CUDA
-        // unavailable / not requested) skip the worker — there's no
-        // context to own. For CUDA loads, the arch is transferred
-        // into the worker's slab now so the inference path can
-        // reference it via the returned `ArchHandle`. The explicit
-        // type annotation lets the no-cuda build resolve `None` to
-        // the right `Option<Arc<DeviceWorkerHandle>>` type.
+        // Phase 4: load directly on the worker thread for CUDA;
+        // legacy spawn_blocking + Arc<Mutex<>> only for CPU. Resolve
+        // hf-hub paths up front (always async), then either dispatch
+        // a load Job (CUDA) or call the legacy local loader (CPU).
        let worker: Option<Arc<super::device_worker::DeviceWorkerHandle>> = match &device {
            #[cfg(feature = "cuda")]
            Device::Cuda(_) => Some(self.ensure_device_worker(devices[0]).await?),
            _ => None,
        };
-        let (arch_local, arch_handle) = match &worker {
-            Some(w) => {
+
+        let (tokenizer_path, arch_local, arch_handle) = if let Some(w) = &worker {
+            // CUDA path: resolve, then load in the worker.
+            if spec.quant.is_some() {
+                let (gguf_path, tokenizer_path) = self.resolve_files(spec).await?;
                let handle = w
-                    .transfer_in(Box::new(arch))
+                    .load_gguf(gguf_path, spec.model_id.clone())
                    .await
-                    .map_err(|e| anyhow::anyhow!("transfer arch into device worker: {e}"))?;
-                (None, Some(handle))
+                    .map_err(|e| anyhow::anyhow!("worker load_gguf: {e}"))?;
+                (tokenizer_path, None, Some(handle))
+            } else {
+                let (config_path, tokenizer_path, safetensors_paths) =
+                    self.resolve_dense_files(spec).await?;
+                let handle = w
+                    .load_dense(config_path, safetensors_paths, spec.model_id.clone())
+                    .await
+                    .map_err(|e| anyhow::anyhow!("worker load_dense: {e}"))?;
+                (tokenizer_path, None, Some(handle))
            }
-            None => (Some(Arc::new(Mutex::new(arch))), None),
+        } else {
+            // CPU path: legacy spawn_blocking + Arc<Mutex<ModelArch>>.
+            let (tokenizer_path, arch) = if spec.quant.is_some() {
+                self.load_arch_gguf(spec, &device).await?
+            } else {
+                self.load_arch_dense(spec, &device).await?
+            };
+            (tokenizer_path, Some(Arc::new(Mutex::new(arch))), None)
        };

+        let tokenizer = Tokenizer::from_file(&tokenizer_path)
+            .map_err(|e| anyhow::anyhow!("load tokenizer: {e}"))?;
+
        let loaded = Arc::new(LoadedModel {
            model_id: spec.model_id.clone(),
            arch: arch_local,