fix(tp): add half dep + drop double-wrapped .w() on CudaDevice::alloc

Two follow-up cuda-only fixes surfaced by `cargo build --features cuda` inside the cuda-13.0 runner container: 1. `half::{bf16, f16}` was an undeclared dep. Added `half = "2.5"` (matching candle-core's pinned major) under the cuda feature flag. 2. `dev.alloc::<T>(n)` already returns `candle_core::Result` (it calls `.w()` internally on the cudarc error). Calling `.w()?` on top of that needs `From<candle_core::Error> for CudaError`, which doesn't exist — collapse to `?`. Removed the now-unused `cuda_backend::WrapErr` import. Verified by `cargo build -p neuron --features cuda` and `cargo clippy -p neuron --all-targets --features cuda -- -D warnings` inside `git.lair.cafe/gongfoo/runner-cuda-13.0` with the local glibc/CUDA-13.0 math_functions.h noexcept patch. CPU clippy/tests stay green. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 19:11:59 +03:00
parent 12549c9aed
commit 96d8755245
4 changed files with 10 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@
 .vscode/
 cortex.toml
 doc/plan/*
+/target-cuda/
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2116,6 +2116,7 @@ dependencies = [
 "cudarc 0.19.7",
 "figment",
 "futures",
+ "half",
 "hf-hub",
 "reqwest",
 "serde",
--- a/crates/neuron/Cargo.toml
+++ b/crates/neuron/Cargo.toml
@@ -24,6 +24,7 @@ cuda = [
    "candle-nn/cuda",
    "candle-transformers/cuda",
    "dep:cudarc",
+    "dep:half",
 ]
 # Use cuDNN for convolution / attention kernels. Requires CUDA.
 cudnn = [
@@ -68,6 +69,10 @@ candle-transformers = "0.10.2"
 # TP worker pool can call cudarc::nccl::{Comm, Id} directly. Gated on
 # the `cuda` feature; same toolchain requirement as candle's CUDA path.
 cudarc = { version = "0.19", optional = true, default-features = false, features = ["nccl", "cuda-version-from-build-system"] }
+# Used by the AllReduce CustomOp1 to type-dispatch on bf16/f16 candle
+# storages. Matches candle-core's pinned major version to avoid double-
+# compiling the `half` crate at conflicting versions.
+half = { version = "2.5", optional = true }
 tokenizers = { version = "0.22", default-features = false, features = ["onig"] }
 hf-hub = { version = "0.4", features = ["tokio"] }
--- a/crates/neuron/src/harness/tp/all_reduce.rs
+++ b/crates/neuron/src/harness/tp/all_reduce.rs
@@ -21,7 +21,6 @@
 #![cfg(feature = "cuda")]
 use candle_core::backend::BackendStorage;
-use candle_core::cuda_backend::WrapErr;
 use candle_core::{CpuStorage, CudaStorage, CustomOp1, DType, Layout, Result, Shape};
 use cudarc::nccl::{Comm, ReduceOp};
 use half::{bf16, f16};
@@ -87,7 +86,7 @@ impl CustomOp1 for AllReduce {
            DType::BF16 => {
                let src = s.as_cuda_slice::<bf16>()?;
                require_contiguous(src, l)?;
-                let mut dst = unsafe { dev.alloc::<bf16>(elem_count) }.w()?;
+                let mut dst = unsafe { dev.alloc::<bf16>(elem_count) }?;
                self.comm
                    .all_reduce(src, &mut dst, &ReduceOp::Sum)
                    .map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce bf16: {e:?}")))?;
@@ -96,7 +95,7 @@ impl CustomOp1 for AllReduce {
            DType::F16 => {
                let src = s.as_cuda_slice::<f16>()?;
                require_contiguous(src, l)?;
-                let mut dst = unsafe { dev.alloc::<f16>(elem_count) }.w()?;
+                let mut dst = unsafe { dev.alloc::<f16>(elem_count) }?;
                self.comm
                    .all_reduce(src, &mut dst, &ReduceOp::Sum)
                    .map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce f16: {e:?}")))?;
@@ -105,7 +104,7 @@ impl CustomOp1 for AllReduce {
            DType::F32 => {
                let src = s.as_cuda_slice::<f32>()?;
                require_contiguous(src, l)?;
-                let mut dst = unsafe { dev.alloc::<f32>(elem_count) }.w()?;
+                let mut dst = unsafe { dev.alloc::<f32>(elem_count) }?;
                self.comm
                    .all_reduce(src, &mut dst, &ReduceOp::Sum)
                    .map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce f32: {e:?}")))?;