fix(tp): add half dep + drop double-wrapped .w() on CudaDevice::alloc
All checks were successful
build-prerelease / Resolve version stamps (push) Successful in 35s
CI / Format (push) Successful in 37s
CI / Clippy (push) Successful in 2m17s
CI / Test (push) Successful in 4m50s
build-prerelease / Build neuron-blackwell (push) Successful in 3m36s
CI / Build cortex SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m32s
build-prerelease / Package cortex RPM (push) Successful in 1m25s
build-prerelease / Build neuron-ampere (push) Successful in 5m13s
build-prerelease / Build neuron-ada (push) Successful in 4m42s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m52s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m0s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m39s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m12s

Two follow-up cuda-only fixes for build errors surfaced by
`cargo build --features cuda` inside the cuda-13.0 runner container:

1. `half::{bf16, f16}` was imported, but the `half` crate was an
   undeclared dep. Added `half = "2.5"` (matching candle-core's pinned
   major) as an optional dep enabled by the `cuda` feature.
2. `dev.alloc::<T>(n)` already returns `candle_core::Result` (it calls
   `.w()` internally on the cudarc error). Calling `.w()?` on top of
   that needs `From<candle_core::Error> for CudaError`, which doesn't
   exist — collapse to `?`. Removed the now-unused
   `cuda_backend::WrapErr` import.

Verified by `cargo build -p neuron --features cuda` and
`cargo clippy -p neuron --all-targets --features cuda -- -D warnings`
inside `git.lair.cafe/gongfoo/runner-cuda-13.0` with the local
glibc/CUDA-13.0 math_functions.h noexcept patch. CPU clippy/tests stay
green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-19 19:11:59 +03:00
parent 12549c9aed
commit 96d8755245
4 changed files with 10 additions and 4 deletions

1
.gitignore vendored
View File

@@ -5,3 +5,4 @@
.vscode/ .vscode/
cortex.toml cortex.toml
doc/plan/* doc/plan/*
/target-cuda/

1
Cargo.lock generated
View File

@@ -2116,6 +2116,7 @@ dependencies = [
"cudarc 0.19.7", "cudarc 0.19.7",
"figment", "figment",
"futures", "futures",
"half",
"hf-hub", "hf-hub",
"reqwest", "reqwest",
"serde", "serde",

View File

@@ -24,6 +24,7 @@ cuda = [
"candle-nn/cuda", "candle-nn/cuda",
"candle-transformers/cuda", "candle-transformers/cuda",
"dep:cudarc", "dep:cudarc",
"dep:half",
] ]
# Use cuDNN for convolution / attention kernels. Requires CUDA. # Use cuDNN for convolution / attention kernels. Requires CUDA.
cudnn = [ cudnn = [
@@ -68,6 +69,10 @@ candle-transformers = "0.10.2"
# TP worker pool can call cudarc::nccl::{Comm, Id} directly. Gated on # TP worker pool can call cudarc::nccl::{Comm, Id} directly. Gated on
# the `cuda` feature; same toolchain requirement as candle's CUDA path. # the `cuda` feature; same toolchain requirement as candle's CUDA path.
cudarc = { version = "0.19", optional = true, default-features = false, features = ["nccl", "cuda-version-from-build-system"] } cudarc = { version = "0.19", optional = true, default-features = false, features = ["nccl", "cuda-version-from-build-system"] }
# Used by the AllReduce CustomOp1 to type-dispatch on bf16/f16 candle
# storages. Matches candle-core's pinned major version to avoid double-
# compiling the `half` crate at conflicting versions.
half = { version = "2.5", optional = true }
tokenizers = { version = "0.22", default-features = false, features = ["onig"] } tokenizers = { version = "0.22", default-features = false, features = ["onig"] }
hf-hub = { version = "0.4", features = ["tokio"] } hf-hub = { version = "0.4", features = ["tokio"] }

View File

@@ -21,7 +21,6 @@
#![cfg(feature = "cuda")] #![cfg(feature = "cuda")]
use candle_core::backend::BackendStorage; use candle_core::backend::BackendStorage;
use candle_core::cuda_backend::WrapErr;
use candle_core::{CpuStorage, CudaStorage, CustomOp1, DType, Layout, Result, Shape}; use candle_core::{CpuStorage, CudaStorage, CustomOp1, DType, Layout, Result, Shape};
use cudarc::nccl::{Comm, ReduceOp}; use cudarc::nccl::{Comm, ReduceOp};
use half::{bf16, f16}; use half::{bf16, f16};
@@ -87,7 +86,7 @@ impl CustomOp1 for AllReduce {
DType::BF16 => { DType::BF16 => {
let src = s.as_cuda_slice::<bf16>()?; let src = s.as_cuda_slice::<bf16>()?;
require_contiguous(src, l)?; require_contiguous(src, l)?;
let mut dst = unsafe { dev.alloc::<bf16>(elem_count) }.w()?; let mut dst = unsafe { dev.alloc::<bf16>(elem_count) }?;
self.comm self.comm
.all_reduce(src, &mut dst, &ReduceOp::Sum) .all_reduce(src, &mut dst, &ReduceOp::Sum)
.map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce bf16: {e:?}")))?; .map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce bf16: {e:?}")))?;
@@ -96,7 +95,7 @@ impl CustomOp1 for AllReduce {
DType::F16 => { DType::F16 => {
let src = s.as_cuda_slice::<f16>()?; let src = s.as_cuda_slice::<f16>()?;
require_contiguous(src, l)?; require_contiguous(src, l)?;
let mut dst = unsafe { dev.alloc::<f16>(elem_count) }.w()?; let mut dst = unsafe { dev.alloc::<f16>(elem_count) }?;
self.comm self.comm
.all_reduce(src, &mut dst, &ReduceOp::Sum) .all_reduce(src, &mut dst, &ReduceOp::Sum)
.map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce f16: {e:?}")))?; .map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce f16: {e:?}")))?;
@@ -105,7 +104,7 @@ impl CustomOp1 for AllReduce {
DType::F32 => { DType::F32 => {
let src = s.as_cuda_slice::<f32>()?; let src = s.as_cuda_slice::<f32>()?;
require_contiguous(src, l)?; require_contiguous(src, l)?;
let mut dst = unsafe { dev.alloc::<f32>(elem_count) }.w()?; let mut dst = unsafe { dev.alloc::<f32>(elem_count) }?;
self.comm self.comm
.all_reduce(src, &mut dst, &ReduceOp::Sum) .all_reduce(src, &mut dst, &ReduceOp::Sum)
.map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce f32: {e:?}")))?; .map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce f32: {e:?}")))?;