fix(tp): add half dep + drop double-wrapped .w() on CudaDevice::alloc
All checks were successful
build-prerelease / Resolve version stamps (push) Successful in 35s
CI / Format (push) Successful in 37s
CI / Clippy (push) Successful in 2m17s
CI / Test (push) Successful in 4m50s
build-prerelease / Build neuron-blackwell (push) Successful in 3m36s
CI / Build cortex SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m32s
build-prerelease / Package cortex RPM (push) Successful in 1m25s
build-prerelease / Build neuron-ampere (push) Successful in 5m13s
build-prerelease / Build neuron-ada (push) Successful in 4m42s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m52s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m0s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m39s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m12s
All checks were successful
build-prerelease / Resolve version stamps (push) Successful in 35s
CI / Format (push) Successful in 37s
CI / Clippy (push) Successful in 2m17s
CI / Test (push) Successful in 4m50s
build-prerelease / Build neuron-blackwell (push) Successful in 3m36s
CI / Build cortex SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m32s
build-prerelease / Package cortex RPM (push) Successful in 1m25s
build-prerelease / Build neuron-ampere (push) Successful in 5m13s
build-prerelease / Build neuron-ada (push) Successful in 4m42s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m52s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m0s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m39s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m12s
Two follow-up CUDA-only fixes for build errors surfaced by
`cargo build --features cuda` inside the cuda-13.0 runner container:
1. `half::{bf16, f16}` was imported, but `half` was never declared as a
dependency. Added `half = "2.5"` as an optional dependency (matching
candle-core's pinned major version), enabled by the `cuda` feature.
2. `dev.alloc::<T>(n)` already returns `candle_core::Result` (it calls
`.w()` internally on the cudarc error). Calling `.w()?` on top of
that would need `From<candle_core::Error> for CudaError`, which doesn't
exist, so the three call sites (bf16, f16, f32) now use a plain `?`.
Removed the now-unused `cuda_backend::WrapErr` import.
Verified by `cargo build -p neuron --features cuda` and
`cargo clippy -p neuron --all-targets --features cuda -- -D warnings`
inside `git.lair.cafe/gongfoo/runner-cuda-13.0` with the local
glibc/CUDA-13.0 math_functions.h noexcept patch. CPU clippy/tests stay
green.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -5,3 +5,4 @@
 .vscode/
 cortex.toml
 doc/plan/*
+/target-cuda/
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -2116,6 +2116,7 @@ dependencies = [
 "cudarc 0.19.7",
 "figment",
 "futures",
+"half",
 "hf-hub",
 "reqwest",
 "serde",
@@ -24,6 +24,7 @@ cuda = [
 "candle-nn/cuda",
 "candle-transformers/cuda",
 "dep:cudarc",
+"dep:half",
 ]
 # Use cuDNN for convolution / attention kernels. Requires CUDA.
 cudnn = [
@@ -68,6 +69,10 @@ candle-transformers = "0.10.2"
 # TP worker pool can call cudarc::nccl::{Comm, Id} directly. Gated on
 # the `cuda` feature; same toolchain requirement as candle's CUDA path.
 cudarc = { version = "0.19", optional = true, default-features = false, features = ["nccl", "cuda-version-from-build-system"] }
+# Used by the AllReduce CustomOp1 to type-dispatch on bf16/f16 candle
+# storages. Matches candle-core's pinned major version to avoid double-
+# compiling the `half` crate at conflicting versions.
+half = { version = "2.5", optional = true }
 tokenizers = { version = "0.22", default-features = false, features = ["onig"] }
 hf-hub = { version = "0.4", features = ["tokio"] }
 
@@ -21,7 +21,6 @@
 #![cfg(feature = "cuda")]
 
 use candle_core::backend::BackendStorage;
-use candle_core::cuda_backend::WrapErr;
 use candle_core::{CpuStorage, CudaStorage, CustomOp1, DType, Layout, Result, Shape};
 use cudarc::nccl::{Comm, ReduceOp};
 use half::{bf16, f16};
@@ -87,7 +86,7 @@ impl CustomOp1 for AllReduce {
 DType::BF16 => {
 let src = s.as_cuda_slice::<bf16>()?;
 require_contiguous(src, l)?;
-let mut dst = unsafe { dev.alloc::<bf16>(elem_count) }.w()?;
+let mut dst = unsafe { dev.alloc::<bf16>(elem_count) }?;
 self.comm
 .all_reduce(src, &mut dst, &ReduceOp::Sum)
 .map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce bf16: {e:?}")))?;
@@ -96,7 +95,7 @@ impl CustomOp1 for AllReduce {
 DType::F16 => {
 let src = s.as_cuda_slice::<f16>()?;
 require_contiguous(src, l)?;
-let mut dst = unsafe { dev.alloc::<f16>(elem_count) }.w()?;
+let mut dst = unsafe { dev.alloc::<f16>(elem_count) }?;
 self.comm
 .all_reduce(src, &mut dst, &ReduceOp::Sum)
 .map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce f16: {e:?}")))?;
@@ -105,7 +104,7 @@ impl CustomOp1 for AllReduce {
 DType::F32 => {
 let src = s.as_cuda_slice::<f32>()?;
 require_contiguous(src, l)?;
-let mut dst = unsafe { dev.alloc::<f32>(elem_count) }.w()?;
+let mut dst = unsafe { dev.alloc::<f32>(elem_count) }?;
 self.comm
 .all_reduce(src, &mut dst, &ReduceOp::Sum)
 .map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce f32: {e:?}")))?;
Reference in New Issue
Block a user