fix(neuron/tp): NcclError {e:?} + cudarc 0.19 deprecation cleanup
All checks were successful
CI / Format (push) Successful in 38s
build-prerelease / Resolve version stamps (push) Successful in 40s
CI / Clippy (push) Successful in 2m15s
build-prerelease / Build neuron-blackwell (push) Successful in 3m35s
CI / Test (push) Successful in 5m0s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m51s
build-prerelease / Package cortex RPM (push) Successful in 1m27s
build-prerelease / Build neuron-ampere (push) Successful in 4m55s
build-prerelease / Build neuron-ada (push) Successful in 4m57s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m50s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 2m50s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m37s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m2s
All checks were successful
CI / Format (push) Successful in 38s
build-prerelease / Resolve version stamps (push) Successful in 40s
CI / Clippy (push) Successful in 2m15s
build-prerelease / Build neuron-blackwell (push) Successful in 3m35s
CI / Test (push) Successful in 5m0s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m51s
build-prerelease / Package cortex RPM (push) Successful in 1m27s
build-prerelease / Build neuron-ampere (push) Successful in 4m55s
build-prerelease / Build neuron-ada (push) Successful in 4m57s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m50s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 2m50s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m37s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m2s
Two build errors that appear only with the cuda feature enabled, so only the CI runner catches them:
1. cudarc::nccl::NcclError doesn't impl Display in 0.19.x, so the
`format!("...: {e}")` map_err calls fail to compile when the cuda
feature actually wires them up. Switch every NcclError-typed `{e}`
in nccl_state.rs to `{e:?}`, which still surfaces the variant and ncclResult
code in the same diagnostic shape, just via Debug instead of Display.
2. cudarc::CudaStream::memcpy_stod / memcpy_dtov are deprecated in
0.19.7 in favour of clone_htod / clone_dtoh. The replacements
take/return the same types, so the swap is mechanical.
The dev box can't compile with `--features cuda` (no nvcc), so these issues only
surface in the build-prerelease CUDA matrix jobs.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -118,7 +118,9 @@ mod cuda_impl {
|
|||||||
/// the leader to mint the shared communicator id which is then
|
/// the leader to mint the shared communicator id which is then
|
||||||
/// broadcast to every worker via the RPC `Init` message.
|
/// broadcast to every worker via the RPC `Init` message.
|
||||||
pub fn generate_comm_id_hex() -> Result<String, String> {
|
pub fn generate_comm_id_hex() -> Result<String, String> {
|
||||||
let id = Id::new().map_err(|e| format!("Id::new(): {e}"))?;
|
// NcclError lacks a Display impl in cudarc 0.19.x — surface
|
||||||
|
// via Debug throughout this module.
|
||||||
|
let id = Id::new().map_err(|e| format!("Id::new(): {e:?}"))?;
|
||||||
let bytes_u8: [u8; NCCL_ID_BYTES] = std::array::from_fn(|i| id.internal()[i] as u8);
|
let bytes_u8: [u8; NCCL_ID_BYTES] = std::array::from_fn(|i| id.internal()[i] as u8);
|
||||||
Ok(encode_hex(&bytes_u8))
|
Ok(encode_hex(&bytes_u8))
|
||||||
}
|
}
|
||||||
@@ -169,7 +171,7 @@ mod cuda_impl {
|
|||||||
let comm = Comm::from_rank(stream, cfg.rank as usize, cfg.world_size as usize, id)
|
let comm = Comm::from_rank(stream, cfg.rank as usize, cfg.world_size as usize, id)
|
||||||
.map_err(|e| {
|
.map_err(|e| {
|
||||||
format!(
|
format!(
|
||||||
"Comm::from_rank(rank={}, world={}) failed: {e}",
|
"Comm::from_rank(rank={}, world={}) failed: {e:?}",
|
||||||
cfg.rank, cfg.world_size
|
cfg.rank, cfg.world_size
|
||||||
)
|
)
|
||||||
})?;
|
})?;
|
||||||
@@ -182,15 +184,18 @@ mod cuda_impl {
|
|||||||
fn try_sanity_check(comm: &Comm) -> Result<u32, String> {
|
fn try_sanity_check(comm: &Comm) -> Result<u32, String> {
|
||||||
let stream = comm.stream().clone();
|
let stream = comm.stream().clone();
|
||||||
let input = stream
|
let input = stream
|
||||||
.memcpy_stod(&[1u32])
|
.clone_htod(&[1u32])
|
||||||
.map_err(|e| format!("htod sentinel: {e}"))?;
|
.map_err(|e| format!("htod sentinel: {e}"))?;
|
||||||
let mut output = stream
|
let mut output = stream
|
||||||
.alloc_zeros::<u32>(1)
|
.alloc_zeros::<u32>(1)
|
||||||
.map_err(|e| format!("alloc output: {e}"))?;
|
.map_err(|e| format!("alloc output: {e}"))?;
|
||||||
|
// cudarc::nccl::NcclError doesn't impl Display in 0.19.x —
|
||||||
|
// surface via Debug so we still see the variant + ncclResult
|
||||||
|
// code instead of a generic "{e}" failure.
|
||||||
comm.all_reduce(&input, &mut output, &ReduceOp::Sum)
|
comm.all_reduce(&input, &mut output, &ReduceOp::Sum)
|
||||||
.map_err(|e| format!("all_reduce: {e}"))?;
|
.map_err(|e| format!("all_reduce: {e:?}"))?;
|
||||||
let result = stream
|
let result = stream
|
||||||
.memcpy_dtov(&output)
|
.clone_dtoh(&output)
|
||||||
.map_err(|e| format!("dtoh result: {e}"))?;
|
.map_err(|e| format!("dtoh result: {e}"))?;
|
||||||
Ok(result[0])
|
Ok(result[0])
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user