fix(neuron/tp): NcclError {e:?} + cudarc 0.19 deprecation cleanup
All checks were successful
CI / Format (push) Successful in 38s
build-prerelease / Resolve version stamps (push) Successful in 40s
CI / Clippy (push) Successful in 2m15s
build-prerelease / Build neuron-blackwell (push) Successful in 3m35s
CI / Test (push) Successful in 5m0s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m51s
build-prerelease / Package cortex RPM (push) Successful in 1m27s
build-prerelease / Build neuron-ampere (push) Successful in 4m55s
build-prerelease / Build neuron-ada (push) Successful in 4m57s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m50s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 2m50s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m37s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m2s

Two build errors that appear only with the cuda feature enabled, and
that only the CI runner catches:

1. cudarc::nccl::NcclError doesn't impl Display in 0.19.x, so the
   `format!("...: {e}")` map_err calls fail to compile once the cuda
   feature actually wires them up. Switch every NcclError-typed `{e}`
   in nccl_state.rs to `{e:?}`, which still surfaces the variant and
   ncclResult code in the same diagnostic shape, just via Debug
   instead of Display.
2. cudarc::CudaStream::memcpy_stod / memcpy_dtov are deprecated in
   0.19.7 in favour of clone_htod / clone_dtoh, respectively. The
   replacements take and return the same types, so the swap is
   mechanical.

The dev box can't compile with --features cuda (no nvcc), so these
errors surface only in the build-prerelease CUDA matrix jobs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-19 17:24:13 +03:00
parent 93421f48e2
commit 8e882c0757

View File

@@ -118,7 +118,9 @@ mod cuda_impl {
/// the leader to mint the shared communicator id which is then /// the leader to mint the shared communicator id which is then
/// broadcast to every worker via the RPC `Init` message. /// broadcast to every worker via the RPC `Init` message.
pub fn generate_comm_id_hex() -> Result<String, String> { pub fn generate_comm_id_hex() -> Result<String, String> {
let id = Id::new().map_err(|e| format!("Id::new(): {e}"))?; // NcclError lacks a Display impl in cudarc 0.19.x — surface
// via Debug throughout this module.
let id = Id::new().map_err(|e| format!("Id::new(): {e:?}"))?;
let bytes_u8: [u8; NCCL_ID_BYTES] = std::array::from_fn(|i| id.internal()[i] as u8); let bytes_u8: [u8; NCCL_ID_BYTES] = std::array::from_fn(|i| id.internal()[i] as u8);
Ok(encode_hex(&bytes_u8)) Ok(encode_hex(&bytes_u8))
} }
@@ -169,7 +171,7 @@ mod cuda_impl {
let comm = Comm::from_rank(stream, cfg.rank as usize, cfg.world_size as usize, id) let comm = Comm::from_rank(stream, cfg.rank as usize, cfg.world_size as usize, id)
.map_err(|e| { .map_err(|e| {
format!( format!(
"Comm::from_rank(rank={}, world={}) failed: {e}", "Comm::from_rank(rank={}, world={}) failed: {e:?}",
cfg.rank, cfg.world_size cfg.rank, cfg.world_size
) )
})?; })?;
@@ -182,15 +184,18 @@ mod cuda_impl {
fn try_sanity_check(comm: &Comm) -> Result<u32, String> { fn try_sanity_check(comm: &Comm) -> Result<u32, String> {
let stream = comm.stream().clone(); let stream = comm.stream().clone();
let input = stream let input = stream
.memcpy_stod(&[1u32]) .clone_htod(&[1u32])
.map_err(|e| format!("htod sentinel: {e}"))?; .map_err(|e| format!("htod sentinel: {e}"))?;
let mut output = stream let mut output = stream
.alloc_zeros::<u32>(1) .alloc_zeros::<u32>(1)
.map_err(|e| format!("alloc output: {e}"))?; .map_err(|e| format!("alloc output: {e}"))?;
// cudarc::nccl::NcclError doesn't impl Display in 0.19.x —
// surface via Debug so we still see the variant + ncclResult
// code instead of a generic "{e}" failure.
comm.all_reduce(&input, &mut output, &ReduceOp::Sum) comm.all_reduce(&input, &mut output, &ReduceOp::Sum)
.map_err(|e| format!("all_reduce: {e}"))?; .map_err(|e| format!("all_reduce: {e:?}"))?;
let result = stream let result = stream
.memcpy_dtov(&output) .clone_dtoh(&output)
.map_err(|e| format!("dtoh result: {e}"))?; .map_err(|e| format!("dtoh result: {e}"))?;
Ok(result[0]) Ok(result[0])
} }