diff --git a/crates/neuron/src/harness/tp/nccl_state.rs b/crates/neuron/src/harness/tp/nccl_state.rs
index 9f57fa9..7638599 100644
--- a/crates/neuron/src/harness/tp/nccl_state.rs
+++ b/crates/neuron/src/harness/tp/nccl_state.rs
@@ -118,7 +118,9 @@ mod cuda_impl {
     /// the leader to mint the shared communicator id which is then
     /// broadcast to every worker via the RPC `Init` message.
     pub fn generate_comm_id_hex() -> Result<String, String> {
-        let id = Id::new().map_err(|e| format!("Id::new(): {e}"))?;
+        // NcclError lacks a Display impl in cudarc 0.19.x — surface
+        // via Debug throughout this module.
+        let id = Id::new().map_err(|e| format!("Id::new(): {e:?}"))?;
         let bytes_u8: [u8; NCCL_ID_BYTES] = std::array::from_fn(|i| id.internal()[i] as u8);
         Ok(encode_hex(&bytes_u8))
     }
@@ -169,7 +171,7 @@ mod cuda_impl {
         let comm = Comm::from_rank(stream, cfg.rank as usize, cfg.world_size as usize, id)
             .map_err(|e| {
                 format!(
-                    "Comm::from_rank(rank={}, world={}) failed: {e}",
+                    "Comm::from_rank(rank={}, world={}) failed: {e:?}",
                     cfg.rank, cfg.world_size
                 )
             })?;
@@ -182,15 +184,18 @@ mod cuda_impl {
     fn try_sanity_check(comm: &Comm) -> Result<u32, String> {
         let stream = comm.stream().clone();
         let input = stream
-            .memcpy_stod(&[1u32])
+            .clone_htod(&[1u32])
             .map_err(|e| format!("htod sentinel: {e}"))?;
         let mut output = stream
             .alloc_zeros::<u32>(1)
             .map_err(|e| format!("alloc output: {e}"))?;
+        // cudarc::nccl::NcclError doesn't impl Display in 0.19.x —
+        // surface via Debug so we still see the variant + ncclResult
+        // code instead of a generic "{e}" failure.
         comm.all_reduce(&input, &mut output, &ReduceOp::Sum)
-            .map_err(|e| format!("all_reduce: {e}"))?;
+            .map_err(|e| format!("all_reduce: {e:?}"))?;
         let result = stream
-            .memcpy_dtov(&output)
+            .clone_dtoh(&output)
             .map_err(|e| format!("dtoh result: {e}"))?;
         Ok(result[0])
     }