diff --git a/crates/neuron/src/harness/tp/all_reduce.rs b/crates/neuron/src/harness/tp/all_reduce.rs index 7aedd4d..4b1100d 100644 --- a/crates/neuron/src/harness/tp/all_reduce.rs +++ b/crates/neuron/src/harness/tp/all_reduce.rs @@ -20,6 +20,7 @@ #![cfg(feature = "cuda")] +use candle_core::backend::BackendStorage; use candle_core::cuda_backend::WrapErr; use candle_core::{CpuStorage, CudaStorage, CustomOp1, DType, Layout, Result, Shape}; use cudarc::nccl::{Comm, ReduceOp}; @@ -61,8 +62,6 @@ impl CustomOp1 for AllReduce { } fn cuda_fwd(&self, s: &CudaStorage, l: &Layout) -> Result<(CudaStorage, Shape)> { - use cudarc::driver::DeviceSlice; - // Reject non-contiguous inputs explicitly — copying them // server-side would mask shape bugs (a TP layer feeding a // strided activation into all_reduce is almost certainly a