Stage 7b-iii (1/2): AllReduce CustomOp + ShardedVarBuilder-backed TP linears

Ports the canonical candle-examples/examples/llama_multiprocess/model.rs pattern into the harness. Two new files, one deletion: - harness/tp/all_reduce.rs — AllReduce wraps Arc<cudarc::nccl::Comm> and implements candle's CustomOp1 trait. cuda_fwd extracts the rank's CudaSlice<dtype> from a CudaStorage, asserts the input is contiguous (a strided activation hitting all_reduce is almost always a model construction bug), allocates an output CudaSlice on the same device, calls Comm::all_reduce(Sum), and wraps the result back as a CudaStorage. Handles BF16, F16, F32. NcclError surfaces via {e:?} (no Display impl in cudarc 0.19.x). Send/Sync hand-impl'd with the same NCCL-thread-safety caveat candle's example documents. - harness/tp/tp_linear.rs — ColumnParallelLinear and RowParallelLinear, both built on candle's ShardedVarBuilder + Shard hints. `vb.get_with_hints((), "weight", shard(dim, rank, ws))` reads JUST the rank's slice from the safetensors view; no full- tensor host materialisation. ColumnParallel.forward is a plain local matmul (output is naturally sharded). RowParallel.forward = local matmul + apply_op1_no_bwd(&self.all_reduce). On CPU / world_size == 1, the AllReduce is skipped and the partial output is returned as-is. Both layers are no-bias — every Qwen3-family target sets attention_bias=false; bias-aware sharding is a future-model concern. - Deletes harness/tp/sharded_linear.rs from 7b-ii. That commit's hand-rolled "load full + narrow" approach was useful exploration but candle's ShardedVarBuilder does the same work without materialising the full tensor on host. The 5 unit tests there verified the slicing math against an unsharded reference; that math now lives inside candle and is covered by candle's own tests. Next (7b-iii 2/2): TpQwen3Attention + TpQwen3MLP composing the column/row pair, then a TpQwen3Model that runs the full forward. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 18:14:54 +03:00
parent 5436af9c73
commit 8d3194f992
4 changed files with 257 additions and 406 deletions
--- a/crates/neuron/src/harness/tp/all_reduce.rs
+++ b/crates/neuron/src/harness/tp/all_reduce.rs
@@ -0,0 +1,121 @@
 //! `AllReduce` as a candle `CustomOp1` — the bridge between candle's
 //! `Tensor` graph and `cudarc::nccl::Comm::all_reduce`.
 //!
 //! Ported from the canonical
 //! `candle-examples/examples/llama_multiprocess/model.rs` pattern.
 //! Row-parallel layers apply this op after their local matmul to sum
 //! partial outputs across NCCL ranks.
 //!
 //! Available only under `--features cuda`; on CPU builds this module
 //! is empty and row-parallel layers degenerate to local matmul only
 //! (useful for compile-checking the model code; correctness requires
 //! cuda).
 //!
 //! Thread-safety caveat: NCCL communicators are technically only
 //! safe to use from a single thread at a time
 //! (https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/threadsafety.html).
 //! We hold the `AllReduce` behind an `Arc<Comm>` and only issue ops
 //! against it from the dedicated `spawn_blocking` thread the inference
 //! pipeline already uses for candle's forward passes.
 #![cfg(feature = "cuda")]
 use candle_core::cuda_backend::WrapErr;
 use candle_core::{CpuStorage, CudaStorage, CustomOp1, DType, Layout, Result, Shape};
 use cudarc::nccl::{Comm, ReduceOp};
 use half::{bf16, f16};
 use std::sync::Arc;
 /// Wraps an NCCL `Comm` so it can be plugged into a candle forward
 /// graph as a custom op. Each row-parallel layer holds one of these.
 pub struct AllReduce {
    comm: Arc<Comm>,
 }
 // SAFETY: `Comm` contains a raw `ncclComm_t` pointer; NCCL's docs note
 // that issuing ops against one comm from multiple threads concurrently
 // is unsafe. We serialise via the single spawn_blocking thread that
 // drives the model's forward pass. The Send/Sync impl is necessary
 // because candle's CustomOp1 trait bounds require it; the correctness
 // invariant is enforced at the call site, not the type level.
 unsafe impl Send for AllReduce {}
 unsafe impl Sync for AllReduce {}
 impl AllReduce {
    pub fn new(comm: Arc<Comm>) -> Self {
        Self { comm }
    }
    pub fn comm(&self) -> &Arc<Comm> {
        &self.comm
    }
 }
 impl CustomOp1 for AllReduce {
    fn name(&self) -> &'static str {
        "neuron.tp.all_reduce"
    }
    fn cpu_fwd(&self, _s: &CpuStorage, _l: &Layout) -> Result<(CpuStorage, Shape)> {
        candle_core::bail!("AllReduce custom-op invoked on CPU storage; TP requires CUDA")
    }
    fn cuda_fwd(&self, s: &CudaStorage, l: &Layout) -> Result<(CudaStorage, Shape)> {
        use cudarc::driver::DeviceSlice;
        // Reject non-contiguous inputs explicitly — copying them
        // server-side would mask shape bugs (a TP layer feeding a
        // strided activation into all_reduce is almost certainly a
        // model construction error).
        fn require_contiguous<T: cudarc::driver::DeviceRepr>(
            slice: &cudarc::driver::CudaSlice<T>,
            l: &Layout,
        ) -> Result<()> {
            match l.contiguous_offsets() {
                Some((0, n)) if n == slice.len() => Ok(()),
                _ => candle_core::bail!(
                    "AllReduce input is non-contiguous: layout={:?}, slice_len={}",
                    l,
                    slice.len()
                ),
            }
        }
        let elem_count = l.shape().elem_count();
        let dev = s.device().clone();
        let out = match s.dtype() {
            DType::BF16 => {
                let src = s.as_cuda_slice::<bf16>()?;
                require_contiguous(src, l)?;
                let mut dst = unsafe { dev.alloc::<bf16>(elem_count) }.w()?;
                self.comm
                    .all_reduce(src, &mut dst, &ReduceOp::Sum)
                    .map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce bf16: {e:?}")))?;
                CudaStorage::wrap_cuda_slice(dst, dev)
            }
            DType::F16 => {
                let src = s.as_cuda_slice::<f16>()?;
                require_contiguous(src, l)?;
                let mut dst = unsafe { dev.alloc::<f16>(elem_count) }.w()?;
                self.comm
                    .all_reduce(src, &mut dst, &ReduceOp::Sum)
                    .map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce f16: {e:?}")))?;
                CudaStorage::wrap_cuda_slice(dst, dev)
            }
            DType::F32 => {
                let src = s.as_cuda_slice::<f32>()?;
                require_contiguous(src, l)?;
                let mut dst = unsafe { dev.alloc::<f32>(elem_count) }.w()?;
                self.comm
                    .all_reduce(src, &mut dst, &ReduceOp::Sum)
                    .map_err(|e| candle_core::Error::Msg(format!("nccl all_reduce f32: {e:?}")))?;
                CudaStorage::wrap_cuda_slice(dst, dev)
            }
            dtype => candle_core::bail!(
                "AllReduce: unsupported dtype {dtype:?}; TP path expects bf16/f16/f32"
            ),
        };
        Ok((out, l.shape().clone()))
    }
 }
--- a/crates/neuron/src/harness/tp/mod.rs
+++ b/crates/neuron/src/harness/tp/mod.rs
@@ -17,9 +17,10 @@
 //! - **7b:** TP-aware Qwen3 inference dispatched through the pool.
 //! - **7c:** crash detection, streaming SSE, graceful unload.
 pub mod all_reduce;
 pub mod nccl_state;
 pub mod rpc;
-pub mod sharded_linear;
+pub mod tp_linear;
 pub mod worker;
 use anyhow::{Context, Result};
--- a/crates/neuron/src/harness/tp/sharded_linear.rs
+++ b/crates/neuron/src/harness/tp/sharded_linear.rs
@@ -1,405 +0,0 @@
 //! Tensor-parallel linear layers over `candle_nn::Linear`.
 //!
 //! Two sharding strategies, both following the Megatron-LM convention
 //! that's also what mistral.rs uses for vanilla Qwen3:
 //!
 //! - [`ColumnParallelLinear`] — splits the **output** dimension. Each
 //!   rank holds `out_features / world_size` rows of the weight matrix.
 //!   The forward pass is a plain local matmul; the output is *sharded*
 //!   (each rank produces a slice of the output vector). Used for
 //!   `q_proj` / `k_proj` / `v_proj` (sharding by head) and the FFN's
 //!   `gate_proj` / `up_proj`.
 //!
 //! - [`RowParallelLinear`] — splits the **input** dimension. Each
 //!   rank holds `in_features / world_size` columns of the weight
 //!   matrix and consumes a sharded input from upstream. Each rank's
 //!   local matmul produces a *partial* output; an `all_reduce(Sum)`
 //!   across ranks recovers the full activation. Used for `o_proj`
 //!   (after attention) and `down_proj` (after the FFN).
 //!
 //! Stage 7b-ii (this commit): the layers, sharded loading, local
 //! forward. The `all_reduce` collective lives in `forward_with_comm`
 //! and is wired up in 7b-iii when the full TP-aware Qwen3 model is
 //! assembled with an NCCL Comm in scope. Tests here exercise only
 //! the local (no-NCCL) math against an unsharded reference.
 use anyhow::{Context, Result};
 use candle_core::{Module, Tensor};
 use candle_nn::{Linear, VarBuilder};
 /// Direction of the parallelism split — selects which axis of the
 /// weight matrix the rank's local slice is taken from.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum ShardKind {
    /// Split the output dimension: rank `r` holds rows
    /// `[r * out/N .. (r+1) * out/N]` of the weight matrix. The
    /// downstream consumer either accepts a sharded activation
    /// (the next layer is also column-parallel) or merges via
    /// all-gather.
    Column,
    /// Split the input dimension: rank `r` holds columns
    /// `[r * in/N .. (r+1) * in/N]`. The forward pass produces a
    /// partial output; an `all_reduce(Sum)` across ranks yields the
    /// full activation.
    Row,
 }
 /// A linear layer whose weights have been sharded across NCCL ranks.
 ///
 /// Holds a standard `candle_nn::Linear` constructed from the local
 /// slice. The collective op (only meaningful for `Row`) is invoked
 /// by [`forward_with_comm`] — the trait `Module::forward` does just
 /// the local matmul, so callers that want correct semantics on a
 /// Row-parallel layer must drive the collective themselves.
 #[derive(Debug)]
 pub struct ShardedLinear {
    inner: Linear,
    kind: ShardKind,
    rank: u32,
    world_size: u32,
    /// Captured for diagnostics ("rank 3 layer says X but should say Y").
    /// `out_features` reflects the **logical** size (pre-shard) so the
    /// caller can validate against the model config without doing the
    /// arithmetic itself.
    logical_out_features: usize,
    logical_in_features: usize,
 }
 impl ShardedLinear {
    /// Load a column-parallel slice from a `VarBuilder`. Reads the
    /// full weight (and bias, if any) from the safetensors and
    /// narrows on dim 0 to the rank's slice. The bias is sharded the
    /// same way (each rank holds its own bias slice).
    ///
    /// Bails if `out_features` is not divisible by `world_size` — the
    /// same divisibility precondition mistral.rs's PR #2054-era code
    /// added an explicit guard for after the first TP shard attempt.
    pub fn load_column(
        vb: &VarBuilder,
        in_features: usize,
        out_features: usize,
        has_bias: bool,
        rank: u32,
        world_size: u32,
    ) -> Result<Self> {
        let path = vb.prefix();
        if !out_features.is_multiple_of(world_size as usize) {
            anyhow::bail!(
                "column-parallel '{path}': out_features={out_features} \
                 not divisible by world_size={world_size}"
            );
        }
        let shard = out_features / world_size as usize;
        let start = rank as usize * shard;
        let full_w = vb
            .get((out_features, in_features), "weight")
            .with_context(|| format!("load weight for column-parallel '{path}'"))?;
        let weight = full_w
            .narrow(0, start, shard)
            .with_context(|| format!("narrow weight rows for column-parallel '{path}'"))?
            .contiguous()
            .with_context(|| format!("contiguous weight for column-parallel '{path}'"))?;
        // Drop the full tensor as soon as we have the shard so peak
        // host RAM during load tracks shard-size, not full-size, once
        // all narrows complete (Rust's drop semantics handle this
        // because `full_w` goes out of scope here).
        drop(full_w);
        let bias = if has_bias {
            let full_b = vb
                .get(out_features, "bias")
                .with_context(|| format!("load bias for column-parallel '{path}'"))?;
            let b = full_b
                .narrow(0, start, shard)
                .with_context(|| format!("narrow bias for column-parallel '{path}'"))?
                .contiguous()
                .with_context(|| format!("contiguous bias for column-parallel '{path}'"))?;
            Some(b)
        } else {
            None
        };
        Ok(Self {
            inner: Linear::new(weight, bias),
            kind: ShardKind::Column,
            rank,
            world_size,
            logical_out_features: out_features,
            logical_in_features: in_features,
        })
    }
    /// Load a row-parallel slice from a `VarBuilder`. Reads the full
    /// weight and narrows on dim 1 to the rank's column slice. The
    /// bias, if any, lives **only on rank 0** — every other rank
    /// holds `None`. This keeps the post-`all_reduce` semantics
    /// correct: each rank contributes its partial sum without the
    /// bias, then rank 0's bias (added in `forward_with_comm`) lands
    /// on the result exactly once.
    pub fn load_row(
        vb: &VarBuilder,
        in_features: usize,
        out_features: usize,
        has_bias: bool,
        rank: u32,
        world_size: u32,
    ) -> Result<Self> {
        let path = vb.prefix();
        if !in_features.is_multiple_of(world_size as usize) {
            anyhow::bail!(
                "row-parallel '{path}': in_features={in_features} \
                 not divisible by world_size={world_size}"
            );
        }
        let shard = in_features / world_size as usize;
        let start = rank as usize * shard;
        let full_w = vb
            .get((out_features, in_features), "weight")
            .with_context(|| format!("load weight for row-parallel '{path}'"))?;
        let weight = full_w
            .narrow(1, start, shard)
            .with_context(|| format!("narrow weight cols for row-parallel '{path}'"))?
            .contiguous()
            .with_context(|| format!("contiguous weight for row-parallel '{path}'"))?;
        drop(full_w);
        let bias = if has_bias && rank == 0 {
            let b = vb
                .get(out_features, "bias")
                .with_context(|| format!("load bias for row-parallel '{path}'"))?;
            Some(b)
        } else {
            None
        };
        Ok(Self {
            inner: Linear::new(weight, bias),
            kind: ShardKind::Row,
            rank,
            world_size,
            logical_out_features: out_features,
            logical_in_features: in_features,
        })
    }
    pub fn kind(&self) -> ShardKind {
        self.kind
    }
    pub fn rank(&self) -> u32 {
        self.rank
    }
    pub fn world_size(&self) -> u32 {
        self.world_size
    }
    pub fn logical_in_features(&self) -> usize {
        self.logical_in_features
    }
    pub fn logical_out_features(&self) -> usize {
        self.logical_out_features
    }
 }
 impl Module for ShardedLinear {
    /// Local matmul only. For `Row`-parallel layers, the output is a
    /// *partial sum* — call [`Self::forward_with_comm`] to get the
    /// reduced result. Implementing `Module` lets a `ShardedLinear`
    /// be drop-in for any `Module`-shaped consumer that doesn't need
    /// the reduce step (column-parallel layers; tests).
    fn forward(&self, x: &Tensor) -> candle_core::Result<Tensor> {
        self.inner.forward(x)
    }
 }
 #[cfg(feature = "cuda")]
 impl ShardedLinear {
    /// Forward pass that issues an `all_reduce(Sum)` for row-parallel
    /// layers. Column-parallel layers just delegate to the local
    /// matmul (their output is naturally sharded; the next consumer
    /// will either gather or accept the shard).
    pub fn forward_with_comm(&self, x: &Tensor, comm: &cudarc::nccl::Comm) -> Result<Tensor> {
        let local = self
            .inner
            .forward(x)
            .map_err(|e| anyhow::anyhow!("local matmul: {e}"))?;
        match self.kind {
            ShardKind::Column => Ok(local),
            ShardKind::Row => {
                // TODO Stage 7b-iii: wrap `local`'s CudaSlice with a
                // matching output buffer, call comm.all_reduce(Sum),
                // return the result. The cudarc::nccl all_reduce
                // signature takes `&S: DevicePtr<T>` + `&mut R: DevicePtrMut<T>`,
                // both backed by `CudaSlice<T>`. candle stores its
                // Tensor data behind its own slab — extracting the
                // underlying CudaSlice safely is a separate piece of
                // plumbing best landed alongside the model assembly,
                // so this body is a placeholder.
                let _ = comm;
                anyhow::bail!(
                    "ShardedLinear::forward_with_comm row-parallel reduce \
                     lands in Stage 7b-iii alongside the model assembly; \
                     7b-ii ships only the local matmul"
                );
            }
        }
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    use candle_core::{DType, Device, Tensor};
    use candle_nn::var_builder::VarBuilderArgs;
    use std::collections::HashMap;
    /// Build a VarBuilder over an in-memory map of tensors. Used by
    /// the tests to fake a safetensors source without touching disk.
    fn vb_from_map(tensors: HashMap<String, Tensor>, device: &Device) -> VarBuilder<'static> {
        VarBuilderArgs::from_tensors(tensors, DType::F32, device)
    }
    /// World_size=2 column-parallel split of a 4x3 weight. Each rank's
    /// local matmul on the same input should be 2 rows of the
    /// reference (full) matmul.
    #[test]
    fn column_parallel_shards_output_correctly() {
        let device = Device::Cpu;
        // weight (out=4, in=3): rows are easy to identify by value.
        let w = Tensor::from_slice(
            &[
                1f32, 2., 3., // row 0
                4., 5., 6., // row 1
                7., 8., 9., // row 2
                10., 11., 12., // row 3
            ],
            (4, 3),
            &device,
        )
        .unwrap();
        let mut tensors = HashMap::new();
        tensors.insert("foo.weight".into(), w.clone());
        let vb_root = vb_from_map(tensors, &device);
        let vb_foo = vb_root.pp("foo");
        // rank 0 of world_size 2 gets rows 0..2.
        let r0 = ShardedLinear::load_column(&vb_foo, 3, 4, false, 0, 2).unwrap();
        // rank 1 gets rows 2..4.
        let r1 = ShardedLinear::load_column(&vb_foo, 3, 4, false, 1, 2).unwrap();
        let x = Tensor::from_slice(&[1f32, 0., 0.], (1, 3), &device).unwrap();
        let y0 = r0.forward(&x).unwrap().to_vec2::<f32>().unwrap();
        let y1 = r1.forward(&x).unwrap().to_vec2::<f32>().unwrap();
        // Full reference: x @ w.T → [1, 4, 7, 10]. Rank 0 owns [1, 4],
        // rank 1 owns [7, 10].
        assert_eq!(y0, vec![vec![1.0, 4.0]]);
        assert_eq!(y1, vec![vec![7.0, 10.0]]);
    }
    /// World_size=2 row-parallel split of a 4x4 weight. Each rank's
    /// local matmul on its half of the input should be a partial sum;
    /// summing the two partials should equal the unsharded reference.
    #[test]
    fn row_parallel_partials_sum_to_full() {
        let device = Device::Cpu;
        // weight (out=4, in=4): use distinct values per column so the
        // partial sums are obviously different.
        let w = Tensor::from_slice(
            &[
                1f32, 2., 3., 4., // row 0
                5., 6., 7., 8., // row 1
                9., 10., 11., 12., // row 2
                13., 14., 15., 16., // row 3
            ],
            (4, 4),
            &device,
        )
        .unwrap();
        let mut tensors = HashMap::new();
        tensors.insert("bar.weight".into(), w.clone());
        let vb_root = vb_from_map(tensors, &device);
        let vb_bar = vb_root.pp("bar");
        let r0 = ShardedLinear::load_row(&vb_bar, 4, 4, false, 0, 2).unwrap();
        let r1 = ShardedLinear::load_row(&vb_bar, 4, 4, false, 1, 2).unwrap();
        // x split: rank 0 takes x[..2], rank 1 takes x[2..].
        let x_full = Tensor::from_slice(&[1f32, 1., 1., 1.], (1, 4), &device).unwrap();
        let x0 = x_full.narrow(1, 0, 2).unwrap();
        let x1 = x_full.narrow(1, 2, 2).unwrap();
        let y0 = r0.forward(&x0).unwrap();
        let y1 = r1.forward(&x1).unwrap();
        let summed = (y0 + y1).unwrap().to_vec2::<f32>().unwrap();
        // Reference: x_full @ w.T = [1+2+3+4, 5+6+7+8, 9+10+11+12, 13+14+15+16]
        //                         = [10, 26, 42, 58].
        assert_eq!(summed, vec![vec![10.0, 26.0, 42.0, 58.0]]);
    }
    /// Row-parallel bias lives only on rank 0; other ranks have None.
    /// (Verifies the rank-0-only bias contract.)
    #[test]
    fn row_parallel_bias_only_on_rank_zero() {
        let device = Device::Cpu;
        let w = Tensor::zeros((4, 4), DType::F32, &device).unwrap();
        let b = Tensor::from_slice(&[1f32, 1., 1., 1.], 4, &device).unwrap();
        let mut tensors = HashMap::new();
        tensors.insert("baz.weight".into(), w);
        tensors.insert("baz.bias".into(), b);
        let vb_root = vb_from_map(tensors, &device);
        let vb_baz = vb_root.pp("baz");
        let r0 = ShardedLinear::load_row(&vb_baz, 4, 4, true, 0, 2).unwrap();
        let r1 = ShardedLinear::load_row(&vb_baz, 4, 4, true, 1, 2).unwrap();
        // We can't introspect the Linear's bias from the public API,
        // but we can run forward of zero-weight rank 1 and confirm
        // the output is zero (no bias added on non-zero ranks).
        let x = Tensor::ones((1, 2), DType::F32, &device).unwrap();
        let y1 = r1.forward(&x).unwrap().to_vec2::<f32>().unwrap();
        assert_eq!(y1, vec![vec![0.0, 0.0, 0.0, 0.0]]);
        let y0 = r0.forward(&x).unwrap().to_vec2::<f32>().unwrap();
        // Rank 0 weight is zero but bias is [1,1,1,1] → output should be [1,1,1,1].
        assert_eq!(y0, vec![vec![1.0, 1.0, 1.0, 1.0]]);
    }
    #[test]
    fn column_parallel_rejects_non_divisible_out_features() {
        let device = Device::Cpu;
        let w = Tensor::zeros((5, 3), DType::F32, &device).unwrap();
        let mut tensors = HashMap::new();
        tensors.insert("nope.weight".into(), w);
        let vb_root = vb_from_map(tensors, &device);
        let vb_nope = vb_root.pp("nope");
        let err = ShardedLinear::load_column(&vb_nope, 3, 5, false, 0, 2).unwrap_err();
        let msg = format!("{err:#}");
        assert!(
            msg.contains("not divisible by world_size"),
            "expected divisibility error, got: {msg}"
        );
    }
    #[test]
    fn row_parallel_rejects_non_divisible_in_features() {
        let device = Device::Cpu;
        let w = Tensor::zeros((4, 5), DType::F32, &device).unwrap();
        let mut tensors = HashMap::new();
        tensors.insert("nope.weight".into(), w);
        let vb_root = vb_from_map(tensors, &device);
        let vb_nope = vb_root.pp("nope");
        let err = ShardedLinear::load_row(&vb_nope, 5, 4, false, 0, 2).unwrap_err();
        let msg = format!("{err:#}");
        assert!(
            msg.contains("not divisible by world_size"),
            "expected divisibility error, got: {msg}"
        );
    }
 }
--- a/crates/neuron/src/harness/tp/tp_linear.rs
+++ b/crates/neuron/src/harness/tp/tp_linear.rs
@@ -0,0 +1,134 @@
 //! Tensor-parallel linear layers built on candle's `ShardedVarBuilder`
 //! and `Shard` sharding hints.
 //!
 //! candle reads only the rank's slice of each weight tensor from
 //! safetensors via `view.slice(start..stop)` — no full-tensor host
 //! materialisation. That's a memory-efficiency win over hand-rolled
 //! "load full + narrow" sharding (which the earlier
 //! `sharded_linear.rs` exploration demonstrated but didn't pay for).
 //!
 //! Two layer types:
 //!
 //! - [`ColumnParallelLinear`] — output-sharded; forward is a plain
 //!   local matmul. The downstream consumer either accepts a sharded
 //!   activation (next layer is also column-parallel) or all-gathers.
 //! - [`RowParallelLinear`] — input-sharded; forward = local matmul
 //!   then `AllReduce` `CustomOp1` to sum partials across ranks.
 //!
 //! Both assume **no bias** — every Qwen3-family weight layout we
 //! actually target (Qwen3, Qwen3-Coder, Qwen3.6 base, etc.) sets
 //! `attention_bias=false` and the MLP layers are no-bias. Adding bias
 //! support is mechanical when a future model needs it; the design
 //! choice would be: column-parallel shards the bias along dim 0;
 //! row-parallel holds the bias only on rank 0 so the post-`AllReduce`
 //! sum carries it exactly once.
 use anyhow::{Context, Result};
 use candle_core::{Module, Tensor};
 use candle_nn::Linear;
 use candle_nn::var_builder::{Shard, ShardedVarBuilder};
 #[cfg(feature = "cuda")]
 use super::all_reduce::AllReduce;
 /// Helper to build a [`Shard`] hint for a given dimension.
 pub(crate) fn shard(dim: usize, rank: u32, world_size: u32) -> Shard {
    Shard {
        dim,
        rank: rank as usize,
        world_size: world_size as usize,
    }
 }
 /// Output-dim sharded linear (column-parallel). Holds a standard
 /// `candle_nn::Linear` whose `weight` is the rank's slice of the full
 /// `[out_features, in_features]` tensor along dim 0.
 pub struct ColumnParallelLinear {
    inner: Linear,
 }
 impl ColumnParallelLinear {
    /// Load this rank's column-parallel slice from a
    /// `ShardedVarBuilder`. The provided `vb` must already be `pp`-ed
    /// to the layer's path (e.g. `vb.pp("model.layers.0.self_attn.q_proj")`).
    pub fn load(vb: &ShardedVarBuilder, rank: u32, world_size: u32) -> Result<Self> {
        let weight = vb
            .get_with_hints((), "weight", shard(0, rank, world_size))
            .with_context(|| format!("load column-parallel '{}' weight", vb.prefix()))?;
        Ok(Self {
            inner: Linear::new(weight, None),
        })
    }
 }
 impl Module for ColumnParallelLinear {
    fn forward(&self, x: &Tensor) -> candle_core::Result<Tensor> {
        self.inner.forward(x)
    }
 }
 /// Input-dim sharded linear (row-parallel).
 ///
 /// Holds a sharded `Linear` plus an `AllReduce` op the forward chains
 /// after the local matmul to recover the full activation.
 pub struct RowParallelLinear {
    inner: Linear,
    #[cfg(feature = "cuda")]
    all_reduce: AllReduce,
    /// Whether the AllReduce should run. Column-parallel ↔ row-parallel
    /// is a pair: the column output is sharded, the row input is
    /// sharded, and the AllReduce gives back the full output. For
    /// `world_size = 1` the AllReduce is a no-op so we skip it.
    needs_reduce: bool,
 }
 impl RowParallelLinear {
    /// Load this rank's row-parallel slice from a `ShardedVarBuilder`.
    ///
    /// Under `cuda`, `comm` is the NCCL communicator the row-parallel
    /// `AllReduce` runs against. On CPU builds the parameter is
    /// elided — forward returns the partial sum, which is the *wrong*
    /// answer for inference but lets us compile-check the model.
    #[cfg(feature = "cuda")]
    pub fn load(
        vb: &ShardedVarBuilder,
        rank: u32,
        world_size: u32,
        comm: std::sync::Arc<cudarc::nccl::Comm>,
    ) -> Result<Self> {
        let weight = vb
            .get_with_hints((), "weight", shard(1, rank, world_size))
            .with_context(|| format!("load row-parallel '{}' weight", vb.prefix()))?;
        Ok(Self {
            inner: Linear::new(weight, None),
            all_reduce: AllReduce::new(comm),
            needs_reduce: world_size > 1,
        })
    }
    #[cfg(not(feature = "cuda"))]
    pub fn load(vb: &ShardedVarBuilder, rank: u32, world_size: u32) -> Result<Self> {
        let weight = vb
            .get_with_hints((), "weight", shard(1, rank, world_size))
            .with_context(|| format!("load row-parallel '{}' weight", vb.prefix()))?;
        Ok(Self {
            inner: Linear::new(weight, None),
            needs_reduce: world_size > 1,
        })
    }
 }
 impl Module for RowParallelLinear {
    /// Local matmul followed by an `AllReduce` (when `cuda` and
    /// `world_size > 1`). On CPU or single-rank, returns the partial
    /// output directly — which is *only* correct for `world_size == 1`.
    fn forward(&self, x: &Tensor) -> candle_core::Result<Tensor> {
        let local = self.inner.forward(x)?;
        #[cfg(feature = "cuda")]
        if self.needs_reduce {
            return local.apply_op1_no_bwd(&self.all_reduce);
        }
        let _ = self.needs_reduce;
        Ok(local)
    }
 }