feat(neuron): TP-vision Stage 1 — replicated vision tower on the TP model

Load the full, unsharded model.visual.* vision tower on every TP rank (leader + each subprocess worker mmaps the same local safetensors) when config.vision_config is present. VisionTower::load already takes a ShardedVarBuilder whose plain .get() returns the full replicated tensor, so the tower loads identically regardless of world_size — no sharding, no NCCL broadcast.

- TpQwen3_5ForCausalLM gains vision: Option<VisionTower> + image_token_id, plus has_vision/image_token_id/encode_image/forward_with_vision, mirroring the single-GPU Qwen3_5ForCausalLM wrapper.
- TpQwen3_5Model::forward_with_vision mirrors the single-GPU forward_inner splice: embed locally, replace rows at image_token_id positions, run the sharded decoder stack. Because every rank encodes the same pixels through its replicated tower, the spliced input embeddings are identical across ranks — preserving the TP replicated-hidden-state invariant the row-parallel AllReduce relies on.
- splice_runs is now pub(crate) and shared with the TP model.

No caller yet — Stage 2 wires the RPC/worker path that invokes encode_image + forward_with_vision per rank.

Most of this compiles on the non-cuda build (only the cuda load variant's tower line is gated); CI's CUDA type-check covers the rest.

Refs TP-vision plan Stage 1.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 15:00:05 +03:00
parent 7bb033b4ed
commit 9a24b05866
2 changed files with 159 additions and 3 deletions
--- a/crates/neuron/src/harness/arch/qwen3_5/mod.rs
+++ b/crates/neuron/src/harness/arch/qwen3_5/mod.rs
@@ -236,7 +236,11 @@ fn default_partial_rotary_factor() -> f32 {
 /// `slice_assign` per run. For typical Qwen3.6 requests this is one
 /// or two runs per image; `slice_assign` does one tensor copy per
 /// run, which is cheap relative to the decoder forward pass.
-fn splice_runs(h: &Tensor, img: &Tensor, positions: &[u32]) -> candle_core::Result<Tensor> {
+pub(crate) fn splice_runs(
    h: &Tensor,
    img: &Tensor,
    positions: &[u32],
 ) -> candle_core::Result<Tensor> {
    debug_assert!(
        !positions.is_empty(),
        "splice_runs precondition: non-empty positions"
--- a/crates/neuron/src/harness/tp/tp_qwen3_5.rs
+++ b/crates/neuron/src/harness/tp/tp_qwen3_5.rs
@@ -46,6 +46,8 @@ use super::tp_linear::{ColumnParallelLinear, RowParallelLinear};
 use crate::harness::arch::qwen3_5::linear_attn::repeat_interleave;
 use crate::harness::arch::qwen3_5::rmsnorm::{Qwen3_5RmsNorm, Qwen3_5RmsNormGated, l2norm};
 use crate::harness::arch::qwen3_5::rope::RotaryEmbedding;
 use crate::harness::arch::qwen3_5::splice_runs;
 use crate::harness::arch::qwen3_5::vision::VisionTower;
 pub use crate::harness::arch::qwen3_5::{Config, TextConfig};
 // ─── linear-attention (Gated DeltaNet) ──────────────────────────────
@@ -990,11 +992,103 @@ impl TpQwen3_5Model {
        }
        self.norm.forward(&h)
    }
    /// Runs the TP decoder over `input` with image patch embeddings
    /// spliced into the token embeddings (replicated vision tower).
    ///
    /// Counterpart of the single-GPU `Qwen3_5Model::forward_inner`
    /// splice. The token ids are embedded on this rank, every row whose
    /// id equals `image_token_id` is overwritten via `splice_runs` with
    /// rows from `image_embeds`, and the result goes through the
    /// sharded decoder layers and the final norm.
    ///
    /// TP requires every rank to hold the same hidden state (only the
    /// attention/MLP matmuls shard, followed by an `AllReduce`). The
    /// splice keeps that invariant because each rank encodes the *same*
    /// pixels through its *replicated* tower and therefore gets the
    /// same `image_embeds` — no broadcast is needed.
    ///
    /// # Errors
    ///
    /// Fails when the number of `image_token_id` occurrences in `input`
    /// differs from `image_embeds.dim(0)`, or when any tensor op fails.
    pub fn forward_with_vision(
        &mut self,
        input: &Tensor,
        offset: usize,
        image_embeds: &Tensor,
        image_token_id: u32,
    ) -> candle_core::Result<Tensor> {
        let (batch, seq_len) = input.dims2()?;
        let mut hidden = self.embed_tokens.forward(input)?;

        // CPU-side scan of the (already expanded) prompt ids, same as
        // the single-GPU path. Positions index the flattened ids; the
        // `as u32` cast assumes that index fits in 32 bits.
        let token_ids: Vec<u32> = input.flatten_all()?.to_vec1()?;
        let n_img_tokens = image_embeds.dim(0)?;
        let positions: Vec<u32> = token_ids
            .iter()
            .enumerate()
            .filter(|&(_, &id)| id == image_token_id)
            .map(|(idx, _)| idx as u32)
            .collect();

        // One patch row per image-token slot, otherwise the prompt
        // expansion upstream is wrong.
        if positions.len() != n_img_tokens {
            candle_core::bail!(
                "TP forward_with_vision: prompt has {} image-token positions but \
                 image_embeds carries {} tokens — ensure the per-image patch-count \
                 expansion has been applied",
                positions.len(),
                n_img_tokens,
            );
        }

        // `splice_runs` requires non-empty positions, so skip it for a
        // prompt that carries no image tokens.
        if !positions.is_empty() {
            let patches = image_embeds.to_dtype(self.dtype)?;
            hidden = splice_runs(&hidden, &patches, &positions)?;
        }

        // A single-token step needs no causal mask.
        let causal = match seq_len {
            1 => None,
            _ => Some(self.causal_mask(batch, seq_len, offset)?),
        };
        for layer in self.layers.iter_mut() {
            hidden = layer.forward(&hidden, causal.as_ref(), offset)?;
        }
        self.norm.forward(&hidden)
    }
 }
 /// Tensor-parallel Qwen3.5 causal-LM wrapper: the TP decoder (`base`)
 /// plus the output head, and optionally a replicated vision tower for
 /// image inputs.
 pub struct TpQwen3_5ForCausalLM {
    /// TP decoder model; `forward` / `forward_with_vision` delegate to
    /// it to obtain the hidden states.
    base: TpQwen3_5Model,
    /// Output head applied to the hidden state sliced from the last
    /// sequence position (`l - 1..`) in `forward` and
    /// `forward_with_vision`.
    lm_head: super::tp_linear::MaybeQuantLinear,
    /// Replicated vision tower (TP-vision). Loaded on every rank from
    /// the full, unsharded `model.visual.*` weights; `None` for
    /// text-only checkpoints. Each rank encodes the same image
    /// independently — no sharding, no broadcast — which keeps the
    /// spliced input embeddings identical across ranks (the
    /// replicated-hidden-state invariant the sharded layers rely on).
    vision: Option<VisionTower>,
    /// `<|image_pad|>` sentinel id (mirrors `Config::image_token_id`),
    /// exposed through the `image_token_id()` accessor. Note that
    /// `forward_with_vision` takes the splice-target id as an explicit
    /// argument rather than reading this field.
    image_token_id: Option<u32>,
 }
 /// Builds the replicated vision tower for a TP load, if the checkpoint
 /// has one.
 ///
 /// Returns `Ok(None)` when `config.vision_config` is absent (text-only
 /// checkpoint). Otherwise loads the tower from the `model.visual`
 /// prefix of `vb` and returns `Ok(Some(tower))`; a load failure is
 /// propagated with added context.
 ///
 /// Used by both the cuda and non-cuda `load` variants. Every rank
 /// mmaps the same full safetensors, and plain `.get()` on a
 /// `ShardedVarBuilder` yields the full (replicated) tensor, so the
 /// result does not depend on `world_size`.
 fn load_replicated_vision_tower(
    config: &Config,
    vb: &ShardedVarBuilder,
 ) -> Result<Option<VisionTower>> {
    // No vision block in the config: nothing to load.
    let vision_cfg = match config.vision_config.clone() {
        Some(cfg) => cfg,
        None => return Ok(None),
    };

    tracing::info!(
        depth = vision_cfg.depth,
        hidden_size = vision_cfg.hidden_size,
        "loading qwen3_5 vision tower (TP replicated)"
    );

    let tower = VisionTower::load(vision_cfg, vb.pp("model.visual"))
        .context("load qwen3_5 vision tower (model.visual.*) [TP replicated]")?;
    Ok(Some(tower))
 }
 impl TpQwen3_5ForCausalLM {
@@ -1012,7 +1106,14 @@ impl TpQwen3_5ForCausalLM {
        let cfg = &config.text_config;
        let base = TpQwen3_5Model::load(cfg, vb, mmap, rank, world_size, comm, quant)?;
        let lm_head = build_lm_head(cfg, vb, &base, quant)?;
-        let model = Self { base, lm_head };
+        let vision = load_replicated_vision_tower(&config, vb)?;
        let image_token_id = config.image_token_id;
        let model = Self {
            base,
            lm_head,
            vision,
            image_token_id,
        };
        log_construction_complete(cfg, rank, world_size, quant, model.device());
        Ok(model)
    }
@@ -1029,17 +1130,68 @@ impl TpQwen3_5ForCausalLM {
        let cfg = &config.text_config;
        let base = TpQwen3_5Model::load(cfg, vb, mmap, rank, world_size, quant)?;
        let lm_head = build_lm_head(cfg, vb, &base, quant)?;
-        let model = Self { base, lm_head };
+        let vision = load_replicated_vision_tower(&config, vb)?;
        let image_token_id = config.image_token_id;
        let model = Self {
            base,
            lm_head,
            vision,
            image_token_id,
        };
        log_construction_complete(cfg, rank, world_size, quant, model.device());
        Ok(model)
    }
    /// True when this TP load materialised a replicated vision tower,
    /// i.e. the `vision` field is `Some`.
    /// Drives capability advertising and the Stage 3 vision dispatch.
    pub fn has_vision(&self) -> bool {
        self.vision.is_some()
    }
    /// `<|image_pad|>` sentinel id, when known.
    ///
    /// Returns the value copied from `Config::image_token_id` at load
    /// time; `None` when the config carried no such id.
    pub fn image_token_id(&self) -> Option<u32> {
        self.image_token_id
    }
    /// Runs one preprocessed `(C, H, W)` image through this rank's
    /// replicated vision tower and returns the LM-side patch
    /// embeddings `(N_lm, hidden)`.
    ///
    /// # Errors
    ///
    /// Returns an error when no vision tower was loaded (`vision` is
    /// `None`), or when the tower's forward pass fails.
    pub fn encode_image(&self, image: &Tensor) -> Result<Tensor> {
        let tower = self.vision.as_ref().ok_or_else(|| {
            anyhow::anyhow!(
                "encode_image: this TP Qwen3.6 load has no vision tower \
                 (config.json::vision_config absent or weights missing)"
            )
        })?;
        tower.forward(image)
    }
    /// Text-only forward pass.
    ///
    /// Runs the TP decoder on `input` (a rank-2 tensor of token ids) at
    /// cache position `offset`, keeps the hidden state from the last
    /// sequence position onward, and applies `lm_head` to it.
    pub fn forward(&mut self, input: &Tensor, offset: usize) -> candle_core::Result<Tensor> {
        let (_, seq_len) = input.dims2()?;
        let hidden = self.base.forward(input, offset)?;
        // Only the final position feeds the output head.
        let last = hidden.i((.., seq_len - 1.., ..))?;
        last.apply(&self.lm_head)
    }
    /// Vision-aware counterpart of `forward` for the TP model.
    ///
    /// Delegates to `TpQwen3_5Model::forward_with_vision`, which
    /// splices `image_embeds` into this rank's input embeddings at the
    /// `image_token_id` positions before the sharded decoder stack
    /// runs. The hidden state from the last sequence position onward
    /// is then passed through `lm_head`, exactly as in `forward`.
    pub fn forward_with_vision(
        &mut self,
        input: &Tensor,
        offset: usize,
        image_embeds: &Tensor,
        image_token_id: u32,
    ) -> candle_core::Result<Tensor> {
        let (_, seq_len) = input.dims2()?;
        let hidden = self
            .base
            .forward_with_vision(input, offset, image_embeds, image_token_id)?;
        // Only the final position feeds the output head.
        let last = hidden.i((.., seq_len - 1.., ..))?;
        last.apply(&self.lm_head)
    }
    /// Clears cached decoding state by delegating to the wrapped base
    /// model's `clear_kv_cache`.
    pub fn clear_kv_cache(&mut self) {
        self.base.clear_kv_cache();
    }