feat(neuron): TP-vision Stage 2 — per-rank image RPC + worker plumbing

Carry image content through the TP forward path so every rank encodes and splices locally (replicated tower, no embedding broadcast). - rpc.rs: new WorkerRequest::GenerateStepWithImages carrying the source image data URIs + image_token_id for the single-shot vision prefill; worker still replies GenerateStepOk. Round-trip test added. - tp_qwen3_5.rs: TpQwen3_5ForCausalLM::forward_with_images — encode each preprocessed image through the rank's replicated tower, cat, splice, forward. Shared by leader and worker so every rank runs identical work. - tp/mod.rs: TpLeaderModel::forward_with_images and WorkerPool::generate_step_with_images (mirrors generate_step: fan out GenerateStepWithImages to subprocess ranks, run the leader's image forward on its device worker thread, drain, combine). - worker.rs: WorkerModel::forward_with_images + handle_generate_step_with_images — each subprocess rank preprocesses the same data URIs via the shared deterministic preprocess_data_uri, encodes, splices, forwards. - device_worker: Job::TpForwardLogitsWithImages + tp_forward_logits_with_images dispatch handler + DeviceWorkerHandle::tp_forward_logits_with_images. Determinism: every rank runs the same preprocess on the same source URIs through the same replicated tower, so the spliced hidden state matches across ranks — preserving the replicated-hidden-state invariant the row-parallel AllReduce relies on, with no NCCL broadcast. No caller yet — Stage 3 wires the TP chat/stream entry points to invoke generate_step_with_images for image prefill. cuda-gated plumbing covered by CI's CUDA type-check; rpc/route/forward_with_images compile on the non-cuda build. Refs TP-vision plan Stage 2. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 15:08:08 +03:00
parent 9a24b05866
commit 4994b94c84
7 changed files with 508 additions and 0 deletions
--- a/crates/neuron/src/harness/device_worker/dispatch.rs
+++ b/crates/neuron/src/harness/device_worker/dispatch.rs
@@ -262,6 +262,25 @@ pub(crate) fn run(device_index: u32, rx: Receiver<Job>, poisoned: Arc<AtomicBool
                let result = tp_forward_logits(&mut state, handle, &tokens, offset);
                let _ = reply.send(result);
            }
            #[cfg(feature = "cuda")]
            Job::TpForwardLogitsWithImages {
                handle,
                tokens,
                offset,
                image_token_id,
                image_data_uris,
                reply,
            } => {
                let result = tp_forward_logits_with_images(
                    &mut state,
                    handle,
                    &tokens,
                    offset,
                    image_token_id,
                    &image_data_uris,
                );
                let _ = reply.send(result);
            }
            // Handled by the matches!() check above; reaching here
            // means a Shutdown slipped past which is a bug.
            Job::Shutdown => unreachable!("Shutdown should break above"),
@@ -734,6 +753,61 @@ fn tp_forward_logits(
    Ok(values)
 }
 /// Image-bearing leader forward (rank 0). Preprocesses each source
 /// `image_data_uris` entry through the same deterministic
 /// `preprocess_data_uri` every rank runs, uploads to the leader's
 /// device, encodes + splices + forwards via
 /// `TpLeaderModel::forward_with_images`, and copies the `[vocab]`
 /// logits to CPU. Mirrors the single-GPU `forward_logits_with_images`
 /// but on the TP leader's replicated tower.
 #[cfg(feature = "cuda")]
 fn tp_forward_logits_with_images(
    state: &mut DeviceWorkerState,
    handle: TpHandle,
    tokens: &[u32],
    offset: usize,
    image_token_id: u32,
    image_data_uris: &[String],
 ) -> anyhow::Result<Vec<f32>> {
    use crate::harness::preprocess::{PreprocessProfile, preprocess_data_uri};
    use candle_core::{DType, Tensor};
    if image_data_uris.is_empty() {
        anyhow::bail!("TpForwardLogitsWithImages dispatched with zero images");
    }
    // Preprocess every image into a device-resident (C, H, W) tensor.
    // Same fixed-resolution profile + decode path the subprocess workers
    // run, so the encoded embeddings match across ranks bit-for-bit.
    let profile = PreprocessProfile::qwen3_6();
    let (h, w) = (
        profile.target_height as usize,
        profile.target_width as usize,
    );
    let mut pixels: Vec<Tensor> = Vec::with_capacity(image_data_uris.len());
    for (idx, uri) in image_data_uris.iter().enumerate() {
        let px = preprocess_data_uri(uri, &profile)
            .with_context(|| format!("preprocess image[{idx}] (TP leader)"))?;
        let t = Tensor::from_vec(px, (3, h, w), &state.device)?;
        pixels.push(t);
    }
    let input = Tensor::new(tokens, &state.device)?.unsqueeze(0)?;
    let model = state.tp_models.get_mut(&handle).ok_or_else(|| {
        anyhow::anyhow!(
            "TpForwardLogitsWithImages: no model for handle {}",
            handle.0
        )
    })?;
    let logits = model.forward_with_images(&input, offset, &pixels, image_token_id)?;
    let logits = logits.squeeze(0)?.squeeze(0)?;
    let logits = logits.to_dtype(DType::F32)?.flatten_all()?;
    let values = logits.to_vec1::<f32>()?;
    Ok(values)
 }
 /// Forward step + copy the `[vocab]` logits to a CPU `Vec<f32>` ready
 /// for sampling on the async caller. The model's `device()` (CUDA or
 /// CPU) determines where the kernel runs; this fn doesn't care.
--- a/crates/neuron/src/harness/device_worker/jobs.rs
+++ b/crates/neuron/src/harness/device_worker/jobs.rs
@@ -231,6 +231,23 @@ pub enum Job {
        offset: usize,
        reply: oneshot::Sender<Result<Vec<f32>>>,
    },
    /// Image-bearing leader (rank 0) forward for the single-shot vision
    /// prefill. The handler preprocesses each `image_data_uris` entry
    /// (the same deterministic path every rank runs), encodes through
    /// the leader's replicated tower, splices at `image_token_id`, and
    /// returns CPU-side `[vocab]` logits. Image tensors never escape the
    /// worker thread. Caller fans out `GenerateStepWithImages` to the
    /// subprocess ranks and drains them; only the leader forward moves
    /// here.
    #[cfg(feature = "cuda")]
    TpForwardLogitsWithImages {
        handle: TpHandle,
        tokens: Vec<u32>,
        offset: usize,
        image_token_id: u32,
        image_data_uris: Vec<String>,
        reply: oneshot::Sender<Result<Vec<f32>>>,
    },
    /// Tell the worker to break its dispatch loop and exit. Any jobs
    /// queued after this in the channel reply `Err` to their oneshot
    /// senders (the senders are dropped on the worker's exit, which
--- a/crates/neuron/src/harness/device_worker/mod.rs
+++ b/crates/neuron/src/harness/device_worker/mod.rs
@@ -572,6 +572,47 @@ impl DeviceWorkerHandle {
        }
    }
    /// Image-bearing TP leader forward (single-shot vision prefill).
    /// Routes `Job::TpForwardLogitsWithImages` onto the worker thread;
    /// the handler preprocesses + encodes + splices + forwards and
    /// returns CPU-side `[vocab]` logits. The `WorkerPool` fans the
    /// matching `GenerateStepWithImages` out to subprocess ranks so the
    /// row-parallel collectives complete.
    #[cfg(feature = "cuda")]
    pub async fn tp_forward_logits_with_images(
        &self,
        handle: TpHandle,
        tokens: Vec<u32>,
        offset: usize,
        image_token_id: u32,
        image_data_uris: Vec<String>,
    ) -> Result<Vec<f32>, WorkerError> {
        if self.poisoned.load(Ordering::Acquire) {
            return Err(WorkerError::Poisoned {
                device_index: self.device_index,
            });
        }
        let (reply_tx, reply_rx) = oneshot::channel();
        self.tx
            .send(Job::TpForwardLogitsWithImages {
                handle,
                tokens,
                offset,
                image_token_id,
                image_data_uris,
                reply: reply_tx,
            })
            .map_err(|_| WorkerError::Gone {
                device_index: self.device_index,
            })?;
        match reply_rx.await {
            Ok(result) => result.map_err(WorkerError::from),
            Err(_) => Err(WorkerError::Gone {
                device_index: self.device_index,
            }),
        }
    }
    /// Send `Job::Shutdown` and join the thread. Idempotent — calling
    /// twice is a no-op the second time.
    pub fn shutdown(&self) -> anyhow::Result<()> {
--- a/crates/neuron/src/harness/tp/mod.rs
+++ b/crates/neuron/src/harness/tp/mod.rs
@@ -62,6 +62,25 @@ impl TpLeaderModel {
        }
    }
    /// Image-bearing forward on rank 0. Only the vision-capable
    /// `qwen3_5` arch supports it; the dense `qwen3` arch has no tower.
    pub fn forward_with_images(
        &mut self,
        input: &candle_core::Tensor,
        offset: usize,
        image_pixels: &[candle_core::Tensor],
        image_token_id: u32,
    ) -> candle_core::Result<candle_core::Tensor> {
        match self {
            TpLeaderModel::Qwen3_5(m) => {
                m.forward_with_images(input, offset, image_pixels, image_token_id)
            }
            TpLeaderModel::Qwen3(_) => {
                candle_core::bail!("forward_with_images: qwen3 (dense) has no vision tower")
            }
        }
    }
    pub fn clear_kv_cache(&mut self) {
        match self {
            TpLeaderModel::Qwen3(m) => m.clear_kv_cache(),
@@ -687,6 +706,129 @@ impl WorkerPool {
        }
    }
    /// Image-bearing variant of [`Self::generate_step`] for the
    /// single-shot vision prefill. Identical fan-out / leader-forward /
    /// drain shape, but every rank runs the encode + splice path:
    ///
    /// - subprocess workers get `GenerateStepWithImages` (carrying the
    ///   source `image_data_uris`); each preprocesses + encodes through
    ///   its replicated tower and splices locally;
    /// - the leader runs the same encode + splice + forward on its
    ///   device worker thread via `tp_forward_logits_with_images`.
    ///
    /// The row-parallel `AllReduce`s synchronise the ranks exactly as in
    /// the text path. Because the tower is replicated and the preprocess
    /// is deterministic, every rank's spliced hidden state matches — no
    /// embedding broadcast. Only used for prefill; decode reuses
    /// `generate_step`.
    #[cfg(feature = "cuda")]
    pub async fn generate_step_with_images(
        &mut self,
        model_id: &str,
        leader_handle: super::device_worker::TpHandle,
        tokens: Vec<u32>,
        offset: usize,
        image_token_id: u32,
        image_data_uris: Vec<String>,
    ) -> Result<Vec<f32>> {
        let step_start = std::time::Instant::now();
        let tokens_len = tokens.len();
        tracing::debug!(
            model = %model_id,
            tokens = tokens_len,
            offset,
            images = image_data_uris.len(),
            "WorkerPool::generate_step_with_images: fan-out"
        );
        // 1. Fan-out the image-bearing prefill to subprocess workers.
        for w in &mut self.workers {
            w.send_only(&WorkerRequest::GenerateStepWithImages {
                model_id: model_id.to_string(),
                tokens: tokens.clone(),
                offset,
                image_token_id,
                image_data_uris: image_data_uris.clone(),
            })
            .await?;
        }
        // 2. Leader's image forward on its device worker thread. The
        //    AllReduce CustomOps block until every worker issues the
        //    matching collective; CPU-side logits keep the device tensor
        //    from escaping the worker thread.
        let leader_start = std::time::Instant::now();
        let leader_result = self
            .leader_worker
            .tp_forward_logits_with_images(
                leader_handle,
                tokens,
                offset,
                image_token_id,
                image_data_uris,
            )
            .await;
        let leader_ok = leader_result.is_ok();
        let leader_ms = leader_start.elapsed().as_millis();
        if !leader_ok {
            let detail = leader_result
                .as_ref()
                .err()
                .map(|e| format!("{e:#}"))
                .unwrap_or_default();
            tracing::warn!(
                model = %model_id,
                tokens = tokens_len,
                offset,
                leader_ms,
                error = %detail,
                "WorkerPool::generate_step_with_images: leader forward failed"
            );
        }
        // 3. ALWAYS drain worker responses, regardless of the leader's
        //    outcome, so stale GenerateStepOk replies don't poison the
        //    next request's recv (same invariant as generate_step).
        let worker_errors = drain_workers(&mut self.workers, |r| match r {
            WorkerResponse::GenerateStepOk => Ok(()),
            WorkerResponse::Error { kind, message } => Err(format!("[{kind}]: {message}")),
            other => Err(format!("expected GenerateStepOk, got {other:?}")),
        })
        .await;
        tracing::debug!(
            model = %model_id,
            leader_ms,
            leader_ok,
            errors = worker_errors.len(),
            total_ms = step_start.elapsed().as_millis(),
            "WorkerPool::generate_step_with_images: workers drained"
        );
        match leader_result {
            Ok(values) => {
                if worker_errors.is_empty() {
                    Ok(values)
                } else {
                    anyhow::bail!(
                        "GenerateStepWithImages: leader succeeded but workers failed: {}",
                        worker_errors.join("; ")
                    )
                }
            }
            Err(e) => {
                if worker_errors.is_empty() {
                    Err(anyhow::Error::new(e)
                        .context("GenerateStepWithImages: leader forward failed"))
                } else {
                    Err(anyhow::Error::new(e).context(format!(
                        "GenerateStepWithImages: leader forward failed and workers also failed: {}",
                        worker_errors.join("; ")
                    )))
                }
            }
        }
    }
    /// Reset the KV cache for `model_id` on every rank. Called at the
    /// start of every inference so a fresh request doesn't attend over
    /// the previous one's tokens.
--- a/crates/neuron/src/harness/tp/rpc.rs
+++ b/crates/neuron/src/harness/tp/rpc.rs
@@ -88,6 +88,29 @@ pub enum WorkerRequest {
        offset: usize,
    },
    /// Like `GenerateStep` but the prefill carries image content. Every
    /// rank preprocesses the same `image_data_uris` through its
    /// *replicated* vision tower, splices the resulting patch embeddings
    /// at `image_token_id` positions, and runs the forward — the
    /// row-parallel `AllReduce`s still synchronise every rank. Because
    /// the tower is replicated and `preprocess_data_uri` is
    /// deterministic, the spliced hidden state is identical on every
    /// rank, so no embedding broadcast is needed. Sent only for the
    /// (single-shot) image-bearing prefill; decode steps use plain
    /// `GenerateStep`. Worker replies with the same `GenerateStepOk`.
    GenerateStepWithImages {
        model_id: String,
        tokens: Vec<u32>,
        offset: usize,
        /// `<|image_pad|>` sentinel id (248056 for Qwen3.6); splice
        /// target in the expanded token stream.
        image_token_id: u32,
        /// Source image data URIs (`data:image/...;base64,...`), one per
        /// image in prompt order. Each rank decodes + preprocesses these
        /// identically; tens of KB each, so cheap over the stdin pipe.
        image_data_uris: Vec<String>,
    },
    /// Reset the KV cache for this model on this rank. Sent at the
    /// start of every inference so a fresh request doesn't accidentally
    /// attend over the previous one's tokens.
@@ -191,6 +214,32 @@ mod tests {
        assert_eq!(wire, r#"{"op":"init","comm_id":"deadbeef"}"#);
    }
    #[test]
    fn request_generate_step_with_images_round_trip() {
        let req = WorkerRequest::GenerateStepWithImages {
            model_id: "Qwen/Qwen3.6-27B".into(),
            tokens: vec![1, 2, 248056, 3],
            offset: 0,
            image_token_id: 248056,
            image_data_uris: vec!["data:image/png;base64,AAA=".into()],
        };
        let wire = serde_json::to_string(&req).unwrap();
        assert!(wire.contains(r#""op":"generate_step_with_images""#));
        match roundtrip(&req) {
            WorkerRequest::GenerateStepWithImages {
                tokens,
                image_token_id,
                image_data_uris,
                ..
            } => {
                assert_eq!(tokens, vec![1, 2, 248056, 3]);
                assert_eq!(image_token_id, 248056);
                assert_eq!(image_data_uris.len(), 1);
            }
            other => panic!("expected GenerateStepWithImages, got {other:?}"),
        }
    }
    #[test]
    fn request_shutdown_round_trip() {
        assert_eq!(
--- a/crates/neuron/src/harness/tp/tp_qwen3_5.rs
+++ b/crates/neuron/src/harness/tp/tp_qwen3_5.rs
@@ -1192,6 +1192,38 @@ impl TpQwen3_5ForCausalLM {
        hidden.i((.., l - 1.., ..))?.apply(&self.lm_head)
    }
    /// End-to-end image prefill on one rank: encode each preprocessed
    /// `(C, H, W)` pixel tensor through this rank's replicated tower,
    /// concatenate the per-image embeddings along the patch axis, and
    /// forward with the splice. Shared by the leader (`TpLeaderModel`)
    /// and the subprocess worker (`WorkerModel`) so every rank runs the
    /// identical encode → splice → forward and keeps the replicated
    /// hidden state in lockstep. Returns last-position logits
    /// `(B, 1, vocab)`, same contract as `forward`.
    pub fn forward_with_images(
        &mut self,
        input: &Tensor,
        offset: usize,
        image_pixels: &[Tensor],
        image_token_id: u32,
    ) -> candle_core::Result<Tensor> {
        if image_pixels.is_empty() {
            candle_core::bail!("forward_with_images: called with zero images");
        }
        // Encode each image (immutable borrows of the tower) before the
        // mutable forward below; the borrows end as each owned embedding
        // is pushed.
        let mut per_image = Vec::with_capacity(image_pixels.len());
        for (idx, img) in image_pixels.iter().enumerate() {
            let embed = self
                .encode_image(img)
                .map_err(|e| candle_core::Error::Msg(format!("encode image[{idx}]: {e:#}")))?;
            per_image.push(embed);
        }
        let image_embeds = Tensor::cat(&per_image.iter().collect::<Vec<_>>(), 0)?;
        self.forward_with_vision(input, offset, &image_embeds, image_token_id)
    }
    pub fn clear_kv_cache(&mut self) {
        self.base.clear_kv_cache();
    }
--- a/crates/neuron/src/harness/tp/worker.rs
+++ b/crates/neuron/src/harness/tp/worker.rs
@@ -47,6 +47,28 @@ impl WorkerModel {
        }
    }
    /// Image-bearing forward on this rank. Only the vision-capable
    /// `qwen3_5` arch has a replicated tower; the dense `qwen3` arch
    /// errors. The returned logits are discarded by the caller (the
    /// leader samples from its own rank-0 copy) — the value is the NCCL
    /// collectives the forward issues.
    fn forward_with_images(
        &mut self,
        input: &candle_core::Tensor,
        offset: usize,
        image_pixels: &[candle_core::Tensor],
        image_token_id: u32,
    ) -> candle_core::Result<candle_core::Tensor> {
        match self {
            WorkerModel::Qwen3_5(m) => {
                m.forward_with_images(input, offset, image_pixels, image_token_id)
            }
            WorkerModel::Qwen3(_) => {
                candle_core::bail!("forward_with_images: qwen3 (dense) has no vision tower")
            }
        }
    }
    fn clear_kv_cache(&mut self) {
        match self {
            WorkerModel::Qwen3(m) => m.clear_kv_cache(),
@@ -167,6 +189,19 @@ impl WorkerState {
                tokens,
                offset,
            } => self.handle_generate_step(&model_id, tokens, offset),
            WorkerRequest::GenerateStepWithImages {
                model_id,
                tokens,
                offset,
                image_token_id,
                image_data_uris,
            } => self.handle_generate_step_with_images(
                &model_id,
                tokens,
                offset,
                image_token_id,
                image_data_uris,
            ),
            WorkerRequest::ClearKvCache { model_id } => self.handle_clear_kv_cache(&model_id),
            WorkerRequest::UnloadModel { model_id } => self.handle_unload_model(&model_id),
            WorkerRequest::Shutdown => WorkerResponse::Bye,
@@ -418,6 +453,124 @@ impl WorkerState {
        }
    }
    /// Image-bearing prefill on this rank. Preprocesses each source data
    /// URI through the same deterministic `preprocess_data_uri` the
    /// leader runs, encodes through this rank's replicated tower, and
    /// splices + forwards. The logits are discarded (the leader samples
    /// from rank 0); the row-parallel `AllReduce`s are the point.
    #[cfg(feature = "cuda")]
    fn handle_generate_step_with_images(
        &mut self,
        model_id: &str,
        tokens: Vec<u32>,
        offset: usize,
        image_token_id: u32,
        image_data_uris: Vec<String>,
    ) -> WorkerResponse {
        use crate::harness::preprocess::{PreprocessProfile, preprocess_data_uri};
        use candle_core::Tensor;
        if image_data_uris.is_empty() {
            return WorkerResponse::Error {
                kind: "bad_request".into(),
                message: "GenerateStepWithImages with zero images".into(),
            };
        }
        let Some(model) = self.models.get_mut(model_id) else {
            return WorkerResponse::Error {
                kind: "model_not_loaded".into(),
                message: format!("model '{model_id}' not loaded on rank {}", self.config.rank),
            };
        };
        let device = model.device().clone();
        // Preprocess each image identically to the leader so the encoded
        // embeddings — and thus the spliced hidden state — match across
        // ranks. Fixed 448×448 profile.
        let profile = PreprocessProfile::qwen3_6();
        let (h, w) = (
            profile.target_height as usize,
            profile.target_width as usize,
        );
        let mut pixels: Vec<Tensor> = Vec::with_capacity(image_data_uris.len());
        for (idx, uri) in image_data_uris.iter().enumerate() {
            let px = match preprocess_data_uri(uri, &profile) {
                Ok(p) => p,
                Err(e) => {
                    return WorkerResponse::Error {
                        kind: "bad_request".into(),
                        message: format!("preprocess image[{idx}]: {e:#}"),
                    };
                }
            };
            match Tensor::from_vec(px, (3, h, w), &device) {
                Ok(t) => pixels.push(t),
                Err(e) => {
                    return WorkerResponse::Error {
                        kind: "forward_failed".into(),
                        message: format!("build image[{idx}] tensor: {e}"),
                    };
                }
            }
        }
        let input = match Tensor::new(tokens.as_slice(), &device).and_then(|t| t.unsqueeze(0)) {
            Ok(t) => t,
            Err(e) => {
                return WorkerResponse::Error {
                    kind: "forward_failed".into(),
                    message: format!("build input tensor: {e}"),
                };
            }
        };
        let start = std::time::Instant::now();
        tracing::debug!(
            rank = self.config.rank,
            model = %model_id,
            tokens = tokens.len(),
            offset,
            images = pixels.len(),
            "worker GenerateStepWithImages: forward starting"
        );
        // Drop the logits — the leader samples from its own rank-0 copy.
        if let Err(e) = model.forward_with_images(&input, offset, &pixels, image_token_id) {
            tracing::warn!(
                rank = self.config.rank,
                model = %model_id,
                elapsed_ms = start.elapsed().as_millis(),
                error = %e,
                "worker GenerateStepWithImages: forward failed"
            );
            return WorkerResponse::Error {
                kind: "forward_failed".into(),
                message: format!("TP image forward: {e}"),
            };
        }
        tracing::debug!(
            rank = self.config.rank,
            model = %model_id,
            elapsed_ms = start.elapsed().as_millis(),
            "worker GenerateStepWithImages: forward done"
        );
        WorkerResponse::GenerateStepOk
    }
    #[cfg(not(feature = "cuda"))]
    fn handle_generate_step_with_images(
        &mut self,
        _model_id: &str,
        _tokens: Vec<u32>,
        _offset: usize,
        _image_token_id: u32,
        _image_data_uris: Vec<String>,
    ) -> WorkerResponse {
        WorkerResponse::Error {
            kind: "cuda_feature_not_enabled".into(),
            message: "GenerateStepWithImages requires --features cuda".into(),
        }
    }
    #[cfg(feature = "cuda")]
    fn handle_clear_kv_cache(&mut self, model_id: &str) -> WorkerResponse {
        let Some(model) = self.models.get_mut(model_id) else {