feat(neuron): dynamic-resolution images via Qwen smart_resize (#14)

Replace the fixed 448×448-square preprocess with native-aspect `smart_resize`, and thread the resulting per-image grid through the LM so spatial structure survives non-square images (documents, screenshots, charts, panoramas, OCR) instead of being squished into a square.

- preprocess.rs: port Qwen `smart_resize` (factor = patch×merge = 32; pixel budget [min,max], default 256²–1024² → 64–1024 LM tokens). `PreprocessProfile` replaces the fixed target dims with `factor`/`min_pixels`/`max_pixels`; `preprocess`/`preprocess_data_uri` now return the resized `(h, w)`; add `resized_dims_for_uri` (decode + resize, no normalize) for the TP leader's token count.
- rope.rs: `compute_mrope_index`/`get_rope_index` take per-image `grids: &[(lm_gh, lm_gw)]` instead of assuming a square `isqrt(run)`. Walk image runs in order, validate `run == gh*gw`, emit row-major positions, resume the shared counter at `base + max(gh,gw)`. Correct for multiple images of differing grids interleaved with text.
- candle.rs: `VisionMeta`/`LoadedModel`/`TpLoadedModel` carry the `image_grid_factor` (patch×merge) instead of the constant 196; all four prompt-build sites compute per-image counts from each image's resized grid (single-GPU from the extracted `ImageInput.h/w`, TP from `resized_dims_for_uri`). `ModelArch` gains `vision_grid_factor`.
- single-GPU (`mod.rs`, `dispatch.rs`) and TP (`tp_qwen3_5.rs::prefill_with_images_chunked`, `dispatch.rs`, `tp/worker.rs`) thread the grids into `get_rope_index`. Each TP rank recomputes grids from its own deterministic preprocess — no rpc.rs change, single source of truth.

The vision tower itself was already grid-general (recent pos-embed interpolation + 2D rotary fix). No patch-count cap: pos-embed is interpolated to any grid; `max_pixels` bounds cost (O(patches²) ViT attention + prefill) instead.

Tests: smart_resize (aspect/cap/floor/reject), `compute_mrope_index` non-square + two-image + mismatch cases, square-grid regression guard.
Non-cuda build + clippy + full workspace tests green; TP load/dispatch paths are cuda-gated → Gitea CUDA type-check.

Operator pixel-budget config + remaining doc cleanup follow in C5.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 22:47:27 +03:00
parent dc048ffcc9
commit c97a8654f5
8 changed files with 425 additions and 169 deletions
--- a/crates/neuron/src/harness/arch/qwen3_5/mod.rs
+++ b/crates/neuron/src/harness/arch/qwen3_5/mod.rs
@@ -404,7 +404,7 @@ impl Qwen3_5Model {
    }

    pub fn forward(&mut self, input: &Tensor, offset: usize) -> candle_core::Result<Tensor> {
-        self.forward_inner(input, offset, None, None)
+        self.forward_inner(input, offset, None, None, &[])
    }

    /// Forward with image-embedding splice. Stage B of the vision plan.
@@ -437,8 +437,15 @@ impl Qwen3_5Model {
        offset: usize,
        image_embeds: &Tensor,
        image_token_id: u32,
+        grids: &[(usize, usize)],
    ) -> candle_core::Result<Tensor> {
-        self.forward_inner(input_ids, offset, Some(image_embeds), Some(image_token_id))
+        self.forward_inner(
+            input_ids,
+            offset,
+            Some(image_embeds),
+            Some(image_token_id),
+            grids,
+        )
    }

    fn forward_inner(
@@ -447,6 +454,7 @@ impl Qwen3_5Model {
        offset: usize,
        image_embeds: Option<&Tensor>,
        image_token_id: Option<u32>,
+        grids: &[(usize, usize)],
    ) -> candle_core::Result<Tensor> {
        let (b, l) = input.dims2()?;
        let mut h = self.embed_tokens.forward(input)?;
@@ -483,7 +491,7 @@ impl Qwen3_5Model {
                h = splice_runs(&h, &img, &positions)?;
            }

-            let (text, height, width, delta) = rope::get_rope_index(&ids, tok_id)
+            let (text, height, width, delta) = rope::get_rope_index(&ids, tok_id, grids)
                .map_err(|e| candle_core::Error::Msg(format!("get_rope_index: {e}")))?;
            self.rope_delta = delta;
            let pos = rope::mrope_position_tensor(&text, &height, &width, &self.device)?;
@@ -603,11 +611,12 @@ impl Qwen3_5ForCausalLM {
        offset: usize,
        image_embeds: &Tensor,
        image_token_id: u32,
+        grids: &[(usize, usize)],
    ) -> candle_core::Result<Tensor> {
        let (_, l) = input.dims2()?;
-        let hidden = self
-            .base
-            .forward_with_vision(input, offset, image_embeds, image_token_id)?;
+        let hidden =
+            self.base
+                .forward_with_vision(input, offset, image_embeds, image_token_id, grids)?;
        hidden.i((.., l - 1.., ..))?.apply(&self.lm_head)
    }

--- a/crates/neuron/src/harness/arch/qwen3_5/rope.rs
+++ b/crates/neuron/src/harness/arch/qwen3_5/rope.rs
@@ -260,28 +260,40 @@ pub(crate) fn mrope_enabled() -> bool {
 /// off, returns plain sequential identity positions on all three axes
 /// (`mrope_cos_sin` then reduces exactly to plain RoPE), restoring the
 /// pre-M-RoPE behaviour without touching the rest of the forward.
-pub(crate) fn get_rope_index(input_ids: &[u32], image_token_id: u32) -> Result<MRopeIndex> {
+pub(crate) fn get_rope_index(
+    input_ids: &[u32],
+    image_token_id: u32,
+    grids: &[(usize, usize)],
+) -> Result<MRopeIndex> {
    if !mrope_enabled() {
        let seq: Vec<i64> = (0..input_ids.len() as i64).collect();
        return Ok((seq.clone(), seq.clone(), seq, 0));
    }
-    compute_mrope_index(input_ids, image_token_id)
+    compute_mrope_index(input_ids, image_token_id, grids)
 }

 /// The real interleaved-M-RoPE position-id computation (always active in
 /// unit tests; gated behind [`get_rope_index`] at runtime).
 ///
-/// Fixed-resolution assumption (Stage C): each image run is a perfect
-/// square with `grid_t = 1` (still image) and `grid_h = grid_w =
-/// isqrt(run_len)` — 196 → 14×14. Dynamic resolution (#14) would thread
-/// real per-image grids instead.
-pub(crate) fn compute_mrope_index(input_ids: &[u32], image_token_id: u32) -> Result<MRopeIndex> {
+/// `grids` carries the post-merge LM grid `(lm_gh, lm_gw)` for each image
+/// run, in prompt order — a run length alone cannot recover its
+/// factorisation, so the grids must be passed (#14 dynamic resolution).
+/// Each image is a still frame (`grid_t = 1`); its tokens get
+/// `[base, base + hh, base + ww]` row-major and the shared counter
+/// resumes at `base + max(lm_gh, lm_gw)`. Multi-image is correct because
+/// the counter threads across images and interleaved text.
+pub(crate) fn compute_mrope_index(
+    input_ids: &[u32],
+    image_token_id: u32,
+    grids: &[(usize, usize)],
+) -> Result<MRopeIndex> {
    let n = input_ids.len();
    let mut text = Vec::with_capacity(n);
    let mut height = Vec::with_capacity(n);
    let mut width = Vec::with_capacity(n);
    let mut counter: i64 = 0;
    let mut i = 0;
+    let mut k = 0; // index into `grids`, one per image run
    while i < n {
        if input_ids[i] == image_token_id {
            let start = i;
@@ -289,25 +301,30 @@ pub(crate) fn compute_mrope_index(input_ids: &[u32], image_token_id: u32) -> Res
                i += 1;
            }
            let run = i - start;
-            let g = run.isqrt();
-            if g * g != run {
+            let (grid_h, grid_w) = *grids.get(k).ok_or_else(|| {
+                anyhow::anyhow!(
+                    "get_rope_index: image run #{k} (len {run}) has no matching grid \
+                     ({} grids supplied)",
+                    grids.len()
+                )
+            })?;
+            k += 1;
+            if grid_h * grid_w != run {
                anyhow::bail!(
-                    "get_rope_index: image run length {run} is not a perfect square \
-                     (fixed-resolution Stage C assumes a square grid; dynamic resolution is #14)"
+                    "get_rope_index: image run #{} length {run} != grid {grid_h}×{grid_w} = {}",
+                    k - 1,
+                    grid_h * grid_w
                );
            }
-            let (grid_t, grid_h, grid_w) = (1usize, g, g);
            let base = counter;
-            for tt in 0..grid_t {
            for hh in 0..grid_h {
                for ww in 0..grid_w {
-                        text.push(base + tt as i64);
+                    text.push(base); // grid_t = 1 → temporal axis const
                    height.push(base + hh as i64);
                    width.push(base + ww as i64);
                }
            }
-            }
-            counter = base + grid_t.max(grid_h).max(grid_w) as i64;
+            counter = base + grid_h.max(grid_w) as i64;
        } else {
            text.push(counter);
            height.push(counter);
@@ -316,6 +333,12 @@ pub(crate) fn compute_mrope_index(input_ids: &[u32], image_token_id: u32) -> Res
            i += 1;
        }
    }
+    if k != grids.len() {
+        anyhow::bail!(
+            "get_rope_index: prompt has {k} image run(s) but {} grid(s) were supplied",
+            grids.len()
+        );
+    }
    let delta = counter - n as i64;
    Ok((text, height, width, delta))
 }
@@ -447,7 +470,7 @@ mod tests {

    #[test]
    fn get_rope_index_text_only_is_sequential() {
-        let (t, h, w, delta) = compute_mrope_index(&[1, 2, 3, 4], 99).unwrap();
+        let (t, h, w, delta) = compute_mrope_index(&[1, 2, 3, 4], 99, &[]).unwrap();
        assert_eq!(t, vec![0, 1, 2, 3]);
        assert_eq!(h, vec![0, 1, 2, 3]);
        assert_eq!(w, vec![0, 1, 2, 3]);
@@ -456,12 +479,12 @@ mod tests {

    #[test]
    fn get_rope_index_text_image_text() {
-        // [text, image(2x2 run of 4), text]. image_token = 99.
+        // [text, image(2x2 run of 4), text]. image_token = 99, grid (2,2).
        let ids = [1u32, 99, 99, 99, 99, 2];
-        let (t, h, w, delta) = compute_mrope_index(&ids, 99).unwrap();
-        // token 0: text → 0. image base=1, grid 1x2x2:
+        let (t, h, w, delta) = compute_mrope_index(&ids, 99, &[(2, 2)]).unwrap();
+        // token 0: text → 0. image base=1, grid 2x2:
        //   t all = 1; h = base+row = [1,1,2,2]; w = base+col = [1,2,1,2].
-        // resume from base + max(1,2,2) = 3. trailing text → 3.
+        // resume from base + max(2,2) = 3. trailing text → 3.
        assert_eq!(t, vec![0, 1, 1, 1, 1, 3]);
        assert_eq!(h, vec![0, 1, 1, 2, 2, 3]);
        assert_eq!(w, vec![0, 1, 2, 1, 2, 3]);
@@ -472,25 +495,52 @@ mod tests {
        assert_eq!(6 + delta, 4);
    }

+    #[test]
+    fn get_rope_index_nonsquare_single_image() {
+        // text + image(2 rows × 3 cols = 6 tokens). grid (2,3).
+        let ids = [1u32, 99, 99, 99, 99, 99, 99];
+        let (t, h, w, delta) = compute_mrope_index(&ids, 99, &[(2, 3)]).unwrap();
+        // base = 1; row-major h = [0,0,0,1,1,1]+1, w = [0,1,2,0,1,2]+1.
+        assert_eq!(t, vec![0, 1, 1, 1, 1, 1, 1]);
+        assert_eq!(h, vec![0, 1, 1, 1, 2, 2, 2]);
+        assert_eq!(w, vec![0, 1, 2, 3, 1, 2, 3]);
+        // resume from base + max(2,3) = 4; seq_len 7, counter 4 → delta -3.
+        assert_eq!(delta, 4 - 7);
+    }
+
+    #[test]
+    fn get_rope_index_two_images_different_grids() {
+        // img(2x2)=4, text, img(1x3)=3. grids [(2,2),(1,3)].
+        let ids = [99, 99, 99, 99, 7, 99, 99, 99];
+        let (t, h, w, delta) = compute_mrope_index(&ids, 99, &[(2, 2), (1, 3)]).unwrap();
+        // img1 base=0 → t=0, h=[0,0,1,1], w=[0,1,0,1]; resume max(2,2)=2.
+        // text at counter 2. img2 base=3 → t=3, h=[3,3,3], w=[3,4,5];
+        // resume 3+max(1,3)=6.
+        assert_eq!(t, vec![0, 0, 0, 0, 2, 3, 3, 3]);
+        assert_eq!(h, vec![0, 0, 1, 1, 2, 3, 3, 3]);
+        assert_eq!(w, vec![0, 1, 0, 1, 2, 3, 4, 5]);
+        assert_eq!(delta, 6 - 8);
+    }
+
    #[test]
    fn get_rope_index_on_by_default() {
        // With NEURON_MROPE unset (default ON), the runtime path returns
-        // the real interleaved-M-RoPE positions, so image tokens carry
-        // their 2D grid coords (height differs from the text counter).
-        // (NEURON_MROPE=0 would fall back to identity; not asserted here
-        // since it depends on env.)
-        let (t, h, w, _delta) = get_rope_index(&[1, 99, 99, 99, 99, 2], 99).unwrap();
-        // Same as compute_mrope_index: 2x2 image after one text token.
+        // the real interleaved-M-RoPE positions. (NEURON_MROPE=0 would fall
+        // back to identity; not asserted here since it depends on env.)
+        let (t, h, w, _delta) = get_rope_index(&[1, 99, 99, 99, 99, 2], 99, &[(2, 2)]).unwrap();
        assert_eq!(t, vec![0, 1, 1, 1, 1, 3]);
        assert_eq!(h, vec![0, 1, 1, 2, 2, 3]);
        assert_eq!(w, vec![0, 1, 2, 1, 2, 3]);
    }

    #[test]
-    fn get_rope_index_rejects_non_square_image_run() {
-        // 196 is square (14x14) — ok. 195 is not.
-        assert!(compute_mrope_index(&[99u32; 196], 99).is_ok());
-        assert!(compute_mrope_index(&[99u32; 195], 99).is_err());
+    fn get_rope_index_grid_mismatches_error() {
+        // run length != grid product.
+        assert!(compute_mrope_index(&[99u32; 6], 99, &[(2, 2)]).is_err());
+        // too few grids for the number of image runs.
+        assert!(compute_mrope_index(&[99, 99, 7, 99], 99, &[(1, 2)]).is_err());
+        // too many grids.
+        assert!(compute_mrope_index(&[99, 99], 99, &[(1, 2), (1, 1)]).is_err());
    }

    #[test]
@@ -501,7 +551,7 @@ mod tests {
        let dev = Device::Cpu;
        let rope = RotaryEmbedding::new(DType::F32, &qwen36_cfg(), &dev).unwrap();
        let ids = [1u32, 99, 99, 99, 99]; // text + 2x2 image
-        let (t, h, w, _d) = compute_mrope_index(&ids, 99).unwrap();
+        let (t, h, w, _d) = compute_mrope_index(&ids, 99, &[(2, 2)]).unwrap();
        let pos = mrope_position_tensor(&t, &h, &w, &dev).unwrap();
        assert_eq!(pos.dims(), &[3, 5]);
        let (cos, _sin) = rope.mrope_cos_sin(&pos).unwrap();
@@ -518,7 +568,7 @@ mod tests {
    fn get_rope_index_196_is_14x14() {
        let mut ids = vec![1u32]; // one text token
        ids.extend(std::iter::repeat_n(99u32, 196));
-        let (t, h, w, _delta) = compute_mrope_index(&ids, 99).unwrap();
+        let (t, h, w, _delta) = compute_mrope_index(&ids, 99, &[(14, 14)]).unwrap();
        // image base = 1. Last image token (index 196) is grid (h=13,w=13).
        assert_eq!(*t.last().unwrap(), 1, "grid_t=1 → temporal const at base");
        assert_eq!(h[1], 1, "first image row at base");
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -210,13 +210,11 @@ pub struct LoadedModel {
    /// targets and the worker forward uses it to locate splice
    /// positions in the LM input embeddings.
    pub image_token_id: Option<u32>,
-    /// LM-side tokens this model's vision tower emits per image at
-    /// the Stage B fixed resolution (448×448 → 196 for Qwen3.6).
-    /// `None` for text-only models. Set at load time so the
-    /// hot path doesn't recompute it per request. Stage B fixed
-    /// resolution → constant; dynamic resolution per #14 makes it
-    /// per-image.
-    pub lm_tokens_per_image: Option<usize>,
+    /// `patch_size × spatial_merge_size` — divides a resized pixel
+    /// dimension into LM-grid units. Per-image LM token count is
+    /// `(h/factor) × (w/factor)` (#14 dynamic resolution). `None` for
+    /// text-only models. Set at load time.
+    pub image_grid_factor: Option<usize>,
 }

 impl LoadedModel {
@@ -288,9 +286,9 @@ pub struct TpLoadedModel {
    pub has_vision: bool,
    /// `<|image_pad|>` token id — same as [`LoadedModel::image_token_id`].
    pub image_token_id: Option<u32>,
-    /// LM-side tokens per image at the fixed 448×448 resolution — same
-    /// as [`LoadedModel::lm_tokens_per_image`].
-    pub lm_tokens_per_image: Option<usize>,
+    /// Pixel→LM-grid divisor — same as
+    /// [`LoadedModel::image_grid_factor`].
+    pub image_grid_factor: Option<usize>,
 }

 #[cfg(feature = "cuda")]
@@ -394,10 +392,11 @@ impl ModelArch {
        offset: usize,
        image_embeds: &Tensor,
        image_token_id: u32,
+        grids: &[(usize, usize)],
    ) -> Result<Tensor> {
        let raw = match self {
            ModelArch::Qwen3_5Dense(m) => {
-                m.forward_with_vision(input, offset, image_embeds, image_token_id)?
+                m.forward_with_vision(input, offset, image_embeds, image_token_id, grids)?
            }
            other => anyhow::bail!(
                "forward_with_vision: architecture {} has no vision tower",
@@ -407,6 +406,20 @@ impl ModelArch {
        squeeze_to_vocab(&raw)
    }

+    /// `patch_size × spatial_merge_size` for the loaded vision tower —
+    /// divides a resized pixel dim into LM-grid units (an image of
+    /// resized `(h, w)` yields the LM grid `(h/factor, w/factor)`).
+    /// `None` for architectures/checkpoints without a vision tower.
+    pub fn vision_grid_factor(&self) -> Option<usize> {
+        match self {
+            ModelArch::Qwen3_5Dense(m) => m.vision().map(|v| {
+                let c = v.config();
+                c.patch_size * c.spatial_merge_size
+            }),
+            _ => None,
+        }
+    }
+
    /// Encode a preprocessed image into LM-side token embeddings via
    /// the loaded vision tower. Stage A5.
    ///
@@ -1683,10 +1696,10 @@ impl CandleHarness {
                    .ok_or_else(|| InferenceError::VisionUnsupported {
                        model_id: request.model.clone(),
                    })?;
-                let patches_per_image = loaded
-                    .lm_tokens_per_image
-                    .ok_or_else(|| InferenceError::VisionUnsupported {
+                let factor = loaded.image_grid_factor.ok_or_else(|| {
+                    InferenceError::VisionUnsupported {
                        model_id: request.model.clone(),
+                    }
                })?;
                let profile = super::preprocess::PreprocessProfile::qwen3_6();
                let images = extract_images_from_request(&request, &profile).map_err(|e| {
@@ -1699,7 +1712,12 @@ impl CandleHarness {
                        "request has image content but extractor produced zero images"
                    )));
                }
-                let per_image_counts: Vec<usize> = vec![patches_per_image; images.len()];
+                // Per-image LM token count from each image's resized grid
+                // (#14 dynamic resolution; was a constant 196).
+                let per_image_counts: Vec<usize> = images
+                    .iter()
+                    .map(|im| (im.h / factor) * (im.w / factor))
+                    .collect();
                prompt_tokens =
                    expand_image_pad_tokens(&prompt_tokens, image_token_id, &per_image_counts)
                        .map_err(InferenceError::Other)?;
@@ -2059,10 +2077,11 @@ impl CandleHarness {
                        .ok_or_else(|| InferenceError::VisionUnsupported {
                            model_id: request.model.clone(),
                        })?;
-                let patches_per_image = loaded.lm_tokens_per_image.ok_or_else(|| {
-                    InferenceError::VisionUnsupported {
+                let factor =
+                    loaded
+                        .image_grid_factor
+                        .ok_or_else(|| InferenceError::VisionUnsupported {
                            model_id: request.model.clone(),
-                    }
                        })?;
                let profile = super::preprocess::PreprocessProfile::qwen3_6();
                let images = extract_images_from_request(&request, &profile)
@@ -2072,7 +2091,11 @@ impl CandleHarness {
                        "request has image content but extractor produced zero images"
                    )));
                }
-                let per_image_counts: Vec<usize> = vec![patches_per_image; images.len()];
+                // Per-image LM token count from each image's resized grid (#14).
+                let per_image_counts: Vec<usize> = images
+                    .iter()
+                    .map(|im| (im.h / factor) * (im.w / factor))
+                    .collect();
                prompt_tokens =
                    expand_image_pad_tokens(&prompt_tokens, image_token_id, &per_image_counts)
                        .map_err(InferenceError::Other)?;
@@ -2526,7 +2549,7 @@ impl Harness for CandleHarness {
            chat_template,
            has_vision: vision_meta.has_vision,
            image_token_id: vision_meta.image_token_id,
-            lm_tokens_per_image: vision_meta.lm_tokens_per_image,
+            image_grid_factor: vision_meta.image_grid_factor,
        });

        let mut models = self.models.write().await;
@@ -2742,7 +2765,7 @@ impl CandleHarness {
            tracing::info!(
                model = %spec.model_id,
                image_token_id = ?vision_meta.image_token_id,
-                lm_tokens_per_image = ?vision_meta.lm_tokens_per_image,
+                image_grid_factor = ?vision_meta.image_grid_factor,
                "TP load: vision tower present, advertising vision capability"
            );
        }
@@ -2764,7 +2787,7 @@ impl CandleHarness {
            chat_template,
            has_vision: vision_meta.has_vision,
            image_token_id: vision_meta.image_token_id,
-            lm_tokens_per_image: vision_meta.lm_tokens_per_image,
+            image_grid_factor: vision_meta.image_grid_factor,
        });

        let mut models = self.models.write().await;
@@ -2938,8 +2961,8 @@ impl CandleHarness {
                    .ok_or_else(|| InferenceError::VisionUnsupported {
                        model_id: request.model.clone(),
                    })?;
-            let patches_per_image =
-                tp.lm_tokens_per_image
+            let factor = tp
+                .image_grid_factor
                .ok_or_else(|| InferenceError::VisionUnsupported {
                    model_id: request.model.clone(),
                })?;
@@ -2949,7 +2972,21 @@ impl CandleHarness {
                    "request has image content but extractor produced zero data URIs"
                )));
            }
-            let per_image_counts: Vec<usize> = vec![patches_per_image; data_uris.len()];
+            // Per-image LM token count from each image's resized grid (#14).
+            // Decode header + smart_resize only; the workers re-derive the
+            // same dims when they preprocess for the replicated tower.
+            let profile = super::preprocess::PreprocessProfile::qwen3_6();
+            let per_image_counts: Vec<usize> = data_uris
+                .iter()
+                .enumerate()
+                .map(|(i, uri)| {
+                    let (h, w) =
+                        super::preprocess::resized_dims_for_uri(uri, &profile).map_err(|e| {
+                            InferenceError::Other(anyhow::anyhow!("resized_dims image #{i}: {e}"))
+                        })?;
+                    Ok::<usize, InferenceError>((h as usize / factor) * (w as usize / factor))
+                })
+                .collect::<Result<Vec<_>, _>>()?;
            prompt_tokens =
                expand_image_pad_tokens(&prompt_tokens, image_token_id, &per_image_counts)
                    .map_err(InferenceError::Other)?;
@@ -3457,8 +3494,8 @@ async fn chat_completion_tp_inner(
                .ok_or_else(|| InferenceError::VisionUnsupported {
                    model_id: request.model.clone(),
                })?;
-        let patches_per_image =
-            tp.lm_tokens_per_image
+        let factor = tp
+            .image_grid_factor
            .ok_or_else(|| InferenceError::VisionUnsupported {
                model_id: request.model.clone(),
            })?;
@@ -3468,7 +3505,19 @@ async fn chat_completion_tp_inner(
                "request has image content but extractor produced zero data URIs"
            )));
        }
-        let per_image_counts: Vec<usize> = vec![patches_per_image; data_uris.len()];
+        // Per-image LM token count from each image's resized grid (#14).
+        let profile = super::preprocess::PreprocessProfile::qwen3_6();
+        let per_image_counts: Vec<usize> = data_uris
+            .iter()
+            .enumerate()
+            .map(|(i, uri)| {
+                let (h, w) =
+                    super::preprocess::resized_dims_for_uri(uri, &profile).map_err(|e| {
+                        InferenceError::Other(anyhow::anyhow!("resized_dims image #{i}: {e}"))
+                    })?;
+                Ok::<usize, InferenceError>((h as usize / factor) * (w as usize / factor))
+            })
+            .collect::<Result<Vec<_>, _>>()?;
        prompt_tokens = expand_image_pad_tokens(&prompt_tokens, image_token_id, &per_image_counts)
            .map_err(InferenceError::Other)?;
        Some((data_uris, image_token_id))
@@ -3917,10 +3966,12 @@ fn build_prompt_for_request(
 struct VisionMeta {
    has_vision: bool,
    image_token_id: Option<u32>,
-    /// LM-side tokens this model's vision tower emits per image at
-    /// the Stage B fixed `PreprocessProfile::qwen3_6()` resolution
-    /// (448×448). Equal to `(H/patch_size/spatial_merge_size)²`.
-    lm_tokens_per_image: Option<usize>,
+    /// `patch_size × spatial_merge_size` — the divisor that turns a
+    /// resized pixel dimension into an LM-grid dimension. An image of
+    /// resized `(h, w)` emits `(h/factor) × (w/factor)` LM tokens (#14
+    /// dynamic resolution; was a constant 196 at the old fixed 448²).
+    /// `None` for text-only models.
+    image_grid_factor: Option<usize>,
 }

 impl VisionMeta {
@@ -3949,22 +4000,18 @@ impl VisionMeta {
            .get("image_token_id")
            .and_then(|x| x.as_u64())
            .map(|n| n as u32);
-        // Compute LM tokens per image at the Stage B fixed resolution
-        // (PreprocessProfile::qwen3_6() → 448×448). One LM token per
-        // spatial-merge group of patches.
-        let target_h = super::preprocess::PreprocessProfile::qwen3_6().target_height as usize;
-        let target_w = super::preprocess::PreprocessProfile::qwen3_6().target_width as usize;
-        let lm_tokens_per_image = if patch_size > 0 && spatial_merge_size > 0 {
-            let gh = target_h / patch_size / spatial_merge_size;
-            let gw = target_w / patch_size / spatial_merge_size;
-            Some(gh * gw)
+        // The pixel→LM-grid divisor. An image resized to (h, w) emits
+        // (h/factor) × (w/factor) LM tokens — computed per image at
+        // request time now that resolution is dynamic (#14).
+        let image_grid_factor = if patch_size > 0 && spatial_merge_size > 0 {
+            Some(patch_size * spatial_merge_size)
        } else {
            None
        };
        Self {
            has_vision: true,
            image_token_id,
-            lm_tokens_per_image,
+            image_grid_factor,
        }
    }
 }
@@ -4011,13 +4058,13 @@ fn extract_images_from_request(
                    .and_then(|v| v.get("url"))
                    .and_then(|v| v.as_str())
                    .ok_or_else(|| anyhow::anyhow!("image_url part missing url field"))?;
-                let pixels = super::preprocess::preprocess_data_uri(url, profile)
+                let (pixels, h, w) = super::preprocess::preprocess_data_uri(url, profile)
                    .with_context(|| format!("preprocess image #{}", out.len()))?;
                out.push(super::device_worker::jobs::ImageInput {
                    pixels,
                    c: 3,
-                    h: profile.target_height as usize,
-                    w: profile.target_width as usize,
+                    h: h as usize,
+                    w: w as usize,
                });
            }
        }
--- a/crates/neuron/src/harness/device_worker/dispatch.rs
+++ b/crates/neuron/src/harness/device_worker/dispatch.rs
@@ -779,19 +779,17 @@ fn tp_forward_logits_with_images(
        anyhow::bail!("TpForwardLogitsWithImages dispatched with zero images");
    }

-    // Preprocess every image into a device-resident (C, H, W) tensor.
-    // Same fixed-resolution profile + decode path the subprocess workers
-    // run, so the encoded embeddings match across ranks bit-for-bit.
+    // Preprocess every image into a device-resident (C, H, W) tensor at
+    // its native-aspect resized dims (#14). Same `smart_resize` + decode
+    // path the subprocess workers run, so the encoded embeddings — and
+    // the per-image grids derived from these dims — match across ranks
+    // bit-for-bit.
    let profile = PreprocessProfile::qwen3_6();
-    let (h, w) = (
-        profile.target_height as usize,
-        profile.target_width as usize,
-    );
    let mut pixels: Vec<Tensor> = Vec::with_capacity(image_data_uris.len());
    for (idx, uri) in image_data_uris.iter().enumerate() {
-        let px = preprocess_data_uri(uri, &profile)
+        let (px, h, w) = preprocess_data_uri(uri, &profile)
            .with_context(|| format!("preprocess image[{idx}] (TP leader)"))?;
-        let t = Tensor::from_vec(px, (3, h, w), &state.device)?;
+        let t = Tensor::from_vec(px, (3, h as usize, w as usize), &state.device)?;
        pixels.push(t);
    }

@@ -877,9 +875,17 @@ fn forward_logits_with_images(
        anyhow::anyhow!("ForwardLogitsWithImages: no model for handle {}", handle.0)
    })?;

+    // pixel→LM-grid divisor (patch×merge) for this tower; each image's
+    // LM grid is (h/factor, w/factor) (#14 dynamic resolution).
+    let factor = arch.vision_grid_factor().ok_or_else(|| {
+        anyhow::anyhow!("ForwardLogitsWithImages: loaded model has no vision tower")
+    })?;
+
    // Encode every image on the worker's device, collecting per-image
-    // post-merger embeddings as device-resident tensors.
+    // post-merger embeddings as device-resident tensors plus their LM
+    // grids (for the interleaved-M-RoPE position ids).
    let mut per_image: Vec<Tensor> = Vec::with_capacity(images.len());
+    let mut grids: Vec<(usize, usize)> = Vec::with_capacity(images.len());
    for (idx, img) in images.into_iter().enumerate() {
        anyhow::ensure!(
            img.pixels.len() == img.c * img.h * img.w,
@@ -889,6 +895,7 @@ fn forward_logits_with_images(
            img.h,
            img.w,
        );
+        grids.push((img.h / factor, img.w / factor));
        let image = Tensor::from_vec(img.pixels, (img.c, img.h, img.w), &state.device)?;
        let embed = arch
            .encode_image(&image)
@@ -901,7 +908,7 @@ fn forward_logits_with_images(
    let image_embeds = Tensor::cat(&per_image.iter().collect::<Vec<_>>(), 0)?;

    let input = Tensor::new(tokens, &state.device)?.unsqueeze(0)?;
-    let logits = arch.forward_with_vision(&input, offset, &image_embeds, image_token_id)?;
+    let logits = arch.forward_with_vision(&input, offset, &image_embeds, image_token_id, &grids)?;
    let values = logits
        .to_dtype(DType::F32)?
        .flatten_all()?
--- a/crates/neuron/src/harness/device_worker/jobs.rs
+++ b/crates/neuron/src/harness/device_worker/jobs.rs
@@ -36,8 +36,13 @@ pub struct TpHandle(pub u64);
 /// `Clone` so the vision-aware dispatch in `chat_completion` can
 /// match `&vision_route` (carrying borrowed images) and still hand
 /// owned `Vec<ImageInput>` to the worker job. The clone cost is one
-/// pixel-buffer memcpy per image — fine at fixed-resolution sizes
-/// (3 × 448 × 448 × 4 bytes = ~2.4 MiB per image).
+/// pixel-buffer memcpy per image — now variable with dynamic resolution
+/// (#14): `3 × h × w × 4` bytes (f32 pixels), up to ~12 MiB at the
+/// default 1024² `max_pixels` budget.
+///
+/// `h`/`w` are the **resized** dims (factor-aligned), so the per-image LM
+/// grid is `(h/factor, w/factor)` — derived downstream for the splice
+/// and the interleaved-M-RoPE position ids.
 #[derive(Clone)]
 pub struct ImageInput {
    pub pixels: Vec<f32>,
--- a/crates/neuron/src/harness/preprocess.rs
+++ b/crates/neuron/src/harness/preprocess.rs
@@ -2,11 +2,11 @@
 //!
 //! Decodes `data:image/...;base64,...` URIs from OpenAI-style
 //! `image_url` content parts into the patch tensors a candle vision
-//! tower expects. Stage A ships **fixed resolution** — every image
-//! is resized to the same target dimensions (default 448×448 for
-//! Qwen3.6, configurable per-call) so the patch count is constant
-//! per image. Variable resolution per [Qwen2VL convention] is tracked
-//! as issue #14.
+//! tower expects. Resolution is **dynamic** (#14): each image is
+//! resized to its native aspect via Qwen `smart_resize` — a
+//! factor-aligned `(h, w)` whose pixel count lands in the profile's
+//! `[min_pixels, max_pixels]` budget — so the LM token count varies per
+//! image (`(h/factor) × (w/factor)`).
 //!
 //! Spec reference: `doc/vision-qwen3_6-spec.md` — preprocessor
 //! section.
@@ -21,7 +21,7 @@
 //! Pipeline (per image):
 //!   1. data: URI → base64 decode → bytes
 //!   2. bytes → image::DynamicImage (PNG/JPEG/WebP/etc)
-//!   3. resize_exact to target H×W (pixel space)
+//!   3. smart_resize to a native-aspect, factor-aligned H×W (pixel space)
 //!   4. RGB→f32, normalise per mean/std
 //!   5. layout to (C, H, W) tensor
 //!
@@ -34,39 +34,93 @@ use base64::Engine;
 use image::DynamicImage;
 use image::imageops::FilterType;

-/// Preprocessing target. Captures the resize dimensions and the
-/// channel-wise normalisation constants from the model's
-/// `preprocessor_config.json`. Stage A ships a single `qwen3_6()`
-/// constructor for fixed-resolution Qwen3.6 preprocessing; other
-/// models can ship their own profile when added.
+/// Preprocessing target. Captures the resize policy (Qwen `smart_resize`
+/// factor + pixel budget) and the channel-wise normalisation constants
+/// from the model's `preprocessor_config.json`. Images are resized to
+/// their **native aspect** — a factor-aligned `(h, w)` whose pixel count
+/// lands in `[min_pixels, max_pixels]` — not a fixed square (#14).
 #[derive(Debug, Clone)]
 pub struct PreprocessProfile {
-    pub target_height: u32,
-    pub target_width: u32,
+    /// Both output dims are multiples of this. For Qwen3.6 it is
+    /// `patch_size(16) × spatial_merge_size(2) = 32`, so the post-merge
+    /// LM grid is exactly `(h/factor, w/factor)`.
+    pub factor: u32,
+    /// Lower pixel bound — tiny images are upscaled to at least this.
+    pub min_pixels: u32,
+    /// Upper pixel bound — large images are downscaled to at most this.
+    /// Caps per-image LM tokens (`max_pixels / factor²`) and the
+    /// O(patches²) ViT attention cost.
+    pub max_pixels: u32,
    pub image_mean: [f32; 3],
    pub image_std: [f32; 3],
 }

 impl PreprocessProfile {
-    /// Stage A profile for Qwen3.6. Resize to 448×448, normalise to
-    /// `[-1, 1]` via mean=std=0.5. Fits within the model's
-    /// `num_position_embeddings=2304` budget at 28×28 = 784 patches
-    /// before merging.
+    /// Profile for Qwen3.6. Native-aspect `smart_resize` (factor 32),
+    /// normalise to `[-1, 1]` via mean=std=0.5. Pixel budget defaults:
+    /// `min = 256² = 65536` (→ 8×8 = 64 LM tokens) and
+    /// `max = 1024² = 1048576` (→ 32×32 = 1024 LM tokens) — generous for
+    /// documents/OCR, bounded for serving on 2×RTX5090. (Operator
+    /// override lands with the `[harness.candle.vision]` config in #14 C5.)
    pub fn qwen3_6() -> Self {
        Self {
-            target_height: 448,
-            target_width: 448,
+            factor: 32,
+            min_pixels: 65_536,
+            max_pixels: 1_048_576,
            image_mean: [0.5, 0.5, 0.5],
            image_std: [0.5, 0.5, 0.5],
        }
    }

-    /// Per-channel CHW tensor length: 3 * H * W.
-    pub fn pixels_chw(&self) -> usize {
-        3 * (self.target_height as usize) * (self.target_width as usize)
+    /// The factor-aligned `(h, w)` this profile would resize a source
+    /// `src_h × src_w` image to. Pure integer policy — no pixel work.
+    pub fn resized_dims(&self, src_h: u32, src_w: u32) -> Result<(u32, u32)> {
+        smart_resize(src_h, src_w, self.factor, self.min_pixels, self.max_pixels)
    }
 }

+/// Qwen `smart_resize`: a `factor`-aligned `(h_bar, w_bar)` close to the
+/// source aspect ratio whose pixel count is steered into `[min_pixels,
+/// max_pixels]`. Port of the canonical Qwen2-VL / Qwen3-VL image-processor
+/// function, including Python's ties-to-even `round()`, so neuron's grid
+/// matches what the model was trained on.
+///
+/// Returns `(height, width)`. Errors if the aspect ratio exceeds 200:1
+/// (degenerate input — a 1-pixel-tall strip), matching upstream.
+pub fn smart_resize(
+    height: u32,
+    width: u32,
+    factor: u32,
+    min_pixels: u32,
+    max_pixels: u32,
+) -> Result<(u32, u32)> {
+    let h = height.max(1) as f64;
+    let w = width.max(1) as f64;
+    let ratio = h.max(w) / h.min(w);
+    if ratio > 200.0 {
+        anyhow::bail!(
+            "image aspect ratio {ratio:.1}:1 exceeds the 200:1 limit ({height}×{width}); \
+             refusing to resize"
+        );
+    }
+    let f = factor as f64;
+    let (minp, maxp) = (min_pixels as f64, max_pixels as f64);
+    // Round to nearest factor multiple, ties-to-even like Python `round()`
+    // (may be 0 for sub-factor inputs; the min-pixels branch regrows it).
+    let mut h_bar = (h / f).round_ties_even() * f;
+    let mut w_bar = (w / f).round_ties_even() * f;
+    if h_bar * w_bar > maxp {
+        let beta = (h * w / maxp).sqrt();
+        h_bar = f.max((h / beta / f).floor() * f);
+        w_bar = f.max((w / beta / f).floor() * f);
+    } else if h_bar * w_bar < minp {
+        let beta = (minp / (h * w)).sqrt();
+        h_bar = (h * beta / f).ceil() * f;
+        w_bar = (w * beta / f).ceil() * f;
+    }
+    Ok((h_bar as u32, w_bar as u32))
+}
+
 /// Decode a `data:image/...;base64,...` URI into an in-memory image.
 ///
 /// Accepts the OpenAI Chat Completions `image_url` shape — a string
@@ -106,16 +160,13 @@ pub fn decode_data_uri(uri: &str) -> Result<DynamicImage> {
 /// faster on CPU. Quality difference is marginal for downstream
 /// vision-encoder consumption. The numerical-validation issue (#15)
 /// will quantify any discrepancy.
-pub fn preprocess(img: &DynamicImage, profile: &PreprocessProfile) -> Vec<f32> {
+pub fn preprocess(img: &DynamicImage, profile: &PreprocessProfile) -> Result<(Vec<f32>, u32, u32)> {
+    let (h_bar, w_bar) = profile.resized_dims(img.height(), img.width())?;
    let rgb = img
-        .resize_exact(
-            profile.target_width,
-            profile.target_height,
-            FilterType::Triangle,
-        )
+        .resize_exact(w_bar, h_bar, FilterType::Triangle)
        .to_rgb8();
-    let h = profile.target_height as usize;
-    let w = profile.target_width as usize;
+    let h = h_bar as usize;
+    let w = w_bar as usize;
    let mut out = vec![0.0_f32; 3 * h * w];
    // Row-major (C, H, W). Candle's Conv2d expects NCHW, so this is
    // the natural layout — the caller stacks `n` of these along the
@@ -131,16 +182,27 @@ pub fn preprocess(img: &DynamicImage, profile: &PreprocessProfile) -> Vec<f32> {
            }
        }
    }
-    out
+    Ok((out, h_bar, w_bar))
 }

-/// Combined helper: decode + preprocess in one call. Most call
-/// sites just want the final tensor; the two-step path exists for
-/// callers (tests, future video preprocessing) that need the
+/// Combined helper: decode + preprocess in one call. Returns the
+/// `(3, h, w)` row-major pixels plus the resized `(h, w)` — the caller
+/// needs the dims to build the tensor and to derive the LM token grid
+/// `(h/factor, w/factor)`. Most call sites use this; the two-step path
+/// exists for callers (tests, future video preprocessing) that need the
 /// intermediate `DynamicImage`.
-pub fn preprocess_data_uri(uri: &str, profile: &PreprocessProfile) -> Result<Vec<f32>> {
+pub fn preprocess_data_uri(uri: &str, profile: &PreprocessProfile) -> Result<(Vec<f32>, u32, u32)> {
    let img = decode_data_uri(uri)?;
-    Ok(preprocess(&img, profile))
+    preprocess(&img, profile)
+}
+
+/// Resized `(h, w)` for a data-URI image **without** running the pixel
+/// resize or normalisation — image decode + `smart_resize` dims only. Lets
+/// a caller that just needs the LM token count (e.g. the TP leader expanding
+/// the prompt) avoid materialising the full pixel tensor twice.
+pub fn resized_dims_for_uri(uri: &str, profile: &PreprocessProfile) -> Result<(u32, u32)> {
+    let img = decode_data_uri(uri)?;
+    profile.resized_dims(img.height(), img.width())
 }

 #[cfg(test)]
@@ -205,13 +267,17 @@ mod tests {
        // decoding so this test isolates the resize+normalise path.
        let img: ImageBuffer<Rgb<u8>, Vec<u8>> = ImageBuffer::from_pixel(2, 2, Rgb([255, 0, 0]));
        let dyn_img = DynamicImage::ImageRgb8(img);
-        let out = preprocess(&dyn_img, &profile);
+        let (out, h_bar, w_bar) = preprocess(&dyn_img, &profile).expect("preprocess");

-        assert_eq!(out.len(), profile.pixels_chw());
+        let h = h_bar as usize;
+        let w = w_bar as usize;
+        assert_eq!(out.len(), 3 * h * w);
+        // Dims are factor-aligned and at least the min-pixel floor.
+        assert_eq!(h_bar % profile.factor, 0);
+        assert_eq!(w_bar % profile.factor, 0);
+        assert!(h * w >= profile.min_pixels as usize);
        // After mean=0.5, std=0.5: red channel (255/255=1.0) → (1.0 - 0.5)/0.5 = 1.0
        // green/blue (0.0) → (0.0 - 0.5)/0.5 = -1.0
-        let h = profile.target_height as usize;
-        let w = profile.target_width as usize;
        assert!(
            (out[0] - 1.0).abs() < 1e-5,
            "R[0] should be 1.0, got {}",
@@ -229,9 +295,12 @@ mod tests {
    #[test]
    fn preprocess_data_uri_end_to_end() {
        let profile = PreprocessProfile::qwen3_6();
-        let out = preprocess_data_uri(&red_png_uri(), &profile).expect("e2e preprocess");
-        assert_eq!(out.len(), profile.pixels_chw());
+        let (out, h, w) = preprocess_data_uri(&red_png_uri(), &profile).expect("e2e preprocess");
+        assert_eq!(out.len(), 3 * h as usize * w as usize);
        assert!(out.iter().all(|v| v.is_finite()));
+        // resized_dims_for_uri agrees with the full preprocess.
+        let (h2, w2) = resized_dims_for_uri(&red_png_uri(), &profile).expect("dims");
+        assert_eq!((h, w), (h2, w2));
    }

    #[test]
@@ -240,10 +309,10 @@ mod tests {
        // 1x1 grayscale = 200 → after conversion to RGB, all three
        // channels equal 200, normalised → (200/255 - 0.5)/0.5 ≈ 0.569
        let gray = DynamicImage::ImageLuma8(ImageBuffer::from_pixel(1, 1, image::Luma([200])));
-        let out = preprocess(&gray, &profile);
+        let (out, h_bar, w_bar) = preprocess(&gray, &profile).expect("preprocess");
        let expected = ((200.0 / 255.0) - 0.5) / 0.5;
-        let h = profile.target_height as usize;
-        let w = profile.target_width as usize;
+        let h = h_bar as usize;
+        let w = w_bar as usize;
        for c in 0..3 {
            let v = out[c * h * w];
            assert!(
@@ -252,4 +321,52 @@ mod tests {
            );
        }
    }
+
+    #[test]
+    fn smart_resize_keeps_factor_aligned_square_in_budget() {
+        // 448×448 sits inside [65536, 1048576] and is factor-aligned →
+        // unchanged. (Regression guard for the old fixed-res sweet spot.)
+        let (h, w) = smart_resize(448, 448, 32, 65_536, 1_048_576).unwrap();
+        assert_eq!((h, w), (448, 448));
+    }
+
+    #[test]
+    fn smart_resize_preserves_aspect_and_caps_at_max() {
+        // 3000×4000 (landscape) → downscaled under max_pixels, aspect kept.
+        let (h, w) = smart_resize(3000, 4000, 32, 65_536, 1_048_576).unwrap();
+        assert_eq!(h % 32, 0);
+        assert_eq!(w % 32, 0);
+        assert!(
+            (h as u64) * (w as u64) <= 1_048_576,
+            "must respect max_pixels"
+        );
+        assert!(w > h, "landscape orientation preserved");
+        // aspect ≈ 4000/3000 = 1.333; allow a factor-rounding tolerance.
+        let ar = w as f64 / h as f64;
+        assert!((ar - 4.0 / 3.0).abs() < 0.15, "aspect ~4:3, got {ar:.3}");
+    }
+
+    #[test]
+    fn smart_resize_floors_tiny_image_at_min() {
+        // 16×16 → upscaled to at least min_pixels, factor-aligned.
+        let (h, w) = smart_resize(16, 16, 32, 65_536, 1_048_576).unwrap();
+        assert_eq!(h % 32, 0);
+        assert_eq!(w % 32, 0);
+        assert!((h as u64) * (w as u64) >= 65_536, "must respect min_pixels");
+    }
+
+    #[test]
+    fn smart_resize_tall_nonsquare_stays_nonsquare() {
+        // A tall screenshot keeps portrait orientation.
+        let (h, w) = smart_resize(2000, 500, 32, 65_536, 1_048_576).unwrap();
+        assert!(h > w, "portrait orientation preserved");
+        assert_eq!(h % 32, 0);
+        assert_eq!(w % 32, 0);
+    }
+
+    #[test]
+    fn smart_resize_rejects_extreme_aspect() {
+        let err = smart_resize(1, 500, 32, 65_536, 1_048_576).unwrap_err();
+        assert!(format!("{err:#}").contains("200:1"));
+    }
 }
--- a/crates/neuron/src/harness/tp/tp_qwen3_5.rs
+++ b/crates/neuron/src/harness/tp/tp_qwen3_5.rs
@@ -1288,15 +1288,39 @@ impl TpQwen3_5ForCausalLM {
        let device = self.device().clone();
        let image_embeds = self.encode_images_concat(image_pixels)?;

+        // Each image's LM grid (lm_gh, lm_gw) = (h/factor, w/factor),
+        // factor = patch×merge. Recomputed per rank from this rank's own
+        // pixel tensors — deterministic, so every rank's grids (and hence
+        // M-RoPE positions) match without crossing the RPC (#14).
+        let factor = self
+            .vision
+            .as_ref()
+            .map(|v| {
+                let c = v.config();
+                c.patch_size * c.spatial_merge_size
+            })
+            .ok_or_else(|| {
+                candle_core::Error::Msg(
+                    "prefill_with_images_chunked: loaded without a vision tower".into(),
+                )
+            })?;
+        let grids: Vec<(usize, usize)> = image_pixels
+            .iter()
+            .map(|t| {
+                let (_, h, w) = t.dims3()?;
+                Ok::<(usize, usize), candle_core::Error>((h / factor, w / factor))
+            })
+            .collect::<candle_core::Result<Vec<_>>>()?;
+
        // Interleaved-M-RoPE 3D position ids for the whole prompt,
        // computed once and sliced per chunk so every rank assigns image
-        // tokens their 14×14 grid coordinates (and text after the image
-        // resumes from the compressed counter). `rope_delta` is stored on
-        // the base model for the decode that follows this prefill. Every
-        // chunk — text or image — uses the M-RoPE slice, because the image
-        // shifts the positions of the text around it.
+        // tokens their grid coordinates (and text after an image resumes
+        // from the compressed counter). `rope_delta` is stored on the base
+        // model for the decode that follows this prefill. Every chunk —
+        // text or image — uses the M-RoPE slice, because each image shifts
+        // the positions of the text around it.
        let (text, height, width, delta) =
-            crate::harness::arch::qwen3_5::rope::get_rope_index(tokens, image_token_id)
+            crate::harness::arch::qwen3_5::rope::get_rope_index(tokens, image_token_id, &grids)
                .map_err(|e| candle_core::Error::Msg(format!("get_rope_index: {e}")))?;
        self.base.set_rope_delta(delta);
        let full_pos = crate::harness::arch::qwen3_5::rope::mrope_position_tensor(
--- a/crates/neuron/src/harness/tp/worker.rs
+++ b/crates/neuron/src/harness/tp/worker.rs
@@ -494,16 +494,13 @@ impl WorkerState {
        let device = model.device().clone();

        // Preprocess each image identically to the leader so the encoded
-        // embeddings — and thus the spliced hidden state — match across
-        // ranks. Fixed 448×448 profile.
+        // embeddings — and thus the spliced hidden state and per-image
+        // grids — match across ranks. Native-aspect `smart_resize` (#14);
+        // deterministic, so each rank derives the same dims.
        let profile = PreprocessProfile::qwen3_6();
-        let (h, w) = (
-            profile.target_height as usize,
-            profile.target_width as usize,
-        );
        let mut pixels: Vec<Tensor> = Vec::with_capacity(image_data_uris.len());
        for (idx, uri) in image_data_uris.iter().enumerate() {
-            let px = match preprocess_data_uri(uri, &profile) {
+            let (px, h, w) = match preprocess_data_uri(uri, &profile) {
                Ok(p) => p,
                Err(e) => {
                    return WorkerResponse::Error {
@@ -512,7 +509,7 @@ impl WorkerState {
                    };
                }
            };
-            match Tensor::from_vec(px, (3, h, w), &device) {
+            match Tensor::from_vec(px, (3, h as usize, w as usize), &device) {
                Ok(t) => pixels.push(t),
                Err(e) => {
                    return WorkerResponse::Error {