From c97a8654f505c2d118e011da833d7da3c27b49cd Mon Sep 17 00:00:00 2001
From: rob thijssen <grenade@rob.tn>
Date: Thu, 4 Jun 2026 22:47:27 +0300
Subject: [PATCH] feat(neuron): dynamic-resolution images via Qwen smart_resize
 (#14)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the fixed 448×448-square preprocess with native-aspect
`smart_resize`, and thread the resulting per-image grid through the LM
so spatial structure survives non-square images (documents, screenshots,
charts, panoramas, OCR) instead of being squished into a square.

- preprocess.rs: port Qwen `smart_resize` (factor = patch×merge = 32;
  pixel budget [min,max], default 256²–1024² → 64–1024 LM tokens).
  `PreprocessProfile` replaces the fixed target dims with `factor`/
  `min_pixels`/`max_pixels`; `preprocess`/`preprocess_data_uri` now return
  the resized `(h, w)`; add `resized_dims_for_uri` (decode + resize, no
  normalize) for the TP leader's token count.
- rope.rs: `compute_mrope_index`/`get_rope_index` take per-image
  `grids: &[(lm_gh, lm_gw)]` instead of assuming a square `isqrt(run)`.
  Walk image runs in order, validate `run == gh*gw`, emit row-major
  positions, resume the shared counter at `base + max(gh,gw)`. Correct
  for multiple images of differing grids interleaved with text.
- candle.rs: `VisionMeta`/`LoadedModel`/`TpLoadedModel` carry the
  `image_grid_factor` (patch×merge) instead of the constant 196; all four
  prompt-build sites compute per-image counts from each image's resized
  grid (single-GPU from the extracted `ImageInput.h/w`, TP from
  `resized_dims_for_uri`). `ModelArch` gains `vision_grid_factor`.
- single-GPU (`mod.rs`, `dispatch.rs`) and TP
  (`tp_qwen3_5.rs::prefill_with_images_chunked`, `dispatch.rs`,
  `tp/worker.rs`) thread the grids into `get_rope_index`. Each TP rank
  recomputes grids from its own deterministic preprocess — no rpc.rs
  change, single source of truth.

The vision tower itself was already grid-general (recent pos-embed
interpolation + 2D rotary fix). No patch-count cap: pos-embed is
interpolated to any grid; `max_pixels` bounds cost (O(patches²) ViT
attention + prefill) instead.

Tests: smart_resize (aspect/cap/floor/reject), `compute_mrope_index`
non-square + two-image + mismatch cases, square-grid regression guard.
Non-cuda build + clippy + full workspace tests are green; the TP
load/dispatch paths are cuda-gated, so they are left to the Gitea CUDA
type-check. Operator pixel-budget config + remaining doc cleanup follow
in C5.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 crates/neuron/src/harness/arch/qwen3_5/mod.rs |  21 +-
 .../neuron/src/harness/arch/qwen3_5/rope.rs   | 124 +++++++----
 crates/neuron/src/harness/candle.rs           | 155 ++++++++-----
 .../src/harness/device_worker/dispatch.rs     |  29 ++-
 .../neuron/src/harness/device_worker/jobs.rs  |   9 +-
 crates/neuron/src/harness/preprocess.rs       | 207 ++++++++++++++----
 crates/neuron/src/harness/tp/tp_qwen3_5.rs    |  36 ++-
 crates/neuron/src/harness/tp/worker.rs        |  13 +-
 8 files changed, 425 insertions(+), 169 deletions(-)
diff --git a/crates/neuron/src/harness/arch/qwen3_5/mod.rs b/crates/neuron/src/harness/arch/qwen3_5/mod.rs
index 95b5b64..ed2375c 100644
--- a/crates/neuron/src/harness/arch/qwen3_5/mod.rs
+++ b/crates/neuron/src/harness/arch/qwen3_5/mod.rs
@@ -404,7 +404,7 @@ impl Qwen3_5Model {
     }
 
     pub fn forward(&mut self, input: &Tensor, offset: usize) -> candle_core::Result<Tensor> {
-        self.forward_inner(input, offset, None, None)
+        self.forward_inner(input, offset, None, None, &[])
     }
 
     /// Forward with image-embedding splice. Stage B of the vision plan.
@@ -437,8 +437,15 @@ impl Qwen3_5Model {
         offset: usize,
         image_embeds: &Tensor,
         image_token_id: u32,
+        grids: &[(usize, usize)],
     ) -> candle_core::Result<Tensor> {
-        self.forward_inner(input_ids, offset, Some(image_embeds), Some(image_token_id))
+        self.forward_inner(
+            input_ids,
+            offset,
+            Some(image_embeds),
+            Some(image_token_id),
+            grids,
+        )
     }
 
     fn forward_inner(
@@ -447,6 +454,7 @@ impl Qwen3_5Model {
         offset: usize,
         image_embeds: Option<&Tensor>,
         image_token_id: Option<u32>,
+        grids: &[(usize, usize)],
     ) -> candle_core::Result<Tensor> {
         let (b, l) = input.dims2()?;
         let mut h = self.embed_tokens.forward(input)?;
@@ -483,7 +491,7 @@ impl Qwen3_5Model {
                 h = splice_runs(&h, &img, &positions)?;
             }
 
-            let (text, height, width, delta) = rope::get_rope_index(&ids, tok_id)
+            let (text, height, width, delta) = rope::get_rope_index(&ids, tok_id, grids)
                 .map_err(|e| candle_core::Error::Msg(format!("get_rope_index: {e}")))?;
             self.rope_delta = delta;
             let pos = rope::mrope_position_tensor(&text, &height, &width, &self.device)?;
@@ -603,11 +611,12 @@ impl Qwen3_5ForCausalLM {
         offset: usize,
         image_embeds: &Tensor,
         image_token_id: u32,
+        grids: &[(usize, usize)],
     ) -> candle_core::Result<Tensor> {
         let (_, l) = input.dims2()?;
-        let hidden = self
-            .base
-            .forward_with_vision(input, offset, image_embeds, image_token_id)?;
+        let hidden =
+            self.base
+                .forward_with_vision(input, offset, image_embeds, image_token_id, grids)?;
         hidden.i((.., l - 1.., ..))?.apply(&self.lm_head)
     }
 
diff --git a/crates/neuron/src/harness/arch/qwen3_5/rope.rs b/crates/neuron/src/harness/arch/qwen3_5/rope.rs
index 1d547b1..32bcc4d 100644
--- a/crates/neuron/src/harness/arch/qwen3_5/rope.rs
+++ b/crates/neuron/src/harness/arch/qwen3_5/rope.rs
@@ -260,28 +260,40 @@ pub(crate) fn mrope_enabled() -> bool {
 /// off, returns plain sequential identity positions on all three axes
 /// (`mrope_cos_sin` then reduces exactly to plain RoPE), restoring the
 /// pre-M-RoPE behaviour without touching the rest of the forward.
-pub(crate) fn get_rope_index(input_ids: &[u32], image_token_id: u32) -> Result<MRopeIndex> {
+pub(crate) fn get_rope_index(
+    input_ids: &[u32],
+    image_token_id: u32,
+    grids: &[(usize, usize)],
+) -> Result<MRopeIndex> {
     if !mrope_enabled() {
         let seq: Vec<i64> = (0..input_ids.len() as i64).collect();
         return Ok((seq.clone(), seq.clone(), seq, 0));
     }
-    compute_mrope_index(input_ids, image_token_id)
+    compute_mrope_index(input_ids, image_token_id, grids)
 }
 
 /// The real interleaved-M-RoPE position-id computation (always active in
 /// unit tests; gated behind [`get_rope_index`] at runtime).
 ///
-/// Fixed-resolution assumption (Stage C): each image run is a perfect
-/// square with `grid_t = 1` (still image) and `grid_h = grid_w =
-/// isqrt(run_len)` — 196 → 14×14. Dynamic resolution (#14) would thread
-/// real per-image grids instead.
-pub(crate) fn compute_mrope_index(input_ids: &[u32], image_token_id: u32) -> Result<MRopeIndex> {
+/// `grids` carries the post-merge LM grid `(lm_gh, lm_gw)` for each image
+/// run, in prompt order — a run length alone cannot recover its
+/// factorisation, so the grids must be passed (#14 dynamic resolution).
+/// Each image is a still frame (`grid_t = 1`); its tokens get
+/// `[base, base + hh, base + ww]` row-major and the shared counter
+/// resumes at `base + max(lm_gh, lm_gw)`. Multi-image is correct because
+/// the counter threads across images and interleaved text.
+pub(crate) fn compute_mrope_index(
+    input_ids: &[u32],
+    image_token_id: u32,
+    grids: &[(usize, usize)],
+) -> Result<MRopeIndex> {
     let n = input_ids.len();
     let mut text = Vec::with_capacity(n);
     let mut height = Vec::with_capacity(n);
     let mut width = Vec::with_capacity(n);
     let mut counter: i64 = 0;
     let mut i = 0;
+    let mut k = 0; // index into `grids`, one per image run
     while i < n {
         if input_ids[i] == image_token_id {
             let start = i;
@@ -289,25 +301,30 @@ pub(crate) fn compute_mrope_index(input_ids: &[u32], image_token_id: u32) -> Res
                 i += 1;
             }
             let run = i - start;
-            let g = run.isqrt();
-            if g * g != run {
+            let (grid_h, grid_w) = *grids.get(k).ok_or_else(|| {
+                anyhow::anyhow!(
+                    "get_rope_index: image run #{k} (len {run}) has no matching grid \
+                     ({} grids supplied)",
+                    grids.len()
+                )
+            })?;
+            k += 1;
+            if grid_h * grid_w != run {
                 anyhow::bail!(
-                    "get_rope_index: image run length {run} is not a perfect square \
-                     (fixed-resolution Stage C assumes a square grid; dynamic resolution is #14)"
+                    "get_rope_index: image run #{} length {run} != grid {grid_h}×{grid_w} = {}",
+                    k - 1,
+                    grid_h * grid_w
                 );
             }
-            let (grid_t, grid_h, grid_w) = (1usize, g, g);
             let base = counter;
-            for tt in 0..grid_t {
-                for hh in 0..grid_h {
-                    for ww in 0..grid_w {
-                        text.push(base + tt as i64);
-                        height.push(base + hh as i64);
-                        width.push(base + ww as i64);
-                    }
+            for hh in 0..grid_h {
+                for ww in 0..grid_w {
+                    text.push(base); // grid_t = 1 → temporal axis const
+                    height.push(base + hh as i64);
+                    width.push(base + ww as i64);
                 }
             }
-            counter = base + grid_t.max(grid_h).max(grid_w) as i64;
+            counter = base + grid_h.max(grid_w) as i64;
         } else {
             text.push(counter);
             height.push(counter);
@@ -316,6 +333,12 @@ pub(crate) fn compute_mrope_index(input_ids: &[u32], image_token_id: u32) -> Res
             i += 1;
         }
     }
+    if k != grids.len() {
+        anyhow::bail!(
+            "get_rope_index: prompt has {k} image run(s) but {} grid(s) were supplied",
+            grids.len()
+        );
+    }
     let delta = counter - n as i64;
     Ok((text, height, width, delta))
 }
@@ -447,7 +470,7 @@ mod tests {
 
     #[test]
     fn get_rope_index_text_only_is_sequential() {
-        let (t, h, w, delta) = compute_mrope_index(&[1, 2, 3, 4], 99).unwrap();
+        let (t, h, w, delta) = compute_mrope_index(&[1, 2, 3, 4], 99, &[]).unwrap();
         assert_eq!(t, vec![0, 1, 2, 3]);
         assert_eq!(h, vec![0, 1, 2, 3]);
         assert_eq!(w, vec![0, 1, 2, 3]);
@@ -456,12 +479,12 @@ mod tests {
 
     #[test]
     fn get_rope_index_text_image_text() {
-        // [text, image(2x2 run of 4), text]. image_token = 99.
+        // [text, image(2x2 run of 4), text]. image_token = 99, grid (2,2).
         let ids = [1u32, 99, 99, 99, 99, 2];
-        let (t, h, w, delta) = compute_mrope_index(&ids, 99).unwrap();
-        // token 0: text → 0. image base=1, grid 1x2x2:
+        let (t, h, w, delta) = compute_mrope_index(&ids, 99, &[(2, 2)]).unwrap();
+        // token 0: text → 0. image base=1, grid 2x2:
         //   t all = 1; h = base+row = [1,1,2,2]; w = base+col = [1,2,1,2].
-        // resume from base + max(1,2,2) = 3. trailing text → 3.
+        // resume from base + max(2,2) = 3. trailing text → 3.
         assert_eq!(t, vec![0, 1, 1, 1, 1, 3]);
         assert_eq!(h, vec![0, 1, 1, 2, 2, 3]);
         assert_eq!(w, vec![0, 1, 2, 1, 2, 3]);
@@ -472,25 +495,52 @@ mod tests {
         assert_eq!(6 + delta, 4);
     }
 
+    #[test]
+    fn get_rope_index_nonsquare_single_image() {
+        // text + image(2 rows × 3 cols = 6 tokens). grid (2,3).
+        let ids = [1u32, 99, 99, 99, 99, 99, 99];
+        let (t, h, w, delta) = compute_mrope_index(&ids, 99, &[(2, 3)]).unwrap();
+        // base = 1; row-major h = [0,0,0,1,1,1]+1, w = [0,1,2,0,1,2]+1.
+        assert_eq!(t, vec![0, 1, 1, 1, 1, 1, 1]);
+        assert_eq!(h, vec![0, 1, 1, 1, 2, 2, 2]);
+        assert_eq!(w, vec![0, 1, 2, 3, 1, 2, 3]);
+        // resume from base + max(2,3) = 4; seq_len 7, counter 4 → delta -3.
+        assert_eq!(delta, 4 - 7);
+    }
+
+    #[test]
+    fn get_rope_index_two_images_different_grids() {
+        // img(2x2)=4, text, img(1x3)=3. grids [(2,2),(1,3)].
+        let ids = [99, 99, 99, 99, 7, 99, 99, 99];
+        let (t, h, w, delta) = compute_mrope_index(&ids, 99, &[(2, 2), (1, 3)]).unwrap();
+        // img1 base=0 → t=0, h=[0,0,1,1], w=[0,1,0,1]; resume max(2,2)=2.
+        // text at counter 2. img2 base=3 → t=3, h=[3,3,3], w=[3,4,5];
+        // resume 3+max(1,3)=6.
+        assert_eq!(t, vec![0, 0, 0, 0, 2, 3, 3, 3]);
+        assert_eq!(h, vec![0, 0, 1, 1, 2, 3, 3, 3]);
+        assert_eq!(w, vec![0, 1, 0, 1, 2, 3, 4, 5]);
+        assert_eq!(delta, 6 - 8);
+    }
+
     #[test]
     fn get_rope_index_on_by_default() {
         // With NEURON_MROPE unset (default ON), the runtime path returns
-        // the real interleaved-M-RoPE positions, so image tokens carry
-        // their 2D grid coords (height differs from the text counter).
-        // (NEURON_MROPE=0 would fall back to identity; not asserted here
-        // since it depends on env.)
-        let (t, h, w, _delta) = get_rope_index(&[1, 99, 99, 99, 99, 2], 99).unwrap();
-        // Same as compute_mrope_index: 2x2 image after one text token.
+        // the real interleaved-M-RoPE positions. (NEURON_MROPE=0 would fall
+        // back to identity; not asserted here since it depends on env.)
+        let (t, h, w, _delta) = get_rope_index(&[1, 99, 99, 99, 99, 2], 99, &[(2, 2)]).unwrap();
         assert_eq!(t, vec![0, 1, 1, 1, 1, 3]);
         assert_eq!(h, vec![0, 1, 1, 2, 2, 3]);
         assert_eq!(w, vec![0, 1, 2, 1, 2, 3]);
     }
 
     #[test]
-    fn get_rope_index_rejects_non_square_image_run() {
-        // 196 is square (14x14) — ok. 195 is not.
-        assert!(compute_mrope_index(&[99u32; 196], 99).is_ok());
-        assert!(compute_mrope_index(&[99u32; 195], 99).is_err());
+    fn get_rope_index_grid_mismatches_error() {
+        // run length != grid product.
+        assert!(compute_mrope_index(&[99u32; 6], 99, &[(2, 2)]).is_err());
+        // too few grids for the number of image runs.
+        assert!(compute_mrope_index(&[99, 99, 7, 99], 99, &[(1, 2)]).is_err());
+        // too many grids.
+        assert!(compute_mrope_index(&[99, 99], 99, &[(1, 2), (1, 1)]).is_err());
     }
 
     #[test]
@@ -501,7 +551,7 @@ mod tests {
         let dev = Device::Cpu;
         let rope = RotaryEmbedding::new(DType::F32, &qwen36_cfg(), &dev).unwrap();
         let ids = [1u32, 99, 99, 99, 99]; // text + 2x2 image
-        let (t, h, w, _d) = compute_mrope_index(&ids, 99).unwrap();
+        let (t, h, w, _d) = compute_mrope_index(&ids, 99, &[(2, 2)]).unwrap();
         let pos = mrope_position_tensor(&t, &h, &w, &dev).unwrap();
         assert_eq!(pos.dims(), &[3, 5]);
         let (cos, _sin) = rope.mrope_cos_sin(&pos).unwrap();
@@ -518,7 +568,7 @@ mod tests {
     fn get_rope_index_196_is_14x14() {
         let mut ids = vec![1u32]; // one text token
         ids.extend(std::iter::repeat_n(99u32, 196));
-        let (t, h, w, _delta) = compute_mrope_index(&ids, 99).unwrap();
+        let (t, h, w, _delta) = compute_mrope_index(&ids, 99, &[(14, 14)]).unwrap();
         // image base = 1. Last image token (index 196) is grid (h=13,w=13).
         assert_eq!(*t.last().unwrap(), 1, "grid_t=1 → temporal const at base");
         assert_eq!(h[1], 1, "first image row at base");
diff --git a/crates/neuron/src/harness/candle.rs b/crates/neuron/src/harness/candle.rs
index 613d8d3..2dc8da7 100644
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -210,13 +210,11 @@ pub struct LoadedModel {
     /// targets and the worker forward uses it to locate splice
     /// positions in the LM input embeddings.
     pub image_token_id: Option<u32>,
-    /// LM-side tokens this model's vision tower emits per image at
-    /// the Stage B fixed resolution (448×448 → 196 for Qwen3.6).
-    /// `None` for text-only models. Set at load time so the
-    /// hot path doesn't recompute it per request. Stage B fixed
-    /// resolution → constant; dynamic resolution per #14 makes it
-    /// per-image.
-    pub lm_tokens_per_image: Option<usize>,
+    /// `patch_size × spatial_merge_size` — divides a resized pixel
+    /// dimension into LM-grid units. Per-image LM token count is
+    /// `(h/factor) × (w/factor)` (#14 dynamic resolution). `None` for
+    /// text-only models. Set at load time.
+    pub image_grid_factor: Option<usize>,
 }
 
 impl LoadedModel {
@@ -288,9 +286,9 @@ pub struct TpLoadedModel {
     pub has_vision: bool,
     /// `<|image_pad|>` token id — same as [`LoadedModel::image_token_id`].
     pub image_token_id: Option<u32>,
-    /// LM-side tokens per image at the fixed 448×448 resolution — same
-    /// as [`LoadedModel::lm_tokens_per_image`].
-    pub lm_tokens_per_image: Option<usize>,
+    /// Pixel→LM-grid divisor — same as
+    /// [`LoadedModel::image_grid_factor`].
+    pub image_grid_factor: Option<usize>,
 }
 
 #[cfg(feature = "cuda")]
@@ -394,10 +392,11 @@ impl ModelArch {
         offset: usize,
         image_embeds: &Tensor,
         image_token_id: u32,
+        grids: &[(usize, usize)],
     ) -> Result<Tensor> {
         let raw = match self {
             ModelArch::Qwen3_5Dense(m) => {
-                m.forward_with_vision(input, offset, image_embeds, image_token_id)?
+                m.forward_with_vision(input, offset, image_embeds, image_token_id, grids)?
             }
             other => anyhow::bail!(
                 "forward_with_vision: architecture {} has no vision tower",
@@ -407,6 +406,20 @@ impl ModelArch {
         squeeze_to_vocab(&raw)
     }
 
+    /// `patch_size × spatial_merge_size` for the loaded vision tower —
+    /// divides a resized pixel dim into LM-grid units (an image of
+    /// resized `(h, w)` yields the LM grid `(h/factor, w/factor)`).
+    /// `None` for architectures/checkpoints without a vision tower.
+    pub fn vision_grid_factor(&self) -> Option<usize> {
+        match self {
+            ModelArch::Qwen3_5Dense(m) => m.vision().map(|v| {
+                let c = v.config();
+                c.patch_size * c.spatial_merge_size
+            }),
+            _ => None,
+        }
+    }
+
     /// Encode a preprocessed image into LM-side token embeddings via
     /// the loaded vision tower. Stage A5.
     ///
@@ -1683,11 +1696,11 @@ impl CandleHarness {
                     .ok_or_else(|| InferenceError::VisionUnsupported {
                         model_id: request.model.clone(),
                     })?;
-                let patches_per_image = loaded
-                    .lm_tokens_per_image
-                    .ok_or_else(|| InferenceError::VisionUnsupported {
+                let factor = loaded.image_grid_factor.ok_or_else(|| {
+                    InferenceError::VisionUnsupported {
                         model_id: request.model.clone(),
-                    })?;
+                    }
+                })?;
                 let profile = super::preprocess::PreprocessProfile::qwen3_6();
                 let images = extract_images_from_request(&request, &profile).map_err(|e| {
                     InferenceError::Other(anyhow::anyhow!("extract_images: {e}"))
@@ -1699,7 +1712,12 @@ impl CandleHarness {
                         "request has image content but extractor produced zero images"
                     )));
                 }
-                let per_image_counts: Vec<usize> = vec![patches_per_image; images.len()];
+                // Per-image LM token count from each image's resized grid
+                // (#14 dynamic resolution; was a constant 196).
+                let per_image_counts: Vec<usize> = images
+                    .iter()
+                    .map(|im| (im.h / factor) * (im.w / factor))
+                    .collect();
                 prompt_tokens =
                     expand_image_pad_tokens(&prompt_tokens, image_token_id, &per_image_counts)
                         .map_err(InferenceError::Other)?;
@@ -2059,11 +2077,12 @@ impl CandleHarness {
                         .ok_or_else(|| InferenceError::VisionUnsupported {
                             model_id: request.model.clone(),
                         })?;
-                let patches_per_image = loaded.lm_tokens_per_image.ok_or_else(|| {
-                    InferenceError::VisionUnsupported {
-                        model_id: request.model.clone(),
-                    }
-                })?;
+                let factor =
+                    loaded
+                        .image_grid_factor
+                        .ok_or_else(|| InferenceError::VisionUnsupported {
+                            model_id: request.model.clone(),
+                        })?;
                 let profile = super::preprocess::PreprocessProfile::qwen3_6();
                 let images = extract_images_from_request(&request, &profile)
                     .map_err(|e| InferenceError::Other(anyhow::anyhow!("extract_images: {e}")))?;
@@ -2072,7 +2091,11 @@ impl CandleHarness {
                         "request has image content but extractor produced zero images"
                     )));
                 }
-                let per_image_counts: Vec<usize> = vec![patches_per_image; images.len()];
+                // Per-image LM token count from each image's resized grid (#14).
+                let per_image_counts: Vec<usize> = images
+                    .iter()
+                    .map(|im| (im.h / factor) * (im.w / factor))
+                    .collect();
                 prompt_tokens =
                     expand_image_pad_tokens(&prompt_tokens, image_token_id, &per_image_counts)
                         .map_err(InferenceError::Other)?;
@@ -2526,7 +2549,7 @@ impl Harness for CandleHarness {
             chat_template,
             has_vision: vision_meta.has_vision,
             image_token_id: vision_meta.image_token_id,
-            lm_tokens_per_image: vision_meta.lm_tokens_per_image,
+            image_grid_factor: vision_meta.image_grid_factor,
         });
 
         let mut models = self.models.write().await;
@@ -2742,7 +2765,7 @@ impl CandleHarness {
             tracing::info!(
                 model = %spec.model_id,
                 image_token_id = ?vision_meta.image_token_id,
-                lm_tokens_per_image = ?vision_meta.lm_tokens_per_image,
+                image_grid_factor = ?vision_meta.image_grid_factor,
                 "TP load: vision tower present, advertising vision capability"
             );
         }
@@ -2764,7 +2787,7 @@ impl CandleHarness {
             chat_template,
             has_vision: vision_meta.has_vision,
             image_token_id: vision_meta.image_token_id,
-            lm_tokens_per_image: vision_meta.lm_tokens_per_image,
+            image_grid_factor: vision_meta.image_grid_factor,
         });
 
         let mut models = self.models.write().await;
@@ -2938,18 +2961,32 @@ impl CandleHarness {
                     .ok_or_else(|| InferenceError::VisionUnsupported {
                         model_id: request.model.clone(),
                     })?;
-            let patches_per_image =
-                tp.lm_tokens_per_image
-                    .ok_or_else(|| InferenceError::VisionUnsupported {
-                        model_id: request.model.clone(),
-                    })?;
+            let factor = tp
+                .image_grid_factor
+                .ok_or_else(|| InferenceError::VisionUnsupported {
+                    model_id: request.model.clone(),
+                })?;
             let data_uris = extract_image_data_uris(&request);
             if data_uris.is_empty() {
                 return Err(InferenceError::Other(anyhow::anyhow!(
                     "request has image content but extractor produced zero data URIs"
                 )));
             }
-            let per_image_counts: Vec<usize> = vec![patches_per_image; data_uris.len()];
+            // Per-image LM token count from each image's resized grid (#14).
+            // Decode header + smart_resize only; the workers re-derive the
+            // same dims when they preprocess for the replicated tower.
+            let profile = super::preprocess::PreprocessProfile::qwen3_6();
+            let per_image_counts: Vec<usize> = data_uris
+                .iter()
+                .enumerate()
+                .map(|(i, uri)| {
+                    let (h, w) =
+                        super::preprocess::resized_dims_for_uri(uri, &profile).map_err(|e| {
+                            InferenceError::Other(anyhow::anyhow!("resized_dims image #{i}: {e}"))
+                        })?;
+                    Ok::<usize, InferenceError>((h as usize / factor) * (w as usize / factor))
+                })
+                .collect::<Result<Vec<_>, _>>()?;
             prompt_tokens =
                 expand_image_pad_tokens(&prompt_tokens, image_token_id, &per_image_counts)
                     .map_err(InferenceError::Other)?;
@@ -3457,18 +3494,30 @@ async fn chat_completion_tp_inner(
                 .ok_or_else(|| InferenceError::VisionUnsupported {
                     model_id: request.model.clone(),
                 })?;
-        let patches_per_image =
-            tp.lm_tokens_per_image
-                .ok_or_else(|| InferenceError::VisionUnsupported {
-                    model_id: request.model.clone(),
-                })?;
+        let factor = tp
+            .image_grid_factor
+            .ok_or_else(|| InferenceError::VisionUnsupported {
+                model_id: request.model.clone(),
+            })?;
         let data_uris = extract_image_data_uris(&request);
         if data_uris.is_empty() {
             return Err(InferenceError::Other(anyhow::anyhow!(
                 "request has image content but extractor produced zero data URIs"
             )));
         }
-        let per_image_counts: Vec<usize> = vec![patches_per_image; data_uris.len()];
+        // Per-image LM token count from each image's resized grid (#14).
+        let profile = super::preprocess::PreprocessProfile::qwen3_6();
+        let per_image_counts: Vec<usize> = data_uris
+            .iter()
+            .enumerate()
+            .map(|(i, uri)| {
+                let (h, w) =
+                    super::preprocess::resized_dims_for_uri(uri, &profile).map_err(|e| {
+                        InferenceError::Other(anyhow::anyhow!("resized_dims image #{i}: {e}"))
+                    })?;
+                Ok::<usize, InferenceError>((h as usize / factor) * (w as usize / factor))
+            })
+            .collect::<Result<Vec<_>, _>>()?;
         prompt_tokens = expand_image_pad_tokens(&prompt_tokens, image_token_id, &per_image_counts)
             .map_err(InferenceError::Other)?;
         Some((data_uris, image_token_id))
@@ -3917,10 +3966,12 @@ fn build_prompt_for_request(
 struct VisionMeta {
     has_vision: bool,
     image_token_id: Option<u32>,
-    /// LM-side tokens this model's vision tower emits per image at
-    /// the Stage B fixed `PreprocessProfile::qwen3_6()` resolution
-    /// (448×448). Equal to `(H/patch_size/spatial_merge_size)²`.
-    lm_tokens_per_image: Option<usize>,
+    /// `patch_size × spatial_merge_size` — the divisor that turns a
+    /// resized pixel dimension into an LM-grid dimension. An image of
+    /// resized `(h, w)` emits `(h/factor) × (w/factor)` LM tokens (#14
+    /// dynamic resolution; was a constant 196 at the old fixed 448²).
+    /// `None` for text-only models.
+    image_grid_factor: Option<usize>,
 }
 
 impl VisionMeta {
@@ -3949,22 +4000,18 @@ impl VisionMeta {
             .get("image_token_id")
             .and_then(|x| x.as_u64())
             .map(|n| n as u32);
-        // Compute LM tokens per image at the Stage B fixed resolution
-        // (PreprocessProfile::qwen3_6() → 448×448). One LM token per
-        // spatial-merge group of patches.
-        let target_h = super::preprocess::PreprocessProfile::qwen3_6().target_height as usize;
-        let target_w = super::preprocess::PreprocessProfile::qwen3_6().target_width as usize;
-        let lm_tokens_per_image = if patch_size > 0 && spatial_merge_size > 0 {
-            let gh = target_h / patch_size / spatial_merge_size;
-            let gw = target_w / patch_size / spatial_merge_size;
-            Some(gh * gw)
+        // The pixel→LM-grid divisor. An image resized to (h, w) emits
+        // (h/factor) × (w/factor) LM tokens — computed per image at
+        // request time now that resolution is dynamic (#14).
+        let image_grid_factor = if patch_size > 0 && spatial_merge_size > 0 {
+            Some(patch_size * spatial_merge_size)
         } else {
             None
         };
         Self {
             has_vision: true,
             image_token_id,
-            lm_tokens_per_image,
+            image_grid_factor,
         }
     }
 }
@@ -4011,13 +4058,13 @@ fn extract_images_from_request(
                     .and_then(|v| v.get("url"))
                     .and_then(|v| v.as_str())
                     .ok_or_else(|| anyhow::anyhow!("image_url part missing url field"))?;
-                let pixels = super::preprocess::preprocess_data_uri(url, profile)
+                let (pixels, h, w) = super::preprocess::preprocess_data_uri(url, profile)
                     .with_context(|| format!("preprocess image #{}", out.len()))?;
                 out.push(super::device_worker::jobs::ImageInput {
                     pixels,
                     c: 3,
-                    h: profile.target_height as usize,
-                    w: profile.target_width as usize,
+                    h: h as usize,
+                    w: w as usize,
                 });
             }
         }
diff --git a/crates/neuron/src/harness/device_worker/dispatch.rs b/crates/neuron/src/harness/device_worker/dispatch.rs
index 62c60c2..6df69ef 100644
--- a/crates/neuron/src/harness/device_worker/dispatch.rs
+++ b/crates/neuron/src/harness/device_worker/dispatch.rs
@@ -779,19 +779,17 @@ fn tp_forward_logits_with_images(
         anyhow::bail!("TpForwardLogitsWithImages dispatched with zero images");
     }
 
-    // Preprocess every image into a device-resident (C, H, W) tensor.
-    // Same fixed-resolution profile + decode path the subprocess workers
-    // run, so the encoded embeddings match across ranks bit-for-bit.
+    // Preprocess every image into a device-resident (C, H, W) tensor at
+    // its native-aspect resized dims (#14). Same `smart_resize` + decode
+    // path the subprocess workers run, so the encoded embeddings — and
+    // the per-image grids derived from these dims — match across ranks
+    // bit-for-bit.
     let profile = PreprocessProfile::qwen3_6();
-    let (h, w) = (
-        profile.target_height as usize,
-        profile.target_width as usize,
-    );
     let mut pixels: Vec<Tensor> = Vec::with_capacity(image_data_uris.len());
     for (idx, uri) in image_data_uris.iter().enumerate() {
-        let px = preprocess_data_uri(uri, &profile)
+        let (px, h, w) = preprocess_data_uri(uri, &profile)
             .with_context(|| format!("preprocess image[{idx}] (TP leader)"))?;
-        let t = Tensor::from_vec(px, (3, h, w), &state.device)?;
+        let t = Tensor::from_vec(px, (3, h as usize, w as usize), &state.device)?;
         pixels.push(t);
     }
 
@@ -877,9 +875,17 @@ fn forward_logits_with_images(
         anyhow::anyhow!("ForwardLogitsWithImages: no model for handle {}", handle.0)
     })?;
 
+    // pixel→LM-grid divisor (patch×merge) for this tower; each image's
+    // LM grid is (h/factor, w/factor) (#14 dynamic resolution).
+    let factor = arch.vision_grid_factor().ok_or_else(|| {
+        anyhow::anyhow!("ForwardLogitsWithImages: loaded model has no vision tower")
+    })?;
+
     // Encode every image on the worker's device, collecting per-image
-    // post-merger embeddings as device-resident tensors.
+    // post-merger embeddings as device-resident tensors plus their LM
+    // grids (for the interleaved-M-RoPE position ids).
     let mut per_image: Vec<Tensor> = Vec::with_capacity(images.len());
+    let mut grids: Vec<(usize, usize)> = Vec::with_capacity(images.len());
     for (idx, img) in images.into_iter().enumerate() {
         anyhow::ensure!(
             img.pixels.len() == img.c * img.h * img.w,
@@ -889,6 +895,7 @@ fn forward_logits_with_images(
             img.h,
             img.w,
         );
+        grids.push((img.h / factor, img.w / factor));
         let image = Tensor::from_vec(img.pixels, (img.c, img.h, img.w), &state.device)?;
         let embed = arch
             .encode_image(&image)
@@ -901,7 +908,7 @@ fn forward_logits_with_images(
     let image_embeds = Tensor::cat(&per_image.iter().collect::<Vec<_>>(), 0)?;
 
     let input = Tensor::new(tokens, &state.device)?.unsqueeze(0)?;
-    let logits = arch.forward_with_vision(&input, offset, &image_embeds, image_token_id)?;
+    let logits = arch.forward_with_vision(&input, offset, &image_embeds, image_token_id, &grids)?;
     let values = logits
         .to_dtype(DType::F32)?
         .flatten_all()?
diff --git a/crates/neuron/src/harness/device_worker/jobs.rs b/crates/neuron/src/harness/device_worker/jobs.rs
index fc3587a..d53826e 100644
--- a/crates/neuron/src/harness/device_worker/jobs.rs
+++ b/crates/neuron/src/harness/device_worker/jobs.rs
@@ -36,8 +36,13 @@ pub struct TpHandle(pub u64);
 /// `Clone` so the vision-aware dispatch in `chat_completion` can
 /// match `&vision_route` (carrying borrowed images) and still hand
 /// owned `Vec<ImageInput>` to the worker job. The clone cost is one
-/// pixel-buffer memcpy per image — fine at fixed-resolution sizes
-/// (3 × 448 × 448 × 4 bytes = ~2.4 MiB per image).
+/// pixel-buffer memcpy per image — now variable with dynamic resolution
+/// (#14): `3 × h × w × 4` bytes, up to ~12 MiB at the default 1024²
+/// `max_pixels` budget.
+///
+/// `h`/`w` are the **resized** dims (factor-aligned), so the per-image LM
+/// grid is `(h/factor, w/factor)` — derived downstream for the splice
+/// and the interleaved-M-RoPE position ids.
 #[derive(Clone)]
 pub struct ImageInput {
     pub pixels: Vec<f32>,
diff --git a/crates/neuron/src/harness/preprocess.rs b/crates/neuron/src/harness/preprocess.rs
index 0356f4d..72a0f6f 100644
--- a/crates/neuron/src/harness/preprocess.rs
+++ b/crates/neuron/src/harness/preprocess.rs
@@ -2,11 +2,11 @@
 //!
 //! Decodes `data:image/...;base64,...` URIs from OpenAI-style
 //! `image_url` content parts into the patch tensors a candle vision
-//! tower expects. Stage A ships **fixed resolution** — every image
-//! is resized to the same target dimensions (default 448×448 for
-//! Qwen3.6, configurable per-call) so the patch count is constant
-//! per image. Variable resolution per [Qwen2VL convention] is tracked
-//! as issue #14.
+//! tower expects. Resolution is **dynamic** (#14): each image is
+//! resized to its native aspect via Qwen `smart_resize` — a
+//! factor-aligned `(h, w)` whose pixel count lands in the profile's
+//! `[min_pixels, max_pixels]` budget — so the LM token count varies per
+//! image (`(h/factor) × (w/factor)`).
 //!
 //! Spec reference: `doc/vision-qwen3_6-spec.md` — preprocessor
 //! section.
@@ -21,7 +21,7 @@
 //! Pipeline (per image):
 //!   1. data: URI → base64 decode → bytes
 //!   2. bytes → image::DynamicImage (PNG/JPEG/WebP/etc)
-//!   3. resize_exact to target H×W (pixel space)
+//!   3. smart_resize to a native-aspect, factor-aligned H×W (pixel space)
 //!   4. RGB→f32, normalise per mean/std
 //!   5. layout to (C, H, W) tensor
 //!
@@ -34,39 +34,93 @@ use base64::Engine;
 use image::DynamicImage;
 use image::imageops::FilterType;
 
-/// Preprocessing target. Captures the resize dimensions and the
-/// channel-wise normalisation constants from the model's
-/// `preprocessor_config.json`. Stage A ships a single `qwen3_6()`
-/// constructor for fixed-resolution Qwen3.6 preprocessing; other
-/// models can ship their own profile when added.
+/// Preprocessing target. Captures the resize policy (Qwen `smart_resize`
+/// factor + pixel budget) and the channel-wise normalisation constants
+/// from the model's `preprocessor_config.json`. Images are resized to
+/// their **native aspect** — a factor-aligned `(h, w)` whose pixel count
+/// lands in `[min_pixels, max_pixels]` — not a fixed square (#14).
 #[derive(Debug, Clone)]
 pub struct PreprocessProfile {
-    pub target_height: u32,
-    pub target_width: u32,
+    /// Both output dims are multiples of this. For Qwen3.6 it is
+    /// `patch_size(16) × spatial_merge_size(2) = 32`, so the post-merge
+    /// LM grid is exactly `(h/factor, w/factor)`.
+    pub factor: u32,
+    /// Lower pixel bound — tiny images are upscaled to at least this.
+    pub min_pixels: u32,
+    /// Upper pixel bound — large images are downscaled to at most this.
+    /// Caps per-image LM tokens (`max_pixels / factor²`) and the
+    /// O(patches²) ViT attention cost.
+    pub max_pixels: u32,
     pub image_mean: [f32; 3],
     pub image_std: [f32; 3],
 }
 
 impl PreprocessProfile {
-    /// Stage A profile for Qwen3.6. Resize to 448×448, normalise to
-    /// `[-1, 1]` via mean=std=0.5. Fits within the model's
-    /// `num_position_embeddings=2304` budget at 28×28 = 784 patches
-    /// before merging.
+    /// Profile for Qwen3.6. Native-aspect `smart_resize` (factor 32),
+    /// normalise to `[-1, 1]` via mean=std=0.5. Pixel budget defaults:
+    /// `min = 256² = 65536` (→ 8×8 = 64 LM tokens) and
+    /// `max = 1024² = 1048576` (→ 32×32 = 1024 LM tokens) — generous for
+    /// documents/OCR, bounded for serving on 2×RTX5090. (Operator
+    /// override lands with the `[harness.candle.vision]` config in #14 C5.)
     pub fn qwen3_6() -> Self {
         Self {
-            target_height: 448,
-            target_width: 448,
+            factor: 32,
+            min_pixels: 65_536,
+            max_pixels: 1_048_576,
             image_mean: [0.5, 0.5, 0.5],
             image_std: [0.5, 0.5, 0.5],
         }
     }
 
-    /// Per-channel CHW tensor length: 3 * H * W.
-    pub fn pixels_chw(&self) -> usize {
-        3 * (self.target_height as usize) * (self.target_width as usize)
+    /// The factor-aligned `(h, w)` this profile would resize a source
+    /// `src_h × src_w` image to. Pure integer policy — no pixel work.
+    pub fn resized_dims(&self, src_h: u32, src_w: u32) -> Result<(u32, u32)> {
+        smart_resize(src_h, src_w, self.factor, self.min_pixels, self.max_pixels)
     }
 }
 
+/// Qwen `smart_resize`: a `factor`-aligned `(h_bar, w_bar)` that keeps
+/// the source aspect ratio as closely as possible while keeping the
+/// pixel count within `[min_pixels, max_pixels]`. Port of the canonical
+/// Qwen2-VL / Qwen3-VL image-processor function (so neuron's grid
+/// matches what the model was trained on).
+///
+/// Returns `(height, width)`. Errors if `factor` is 0, or (matching
+/// upstream) if the aspect ratio exceeds 200:1 — a degenerate thin strip.
+pub fn smart_resize(
+    height: u32,
+    width: u32,
+    factor: u32,
+    min_pixels: u32,
+    max_pixels: u32,
+) -> Result<(u32, u32)> {
+    anyhow::ensure!(factor > 0, "smart_resize: factor must be non-zero");
+    let (h, w) = (height.max(1) as f64, width.max(1) as f64);
+    let ratio = h.max(w) / h.min(w);
+    if ratio > 200.0 {
+        anyhow::bail!(
+            "image aspect ratio {ratio:.1}:1 exceeds the 200:1 limit ({height}×{width}); \
+             refusing to resize"
+        );
+    }
+    let f = factor as f64;
+    let (minp, maxp) = (min_pixels as f64, max_pixels as f64);
+    // Round to the nearest factor multiple, ties-to-even like Python's
+    // `round` (may be 0 for sub-factor inputs; the min branch regrows it).
+    let mut h_bar = (h / f).round_ties_even() * f;
+    let mut w_bar = (w / f).round_ties_even() * f;
+    if h_bar * w_bar > maxp {
+        let beta = (h * w / maxp).sqrt();
+        h_bar = f.max((h / beta / f).floor() * f);
+        w_bar = f.max((w / beta / f).floor() * f);
+    } else if h_bar * w_bar < minp {
+        let beta = (minp / (h * w)).sqrt();
+        h_bar = (h * beta / f).ceil() * f;
+        w_bar = (w * beta / f).ceil() * f;
+    }
+    Ok((h_bar as u32, w_bar as u32))
+}
+
 /// Decode a `data:image/...;base64,...` URI into an in-memory image.
 ///
 /// Accepts the OpenAI Chat Completions `image_url` shape — a string
@@ -106,16 +160,13 @@ pub fn decode_data_uri(uri: &str) -> Result<DynamicImage> {
 /// faster on CPU. Quality difference is marginal for downstream
 /// vision-encoder consumption. The numerical-validation issue (#15)
 /// will quantify any discrepancy.
-pub fn preprocess(img: &DynamicImage, profile: &PreprocessProfile) -> Vec<f32> {
+pub fn preprocess(img: &DynamicImage, profile: &PreprocessProfile) -> Result<(Vec<f32>, u32, u32)> {
+    let (h_bar, w_bar) = profile.resized_dims(img.height(), img.width())?;
     let rgb = img
-        .resize_exact(
-            profile.target_width,
-            profile.target_height,
-            FilterType::Triangle,
-        )
+        .resize_exact(w_bar, h_bar, FilterType::Triangle)
         .to_rgb8();
-    let h = profile.target_height as usize;
-    let w = profile.target_width as usize;
+    let h = h_bar as usize;
+    let w = w_bar as usize;
     let mut out = vec![0.0_f32; 3 * h * w];
     // Row-major (C, H, W). Candle's Conv2d expects NCHW, so this is
     // the natural layout — the caller stacks `n` of these along the
@@ -131,16 +182,27 @@ pub fn preprocess(img: &DynamicImage, profile: &PreprocessProfile) -> Vec<f32> {
             }
         }
     }
-    out
+    Ok((out, h_bar, w_bar))
 }
 
-/// Combined helper: decode + preprocess in one call. Most call
-/// sites just want the final tensor; the two-step path exists for
-/// callers (tests, future video preprocessing) that need the
+/// Combined helper: decode + preprocess in one call. Returns the
+/// `(3, h, w)` row-major pixels plus the resized `(h, w)` — the caller
+/// needs the dims to build the tensor and to derive the LM token grid
+/// `(h/factor, w/factor)`. Most call sites use this; the two-step path
+/// exists for callers (tests, future video preprocessing) that need the
 /// intermediate `DynamicImage`.
-pub fn preprocess_data_uri(uri: &str, profile: &PreprocessProfile) -> Result<Vec<f32>> {
+pub fn preprocess_data_uri(uri: &str, profile: &PreprocessProfile) -> Result<(Vec<f32>, u32, u32)> {
     let img = decode_data_uri(uri)?;
-    Ok(preprocess(&img, profile))
+    preprocess(&img, profile)
+}
+
+/// Resized `(h, w)` for a data-URI image **without** running the pixel
+/// resample/normalisation — image decode + `smart_resize` policy only.
+/// Lets a caller that just needs the LM token count (e.g. the TP leader
+/// expanding the prompt) skip building the normalised `f32` buffer.
+pub fn resized_dims_for_uri(uri: &str, profile: &PreprocessProfile) -> Result<(u32, u32)> {
+    let img = decode_data_uri(uri)?;
+    profile.resized_dims(img.height(), img.width())
 }
 
 #[cfg(test)]
@@ -205,13 +267,17 @@ mod tests {
         // decoding so this test isolates the resize+normalise path.
         let img: ImageBuffer<Rgb<u8>, Vec<u8>> = ImageBuffer::from_pixel(2, 2, Rgb([255, 0, 0]));
         let dyn_img = DynamicImage::ImageRgb8(img);
-        let out = preprocess(&dyn_img, &profile);
+        let (out, h_bar, w_bar) = preprocess(&dyn_img, &profile).expect("preprocess");
 
-        assert_eq!(out.len(), profile.pixels_chw());
+        let h = h_bar as usize;
+        let w = w_bar as usize;
+        assert_eq!(out.len(), 3 * h * w);
+        // Dims are factor-aligned and at least the min-pixel floor.
+        assert_eq!(h_bar % profile.factor, 0);
+        assert_eq!(w_bar % profile.factor, 0);
+        assert!(h * w >= profile.min_pixels as usize);
         // After mean=0.5, std=0.5: red channel (255/255=1.0) → (1.0 - 0.5)/0.5 = 1.0
         // green/blue (0.0) → (0.0 - 0.5)/0.5 = -1.0
-        let h = profile.target_height as usize;
-        let w = profile.target_width as usize;
         assert!(
             (out[0] - 1.0).abs() < 1e-5,
             "R[0] should be 1.0, got {}",
@@ -229,9 +295,12 @@ mod tests {
     #[test]
     fn preprocess_data_uri_end_to_end() {
         let profile = PreprocessProfile::qwen3_6();
-        let out = preprocess_data_uri(&red_png_uri(), &profile).expect("e2e preprocess");
-        assert_eq!(out.len(), profile.pixels_chw());
+        let (out, h, w) = preprocess_data_uri(&red_png_uri(), &profile).expect("e2e preprocess");
+        assert_eq!(out.len(), 3 * h as usize * w as usize);
         assert!(out.iter().all(|v| v.is_finite()));
+        // resized_dims_for_uri agrees with the full preprocess.
+        let (h2, w2) = resized_dims_for_uri(&red_png_uri(), &profile).expect("dims");
+        assert_eq!((h, w), (h2, w2));
     }
 
     #[test]
@@ -240,10 +309,10 @@ mod tests {
         // 1x1 grayscale = 200 → after conversion to RGB, all three
         // channels equal 200, normalised → (200/255 - 0.5)/0.5 ≈ 0.569
         let gray = DynamicImage::ImageLuma8(ImageBuffer::from_pixel(1, 1, image::Luma([200])));
-        let out = preprocess(&gray, &profile);
+        let (out, h_bar, w_bar) = preprocess(&gray, &profile).expect("preprocess");
         let expected = ((200.0 / 255.0) - 0.5) / 0.5;
-        let h = profile.target_height as usize;
-        let w = profile.target_width as usize;
+        let h = h_bar as usize;
+        let w = w_bar as usize;
         for c in 0..3 {
             let v = out[c * h * w];
             assert!(
@@ -252,4 +321,52 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn smart_resize_keeps_factor_aligned_square_in_budget() {
+        // 448×448 sits inside [65536, 1048576] and is factor-aligned →
+        // unchanged. (Regression guard for the old fixed-res sweet spot.)
+        let (h, w) = smart_resize(448, 448, 32, 65_536, 1_048_576).unwrap();
+        assert_eq!((h, w), (448, 448));
+    }
+
+    #[test]
+    fn smart_resize_preserves_aspect_and_caps_at_max() {
+        // 3000×4000 (landscape) → downscaled under max_pixels, aspect kept.
+        let (h, w) = smart_resize(3000, 4000, 32, 65_536, 1_048_576).unwrap();
+        assert_eq!(h % 32, 0);
+        assert_eq!(w % 32, 0);
+        assert!(
+            (h as u64) * (w as u64) <= 1_048_576,
+            "must respect max_pixels"
+        );
+        assert!(w > h, "landscape orientation preserved");
+        // aspect ≈ 4000/3000 = 1.333; allow a factor-rounding tolerance.
+        let ar = w as f64 / h as f64;
+        assert!((ar - 4.0 / 3.0).abs() < 0.15, "aspect ~4:3, got {ar:.3}");
+    }
+
+    #[test]
+    fn smart_resize_floors_tiny_image_at_min() {
+        // 16×16 → upscaled to at least min_pixels, factor-aligned.
+        let (h, w) = smart_resize(16, 16, 32, 65_536, 1_048_576).unwrap();
+        assert_eq!(h % 32, 0);
+        assert_eq!(w % 32, 0);
+        assert!((h as u64) * (w as u64) >= 65_536, "must respect min_pixels");
+    }
+
+    #[test]
+    fn smart_resize_tall_nonsquare_stays_nonsquare() {
+        // A tall screenshot keeps portrait orientation.
+        let (h, w) = smart_resize(2000, 500, 32, 65_536, 1_048_576).unwrap();
+        assert!(h > w, "portrait orientation preserved");
+        assert_eq!(h % 32, 0);
+        assert_eq!(w % 32, 0);
+    }
+
+    #[test]
+    fn smart_resize_rejects_extreme_aspect() {
+        let err = smart_resize(1, 500, 32, 65_536, 1_048_576).unwrap_err();
+        assert!(format!("{err:#}").contains("200:1"));
+    }
 }
diff --git a/crates/neuron/src/harness/tp/tp_qwen3_5.rs b/crates/neuron/src/harness/tp/tp_qwen3_5.rs
index afe2713..654d2ac 100644
--- a/crates/neuron/src/harness/tp/tp_qwen3_5.rs
+++ b/crates/neuron/src/harness/tp/tp_qwen3_5.rs
@@ -1288,15 +1288,39 @@ impl TpQwen3_5ForCausalLM {
         let device = self.device().clone();
         let image_embeds = self.encode_images_concat(image_pixels)?;
 
+        // Each image's LM grid (lm_gh, lm_gw) = (h/factor, w/factor),
+        // factor = patch×merge. Recomputed per rank from this rank's own
+        // pixel tensors — deterministic, so every rank's grids (and hence
+        // M-RoPE positions) match without crossing the RPC (#14).
+        let factor = self
+            .vision
+            .as_ref()
+            .map(|v| {
+                let c = v.config();
+                c.patch_size * c.spatial_merge_size
+            })
+            .ok_or_else(|| {
+                candle_core::Error::Msg(
+                    "prefill_with_images_chunked: loaded without a vision tower".into(),
+                )
+            })?;
+        let grids: Vec<(usize, usize)> = image_pixels
+            .iter()
+            .map(|t| {
+                let (_, h, w) = t.dims3()?;
+                Ok::<(usize, usize), candle_core::Error>((h / factor, w / factor))
+            })
+            .collect::<candle_core::Result<Vec<_>>>()?;
+
         // Interleaved-M-RoPE 3D position ids for the whole prompt,
         // computed once and sliced per chunk so every rank assigns image
-        // tokens their 14×14 grid coordinates (and text after the image
-        // resumes from the compressed counter). `rope_delta` is stored on
-        // the base model for the decode that follows this prefill. Every
-        // chunk — text or image — uses the M-RoPE slice, because the image
-        // shifts the positions of the text around it.
+        // tokens their grid coordinates (and text after an image resumes
+        // from the compressed counter). `rope_delta` is stored on the base
+        // model for the decode that follows this prefill. Every chunk —
+        // text or image — uses the M-RoPE slice, because each image shifts
+        // the positions of the text around it.
         let (text, height, width, delta) =
-            crate::harness::arch::qwen3_5::rope::get_rope_index(tokens, image_token_id)
+            crate::harness::arch::qwen3_5::rope::get_rope_index(tokens, image_token_id, &grids)
                 .map_err(|e| candle_core::Error::Msg(format!("get_rope_index: {e}")))?;
         self.base.set_rope_delta(delta);
         let full_pos = crate::harness::arch::qwen3_5::rope::mrope_position_tensor(
diff --git a/crates/neuron/src/harness/tp/worker.rs b/crates/neuron/src/harness/tp/worker.rs
index 7dd34a1..d1e1415 100644
--- a/crates/neuron/src/harness/tp/worker.rs
+++ b/crates/neuron/src/harness/tp/worker.rs
@@ -494,16 +494,13 @@ impl WorkerState {
         let device = model.device().clone();
 
         // Preprocess each image identically to the leader so the encoded
-        // embeddings — and thus the spliced hidden state — match across
-        // ranks. Fixed 448×448 profile.
+        // embeddings — and thus the spliced hidden state and per-image
+        // grids — match across ranks. Native-aspect `smart_resize` (#14);
+        // deterministic, so each rank derives the same dims.
         let profile = PreprocessProfile::qwen3_6();
-        let (h, w) = (
-            profile.target_height as usize,
-            profile.target_width as usize,
-        );
         let mut pixels: Vec<Tensor> = Vec::with_capacity(image_data_uris.len());
         for (idx, uri) in image_data_uris.iter().enumerate() {
-            let px = match preprocess_data_uri(uri, &profile) {
+            let (px, h, w) = match preprocess_data_uri(uri, &profile) {
                 Ok(p) => p,
                 Err(e) => {
                     return WorkerResponse::Error {
@@ -512,7 +509,7 @@ impl WorkerState {
                     };
                 }
             };
-            match Tensor::from_vec(px, (3, h, w), &device) {
+            match Tensor::from_vec(px, (3, h as usize, w as usize), &device) {
                 Ok(t) => pixels.push(t),
                 Err(e) => {
                     return WorkerResponse::Error {