From c97a8654f505c2d118e011da833d7da3c27b49cd Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Thu, 4 Jun 2026 22:47:27 +0300 Subject: [PATCH] feat(neuron): dynamic-resolution images via Qwen smart_resize (#14) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the fixed 448×448-square preprocess with native-aspect `smart_resize`, and thread the resulting per-image grid through the LM so spatial structure survives non-square images (documents, screenshots, charts, panoramas, OCR) instead of being squished into a square. - preprocess.rs: port Qwen `smart_resize` (factor = patch×merge = 32; pixel budget [min,max], default 256²–1024² → 64–1024 LM tokens). `PreprocessProfile` drops the fixed target dims for `factor`/`min_pixels`/ `max_pixels`; `preprocess`/`preprocess_data_uri` now return the resized `(h, w)`; add `resized_dims_for_uri` (decode + resize, no normalize) for the TP leader's token count. - rope.rs: `compute_mrope_index`/`get_rope_index` take per-image `grids: &[(lm_gh, lm_gw)]` instead of assuming a square `isqrt(run)`. Walk image runs in order, validate `run == gh*gw`, emit row-major positions, resume the shared counter at `base + max(gh,gw)`. Correct for multiple images of differing grids interleaved with text. - candle.rs: `VisionMeta`/`LoadedModel`/`TpLoadedModel` carry the `image_grid_factor` (patch×merge) instead of the constant 196; all four prompt-build sites compute per-image counts from each image's resized grid (single-GPU from the extracted `ImageInput.h/w`, TP from `resized_dims_for_uri`). `ModelArch` gains `vision_grid_factor`. - single-GPU (`mod.rs`, `dispatch.rs`) and TP (`tp_qwen3_5.rs::prefill_with_images_chunked`, `dispatch.rs`, `tp/worker.rs`) thread the grids into `get_rope_index`. Each TP rank recomputes grids from its own deterministic preprocess — no rpc.rs change, single source of truth. The vision tower itself was already grid-general (recent pos-embed interpolation + 2D rotary fix). 
No patch-count cap: pos-embed is interpolated to any grid; `max_pixels` bounds cost (O(patches²) ViT attention + prefill) instead. Tests: smart_resize (aspect/cap/floor/reject), `compute_mrope_index` non-square + two-image + mismatch cases, square-grid regression guard. Non-cuda build + clippy + full workspace tests green; TP load/dispatch paths are cuda-gated → Gitea CUDA type-check. Operator pixel-budget config + remaining doc cleanup follow in C5. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/neuron/src/harness/arch/qwen3_5/mod.rs | 21 +- .../neuron/src/harness/arch/qwen3_5/rope.rs | 124 +++++++---- crates/neuron/src/harness/candle.rs | 155 ++++++++----- .../src/harness/device_worker/dispatch.rs | 29 ++- .../neuron/src/harness/device_worker/jobs.rs | 9 +- crates/neuron/src/harness/preprocess.rs | 207 ++++++++++++++---- crates/neuron/src/harness/tp/tp_qwen3_5.rs | 36 ++- crates/neuron/src/harness/tp/worker.rs | 13 +- 8 files changed, 425 insertions(+), 169 deletions(-) diff --git a/crates/neuron/src/harness/arch/qwen3_5/mod.rs b/crates/neuron/src/harness/arch/qwen3_5/mod.rs index 95b5b64..ed2375c 100644 --- a/crates/neuron/src/harness/arch/qwen3_5/mod.rs +++ b/crates/neuron/src/harness/arch/qwen3_5/mod.rs @@ -404,7 +404,7 @@ impl Qwen3_5Model { } pub fn forward(&mut self, input: &Tensor, offset: usize) -> candle_core::Result { - self.forward_inner(input, offset, None, None) + self.forward_inner(input, offset, None, None, &[]) } /// Forward with image-embedding splice. Stage B of the vision plan. 
@@ -437,8 +437,15 @@ impl Qwen3_5Model { offset: usize, image_embeds: &Tensor, image_token_id: u32, + grids: &[(usize, usize)], ) -> candle_core::Result { - self.forward_inner(input_ids, offset, Some(image_embeds), Some(image_token_id)) + self.forward_inner( + input_ids, + offset, + Some(image_embeds), + Some(image_token_id), + grids, + ) } fn forward_inner( @@ -447,6 +454,7 @@ impl Qwen3_5Model { offset: usize, image_embeds: Option<&Tensor>, image_token_id: Option, + grids: &[(usize, usize)], ) -> candle_core::Result { let (b, l) = input.dims2()?; let mut h = self.embed_tokens.forward(input)?; @@ -483,7 +491,7 @@ impl Qwen3_5Model { h = splice_runs(&h, &img, &positions)?; } - let (text, height, width, delta) = rope::get_rope_index(&ids, tok_id) + let (text, height, width, delta) = rope::get_rope_index(&ids, tok_id, grids) .map_err(|e| candle_core::Error::Msg(format!("get_rope_index: {e}")))?; self.rope_delta = delta; let pos = rope::mrope_position_tensor(&text, &height, &width, &self.device)?; @@ -603,11 +611,12 @@ impl Qwen3_5ForCausalLM { offset: usize, image_embeds: &Tensor, image_token_id: u32, + grids: &[(usize, usize)], ) -> candle_core::Result { let (_, l) = input.dims2()?; - let hidden = self - .base - .forward_with_vision(input, offset, image_embeds, image_token_id)?; + let hidden = + self.base + .forward_with_vision(input, offset, image_embeds, image_token_id, grids)?; hidden.i((.., l - 1.., ..))?.apply(&self.lm_head) } diff --git a/crates/neuron/src/harness/arch/qwen3_5/rope.rs b/crates/neuron/src/harness/arch/qwen3_5/rope.rs index 1d547b1..32bcc4d 100644 --- a/crates/neuron/src/harness/arch/qwen3_5/rope.rs +++ b/crates/neuron/src/harness/arch/qwen3_5/rope.rs @@ -260,28 +260,40 @@ pub(crate) fn mrope_enabled() -> bool { /// off, returns plain sequential identity positions on all three axes /// (`mrope_cos_sin` then reduces exactly to plain RoPE), restoring the /// pre-M-RoPE behaviour without touching the rest of the forward. 
-pub(crate) fn get_rope_index(input_ids: &[u32], image_token_id: u32) -> Result { +pub(crate) fn get_rope_index( + input_ids: &[u32], + image_token_id: u32, + grids: &[(usize, usize)], +) -> Result { if !mrope_enabled() { let seq: Vec = (0..input_ids.len() as i64).collect(); return Ok((seq.clone(), seq.clone(), seq, 0)); } - compute_mrope_index(input_ids, image_token_id) + compute_mrope_index(input_ids, image_token_id, grids) } /// The real interleaved-M-RoPE position-id computation (always active in /// unit tests; gated behind [`get_rope_index`] at runtime). /// -/// Fixed-resolution assumption (Stage C): each image run is a perfect -/// square with `grid_t = 1` (still image) and `grid_h = grid_w = -/// isqrt(run_len)` — 196 → 14×14. Dynamic resolution (#14) would thread -/// real per-image grids instead. -pub(crate) fn compute_mrope_index(input_ids: &[u32], image_token_id: u32) -> Result { +/// `grids` carries the post-merge LM grid `(lm_gh, lm_gw)` for each image +/// run, in prompt order — a run length alone cannot recover its +/// factorisation, so the grids must be passed (#14 dynamic resolution). +/// Each image is a still frame (`grid_t = 1`); its tokens get +/// `[base, base + hh, base + ww]` row-major and the shared counter +/// resumes at `base + max(lm_gh, lm_gw)`. Multi-image is correct because +/// the counter threads across images and interleaved text. 
+pub(crate) fn compute_mrope_index( + input_ids: &[u32], + image_token_id: u32, + grids: &[(usize, usize)], +) -> Result { let n = input_ids.len(); let mut text = Vec::with_capacity(n); let mut height = Vec::with_capacity(n); let mut width = Vec::with_capacity(n); let mut counter: i64 = 0; let mut i = 0; + let mut k = 0; // index into `grids`, one per image run while i < n { if input_ids[i] == image_token_id { let start = i; @@ -289,25 +301,30 @@ pub(crate) fn compute_mrope_index(input_ids: &[u32], image_token_id: u32) -> Res i += 1; } let run = i - start; - let g = run.isqrt(); - if g * g != run { + let (grid_h, grid_w) = *grids.get(k).ok_or_else(|| { + anyhow::anyhow!( + "get_rope_index: image run #{k} (len {run}) has no matching grid \ + ({} grids supplied)", + grids.len() + ) + })?; + k += 1; + if grid_h * grid_w != run { anyhow::bail!( - "get_rope_index: image run length {run} is not a perfect square \ - (fixed-resolution Stage C assumes a square grid; dynamic resolution is #14)" + "get_rope_index: image run #{} length {run} != grid {grid_h}×{grid_w} = {}", + k - 1, + grid_h * grid_w ); } - let (grid_t, grid_h, grid_w) = (1usize, g, g); let base = counter; - for tt in 0..grid_t { - for hh in 0..grid_h { - for ww in 0..grid_w { - text.push(base + tt as i64); - height.push(base + hh as i64); - width.push(base + ww as i64); - } + for hh in 0..grid_h { + for ww in 0..grid_w { + text.push(base); // grid_t = 1 → temporal axis const + height.push(base + hh as i64); + width.push(base + ww as i64); } } - counter = base + grid_t.max(grid_h).max(grid_w) as i64; + counter = base + grid_h.max(grid_w) as i64; } else { text.push(counter); height.push(counter); @@ -316,6 +333,12 @@ pub(crate) fn compute_mrope_index(input_ids: &[u32], image_token_id: u32) -> Res i += 1; } } + if k != grids.len() { + anyhow::bail!( + "get_rope_index: prompt has {k} image run(s) but {} grid(s) were supplied", + grids.len() + ); + } let delta = counter - n as i64; Ok((text, height, width, delta)) 
} @@ -447,7 +470,7 @@ mod tests { #[test] fn get_rope_index_text_only_is_sequential() { - let (t, h, w, delta) = compute_mrope_index(&[1, 2, 3, 4], 99).unwrap(); + let (t, h, w, delta) = compute_mrope_index(&[1, 2, 3, 4], 99, &[]).unwrap(); assert_eq!(t, vec![0, 1, 2, 3]); assert_eq!(h, vec![0, 1, 2, 3]); assert_eq!(w, vec![0, 1, 2, 3]); @@ -456,12 +479,12 @@ mod tests { #[test] fn get_rope_index_text_image_text() { - // [text, image(2x2 run of 4), text]. image_token = 99. + // [text, image(2x2 run of 4), text]. image_token = 99, grid (2,2). let ids = [1u32, 99, 99, 99, 99, 2]; - let (t, h, w, delta) = compute_mrope_index(&ids, 99).unwrap(); - // token 0: text → 0. image base=1, grid 1x2x2: + let (t, h, w, delta) = compute_mrope_index(&ids, 99, &[(2, 2)]).unwrap(); + // token 0: text → 0. image base=1, grid 2x2: // t all = 1; h = base+row = [1,1,2,2]; w = base+col = [1,2,1,2]. - // resume from base + max(1,2,2) = 3. trailing text → 3. + // resume from base + max(2,2) = 3. trailing text → 3. assert_eq!(t, vec![0, 1, 1, 1, 1, 3]); assert_eq!(h, vec![0, 1, 1, 2, 2, 3]); assert_eq!(w, vec![0, 1, 2, 1, 2, 3]); @@ -472,25 +495,52 @@ mod tests { assert_eq!(6 + delta, 4); } + #[test] + fn get_rope_index_nonsquare_single_image() { + // text + image(2 rows × 3 cols = 6 tokens). grid (2,3). + let ids = [1u32, 99, 99, 99, 99, 99, 99]; + let (t, h, w, delta) = compute_mrope_index(&ids, 99, &[(2, 3)]).unwrap(); + // base = 1; row-major h = [0,0,0,1,1,1]+1, w = [0,1,2,0,1,2]+1. + assert_eq!(t, vec![0, 1, 1, 1, 1, 1, 1]); + assert_eq!(h, vec![0, 1, 1, 1, 2, 2, 2]); + assert_eq!(w, vec![0, 1, 2, 3, 1, 2, 3]); + // resume from base + max(2,3) = 4; seq_len 7, counter 4 → delta -3. + assert_eq!(delta, 4 - 7); + } + + #[test] + fn get_rope_index_two_images_different_grids() { + // img(2x2)=4, text, img(1x3)=3. grids [(2,2),(1,3)]. 
+ let ids = [99, 99, 99, 99, 7, 99, 99, 99]; + let (t, h, w, delta) = compute_mrope_index(&ids, 99, &[(2, 2), (1, 3)]).unwrap(); + // img1 base=0 → t=0, h=[0,0,1,1], w=[0,1,0,1]; resume max(2,2)=2. + // text at counter 2. img2 base=3 → t=3, h=[3,3,3], w=[3,4,5]; + // resume 3+max(1,3)=6. + assert_eq!(t, vec![0, 0, 0, 0, 2, 3, 3, 3]); + assert_eq!(h, vec![0, 0, 1, 1, 2, 3, 3, 3]); + assert_eq!(w, vec![0, 1, 0, 1, 2, 3, 4, 5]); + assert_eq!(delta, 6 - 8); + } + #[test] fn get_rope_index_on_by_default() { // With NEURON_MROPE unset (default ON), the runtime path returns - // the real interleaved-M-RoPE positions, so image tokens carry - // their 2D grid coords (height differs from the text counter). - // (NEURON_MROPE=0 would fall back to identity; not asserted here - // since it depends on env.) - let (t, h, w, _delta) = get_rope_index(&[1, 99, 99, 99, 99, 2], 99).unwrap(); - // Same as compute_mrope_index: 2x2 image after one text token. + // the real interleaved-M-RoPE positions. (NEURON_MROPE=0 would fall + // back to identity; not asserted here since it depends on env.) + let (t, h, w, _delta) = get_rope_index(&[1, 99, 99, 99, 99, 2], 99, &[(2, 2)]).unwrap(); assert_eq!(t, vec![0, 1, 1, 1, 1, 3]); assert_eq!(h, vec![0, 1, 1, 2, 2, 3]); assert_eq!(w, vec![0, 1, 2, 1, 2, 3]); } #[test] - fn get_rope_index_rejects_non_square_image_run() { - // 196 is square (14x14) — ok. 195 is not. - assert!(compute_mrope_index(&[99u32; 196], 99).is_ok()); - assert!(compute_mrope_index(&[99u32; 195], 99).is_err()); + fn get_rope_index_grid_mismatches_error() { + // run length != grid product. + assert!(compute_mrope_index(&[99u32; 6], 99, &[(2, 2)]).is_err()); + // too few grids for the number of image runs. + assert!(compute_mrope_index(&[99, 99, 7, 99], 99, &[(1, 2)]).is_err()); + // too many grids. 
+ assert!(compute_mrope_index(&[99, 99], 99, &[(1, 2), (1, 1)]).is_err()); } #[test] @@ -501,7 +551,7 @@ mod tests { let dev = Device::Cpu; let rope = RotaryEmbedding::new(DType::F32, &qwen36_cfg(), &dev).unwrap(); let ids = [1u32, 99, 99, 99, 99]; // text + 2x2 image - let (t, h, w, _d) = compute_mrope_index(&ids, 99).unwrap(); + let (t, h, w, _d) = compute_mrope_index(&ids, 99, &[(2, 2)]).unwrap(); let pos = mrope_position_tensor(&t, &h, &w, &dev).unwrap(); assert_eq!(pos.dims(), &[3, 5]); let (cos, _sin) = rope.mrope_cos_sin(&pos).unwrap(); @@ -518,7 +568,7 @@ mod tests { fn get_rope_index_196_is_14x14() { let mut ids = vec![1u32]; // one text token ids.extend(std::iter::repeat_n(99u32, 196)); - let (t, h, w, _delta) = compute_mrope_index(&ids, 99).unwrap(); + let (t, h, w, _delta) = compute_mrope_index(&ids, 99, &[(14, 14)]).unwrap(); // image base = 1. Last image token (index 196) is grid (h=13,w=13). assert_eq!(*t.last().unwrap(), 1, "grid_t=1 → temporal const at base"); assert_eq!(h[1], 1, "first image row at base"); diff --git a/crates/neuron/src/harness/candle.rs b/crates/neuron/src/harness/candle.rs index 613d8d3..2dc8da7 100644 --- a/crates/neuron/src/harness/candle.rs +++ b/crates/neuron/src/harness/candle.rs @@ -210,13 +210,11 @@ pub struct LoadedModel { /// targets and the worker forward uses it to locate splice /// positions in the LM input embeddings. pub image_token_id: Option, - /// LM-side tokens this model's vision tower emits per image at - /// the Stage B fixed resolution (448×448 → 196 for Qwen3.6). - /// `None` for text-only models. Set at load time so the - /// hot path doesn't recompute it per request. Stage B fixed - /// resolution → constant; dynamic resolution per #14 makes it - /// per-image. - pub lm_tokens_per_image: Option, + /// `patch_size × spatial_merge_size` — divides a resized pixel + /// dimension into LM-grid units. Per-image LM token count is + /// `(h/factor) × (w/factor)` (#14 dynamic resolution). 
`None` for + /// text-only models. Set at load time. + pub image_grid_factor: Option, } impl LoadedModel { @@ -288,9 +286,9 @@ pub struct TpLoadedModel { pub has_vision: bool, /// `<|image_pad|>` token id — same as [`LoadedModel::image_token_id`]. pub image_token_id: Option, - /// LM-side tokens per image at the fixed 448×448 resolution — same - /// as [`LoadedModel::lm_tokens_per_image`]. - pub lm_tokens_per_image: Option, + /// Pixel→LM-grid divisor — same as + /// [`LoadedModel::image_grid_factor`]. + pub image_grid_factor: Option, } #[cfg(feature = "cuda")] @@ -394,10 +392,11 @@ impl ModelArch { offset: usize, image_embeds: &Tensor, image_token_id: u32, + grids: &[(usize, usize)], ) -> Result { let raw = match self { ModelArch::Qwen3_5Dense(m) => { - m.forward_with_vision(input, offset, image_embeds, image_token_id)? + m.forward_with_vision(input, offset, image_embeds, image_token_id, grids)? } other => anyhow::bail!( "forward_with_vision: architecture {} has no vision tower", @@ -407,6 +406,20 @@ impl ModelArch { squeeze_to_vocab(&raw) } + /// `patch_size × spatial_merge_size` for the loaded vision tower — + /// divides a resized pixel dim into LM-grid units (an image of + /// resized `(h, w)` yields the LM grid `(h/factor, w/factor)`). + /// `None` for architectures/checkpoints without a vision tower. + pub fn vision_grid_factor(&self) -> Option { + match self { + ModelArch::Qwen3_5Dense(m) => m.vision().map(|v| { + let c = v.config(); + c.patch_size * c.spatial_merge_size + }), + _ => None, + } + } + /// Encode a preprocessed image into LM-side token embeddings via /// the loaded vision tower. Stage A5. 
/// @@ -1683,11 +1696,11 @@ impl CandleHarness { .ok_or_else(|| InferenceError::VisionUnsupported { model_id: request.model.clone(), })?; - let patches_per_image = loaded - .lm_tokens_per_image - .ok_or_else(|| InferenceError::VisionUnsupported { + let factor = loaded.image_grid_factor.ok_or_else(|| { + InferenceError::VisionUnsupported { model_id: request.model.clone(), - })?; + } + })?; let profile = super::preprocess::PreprocessProfile::qwen3_6(); let images = extract_images_from_request(&request, &profile).map_err(|e| { InferenceError::Other(anyhow::anyhow!("extract_images: {e}")) @@ -1699,7 +1712,12 @@ impl CandleHarness { "request has image content but extractor produced zero images" ))); } - let per_image_counts: Vec = vec![patches_per_image; images.len()]; + // Per-image LM token count from each image's resized grid + // (#14 dynamic resolution; was a constant 196). + let per_image_counts: Vec = images + .iter() + .map(|im| (im.h / factor) * (im.w / factor)) + .collect(); prompt_tokens = expand_image_pad_tokens(&prompt_tokens, image_token_id, &per_image_counts) .map_err(InferenceError::Other)?; @@ -2059,11 +2077,12 @@ impl CandleHarness { .ok_or_else(|| InferenceError::VisionUnsupported { model_id: request.model.clone(), })?; - let patches_per_image = loaded.lm_tokens_per_image.ok_or_else(|| { - InferenceError::VisionUnsupported { - model_id: request.model.clone(), - } - })?; + let factor = + loaded + .image_grid_factor + .ok_or_else(|| InferenceError::VisionUnsupported { + model_id: request.model.clone(), + })?; let profile = super::preprocess::PreprocessProfile::qwen3_6(); let images = extract_images_from_request(&request, &profile) .map_err(|e| InferenceError::Other(anyhow::anyhow!("extract_images: {e}")))?; @@ -2072,7 +2091,11 @@ impl CandleHarness { "request has image content but extractor produced zero images" ))); } - let per_image_counts: Vec = vec![patches_per_image; images.len()]; + // Per-image LM token count from each image's resized grid (#14). 
+ let per_image_counts: Vec = images + .iter() + .map(|im| (im.h / factor) * (im.w / factor)) + .collect(); prompt_tokens = expand_image_pad_tokens(&prompt_tokens, image_token_id, &per_image_counts) .map_err(InferenceError::Other)?; @@ -2526,7 +2549,7 @@ impl Harness for CandleHarness { chat_template, has_vision: vision_meta.has_vision, image_token_id: vision_meta.image_token_id, - lm_tokens_per_image: vision_meta.lm_tokens_per_image, + image_grid_factor: vision_meta.image_grid_factor, }); let mut models = self.models.write().await; @@ -2742,7 +2765,7 @@ impl CandleHarness { tracing::info!( model = %spec.model_id, image_token_id = ?vision_meta.image_token_id, - lm_tokens_per_image = ?vision_meta.lm_tokens_per_image, + image_grid_factor = ?vision_meta.image_grid_factor, "TP load: vision tower present, advertising vision capability" ); } @@ -2764,7 +2787,7 @@ impl CandleHarness { chat_template, has_vision: vision_meta.has_vision, image_token_id: vision_meta.image_token_id, - lm_tokens_per_image: vision_meta.lm_tokens_per_image, + image_grid_factor: vision_meta.image_grid_factor, }); let mut models = self.models.write().await; @@ -2938,18 +2961,32 @@ impl CandleHarness { .ok_or_else(|| InferenceError::VisionUnsupported { model_id: request.model.clone(), })?; - let patches_per_image = - tp.lm_tokens_per_image - .ok_or_else(|| InferenceError::VisionUnsupported { - model_id: request.model.clone(), - })?; + let factor = tp + .image_grid_factor + .ok_or_else(|| InferenceError::VisionUnsupported { + model_id: request.model.clone(), + })?; let data_uris = extract_image_data_uris(&request); if data_uris.is_empty() { return Err(InferenceError::Other(anyhow::anyhow!( "request has image content but extractor produced zero data URIs" ))); } - let per_image_counts: Vec = vec![patches_per_image; data_uris.len()]; + // Per-image LM token count from each image's resized grid (#14). 
+ // Full decode + smart_resize dims only (no resample/normalise); the + // workers re-derive the same dims when they preprocess for the tower. + let profile = super::preprocess::PreprocessProfile::qwen3_6(); + let per_image_counts: Vec = data_uris + .iter() + .enumerate() + .map(|(i, uri)| { + let (h, w) = + super::preprocess::resized_dims_for_uri(uri, &profile).map_err(|e| { + InferenceError::Other(anyhow::anyhow!("resized_dims image #{i}: {e}")) + })?; + Ok::((h as usize / factor) * (w as usize / factor)) + }) + .collect::, _>>()?; prompt_tokens = expand_image_pad_tokens(&prompt_tokens, image_token_id, &per_image_counts) .map_err(InferenceError::Other)?; @@ -3457,18 +3494,30 @@ async fn chat_completion_tp_inner( .ok_or_else(|| InferenceError::VisionUnsupported { model_id: request.model.clone(), })?; - let patches_per_image = - tp.lm_tokens_per_image - .ok_or_else(|| InferenceError::VisionUnsupported { - model_id: request.model.clone(), - })?; + let factor = tp + .image_grid_factor + .ok_or_else(|| InferenceError::VisionUnsupported { + model_id: request.model.clone(), + })?; let data_uris = extract_image_data_uris(&request); if data_uris.is_empty() { return Err(InferenceError::Other(anyhow::anyhow!( "request has image content but extractor produced zero data URIs" ))); } - let per_image_counts: Vec = vec![patches_per_image; data_uris.len()]; + // Per-image LM token count from each image's resized grid (#14). 
+ let profile = super::preprocess::PreprocessProfile::qwen3_6(); + let per_image_counts: Vec = data_uris + .iter() + .enumerate() + .map(|(i, uri)| { + let (h, w) = + super::preprocess::resized_dims_for_uri(uri, &profile).map_err(|e| { + InferenceError::Other(anyhow::anyhow!("resized_dims image #{i}: {e}")) + })?; + Ok::((h as usize / factor) * (w as usize / factor)) + }) + .collect::, _>>()?; prompt_tokens = expand_image_pad_tokens(&prompt_tokens, image_token_id, &per_image_counts) .map_err(InferenceError::Other)?; Some((data_uris, image_token_id)) @@ -3917,10 +3966,12 @@ fn build_prompt_for_request( struct VisionMeta { has_vision: bool, image_token_id: Option, - /// LM-side tokens this model's vision tower emits per image at - /// the Stage B fixed `PreprocessProfile::qwen3_6()` resolution - /// (448×448). Equal to `(H/patch_size/spatial_merge_size)²`. - lm_tokens_per_image: Option, + /// `patch_size × spatial_merge_size` — the divisor that turns a + /// resized pixel dimension into an LM-grid dimension. An image of + /// resized `(h, w)` emits `(h/factor) × (w/factor)` LM tokens (#14 + /// dynamic resolution; was a constant 196 at the old fixed 448²). + /// `None` for text-only models. + image_grid_factor: Option, } impl VisionMeta { @@ -3949,22 +4000,18 @@ impl VisionMeta { .get("image_token_id") .and_then(|x| x.as_u64()) .map(|n| n as u32); - // Compute LM tokens per image at the Stage B fixed resolution - // (PreprocessProfile::qwen3_6() → 448×448). One LM token per - // spatial-merge group of patches. - let target_h = super::preprocess::PreprocessProfile::qwen3_6().target_height as usize; - let target_w = super::preprocess::PreprocessProfile::qwen3_6().target_width as usize; - let lm_tokens_per_image = if patch_size > 0 && spatial_merge_size > 0 { - let gh = target_h / patch_size / spatial_merge_size; - let gw = target_w / patch_size / spatial_merge_size; - Some(gh * gw) + // The pixel→LM-grid divisor. 
An image resized to (h, w) emits + // (h/factor) × (w/factor) LM tokens — computed per image at + // request time now that resolution is dynamic (#14). + let image_grid_factor = if patch_size > 0 && spatial_merge_size > 0 { + Some(patch_size * spatial_merge_size) } else { None }; Self { has_vision: true, image_token_id, - lm_tokens_per_image, + image_grid_factor, } } } @@ -4011,13 +4058,13 @@ fn extract_images_from_request( .and_then(|v| v.get("url")) .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("image_url part missing url field"))?; - let pixels = super::preprocess::preprocess_data_uri(url, profile) + let (pixels, h, w) = super::preprocess::preprocess_data_uri(url, profile) .with_context(|| format!("preprocess image #{}", out.len()))?; out.push(super::device_worker::jobs::ImageInput { pixels, c: 3, - h: profile.target_height as usize, - w: profile.target_width as usize, + h: h as usize, + w: w as usize, }); } } diff --git a/crates/neuron/src/harness/device_worker/dispatch.rs b/crates/neuron/src/harness/device_worker/dispatch.rs index 62c60c2..6df69ef 100644 --- a/crates/neuron/src/harness/device_worker/dispatch.rs +++ b/crates/neuron/src/harness/device_worker/dispatch.rs @@ -779,19 +779,17 @@ fn tp_forward_logits_with_images( anyhow::bail!("TpForwardLogitsWithImages dispatched with zero images"); } - // Preprocess every image into a device-resident (C, H, W) tensor. - // Same fixed-resolution profile + decode path the subprocess workers - // run, so the encoded embeddings match across ranks bit-for-bit. + // Preprocess every image into a device-resident (C, H, W) tensor at + // its native-aspect resized dims (#14). Same `smart_resize` + decode + // path the subprocess workers run, so the encoded embeddings — and + // the per-image grids derived from these dims — match across ranks + // bit-for-bit. 
let profile = PreprocessProfile::qwen3_6(); - let (h, w) = ( - profile.target_height as usize, - profile.target_width as usize, - ); let mut pixels: Vec = Vec::with_capacity(image_data_uris.len()); for (idx, uri) in image_data_uris.iter().enumerate() { - let px = preprocess_data_uri(uri, &profile) + let (px, h, w) = preprocess_data_uri(uri, &profile) .with_context(|| format!("preprocess image[{idx}] (TP leader)"))?; - let t = Tensor::from_vec(px, (3, h, w), &state.device)?; + let t = Tensor::from_vec(px, (3, h as usize, w as usize), &state.device)?; pixels.push(t); } @@ -877,9 +875,17 @@ fn forward_logits_with_images( anyhow::anyhow!("ForwardLogitsWithImages: no model for handle {}", handle.0) })?; + // pixel→LM-grid divisor (patch×merge) for this tower; each image's + // LM grid is (h/factor, w/factor) (#14 dynamic resolution). + let factor = arch.vision_grid_factor().ok_or_else(|| { + anyhow::anyhow!("ForwardLogitsWithImages: loaded model has no vision tower") + })?; + // Encode every image on the worker's device, collecting per-image - // post-merger embeddings as device-resident tensors. + // post-merger embeddings as device-resident tensors plus their LM + // grids (for the interleaved-M-RoPE position ids). 
let mut per_image: Vec = Vec::with_capacity(images.len()); + let mut grids: Vec<(usize, usize)> = Vec::with_capacity(images.len()); for (idx, img) in images.into_iter().enumerate() { anyhow::ensure!( img.pixels.len() == img.c * img.h * img.w, @@ -889,6 +895,7 @@ fn forward_logits_with_images( img.h, img.w, ); + grids.push((img.h / factor, img.w / factor)); let image = Tensor::from_vec(img.pixels, (img.c, img.h, img.w), &state.device)?; let embed = arch .encode_image(&image) @@ -901,7 +908,7 @@ fn forward_logits_with_images( let image_embeds = Tensor::cat(&per_image.iter().collect::>(), 0)?; let input = Tensor::new(tokens, &state.device)?.unsqueeze(0)?; - let logits = arch.forward_with_vision(&input, offset, &image_embeds, image_token_id)?; + let logits = arch.forward_with_vision(&input, offset, &image_embeds, image_token_id, &grids)?; let values = logits .to_dtype(DType::F32)? .flatten_all()? diff --git a/crates/neuron/src/harness/device_worker/jobs.rs b/crates/neuron/src/harness/device_worker/jobs.rs index fc3587a..d53826e 100644 --- a/crates/neuron/src/harness/device_worker/jobs.rs +++ b/crates/neuron/src/harness/device_worker/jobs.rs @@ -36,8 +36,13 @@ pub struct TpHandle(pub u64); /// `Clone` so the vision-aware dispatch in `chat_completion` can /// match `&vision_route` (carrying borrowed images) and still hand /// owned `Vec` to the worker job. The clone cost is one -/// pixel-buffer memcpy per image — fine at fixed-resolution sizes -/// (3 × 448 × 448 × 4 bytes = ~2.4 MiB per image). +/// pixel-buffer memcpy per image — now variable with dynamic resolution +/// (#14): `3 × h × w × 4` bytes, up to 12 MiB at the default 1024² +/// `max_pixels` budget. +/// +/// `h`/`w` are the **resized** dims (factor-aligned), so the per-image LM +/// grid is `(h/factor, w/factor)` — derived downstream for the splice +/// and the interleaved-M-RoPE position ids. 
#[derive(Clone)] pub struct ImageInput { pub pixels: Vec, diff --git a/crates/neuron/src/harness/preprocess.rs b/crates/neuron/src/harness/preprocess.rs index 0356f4d..72a0f6f 100644 --- a/crates/neuron/src/harness/preprocess.rs +++ b/crates/neuron/src/harness/preprocess.rs @@ -2,11 +2,11 @@ //! //! Decodes `data:image/...;base64,...` URIs from OpenAI-style //! `image_url` content parts into the patch tensors a candle vision -//! tower expects. Stage A ships **fixed resolution** — every image -//! is resized to the same target dimensions (default 448×448 for -//! Qwen3.6, configurable per-call) so the patch count is constant -//! per image. Variable resolution per [Qwen2VL convention] is tracked -//! as issue #14. +//! tower expects. Resolution is **dynamic** (#14): each image is +//! resized to its native aspect via Qwen `smart_resize` — a +//! factor-aligned `(h, w)` whose pixel count lands in the profile's +//! `[min_pixels, max_pixels]` budget — so the LM token count varies per +//! image (`(h/factor) × (w/factor)`). //! //! Spec reference: `doc/vision-qwen3_6-spec.md` — preprocessor //! section. @@ -21,7 +21,7 @@ //! Pipeline (per image): //! 1. data: URI → base64 decode → bytes //! 2. bytes → image::DynamicImage (PNG/JPEG/WebP/etc) -//! 3. resize_exact to target H×W (pixel space) +//! 3. smart_resize to a native-aspect, factor-aligned H×W (pixel space) //! 4. RGB→f32, normalise per mean/std //! 5. layout to (C, H, W) tensor //! @@ -34,39 +34,93 @@ use base64::Engine; use image::DynamicImage; use image::imageops::FilterType; -/// Preprocessing target. Captures the resize dimensions and the -/// channel-wise normalisation constants from the model's -/// `preprocessor_config.json`. Stage A ships a single `qwen3_6()` -/// constructor for fixed-resolution Qwen3.6 preprocessing; other -/// models can ship their own profile when added. +/// Preprocessing target. 
Captures the resize policy (Qwen `smart_resize` +/// factor + pixel budget) and the channel-wise normalisation constants +/// from the model's `preprocessor_config.json`. Images are resized to +/// their **native aspect** — a factor-aligned `(h, w)` whose pixel count +/// lands in `[min_pixels, max_pixels]` — not a fixed square (#14). #[derive(Debug, Clone)] pub struct PreprocessProfile { - pub target_height: u32, - pub target_width: u32, + /// Both output dims are multiples of this. For Qwen3.6 it is + /// `patch_size(16) × spatial_merge_size(2) = 32`, so the post-merge + /// LM grid is exactly `(h/factor, w/factor)`. + pub factor: u32, + /// Lower pixel bound — tiny images are upscaled to at least this. + pub min_pixels: u32, + /// Upper pixel bound — large images are downscaled to at most this. + /// Caps per-image LM tokens (`max_pixels / factor²`) and the + /// O(patches²) ViT attention cost. + pub max_pixels: u32, pub image_mean: [f32; 3], pub image_std: [f32; 3], } impl PreprocessProfile { - /// Stage A profile for Qwen3.6. Resize to 448×448, normalise to - /// `[-1, 1]` via mean=std=0.5. Fits within the model's - /// `num_position_embeddings=2304` budget at 28×28 = 784 patches - /// before merging. + /// Profile for Qwen3.6. Native-aspect `smart_resize` (factor 32), + /// normalise to `[-1, 1]` via mean=std=0.5. Pixel budget defaults: + /// `min = 256² = 65536` (→ 8×8 = 64 LM tokens) and + /// `max = 1024² = 1048576` (→ 32×32 = 1024 LM tokens) — generous for + /// documents/OCR, bounded for serving on 2×RTX5090. (Operator + /// override lands with the `[harness.candle.vision]` config in #14 C5.) pub fn qwen3_6() -> Self { Self { - target_height: 448, - target_width: 448, + factor: 32, + min_pixels: 65_536, + max_pixels: 1_048_576, image_mean: [0.5, 0.5, 0.5], image_std: [0.5, 0.5, 0.5], } } - /// Per-channel CHW tensor length: 3 * H * W. 
- pub fn pixels_chw(&self) -> usize { - 3 * (self.target_height as usize) * (self.target_width as usize) + /// The factor-aligned `(h, w)` this profile would resize a source + /// `src_h × src_w` image to. Pure integer policy — no pixel work. + pub fn resized_dims(&self, src_h: u32, src_w: u32) -> Result<(u32, u32)> { + smart_resize(src_h, src_w, self.factor, self.min_pixels, self.max_pixels) } } +/// Qwen `smart_resize`: the smallest `factor`-aligned `(h_bar, w_bar)` +/// that preserves aspect ratio as closely as possible while keeping the +/// pixel count within `[min_pixels, max_pixels]`. Direct port of the +/// canonical Qwen2-VL / Qwen3-VL image-processor function (so neuron's +/// grid matches what the model was trained on). +/// +/// Returns `(height, width)`. Errors if the aspect ratio exceeds 200:1 +/// (degenerate input — a 1-pixel-tall strip), matching upstream. +pub fn smart_resize( + height: u32, + width: u32, + factor: u32, + min_pixels: u32, + max_pixels: u32, +) -> Result<(u32, u32)> { + let h = height.max(1) as f64; + let w = width.max(1) as f64; + let ratio = h.max(w) / h.min(w); + if ratio > 200.0 { + anyhow::bail!( + "image aspect ratio {ratio:.1}:1 exceeds the 200:1 limit ({height}×{width}); \ + refusing to resize" + ); + } + let f = factor as f64; + let (minp, maxp) = (min_pixels as f64, max_pixels as f64); + // round-to-nearest-factor (may be 0 for sub-factor inputs; the + // min-pixels branch below grows it back up). + let mut h_bar = (h / f).round() * f; + let mut w_bar = (w / f).round() * f; + if h_bar * w_bar > maxp { + let beta = (h * w / maxp).sqrt(); + h_bar = f.max((h / beta / f).floor() * f); + w_bar = f.max((w / beta / f).floor() * f); + } else if h_bar * w_bar < minp { + let beta = (minp / (h * w)).sqrt(); + h_bar = (h * beta / f).ceil() * f; + w_bar = (w * beta / f).ceil() * f; + } + Ok((h_bar as u32, w_bar as u32)) +} + /// Decode a `data:image/...;base64,...` URI into an in-memory image. 
/// /// Accepts the OpenAI Chat Completions `image_url` shape — a string @@ -106,16 +160,13 @@ pub fn decode_data_uri(uri: &str) -> Result<DynamicImage> { /// faster on CPU. Quality difference is marginal for downstream /// vision-encoder consumption. The numerical-validation issue (#15) /// will quantify any discrepancy. -pub fn preprocess(img: &DynamicImage, profile: &PreprocessProfile) -> Vec<f32> { +pub fn preprocess(img: &DynamicImage, profile: &PreprocessProfile) -> Result<(Vec<f32>, u32, u32)> { + let (h_bar, w_bar) = profile.resized_dims(img.height(), img.width())?; let rgb = img - .resize_exact( - profile.target_width, - profile.target_height, - FilterType::Triangle, - ) + .resize_exact(w_bar, h_bar, FilterType::Triangle) .to_rgb8(); - let h = profile.target_height as usize; - let w = profile.target_width as usize; + let h = h_bar as usize; + let w = w_bar as usize; let mut out = vec![0.0_f32; 3 * h * w]; // Row-major (C, H, W). Candle's Conv2d expects NCHW, so this is // the natural layout — the caller stacks `n` of these along the @@ -131,16 +182,27 @@ pub fn preprocess(img: &DynamicImage, profile: &PreprocessProfile) -> Vec<f32> { } } } - out + Ok((out, h_bar, w_bar)) } -/// Combined helper: decode + preprocess in one call. Most call -/// sites just want the final tensor; the two-step path exists for -/// callers (tests, future video preprocessing) that need the +/// Combined helper: decode + preprocess in one call. Returns the +/// `(3, h, w)` row-major pixels plus the resized `(h, w)` — the caller +/// needs the dims to build the tensor and to derive the LM token grid +/// `(h/factor, w/factor)`. Most call sites use this; the two-step path +/// exists for callers (tests, future video preprocessing) that need the /// intermediate `DynamicImage`. 
-pub fn preprocess_data_uri(uri: &str, profile: &PreprocessProfile) -> Result<Vec<f32>> { +pub fn preprocess_data_uri(uri: &str, profile: &PreprocessProfile) -> Result<(Vec<f32>, u32, u32)> { let img = decode_data_uri(uri)?; - Ok(preprocess(&img, profile)) + preprocess(&img, profile) +} + +/// Resized `(h, w)` for a data-URI image **without** running the pixel +/// resize or normalisation — image decode + `smart_resize` dims only. Lets a +/// caller that just needs the LM token count (e.g. the TP leader expanding +/// the prompt) avoid materialising the full pixel tensor twice. +pub fn resized_dims_for_uri(uri: &str, profile: &PreprocessProfile) -> Result<(u32, u32)> { + let img = decode_data_uri(uri)?; + profile.resized_dims(img.height(), img.width()) } #[cfg(test)] @@ -205,13 +267,17 @@ mod tests { // decoding so this test isolates the resize+normalise path. let img: ImageBuffer<Rgb<u8>, Vec<u8>> = ImageBuffer::from_pixel(2, 2, Rgb([255, 0, 0])); let dyn_img = DynamicImage::ImageRgb8(img); - let out = preprocess(&dyn_img, &profile); + let (out, h_bar, w_bar) = preprocess(&dyn_img, &profile).expect("preprocess"); - assert_eq!(out.len(), profile.pixels_chw()); + let h = h_bar as usize; + let w = w_bar as usize; + assert_eq!(out.len(), 3 * h * w); + // Dims are factor-aligned and at least the min-pixel floor. 
+ assert_eq!(h_bar % profile.factor, 0); + assert_eq!(w_bar % profile.factor, 0); + assert!(h * w >= profile.min_pixels as usize); // After mean=0.5, std=0.5: red channel (255/255=1.0) → (1.0 - 0.5)/0.5 = 1.0 // green/blue (0.0) → (0.0 - 0.5)/0.5 = -1.0 - let h = profile.target_height as usize; - let w = profile.target_width as usize; assert!( (out[0] - 1.0).abs() < 1e-5, "R[0] should be 1.0, got {}", @@ -229,9 +295,12 @@ mod tests { #[test] fn preprocess_data_uri_end_to_end() { let profile = PreprocessProfile::qwen3_6(); - let out = preprocess_data_uri(&red_png_uri(), &profile).expect("e2e preprocess"); - assert_eq!(out.len(), profile.pixels_chw()); + let (out, h, w) = preprocess_data_uri(&red_png_uri(), &profile).expect("e2e preprocess"); + assert_eq!(out.len(), 3 * h as usize * w as usize); assert!(out.iter().all(|v| v.is_finite())); + // resized_dims_for_uri agrees with the full preprocess. + let (h2, w2) = resized_dims_for_uri(&red_png_uri(), &profile).expect("dims"); + assert_eq!((h, w), (h2, w2)); } #[test] @@ -240,10 +309,10 @@ mod tests { // 1x1 grayscale = 200 → after conversion to RGB, all three // channels equal 200, normalised → (200/255 - 0.5)/0.5 ≈ 0.569 let gray = DynamicImage::ImageLuma8(ImageBuffer::from_pixel(1, 1, image::Luma([200]))); - let out = preprocess(&gray, &profile); + let (out, h_bar, w_bar) = preprocess(&gray, &profile).expect("preprocess"); let expected = ((200.0 / 255.0) - 0.5) / 0.5; - let h = profile.target_height as usize; - let w = profile.target_width as usize; + let h = h_bar as usize; + let w = w_bar as usize; for c in 0..3 { let v = out[c * h * w]; assert!( @@ -252,4 +321,52 @@ mod tests { ); } } + + #[test] + fn smart_resize_keeps_factor_aligned_square_in_budget() { + // 448×448 sits inside [65536, 1048576] and is factor-aligned → + // unchanged. (Regression guard for the old fixed-res sweet spot.) 
+ let (h, w) = smart_resize(448, 448, 32, 65_536, 1_048_576).unwrap(); + assert_eq!((h, w), (448, 448)); + } + + #[test] + fn smart_resize_preserves_aspect_and_caps_at_max() { + // 3000×4000 (landscape) → downscaled under max_pixels, aspect kept. + let (h, w) = smart_resize(3000, 4000, 32, 65_536, 1_048_576).unwrap(); + assert_eq!(h % 32, 0); + assert_eq!(w % 32, 0); + assert!( + (h as u64) * (w as u64) <= 1_048_576, + "must respect max_pixels" + ); + assert!(w > h, "landscape orientation preserved"); + // aspect ≈ 4000/3000 = 1.333; allow a factor-rounding tolerance. + let ar = w as f64 / h as f64; + assert!((ar - 4.0 / 3.0).abs() < 0.15, "aspect ~4:3, got {ar:.3}"); + } + + #[test] + fn smart_resize_floors_tiny_image_at_min() { + // 16×16 → upscaled to at least min_pixels, factor-aligned. + let (h, w) = smart_resize(16, 16, 32, 65_536, 1_048_576).unwrap(); + assert_eq!(h % 32, 0); + assert_eq!(w % 32, 0); + assert!((h as u64) * (w as u64) >= 65_536, "must respect min_pixels"); + } + + #[test] + fn smart_resize_tall_nonsquare_stays_nonsquare() { + // A tall screenshot keeps portrait orientation. + let (h, w) = smart_resize(2000, 500, 32, 65_536, 1_048_576).unwrap(); + assert!(h > w, "portrait orientation preserved"); + assert_eq!(h % 32, 0); + assert_eq!(w % 32, 0); + } + + #[test] + fn smart_resize_rejects_extreme_aspect() { + let err = smart_resize(1, 500, 32, 65_536, 1_048_576).unwrap_err(); + assert!(format!("{err:#}").contains("200:1")); + } } diff --git a/crates/neuron/src/harness/tp/tp_qwen3_5.rs b/crates/neuron/src/harness/tp/tp_qwen3_5.rs index afe2713..654d2ac 100644 --- a/crates/neuron/src/harness/tp/tp_qwen3_5.rs +++ b/crates/neuron/src/harness/tp/tp_qwen3_5.rs @@ -1288,15 +1288,39 @@ impl TpQwen3_5ForCausalLM { let device = self.device().clone(); let image_embeds = self.encode_images_concat(image_pixels)?; + // Each image's LM grid (lm_gh, lm_gw) = (h/factor, w/factor), + // factor = patch×merge. 
Recomputed per rank from this rank's own + // pixel tensors — deterministic, so every rank's grids (and hence + // M-RoPE positions) match without crossing the RPC (#14). + let factor = self + .vision + .as_ref() + .map(|v| { + let c = v.config(); + c.patch_size * c.spatial_merge_size + }) + .ok_or_else(|| { + candle_core::Error::Msg( + "prefill_with_images_chunked: loaded without a vision tower".into(), + ) + })?; + let grids: Vec<(usize, usize)> = image_pixels + .iter() + .map(|t| { + let (_, h, w) = t.dims3()?; + Ok::<(usize, usize), candle_core::Error>((h / factor, w / factor)) + }) + .collect::<candle_core::Result<Vec<_>>>()?; + // Interleaved-M-RoPE 3D position ids for the whole prompt, // computed once and sliced per chunk so every rank assigns image - // tokens their 14×14 grid coordinates (and text after the image - // resumes from the compressed counter). `rope_delta` is stored on - // the base model for the decode that follows this prefill. Every - // chunk — text or image — uses the M-RoPE slice, because the image - // shifts the positions of the text around it. + // tokens their grid coordinates (and text after an image resumes + // from the compressed counter). `rope_delta` is stored on the base + // model for the decode that follows this prefill. Every chunk — + // text or image — uses the M-RoPE slice, because each image shifts + // the positions of the text around it. 
let (text, height, width, delta) = - crate::harness::arch::qwen3_5::rope::get_rope_index(tokens, image_token_id) + crate::harness::arch::qwen3_5::rope::get_rope_index(tokens, image_token_id, &grids) .map_err(|e| candle_core::Error::Msg(format!("get_rope_index: {e}")))?; self.base.set_rope_delta(delta); let full_pos = crate::harness::arch::qwen3_5::rope::mrope_position_tensor( diff --git a/crates/neuron/src/harness/tp/worker.rs b/crates/neuron/src/harness/tp/worker.rs index 7dd34a1..d1e1415 100644 --- a/crates/neuron/src/harness/tp/worker.rs +++ b/crates/neuron/src/harness/tp/worker.rs @@ -494,16 +494,13 @@ impl WorkerState { let device = model.device().clone(); // Preprocess each image identically to the leader so the encoded - // embeddings — and thus the spliced hidden state — match across - // ranks. Fixed 448×448 profile. + // embeddings — and thus the spliced hidden state and per-image + // grids — match across ranks. Native-aspect `smart_resize` (#14); + // deterministic, so each rank derives the same dims. let profile = PreprocessProfile::qwen3_6(); - let (h, w) = ( - profile.target_height as usize, - profile.target_width as usize, - ); let mut pixels: Vec<Tensor> = Vec::with_capacity(image_data_uris.len()); for (idx, uri) in image_data_uris.iter().enumerate() { - let px = match preprocess_data_uri(uri, &profile) { + let (px, h, w) = match preprocess_data_uri(uri, &profile) { Ok(p) => p, Err(e) => { return WorkerResponse::Error { @@ -512,7 +509,7 @@ impl WorkerState { }; } }; - match Tensor::from_vec(px, (3, h, w), &device) { + match Tensor::from_vec(px, (3, h as usize, w as usize), &device) { Ok(t) => pixels.push(t), Err(e) => { return WorkerResponse::Error {