diff --git a/crates/neuron/src/harness/arch/qwen3_5/mod.rs b/crates/neuron/src/harness/arch/qwen3_5/mod.rs index ed2375c..e6f0da4 100644 --- a/crates/neuron/src/harness/arch/qwen3_5/mod.rs +++ b/crates/neuron/src/harness/arch/qwen3_5/mod.rs @@ -422,15 +422,10 @@ impl Qwen3_5Model { /// /// The splice replaces the LM's text-side embedding at each /// `image_token_id` position with the corresponding row from - /// `image_embeds`. After the splice the decoder runs unchanged. - /// - /// **MRoPE gap.** Qwen3.6's `rope_parameters` declares MRoPE - /// (interleaved text/height/width axes); Stage B applies plain - /// text-position RoPE to image tokens. The model still attends - /// to image content but loses spatial structure that MRoPE-aware - /// position encoding would preserve. Tracked under issue #15 - /// (numerical validation) — quality benchmark from Stage D should - /// surface the impact, and the fix lives in `rope::RotaryEmbedding`. + /// `image_embeds`. After the splice the decoder runs the interleaved + /// M-RoPE path: `grids` carries each image's post-merge LM grid + /// `(lm_gh, lm_gw)` so `get_rope_index` assigns image tokens their 2D + /// coordinates (dynamic resolution, #14). pub fn forward_with_vision( &mut self, input_ids: &Tensor, @@ -461,7 +456,7 @@ impl Qwen3_5Model { // Vision path: splice image embeddings at `image_token_id` // positions and build interleaved M-RoPE cos/sin so image tokens - // carry their 14×14 grid coordinates. Text / decode skip the + // carry their 2D (lm_gh × lm_gw) grid coordinates. Text / decode skip the // device→host id copy entirely and take the plain-RoPE fast path // — bit-for-bit the pre-M-RoPE behaviour when `rope_delta == 0`. 
let (cos, sin) = if let (Some(img), Some(tok_id)) = (image_embeds, image_token_id) { diff --git a/crates/neuron/src/harness/preprocess.rs b/crates/neuron/src/harness/preprocess.rs index 72a0f6f..02409cf 100644 --- a/crates/neuron/src/harness/preprocess.rs +++ b/crates/neuron/src/harness/preprocess.rs @@ -55,18 +55,36 @@ pub struct PreprocessProfile { pub image_std: [f32; 3], } +/// Default pixel budget for Qwen3.6 (`256² … 1024²` → 64 … 1024 LM +/// tokens/image). Generous for documents/OCR, bounded for serving on +/// 2×RTX5090. Operators tune with `NEURON_VISION_MIN_PIXELS` / +/// `NEURON_VISION_MAX_PIXELS` (matching the other `NEURON_VISION_*` knobs). +const QWEN3_6_MIN_PIXELS: u32 = 65_536; +const QWEN3_6_MAX_PIXELS: u32 = 1_048_576; + +fn env_pixels(name: &str, default: u32) -> u32 { + std::env::var(name) + .ok() + .and_then(|v| v.trim().parse::<u32>().ok()) + .unwrap_or(default) +} + impl PreprocessProfile { /// Profile for Qwen3.6. Native-aspect `smart_resize` (factor 32), - /// normalise to `[-1, 1]` via mean=std=0.5. Pixel budget defaults: - /// `min = 256² = 65536` (→ 8×8 = 64 LM tokens) and - /// `max = 1024² = 1048576` (→ 32×32 = 1024 LM tokens) — generous for - /// documents/OCR, bounded for serving on 2×RTX5090. (Operator - /// override lands with the `[harness.candle.vision]` config in #14 C5.) + /// normalise to `[-1, 1]` via mean=std=0.5. Pixel budget defaults to + /// [`QWEN3_6_MIN_PIXELS`]…[`QWEN3_6_MAX_PIXELS`], overridable via the + /// `NEURON_VISION_MIN_PIXELS` / `NEURON_VISION_MAX_PIXELS` env vars. + /// The budget is clamped sane: `min ≥ factor²` (at least one LM token) + /// and `max ≥ min`. 
pub fn qwen3_6() -> Self { + let factor = 32u32; + let f2 = factor * factor; + let min_pixels = env_pixels("NEURON_VISION_MIN_PIXELS", QWEN3_6_MIN_PIXELS).max(f2); + let max_pixels = env_pixels("NEURON_VISION_MAX_PIXELS", QWEN3_6_MAX_PIXELS).max(min_pixels); Self { - factor: 32, - min_pixels: 65_536, - max_pixels: 1_048_576, + factor, + min_pixels, + max_pixels, image_mean: [0.5, 0.5, 0.5], image_std: [0.5, 0.5, 0.5], } @@ -369,4 +387,18 @@ mod tests { let err = smart_resize(1, 500, 32, 65_536, 1_048_576).unwrap_err(); assert!(format!("{err:#}").contains("200:1")); } + + #[test] + fn qwen3_6_default_budget_bounds_lm_tokens() { + // A huge source image caps at max_pixels → the per-image LM token + // count stays within budget (so it can't blow NEURON_MAX_PROMPT_TOKENS). + let p = PreprocessProfile::qwen3_6(); + let (h, w) = p.resized_dims(8000, 6000).unwrap(); + let lm_tokens = (h / p.factor) * (w / p.factor); + let budget = p.max_pixels / (p.factor * p.factor); + assert!( + lm_tokens <= budget, + "max-res image LM tokens {lm_tokens} must stay within budget {budget}" + ); + } }