feat(neuron): operator pixel-budget env override + doc cleanup (#14 C5)

- PreprocessProfile::qwen3_6() reads NEURON_VISION_MIN_PIXELS /
  NEURON_VISION_MAX_PIXELS (clamped so that factor² ≤ min ≤ max), matching the
  NEURON_VISION_LEGACY_* / NEURON_MROPE knob convention. Defaults remain
  256²…1024² (64…1024 LM tokens/image).
- Test: a max-resolution source caps within the token budget (can't blow
  NEURON_MAX_PROMPT_TOKENS).
- Strip stale fixed-resolution / "MRoPE gap (#15)" / 14×14 language from the
  preprocess, mod, and rope doc-comments now that resolution is dynamic and
  M-RoPE is implemented.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 22:50:03 +03:00
parent c97a8654f5
commit d311c8ca7a
2 changed files with 45 additions and 18 deletions
--- a/crates/neuron/src/harness/arch/qwen3_5/mod.rs
+++ b/crates/neuron/src/harness/arch/qwen3_5/mod.rs
@@ -422,15 +422,10 @@ impl Qwen3_5Model {
    ///
    /// The splice replaces the LM's text-side embedding at each
    /// `image_token_id` position with the corresponding row from
-    /// `image_embeds`. After the splice the decoder runs unchanged.
-    ///
-    /// **MRoPE gap.** Qwen3.6's `rope_parameters` declares MRoPE
-    /// (interleaved text/height/width axes); Stage B applies plain
-    /// text-position RoPE to image tokens. The model still attends
-    /// to image content but loses spatial structure that MRoPE-aware
-    /// position encoding would preserve. Tracked under issue #15
-    /// (numerical validation) — quality benchmark from Stage D should
-    /// surface the impact, and the fix lives in `rope::RotaryEmbedding`.
+    /// `image_embeds`. After the splice the decoder runs the interleaved
+    /// M-RoPE path: `grids` carries each image's post-merge LM grid
+    /// `(lm_gh, lm_gw)` so `get_rope_index` assigns image tokens their 2D
+    /// coordinates (dynamic resolution, #14).
    pub fn forward_with_vision(
        &mut self,
        input_ids: &Tensor,
@@ -461,7 +456,7 @@ impl Qwen3_5Model {

        // Vision path: splice image embeddings at `image_token_id`
        // positions and build interleaved M-RoPE cos/sin so image tokens
-        // carry their 14×14 grid coordinates. Text / decode skip the
+        // carry their 2D (lm_gh × lm_gw) grid coordinates. Text / decode skip the
        // device→host id copy entirely and take the plain-RoPE fast path
        // — bit-for-bit the pre-M-RoPE behaviour when `rope_delta == 0`.
        let (cos, sin) = if let (Some(img), Some(tok_id)) = (image_embeds, image_token_id) {
--- a/crates/neuron/src/harness/preprocess.rs
+++ b/crates/neuron/src/harness/preprocess.rs
@@ -55,18 +55,36 @@ pub struct PreprocessProfile {
    pub image_std: [f32; 3],
 }

+/// Default pixel budget for Qwen3.6 (`256² … 1024²` → 64 … 1024 LM
+/// tokens/image). Generous for documents/OCR, bounded for serving on
+/// 2×RTX5090. Operators tune with `NEURON_VISION_MIN_PIXELS` /
+/// `NEURON_VISION_MAX_PIXELS` (matching the other `NEURON_VISION_*` knobs).
+const QWEN3_6_MIN_PIXELS: u32 = 65_536;
+const QWEN3_6_MAX_PIXELS: u32 = 1_048_576;
+
+fn env_pixels(name: &str, default: u32) -> u32 {
+    std::env::var(name)
+        .ok()
+        .and_then(|v| v.trim().parse::<u32>().ok())
+        .unwrap_or(default)
+}
+
 impl PreprocessProfile {
    /// Profile for Qwen3.6. Native-aspect `smart_resize` (factor 32),
-    /// normalise to `[-1, 1]` via mean=std=0.5. Pixel budget defaults:
-    /// `min = 256² = 65536` (→ 8×8 = 64 LM tokens) and
-    /// `max = 1024² = 1048576` (→ 32×32 = 1024 LM tokens) — generous for
-    /// documents/OCR, bounded for serving on 2×RTX5090. (Operator
-    /// override lands with the `[harness.candle.vision]` config in #14 C5.)
+    /// normalise to `[-1, 1]` via mean=std=0.5. Pixel budget defaults to
+    /// [`QWEN3_6_MIN_PIXELS`]…[`QWEN3_6_MAX_PIXELS`], overridable via the
+    /// `NEURON_VISION_MIN_PIXELS` / `NEURON_VISION_MAX_PIXELS` env vars.
+    /// The budget is clamped to sane values: `min ≥ factor²` (at least one LM
+    /// token) and `max ≥ min`.
    pub fn qwen3_6() -> Self {
+        let factor = 32u32;
+        let f2 = factor * factor;
+        let min_pixels = env_pixels("NEURON_VISION_MIN_PIXELS", QWEN3_6_MIN_PIXELS).max(f2);
+        let max_pixels = env_pixels("NEURON_VISION_MAX_PIXELS", QWEN3_6_MAX_PIXELS).max(min_pixels);
        Self {
-            factor: 32,
-            min_pixels: 65_536,
-            max_pixels: 1_048_576,
+            factor,
+            min_pixels,
+            max_pixels,
            image_mean: [0.5, 0.5, 0.5],
            image_std: [0.5, 0.5, 0.5],
        }
@@ -369,4 +387,18 @@ mod tests {
        let err = smart_resize(1, 500, 32, 65_536, 1_048_576).unwrap_err();
        assert!(format!("{err:#}").contains("200:1"));
    }
+
+    #[test]
+    fn qwen3_6_default_budget_bounds_lm_tokens() {
+        // A huge source image caps at max_pixels → the per-image LM token
+        // count stays within budget (so it can't blow NEURON_MAX_PROMPT_TOKENS).
+        let p = PreprocessProfile::qwen3_6();
+        let (h, w) = p.resized_dims(8000, 6000).unwrap();
+        let lm_tokens = (h / p.factor) * (w / p.factor);
+        let budget = p.max_pixels / (p.factor * p.factor);
+        assert!(
+            lm_tokens <= budget,
+            "max-res image LM tokens {lm_tokens} must stay within budget {budget}"
+        );
+    }
 }