fix(neuron/candle): dense Qwen3 returns rank-3 logits, double-squeeze

Caught by live validation against Qwen/Qwen3-1.7B on beast: HTTP 500 "unexpected rank, expected: 1, got: 2 ([1, 151936])". Candle's qwen3::ModelForCausalLM::forward returns shape [B, 1, V] (no final squeeze), while quantized_qwen3::ModelWeights::forward returns [B, V] (with squeeze(1) at the end). My match arms applied a single squeeze(0) uniformly, which is correct for the quantized path ([1, V] → [V]) but leaves the dense path at [1, 1, V] → [1, V], which then trips apply_repeat_penalty::to_vec1(), which expects rank 1. Dense match arms now strip both batch and seq dims: `model.forward(&input, offset)?.squeeze(0)?.squeeze(0)?`. Also fixes validate-neuron.sh's `${3:-Q4_K_M}` → `${3-Q4_K_M}` (no colon), so passing an explicit empty third arg now drives the dense path instead of falling back to Q4_K_M. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:49:43 +03:00
parent 8e882c0757
commit 5436af9c73
2 changed files with 18 additions and 7 deletions
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -714,8 +714,11 @@ fn run_inference(
        ModelArch::Qwen3Dense(model) => {
            model.clear_kv_cache();
            let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
-            let logits = model.forward(&input, 0)?;
+            // qwen3::ModelForCausalLM::forward returns [B, 1, V] —
-            let logits = logits.squeeze(0)?;
+            // no final squeeze on the dense path, unlike the quantized
            // variant which returns [B, V]. Strip both batch and seq
            // dims to get the rank-1 logits LogitsProcessor expects.
            let logits = model.forward(&input, 0)?.squeeze(0)?.squeeze(0)?;
            sample_with_penalty(&logits, &generated, &mut logits_processor)?
        }
    };
@@ -735,8 +738,11 @@ fn run_inference(
            }
            ModelArch::Qwen3Dense(model) => {
                let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
-                let logits = model.forward(&input, prompt_tokens.len() + index)?;
+                // Dense returns [B, 1, V]; strip both leading dims.
-                let logits = logits.squeeze(0)?;
+                let logits = model
                    .forward(&input, prompt_tokens.len() + index)?
                    .squeeze(0)?
                    .squeeze(0)?;
                sample_with_penalty(&logits, &generated, &mut logits_processor)?
            }
        };
@@ -852,8 +858,11 @@ fn run_inference_streaming(
                }
                ModelArch::Qwen3Dense(model) => {
                    let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
-                    let logits = model.forward(&input, prompt_tokens.len() + index)?;
+                    // Dense returns [B, 1, V]; strip both leading dims.
-                    let logits = logits.squeeze(0)?;
+                    let logits = model
                        .forward(&input, prompt_tokens.len() + index)?
                        .squeeze(0)?
                        .squeeze(0)?;
                    sample_with_penalty(&logits, &all_tokens, &mut logits_processor)?
                }
            };
--- a/script/validate-neuron.sh
+++ b/script/validate-neuron.sh
@@ -22,7 +22,9 @@ set -euo pipefail
 HOST="${1:-beast.hanzalova.internal}"
 MODEL_ID="${2:-unsloth/Qwen3-0.6B-GGUF}"
-QUANT="${3:-Q4_K_M}"
+# `${3-Q4_K_M}` (no colon) only uses the default when the arg is
 # UNSET — passing an explicit empty string drives the dense path.
 QUANT="${3-Q4_K_M}"
 PORT="${NEURON_PORT:-13131}"
 BASE="http://${HOST}:${PORT}"