From 5436af9c7360d5a5789dec5ca27160793a71b512 Mon Sep 17 00:00:00 2001
From: rob thijssen <grenade@rob.tn>
Date: Tue, 19 May 2026 17:49:43 +0300
Subject: [PATCH] fix(neuron/candle): dense Qwen3 returns rank-3 logits,
 double-squeeze
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Caught by live validation against Qwen/Qwen3-1.7B on beast:
  HTTP 500 "unexpected rank, expected: 1, got: 2 ([1, 151936])"

Candle's qwen3::ModelForCausalLM::forward returns shape [B, 1, V]
(no final squeeze) while quantized_qwen3::ModelWeights::forward
returns [B, V] (with squeeze(1) at the end). My match arms applied
a single squeeze(0) uniformly, which is correct for the quantized
path ([1, V] → [V]) but leaves the dense path at rank 2
([1, 1, V] → [1, V]), which then trips the to_vec1() call inside
apply_repeat_penalty, which expects rank 1.

Dense match arms now strip both batch and seq dims:
  model.forward(&input, offset)?.squeeze(0)?.squeeze(0)?

Also fixes validate-neuron.sh's `${3:-Q4_K_M}` → `${3-Q4_K_M}`
(no colon) so passing an explicit empty third arg now drives the
dense path instead of falling back to Q4_K_M.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/neuron/src/harness/candle.rs | 21 +++++++++++++++------
 script/validate-neuron.sh           |  4 +++-
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/crates/neuron/src/harness/candle.rs b/crates/neuron/src/harness/candle.rs
index ff8dc3e..778dc86 100644
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -714,8 +714,11 @@ fn run_inference(
         ModelArch::Qwen3Dense(model) => {
             model.clear_kv_cache();
             let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
-            let logits = model.forward(&input, 0)?;
-            let logits = logits.squeeze(0)?;
+            // qwen3::ModelForCausalLM::forward returns [B, 1, V] —
+            // no final squeeze on the dense path, unlike the quantized
+            // variant which returns [B, V]. Strip both batch and seq
+            // dims to get the rank-1 logits LogitsProcessor expects.
+            let logits = model.forward(&input, 0)?.squeeze(0)?.squeeze(0)?;
             sample_with_penalty(&logits, &generated, &mut logits_processor)?
         }
     };
@@ -735,8 +738,11 @@ fn run_inference(
             }
             ModelArch::Qwen3Dense(model) => {
                 let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
-                let logits = model.forward(&input, prompt_tokens.len() + index)?;
-                let logits = logits.squeeze(0)?;
+                // Dense returns [B, 1, V]; strip both leading dims.
+                let logits = model
+                    .forward(&input, prompt_tokens.len() + index)?
+                    .squeeze(0)?
+                    .squeeze(0)?;
                 sample_with_penalty(&logits, &generated, &mut logits_processor)?
             }
         };
@@ -852,8 +858,11 @@ fn run_inference_streaming(
                 }
                 ModelArch::Qwen3Dense(model) => {
                     let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
-                    let logits = model.forward(&input, prompt_tokens.len() + index)?;
-                    let logits = logits.squeeze(0)?;
+                    // Dense returns [B, 1, V]; strip both leading dims.
+                    let logits = model
+                        .forward(&input, prompt_tokens.len() + index)?
+                        .squeeze(0)?
+                        .squeeze(0)?;
                     sample_with_penalty(&logits, &all_tokens, &mut logits_processor)?
                 }
             };
diff --git a/script/validate-neuron.sh b/script/validate-neuron.sh
index b0f974f..fae0cf4 100755
--- a/script/validate-neuron.sh
+++ b/script/validate-neuron.sh
@@ -22,7 +22,9 @@ set -euo pipefail
 
 HOST="${1:-beast.hanzalova.internal}"
 MODEL_ID="${2:-unsloth/Qwen3-0.6B-GGUF}"
-QUANT="${3:-Q4_K_M}"
+# `${3-Q4_K_M}` (no colon) only uses the default when the arg is
+# UNSET — passing an explicit empty string drives the dense path.
+QUANT="${3-Q4_K_M}"
 PORT="${NEURON_PORT:-13131}"
 BASE="http://${HOST}:${PORT}"