From 5436af9c7360d5a5789dec5ca27160793a71b512 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Tue, 19 May 2026 17:49:43 +0300 Subject: [PATCH] fix(neuron/candle): dense Qwen3 returns rank-3 logits, double-squeeze MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Caught by live validation against Qwen/Qwen3-1.7B on beast: HTTP 500 "unexpected rank, expected: 1, got: 2 ([1, 151936])" Candle's qwen3::ModelForCausalLM::forward returns shape [B, 1, V] (no final squeeze) while quantized_qwen3::ModelWeights::forward returns [B, V] (with squeeze(1) at the end). My match arms applied a single squeeze(0) uniformly, which is correct for the quantized path ([1, V] → [V]) but only reduces the dense path from [1, 1, V] to [1, V], which then trips the to_vec1() call in apply_repeat_penalty, which expects rank 1. Dense match arms now strip both batch and seq dims: model.forward(&input, offset)?.squeeze(0)?.squeeze(0)? Also fixes validate-neuron.sh's `${3:-Q4_K_M}` → `${3-Q4_K_M}` (no colon) so passing an explicit empty third arg now drives the dense path instead of falling back to Q4_K_M. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/neuron/src/harness/candle.rs | 21 +++++++++++++++------ script/validate-neuron.sh | 4 +++- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/crates/neuron/src/harness/candle.rs b/crates/neuron/src/harness/candle.rs index ff8dc3e..778dc86 100644 --- a/crates/neuron/src/harness/candle.rs +++ b/crates/neuron/src/harness/candle.rs @@ -714,8 +714,11 @@ fn run_inference( ModelArch::Qwen3Dense(model) => { model.clear_kv_cache(); let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?; - let logits = model.forward(&input, 0)?; - let logits = logits.squeeze(0)?; + // qwen3::ModelForCausalLM::forward returns [B, 1, V] — + // no final squeeze on the dense path, unlike the quantized + // variant which returns [B, V]. Strip both batch and seq + // dims to get the rank-1 logits LogitsProcessor expects. 
+ let logits = model.forward(&input, 0)?.squeeze(0)?.squeeze(0)?; sample_with_penalty(&logits, &generated, &mut logits_processor)? } }; @@ -735,8 +738,11 @@ fn run_inference( } ModelArch::Qwen3Dense(model) => { let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?; - let logits = model.forward(&input, prompt_tokens.len() + index)?; - let logits = logits.squeeze(0)?; + // Dense returns [B, 1, V]; strip both leading dims. + let logits = model + .forward(&input, prompt_tokens.len() + index)? + .squeeze(0)? + .squeeze(0)?; sample_with_penalty(&logits, &generated, &mut logits_processor)? } }; @@ -852,8 +858,11 @@ fn run_inference_streaming( } ModelArch::Qwen3Dense(model) => { let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?; - let logits = model.forward(&input, prompt_tokens.len() + index)?; - let logits = logits.squeeze(0)?; + // Dense returns [B, 1, V]; strip both leading dims. + let logits = model + .forward(&input, prompt_tokens.len() + index)? + .squeeze(0)? + .squeeze(0)?; sample_with_penalty(&logits, &all_tokens, &mut logits_processor)? } }; diff --git a/script/validate-neuron.sh b/script/validate-neuron.sh index b0f974f..fae0cf4 100755 --- a/script/validate-neuron.sh +++ b/script/validate-neuron.sh @@ -22,7 +22,9 @@ set -euo pipefail HOST="${1:-beast.hanzalova.internal}" MODEL_ID="${2:-unsloth/Qwen3-0.6B-GGUF}" -QUANT="${3:-Q4_K_M}" +# `${3-Q4_K_M}` (no colon) only uses the default when the arg is +# UNSET — passing an explicit empty string drives the dense path. +QUANT="${3-Q4_K_M}" PORT="${NEURON_PORT:-13131}" BASE="http://${HOST}:${PORT}"