From 495d3f7c05c3594cb3a47a0c5b984bdd9d24fe49 Mon Sep 17 00:00:00 2001
From: rob thijssen
Date: Wed, 20 May 2026 21:13:19 +0300
Subject: [PATCH] fix(qwen3_5): promote beta to F32 alongside q/k/v in delta rule
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The single-GPU dense load of Qwen/Qwen3.5-0.8B succeeded but the first
inference forward bombed with `dtype mismatch in mul, lhs: F32, rhs: BF16`.
Trace through the recurrent delta-rule loop:

    let q = (q.to_dtype(F32)? * scale)?;   // F32
    let k = k.to_dtype(F32)?;              // F32
    let v = v.to_dtype(F32)?;              // F32
    // g built from A_log/dt_bias          // F32
    // beta = sigmoid(b)                   // BF16 (sigmoid preserves dtype)
    ...
    let delta = (v_t - kv_mem)?.broadcast_mul(&beta_col)?;
                ^^^^^^^^^^^^^                 ^^^^^^^^^
                F32                           BF16 ← mismatch

`g` was already F32 because it was constructed from
`a_log.to_dtype(F32)` + `dt_bias.to_dtype(F32)` earlier in the function.
`beta` came from `sigmoid(b)` where `b` was the model dtype (BF16), so
beta stayed BF16 and the multiplication tripped candle's dtype-mismatch
check.

Promote beta to F32 at the same point we promote q/k/v.

Caught by the validate-neuron.sh probe against Qwen/Qwen3.5-0.8B on
beast — load returned 200, then `POST /v1/chat/completions` returned the
dtype error.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs b/crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs
index 3ce654d..102277b 100644
--- a/crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs
+++ b/crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs
@@ -307,6 +307,12 @@ impl GatedDeltaNet {
         let q = (q.to_dtype(candle_core::DType::F32)? * scale)?;
         let k = k.to_dtype(candle_core::DType::F32)?;
         let v = v.to_dtype(candle_core::DType::F32)?;
+        // `g` is already F32 (constructed from A_log/dt_bias in f32 above);
+        // `beta` came from sigmoid(b) which kept the model dtype, so we
+        // need to promote it here too — otherwise the per-token
+        // `(v_t - kv_mem).broadcast_mul(&beta_col)` mixes F32 LHS with
+        // BF16 RHS and trips candle's dtype-mismatch check.
+        let beta = beta.to_dtype(candle_core::DType::F32)?;
 
         // Initialise the recurrent state from cache or zeros.
         let mut state = match self.state.recurrent_state.take() {