diff --git a/crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs b/crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs index 3ce654d..102277b 100644 --- a/crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs +++ b/crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs @@ -307,6 +307,12 @@ impl GatedDeltaNet { let q = (q.to_dtype(candle_core::DType::F32)? * scale)?; let k = k.to_dtype(candle_core::DType::F32)?; let v = v.to_dtype(candle_core::DType::F32)?; + // `g` is already F32 (constructed from A_log/dt_bias in f32 above); + // `beta` came from sigmoid(b) which kept the model dtype, so we + // need to promote it here too — otherwise the per-token + // `(v_t - kv_mem).broadcast_mul(&beta_col)` mixes F32 LHS with + // BF16 RHS and trips candle's dtype-mismatch check. + let beta = beta.to_dtype(candle_core::DType::F32)?; // Initialise the recurrent state from cache or zeros. let mut state = match self.state.recurrent_state.take() {