From ee663e5e9976ae97a3f0ccc51efff920033f9dd9 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Thu, 21 May 2026 21:50:45 +0300 Subject: [PATCH] fix(stage-8e-2e): bump quant prefill threshold to M > 64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The M > 8 threshold from 8e-2d activated forward_via_f16 on the test case (M=30) and slightly regressed prefill (143 -> 133 T/s). The dequant cost (~30 MB f16 per linear * ~480 calls per prefill = ~200 ms) eats the cuBLAS GEMM speedup at small M. Move the crossover to M > 64 so short prefills (typical for the validate probe) stay on the GGUF GEMV kernel where per-call cost is comparable but the dequant tax is zero. Long prefills still get the dequant-then-cuBLAS-GEMM path where the GEMM scaling amortises the fixed dequant cost. Doesn't close the gap to mistralrs's 423 T/s on Q5K prefill — that needs either a dequant cache (gives back the ISQ memory win) or a fused dequant+gemm kernel. Both larger projects. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/neuron/src/harness/tp/tp_linear.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/crates/neuron/src/harness/tp/tp_linear.rs b/crates/neuron/src/harness/tp/tp_linear.rs index 663ba7a..769b8b9 100644 --- a/crates/neuron/src/harness/tp/tp_linear.rs +++ b/crates/neuron/src/harness/tp/tp_linear.rs @@ -78,10 +78,20 @@ impl MaybeQuantLinear { /// `QMatMul::forward` wins (it operates on quantized blocks directly /// and accumulates in registers). /// -/// 8 is conservative: candle's f16 GEMM beats the GGUF GEMV anywhere -/// the M dim gets non-trivial (>=4 typically), but the dequantize -/// cost is fixed per call so the crossover is a small constant. -const QUANT_PREFILL_M_THRESHOLD: usize = 8; +/// Empirical: at M=30 on Qwen3.6-27B / RTX 5090, forward_via_f16 was +/// slightly *slower* than the GGUF GEMV kernel — the per-call dequant +/// cost (~30 MB f16 written to global memory per linear × ~480 calls +/// per prefill) eats the cuBLAS GEMM speedup at small M. The +/// crossover where the GEMM scaling actually beats the fixed dequant +/// tax sits well above M=8. +/// +/// 64 is a conservative crossover that keeps short-prompt prefills +/// on the GGUF kernel (where the per-call cost is comparable to the +/// f16 path but the dequant tax is zero) and only activates the +/// dequant-then-GEMM path for long prefills where the GEMM size +/// makes amortising worth it. A proper fix is either a dequant +/// cache or a fused dequant+gemm cuda kernel — both larger projects. +const QUANT_PREFILL_M_THRESHOLD: usize = 64; impl Module for MaybeQuantLinear { fn forward(&self, x: &Tensor) -> candle_core::Result {