From 09c945f81e83110fa84190b35c2e50fa6c1c0320 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Thu, 21 May 2026 11:50:30 +0300 Subject: [PATCH] feat(stage-8d-4): dispatch chunked_gated_delta_rule_recurrence at prefill run_delta_rule_cuda now picks between the per-token kernel and the BT=64 chunked variant based on seq_len. Threshold = 64 matches mistralrs. Prefill on Qwen3.6-27B (typical seq_len in the hundreds) drops from one block-launch per token to one per 64-token chunk. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/harness/arch/qwen3_5/linear_attn.rs | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs b/crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs index 8ddac01..5c3cb2c 100644 --- a/crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs +++ b/crates/neuron/src/harness/arch/qwen3_5/linear_attn.rs @@ -406,14 +406,29 @@ fn run_delta_rule_cuda( let g_bh = g.flatten(0, 1)?.contiguous()?; let beta_bh = beta.flatten(0, 1)?.contiguous()?; let mut state_bh = state.flatten(0, 1)?.contiguous()?; - let output_bh = crate::cuda::gdn::gated_delta_rule_recurrence_cuda( - &q_bh, - &k_bh, - &v_bh, - &g_bh, - &beta_bh, - &mut state_bh, - )?; + // For long prefills, the chunked kernel (BT=64) processes a chunk + // of tokens at a time instead of one-by-one — same delta-rule math, + // far fewer block launches. Threshold matches mistralrs. + const CHUNK_THRESHOLD: usize = 64; + let output_bh = if seq_len >= CHUNK_THRESHOLD { + crate::cuda::gdn::chunked_gated_delta_rule_recurrence_cuda( + &q_bh, + &k_bh, + &v_bh, + &g_bh, + &beta_bh, + &mut state_bh, + )? + } else { + crate::cuda::gdn::gated_delta_rule_recurrence_cuda( + &q_bh, + &k_bh, + &v_bh, + &g_bh, + &beta_bh, + &mut state_bh, + )? + }; let core_attn_out = output_bh.reshape((batch_size, num_heads, seq_len, head_v_dim))?; let new_state = state_bh.reshape((batch_size, num_heads, head_k_dim, head_v_dim))?; Ok((core_attn_out, new_state))