feat(stage-8d-1): import mistralrs GDN CUDA kernels — build infra only

Stage 8d (new): port the Gated DeltaNet CUDA kernels from EricLBuehler/mistral.rs to close the ~500x decode performance gap we measured on Qwen3.6-27B TP-2 (~12s/token in our pure-candle path vs ~37 T/s in mistralrs on the same hardware). This commit lays the build infrastructure with zero behavioural change. Subsequent commits (8d-2 .. 8d-5) wire each kernel into the qwen3_5 architecture and TP variant. Added: - `crates/neuron/build.rs` — uses `cudaforge::KernelBuilder` to compile every `src/cuda/*.cu` file into `libneuroncuda.a` under the `cuda` feature, then links it + `cudart`. Mirrors mistralrs's `mistralrs-core/build.rs` setup verbatim (same NVCC flag set, same sm_<80 bf16 gate). - `crates/neuron/src/cuda/gdn.cu` — five kernels ported verbatim from upstream: * `gated_delta_rule_recurrence` (V-tiled per-token decode) * `chunked_gated_delta_rule_recurrence` (BT=64 chunked prefill) * `causal_conv1d_update` (single-token conv decode) * `causal_conv1d_full` (multi-token conv prefill) * `fused_gdn_gating` (beta = sigmoid(b); g = -exp(A_log) * softplus(a + dt_bias)) - `crates/neuron/src/cuda/gdn.rs` — Rust wrappers around the kernels, cudarc::CudaSlice::device_ptr boilerplate identical to upstream. - `crates/neuron/src/cuda/ffi.rs` — `extern "C"` decls (subset of upstream's ffi.rs covering only the five GDN kernels; MoE / SSM / top-k decls land here when we absorb those too). - `crates/neuron/src/cuda/mod.rs` — re-exports + module docs. Cargo wiring: `cudaforge` added as an optional build-dep, activated by the `cuda` feature. CPU build is unchanged (the `cuda/` module is fully `#[cfg(feature = "cuda")]`). The cuda feature build inside the patched container compiles `gdn.cu` (1 of 1 kernels) and links clean. Licensing: upstream files preserve their MIT origin via per-file comment banners pointing to the mistralrs path. No behaviour-relevant edits to the .cu kernels — local diff against upstream is just the banner. The `.rs` wrappers and `ffi.rs` subset are also from upstream; their structure (module path `crate::cuda::ffi::*`) matches identically so future kernel imports drop in unchanged. CPU clippy + 32 lib tests pass; `cargo clippy --features cuda` clean inside the runner container. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 11:34:11 +03:00
parent 70eb6af42b
commit 1ebbe87651
8 changed files with 1375 additions and 0 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2113,6 +2113,7 @@ dependencies = [
 "candle-transformers",
 "clap",
 "cortex-core",
 "cudaforge",
 "cudarc 0.19.7",
 "figment",
 "futures",
--- a/crates/neuron/Cargo.toml
+++ b/crates/neuron/Cargo.toml
@@ -25,6 +25,7 @@ cuda = [
    "candle-transformers/cuda",
    "dep:cudarc",
    "dep:half",
    "dep:cudaforge",
 ]
 # Use cuDNN for convolution / attention kernels. Requires CUDA.
 cudnn = [
@@ -79,3 +80,13 @@ hf-hub = { version = "0.4", features = ["tokio"] }
 [dev-dependencies]
 tokio = { workspace = true, features = ["test-util"] }
 reqwest.workspace = true
 [build-dependencies]
 # Used by `build.rs` to compile `src/cuda/*.cu` into `libneuroncuda.a`
 # under the `cuda` feature. Matches mistralrs's upstream build setup
 # (their `mistralrs-core/build.rs` uses the same constructor).
 cudaforge = { version = "0.1", optional = true }
 [package.metadata.docs.rs]
 # Skip the CUDA path on docs.rs (it lacks nvcc).
 no-default-features = true
--- a/crates/neuron/build.rs
+++ b/crates/neuron/build.rs
@@ -0,0 +1,66 @@
 //! Build script: compile the CUDA kernels in `src/cuda/*.cu` into a
 //! static library and link it under the `cuda` feature.
 //!
 //! Patterned on `EricLBuehler/mistral.rs::mistralrs-core/build.rs` —
 //! same `cudaforge::KernelBuilder` invocation, same NVCC flag set.
 fn main() {
    #[cfg(feature = "cuda")]
    {
        use std::path::PathBuf;
        println!("cargo:rerun-if-changed=build.rs");
        println!("cargo:rerun-if-changed=src/cuda/");
        let build_dir = PathBuf::from(std::env::var("OUT_DIR").unwrap());
        let mut builder = cudaforge::KernelBuilder::new()
            .source_glob("src/cuda/*.cu")
            .out_dir(&build_dir)
            .arg("-std=c++17")
            .arg("-O3")
            .arg("-U__CUDA_NO_HALF_OPERATORS__")
            .arg("-U__CUDA_NO_HALF_CONVERSIONS__")
            .arg("-U__CUDA_NO_HALF2_OPERATORS__")
            .arg("-U__CUDA_NO_BFLOAT16_CONVERSIONS__")
            .arg("--expt-relaxed-constexpr")
            .arg("--expt-extended-lambda")
            .arg("--use_fast_math")
            .arg("--compiler-options")
            .arg("-fPIC");
        // sm_<80 doesn't have bf16 intrinsics for WMMA — gate the
        // bf16-only kernels off in that case. (Mirrors upstream.)
        if let Some(compute_cap) = builder.get_compute_cap()
            && compute_cap < 80
        {
            builder = builder.arg("-DNO_BF16_KERNEL");
        }
        let target = std::env::var("TARGET").unwrap();
        let out_file = if target.contains("msvc") {
            build_dir.join("neuroncuda.lib")
        } else {
            build_dir.join("libneuroncuda.a")
        };
        builder
            .build_lib(out_file)
            .expect("neuron cuda build failed");
        println!("cargo:rustc-link-search={}", build_dir.display());
        println!("cargo:rustc-link-lib=neuroncuda");
        println!("cargo:rustc-link-lib=dylib=cudart");
        if target.contains("msvc") {
            // No extra runtime library needed.
        } else if target.contains("apple")
            || target.contains("freebsd")
            || target.contains("openbsd")
        {
            println!("cargo:rustc-link-lib=dylib=c++");
        } else if target.contains("android") {
            println!("cargo:rustc-link-lib=dylib=c++_shared");
        } else {
            println!("cargo:rustc-link-lib=dylib=stdc++");
        }
    }
 }
--- a/crates/neuron/src/cuda/ffi.rs
+++ b/crates/neuron/src/cuda/ffi.rs
@@ -0,0 +1,84 @@
 //! FFI declarations for the CUDA kernels in `gdn.cu`.
 //!
 //! Subset of `EricLBuehler/mistral.rs::mistralrs-core/src/cuda/ffi.rs`
 //! covering only the Gated DeltaNet kernels we currently use. Other
 //! kernels in the upstream file (MoE GEMM, top-k, Mamba selective
 //! scan, etc.) would land here too as we absorb them.
 //!
 //! All function declarations are MIT-licensed from upstream and
 //! unchanged apart from this header.
 use std::ffi::c_void;
 #[allow(dead_code)]
 unsafe extern "C" {
    // GDN (Gated Delta Net) kernels for qwen3_5 / Qwen3-Next.
    pub(crate) fn gated_delta_rule_recurrence(
        q: *const f32,
        k: *const f32,
        v: *const f32,
        g: *const f32,
        beta: *const f32,
        state: *mut f32,
        output: *mut f32,
        bh: i32,
        seq_len: i32,
        k_dim: i32,
        v_dim: i32,
        stream: i64,
    );
    /// Chunked GDN recurrence for prefill (processes tokens in BT=64 chunks).
    pub(crate) fn chunked_gated_delta_rule_recurrence(
        q: *const f32,
        k: *const f32,
        v: *const f32,
        g: *const f32,
        beta: *const f32,
        state: *mut f32,
        output: *mut f32,
        bh: i32,
        seq_len: i32,
        k_dim: i32,
        v_dim: i32,
        stream: i64,
    );
    pub(crate) fn causal_conv1d_update(
        x: *const c_void,
        weight: *const c_void,
        conv_state: *mut c_void,
        output: *mut c_void,
        batch_size: i32,
        conv_dim: i32,
        kernel_size: i32,
        dtype: i32,
        stream: i64,
    );
    pub(crate) fn causal_conv1d_full(
        x: *const c_void,
        weight: *const c_void,
        conv_state_out: *mut c_void,
        output: *mut c_void,
        batch_size: i32,
        conv_dim: i32,
        seq_len: i32,
        kernel_size: i32,
        dtype: i32,
        stream: i64,
    );
    pub(crate) fn fused_gdn_gating(
        b: *const c_void,
        a: *const c_void,
        a_log: *const f32,
        dt_bias: *const f32,
        beta_out: *mut c_void,
        g_out: *mut c_void,
        total_elements: i32,
        num_heads: i32,
        dtype: i32,
        stream: i64,
    );
 }
--- a/crates/neuron/src/cuda/gdn.cu
+++ b/crates/neuron/src/cuda/gdn.cu
@@ -0,0 +1,711 @@
 // Gated DeltaNet CUDA kernels for Qwen3-Next (`model_type = "qwen3_5"`).
 //
 // Ported verbatim from `EricLBuehler/mistral.rs` under MIT terms.
 // Upstream path: mistralrs-core/src/cuda/gdn.cu. Local edits in this
 // file are limited to this banner; the kernels are unchanged so a
 // diff against upstream stays minimal.
 //
 // Five kernels exposed via `extern "C"` shims at the bottom:
 //   - gated_delta_rule_recurrence            (per-token decode)
 //   - chunked_gated_delta_rule_recurrence    (BT=64 chunked prefill)
 //   - causal_conv1d_update                    (single-token conv decode)
 //   - causal_conv1d_full                      (multi-token conv prefill)
 //   - fused_gdn_gating                        (beta = sigmoid(b);
 //                                              g = -exp(A_log) * softplus(a + dt_bias))
 #include "cuda_bf16.h"
 #include "cuda_fp16.h"
 #include <cmath>
 #include <cstdint>
 #include <cuda_runtime.h>
 // ============================================================================
 // Kernel 1: gated_delta_rule_recurrence (optimized)
 //
 // V-tiled recurrence with compile-time K dimension for register residency.
 // Grid: (ceil(V/BV), B*H), Block: (BV,). Each thread owns BK registers of
 // state. Shared memory holds k_buf and q_buf (2*BK floats).
 //
 // Optimizations over naive version:
 //   - Template BK -> float s[BK] lives in true registers (1 cycle vs ~30)
 //   - #pragma unroll on all k-loops -> full ILP
 //   - Fused decay+kv_mem pass and fused state_update+output pass
 //   - __fmaf_rn intrinsics for guaranteed fused multiply-add
 //   - BV=64 threads -> 2 warps, 6 blocks/SM on Ampere
 //
 // q,k: [BH, S, K]  v: [BH, S, V]  g,beta: [BH, S]
 // state: [BH, K, V] (in/out)  output: [BH, S, V]
 // ============================================================================
 // Optimized kernel: BK known at compile time -> registers + full unrolling
 template <int BK, int BV>
 __global__ void gated_delta_rule_recurrence_kernel_tiled(
    const float *__restrict__ q,    // [BH, S, K]
    const float *__restrict__ k,    // [BH, S, K]
    const float *__restrict__ v,    // [BH, S, V]
    const float *__restrict__ g,    // [BH, S]
    const float *__restrict__ beta, // [BH, S]
    float *__restrict__ state,      // [BH, K, V]
    float *__restrict__ output,     // [BH, S, V]
    int seq_len, int v_dim) {
  const int v_tile = blockIdx.x;       // which V-tile
  const int bh = blockIdx.y;           // batch*head index
  const int tid = threadIdx.x;         // thread within tile [0, BV)
  const int v_idx = v_tile * BV + tid; // global V index
  if (v_idx >= v_dim)
    return;
  // Pointers for this (batch, head)
  const float *q_bh = q + bh * seq_len * BK;
  const float *k_bh = k + bh * seq_len * BK;
  const float *v_bh = v + bh * seq_len * v_dim;
  const float *g_bh = g + bh * seq_len;
  const float *beta_bh = beta + bh * seq_len;
  float *state_bh = state + bh * BK * v_dim;
  float *out_bh = output + bh * seq_len * v_dim;
  // Shared memory: k_buf[BK] + q_buf[BK]
  __shared__ float k_buf[BK];
  __shared__ float q_buf[BK];
  // Load state column into registers — BK is compile-time, so this is
  // a true register array (not spilled to local memory)
  float s[BK];
 #pragma unroll
  for (int j = 0; j < BK; j++) {
    s[j] = state_bh[j * v_dim + v_idx];
  }
  for (int t = 0; t < seq_len; t++) {
 // Collaboratively load k_t into shared memory
 // BK / BV loads per thread (e.g. 128/64 = 2)
 #pragma unroll
    for (int j = tid; j < BK; j += BV) {
      k_buf[j] = k_bh[t * BK + j];
    }
    __syncthreads();
    // Load scalars for this timestep
    float decay = expf(g_bh[t]);
    float beta_t = beta_bh[t];
    float v_t = v_bh[t * v_dim + v_idx];
    // Fused pass 1: decay state + compute kv_mem
    float kv_mem = 0.0f;
 #pragma unroll
    for (int j = 0; j < BK; j++) {
      s[j] *= decay;
      kv_mem = __fmaf_rn(s[j], k_buf[j], kv_mem);
    }
    // Delta rule
    float delta = (v_t - kv_mem) * beta_t;
 // Collaboratively load q_t into shared memory
 #pragma unroll
    for (int j = tid; j < BK; j += BV) {
      q_buf[j] = q_bh[t * BK + j];
    }
    __syncthreads();
    // Fused pass 2: update state + compute output
    float y_t = 0.0f;
 #pragma unroll
    for (int j = 0; j < BK; j++) {
      s[j] = __fmaf_rn(k_buf[j], delta, s[j]);
      y_t = __fmaf_rn(s[j], q_buf[j], y_t);
    }
    out_bh[t * v_dim + v_idx] = y_t;
    __syncthreads();
  }
 // Write state back
 #pragma unroll
  for (int j = 0; j < BK; j++) {
    state_bh[j * v_dim + v_idx] = s[j];
  }
 }
 // Fallback kernel: runtime k_dim, still V-tiled for occupancy
 template <int BV, int MAX_K>
 __global__ void gated_delta_rule_recurrence_kernel_fallback(
    const float *__restrict__ q, const float *__restrict__ k,
    const float *__restrict__ v, const float *__restrict__ g,
    const float *__restrict__ beta, float *__restrict__ state,
    float *__restrict__ output, int seq_len, int k_dim, int v_dim) {
  const int v_tile = blockIdx.x;
  const int bh = blockIdx.y;
  const int tid = threadIdx.x;
  const int v_idx = v_tile * BV + tid;
  if (v_idx >= v_dim)
    return;
  const float *q_bh = q + bh * seq_len * k_dim;
  const float *k_bh = k + bh * seq_len * k_dim;
  const float *v_bh = v + bh * seq_len * v_dim;
  const float *g_bh = g + bh * seq_len;
  const float *beta_bh = beta + bh * seq_len;
  float *state_bh = state + bh * k_dim * v_dim;
  float *out_bh = output + bh * seq_len * v_dim;
  extern __shared__ float shared[];
  float *k_buf = shared;
  float *q_buf = shared + k_dim;
  float s[MAX_K];
  for (int j = 0; j < k_dim; j++) {
    s[j] = state_bh[j * v_dim + v_idx];
  }
  for (int t = 0; t < seq_len; t++) {
    for (int j = tid; j < k_dim; j += BV) {
      k_buf[j] = k_bh[t * k_dim + j];
    }
    __syncthreads();
    float decay = expf(g_bh[t]);
    float beta_t = beta_bh[t];
    float v_t = v_bh[t * v_dim + v_idx];
    float kv_mem = 0.0f;
    for (int j = 0; j < k_dim; j++) {
      s[j] *= decay;
      kv_mem = __fmaf_rn(s[j], k_buf[j], kv_mem);
    }
    float delta = (v_t - kv_mem) * beta_t;
    for (int j = tid; j < k_dim; j += BV) {
      q_buf[j] = q_bh[t * k_dim + j];
    }
    __syncthreads();
    float y_t = 0.0f;
    for (int j = 0; j < k_dim; j++) {
      s[j] = __fmaf_rn(k_buf[j], delta, s[j]);
      y_t = __fmaf_rn(s[j], q_buf[j], y_t);
    }
    out_bh[t * v_dim + v_idx] = y_t;
    __syncthreads();
  }
  for (int j = 0; j < k_dim; j++) {
    state_bh[j * v_dim + v_idx] = s[j];
  }
 }
 extern "C" void gated_delta_rule_recurrence(const float *q, const float *k,
                                            const float *v, const float *g,
                                            const float *beta, float *state,
                                            float *output, int bh, int seq_len,
                                            int k_dim, int v_dim,
                                            int64_t stream) {
  const cudaStream_t custream = (cudaStream_t)stream;
  if (k_dim == 128) {
    // Fast path for Qwen3-Next (k_dim=128)
    constexpr int BK = 128;
    constexpr int BV = 64;
    dim3 grid((v_dim + BV - 1) / BV, bh);
    dim3 block(BV);
    gated_delta_rule_recurrence_kernel_tiled<BK, BV>
        <<<grid, block, 0, custream>>>(q, k, v, g, beta, state, output, seq_len,
                                       v_dim);
  } else if (k_dim == 64) {
    // Fast path for models with k_dim=64
    constexpr int BK = 64;
    constexpr int BV = 64;
    dim3 grid((v_dim + BV - 1) / BV, bh);
    dim3 block(BV);
    gated_delta_rule_recurrence_kernel_tiled<BK, BV>
        <<<grid, block, 0, custream>>>(q, k, v, g, beta, state, output, seq_len,
                                       v_dim);
  } else {
    // Fallback for other k_dim values (runtime loop, still V-tiled)
    constexpr int BV = 64;
    constexpr int MAX_K = 256;
    dim3 grid((v_dim + BV - 1) / BV, bh);
    dim3 block(BV);
    size_t smem = 2 * k_dim * sizeof(float);
    gated_delta_rule_recurrence_kernel_fallback<BV, MAX_K>
        <<<grid, block, smem, custream>>>(q, k, v, g, beta, state, output,
                                          seq_len, k_dim, v_dim);
  }
 }
 // ============================================================================
 // Kernel 1b: chunked_gated_delta_rule_recurrence (prefill optimization)
 //
 // Processes prefill tokens in BT-token chunks instead of one at a time.
 // Within each chunk: parallel prefix sum of g, cooperative kk_dot computation,
 // forward substitution (triangular solve), output computation, and state
 // update.
 //
 // Same thread model as Kernel 1: one block per (v_tile, batch*head),
 // one thread per V-column. Each thread owns BK registers of state.
 //
 // Shared memory holds:
 //   k_chunk[BT * BK]  -- key vectors for current chunk
 //   kk_dot[BT * BT]   -- dot(k[i], k[j]) lower-triangular matrix
 //   gcum[BT]           -- cumulative sum of g within chunk
 //   beta_s[BT]         -- beta values for chunk
 //   q_buf[BK]          -- q vector (loaded one row at a time)
 //
 // q,k: [BH, S, K]  v: [BH, S, V]  g,beta: [BH, S]
 // state: [BH, K, V] (in/out)  output: [BH, S, V]
 // ============================================================================
 template <int BT, int BK, int BV>
 __global__ void
 chunked_gated_delta_rule_kernel(const float *__restrict__ q,    // [BH, S, K]
                                const float *__restrict__ k,    // [BH, S, K]
                                const float *__restrict__ v,    // [BH, S, V]
                                const float *__restrict__ g,    // [BH, S]
                                const float *__restrict__ beta, // [BH, S]
                                float *__restrict__ state,      // [BH, K, V]
                                float *__restrict__ output,     // [BH, S, V]
                                int seq_len, int v_dim) {
  const int v_tile = blockIdx.x;
  const int bh = blockIdx.y;
  const int tid = threadIdx.x;
  const int v_idx = v_tile * BV + tid;
  if (v_idx >= v_dim)
    return;
  const int num_chunks = (seq_len + BT - 1) / BT;
  // Pointers for this (batch, head)
  const float *q_bh = q + bh * seq_len * BK;
  const float *k_bh = k + bh * seq_len * BK;
  const float *v_bh = v + bh * seq_len * v_dim;
  const float *g_bh = g + bh * seq_len;
  const float *beta_bh = beta + bh * seq_len;
  float *state_bh = state + bh * BK * v_dim;
  float *out_bh = output + bh * seq_len * v_dim;
  // Dynamic shared memory layout
  extern __shared__ float smem[];
  float *k_chunk = smem;                  // [BT * BK]
  float *kk_dot = smem + BT * BK;         // [BT * BT]
  float *gcum = smem + BT * BK + BT * BT; // [BT]
  float *beta_s = gcum + BT;              // [BT]
  float *q_buf = beta_s + BT;             // [BK]
  // Load state column into registers
  float s[BK];
 #pragma unroll
  for (int j = 0; j < BK; j++) {
    s[j] = state_bh[j * v_dim + v_idx];
  }
  // Per-thread register array for corrected deltas
  float delta[BT];
  for (int c = 0; c < num_chunks; c++) {
    const int chunk_start = c * BT;
    const int chunk_len = min(BT, seq_len - chunk_start);
    // === Phase 1: Cooperative load of k, beta, g into shared memory ===
    for (int t = 0; t < chunk_len; t++) {
      for (int j = tid; j < BK; j += BV) {
        k_chunk[t * BK + j] = k_bh[(chunk_start + t) * BK + j];
      }
    }
    if (tid < chunk_len) {
      beta_s[tid] = beta_bh[chunk_start + tid];
      gcum[tid] = g_bh[chunk_start + tid];
    }
    __syncthreads();
    // === Phase 1b: Parallel prefix sum of g (Hillis-Steele) ===
    for (int stride = 1; stride < BT; stride <<= 1) {
      float prev = 0.0f;
      if (tid < chunk_len && (int)tid >= stride)
        prev = gcum[tid - stride];
      __syncthreads();
      if (tid < chunk_len && (int)tid >= stride)
        gcum[tid] += prev;
      __syncthreads();
    }
    // === Phase 2: Compute kk_dot[i][j] = dot(k[i], k[j]) for j < i ===
    // Only lower-triangular entries needed (strictly lower)
    for (int idx = tid; idx < chunk_len * chunk_len; idx += BV) {
      int i = idx / chunk_len;
      int j = idx % chunk_len;
      if (j < i) {
        float dot = 0.0f;
        for (int d = 0; d < BK; d++) {
          dot = __fmaf_rn(k_chunk[i * BK + d], k_chunk[j * BK + d], dot);
        }
        kk_dot[i * BT + j] = dot;
      }
    }
    __syncthreads();
    // === Phase 3: Forward substitution (per V-column, in registers) ===
    // Computes corrected delta values via triangular solve
    for (int i = 0; i < chunk_len; i++) {
      float v_i = v_bh[(chunk_start + i) * v_dim + v_idx];
      float decay_i = expf(gcum[i]);
      float beta_i = beta_s[i];
      // Inter-chunk contribution: state @ k[i] with decay
      float kv_mem = 0.0f;
 #pragma unroll
      for (int d = 0; d < BK; d++) {
        kv_mem = __fmaf_rn(s[d] * decay_i, k_chunk[i * BK + d], kv_mem);
      }
      float rhs = beta_i * (v_i - kv_mem);
      // Subtract lower-triangular contributions (intra-chunk)
      for (int j = 0; j < i; j++) {
        float a_ij = beta_i * kk_dot[i * BT + j] * expf(gcum[i] - gcum[j]);
        rhs -= a_ij * delta[j];
      }
      delta[i] = rhs;
    }
    // === Phase 4: Output computation (per V-column) ===
    for (int i = 0; i < chunk_len; i++) {
      // Cooperatively load q[i] into shared
      for (int j = tid; j < BK; j += BV) {
        q_buf[j] = q_bh[(chunk_start + i) * BK + j];
      }
      __syncthreads();
      float decay_i = expf(gcum[i]);
      // Inter-chunk contribution: q[i] @ (state * decay)
      float o_val = 0.0f;
 #pragma unroll
      for (int d = 0; d < BK; d++) {
        o_val = __fmaf_rn(q_buf[d], s[d] * decay_i, o_val);
      }
      // Intra-chunk contribution: sum_{j<=i} dot(q[i], k[j]) * delta[j] *
      // exp(gcum[i] - gcum[j])
      for (int j = 0; j <= i; j++) {
        float qk_dot = 0.0f;
        for (int d = 0; d < BK; d++) {
          qk_dot = __fmaf_rn(q_buf[d], k_chunk[j * BK + d], qk_dot);
        }
        o_val += qk_dot * delta[j] * expf(gcum[i] - gcum[j]);
      }
      out_bh[(chunk_start + i) * v_dim + v_idx] = o_val;
      __syncthreads();
    }
    // === Phase 5: State update for next chunk ===
    float g_total = gcum[chunk_len - 1];
 #pragma unroll
    for (int d = 0; d < BK; d++) {
      float s_new = s[d] * expf(g_total);
      for (int t = 0; t < chunk_len; t++) {
        s_new += k_chunk[t * BK + d] * delta[t] * expf(g_total - gcum[t]);
      }
      s[d] = s_new;
    }
    __syncthreads();
  }
  // Write final state back
 #pragma unroll
  for (int j = 0; j < BK; j++) {
    state_bh[j * v_dim + v_idx] = s[j];
  }
 }
 extern "C" void chunked_gated_delta_rule_recurrence(
    const float *q, const float *k, const float *v, const float *g,
    const float *beta, float *state, float *output, int bh, int seq_len,
    int k_dim, int v_dim, int64_t stream) {
  const cudaStream_t custream = (cudaStream_t)stream;
  if (k_dim == 128) {
    constexpr int BT = 64;
    constexpr int BK = 128;
    constexpr int BV = 64;
    // Shared memory: BT*BK + BT*BT + BT + BT + BK floats
    size_t smem = (BT * BK + BT * BT + 2 * BT + BK) * sizeof(float);
    // Request extended shared memory
    auto kernel = chunked_gated_delta_rule_kernel<BT, BK, BV>;
    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                         smem);
    dim3 grid((v_dim + BV - 1) / BV, bh);
    dim3 block(BV);
    kernel<<<grid, block, smem, custream>>>(q, k, v, g, beta, state, output,
                                            seq_len, v_dim);
  } else if (k_dim == 64) {
    constexpr int BT = 64;
    constexpr int BK = 64;
    constexpr int BV = 64;
    size_t smem = (BT * BK + BT * BT + 2 * BT + BK) * sizeof(float);
    auto kernel = chunked_gated_delta_rule_kernel<BT, BK, BV>;
    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                         smem);
    dim3 grid((v_dim + BV - 1) / BV, bh);
    dim3 block(BV);
    kernel<<<grid, block, smem, custream>>>(q, k, v, g, beta, state, output,
                                            seq_len, v_dim);
  } else {
    // Fallback: use the sequential kernel for unsupported k_dim
    gated_delta_rule_recurrence(q, k, v, g, beta, state, output, bh, seq_len,
                                k_dim, v_dim, stream);
  }
 }
 // ============================================================================
 // Kernel 2a: causal_conv1d_update (decode path, single step)
 //
 // Each thread handles one channel: shift conv_state left by 1,
 // insert new value, dot product with weight, apply SiLU.
 //
 // x: [B, conv_dim, 1]  weight: [conv_dim, kernel_size]
 // conv_state: [B, conv_dim, kernel_size] (in/out)
 // output: [B, conv_dim, 1]
 // ============================================================================
 template <typename T>
 __global__ void causal_conv1d_update_kernel(
    const T *__restrict__ x,      // [B, conv_dim, 1]
    const T *__restrict__ weight, // [conv_dim, kernel_size]
    T *__restrict__ conv_state,   // [B, conv_dim, kernel_size]
    T *__restrict__ output,       // [B, conv_dim, 1]
    int batch_size, int conv_dim, int kernel_size) {
  const int ch = blockIdx.x * blockDim.x + threadIdx.x;
  const int b = blockIdx.y;
  if (ch >= conv_dim || b >= batch_size)
    return;
  // Pointer to this batch/channel's conv state
  T *cs = conv_state + (b * conv_dim + ch) * kernel_size;
  const T *w = weight + ch * kernel_size;
  // Shift state left by 1
  for (int i = 0; i < kernel_size - 1; i++) {
    cs[i] = cs[i + 1];
  }
  // Insert new value
  cs[kernel_size - 1] = x[b * conv_dim + ch];
  // Dot product with weight
  float acc = 0.0f;
  for (int i = 0; i < kernel_size; i++) {
    acc += (float)cs[i] * (float)w[i];
  }
  // SiLU activation: x * sigmoid(x)
  float sig = 1.0f / (1.0f + expf(-acc));
  float result = acc * sig;
  output[b * conv_dim + ch] = (T)result;
 }
 extern "C" void causal_conv1d_update(const void *x, const void *weight,
                                     void *conv_state, void *output,
                                     int batch_size, int conv_dim,
                                     int kernel_size, int dtype,
                                     int64_t stream) {
  const cudaStream_t custream = (cudaStream_t)stream;
  dim3 block(256);
  dim3 grid((conv_dim + 255) / 256, batch_size);
  if (dtype == 0) {
    // f16
    causal_conv1d_update_kernel<__half><<<grid, block, 0, custream>>>(
        (const __half *)x, (const __half *)weight, (__half *)conv_state,
        (__half *)output, batch_size, conv_dim, kernel_size);
  } else {
    // bf16
    causal_conv1d_update_kernel<__nv_bfloat16><<<grid, block, 0, custream>>>(
        (const __nv_bfloat16 *)x, (const __nv_bfloat16 *)weight,
        (__nv_bfloat16 *)conv_state, (__nv_bfloat16 *)output, batch_size,
        conv_dim, kernel_size);
  }
 }
 // ============================================================================
 // Kernel 2b: causal_conv1d_full (prefill path)
 //
 // Each thread handles one (channel, position): causal window with
 // zero-padding, dot product with weight, SiLU.
 // A second pass writes the conv_state from the last kernel_size positions.
 //
 // x: [B, conv_dim, S]  weight: [conv_dim, kernel_size]
 // conv_state_out: [B, conv_dim, kernel_size]  output: [B, conv_dim, S]
 // ============================================================================
 template <typename T>
 __global__ void causal_conv1d_full_kernel(
    const T *__restrict__ x,      // [B, conv_dim, S]
    const T *__restrict__ weight, // [conv_dim, kernel_size]
    T *__restrict__ output,       // [B, conv_dim, S]
    int batch_size, int conv_dim, int seq_len, int kernel_size) {
  const int ch = blockIdx.x * blockDim.x + threadIdx.x;
  const int pos = blockIdx.y;
  const int b = blockIdx.z;
  if (ch >= conv_dim || pos >= seq_len || b >= batch_size)
    return;
  const T *x_bch = x + (b * conv_dim + ch) * seq_len;
  const T *w = weight + ch * kernel_size;
  // Causal convolution: sum over kernel_size window ending at pos
  float acc = 0.0f;
  for (int i = 0; i < kernel_size; i++) {
    int src_pos = pos - (kernel_size - 1) + i;
    float x_val = (src_pos >= 0) ? (float)x_bch[src_pos] : 0.0f;
    acc += x_val * (float)w[i];
  }
  // SiLU
  float sig = 1.0f / (1.0f + expf(-acc));
  float result = acc * sig;
  output[(b * conv_dim + ch) * seq_len + pos] = (T)result;
 }
 template <typename T>
 __global__ void save_conv_state_kernel(
    const T *__restrict__ x,        // [B, conv_dim, S]
    T *__restrict__ conv_state_out, // [B, conv_dim, kernel_size]
    int batch_size, int conv_dim, int seq_len, int kernel_size) {
  const int ch = blockIdx.x * blockDim.x + threadIdx.x;
  const int b = blockIdx.y;
  if (ch >= conv_dim || b >= batch_size)
    return;
  const T *x_bch = x + (b * conv_dim + ch) * seq_len;
  T *cs = conv_state_out + (b * conv_dim + ch) * kernel_size;
  // Save last kernel_size positions (zero-pad if seq_len < kernel_size)
  int pad = kernel_size - seq_len;
  for (int i = 0; i < kernel_size; i++) {
    if (i < pad) {
      cs[i] = (T)0.0f;
    } else {
      cs[i] = x_bch[seq_len - kernel_size + i];
    }
  }
 }
 extern "C" void causal_conv1d_full(const void *x, const void *weight,
                                   void *conv_state_out, void *output,
                                   int batch_size, int conv_dim, int seq_len,
                                   int kernel_size, int dtype, int64_t stream) {
  const cudaStream_t custream = (cudaStream_t)stream;
  // Main convolution kernel
  dim3 block(256);
  dim3 grid((conv_dim + 255) / 256, seq_len, batch_size);
  if (dtype == 0) {
    causal_conv1d_full_kernel<__half><<<grid, block, 0, custream>>>(
        (const __half *)x, (const __half *)weight, (__half *)output, batch_size,
        conv_dim, seq_len, kernel_size);
    // Save conv state
    dim3 grid2((conv_dim + 255) / 256, batch_size);
    save_conv_state_kernel<__half><<<grid2, block, 0, custream>>>(
        (const __half *)x, (__half *)conv_state_out, batch_size, conv_dim,
        seq_len, kernel_size);
  } else {
    causal_conv1d_full_kernel<__nv_bfloat16><<<grid, block, 0, custream>>>(
        (const __nv_bfloat16 *)x, (const __nv_bfloat16 *)weight,
        (__nv_bfloat16 *)output, batch_size, conv_dim, seq_len, kernel_size);
    dim3 grid2((conv_dim + 255) / 256, batch_size);
    save_conv_state_kernel<__nv_bfloat16><<<grid2, block, 0, custream>>>(
        (const __nv_bfloat16 *)x, (__nv_bfloat16 *)conv_state_out, batch_size,
        conv_dim, seq_len, kernel_size);
  }
 }
 // ============================================================================
 // Kernel 3: fused_gdn_gating
 //
 // Fuses: beta = sigmoid(b), g = -exp(a_log) * softplus(a + dt_bias)
 // a_log and dt_bias are per-head (broadcast over batch*seq).
 //
 // b, a: [total]  a_log, dt_bias: [num_heads]
 // beta_out, g_out: [total]
 // ============================================================================
 template <typename T>
 __global__ void
 fused_gdn_gating_kernel(const T *__restrict__ b,           // [total]
                        const T *__restrict__ a,           // [total]
                        const float *__restrict__ a_log,   // [num_heads]
                        const float *__restrict__ dt_bias, // [num_heads]
                        T *__restrict__ beta_out,          // [total]
                        T *__restrict__ g_out,             // [total]
                        int total_elements, int num_heads) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= total_elements)
    return;
  // Head index: elements are laid out as [..., num_heads]
  int head_idx = idx % num_heads;
  // beta = sigmoid(b)
  float b_val = (float)b[idx];
  float beta = 1.0f / (1.0f + expf(-b_val));
  // g = -exp(a_log) * softplus(a + dt_bias)
  float a_val = (float)a[idx];
  float a_log_val = a_log[head_idx];
  float dt_bias_val = dt_bias[head_idx];
  float sp_input = a_val + dt_bias_val;
  float softplus_val = logf(1.0f + expf(sp_input));
  float g_val = -expf(a_log_val) * softplus_val;
  beta_out[idx] = (T)beta;
  g_out[idx] = (T)g_val;
 }
 extern "C" void fused_gdn_gating(const void *b, const void *a,
                                 const float *a_log, const float *dt_bias,
                                 void *beta_out, void *g_out,
                                 int total_elements, int num_heads, int dtype,
                                 int64_t stream) {
  const cudaStream_t custream = (cudaStream_t)stream;
  dim3 block(256);
  dim3 grid((total_elements + 255) / 256);
  if (dtype == 0) {
    fused_gdn_gating_kernel<__half><<<grid, block, 0, custream>>>(
        (const __half *)b, (const __half *)a, a_log, dt_bias,
        (__half *)beta_out, (__half *)g_out, total_elements, num_heads);
  } else {
    fused_gdn_gating_kernel<__nv_bfloat16><<<grid, block, 0, custream>>>(
        (const __nv_bfloat16 *)b, (const __nv_bfloat16 *)a, a_log, dt_bias,
        (__nv_bfloat16 *)beta_out, (__nv_bfloat16 *)g_out, total_elements,
        num_heads);
  }
 }
--- a/crates/neuron/src/cuda/gdn.rs
+++ b/crates/neuron/src/cuda/gdn.rs
@@ -0,0 +1,486 @@
 //! Rust wrappers around the Gated DeltaNet CUDA kernels in `gdn.cu`.
 //!
 //! Ported verbatim from `EricLBuehler/mistral.rs` under MIT terms.
 //! Upstream path: `mistralrs-core/src/cuda/gdn.rs`. The only edits in
 //! this file are this header comment — the FFI path module name is
 //! `crate::cuda::ffi`, identical to upstream's layout.
 #![allow(clippy::cast_possible_truncation)]
 use candle_core::{Result, Tensor};
 #[cfg(feature = "cuda")]
 use candle_core::DType;
 /// CUDA-accelerated gated delta rule recurrence.
 ///
 /// Inputs (all contiguous, f32):
 ///   q, k: [BH, S, K]  v: [BH, S, V]  g, beta: [BH, S]
 ///   state: [BH, K, V] (mutated in place)
 ///
 /// Returns: output [BH, S, V]
 #[cfg(feature = "cuda")]
 pub fn gated_delta_rule_recurrence_cuda(
    q: &Tensor,
    k: &Tensor,
    v: &Tensor,
    g: &Tensor,
    beta: &Tensor,
    state: &mut Tensor,
 ) -> Result<Tensor> {
    use candle::cuda_backend::cudarc::driver::DevicePtr;
    use candle_core as candle;
    let (bh, seq_len, k_dim) = q.dims3()?;
    let v_dim = v.dim(2)?;
    let dev = q.device().as_cuda_device()?;
    let (q_s, q_l) = q.storage_and_layout();
    let q_s = match &*q_s {
        candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
        _ => candle::bail!("q must be a cuda tensor"),
    };
    let q_offset = q_l.start_offset();
    let (k_s, k_l) = k.storage_and_layout();
    let k_s = match &*k_s {
        candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
        _ => candle::bail!("k must be a cuda tensor"),
    };
    let k_offset = k_l.start_offset();
    let (v_s, v_l) = v.storage_and_layout();
    let v_s = match &*v_s {
        candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
        _ => candle::bail!("v must be a cuda tensor"),
    };
    let v_offset = v_l.start_offset();
    let (g_s, g_l) = g.storage_and_layout();
    let g_s = match &*g_s {
        candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
        _ => candle::bail!("g must be a cuda tensor"),
    };
    let g_offset = g_l.start_offset();
    let (beta_s, beta_l) = beta.storage_and_layout();
    let beta_s = match &*beta_s {
        candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
        _ => candle::bail!("beta must be a cuda tensor"),
    };
    let beta_offset = beta_l.start_offset();
    let (state_s, state_l) = state.storage_and_layout();
    let state_s = match &*state_s {
        candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
        _ => candle::bail!("state must be a cuda tensor"),
    };
    let state_offset = state_l.start_offset();
    let output_buf = unsafe { dev.alloc::<f32>(bh * seq_len * v_dim) }?;
    let stream = dev.cuda_stream().cu_stream() as i64;
    unsafe {
        crate::cuda::ffi::gated_delta_rule_recurrence(
            q_s.slice(q_offset..).device_ptr(q_s.stream()).0 as *const f32,
            k_s.slice(k_offset..).device_ptr(k_s.stream()).0 as *const f32,
            v_s.slice(v_offset..).device_ptr(v_s.stream()).0 as *const f32,
            g_s.slice(g_offset..).device_ptr(g_s.stream()).0 as *const f32,
            beta_s.slice(beta_offset..).device_ptr(beta_s.stream()).0 as *const f32,
            state_s.slice(state_offset..).device_ptr(state_s.stream()).0 as *mut f32,
            output_buf.device_ptr(output_buf.stream()).0 as *mut f32,
            bh as i32,
            seq_len as i32,
            k_dim as i32,
            v_dim as i32,
            stream,
        );
    }
    // The kernel wrote state in-place via the raw pointer; rewrap
    // (state tensor's underlying CudaSlice was modified directly)
    let output_storage = candle::CudaStorage::wrap_cuda_slice(output_buf, dev.clone());
    Ok(Tensor::from((
        candle::Storage::Cuda(output_storage),
        (bh, seq_len, v_dim),
    )))
 }
 #[cfg(not(feature = "cuda"))]
 #[allow(unused)]
 pub fn gated_delta_rule_recurrence_cuda(
    _q: &Tensor,
    _k: &Tensor,
    _v: &Tensor,
    _g: &Tensor,
    _beta: &Tensor,
    _state: &mut Tensor,
 ) -> Result<Tensor> {
    candle_core::bail!("gated_delta_rule_recurrence_cuda requires the cuda feature")
 }
 /// CUDA-accelerated chunked gated delta rule recurrence (prefill optimization).
 ///
 /// Processes prefill tokens in 64-token chunks instead of one at a time.
 /// Same interface as `gated_delta_rule_recurrence_cuda`.
 ///
 /// Inputs (all contiguous, f32):
 ///   q, k: [BH, S, K]  v: [BH, S, V]  g, beta: [BH, S]
 ///   state: [BH, K, V] (mutated in place)
 ///
 /// Returns: output [BH, S, V]
 #[cfg(feature = "cuda")]
 pub fn chunked_gated_delta_rule_recurrence_cuda(
    q: &Tensor,
    k: &Tensor,
    v: &Tensor,
    g: &Tensor,
    beta: &Tensor,
    state: &mut Tensor,
 ) -> Result<Tensor> {
    use candle::cuda_backend::cudarc::driver::DevicePtr;
    use candle_core as candle;
    let (bh, seq_len, k_dim) = q.dims3()?;
    let v_dim = v.dim(2)?;
    let dev = q.device().as_cuda_device()?;
    let (q_s, q_l) = q.storage_and_layout();
    let q_s = match &*q_s {
        candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
        _ => candle::bail!("q must be a cuda tensor"),
    };
    let q_offset = q_l.start_offset();
    let (k_s, k_l) = k.storage_and_layout();
    let k_s = match &*k_s {
        candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
        _ => candle::bail!("k must be a cuda tensor"),
    };
    let k_offset = k_l.start_offset();
    let (v_s, v_l) = v.storage_and_layout();
    let v_s = match &*v_s {
        candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
        _ => candle::bail!("v must be a cuda tensor"),
    };
    let v_offset = v_l.start_offset();
    let (g_s, g_l) = g.storage_and_layout();
    let g_s = match &*g_s {
        candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
        _ => candle::bail!("g must be a cuda tensor"),
    };
    let g_offset = g_l.start_offset();
    let (beta_s, beta_l) = beta.storage_and_layout();
    let beta_s = match &*beta_s {
        candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
        _ => candle::bail!("beta must be a cuda tensor"),
    };
    let beta_offset = beta_l.start_offset();
    let (state_s, state_l) = state.storage_and_layout();
    let state_s = match &*state_s {
        candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
        _ => candle::bail!("state must be a cuda tensor"),
    };
    let state_offset = state_l.start_offset();
    let output_buf = unsafe { dev.alloc::<f32>(bh * seq_len * v_dim) }?;
    let stream = dev.cuda_stream().cu_stream() as i64;
    unsafe {
        crate::cuda::ffi::chunked_gated_delta_rule_recurrence(
            q_s.slice(q_offset..).device_ptr(q_s.stream()).0 as *const f32,
            k_s.slice(k_offset..).device_ptr(k_s.stream()).0 as *const f32,
            v_s.slice(v_offset..).device_ptr(v_s.stream()).0 as *const f32,
            g_s.slice(g_offset..).device_ptr(g_s.stream()).0 as *const f32,
            beta_s.slice(beta_offset..).device_ptr(beta_s.stream()).0 as *const f32,
            state_s.slice(state_offset..).device_ptr(state_s.stream()).0 as *mut f32,
            output_buf.device_ptr(output_buf.stream()).0 as *mut f32,
            bh as i32,
            seq_len as i32,
            k_dim as i32,
            v_dim as i32,
            stream,
        );
    }
    let output_storage = candle::CudaStorage::wrap_cuda_slice(output_buf, dev.clone());
    Ok(Tensor::from((
        candle::Storage::Cuda(output_storage),
        (bh, seq_len, v_dim),
    )))
 }
 #[cfg(not(feature = "cuda"))]
 #[allow(unused)]
 pub fn chunked_gated_delta_rule_recurrence_cuda(
    _q: &Tensor,
    _k: &Tensor,
    _v: &Tensor,
    _g: &Tensor,
    _beta: &Tensor,
    _state: &mut Tensor,
 ) -> Result<Tensor> {
    candle_core::bail!("chunked_gated_delta_rule_recurrence_cuda requires the cuda feature")
 }
 /// CUDA-accelerated causal conv1d (both update and full paths).
 ///
 /// For update (is_update=true):
 ///   x: [B, conv_dim, 1]  weight: [conv_dim, kernel_size]
 ///   conv_state: [B, conv_dim, kernel_size] (mutated in place for update)
 ///   Returns: (output [B, conv_dim, 1], updated conv_state)
 ///
 /// For full (is_update=false):
 ///   x: [B, conv_dim, S]  weight: [conv_dim, kernel_size]
 ///   Returns: (output [B, conv_dim, S], new conv_state [B, conv_dim, kernel_size])
 #[cfg(feature = "cuda")]
 pub fn causal_conv1d_cuda(
    x: &Tensor,
    weight: &Tensor,
    conv_state: &Tensor,
    kernel_size: usize,
    is_update: bool,
 ) -> Result<(Tensor, Tensor)> {
    use candle::cuda_backend::cudarc::driver::DevicePtr;
    use candle_core as candle;
    use core::ffi::c_void;
    fn cuda_fwd<
        T: candle::cuda_backend::CudaDType + candle::cuda_backend::cudarc::driver::DeviceRepr,
    >(
        x: &Tensor,
        weight: &Tensor,
        conv_state: &Tensor,
        kernel_size: usize,
        is_update: bool,
        dtype_code: i32,
    ) -> Result<(Tensor, Tensor)> {
        let dev = x.device().as_cuda_device()?;
        let (batch_size, conv_dim, seq_len) = x.dims3()?;
        let (x_s, x_l) = x.storage_and_layout();
        let x_s = match &*x_s {
            candle::Storage::Cuda(c) => c.as_cuda_slice::<T>()?,
            _ => candle::bail!("x must be a cuda tensor"),
        };
        let x_offset = x_l.start_offset();
        let (w_s, w_l) = weight.storage_and_layout();
        let w_s = match &*w_s {
            candle::Storage::Cuda(c) => c.as_cuda_slice::<T>()?,
            _ => candle::bail!("weight must be a cuda tensor"),
        };
        let w_offset = w_l.start_offset();
        let stream = dev.cuda_stream().cu_stream() as i64;
        if is_update {
            // Clone conv_state so the kernel can mutate it in place
            let conv_state_new = conv_state.clone();
            let output_buf = unsafe { dev.alloc::<T>(batch_size * conv_dim) }?;
            // Scope the borrow of conv_state_new so we can move it later
            {
                let (cs_s, cs_l) = conv_state_new.storage_and_layout();
                let cs_s = match &*cs_s {
                    candle::Storage::Cuda(c) => c.as_cuda_slice::<T>()?,
                    _ => candle::bail!("conv_state must be a cuda tensor"),
                };
                let cs_offset = cs_l.start_offset();
                unsafe {
                    crate::cuda::ffi::causal_conv1d_update(
                        x_s.slice(x_offset..).device_ptr(x_s.stream()).0 as *const c_void,
                        w_s.slice(w_offset..).device_ptr(w_s.stream()).0 as *const c_void,
                        cs_s.slice(cs_offset..).device_ptr(cs_s.stream()).0 as *mut c_void,
                        output_buf.device_ptr(output_buf.stream()).0 as *mut c_void,
                        batch_size as i32,
                        conv_dim as i32,
                        kernel_size as i32,
                        dtype_code,
                        stream,
                    );
                }
            }
            let output_storage = candle::CudaStorage::wrap_cuda_slice(output_buf, dev.clone());
            let output = Tensor::from((
                candle::Storage::Cuda(output_storage),
                (batch_size, conv_dim, 1usize),
            ));
            Ok((output, conv_state_new))
        } else {
            // Full path: allocate new conv_state and output
            let output_buf = unsafe { dev.alloc::<T>(batch_size * conv_dim * seq_len) }?;
            let cs_buf = unsafe { dev.alloc::<T>(batch_size * conv_dim * kernel_size) }?;
            unsafe {
                crate::cuda::ffi::causal_conv1d_full(
                    x_s.slice(x_offset..).device_ptr(x_s.stream()).0 as *const c_void,
                    w_s.slice(w_offset..).device_ptr(w_s.stream()).0 as *const c_void,
                    cs_buf.device_ptr(cs_buf.stream()).0 as *mut c_void,
                    output_buf.device_ptr(output_buf.stream()).0 as *mut c_void,
                    batch_size as i32,
                    conv_dim as i32,
                    seq_len as i32,
                    kernel_size as i32,
                    dtype_code,
                    stream,
                );
            }
            let output_storage = candle::CudaStorage::wrap_cuda_slice(output_buf, dev.clone());
            let output = Tensor::from((
                candle::Storage::Cuda(output_storage),
                (batch_size, conv_dim, seq_len),
            ));
            let cs_storage = candle::CudaStorage::wrap_cuda_slice(cs_buf, dev.clone());
            let new_conv_state = Tensor::from((
                candle::Storage::Cuda(cs_storage),
                (batch_size, conv_dim, kernel_size),
            ));
            Ok((output, new_conv_state))
        }
    }
    match x.dtype() {
        DType::F16 => cuda_fwd::<half::f16>(x, weight, conv_state, kernel_size, is_update, 0),
        DType::BF16 => cuda_fwd::<half::bf16>(x, weight, conv_state, kernel_size, is_update, 1),
        other => candle_core::bail!("causal_conv1d_cuda only supports f16/bf16, got {:?}", other),
    }
 }
 #[cfg(not(feature = "cuda"))]
 #[allow(unused)]
 pub fn causal_conv1d_cuda(
    _x: &Tensor,
    _weight: &Tensor,
    _conv_state: &Tensor,
    _kernel_size: usize,
    _is_update: bool,
 ) -> Result<(Tensor, Tensor)> {
    candle_core::bail!("causal_conv1d_cuda requires the cuda feature")
 }
 /// CUDA-accelerated fused GDN gating computation.
 ///
 /// Computes: beta = sigmoid(b), g = -exp(a_log) * softplus(a + dt_bias)
 ///
 /// b, a: [total_elements] in f16/bf16
 /// a_log, dt_bias: [num_heads] in f32
 ///
 /// Returns: (beta, g) in original dtype
 #[cfg(feature = "cuda")]
 pub fn fused_gdn_gating_cuda(
    b: &Tensor,
    a: &Tensor,
    a_log: &Tensor,
    dt_bias: &Tensor,
 ) -> Result<(Tensor, Tensor)> {
    use candle::cuda_backend::cudarc::driver::DevicePtr;
    use candle_core as candle;
    use core::ffi::c_void;
    fn cuda_fwd<
        T: candle::cuda_backend::CudaDType + candle::cuda_backend::cudarc::driver::DeviceRepr,
    >(
        b: &Tensor,
        a: &Tensor,
        a_log: &Tensor,
        dt_bias: &Tensor,
        dtype_code: i32,
    ) -> Result<(Tensor, Tensor)> {
        let total_elements = b.elem_count();
        let num_heads = a_log.elem_count();
        let shape = b.shape().clone();
        let dev = b.device().as_cuda_device()?;
        let (b_s, b_l) = b.storage_and_layout();
        let b_s = match &*b_s {
            candle::Storage::Cuda(c) => c.as_cuda_slice::<T>()?,
            _ => candle::bail!("b must be a cuda tensor"),
        };
        let b_offset = b_l.start_offset();
        let (a_s, a_l) = a.storage_and_layout();
        let a_s = match &*a_s {
            candle::Storage::Cuda(c) => c.as_cuda_slice::<T>()?,
            _ => candle::bail!("a must be a cuda tensor"),
        };
        let a_offset = a_l.start_offset();
        let (alog_s, alog_l) = a_log.storage_and_layout();
        let alog_s = match &*alog_s {
            candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
            _ => candle::bail!("a_log must be a cuda tensor"),
        };
        let alog_offset = alog_l.start_offset();
        let (dtb_s, dtb_l) = dt_bias.storage_and_layout();
        let dtb_s = match &*dtb_s {
            candle::Storage::Cuda(c) => c.as_cuda_slice::<f32>()?,
            _ => candle::bail!("dt_bias must be a cuda tensor"),
        };
        let dtb_offset = dtb_l.start_offset();
        let beta_buf = unsafe { dev.alloc::<T>(total_elements) }?;
        let g_buf = unsafe { dev.alloc::<T>(total_elements) }?;
        let stream = dev.cuda_stream().cu_stream() as i64;
        unsafe {
            crate::cuda::ffi::fused_gdn_gating(
                b_s.slice(b_offset..).device_ptr(b_s.stream()).0 as *const c_void,
                a_s.slice(a_offset..).device_ptr(a_s.stream()).0 as *const c_void,
                alog_s.slice(alog_offset..).device_ptr(alog_s.stream()).0 as *const f32,
                dtb_s.slice(dtb_offset..).device_ptr(dtb_s.stream()).0 as *const f32,
                beta_buf.device_ptr(beta_buf.stream()).0 as *mut c_void,
                g_buf.device_ptr(g_buf.stream()).0 as *mut c_void,
                total_elements as i32,
                num_heads as i32,
                dtype_code,
                stream,
            );
        }
        let beta_storage = candle::CudaStorage::wrap_cuda_slice(beta_buf, dev.clone());
        let beta = Tensor::from((candle::Storage::Cuda(beta_storage), shape.clone()));
        let g_storage = candle::CudaStorage::wrap_cuda_slice(g_buf, dev.clone());
        let g = Tensor::from((candle::Storage::Cuda(g_storage), shape));
        Ok((beta, g))
    }
    match b.dtype() {
        DType::F16 => cuda_fwd::<half::f16>(b, a, a_log, dt_bias, 0),
        DType::BF16 => cuda_fwd::<half::bf16>(b, a, a_log, dt_bias, 1),
        other => candle_core::bail!(
            "fused_gdn_gating_cuda only supports f16/bf16, got {:?}",
            other
        ),
    }
 }
 #[cfg(not(feature = "cuda"))]
 #[allow(unused)]
 pub fn fused_gdn_gating_cuda(
    _b: &Tensor,
    _a: &Tensor,
    _a_log: &Tensor,
    _dt_bias: &Tensor,
 ) -> Result<(Tensor, Tensor)> {
    candle_core::bail!("fused_gdn_gating_cuda requires the cuda feature")
 }
--- a/crates/neuron/src/cuda/mod.rs
+++ b/crates/neuron/src/cuda/mod.rs
@@ -0,0 +1,15 @@
 //! CUDA kernels and their Rust wrappers.
 //!
 //! Currently scoped to what we need for Qwen3-Next (`qwen3_5`)
 //! inference performance — the Gated DeltaNet kernels ported from
 //! `EricLBuehler/mistral.rs` (MIT). Each kernel lives in a `.cu`
 //! file alongside this module; `build.rs` compiles them all into a
 //! static lib via `cudaforge` and links it under the `cuda` feature.
 //!
 //! When we absorb more upstream kernels (MoE GEMM, top-k, Mamba SSM,
 //! etc.) they land here in their own `.cu` + `.rs` pairs.
 #[cfg(feature = "cuda")]
 pub mod ffi;
 #[cfg(feature = "cuda")]
 pub mod gdn;
--- a/crates/neuron/src/lib.rs
+++ b/crates/neuron/src/lib.rs
@@ -1,5 +1,6 @@
 pub mod api;
 pub mod config;
 pub mod cuda;
 pub mod discovery;
 pub mod harness;
 pub mod health;