fix(neuron): trim cudarc mempool after clear_kv_cache to release VRAM

cudarc's stream-ordered memory pool retains freed blocks (cuMemFreeAsync returns memory to the device's default mempool, not to the OS), so mem_get_info under-reports free VRAM between requests. With Qwen/Qwen3.6-27B TP=2, the second consecutive chat completion saw ~4.5 GB of "missing" free VRAM and either OOMed or tripped cuBLAS into CUBLAS_STATUS_INTERNAL_ERROR depending on quant. Add a cuda-gated trim_device_pool helper that, after each successful clear_kv_cache, synchronizes the context and calls cuMemPoolTrimTo(pool, 0) against the device's default mempool. Failures (no async-alloc support, transient driver errors) are non-fatal and log at debug. The before/after free-VRAM delta is logged so an operator can correlate the trim with the next request's prefill VRAM. ConcatKvCache::reset() in candle-nn 0.10.2 already drops its tensors correctly; the leak was strictly at the cudarc pool layer. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 12:36:13 +03:00
parent c4954e0eed
commit cdf0f4e66d
1 changed files with 75 additions and 0 deletions
--- a/crates/neuron/src/harness/device_worker/dispatch.rs
+++ b/crates/neuron/src/harness/device_worker/dispatch.rs
@@ -144,6 +144,9 @@ pub(crate) fn run(device_index: u32, rx: Receiver<Job>, poisoned: Arc<AtomicBool
                    Some(arch) => arch.clear_kv_cache(),
                    None => Err(anyhow::anyhow!("ClearKv: no model for handle {}", handle.0)),
                };
                if result.is_ok() {
                    trim_device_pool(&state);
                }
                let _ = reply.send(result);
            }
            Job::ForwardLogits {
@@ -214,6 +217,9 @@ pub(crate) fn run(device_index: u32, rx: Receiver<Job>, poisoned: Arc<AtomicBool
                        handle.0
                    )),
                };
                if result.is_ok() {
                    trim_device_pool(&state);
                }
                let _ = reply.send(result);
            }
            #[cfg(feature = "cuda")]
@@ -338,6 +344,75 @@ fn query_vram(_state: &DeviceWorkerState) -> anyhow::Result<(u64, u64)> {
    Ok((0, 0))
 }
 /// Force cudarc's stream-ordered memory pool to release every block it
 /// is holding back to the system. After `ConcatKvCache::reset()` drops
 /// its tensors, the underlying `CudaSlice::drop` calls `cuMemFreeAsync`,
 /// which returns the blocks to the device's default mempool but not to
 /// the OS — `mem_get_info` still reports them as used. The next
 /// request's prefill then sees a falsely-small free pool and either
 /// OOMs or trips cuBLAS into `CUBLAS_STATUS_INTERNAL_ERROR`.
 ///
 /// Calling `cuMemPoolTrimTo(pool, 0)` after each `clear_kv_cache`
 /// returns those blocks. We synchronize first so any pending
 /// `cuMemFreeAsync` operations have settled. Failures are non-fatal:
 /// the pool may not exist on legacy drivers, or a transient driver
 /// error may prevent the trim — neither breaks correctness, the next
 /// request just sees a less-recovered free pool.
 #[cfg(feature = "cuda")]
 fn trim_device_pool(state: &DeviceWorkerState) {
    use candle_core::cuda::cudarc::driver::result::{device, mem_pool};
    let Some(ctx) = state.ctx.as_ref() else {
        return;
    };
    let (before_free, _) = match query_vram(state) {
        Ok(v) => v,
        Err(_) => (0, 0),
    };
    if let Err(e) = ctx.synchronize() {
        tracing::debug!(
            device_index = state.device_index,
            error = ?e,
            "trim_device_pool: synchronize failed; skipping trim"
        );
        return;
    }
    let dev = ctx.cu_device();
    let pool = match unsafe { device::get_default_mem_pool(dev) } {
        Ok(p) => p,
        Err(e) => {
            tracing::debug!(
                device_index = state.device_index,
                error = ?e,
                "trim_device_pool: get_default_mem_pool failed"
            );
            return;
        }
    };
    if let Err(e) = unsafe { mem_pool::trim_to(pool, 0) } {
        tracing::debug!(
            device_index = state.device_index,
            error = ?e,
            "trim_device_pool: cuMemPoolTrimTo failed"
        );
        return;
    }
    let (after_free, _) = match query_vram(state) {
        Ok(v) => v,
        Err(_) => (0, 0),
    };
    let freed_mb = after_free.saturating_sub(before_free);
    tracing::debug!(
        device_index = state.device_index,
        before_free_mb = before_free,
        after_free_mb = after_free,
        freed_mb,
        "trim_device_pool: trimmed pool"
    );
 }
 #[cfg(not(feature = "cuda"))]
 fn trim_device_pool(_state: &DeviceWorkerState) {}
 /// Insert a freshly-built `ModelArch` into the slab and mint a fresh
 /// `ArchHandle`. Used by both `LoadGguf` and `LoadDense` dispatch
 /// handlers — they differ only in *how* the arch is built; the