diag(stage-8d-6): per-layer VRAM logging in TP load path
All checks were successful
build-prerelease / Resolve version stamps (push) Successful in 30s
CI / Format (push) Successful in 33s
CI / Clippy (push) Successful in 2m14s
build-prerelease / Build neuron-blackwell (push) Successful in 3m59s
CI / Test (push) Successful in 4m58s
build-prerelease / Build cortex binary (push) Successful in 4m36s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Package cortex RPM (push) Successful in 1m26s
build-prerelease / Build neuron-ampere (push) Successful in 4m52s
build-prerelease / Build neuron-ada (push) Successful in 5m11s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m56s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m1s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m52s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m0s

Wraps each TpQwen3_5DecoderLayer::load in a with_context that captures
free/total VRAM on failure, plus an info-level log after every layer
that succeeds. Uses cudarc::driver::result::mem_get_info — same API
mistralrs uses.

Diagnostic only: forward path is unchanged. Helps distinguish true
VRAM exhaustion from allocator fragmentation when loading large
models at BF16 on 2x consumer GPUs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-21 12:54:05 +03:00
parent cc95fe28d9
commit 89d98d1fb2

View File

@@ -782,8 +782,9 @@ impl TpQwen3_5Model {
let vb_l = text_vb.pp("layers");
let mut layers = Vec::with_capacity(cfg.num_hidden_layers);
log_vram(&device, rank, "before layer 0");
for i in 0..cfg.num_hidden_layers {
layers.push(TpQwen3_5DecoderLayer::load(
let layer = TpQwen3_5DecoderLayer::load(
cfg,
rotary.clone(),
i,
@@ -791,7 +792,13 @@ impl TpQwen3_5Model {
rank,
world_size,
comm.clone(),
)?);
)
.with_context(|| {
let (free_mb, total_mb) = cuda_mem_mb(&device);
format!("load layer {i} (rank {rank}): free={free_mb}MB / total={total_mb}MB")
})?;
layers.push(layer);
log_vram(&device, rank, &format!("after layer {i}"));
}
let norm = Qwen3_5RmsNorm::load(&text_vb.pp("norm"), cfg.hidden_size, cfg.rms_norm_eps)?;
@@ -1053,3 +1060,48 @@ fn load_fused_qkv_slice_3d(
.contiguous()
.with_context(|| format!("materialise fused conv slice for rank {r}"))
}
/// Reports free and total device memory for `device`, in whole mebibytes.
///
/// Returns `(free_mb, total_mb)`. Every failure mode collapses to `(0, 0)`:
/// a non-CUDA device, a failed context bind, or a failed driver query.
/// This keeps the helper safe to call from logging and error-context code
/// without introducing a new failure path into model loading.
#[cfg(feature = "cuda")]
fn cuda_mem_mb(device: &Device) -> (usize, usize) {
    use candle_core::cuda::cudarc::driver::result;
    use candle_core::cuda_backend::WrapErr;

    // Byte counts are floor-divided by this to get whole MiB.
    const BYTES_PER_MB: usize = 1024 * 1024;

    let Device::Cuda(cuda_dev) = device else {
        return (0, 0);
    };

    // `mem_get_info` takes no device argument, so bind this device's context
    // to the calling thread first; give up quietly if that fails.
    if cuda_dev.cuda_stream().context().bind_to_thread().w().is_err() {
        return (0, 0);
    }

    result::mem_get_info()
        .map(|(free_bytes, total_bytes)| (free_bytes / BYTES_PER_MB, total_bytes / BYTES_PER_MB))
        .unwrap_or((0, 0))
}
/// Stand-in for the VRAM query, compiled when the `cuda` feature is off.
///
/// Always returns `(0, 0)`, the same "no reading" value the CUDA build
/// returns when its query fails, so callers need no feature gating.
#[cfg(not(feature = "cuda"))]
// NOTE(review): presumably the callers can be compiled out in non-cuda
// builds, which would leave this unused — confirm the lint is still needed.
#[allow(dead_code)]
fn cuda_mem_mb(_device: &Device) -> (usize, usize) {
    // Nothing to measure without a CUDA device.
    const NO_READING: (usize, usize) = (0, 0);
    NO_READING
}
/// Emits one info-level event on target `neuron::tp::load` carrying `rank`,
/// `free_mb` and `total_mb`, with `tag` as the message.
///
/// Emits nothing when `cuda_mem_mb` reports a total of zero, which is what
/// it returns for non-CUDA devices and for failed queries.
#[cfg(feature = "cuda")]
fn log_vram(device: &Device, rank: u32, tag: &str) {
    match cuda_mem_mb(device) {
        // A zero total is the "no reading" sentinel: stay silent.
        (_, 0) => {}
        (free_mb, total_mb) => {
            // The event's field names come from these binding names, so they
            // are deliberately kept as `free_mb` / `total_mb`.
            tracing::info!(target: "neuron::tp::load", rank, free_mb, total_mb, "{tag}");
        }
    }
}
/// Stand-in for VRAM logging, compiled when the `cuda` feature is off.
///
/// Accepts the same arguments as the CUDA build and does nothing with them.
#[cfg(not(feature = "cuda"))]
// NOTE(review): presumably the callers can be compiled out in non-cuda
// builds, which would leave this unused — confirm the lint is still needed.
#[allow(dead_code)]
fn log_vram(_device: &Device, _rank: u32, _tag: &str) {
    // Intentionally empty: there is no VRAM to report without CUDA.
}