diag(stage-8d-6): per-layer VRAM logging in TP load path
All checks were successful
build-prerelease / Resolve version stamps (push) Successful in 30s
CI / Format (push) Successful in 33s
CI / Clippy (push) Successful in 2m14s
build-prerelease / Build neuron-blackwell (push) Successful in 3m59s
CI / Test (push) Successful in 4m58s
build-prerelease / Build cortex binary (push) Successful in 4m36s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Package cortex RPM (push) Successful in 1m26s
build-prerelease / Build neuron-ampere (push) Successful in 4m52s
build-prerelease / Build neuron-ada (push) Successful in 5m11s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m56s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m1s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m52s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m0s
All checks were successful
build-prerelease / Resolve version stamps (push) Successful in 30s
CI / Format (push) Successful in 33s
CI / Clippy (push) Successful in 2m14s
build-prerelease / Build neuron-blackwell (push) Successful in 3m59s
CI / Test (push) Successful in 4m58s
build-prerelease / Build cortex binary (push) Successful in 4m36s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Package cortex RPM (push) Successful in 1m26s
build-prerelease / Build neuron-ampere (push) Successful in 4m52s
build-prerelease / Build neuron-ada (push) Successful in 5m11s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m56s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m1s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m52s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m0s
Wraps each `TpQwen3_5DecoderLayer::load` call in a `with_context` that captures free/total VRAM on failure, and adds an info-level log after every layer that loads successfully. Uses `cudarc::driver::result::mem_get_info` — the same API mistralrs uses.

Diagnostic only: the forward path is unchanged. Helps distinguish true VRAM exhaustion from allocator fragmentation when loading large models at BF16 on 2x consumer GPUs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -782,8 +782,9 @@ impl TpQwen3_5Model {
|
|||||||
|
|
||||||
let vb_l = text_vb.pp("layers");
|
let vb_l = text_vb.pp("layers");
|
||||||
let mut layers = Vec::with_capacity(cfg.num_hidden_layers);
|
let mut layers = Vec::with_capacity(cfg.num_hidden_layers);
|
||||||
|
log_vram(&device, rank, "before layer 0");
|
||||||
for i in 0..cfg.num_hidden_layers {
|
for i in 0..cfg.num_hidden_layers {
|
||||||
layers.push(TpQwen3_5DecoderLayer::load(
|
let layer = TpQwen3_5DecoderLayer::load(
|
||||||
cfg,
|
cfg,
|
||||||
rotary.clone(),
|
rotary.clone(),
|
||||||
i,
|
i,
|
||||||
@@ -791,7 +792,13 @@ impl TpQwen3_5Model {
|
|||||||
rank,
|
rank,
|
||||||
world_size,
|
world_size,
|
||||||
comm.clone(),
|
comm.clone(),
|
||||||
)?);
|
)
|
||||||
|
.with_context(|| {
|
||||||
|
let (free_mb, total_mb) = cuda_mem_mb(&device);
|
||||||
|
format!("load layer {i} (rank {rank}): free={free_mb}MB / total={total_mb}MB")
|
||||||
|
})?;
|
||||||
|
layers.push(layer);
|
||||||
|
log_vram(&device, rank, &format!("after layer {i}"));
|
||||||
}
|
}
|
||||||
|
|
||||||
let norm = Qwen3_5RmsNorm::load(&text_vb.pp("norm"), cfg.hidden_size, cfg.rms_norm_eps)?;
|
let norm = Qwen3_5RmsNorm::load(&text_vb.pp("norm"), cfg.hidden_size, cfg.rms_norm_eps)?;
|
||||||
@@ -1053,3 +1060,48 @@ fn load_fused_qkv_slice_3d(
|
|||||||
.contiguous()
|
.contiguous()
|
||||||
.with_context(|| format!("materialise fused conv slice for rank {r}"))
|
.with_context(|| format!("materialise fused conv slice for rank {r}"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Ask the CUDA driver how much VRAM is free and how much exists in total
/// on `device`.
///
/// The device's context is bound to the calling thread before the query is
/// issued, so the numbers refer to `device` rather than to whichever context
/// happened to be bound beforehand.
///
/// Returns `(free_mb, total_mb)`, each obtained by integer-dividing the
/// driver's byte counts by `1024 * 1024`. Every failure mode — `device` is
/// not a CUDA device, the context cannot be bound, or the driver query
/// itself errors — collapses to `(0, 0)` so that diagnostics built on top of
/// this helper can never abort the load path.
#[cfg(feature = "cuda")]
fn cuda_mem_mb(device: &Device) -> (usize, usize) {
    use candle_core::cuda::cudarc::driver::result;
    use candle_core::cuda_backend::WrapErr;

    const BYTES_PER_MB: usize = 1024 * 1024;

    let Device::Cuda(cuda_dev) = device else {
        return (0, 0);
    };

    // The driver query reads whichever context is bound to this thread, so
    // bind this device's context first; give up quietly if that fails.
    if cuda_dev.cuda_stream().context().bind_to_thread().w().is_err() {
        return (0, 0);
    }

    result::mem_get_info()
        .map(|(free_bytes, total_bytes)| (free_bytes / BYTES_PER_MB, total_bytes / BYTES_PER_MB))
        .unwrap_or((0, 0))
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "cuda"))]
|
||||||
|
#[allow(dead_code)]
|
||||||
|
fn cuda_mem_mb(_device: &Device) -> (usize, usize) {
|
||||||
|
(0, 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Emit an info-level `tracing` event (target `neuron::tp::load`) carrying
/// `rank`, the free VRAM and the total VRAM of `device` in MB, with `tag`
/// as the event message.
///
/// Nothing is logged when the VRAM query reports a total of zero, which is
/// what `cuda_mem_mb` returns for a failed query or a non-CUDA device.
#[cfg(feature = "cuda")]
fn log_vram(device: &Device, rank: u32, tag: &str) {
    let (free_mb, total_mb) = cuda_mem_mb(device);

    // A zero total means there is nothing meaningful to report.
    if total_mb == 0 {
        return;
    }

    tracing::info!(target: "neuron::tp::load", rank, free_mb, total_mb, "{tag}");
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "cuda"))]
|
||||||
|
#[allow(dead_code)]
|
||||||
|
fn log_vram(_device: &Device, _rank: u32, _tag: &str) {}
|
||||||
|
|||||||
Reference in New Issue
Block a user