feat(neuron): construction-complete vram/config dump + logits health + per-step vram

Three additive diagnostics that turn the 2026-05-27 q5k Qwen3.6-27B incident from "guess at KV cache / quant sizes" into "read the journal": 1. Construction-complete summary in TpQwen3_5ForCausalLM::load and TpQwen3ForCausalLM::load. After the last "after layer N" log fires, each rank emits a single info line with: free_mb/total_mb (the number that drops by ~9 GB between per-layer and first-request on beast, with no inference traffic), every resolved config knob (vocab_size, hidden_size, num_layers, head_dim, num_kv_heads, max_position_embeddings), and a per-token KV-cache byte estimate. For Qwen3-Next also includes the linear/full-attention layer split so the hybrid architecture's cache cost is unambiguous. 2. Logits health snapshot on sample failure. Today the failure logs "A weight is negative, too large or not a valid number" with no context — was it a NaN cascade, an Inf, a negative weight? `logits_health(&logits)` computes nan/pos_inf/neg_inf/neg counts plus finite_min/max/mean on the failure path (zero cost on the success path) and emits a warn line just before the wrapper's terminal "failed, model marked poisoned" log. Wired into both the prefill and decode sample sites of the non-streaming AND streaming TP chat paths. 3. VRAM snapshot at prefill complete + every decode step. The "prefill complete" info line now carries vram_free_mb so the activations + KV growth from the prefill itself is visible. The per-step trace line gets vram_free_mb too, so an operator running with RUST_LOG=trace can watch headroom shrink token by token. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 09:04:55 +03:00
parent 24e20dcb5c
commit 7c19da9361
3 changed files with 304 additions and 8 deletions
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -386,6 +386,95 @@ fn resolve_hf_cache(explicit: Option<PathBuf>) -> Option<PathBuf> {
    None
 }

+/// Summary statistics over a logits tensor, captured for the warn
+/// log emitted when sampling rejects the distribution. Holds NaN,
+/// +Inf, -Inf and negative-finite counts plus finite min/max/mean —
+/// enough to distinguish a NaN cascade (all-NaN, typical of softmax
+/// overflow propagating) from an Inf at a single position (numerical
+/// edge case) from negative weights (different bug entirely).
+///
+/// Meant to be computed only on the failure path, so the to_vec1
+/// host copy is paid at most once per poisoned model.
+#[derive(Debug)]
+#[allow(dead_code)] // presumably because the fields are only read via the derived Debug output (`?health`) — confirm
+struct LogitsHealth {
+    len: usize, // element count after flattening; 0 when the host copy failed
+    nan: usize, // elements for which `is_nan()` holds
+    pos_inf: usize, // elements equal to +Inf
+    neg_inf: usize, // elements equal to -Inf
+    neg: usize, // finite elements < 0.0; NOTE(review): raw logits are routinely negative — confirm this is a useful signal
+    finite_min: Option<f32>, // smallest finite element; None when there is none
+    finite_max: Option<f32>, // largest finite element; None when there is none
+    finite_mean: Option<f32>, // mean of the finite elements; None when there is none
+}
+
+#[allow(dead_code)]
+fn logits_health(t: &Tensor) -> LogitsHealth { // summarise `t` for failure logs; never returns an error
+    let values: Vec<f32> = match t // pull every element to the host as f32
+        .to_dtype(candle_core::DType::F32)
+        .and_then(|t| t.flatten_all()) // collapse to one dimension so to_vec1 applies to any input shape
+        .and_then(|t| t.to_vec1::<f32>())
+    {
+        Ok(v) => v,
+        Err(_) => { // the conversion error is discarded; an empty summary (len 0, all None) stands in
+            return LogitsHealth {
+                len: 0,
+                nan: 0,
+                pos_inf: 0,
+                neg_inf: 0,
+                neg: 0,
+                finite_min: None,
+                finite_max: None,
+                finite_mean: None,
+            };
+        }
+    };
+    let mut nan = 0usize;
+    let mut pos_inf = 0usize;
+    let mut neg_inf = 0usize;
+    let mut neg = 0usize;
+    let mut finite_min = f32::INFINITY; // sentinel start value; only reported when finite_count > 0
+    let mut finite_max = f32::NEG_INFINITY; // sentinel start value; only reported when finite_count > 0
+    let mut finite_sum = 0.0_f64; // summed in f64, narrowed to f32 once at the end
+    let mut finite_count = 0usize;
+    for &v in &values { // each element lands in exactly one of: NaN, +Inf, -Inf, finite
+        if v.is_nan() {
+            nan += 1;
+        } else if v == f32::INFINITY {
+            pos_inf += 1;
+        } else if v == f32::NEG_INFINITY {
+            neg_inf += 1;
+        } else { // finite value
+            if v < 0.0 { // -0.0 does not satisfy this comparison, so it is not counted
+                neg += 1;
+            }
+            if v < finite_min {
+                finite_min = v;
+            }
+            if v > finite_max {
+                finite_max = v;
+            }
+            finite_sum += v as f64;
+            finite_count += 1;
+        }
+    }
+    let finite_mean = if finite_count > 0 { // avoid 0/0 when nothing was finite
+        Some((finite_sum / finite_count as f64) as f32)
+    } else {
+        None
+    };
+    LogitsHealth {
+        len: values.len(),
+        nan,
+        pos_inf,
+        neg_inf,
+        neg,
+        finite_min: (finite_count > 0).then_some(finite_min), // hide the sentinel when nothing was finite
+        finite_max: (finite_count > 0).then_some(finite_max), // hide the sentinel when nothing was finite
+        finite_mean,
+    }
+}
+
 /// Build the InferenceError reported to a client when their request
 /// hits a model that's been marked poisoned by an earlier driver
 /// failure. The message names the model and the recovery procedure so
@@ -1624,10 +1713,24 @@ impl CandleHarness {
                            break 'work;
                        }
                    };
+                    let (post_prefill_vram_free_mb, _) =
+                        device_vram_mb(&tp_for_task.leader_device);
+                    tracing::info!(
+                        model = %model_id,
+                        prompt_len,
+                        vram_free_mb = post_prefill_vram_free_mb,
+                        "TP chat_completion (stream): prefill complete"
+                    );
                    let mut next_token =
                        match sample_with_penalty(&logits, &all_tokens, &mut logits_processor) {
                            Ok(t) => t,
                            Err(e) => {
+                                let health = logits_health(&logits);
+                                tracing::warn!(
+                                    model = %model_id,
+                                    ?health,
+                                    "TP chat_completion (stream): prefill sample failed; logits unhealthy"
+                                );
                                failure = Some(format!("prefill sample: {e:#}"));
                                break 'work;
                            }
@@ -1676,10 +1779,24 @@ impl CandleHarness {
                            ) {
                                Ok(t) => t,
                                Err(e) => {
+                                    let health = logits_health(&logits);
+                                    tracing::warn!(
+                                        model = %model_id,
+                                        step = index,
+                                        ?health,
+                                        "TP chat_completion (stream): decode sample failed; logits unhealthy"
+                                    );
                                    failure = Some(format!("decode sample {index}: {e:#}"));
                                    break 'work;
                                }
                            };
+                            tracing::trace!(
+                                model = %model_id,
+                                step = index,
+                                next_token,
+                                vram_free_mb = device_vram_mb(&tp_for_task.leader_device).0,
+                                "TP chat_completion (stream): decode step"
+                            );
                            if Some(next_token) == eos_id {
                                finish_reason = "stop".into();
                                break;
@@ -1845,14 +1962,31 @@ async fn chat_completion_tp_inner(
        .generate_step(&model_id, leader_arc.clone(), prompt_tokens.clone(), 0)
        .await
        .map_err(InferenceError::Other)?;
+    let (post_prefill_vram_free_mb, _) = device_vram_mb(&tp.leader_device);
    tracing::info!(
        model = %model_id,
        prompt_len,
        elapsed_ms = prefill_start.elapsed().as_millis(),
+        vram_free_mb = post_prefill_vram_free_mb,
        "TP chat_completion: prefill complete"
    );
-    let mut next_token = sample_with_penalty(&logits, &generated, &mut logits_processor)
-        .map_err(InferenceError::Other)?;
+    let mut next_token = match sample_with_penalty(&logits, &generated, &mut logits_processor) {
+        Ok(t) => t,
+        Err(e) => {
+            // Logits health snapshot — the surrounding wrapper logs
+            // "failed, model marked poisoned" with the error chain;
+            // this WARN sits just above that and carries the actual
+            // numerical state so an operator can tell at a glance
+            // whether it was a NaN cascade, an Inf, or something else.
+            let health = logits_health(&logits);
+            tracing::warn!(
+                model = %model_id,
+                ?health,
+                "TP chat_completion: prefill sample failed; logits unhealthy"
+            );
+            return Err(InferenceError::Other(e));
+        }
+    };

    if Some(next_token) == eos_id {
        finish_reason = "stop".into();
@@ -1870,13 +2004,26 @@ async fn chat_completion_tp_inner(
                )
                .await
                .map_err(InferenceError::Other)?;
-            next_token = sample_with_penalty(&logits, &generated, &mut logits_processor)
-                .map_err(InferenceError::Other)?;
+            next_token = match sample_with_penalty(&logits, &generated, &mut logits_processor) {
+                Ok(t) => t,
+                Err(e) => {
+                    let health = logits_health(&logits);
+                    tracing::warn!(
+                        model = %model_id,
+                        step = index,
+                        ?health,
+                        "TP chat_completion: decode sample failed; logits unhealthy"
+                    );
+                    return Err(InferenceError::Other(e));
+                }
+            };
+            let step_vram_free_mb = device_vram_mb(&tp.leader_device).0;
            tracing::trace!(
                model = %model_id,
                step = index,
                next_token,
                step_ms = step_start.elapsed().as_millis(),
+                vram_free_mb = step_vram_free_mb,
                "TP chat_completion: decode step"
            );
            if Some(next_token) == eos_id {
--- a/crates/neuron/src/harness/tp/tp_qwen3.rs
+++ b/crates/neuron/src/harness/tp/tp_qwen3.rs
@@ -562,14 +562,18 @@ impl TpQwen3ForCausalLM {
    ) -> Result<Self> {
        let base = TpQwen3Model::load(cfg, vb, rank, world_size, comm)?;
        let lm_head = build_lm_head(cfg, vb, &base)?;
-        Ok(Self { base, lm_head })
+        let model = Self { base, lm_head };
+        log_construction_complete(cfg, rank, world_size, model.device());
+        Ok(model)
    }

    #[cfg(not(feature = "cuda"))]
    pub fn load(cfg: &Config, vb: &ShardedVarBuilder, rank: u32, world_size: u32) -> Result<Self> {
        let base = TpQwen3Model::load(cfg, vb, rank, world_size)?;
        let lm_head = build_lm_head(cfg, vb, &base)?;
-        Ok(Self { base, lm_head })
+        let model = Self { base, lm_head };
+        log_construction_complete(cfg, rank, world_size, model.device());
+        Ok(model)
    }

    pub fn forward(&mut self, input: &Tensor, offset: usize) -> candle_core::Result<Tensor> {
@@ -603,3 +607,72 @@ fn build_lm_head(cfg: &Config, vb: &ShardedVarBuilder, base: &TpQwen3Model) -> R
        Ok(Linear::new(weight, None))
    }
 }
+
+/// Config dump + VRAM snapshot emitted at the end of
+/// `TpQwen3ForCausalLM::load`. Same intent as the Qwen3-Next variant
+/// in tp_qwen3_5.rs — one info line with the resolved hyperparameters,
+/// per-rank free/total VRAM and a per-token KV-cache byte estimate, so
+/// an operator chasing an OOM or numerical issue can skip the per-layer logs.
+#[cfg(feature = "cuda")]
+fn log_construction_complete(cfg: &Config, rank: u32, world_size: u32, device: &Device) {
+    use candle_core::cuda::cudarc::driver::result;
+    use candle_core::cuda_backend::WrapErr;
+    let (free_mb, total_mb) = if let Device::Cuda(dev) = device { // every failure path below reports (0, 0)
+        if dev.cuda_stream().context().bind_to_thread().w().is_ok() { // bind the device's context to this thread before querying
+            match result::mem_get_info() {
+                Ok((free, total)) => (free / (1024 * 1024), total / (1024 * 1024)), // divide by 1024² — presumably bytes → MiB; confirm against cudarc
+                Err(_) => (0, 0),
+            }
+        } else {
+            (0, 0)
+        }
+    } else {
+        (0, 0)
+    };
+    // Per-rank KV cache cost at one token: K + V, 2 bytes each (bf16
+    // assumed — confirm). Vanilla Qwen3 is dense attention end-to-end,
+    // so every layer contributes. Knowing per-token bytes lets the
+    // operator estimate headroom for a given prompt length.
+    let per_rank_num_kv_heads = (cfg.num_key_value_heads / world_size as usize).max(1); // integer division truncates; floored at 1 head. NOTE(review): world_size == 0 would panic
+    let kv_bytes_per_token_per_layer = per_rank_num_kv_heads * cfg.head_dim * 2 * 2; // × 2 for K and V, × 2 bytes per element
+    let kv_bytes_per_token = kv_bytes_per_token_per_layer * cfg.num_hidden_layers;
+    tracing::info!(
+        target: "neuron::tp::load",
+        rank,
+        world_size,
+        free_mb,
+        total_mb,
+        vocab_size = cfg.vocab_size,
+        hidden_size = cfg.hidden_size,
+        num_hidden_layers = cfg.num_hidden_layers,
+        num_attention_heads = cfg.num_attention_heads,
+        num_key_value_heads = cfg.num_key_value_heads,
+        head_dim = cfg.head_dim,
+        max_position_embeddings = cfg.max_position_embeddings,
+        per_rank_num_kv_heads,
+        kv_bytes_per_token,
+        "Qwen3 model construction complete"
+    );
+}
+
+#[cfg(not(feature = "cuda"))] // non-CUDA build: same config dump, without the free_mb/total_mb fields
+fn log_construction_complete(cfg: &Config, rank: u32, world_size: u32, _device: &Device) {
+    let per_rank_num_kv_heads = (cfg.num_key_value_heads / world_size as usize).max(1); // integer division truncates; floored at 1 head. NOTE(review): world_size == 0 would panic
+    let kv_bytes_per_token_per_layer = per_rank_num_kv_heads * cfg.head_dim * 2 * 2; // × 2 for K and V, × 2 bytes per element (bf16 assumed — confirm)
+    let kv_bytes_per_token = kv_bytes_per_token_per_layer * cfg.num_hidden_layers; // every layer is counted
+    tracing::info!(
+        target: "neuron::tp::load",
+        rank,
+        world_size,
+        vocab_size = cfg.vocab_size,
+        hidden_size = cfg.hidden_size,
+        num_hidden_layers = cfg.num_hidden_layers,
+        num_attention_heads = cfg.num_attention_heads,
+        num_key_value_heads = cfg.num_key_value_heads,
+        head_dim = cfg.head_dim,
+        max_position_embeddings = cfg.max_position_embeddings,
+        per_rank_num_kv_heads,
+        kv_bytes_per_token,
+        "Qwen3 model construction complete"
+    );
+}
--- a/crates/neuron/src/harness/tp/tp_qwen3_5.rs
+++ b/crates/neuron/src/harness/tp/tp_qwen3_5.rs
@@ -1012,7 +1012,9 @@ impl TpQwen3_5ForCausalLM {
        let cfg = &config.text_config;
        let base = TpQwen3_5Model::load(cfg, vb, mmap, rank, world_size, comm, quant)?;
        let lm_head = build_lm_head(cfg, vb, &base, quant)?;
-        Ok(Self { base, lm_head })
+        let model = Self { base, lm_head };
+        log_construction_complete(cfg, rank, world_size, quant, model.device());
+        Ok(model)
    }

    #[cfg(not(feature = "cuda"))]
@@ -1027,7 +1029,9 @@ impl TpQwen3_5ForCausalLM {
        let cfg = &config.text_config;
        let base = TpQwen3_5Model::load(cfg, vb, mmap, rank, world_size, quant)?;
        let lm_head = build_lm_head(cfg, vb, &base, quant)?;
-        Ok(Self { base, lm_head })
+        let model = Self { base, lm_head };
+        log_construction_complete(cfg, rank, world_size, quant, model.device());
+        Ok(model)
    }

    pub fn forward(&mut self, input: &Tensor, offset: usize) -> candle_core::Result<Tensor> {
@@ -1129,3 +1133,75 @@ fn log_vram(device: &Device, rank: u32, tag: &str) {
 #[cfg(not(feature = "cuda"))]
 #[allow(dead_code)]
 fn log_vram(_device: &Device, _rank: u32, _tag: &str) {}
+
+/// Summary line emitted at end of `TpQwen3_5ForCausalLM::load`, after
+/// the per-layer load loop AND after the lm_head + any post-construct
+/// allocations. Logs the resolved config knobs (the ones an operator
+/// would want to know when chasing a numerical or OOM issue) plus a
+/// final free/total VRAM snapshot per rank.
+///
+/// The free_mb here is the most diagnostic number we have at this
+/// stage: the gap between the last "after layer N" log and this line
+/// is everything else the model construction allocated — lm_head,
+/// embedding (if not tied), per-layer buffers held by candle's
+/// allocator, the RotaryEmbedding tables, and any working space.
+///
+/// `kv_bytes_per_token` is a back-of-envelope estimate summed over
+/// this rank's full-attention layers — the actual cache grows as
+/// inference proceeds, but knowing the per-token cost at this point
+/// lets an operator estimate "for a 14k-token prompt I need ~X GB
+/// extra VRAM" without digging into the attention modules.
+fn log_construction_complete(
+    cfg: &TextConfig,
+    rank: u32,
+    world_size: u32,
+    quant: Option<GgmlDType>,
+    device: &Device,
+) {
+    let (free_mb, total_mb) = cuda_mem_mb(device); // helper not shown in this hunk
+    // Distribution of attention kinds across layers. Qwen3-Next is
+    // hybrid: most layers are linear (Gated DeltaNet), a few are full
+    // softmax attention. Knowing the split at a glance helps when
+    // reasoning about KV cache size — only full-attention layers
+    // contribute to the standard kv cache.
+    let mut full_attn_layers = 0;
+    let mut linear_attn_layers = 0;
+    for kind in &cfg.layer_types {
+        match kind.as_str() {
+            "full_attention" => full_attn_layers += 1,
+            "linear_attention" => linear_attn_layers += 1,
+            _ => {} // any other kind is counted in neither bucket
+        }
+    }
+    // Per-token KV cache byte estimate for this rank's full-attention
+    // layers. bf16 = 2 bytes (assumed cache dtype — confirm), K + V
+    // doubles it, and heads are sharded across world_size. Linear-
+    // attention layers carry a fixed-size state instead of a growing cache.
+    let per_rank_num_kv_heads = (cfg.num_key_value_heads / world_size as usize).max(1); // integer division truncates; floored at 1 head. NOTE(review): world_size == 0 would panic
+    let kv_bytes_per_token_per_layer = per_rank_num_kv_heads * cfg.head_dim * 2 /* K+V */ * 2 /* bf16 */;
+    let kv_bytes_per_token = kv_bytes_per_token_per_layer * full_attn_layers; // linear-attention layers are excluded
+    tracing::info!(
+        target: "neuron::tp::load",
+        rank,
+        world_size,
+        quant = ?quant,
+        free_mb,
+        total_mb,
+        vocab_size = cfg.vocab_size,
+        hidden_size = cfg.hidden_size,
+        num_hidden_layers = cfg.num_hidden_layers,
+        num_attention_heads = cfg.num_attention_heads,
+        num_key_value_heads = cfg.num_key_value_heads,
+        head_dim = cfg.head_dim,
+        max_position_embeddings = cfg.max_position_embeddings,
+        full_attn_layers,
+        linear_attn_layers,
+        linear_num_value_heads = cfg.linear_num_value_heads,
+        linear_num_key_heads = cfg.linear_num_key_heads,
+        linear_key_head_dim = cfg.linear_key_head_dim,
+        linear_value_head_dim = cfg.linear_value_head_dim,
+        per_rank_num_kv_heads,
+        kv_bytes_per_token,
+        "Qwen3-Next model construction complete"
+    );
+}