From ea0e0f791121b647000597cd07771876ceefe68a Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Tue, 26 May 2026 12:22:30 +0300 Subject: [PATCH] fix(neuron,tp): log leader forward errors with full context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Worker rank failures were already surfaced at WARN, but the leader's own forward Result::Err was silently coerced to a `leader_ok=false` bool. When the leader and a worker both fail together — the typical shape of a CUDA OOM cascading into an illegal-address — the journal showed only the worker side and an operator had to guess what hit rank 0. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/neuron/src/harness/tp/mod.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/crates/neuron/src/harness/tp/mod.rs b/crates/neuron/src/harness/tp/mod.rs index a637b9a..1f65a23 100644 --- a/crates/neuron/src/harness/tp/mod.rs +++ b/crates/neuron/src/harness/tp/mod.rs @@ -656,10 +656,32 @@ impl WorkerPool { .await .context("leader forward task panicked"); let leader_ok = matches!(leader_result, Ok(Ok(_))); + let leader_ms = leader_start.elapsed().as_millis(); + // Surface the leader's own error at WARN. Previously this was + // silently coerced to `leader_ok=false` while only worker + // ranks' errors got logged — when both the leader and a worker + // fail together (the typical "CUDA context is now poisoned" + // pattern after an OOM), the operator could see only the + // worker side and had to guess what hit rank 0. + if !leader_ok { + let detail = match &leader_result { + Ok(Err(e)) => format!("{e:#}"), + Err(e) => format!("task: {e:#}"), + Ok(Ok(_)) => unreachable!("leader_ok=false implies an error path"), + }; + tracing::warn!( + model = %model_id, + tokens = tokens_len, + offset, + leader_ms, + error = %detail, + "WorkerPool::generate_step: leader forward failed" + ); + } tracing::debug!( model = %model_id, tokens = tokens_len, - leader_ms = leader_start.elapsed().as_millis(), + leader_ms, leader_ok, "WorkerPool::generate_step: leader forward returned" );