fix(neuron,tp): log leader forward errors with full context
Worker rank failures were already surfaced at WARN, but the leader's own forward `Result::Err` was silently coerced to a `leader_ok=false` bool. When the leader and a worker both fail together — the typical shape of a CUDA OOM cascading into an illegal-address error — the journal showed only the worker side, and an operator had to guess what hit rank 0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -656,10 +656,32 @@ impl WorkerPool {
|
|||||||
.await
|
.await
|
||||||
.context("leader forward task panicked");
|
.context("leader forward task panicked");
|
||||||
let leader_ok = matches!(leader_result, Ok(Ok(_)));
|
let leader_ok = matches!(leader_result, Ok(Ok(_)));
|
||||||
|
let leader_ms = leader_start.elapsed().as_millis();
|
||||||
|
// Surface the leader's own error at WARN. Previously this was
|
||||||
|
// silently coerced to `leader_ok=false` while only worker
|
||||||
|
// ranks' errors got logged — when both the leader and a worker
|
||||||
|
// fail together (the typical "CUDA context is now poisoned"
|
||||||
|
// pattern after an OOM), the operator could see only the
|
||||||
|
// worker side and had to guess what hit rank 0.
|
||||||
|
if !leader_ok {
|
||||||
|
let detail = match &leader_result {
|
||||||
|
Ok(Err(e)) => format!("{e:#}"),
|
||||||
|
Err(e) => format!("task: {e:#}"),
|
||||||
|
Ok(Ok(_)) => unreachable!("leader_ok=false implies an error path"),
|
||||||
|
};
|
||||||
|
tracing::warn!(
|
||||||
|
model = %model_id,
|
||||||
|
tokens = tokens_len,
|
||||||
|
offset,
|
||||||
|
leader_ms,
|
||||||
|
error = %detail,
|
||||||
|
"WorkerPool::generate_step: leader forward failed"
|
||||||
|
);
|
||||||
|
}
|
||||||
tracing::debug!(
|
tracing::debug!(
|
||||||
model = %model_id,
|
model = %model_id,
|
||||||
tokens = tokens_len,
|
tokens = tokens_len,
|
||||||
leader_ms = leader_start.elapsed().as_millis(),
|
leader_ms,
|
||||||
leader_ok,
|
leader_ok,
|
||||||
"WorkerPool::generate_step: leader forward returned"
|
"WorkerPool::generate_step: leader forward returned"
|
||||||
);
|
);
|
||||||
|
|||||||
Reference in New Issue
Block a user