fix(neuron,tp): log leader forward errors with full context

Worker rank failures were already surfaced at WARN, but the leader's
own forward Result::Err was silently coerced to a `leader_ok=false`
bool. When the leader and a worker both fail together — the typical
shape of a CUDA OOM cascading into an illegal-address — the journal
showed only the worker side, and an operator had to guess what hit
rank 0.

Log the leader's error at WARN as well, with the model, token count,
offset, and leader_ms attached, so both sides of the failure are
visible in the journal.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-26 12:22:30 +03:00
parent aa88d37509
commit ea0e0f7911

View File

@@ -656,10 +656,32 @@ impl WorkerPool {
.await
.context("leader forward task panicked");
let leader_ok = matches!(leader_result, Ok(Ok(_)));
let leader_ms = leader_start.elapsed().as_millis();
// Surface the leader's own error at WARN. Previously this was
// silently coerced to `leader_ok=false` while only worker
// ranks' errors got logged — when both the leader and a worker
// fail together (the typical "CUDA context is now poisoned"
// pattern after an OOM), the operator could see only the
// worker side and had to guess what hit rank 0.
if !leader_ok {
let detail = match &leader_result {
Ok(Err(e)) => format!("{e:#}"),
Err(e) => format!("task: {e:#}"),
Ok(Ok(_)) => unreachable!("leader_ok=false implies an error path"),
};
tracing::warn!(
model = %model_id,
tokens = tokens_len,
offset,
leader_ms,
error = %detail,
"WorkerPool::generate_step: leader forward failed"
);
}
tracing::debug!(
model = %model_id,
tokens = tokens_len,
leader_ms = leader_start.elapsed().as_millis(),
leader_ms,
leader_ok,
"WorkerPool::generate_step: leader forward returned"
);