diff --git a/crates/neuron/src/harness/candle.rs b/crates/neuron/src/harness/candle.rs index c1757e8..12ecb1d 100644 --- a/crates/neuron/src/harness/candle.rs +++ b/crates/neuron/src/harness/candle.rs @@ -1408,21 +1408,34 @@ impl CandleHarness { }) .await; - // Any failure inside the spawn_blocking touched CUDA via - // candle's forward / cache code, so we treat it as a - // device-poisoning event. The terminal log at the bottom - // of the wrapper reports the error; this flag stops the - // NEXT request from going down the same path. + // Distinguish "inference returned Err" (almost always a + // candle/CUDA failure that propagated through `?`, e.g. + // an OOM or driver error — the context is unreliable, + // poison the model) from "spawn_blocking task panicked + // or was cancelled" (a Rust-level panic in the closure, + // not a device fault; failing the one request without + // tearing down the model for everyone else is correct). match inference_result { Ok(Ok(v)) => v, Ok(Err(e)) => { loaded.poisoned.store(true, Ordering::Release); return Err(InferenceError::Other(e)); } - Err(e) => { - loaded.poisoned.store(true, Ordering::Release); + Err(join_err) => { + let cause = if join_err.is_panic() { + "panicked" + } else if join_err.is_cancelled() { + "was cancelled" + } else { + "ended abnormally" + }; + tracing::error!( + cause, + error = %join_err, + "chat_completion: inference task {cause}; model NOT marked poisoned" + ); return Err(InferenceError::Other(anyhow::anyhow!( - "inference task panicked: {e}" + "inference task {cause}: {join_err}" ))); } } @@ -2054,28 +2067,48 @@ impl CandleHarness { let tp_for_marker = Arc::clone(&tp); let handle = tokio::spawn(chat_completion_tp_inner(tp, request).instrument(span.clone())); - let result = match handle.await { - Ok(r) => r, - Err(join_err) => Err(InferenceError::Other(anyhow::anyhow!( - "TP inference task panicked or was cancelled: {join_err}" - ))), - }; - if let Err(ref e) = result { - // Mark poisoned: a failure inside the spawned task either - // hit a CUDA/NCCL driver error directly or surfaced as a - // task panic. Both cases leave the worker subprocesses in - // an unknown state — refuse subsequent requests until an - // operator unload+reloads. This is the gate that turned - // the 2026-05-26 silent-hang into a clean 5xx. - tp_for_marker.poisoned.store(true, Ordering::Release); - let _g = span.enter(); - tracing::error!( - error = %format!("{e:#}"), - total_ms = req_start.elapsed().as_millis(), - "TP chat_completion: failed, model marked poisoned" - ); + match handle.await { + Ok(Ok(resp)) => Ok(resp), + Ok(Err(e)) => { + // The inner task returned Err — a real inference + // failure that propagated through `?`. CUDA / NCCL + // driver errors leave the device context unrecoverable, + // so poison the model. This is the gate that turned + // the 2026-05-26 silent-hang into a clean 5xx. + tp_for_marker.poisoned.store(true, Ordering::Release); + let _g = span.enter(); + tracing::error!( + error = %format!("{e:#}"), + total_ms = req_start.elapsed().as_millis(), + "TP chat_completion: failed, model marked poisoned" + ); + Err(e) + } + Err(join_err) => { + // JoinError: the spawned task panicked or was cancelled. + // Tokenizer / sampling / serialisation panics don't touch + // the device, so don't poison the model — failing this + // one request is enough. (CUDA failures arrive as Err + // through `?`, not as panics, and are handled above.) + let cause = if join_err.is_panic() { + "panicked" + } else if join_err.is_cancelled() { + "was cancelled" + } else { + "ended abnormally" + }; + let _g = span.enter(); + tracing::error!( + cause, + error = %join_err, + total_ms = req_start.elapsed().as_millis(), + "TP chat_completion: inference task {cause}; model NOT marked poisoned" + ); + Err(InferenceError::Other(anyhow::anyhow!( + "TP inference task {cause}: {join_err}" + ))) + } } - result } /// Streaming counterpart to `chat_completion_tp`. Same per-step