fix(neuron): don't poison the model on tokio JoinError panics

CUDA driver failures propagate as Err through `?` and become `Ok(Err(InferenceError::Other(_)))` from the spawned task — those are real device faults and still poison the model.

Tokio JoinError is different: it fires on a Rust-level panic (tokenizer bug, sampler bug, serialisation, the UTF-8 slice that landed in commit bd04d7f before the fix) or on task cancellation. Those don't touch the device context, so failing the one request without tearing down the model is correct.

Two sites changed:

- chat_completion's CPU spawn_blocking handler — JoinError no longer sets loaded.poisoned.
- chat_completion_tp's tokio::spawn wrapper — JoinError no longer sets tp_for_marker.poisoned. The inner-Err case still does.

Each path logs the cause (panicked / was cancelled / ended abnormally) explicitly so the journal makes the new behaviour obvious — search for "model NOT marked poisoned" to find these events.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 18:02:52 +03:00
parent bd04d7f580
commit f05882369d
1 changed files with 62 additions and 29 deletions
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -1408,21 +1408,34 @@ impl CandleHarness {
                    })
                    .await;
-                // Any failure inside the spawn_blocking touched CUDA via
+                // Distinguish "inference returned Err" (almost always a
-                // candle's forward / cache code, so we treat it as a
+                // candle/CUDA failure that propagated through `?`, e.g.
-                // device-poisoning event. The terminal log at the bottom
+                // an OOM or driver error — the context is unreliable,
-                // of the wrapper reports the error; this flag stops the
+                // poison the model) from "spawn_blocking task panicked
-                // NEXT request from going down the same path.
+                // or was cancelled" (a Rust-level panic in the closure,
                // not a device fault; failing the one request without
                // tearing down the model for everyone else is correct).
                match inference_result {
                    Ok(Ok(v)) => v,
                    Ok(Err(e)) => {
                        loaded.poisoned.store(true, Ordering::Release);
                        return Err(InferenceError::Other(e));
                    }
-                    Err(e) => {
+                    Err(join_err) => {
-                        loaded.poisoned.store(true, Ordering::Release);
+                        let cause = if join_err.is_panic() {
                            "panicked"
                        } else if join_err.is_cancelled() {
                            "was cancelled"
                        } else {
                            "ended abnormally"
                        };
                        tracing::error!(
                            cause,
                            error = %join_err,
                            "chat_completion: inference task {cause}; model NOT marked poisoned"
                        );
                        return Err(InferenceError::Other(anyhow::anyhow!(
-                            "inference task panicked: {e}"
+                            "inference task {cause}: {join_err}"
                        )));
                    }
                }
@@ -2054,28 +2067,48 @@ impl CandleHarness {
        let tp_for_marker = Arc::clone(&tp);
        let handle = tokio::spawn(chat_completion_tp_inner(tp, request).instrument(span.clone()));
-        let result = match handle.await {
+        match handle.await {
-            Ok(r) => r,
+            Ok(Ok(resp)) => Ok(resp),
-            Err(join_err) => Err(InferenceError::Other(anyhow::anyhow!(
+            Ok(Err(e)) => {
-                "TP inference task panicked or was cancelled: {join_err}"
+                // The inner task returned Err — a real inference
-            ))),
+                // failure that propagated through `?`. CUDA / NCCL
-        };
+                // driver errors leave the device context unrecoverable,
-        if let Err(ref e) = result {
+                // so poison the model. This is the gate that turned
-            // Mark poisoned: a failure inside the spawned task either
+                // the 2026-05-26 silent-hang into a clean 5xx.
-            // hit a CUDA/NCCL driver error directly or surfaced as a
+                tp_for_marker.poisoned.store(true, Ordering::Release);
-            // task panic. Both cases leave the worker subprocesses in
+                let _g = span.enter();
-            // an unknown state — refuse subsequent requests until an
+                tracing::error!(
-            // operator unload+reloads. This is the gate that turned
+                    error = %format!("{e:#}"),
-            // the 2026-05-26 silent-hang into a clean 5xx.
+                    total_ms = req_start.elapsed().as_millis(),
-            tp_for_marker.poisoned.store(true, Ordering::Release);
+                    "TP chat_completion: failed, model marked poisoned"
-            let _g = span.enter();
+                );
-            tracing::error!(
+                Err(e)
-                error = %format!("{e:#}"),
+            }
-                total_ms = req_start.elapsed().as_millis(),
+            Err(join_err) => {
-                "TP chat_completion: failed, model marked poisoned"
+                // JoinError: the spawned task panicked or was cancelled.
-            );
+                // Tokenizer / sampling / serialisation panics don't touch
                // the device, so don't poison the model — failing this
                // one request is enough. (CUDA failures arrive as Err
                // through `?`, not as panics, and are handled above.)
                let cause = if join_err.is_panic() {
                    "panicked"
                } else if join_err.is_cancelled() {
                    "was cancelled"
                } else {
                    "ended abnormally"
                };
                let _g = span.enter();
                tracing::error!(
                    cause,
                    error = %join_err,
                    total_ms = req_start.elapsed().as_millis(),
                    "TP chat_completion: inference task {cause}; model NOT marked poisoned"
                );
                Err(InferenceError::Other(anyhow::anyhow!(
                    "TP inference task {cause}: {join_err}"
                )))
            }
        }
        result
    }
    /// Streaming counterpart to `chat_completion_tp`. Same per-step