From f05882369d8dc923459eb5d8754c8dc784dedaea Mon Sep 17 00:00:00 2001
From: rob thijssen <grenade@rob.tn>
Date: Wed, 27 May 2026 18:02:52 +0300
Subject: [PATCH] fix(neuron): don't poison the model on tokio JoinError panics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CUDA driver failures propagate as Err through `?` and become
`Ok(Err(InferenceError::Other(_)))` from the spawned task — those are
real device faults and still poison the model. Tokio JoinError is
different: it fires on Rust-level panic (tokenizer bug, sampler bug,
serialisation, the UTF-8 slice that landed in commit bd04d7f before
the fix) or task cancellation. Those don't touch the device context,
so failing the one request without tearing down the model is correct.

Two sites changed:

  - chat_completion's CPU spawn_blocking handler — JoinError no longer
    sets loaded.poisoned.
  - chat_completion_tp's tokio::spawn wrapper — JoinError no longer
    sets tp_for_marker.poisoned. The inner-Err case still does.

Each path logs the cause (panicked / was cancelled / ended abnormally)
explicitly so the journal makes the new behaviour obvious — search for
"model NOT marked poisoned" to find these events.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/neuron/src/harness/candle.rs | 91 ++++++++++++++++++++---------
 1 file changed, 62 insertions(+), 29 deletions(-)

diff --git a/crates/neuron/src/harness/candle.rs b/crates/neuron/src/harness/candle.rs
index c1757e8..12ecb1d 100644
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -1408,21 +1408,34 @@ impl CandleHarness {
                     })
                     .await;
 
-                // Any failure inside the spawn_blocking touched CUDA via
-                // candle's forward / cache code, so we treat it as a
-                // device-poisoning event. The terminal log at the bottom
-                // of the wrapper reports the error; this flag stops the
-                // NEXT request from going down the same path.
+                // Distinguish "inference returned Err" (almost always a
+                // candle/CUDA failure that propagated through `?`, e.g.
+                // an OOM or driver error — the context is unreliable,
+                // poison the model) from "spawn_blocking task panicked
+                // or was cancelled" (a Rust-level panic in the closure,
+                // not a device fault; failing the one request without
+                // tearing down the model for everyone else is correct).
                 match inference_result {
                     Ok(Ok(v)) => v,
                     Ok(Err(e)) => {
                         loaded.poisoned.store(true, Ordering::Release);
                         return Err(InferenceError::Other(e));
                     }
-                    Err(e) => {
-                        loaded.poisoned.store(true, Ordering::Release);
+                    Err(join_err) => {
+                        let cause = if join_err.is_panic() {
+                            "panicked"
+                        } else if join_err.is_cancelled() {
+                            "was cancelled"
+                        } else {
+                            "ended abnormally"
+                        };
+                        tracing::error!(
+                            cause,
+                            error = %join_err,
+                            "chat_completion: inference task {cause}; model NOT marked poisoned"
+                        );
                         return Err(InferenceError::Other(anyhow::anyhow!(
-                            "inference task panicked: {e}"
+                            "inference task {cause}: {join_err}"
                         )));
                     }
                 }
@@ -2054,28 +2067,48 @@ impl CandleHarness {
 
         let tp_for_marker = Arc::clone(&tp);
         let handle = tokio::spawn(chat_completion_tp_inner(tp, request).instrument(span.clone()));
-        let result = match handle.await {
-            Ok(r) => r,
-            Err(join_err) => Err(InferenceError::Other(anyhow::anyhow!(
-                "TP inference task panicked or was cancelled: {join_err}"
-            ))),
-        };
-        if let Err(ref e) = result {
-            // Mark poisoned: a failure inside the spawned task either
-            // hit a CUDA/NCCL driver error directly or surfaced as a
-            // task panic. Both cases leave the worker subprocesses in
-            // an unknown state — refuse subsequent requests until an
-            // operator unload+reloads. This is the gate that turned
-            // the 2026-05-26 silent-hang into a clean 5xx.
-            tp_for_marker.poisoned.store(true, Ordering::Release);
-            let _g = span.enter();
-            tracing::error!(
-                error = %format!("{e:#}"),
-                total_ms = req_start.elapsed().as_millis(),
-                "TP chat_completion: failed, model marked poisoned"
-            );
+        match handle.await {
+            Ok(Ok(resp)) => Ok(resp),
+            Ok(Err(e)) => {
+                // The inner task returned Err — a real inference
+                // failure that propagated through `?`. CUDA / NCCL
+                // driver errors leave the device context unrecoverable,
+                // so poison the model. This is the gate that turned
+                // the 2026-05-26 silent-hang into a clean 5xx.
+                tp_for_marker.poisoned.store(true, Ordering::Release);
+                let _g = span.enter();
+                tracing::error!(
+                    error = %format!("{e:#}"),
+                    total_ms = req_start.elapsed().as_millis(),
+                    "TP chat_completion: failed, model marked poisoned"
+                );
+                Err(e)
+            }
+            Err(join_err) => {
+                // JoinError: the spawned task panicked or was cancelled.
+                // Tokenizer / sampling / serialisation panics don't touch
+                // the device, so don't poison the model — failing this
+                // one request is enough. (CUDA failures arrive as Err
+                // through `?`, not as panics, and are handled above.)
+                let cause = if join_err.is_panic() {
+                    "panicked"
+                } else if join_err.is_cancelled() {
+                    "was cancelled"
+                } else {
+                    "ended abnormally"
+                };
+                let _g = span.enter();
+                tracing::error!(
+                    cause,
+                    error = %join_err,
+                    total_ms = req_start.elapsed().as_millis(),
+                    "TP chat_completion: inference task {cause}; model NOT marked poisoned"
+                );
+                Err(InferenceError::Other(anyhow::anyhow!(
+                    "TP inference task {cause}: {join_err}"
+                )))
+            }
         }
-        result
     }
 
     /// Streaming counterpart to `chat_completion_tp`. Same per-step