diff --git a/crates/neuron/src/main.rs b/crates/neuron/src/main.rs
index d194b5a..a0612be 100644
--- a/crates/neuron/src/main.rs
+++ b/crates/neuron/src/main.rs
@@ -211,6 +211,13 @@ async fn daemon(args: Args) -> Result<()> {
     let registry = state.registry.read().await;
     startup::unload_all_models(&registry).await;
     tracing::info!("shutdown complete");
-
-    Ok(())
+    // Fast-exit instead of returning. Returning lets `#[tokio::main]`
+    // drop the runtime, which waits for the blocking thread pool to
+    // drain. After a CUDA driver error (OOM → illegal address) a
+    // spawn_blocking thread can be wedged inside `cuCtxGetCurrent`,
+    // and that drain has no timeout, so systemd SIGABRTs us and dumps
+    // core. `exit` skips the drain (and all destructors) and lets the
+    // OS reap the stuck threads. See the 2026-05-26 incident captured
+    // under "Stack trace of thread 2951308" in the journal.
+    std::process::exit(0);
 }
diff --git a/crates/neuron/src/startup.rs b/crates/neuron/src/startup.rs
index 3d348ff..012ae8f 100644
--- a/crates/neuron/src/startup.rs
+++ b/crates/neuron/src/startup.rs
@@ -7,9 +7,17 @@
 
 use crate::harness::HarnessRegistry;
 use cortex_core::harness::ModelSpec;
-use std::time::Instant;
+use std::time::{Duration, Instant};
 use tokio::signal;
 
+/// Maximum time we wait on a single `unload_model` call during
+/// shutdown. The TP unload path tries `Arc::try_unwrap`, which fails
+/// fast when an inference is in flight, so a healthy unload returns
+/// in milliseconds. The timeout is a guard for any later unload path
+/// that genuinely blocks on a stuck worker, so that one wedged model
+/// can't burn the whole systemd TimeoutStopSec window.
+const UNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
+
 /// Load each spec sequentially against the registry, treating
 /// individual failures as warnings rather than fatal errors.
 ///
@@ -79,19 +87,44 @@ pub async fn unload_all_models(registry: &HarnessRegistry) {
     }
 
     tracing::info!(count = listed.len(), "unloading models for shutdown");
+    let mut stuck = 0;
     for model in listed {
         let start = Instant::now();
-        match registry.unload_model(&model.id).await {
-            Ok(()) => tracing::info!(
+        match tokio::time::timeout(UNLOAD_TIMEOUT, registry.unload_model(&model.id)).await {
+            Ok(Ok(())) => tracing::info!(
                 model = %model.id,
                 elapsed_ms = start.elapsed().as_millis() as u64,
                 "unloaded"
             ),
-            Err(e) => tracing::warn!(
-                model = %model.id,
-                error = %e,
-                "unload failed during shutdown"
-            ),
+            // Most common shape today: TP unload bails because an
+            // inference is still mid-flight (the spawned task holds
+            // an `Arc<TpLoadedModel>` clone). Logged at error (was
+            // warn) with the model id and elapsed time so the operator
+            // can correlate with the chat_completion logs above.
+            Ok(Err(e)) => {
+                stuck += 1;
+                tracing::error!(
+                    model = %model.id,
+                    error = %e,
+                    elapsed_ms = start.elapsed().as_millis() as u64,
+                    "unload failed during shutdown"
+                );
+            }
+            Err(_) => {
+                stuck += 1;
+                tracing::error!(
+                    model = %model.id,
+                    timeout_secs = UNLOAD_TIMEOUT.as_secs(),
+                    "unload timed out during shutdown, continuing"
+                );
+            }
         }
     }
+    if stuck > 0 {
+        tracing::error!(
+            stuck,
+            "shutdown leaving {stuck} model(s) loaded; VRAM will be \
+             reclaimed by the OS on process exit"
+        );
+    }
 }