fix(neuron,shutdown): time-bound unloads, fast-exit past tokio drain

Two failure modes from the 2026-05-26 beast incident:

1. `unload_all_models` looped through models calling `unload_model`, logging individual failures at warn. The net effect was a single warn line for the failed unload followed by "shutdown complete" — no signal that the model was actually still loaded. Now each unload is bounded by a 20s timeout, failures escalate to error, and a summary "leaving N model(s) loaded" line fires when anything is stuck so the operator knows the OS will reclaim VRAM after exit.

2. Returning `Ok(())` from `daemon` (and so out of `#[tokio::main]`) after the unload sweep dropped the tokio runtime, which then waited indefinitely on a CUDA-stuck spawn_blocking thread (the journal's "Stack trace of thread 2951308" — spinning on `cuCtxGetCurrent`). systemd's TimeoutStopSec fired 2 minutes later, and the process was killed with SIGABRT and dumped core. Replacing the return with `std::process::exit(0)` skips the runtime drain and hands the OS a clean exit code; stuck threads get reaped with the process.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 12:30:06 +03:00
parent fc6ef0ee0f
commit 67f79c868f
2 changed files with 50 additions and 10 deletions
--- a/crates/neuron/src/main.rs
+++ b/crates/neuron/src/main.rs
@@ -211,6 +211,13 @@ async fn daemon(args: Args) -> Result<()> {
    let registry = state.registry.read().await;
    startup::unload_all_models(&registry).await;
    tracing::info!("shutdown complete");
-
+    // Fast-exit instead of returning. Returning lets `#[tokio::main]`
-    Ok(())
+    // drop the runtime, which in turn waits on the blocking thread
    // pool to drain. After a CUDA driver error (OOM → illegal address)
    // a spawn_blocking thread can be wedged inside `cuCtxGetCurrent`,
    // and tokio's drain has no timeout. systemd then SIGABRTs us and
    // dumps core. Skipping the drain hands the OS a clean exit code;
    // the OS reaps the stuck threads. See the 2026-05-26 incident
    // captured under "Stack trace of thread 2951308" in the journal.
    std::process::exit(0);
 }
--- a/crates/neuron/src/startup.rs
+++ b/crates/neuron/src/startup.rs
@@ -7,9 +7,17 @@
 use crate::harness::HarnessRegistry;
 use cortex_core::harness::ModelSpec;
-use std::time::Instant;
+use std::time::{Duration, Instant};
 use tokio::signal;
 /// Maximum time we wait on a single `unload_model` call during
 /// shutdown. The TP unload path tries `Arc::try_unwrap`, which fails
 /// fast when an inference is in flight, so a healthy unload returns
 /// in milliseconds. The timeout exists to bound a *future* unload
 /// path that might genuinely block on a stuck worker, so a single
 /// wedged model can't burn the whole systemd TimeoutStopSec window.
 ///
 /// The bound is applied per model, and models are unloaded one after
 /// another, so the worst-case total wait grows with the number of
 /// loaded models.
 /// NOTE(review): confirm that (model count x 20s) stays below the
 /// unit's TimeoutStopSec.
 const UNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
 /// Load each spec sequentially against the registry, treating
 /// individual failures as warnings rather than fatal errors.
 ///
@@ -79,19 +87,44 @@ pub async fn unload_all_models(registry: &HarnessRegistry) {
    }
    tracing::info!(count = listed.len(), "unloading models for shutdown");
    let mut stuck = 0;
    for model in listed {
        let start = Instant::now();
-        match registry.unload_model(&model.id).await {
+        match tokio::time::timeout(UNLOAD_TIMEOUT, registry.unload_model(&model.id)).await {
-            Ok(()) => tracing::info!(
+            Ok(Ok(())) => tracing::info!(
                model = %model.id,
                elapsed_ms = start.elapsed().as_millis() as u64,
                "unloaded"
            ),
-            Err(e) => tracing::warn!(
+            // Most common shape today: TP unload bails because an
            // inference is still mid-flight (the spawned task holds
            // an `Arc<TpLoadedModel>` clone). Promoted from warn to
            // error and logged with the model id, the error, and the
            // elapsed time so the operator can correlate with the
            // chat_completion logs above.
            Ok(Err(e)) => {
                stuck += 1;
                tracing::error!(
                    model = %model.id,
                    error = %e,
                    elapsed_ms = start.elapsed().as_millis() as u64,
                    "unload failed during shutdown"
-            ),
+                );
            }
            Err(_) => {
                stuck += 1;
                tracing::error!(
                    model = %model.id,
                    timeout_secs = UNLOAD_TIMEOUT.as_secs(),
                    "unload timed out during shutdown, continuing"
                );
            }
        }
    }
    if stuck > 0 {
        tracing::error!(
            stuck,
            "shutdown leaving {stuck} model(s) loaded; VRAM will be \
             reclaimed by the OS on process exit"
        );
    }
 }