diff --git a/crates/neuron/src/main.rs b/crates/neuron/src/main.rs index d194b5a..a0612be 100644 --- a/crates/neuron/src/main.rs +++ b/crates/neuron/src/main.rs @@ -211,6 +211,13 @@ async fn daemon(args: Args) -> Result<()> { let registry = state.registry.read().await; startup::unload_all_models(&registry).await; tracing::info!("shutdown complete"); - - Ok(()) + // Fast-exit instead of returning. Returning lets `#[tokio::main]` + // drop the runtime, which in turn waits on the blocking thread + // pool to drain. After a CUDA driver error (OOM → illegal address) + // a spawn_blocking thread can be wedged inside `cuCtxGetCurrent`, + // and tokio's drain has no timeout. systemd then SIGABRTs us and + // dumps core. Skipping the drain hands the OS a clean exit code; + // the OS reaps the stuck threads. See the 2026-05-26 incident + // captured under "Stack trace of thread 2951308" in the journal. + std::process::exit(0); } diff --git a/crates/neuron/src/startup.rs b/crates/neuron/src/startup.rs index 3d348ff..012ae8f 100644 --- a/crates/neuron/src/startup.rs +++ b/crates/neuron/src/startup.rs @@ -7,9 +7,17 @@ use crate::harness::HarnessRegistry; use cortex_core::harness::ModelSpec; -use std::time::Instant; +use std::time::{Duration, Instant}; use tokio::signal; +/// Maximum time we wait on a single `unload_model` call during +/// shutdown. The TP unload path tries `Arc::try_unwrap`, which fails +/// fast when an inference is in flight, so a healthy unload returns +/// in milliseconds. The timeout exists to bound a *future* unload +/// path that might genuinely block on a stuck worker, so a single +/// wedged model can't burn the whole systemd TimeoutStopSec window. +const UNLOAD_TIMEOUT: Duration = Duration::from_secs(20); + /// Load each spec sequentially against the registry, treating /// individual failures as warnings rather than fatal errors. 
/// @@ -79,19 +87,44 @@ pub async fn unload_all_models(registry: &HarnessRegistry) { } tracing::info!(count = listed.len(), "unloading models for shutdown"); + let mut stuck = 0; for model in listed { let start = Instant::now(); - match registry.unload_model(&model.id).await { - Ok(()) => tracing::info!( + match tokio::time::timeout(UNLOAD_TIMEOUT, registry.unload_model(&model.id)).await { + Ok(Ok(())) => tracing::info!( model = %model.id, elapsed_ms = start.elapsed().as_millis() as u64, "unloaded" ), - Err(e) => tracing::warn!( - model = %model.id, - error = %e, - "unload failed during shutdown" - ), + // Most common shape today: TP unload bails because an + // inference is still mid-flight (the spawned task holds + // an `Arc` clone). Promoted from warn to + // error and tagged with `elapsed_ms` so the operator + // can correlate with the chat_completion logs above. + Ok(Err(e)) => { + stuck += 1; + tracing::error!( + model = %model.id, + error = %e, + elapsed_ms = start.elapsed().as_millis() as u64, + "unload failed during shutdown" + ); + } + Err(_) => { + stuck += 1; + tracing::error!( + model = %model.id, + timeout_secs = UNLOAD_TIMEOUT.as_secs(), + "unload timed out during shutdown, continuing" + ); + } } } + if stuck > 0 { + tracing::error!( + stuck, + "shutdown leaving {stuck} model(s) loaded; VRAM will be \ + reclaimed by the OS on process exit" + ); + } }