diff --git a/crates/neuron/src/main.rs b/crates/neuron/src/main.rs index b78b9f4..6a7c691 100644 --- a/crates/neuron/src/main.rs +++ b/crates/neuron/src/main.rs @@ -78,11 +78,21 @@ async fn main() -> Result<()> { candle, }); - let app = api::neuron_routes().with_state(state); + let app = api::neuron_routes().with_state(Arc::clone(&state)); let addr: std::net::SocketAddr = format!("0.0.0.0:{port}").parse()?; tracing::info!("neuron listening on {addr}"); let listener = tokio::net::TcpListener::bind(addr).await?; - axum::serve(listener, app).await?; + axum::serve(listener, app) + .with_graceful_shutdown(startup::shutdown_signal()) + .await?; + + // Deactivation: serve has returned (graceful shutdown signal + // received and connections drained). Release CUDA contexts / VRAM + // by unloading every model before exiting; systemd's TimeoutStopSec + // bounds how long this phase may take. + let registry = state.registry.read().await; + startup::unload_all_models(&registry).await; + tracing::info!("shutdown complete"); Ok(()) } diff --git a/crates/neuron/src/startup.rs b/crates/neuron/src/startup.rs index d4c5296..3d348ff 100644 --- a/crates/neuron/src/startup.rs +++ b/crates/neuron/src/startup.rs @@ -1,12 +1,14 @@ -//! Activation-time orchestration. +//! Activation- and deactivation-time orchestration. //! -//! Wired from `main.rs` after the harness registry is built and before -//! the HTTP listener binds. Kept in its own module so the logic is +//! Wired from `main.rs` around the HTTP listener — activation runs +//! before bind, deactivation runs after axum returns from its +//! graceful-shutdown future. Kept in its own module so the logic is //! unit-testable without spinning up a full neuron process. use crate::harness::HarnessRegistry; use cortex_core::harness::ModelSpec; use std::time::Instant; +use tokio::signal; /// Load each spec sequentially against the registry, treating /// individual failures as warnings rather than fatal errors.
@@ -36,3 +38,77 @@ pub async fn load_default_models(registry: &HarnessRegistry, specs: &[ModelSpec]
         }
     }
 }
+
+/// Future that resolves on SIGINT (Ctrl-C) or SIGTERM (systemd stop).
+///
+/// Wired into `axum::serve(...).with_graceful_shutdown(shutdown_signal())`
+/// so the HTTP listener stops accepting new connections, lets in-flight
+/// requests drain, and then yields control back to main for cleanup.
+///
+/// If a handler cannot be installed, the failure is logged and that
+/// branch never completes, so a setup error is not mistaken for a
+/// shutdown request.
+pub async fn shutdown_signal() {
+    let ctrl_c = async {
+        if let Err(e) = signal::ctrl_c().await {
+            // `Err` means listening for Ctrl-C failed, not that SIGINT
+            // arrived. Completing here would start shutdown at once, so
+            // park this branch forever instead.
+            tracing::error!(error = %e, "failed to listen for SIGINT");
+            std::future::pending::<()>().await;
+        }
+    };
+    let terminate = async {
+        match signal::unix::signal(signal::unix::SignalKind::terminate()) {
+            Ok(mut sigterm) => {
+                sigterm.recv().await;
+            }
+            Err(e) => {
+                tracing::error!(error = %e, "failed to install SIGTERM handler");
+                std::future::pending::<()>().await;
+            }
+        }
+    };
+    tokio::select! {
+        _ = ctrl_c => tracing::info!("received SIGINT, shutting down"),
+        _ = terminate => tracing::info!("received SIGTERM, shutting down"),
+    }
+}
+
+/// Unload every model currently registered. Called from `main.rs` after
+/// axum's graceful shutdown future resolves, so CUDA contexts and VRAM
+/// are released before the process exits rather than left to the OS to
+/// reclaim. Per-model failures are logged and skipped — keep cleanup
+/// going even when one harness is unhealthy.
+pub async fn unload_all_models(registry: &HarnessRegistry) {
+    let listed = match registry.list_all_models().await {
+        Ok(models) => models,
+        Err(e) => {
+            // Without a listing there is nothing to iterate; give up on
+            // unloading rather than failing the shutdown.
+            tracing::warn!(error = %e, "failed to list models during shutdown");
+            return;
+        }
+    };
+
+    if listed.is_empty() {
+        return;
+    }
+
+    tracing::info!(count = listed.len(), "unloading models for shutdown");
+    for model in listed {
+        let start = Instant::now();
+        match registry.unload_model(&model.id).await {
+            Ok(()) => {
+                // `as_millis` returns u128; saturate instead of truncating.
+                let elapsed_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
+                tracing::info!(model = %model.id, elapsed_ms, "unloaded");
+            }
+            Err(e) => tracing::warn!(
+                model = %model.id,
+                error = %e,
+                "unload failed during shutdown"
+            ),
+        }
+    }
+}
diff --git a/crates/neuron/tests/shutdown.rs b/crates/neuron/tests/shutdown.rs
new file mode 100644
index 0000000..3a399cb
--- /dev/null
+++ b/crates/neuron/tests/shutdown.rs
@@ -0,0 +1,36 @@
+//! Deactivation behaviour: `unload_all_models` is a no-op on an empty
+//! registry and on a registry whose harness has no models loaded. The
+//! per-model unload-failure path is not exercised here.
+
+use cortex_core::harness::HarnessConfig;
+use neuron::config::HarnessSettings;
+use neuron::harness::HarnessRegistry;
+use neuron::startup;
+
+/// Cleanup on an empty registry must complete without panicking.
+#[tokio::test]
+async fn test_unload_all_models_empty_registry_is_noop() {
+    let registry = HarnessRegistry::new();
+    startup::unload_all_models(&registry).await;
+}
+
+/// With a configured harness but nothing loaded, cleanup must leave the
+/// registry listable and still empty.
+#[tokio::test]
+async fn test_unload_all_models_with_no_loaded_models() {
+    let registry = HarnessRegistry::from_configs(
+        &[HarnessConfig {
+            name: "candle".into(),
+        }],
+        "http://localhost:0",
+        &HarnessSettings::default(),
+    );
+
+    startup::unload_all_models(&registry).await;
+
+    let listed = registry
+        .list_all_models()
+        .await
+        .expect("list_all_models should still succeed after shutdown cleanup");
+    assert!(listed.is_empty());
+}
diff --git a/data/neuron.service b/data/neuron.service
index 207b4da..c844da7 100644
--- a/data/neuron.service
+++ b/data/neuron.service
@@ -15,6 +15,11 @@ Group=neuron
 # materialise on first activation.
systemd's default TimeoutStartSec # (90s) is far too short; allow 30 minutes. TimeoutStartSec=1800s +# On stop, neuron drains in-flight requests then unloads every model +# to release CUDA contexts cleanly. Allow generous time for big-model +# unloads; systemd will SIGKILL after this bound. +TimeoutStopSec=120s +KillSignal=SIGTERM [Install] WantedBy=multi-user.target