feat(neuron): graceful unload-on-shutdown via SIGTERM/SIGINT
Stage 6 of the candle-native pivot. Adds first-class deactivation: neuron now drains in-flight requests on SIGTERM (systemd stop) or SIGINT (Ctrl-C), then unloads every loaded model before the process exits — releasing CUDA contexts and VRAM cleanly rather than leaving the OS to reclaim them. Mechanism: - startup::shutdown_signal() resolves on either ctrl_c() or a SIGTERM listener. - axum::serve(...).with_graceful_shutdown(shutdown_signal()) stops accepting new connections, lets active requests finish, then returns control to main. - startup::unload_all_models(&registry) iterates list_all_models() and calls unload per entry. Per-model failures are logged warnings; cleanup continues. Empty registry is a fast no-op. - main holds an Arc<NeuronState> reference past axum's lifetime so the registry is still reachable for the unload sweep. data/neuron.service: - TimeoutStopSec=120s — generous bound for big-model unloads before systemd escalates to SIGKILL. - KillSignal=SIGTERM — explicit, matches the handler. Two non-gated tests cover the empty-registry no-op and the no-models-loaded path. Real load-then-unload-on-shutdown is exercised by the cuda-integration test from Stage 2 (which calls unload_model directly) and is observable on a real GPU host by stopping the service and watching nvidia-smi. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -78,11 +78,21 @@ async fn main() -> Result<()> {
|
|||||||
candle,
|
candle,
|
||||||
});
|
});
|
||||||
|
|
||||||
let app = api::neuron_routes().with_state(state);
|
let app = api::neuron_routes().with_state(Arc::clone(&state));
|
||||||
let addr: std::net::SocketAddr = format!("0.0.0.0:{port}").parse()?;
|
let addr: std::net::SocketAddr = format!("0.0.0.0:{port}").parse()?;
|
||||||
tracing::info!("neuron listening on {addr}");
|
tracing::info!("neuron listening on {addr}");
|
||||||
let listener = tokio::net::TcpListener::bind(addr).await?;
|
let listener = tokio::net::TcpListener::bind(addr).await?;
|
||||||
axum::serve(listener, app).await?;
|
axum::serve(listener, app)
|
||||||
|
.with_graceful_shutdown(startup::shutdown_signal())
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
// Deactivation: serve has returned (graceful shutdown signal
|
||||||
|
// received and connections drained). Release CUDA contexts / VRAM
|
||||||
|
// by unloading every model before exiting; systemd's TimeoutStopSec
|
||||||
|
// bounds how long this phase may take.
|
||||||
|
let registry = state.registry.read().await;
|
||||||
|
startup::unload_all_models(&registry).await;
|
||||||
|
tracing::info!("shutdown complete");
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,12 +1,14 @@
|
|||||||
//! Activation-time orchestration.
|
//! Activation- and deactivation-time orchestration.
|
||||||
//!
|
//!
|
||||||
//! Wired from `main.rs` after the harness registry is built and before
|
//! Wired from `main.rs` around the HTTP listener — activation runs
|
||||||
//! the HTTP listener binds. Kept in its own module so the logic is
|
//! before bind, deactivation runs after axum returns from its
|
||||||
|
//! graceful-shutdown future. Kept in its own module so the logic is
|
||||||
//! unit-testable without spinning up a full neuron process.
|
//! unit-testable without spinning up a full neuron process.
|
||||||
|
|
||||||
use crate::harness::HarnessRegistry;
|
use crate::harness::HarnessRegistry;
|
||||||
use cortex_core::harness::ModelSpec;
|
use cortex_core::harness::ModelSpec;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
use tokio::signal;
|
||||||
|
|
||||||
/// Load each spec sequentially against the registry, treating
|
/// Load each spec sequentially against the registry, treating
|
||||||
/// individual failures as warnings rather than fatal errors.
|
/// individual failures as warnings rather than fatal errors.
|
||||||
@@ -36,3 +38,60 @@ pub async fn load_default_models(registry: &HarnessRegistry, specs: &[ModelSpec]
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Future that resolves on SIGINT (Ctrl-C) or SIGTERM (systemd stop).
|
||||||
|
///
|
||||||
|
/// Wired into `axum::serve(...).with_graceful_shutdown(shutdown_signal())`
|
||||||
|
/// so the HTTP listener stops accepting new connections, lets in-flight
|
||||||
|
/// requests drain, and then yields control back to main for cleanup.
|
||||||
|
pub async fn shutdown_signal() {
|
||||||
|
let ctrl_c = async {
|
||||||
|
signal::ctrl_c().await.ok();
|
||||||
|
};
|
||||||
|
let terminate = async {
|
||||||
|
signal::unix::signal(signal::unix::SignalKind::terminate())
|
||||||
|
.expect("install SIGTERM handler")
|
||||||
|
.recv()
|
||||||
|
.await;
|
||||||
|
};
|
||||||
|
tokio::select! {
|
||||||
|
_ = ctrl_c => tracing::info!("received SIGINT, shutting down"),
|
||||||
|
_ = terminate => tracing::info!("received SIGTERM, shutting down"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unload every model currently registered. Called from `main.rs` after
|
||||||
|
/// axum's graceful shutdown future resolves, so CUDA contexts and VRAM
|
||||||
|
/// are released before the process exits rather than left to the OS to
|
||||||
|
/// reclaim. Per-model failures are logged and skipped — keep cleanup
|
||||||
|
/// going even when one harness is unhealthy.
|
||||||
|
pub async fn unload_all_models(registry: &HarnessRegistry) {
|
||||||
|
let listed = match registry.list_all_models().await {
|
||||||
|
Ok(m) => m,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(error = %e, "failed to list models during shutdown");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if listed.is_empty() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
tracing::info!(count = listed.len(), "unloading models for shutdown");
|
||||||
|
for model in listed {
|
||||||
|
let start = Instant::now();
|
||||||
|
match registry.unload_model(&model.id).await {
|
||||||
|
Ok(()) => tracing::info!(
|
||||||
|
model = %model.id,
|
||||||
|
elapsed_ms = start.elapsed().as_millis() as u64,
|
||||||
|
"unloaded"
|
||||||
|
),
|
||||||
|
Err(e) => tracing::warn!(
|
||||||
|
model = %model.id,
|
||||||
|
error = %e,
|
||||||
|
"unload failed during shutdown"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
32
crates/neuron/tests/shutdown.rs
Normal file
32
crates/neuron/tests/shutdown.rs
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
//! Deactivation behaviour: unload_all_models tolerates an empty
|
||||||
|
//! registry and a registry whose harnesses have no models loaded.
|
||||||
|
|
||||||
|
use cortex_core::harness::HarnessConfig;
|
||||||
|
use neuron::config::HarnessSettings;
|
||||||
|
use neuron::harness::HarnessRegistry;
|
||||||
|
use neuron::startup;
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_unload_all_models_empty_registry_is_noop() {
|
||||||
|
let registry = HarnessRegistry::new();
|
||||||
|
startup::unload_all_models(®istry).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_unload_all_models_with_no_loaded_models() {
|
||||||
|
let registry = HarnessRegistry::from_configs(
|
||||||
|
&[HarnessConfig {
|
||||||
|
name: "candle".into(),
|
||||||
|
}],
|
||||||
|
"http://localhost:0",
|
||||||
|
&HarnessSettings::default(),
|
||||||
|
);
|
||||||
|
|
||||||
|
startup::unload_all_models(®istry).await;
|
||||||
|
|
||||||
|
let listed = registry
|
||||||
|
.list_all_models()
|
||||||
|
.await
|
||||||
|
.expect("list_all_models should still succeed after shutdown cleanup");
|
||||||
|
assert!(listed.is_empty());
|
||||||
|
}
|
||||||
@@ -15,6 +15,11 @@ Group=neuron
|
|||||||
# materialise on first activation. systemd's default TimeoutStartSec
|
# materialise on first activation. systemd's default TimeoutStartSec
|
||||||
# (90s) is far too short; allow 30 minutes.
|
# (90s) is far too short; allow 30 minutes.
|
||||||
TimeoutStartSec=1800s
|
TimeoutStartSec=1800s
|
||||||
|
# On stop, neuron drains in-flight requests then unloads every model
|
||||||
|
# to release CUDA contexts cleanly. Allow generous time for big-model
|
||||||
|
# unloads; systemd will SIGKILL after this bound.
|
||||||
|
TimeoutStopSec=120s
|
||||||
|
KillSignal=SIGTERM
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|||||||
Reference in New Issue
Block a user