//! Activation- and deactivation-time orchestration. //! //! Wired from `main.rs` around the HTTP listener — activation runs //! before bind, deactivation runs after axum returns from its //! graceful-shutdown future. Kept in its own module so the logic is //! unit-testable without spinning up a full neuron process. use crate::harness::HarnessRegistry; use cortex_core::harness::ModelSpec; use std::time::Instant; use tokio::signal; /// Load each spec sequentially against the registry, treating /// individual failures as warnings rather than fatal errors. /// /// VRAM contention makes parallel loads risky; the sequential path is /// boring but correct. The function logs elapsed time per load so an /// operator can see which model is hogging activation. pub async fn load_default_models(registry: &HarnessRegistry, specs: &[ModelSpec]) { if specs.is_empty() { return; } tracing::info!(count = specs.len(), "loading default models"); for spec in specs { let start = Instant::now(); match registry.load_model(spec).await { Ok(()) => tracing::info!( model = %spec.model_id, elapsed_ms = start.elapsed().as_millis() as u64, "loaded default model" ), Err(e) => tracing::warn!( model = %spec.model_id, error = %e, elapsed_ms = start.elapsed().as_millis() as u64, "failed to load default model, continuing" ), } } } /// Future that resolves on SIGINT (Ctrl-C) or SIGTERM (systemd stop). /// /// Wired into `axum::serve(...).with_graceful_shutdown(shutdown_signal())` /// so the HTTP listener stops accepting new connections, lets in-flight /// requests drain, and then yields control back to main for cleanup. pub async fn shutdown_signal() { let ctrl_c = async { signal::ctrl_c().await.ok(); }; let terminate = async { signal::unix::signal(signal::unix::SignalKind::terminate()) .expect("install SIGTERM handler") .recv() .await; }; tokio::select! { _ = ctrl_c => tracing::info!("received SIGINT, shutting down"), _ = terminate => tracing::info!("received SIGTERM, shutting down"), } } /// Unload every model currently registered. Called from `main.rs` after /// axum's graceful shutdown future resolves, so CUDA contexts and VRAM /// are released before the process exits rather than left to the OS to /// reclaim. Per-model failures are logged and skipped — keep cleanup /// going even when one harness is unhealthy. pub async fn unload_all_models(registry: &HarnessRegistry) { let listed = match registry.list_all_models().await { Ok(m) => m, Err(e) => { tracing::warn!(error = %e, "failed to list models during shutdown"); return; } }; if listed.is_empty() { return; } tracing::info!(count = listed.len(), "unloading models for shutdown"); for model in listed { let start = Instant::now(); match registry.unload_model(&model.id).await { Ok(()) => tracing::info!( model = %model.id, elapsed_ms = start.elapsed().as_millis() as u64, "unloaded" ), Err(e) => tracing::warn!( model = %model.id, error = %e, "unload failed during shutdown" ), } } }