Stage 6 of the candle-native pivot. Adds first-class deactivation: neuron now drains in-flight requests on SIGTERM (systemd stop) or SIGINT (Ctrl-C), then unloads every loaded model before the process exits — releasing CUDA contexts and VRAM cleanly rather than leaving the OS to reclaim them.

Mechanism:
- startup::shutdown_signal() resolves on either ctrl_c() or a SIGTERM listener.
- axum::serve(...).with_graceful_shutdown(shutdown_signal()) stops accepting new connections, lets active requests finish, then returns control to main.
- startup::unload_all_models(&registry) iterates list_all_models() and calls unload per entry. Per-model failures are logged as warnings; cleanup continues. An empty registry is a fast no-op.
- main holds an Arc<NeuronState> reference past axum's lifetime so the registry is still reachable for the unload sweep.

data/neuron.service:
- TimeoutStopSec=120s — generous bound for big-model unloads before systemd escalates to SIGKILL.
- KillSignal=SIGTERM — explicit, matches the handler.

Two non-gated tests cover the empty-registry no-op and the no-models-loaded path. Real load-then-unload-on-shutdown is exercised by the cuda-integration test from Stage 2 (which calls unload_model directly) and is observable on a real GPU host by stopping the service and watching nvidia-smi.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
99 lines
3.3 KiB
Rust
99 lines
3.3 KiB
Rust
use anyhow::Result;
|
|
use clap::Parser;
|
|
use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health, startup};
|
|
use std::sync::Arc;
|
|
use std::time::Instant;
|
|
use tokio::sync::RwLock;
|
|
use tracing_subscriber::EnvFilter;
|
|
|
|
#[derive(Parser)]
|
|
#[command(name = "neuron")]
|
|
#[command(about = "Per-node daemon for cortex inference clusters")]
|
|
#[command(version)]
|
|
struct Args {
|
|
/// Port to listen on (overrides config file).
|
|
#[arg(short, long)]
|
|
port: Option<u16>,
|
|
|
|
/// Path to the neuron config file.
|
|
#[arg(short, long, default_value = "neuron.toml")]
|
|
config: String,
|
|
}
|
|
|
|
#[tokio::main]
|
|
async fn main() -> Result<()> {
|
|
tracing_subscriber::fmt()
|
|
.with_env_filter(
|
|
EnvFilter::try_from_default_env()
|
|
.unwrap_or_else(|_| EnvFilter::new("info,neuron=debug")),
|
|
)
|
|
.init();
|
|
|
|
let args = Args::parse();
|
|
|
|
let cfg = NeuronConfig::load(&args.config).unwrap_or_else(|e| {
|
|
tracing::warn!(path = %args.config, error = %e, "config not found, using defaults");
|
|
NeuronConfig::default()
|
|
});
|
|
|
|
let port = args.port.unwrap_or(cfg.port);
|
|
let bind_url = format!("http://localhost:{port}");
|
|
let start_time = Instant::now();
|
|
|
|
tracing::info!("running hardware discovery");
|
|
let mut discovery_result = discovery::discover_system().await?;
|
|
tracing::info!(
|
|
hostname = %discovery_result.hostname,
|
|
devices = discovery_result.devices.len(),
|
|
"discovery complete"
|
|
);
|
|
|
|
// Build harness registry from config. In-process harnesses (candle)
|
|
// need to know neuron's own bind URL so they can return it from
|
|
// inference_endpoint.
|
|
let registry = HarnessRegistry::from_configs(&cfg.harnesses, &bind_url, &cfg.harness);
|
|
discovery_result.harnesses = registry.names();
|
|
let candle = registry.candle();
|
|
|
|
// Activation: load default models before binding the listener.
|
|
// Each load may take tens of seconds to several minutes depending
|
|
// on model size and HF cache state — keep TimeoutStartSec in the
|
|
// systemd unit generous enough to cover the slowest entry.
|
|
startup::load_default_models(®istry, &cfg.default_models).await;
|
|
|
|
let health_cache = Arc::new(health::HealthCache::new());
|
|
health_cache
|
|
.set_has_gpus(!discovery_result.devices.is_empty())
|
|
.await;
|
|
|
|
let poller_cache = Arc::clone(&health_cache);
|
|
tokio::spawn(async move {
|
|
poller_cache.poll_loop(start_time).await;
|
|
});
|
|
|
|
let state = Arc::new(api::NeuronState {
|
|
discovery: discovery_result,
|
|
health_cache,
|
|
registry: RwLock::new(registry),
|
|
candle,
|
|
});
|
|
|
|
let app = api::neuron_routes().with_state(Arc::clone(&state));
|
|
let addr: std::net::SocketAddr = format!("0.0.0.0:{port}").parse()?;
|
|
tracing::info!("neuron listening on {addr}");
|
|
let listener = tokio::net::TcpListener::bind(addr).await?;
|
|
axum::serve(listener, app)
|
|
.with_graceful_shutdown(startup::shutdown_signal())
|
|
.await?;
|
|
|
|
// Deactivation: serve has returned (graceful shutdown signal
|
|
// received and connections drained). Release CUDA contexts / VRAM
|
|
// by unloading every model before exiting; systemd's TimeoutStopSec
|
|
// bounds how long this phase may take.
|
|
let registry = state.registry.read().await;
|
|
startup::unload_all_models(®istry).await;
|
|
tracing::info!("shutdown complete");
|
|
|
|
Ok(())
|
|
}
|