From 6779b7526a25fb22b4e2e2cd6b0366dbe28acaa1 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Mon, 18 May 2026 17:56:08 +0300 Subject: [PATCH] feat(neuron): load default_models on service activation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 5 of the candle-native pivot. Adds first-class support for auto-loading a configured set of models when the neuron service activates. Config: - NeuronConfig.default_models: Vec<ModelSpec> (defaults to []). - neuron.example.toml ships a commented [[default_models]] example. Activation flow (crates/neuron/src/startup.rs::load_default_models): - Sequential — VRAM contention makes parallel loads risky. - Per-entry timing logged at info level on success. - Failures logged as warnings; the next entry is still attempted. - An empty list short-circuits without log noise. Called from main.rs after the registry is built and before the axum listener binds, so /models reflects the loaded state from the very first request. data/neuron.service gains TimeoutStartSec=1800s. With activation blocked on potentially slow first-time HF downloads + GGUF materialisation, systemd's default 90s would kill larger model loads mid-flight. Two non-gated tests in tests/activation.rs cover the continues-past-failure and empty-list paths using a synthetically unknown harness name to fail loads fast without touching the network. The cuda-integration test from earlier stages still exercises the real load/unload lifecycle. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/neuron/src/config.rs | 9 ++++- crates/neuron/src/lib.rs | 1 + crates/neuron/src/main.rs | 8 ++++- crates/neuron/src/startup.rs | 38 +++++++++++++++++++++ crates/neuron/tests/activation.rs | 56 +++++++++++++++++++++++++++++++ data/neuron.service | 5 +++ neuron.example.toml | 16 +++++++++ 7 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 crates/neuron/src/startup.rs create mode 100644 crates/neuron/tests/activation.rs diff --git a/crates/neuron/src/config.rs b/crates/neuron/src/config.rs index e9b2250..a9e62f4 100644 --- a/crates/neuron/src/config.rs +++ b/crates/neuron/src/config.rs @@ -1,6 +1,6 @@ //! Neuron configuration loaded from neuron.toml. -use cortex_core::harness::HarnessConfig; +use cortex_core::harness::{HarnessConfig, ModelSpec}; use figment::{ Figment, providers::{Env, Format, Toml}, @@ -17,6 +17,12 @@ pub struct NeuronConfig { /// Per-harness configuration. Currently only `candle` is recognised. #[serde(default)] pub harness: HarnessSettings, + /// Models to auto-load when the neuron service activates. Each entry + /// is loaded sequentially before the HTTP listener binds. A failure + /// on any single entry logs a warning and proceeds — broken entries + /// don't prevent the rest of the fleet from starting. + #[serde(default)] + pub default_models: Vec<ModelSpec>, } /// Settings for individual harness implementations. 
Each harness owns @@ -55,6 +61,7 @@ impl Default for NeuronConfig { port: 13131, harnesses: vec![], harness: HarnessSettings::default(), + default_models: vec![], } } } diff --git a/crates/neuron/src/lib.rs b/crates/neuron/src/lib.rs index d860b25..5c72182 100644 --- a/crates/neuron/src/lib.rs +++ b/crates/neuron/src/lib.rs @@ -3,3 +3,4 @@ pub mod config; pub mod discovery; pub mod harness; pub mod health; +pub mod startup; diff --git a/crates/neuron/src/main.rs b/crates/neuron/src/main.rs index ed889e8..b78b9f4 100644 --- a/crates/neuron/src/main.rs +++ b/crates/neuron/src/main.rs @@ -1,6 +1,6 @@ use anyhow::Result; use clap::Parser; -use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health}; +use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health, startup}; use std::sync::Arc; use std::time::Instant; use tokio::sync::RwLock; @@ -55,6 +55,12 @@ async fn main() -> Result<()> { discovery_result.harnesses = registry.names(); let candle = registry.candle(); + // Activation: load default models before binding the listener. + // Each load may take tens of seconds to several minutes depending + // on model size and HF cache state — keep TimeoutStartSec in the + // systemd unit generous enough to cover the slowest entry. + startup::load_default_models(&registry, &cfg.default_models).await; + let health_cache = Arc::new(health::HealthCache::new()); health_cache .set_has_gpus(!discovery_result.devices.is_empty()) diff --git a/crates/neuron/src/startup.rs b/crates/neuron/src/startup.rs new file mode 100644 index 0000000..d4c5296 --- /dev/null +++ b/crates/neuron/src/startup.rs @@ -0,0 +1,38 @@ +//! Activation-time orchestration. +//! +//! Wired from `main.rs` after the harness registry is built and before +//! the HTTP listener binds. Kept in its own module so the logic is +//! unit-testable without spinning up a full neuron process. 
+ +use crate::harness::HarnessRegistry; +use cortex_core::harness::ModelSpec; +use std::time::Instant; + +/// Load each spec sequentially against the registry, treating +/// individual failures as warnings rather than fatal errors. +/// +/// VRAM contention makes parallel loads risky; the sequential path is +/// boring but correct. The function logs elapsed time per load so an +/// operator can see which model is hogging activation. +pub async fn load_default_models(registry: &HarnessRegistry, specs: &[ModelSpec]) { + if specs.is_empty() { + return; + } + tracing::info!(count = specs.len(), "loading default models"); + for spec in specs { + let start = Instant::now(); + match registry.load_model(spec).await { + Ok(()) => tracing::info!( + model = %spec.model_id, + elapsed_ms = start.elapsed().as_millis() as u64, + "loaded default model" + ), + Err(e) => tracing::warn!( + model = %spec.model_id, + error = %e, + elapsed_ms = start.elapsed().as_millis() as u64, + "failed to load default model, continuing" + ), + } + } +} diff --git a/crates/neuron/tests/activation.rs b/crates/neuron/tests/activation.rs new file mode 100644 index 0000000..8daa972 --- /dev/null +++ b/crates/neuron/tests/activation.rs @@ -0,0 +1,56 @@ +//! Activation-time behaviour: load_default_models continues past +//! individual failures so a single broken catalogue entry doesn't +//! prevent the rest of the fleet from starting. + +use cortex_core::harness::{HarnessConfig, ModelSpec}; +use neuron::config::HarnessSettings; +use neuron::harness::HarnessRegistry; +use neuron::startup; + +#[tokio::test] +async fn test_load_default_models_skips_unknown_harness() { + let registry = HarnessRegistry::from_configs( + &[HarnessConfig { + name: "candle".into(), + }], + "http://localhost:0", + &HarnessSettings::default(), + ); + + // Both entries fail synchronously inside the registry — no network + // call escapes (the harness lookup mismatches before hf-hub is + // touched). 
The function should still return cleanly. + let specs = vec![ + ModelSpec { + model_id: "model-a".into(), + harness: "no-such-harness".into(), + quant: None, + tensor_parallel: None, + devices: None, + }, + ModelSpec { + model_id: "model-b".into(), + harness: "no-such-harness".into(), + quant: None, + tensor_parallel: None, + devices: None, + }, + ]; + + startup::load_default_models(&registry, &specs).await; + + let listed = registry + .list_all_models() + .await + .expect("list_all_models should succeed"); + assert!( + listed.is_empty(), + "no models should be loaded after failed entries" + ); +} + +#[tokio::test] +async fn test_load_default_models_empty_is_noop() { + let registry = HarnessRegistry::new(); + startup::load_default_models(&registry, &[]).await; +} diff --git a/data/neuron.service b/data/neuron.service index 84428dd..207b4da 100644 --- a/data/neuron.service +++ b/data/neuron.service @@ -10,6 +10,11 @@ Restart=on-failure RestartSec=5 User=neuron Group=neuron +# Loading default_models from neuron.toml happens before the HTTP +# listener binds; large models can take many minutes to download and +# materialise on first activation. systemd's default TimeoutStartSec +# (90s) is far too short; allow 30 minutes. +TimeoutStartSec=1800s [Install] WantedBy=multi-user.target diff --git a/neuron.example.toml b/neuron.example.toml index 8a71372..46108a6 100644 --- a/neuron.example.toml +++ b/neuron.example.toml @@ -22,3 +22,19 @@ name = "candle" # HuggingFace cache directory for model weights. When unset, hf-hub's # default (~/.cache/huggingface) is used. # hf_cache = "/var/lib/neuron/hf-cache" + +# -- Default models ---------------------------------------------------------- +# Models listed here are loaded automatically when the neuron service +# activates. Loading is sequential — a slow or failing entry doesn't +# block the rest of the fleet, but it does push out the time before +# neuron starts serving HTTP, so keep the list short. 
Operators can +# load additional models on demand via POST /models/load. +# +# Make sure data/neuron.service's TimeoutStartSec is generous enough to +# cover the slowest entry's first-time download + materialisation. + +# [[default_models]] +# model_id = "Qwen/Qwen3-0.6B-GGUF" +# harness = "candle" +# quant = "Q4_K_M" +# devices = [0]