feat(neuron): load default_models on service activation

Stage 5 of the candle-native pivot. Adds first-class support for auto-loading a configured set of models when the neuron service activates.

Config:
- NeuronConfig.default_models: Vec<ModelSpec> (defaults to []).
- neuron.example.toml ships a commented [[default_models]] example.

Activation flow (crates/neuron/src/startup.rs::load_default_models):
- Sequential — VRAM contention makes parallel loads risky.
- Per-entry timing logged at info level on success.
- Failures logged as warnings; the next entry is still attempted.
- An empty list short-circuits without log noise.

Called from main.rs after the registry is built and before the axum listener binds, so /models reflects the loaded state from the very first request.

data/neuron.service gains TimeoutStartSec=1800s. With activation blocked on potentially slow first-time HF downloads + GGUF materialisation, systemd's default 90s would kill larger model loads mid-flight.

Two non-gated tests in tests/activation.rs cover the continues-past-failure and empty-list paths using a synthetically unknown harness name to fail loads fast without touching the network. The cuda-integration test from earlier stages still exercises the real load/unload lifecycle.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 17:56:08 +03:00
parent 84f5662df1
commit 6779b7526a
7 changed files with 131 additions and 2 deletions
--- a/crates/neuron/src/config.rs
+++ b/crates/neuron/src/config.rs
@@ -1,6 +1,6 @@
 //! Neuron configuration loaded from neuron.toml.

-use cortex_core::harness::HarnessConfig;
+use cortex_core::harness::{HarnessConfig, ModelSpec};
 use figment::{
    Figment,
    providers::{Env, Format, Toml},
@@ -17,6 +17,12 @@ pub struct NeuronConfig {
    /// Per-harness configuration. Currently only `candle` is recognised.
    #[serde(default)]
    pub harness: HarnessSettings,
+    /// Models to auto-load when the neuron service activates. Each entry
+    /// is loaded sequentially before the HTTP listener binds. A failure
+    /// on any single entry logs a warning and proceeds — broken entries
+    /// don't prevent the rest of the fleet from starting.
+    #[serde(default)]
+    pub default_models: Vec<ModelSpec>,
 }

 /// Settings for individual harness implementations. Each harness owns
@@ -55,6 +61,7 @@ impl Default for NeuronConfig {
            port: 13131,
            harnesses: vec![],
            harness: HarnessSettings::default(),
+            default_models: vec![],
        }
    }
 }
--- a/crates/neuron/src/lib.rs
+++ b/crates/neuron/src/lib.rs
@@ -3,3 +3,4 @@ pub mod config;
 pub mod discovery;
 pub mod harness;
 pub mod health;
+pub mod startup;
--- a/crates/neuron/src/main.rs
+++ b/crates/neuron/src/main.rs
@@ -1,6 +1,6 @@
 use anyhow::Result;
 use clap::Parser;
-use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health};
+use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health, startup};
 use std::sync::Arc;
 use std::time::Instant;
 use tokio::sync::RwLock;
@@ -55,6 +55,12 @@ async fn main() -> Result<()> {
    discovery_result.harnesses = registry.names();
    let candle = registry.candle();

+    // Activation: load default models before binding the listener.
+    // Each load may take tens of seconds to several minutes depending
+    // on model size and HF cache state — keep TimeoutStartSec in the
+    // systemd unit generous enough to cover the slowest entry.
+    startup::load_default_models(&registry, &cfg.default_models).await;
+
    let health_cache = Arc::new(health::HealthCache::new());
    health_cache
        .set_has_gpus(!discovery_result.devices.is_empty())
--- a/crates/neuron/src/startup.rs
+++ b/crates/neuron/src/startup.rs
@@ -0,0 +1,38 @@
+//! Activation-time orchestration.
+//!
+//! Wired from `main.rs` after the harness registry is built and before
+//! the HTTP listener binds. Kept in its own module so the logic is
+//! unit-testable without spinning up a full neuron process.
+
+use crate::harness::HarnessRegistry;
+use cortex_core::harness::ModelSpec;
+use std::time::Instant;
+
+/// Load each spec sequentially against the registry, treating
+/// individual failures as warnings rather than fatal errors.
+///
+/// VRAM contention makes parallel loads risky; the sequential path is
+/// boring but correct. The function logs elapsed time per load so an
+/// operator can see which model is hogging activation.
+pub async fn load_default_models(registry: &HarnessRegistry, specs: &[ModelSpec]) {
+    if specs.is_empty() {
+        return;
+    }
+    tracing::info!(count = specs.len(), "loading default models");
+    for spec in specs {
+        let start = Instant::now();
+        match registry.load_model(spec).await {
+            Ok(()) => tracing::info!(
+                model = %spec.model_id,
+                elapsed_ms = start.elapsed().as_millis() as u64,
+                "loaded default model"
+            ),
+            Err(e) => tracing::warn!(
+                model = %spec.model_id,
+                error = %e,
+                elapsed_ms = start.elapsed().as_millis() as u64,
+                "failed to load default model, continuing"
+            ),
+        }
+    }
+}
--- a/crates/neuron/tests/activation.rs
+++ b/crates/neuron/tests/activation.rs
@@ -0,0 +1,56 @@
+//! Activation-time behaviour: load_default_models continues past
+//! individual failures so a single broken catalogue entry doesn't
+//! prevent the rest of the fleet from starting.
+
+use cortex_core::harness::{HarnessConfig, ModelSpec};
+use neuron::config::HarnessSettings;
+use neuron::harness::HarnessRegistry;
+use neuron::startup;
+
+#[tokio::test]
+async fn test_load_default_models_skips_unknown_harness() {
+    let registry = HarnessRegistry::from_configs(
+        &[HarnessConfig {
+            name: "candle".into(),
+        }],
+        "http://localhost:0",
+        &HarnessSettings::default(),
+    );
+
+    // Both entries fail synchronously inside the registry — no network
+    // call escapes (the harness lookup mismatches before hf-hub is
+    // touched). The function should still return cleanly.
+    let specs = vec![
+        ModelSpec {
+            model_id: "model-a".into(),
+            harness: "no-such-harness".into(),
+            quant: None,
+            tensor_parallel: None,
+            devices: None,
+        },
+        ModelSpec {
+            model_id: "model-b".into(),
+            harness: "no-such-harness".into(),
+            quant: None,
+            tensor_parallel: None,
+            devices: None,
+        },
+    ];
+
+    startup::load_default_models(&registry, &specs).await;
+
+    let listed = registry
+        .list_all_models()
+        .await
+        .expect("list_all_models should succeed");
+    assert!(
+        listed.is_empty(),
+        "no models should be loaded after failed entries"
+    );
+}
+
+#[tokio::test]
+async fn test_load_default_models_empty_is_noop() {
+    let registry = HarnessRegistry::new();
+    startup::load_default_models(&registry, &[]).await;
+}
--- a/data/neuron.service
+++ b/data/neuron.service
@@ -10,6 +10,11 @@ Restart=on-failure
 RestartSec=5
 User=neuron
 Group=neuron
+# Loading default_models from neuron.toml happens before the HTTP
+# listener binds; large models can take many minutes to download and
+# materialise on first activation. systemd's default TimeoutStartSec
+# (90s) is far too short; allow 30 minutes.
+TimeoutStartSec=1800s

 [Install]
 WantedBy=multi-user.target
--- a/neuron.example.toml
+++ b/neuron.example.toml
@@ -22,3 +22,19 @@ name = "candle"
 # HuggingFace cache directory for model weights. When unset, hf-hub's
 # default (~/.cache/huggingface) is used.
 # hf_cache = "/var/lib/neuron/hf-cache"
+
+# -- Default models ----------------------------------------------------------
+# Models listed here are loaded automatically when the neuron service
+# activates. Loading is sequential: a failing entry is logged and
+# skipped so the remaining entries still load, but every entry delays
+# the moment neuron starts serving HTTP, so keep the list short.
+# Operators can load additional models on demand via POST /models/load.
+#
+# Make sure data/neuron.service's TimeoutStartSec is generous enough to
+# cover the slowest entry's first-time download + materialisation.
+
+# [[default_models]]
+# model_id = "Qwen/Qwen3-0.6B-GGUF"
+# harness = "candle"
+# quant = "Q4_K_M"
+# devices = [0]