From 6779b7526a25fb22b4e2e2cd6b0366dbe28acaa1 Mon Sep 17 00:00:00 2001
From: rob thijssen <grenade@rob.tn>
Date: Mon, 18 May 2026 17:56:08 +0300
Subject: [PATCH] feat(neuron): load default_models on service activation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stage 5 of the candle-native pivot. Adds first-class support for
auto-loading a configured set of models when the neuron service
activates.

Config:
- NeuronConfig.default_models: Vec<ModelSpec> (defaults to []).
- neuron.example.toml ships a commented [[default_models]] example.

Activation flow (crates/neuron/src/startup.rs::load_default_models):
- Sequential — VRAM contention makes parallel loads risky.
- Per-entry timing logged at info level on success.
- Failures logged as warnings; the next entry is still attempted.
- An empty list short-circuits without log noise.

Called from main.rs after the registry is built and before the axum
listener binds, so /models reflects the loaded state from the very
first request.

data/neuron.service gains TimeoutStartSec=1800s. With activation
blocked on potentially slow first-time HF downloads + GGUF
materialisation, systemd's default 90s would kill larger model loads
mid-flight.

Two non-gated tests in tests/activation.rs cover the
continues-past-failure and empty-list paths using a synthetically
unknown harness name to fail loads fast without touching the network.
The cuda-integration test from earlier stages still exercises the
real load/unload lifecycle.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/neuron/src/config.rs       |  9 ++++-
 crates/neuron/src/lib.rs          |  1 +
 crates/neuron/src/main.rs         |  8 ++++-
 crates/neuron/src/startup.rs      | 38 +++++++++++++++++++++
 crates/neuron/tests/activation.rs | 56 +++++++++++++++++++++++++++++++
 data/neuron.service               |  5 +++
 neuron.example.toml               | 16 +++++++++
 7 files changed, 131 insertions(+), 2 deletions(-)
 create mode 100644 crates/neuron/src/startup.rs
 create mode 100644 crates/neuron/tests/activation.rs
diff --git a/crates/neuron/src/config.rs b/crates/neuron/src/config.rs
index e9b2250..a9e62f4 100644
--- a/crates/neuron/src/config.rs
+++ b/crates/neuron/src/config.rs
@@ -1,6 +1,6 @@
 //! Neuron configuration loaded from neuron.toml.
 
-use cortex_core::harness::HarnessConfig;
+use cortex_core::harness::{HarnessConfig, ModelSpec};
 use figment::{
     Figment,
     providers::{Env, Format, Toml},
@@ -17,6 +17,12 @@ pub struct NeuronConfig {
     /// Per-harness configuration. Currently only `candle` is recognised.
     #[serde(default)]
     pub harness: HarnessSettings,
+    /// Models to auto-load when the neuron service activates. Each entry
+    /// is loaded sequentially before the HTTP listener binds. A failure
+    /// on any single entry logs a warning and proceeds — broken entries
+    /// don't prevent the rest of the fleet from starting.
+    #[serde(default)]
+    pub default_models: Vec<ModelSpec>,
 }
 
 /// Settings for individual harness implementations. Each harness owns
@@ -55,6 +61,7 @@ impl Default for NeuronConfig {
             port: 13131,
             harnesses: vec![],
             harness: HarnessSettings::default(),
+            default_models: vec![],
         }
     }
 }
diff --git a/crates/neuron/src/lib.rs b/crates/neuron/src/lib.rs
index d860b25..5c72182 100644
--- a/crates/neuron/src/lib.rs
+++ b/crates/neuron/src/lib.rs
@@ -3,3 +3,4 @@ pub mod config;
 pub mod discovery;
 pub mod harness;
 pub mod health;
+pub mod startup;
diff --git a/crates/neuron/src/main.rs b/crates/neuron/src/main.rs
index ed889e8..b78b9f4 100644
--- a/crates/neuron/src/main.rs
+++ b/crates/neuron/src/main.rs
@@ -1,6 +1,6 @@
 use anyhow::Result;
 use clap::Parser;
-use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health};
+use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health, startup};
 use std::sync::Arc;
 use std::time::Instant;
 use tokio::sync::RwLock;
@@ -55,6 +55,12 @@ async fn main() -> Result<()> {
     discovery_result.harnesses = registry.names();
     let candle = registry.candle();
 
+    // Activation: load default models before binding the listener.
+    // Loads run one after another and each may take tens of seconds to
+    // several minutes (model size, HF cache state) — TimeoutStartSec in
+    // the systemd unit must cover the combined time of all entries.
+    startup::load_default_models(&registry, &cfg.default_models).await;
+
     let health_cache = Arc::new(health::HealthCache::new());
     health_cache
         .set_has_gpus(!discovery_result.devices.is_empty())
diff --git a/crates/neuron/src/startup.rs b/crates/neuron/src/startup.rs
new file mode 100644
index 0000000..d4c5296
--- /dev/null
+++ b/crates/neuron/src/startup.rs
@@ -0,0 +1,38 @@
+//! Activation-time orchestration.
+//!
+//! Wired from `main.rs` after the harness registry is built and before
+//! the HTTP listener binds. Kept in its own module so the logic is
+//! unit-testable without spinning up a full neuron process.
+
+use crate::harness::HarnessRegistry;
+use cortex_core::harness::ModelSpec;
+use std::time::Instant;
+
+/// Loads every entry of `specs` through `registry`, one at a time and
+/// in order. Loads are kept sequential because VRAM contention makes
+/// parallel loading risky.
+///
+/// A failed load is logged as a warning and the next entry is still
+/// attempted. Each outcome is logged with its duration in milliseconds.
+pub async fn load_default_models(registry: &HarnessRegistry, specs: &[ModelSpec]) {
+    // Nothing configured: return before emitting any log line.
+    if specs.is_empty() {
+        return;
+    }
+    tracing::info!(count = specs.len(), "loading default models");
+    for spec in specs {
+        let started = Instant::now();
+        let outcome = registry.load_model(spec).await;
+        // Taken right after the load completes; used by whichever arm logs.
+        let elapsed_ms = started.elapsed().as_millis() as u64;
+        match outcome {
+            Ok(()) => tracing::info!(model = %spec.model_id, elapsed_ms, "loaded default model"),
+            Err(e) => tracing::warn!(
+                model = %spec.model_id,
+                error = %e,
+                elapsed_ms,
+                "failed to load default model, continuing"
+            ),
+        }
+    }
+}
diff --git a/crates/neuron/tests/activation.rs b/crates/neuron/tests/activation.rs
new file mode 100644
index 0000000..8daa972
--- /dev/null
+++ b/crates/neuron/tests/activation.rs
@@ -0,0 +1,56 @@
+//! Activation-time behaviour: load_default_models continues past
+//! individual failures so a single broken catalogue entry doesn't
+//! prevent the rest of the fleet from starting.
+
+use cortex_core::harness::{HarnessConfig, ModelSpec};
+use neuron::config::HarnessSettings;
+use neuron::harness::HarnessRegistry;
+use neuron::startup;
+
+/// Specs naming an unregistered harness fail to load, yet the call
+/// returns normally and no model ends up listed.
+#[tokio::test]
+async fn test_load_default_models_skips_unknown_harness() {
+    // Register `candle` only, so the harness named by the specs below
+    // is unknown to the registry.
+    let configured = [HarnessConfig {
+        name: "candle".into(),
+    }];
+    let registry = HarnessRegistry::from_configs(
+        &configured,
+        "http://localhost:0",
+        &HarnessSettings::default(),
+    );
+
+    // Neither spec names a configured harness, so both loads are
+    // expected to fail at harness lookup without reaching the network.
+    // The call must still return cleanly.
+    let specs: Vec<ModelSpec> = ["model-a", "model-b"]
+        .iter()
+        .map(|&id| ModelSpec {
+            model_id: id.into(),
+            harness: "no-such-harness".into(),
+            quant: None,
+            tensor_parallel: None,
+            devices: None,
+        })
+        .collect();
+
+    startup::load_default_models(&registry, &specs).await;
+
+    // Failed loads must leave the registry without any loaded model.
+    let loaded = registry
+        .list_all_models()
+        .await
+        .expect("list_all_models should succeed");
+    assert!(
+        loaded.is_empty(),
+        "no models should be loaded after failed entries"
+    );
+}
+
+/// An empty spec list is a no-op: the call must simply return.
+#[tokio::test]
+async fn test_load_default_models_empty_is_noop() {
+    startup::load_default_models(&HarnessRegistry::new(), &[]).await;
+}
diff --git a/data/neuron.service b/data/neuron.service
index 84428dd..207b4da 100644
--- a/data/neuron.service
+++ b/data/neuron.service
@@ -10,6 +10,11 @@ Restart=on-failure
 RestartSec=5
 User=neuron
 Group=neuron
+# Loading default_models from neuron.toml happens before the HTTP
+# listener binds; large models can take many minutes to download and
+# materialise on first activation. systemd's default TimeoutStartSec
+# (90s) is far too short; allow 30 minutes.
+TimeoutStartSec=1800s
 
 [Install]
 WantedBy=multi-user.target
diff --git a/neuron.example.toml b/neuron.example.toml
index 8a71372..46108a6 100644
--- a/neuron.example.toml
+++ b/neuron.example.toml
@@ -22,3 +22,19 @@ name = "candle"
 # HuggingFace cache directory for model weights. When unset, hf-hub's
 # default (~/.cache/huggingface) is used.
 # hf_cache = "/var/lib/neuron/hf-cache"
+
+# -- Default models ----------------------------------------------------------
+# Models listed here are loaded automatically when the neuron service
+# activates. Loading is sequential: a failing entry is logged and
+# skipped rather than aborting the rest, but every entry (slow or
+# failing) delays the point at which neuron starts serving HTTP, so keep
+# the list short. Load further models on demand via POST /models/load.
+#
+# Make sure data/neuron.service's TimeoutStartSec is generous enough to
+# cover the combined first-time download + materialisation of all entries.
+
+# [[default_models]]
+# model_id = "Qwen/Qwen3-0.6B-GGUF"
+# harness = "candle"
+# quant = "Q4_K_M"
+# devices = [0]