refactor: cortex talks to neurons instead of mistral.rs directly
Replace NodeConfig (static vram_mb, pinned) with NeuronEndpoint.
Hardware discovery and model pinning now come from neuron API and
models.toml catalogue respectively.
- config.rs: nodes -> neurons, add models_config path
- catalogue.rs: ModelProfile with pinned_on, ModelCatalogue
- poller.rs: poll neuron GET /models (ModelInfo format)
- router.rs: resolve inference endpoint via neuron GET /models/{id}/endpoint
- evictor.rs: call neuron POST /models/unload
- node.rs: remove vram_mb, pinned fields (come from discovery/catalogue)
- All 22 gateway tests updated to mock neuron API
- Remove MistralModelsResponse, ModelLifecycleRequest (no longer needed)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
67
crates/cortex-core/src/catalogue.rs
Normal file
67
crates/cortex-core/src/catalogue.rs
Normal file
@@ -0,0 +1,67 @@
|
||||
//! Model catalogue — profiles describing how to serve each model.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::Path;
|
||||
|
||||
/// A model serving profile loaded from models.toml.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ModelProfile {
|
||||
pub id: String,
|
||||
pub harness: String,
|
||||
#[serde(default)]
|
||||
pub quant: Option<String>,
|
||||
/// Estimated VRAM usage in MB when loaded.
|
||||
#[serde(default)]
|
||||
pub vram_mb: Option<u64>,
|
||||
/// Minimum number of GPU devices required.
|
||||
#[serde(default = "default_min_devices")]
|
||||
pub min_devices: u32,
|
||||
/// Minimum VRAM per device in MB.
|
||||
#[serde(default)]
|
||||
pub min_device_vram_mb: Option<u64>,
|
||||
/// Neurons where this model should never be evicted.
|
||||
#[serde(default)]
|
||||
pub pinned_on: Vec<String>,
|
||||
}
|
||||
|
||||
/// Serde default for `ModelProfile::min_devices`: a profile that does not
/// specify a device count requires a single GPU device.
fn default_min_devices() -> u32 {
    1
}
|
||||
|
||||
/// The full model catalogue.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
||||
pub struct ModelCatalogue {
|
||||
#[serde(default)]
|
||||
pub models: Vec<ModelProfile>,
|
||||
}
|
||||
|
||||
impl ModelCatalogue {
|
||||
/// Load the catalogue from a TOML file. Returns empty catalogue if file doesn't exist.
|
||||
pub fn load(path: impl AsRef<Path>) -> Self {
|
||||
let path = path.as_ref();
|
||||
if !path.exists() {
|
||||
tracing::info!(path = %path.display(), "no model catalogue found, using empty");
|
||||
return Self::default();
|
||||
}
|
||||
match std::fs::read_to_string(path) {
|
||||
Ok(contents) => match toml::from_str(&contents) {
|
||||
Ok(cat) => cat,
|
||||
Err(e) => {
|
||||
tracing::warn!(path = %path.display(), error = %e, "failed to parse model catalogue");
|
||||
Self::default()
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
tracing::warn!(path = %path.display(), error = %e, "failed to read model catalogue");
|
||||
Self::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if a model is pinned on a given neuron.
|
||||
pub fn is_pinned(&self, model_id: &str, neuron_name: &str) -> bool {
|
||||
self.models
|
||||
.iter()
|
||||
.any(|p| p.id == model_id && p.pinned_on.contains(&neuron_name.to_string()))
|
||||
}
|
||||
}
|
||||
@@ -9,7 +9,15 @@ use std::path::Path;
|
||||
pub struct GatewayConfig {
|
||||
pub gateway: GatewaySettings,
|
||||
pub eviction: EvictionSettings,
|
||||
pub nodes: Vec<NodeConfig>,
|
||||
/// Neuron endpoints (replaces old NodeConfig with static vram_mb/pinned).
|
||||
pub neurons: Vec<NeuronEndpoint>,
|
||||
/// Path to the model catalogue file (default: "models.toml").
|
||||
#[serde(default = "default_models_path")]
|
||||
pub models_config: String,
|
||||
}
|
||||
|
||||
/// Serde default for `GatewayConfig::models_config`: the catalogue file name
/// used when the configuration does not set one.
fn default_models_path() -> String {
    String::from("models.toml")
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -24,8 +32,7 @@ pub struct GatewaySettings {
|
||||
pub struct EvictionSettings {
|
||||
/// Eviction strategy: "lru" or "priority"
|
||||
pub strategy: EvictionStrategy,
|
||||
/// Restart the mistralrs process after this many load/unload cycles
|
||||
/// to reclaim fragmented VRAM. 0 = never.
|
||||
/// Number of load/unload cycles before flagging for defrag. 0 = never.
|
||||
#[serde(default)]
|
||||
pub defrag_after_cycles: u32,
|
||||
}
|
||||
@@ -37,23 +44,19 @@ pub enum EvictionStrategy {
|
||||
Priority,
|
||||
}
|
||||
|
||||
/// A neuron endpoint in the fleet. Hardware details come from
|
||||
/// neuron's /discovery endpoint, not from config.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct NodeConfig {
|
||||
/// Human-readable node name (e.g. "gpu-large")
|
||||
pub struct NeuronEndpoint {
|
||||
/// Human-readable node name (e.g. "beast")
|
||||
pub name: String,
|
||||
/// Base URL of the mistralrs HTTP server (e.g. "http://gpu-large.internal:8080")
|
||||
/// Base URL of the neuron daemon (e.g. "http://beast.internal:9090")
|
||||
pub endpoint: String,
|
||||
/// Total VRAM in MB across all GPUs on this node
|
||||
pub vram_mb: u64,
|
||||
/// Model IDs that should never be evicted from this node
|
||||
#[serde(default)]
|
||||
pub pinned: Vec<String>,
|
||||
}
|
||||
|
||||
impl GatewayConfig {
|
||||
/// Load configuration from a TOML file, with environment variable overrides.
|
||||
/// Env vars are prefixed with `CORTEX_` and use `__` as a separator
|
||||
/// (e.g. `CORTEX_GATEWAY__LISTEN=0.0.0.0:9000`).
|
||||
/// Env vars are prefixed with `CORTEX_` and use `__` as a separator.
|
||||
pub fn load(path: impl AsRef<Path>) -> Result<Self, Box<figment::Error>> {
|
||||
Figment::new()
|
||||
.merge(Toml::file(path))
|
||||
@@ -74,7 +77,8 @@ impl Default for GatewayConfig {
|
||||
strategy: EvictionStrategy::Lru,
|
||||
defrag_after_cycles: 50,
|
||||
},
|
||||
nodes: vec![],
|
||||
neurons: vec![],
|
||||
models_config: default_models_path(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
pub mod anthropic;
|
||||
pub mod catalogue;
|
||||
pub mod config;
|
||||
pub mod discovery;
|
||||
pub mod harness;
|
||||
|
||||
@@ -2,13 +2,12 @@ use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Runtime state of a single node in the fleet.
|
||||
/// Runtime state of a single neuron in the fleet.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NodeState {
|
||||
pub name: String,
|
||||
/// Base URL of the neuron daemon (e.g. "http://beast.internal:9090").
|
||||
pub endpoint: String,
|
||||
pub vram_mb: u64,
|
||||
pub pinned: Vec<String>,
|
||||
pub healthy: bool,
|
||||
pub models: HashMap<String, ModelEntry>,
|
||||
/// Number of load/unload cycles since last process restart.
|
||||
@@ -27,7 +26,7 @@ pub struct ModelEntry {
|
||||
pub vram_estimate_mb: Option<u64>,
|
||||
}
|
||||
|
||||
/// Model lifecycle status, matching the mistral.rs API.
|
||||
/// Model lifecycle status.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum ModelStatus {
|
||||
@@ -52,23 +51,3 @@ pub struct ModelLocation {
|
||||
pub status: ModelStatus,
|
||||
pub vram_estimate_mb: Option<u64>,
|
||||
}
|
||||
|
||||
/// Response from mistral.rs `GET /v1/models`.
|
||||
/// This is the upstream format we parse when polling nodes.
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct MistralModelsResponse {
|
||||
pub data: Vec<MistralModelEntry>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct MistralModelEntry {
|
||||
pub id: String,
|
||||
#[serde(default)]
|
||||
pub status: Option<String>,
|
||||
}
|
||||
|
||||
/// Request body for mistral.rs model lifecycle endpoints.
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct ModelLifecycleRequest {
|
||||
pub model_id: String,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user