refactor: cortex talks to neurons instead of mistral.rs directly

Replace NodeConfig (static vram_mb, pinned) with NeuronEndpoint. Hardware
discovery and model pinning now come from the neuron API and the models.toml
catalogue, respectively.

- config.rs: nodes -> neurons, add models_config path
- catalogue.rs: ModelProfile with pinned_on, ModelCatalogue
- poller.rs: poll neuron GET /models (ModelInfo format)
- router.rs: resolve inference endpoint via neuron GET /models/{id}/endpoint
- evictor.rs: call neuron POST /models/unload
- node.rs: remove vram_mb, pinned fields (come from discovery/catalogue)
- All 22 gateway tests updated to mock neuron API
- Remove MistralModelsResponse, ModelLifecycleRequest (no longer needed)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 14:42:52 +03:00
parent 26e5e7ead8
commit e42e8ee81f
19 changed files with 385 additions and 437 deletions
--- a/crates/cortex-core/src/catalogue.rs
+++ b/crates/cortex-core/src/catalogue.rs
@@ -0,0 +1,67 @@
+//! Model catalogue — profiles describing how to serve each model.
+
+use serde::{Deserialize, Serialize};
+use std::path::Path;
+
+/// A model serving profile; one element of `ModelCatalogue::models`, loaded from models.toml.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelProfile {
+    pub id: String, // model identifier; compared with `model_id` in `ModelCatalogue::is_pinned`
+    pub harness: String, // NOTE(review): presumably names the serving harness — confirm
+    #[serde(default)]
+    pub quant: Option<String>, // `None` when the key is omitted
+    /// Estimated VRAM usage in MB when loaded; `None` when the key is omitted.
+    #[serde(default)]
+    pub vram_mb: Option<u64>,
+    /// Minimum number of GPU devices required; defaults to 1 when the key is omitted.
+    #[serde(default = "default_min_devices")]
+    pub min_devices: u32,
+    /// Minimum VRAM per device in MB; `None` when the key is omitted.
+    #[serde(default)]
+    pub min_device_vram_mb: Option<u64>,
+    /// Names of neurons where this model should never be evicted; empty when omitted.
+    #[serde(default)]
+    pub pinned_on: Vec<String>,
+}
+
+fn default_min_devices() -> u32 {
+    1 // serde fallback for `ModelProfile::min_devices` when the key is absent
+}
+
+/// The full model catalogue: every `ModelProfile` read by `ModelCatalogue::load`.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct ModelCatalogue {
+    #[serde(default)]
+    pub models: Vec<ModelProfile>, // empty when the input has no `models` key
+}
+
+impl ModelCatalogue {
+    /// Load the catalogue from a TOML file; falls back to an empty catalogue on any failure.
+    pub fn load(path: impl AsRef<Path>) -> Self {
+        let path = path.as_ref();
+        match std::fs::read_to_string(path) { // read directly: no exists()/read race
+            Ok(contents) => match toml::from_str(&contents) {
+                Ok(cat) => cat,
+                Err(e) => {
+                    tracing::warn!(path = %path.display(), error = %e, "failed to parse model catalogue");
+                    Self::default()
+                }
+            },
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                tracing::info!(path = %path.display(), "no model catalogue found, using empty");
+                Self::default()
+            }
+            Err(e) => {
+                tracing::warn!(path = %path.display(), error = %e, "failed to read model catalogue");
+                Self::default()
+            }
+        }
+    }
+
+    /// Check if a model is pinned on a given neuron (exact match on model id and neuron name).
+    pub fn is_pinned(&self, model_id: &str, neuron_name: &str) -> bool {
+        self.models
+            .iter()
+            .any(|p| p.id == model_id && p.pinned_on.iter().any(|n| n == neuron_name))
+    }
+}
--- a/crates/cortex-core/src/config.rs
+++ b/crates/cortex-core/src/config.rs
@@ -9,7 +9,15 @@ use std::path::Path;
 pub struct GatewayConfig {
    pub gateway: GatewaySettings,
    pub eviction: EvictionSettings,
-    pub nodes: Vec<NodeConfig>,
+    /// Neuron endpoints (replaces old NodeConfig with static vram_mb/pinned).
+    pub neurons: Vec<NeuronEndpoint>,
+    /// Path to the model catalogue file (default: "models.toml").
+    #[serde(default = "default_models_path")]
+    pub models_config: String,
+}
+
+fn default_models_path() -> String {
+    "models.toml".into()
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -24,8 +32,7 @@ pub struct GatewaySettings {
 pub struct EvictionSettings {
    /// Eviction strategy: "lru" or "priority"
    pub strategy: EvictionStrategy,
-    /// Restart the mistralrs process after this many load/unload cycles
-    /// to reclaim fragmented VRAM. 0 = never.
+    /// Number of load/unload cycles before flagging for defrag. 0 = never.
    #[serde(default)]
    pub defrag_after_cycles: u32,
 }
@@ -37,23 +44,19 @@ pub enum EvictionStrategy {
    Priority,
 }

+/// A neuron endpoint in the fleet. Hardware details come from
+/// neuron's /discovery endpoint, not from config.
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct NodeConfig {
-    /// Human-readable node name (e.g. "gpu-large")
+pub struct NeuronEndpoint {
+    /// Human-readable node name (e.g. "beast")
    pub name: String,
-    /// Base URL of the mistralrs HTTP server (e.g. "http://gpu-large.internal:8080")
+    /// Base URL of the neuron daemon (e.g. "http://beast.internal:9090")
    pub endpoint: String,
-    /// Total VRAM in MB across all GPUs on this node
-    pub vram_mb: u64,
-    /// Model IDs that should never be evicted from this node
-    #[serde(default)]
-    pub pinned: Vec<String>,
 }

 impl GatewayConfig {
    /// Load configuration from a TOML file, with environment variable overrides.
-    /// Env vars are prefixed with `CORTEX_` and use `__` as a separator
-    /// (e.g. `CORTEX_GATEWAY__LISTEN=0.0.0.0:9000`).
+    /// Env vars are prefixed with `CORTEX_` and use `__` as a separator.
    pub fn load(path: impl AsRef<Path>) -> Result<Self, Box<figment::Error>> {
        Figment::new()
            .merge(Toml::file(path))
@@ -74,7 +77,8 @@ impl Default for GatewayConfig {
                strategy: EvictionStrategy::Lru,
                defrag_after_cycles: 50,
            },
-            nodes: vec![],
+            neurons: vec![],
+            models_config: default_models_path(),
        }
    }
 }
--- a/crates/cortex-core/src/lib.rs
+++ b/crates/cortex-core/src/lib.rs
@@ -1,4 +1,5 @@
 pub mod anthropic;
+pub mod catalogue;
 pub mod config;
 pub mod discovery;
 pub mod harness;
--- a/crates/cortex-core/src/node.rs
+++ b/crates/cortex-core/src/node.rs
@@ -2,13 +2,12 @@ use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;

-/// Runtime state of a single node in the fleet.
+/// Runtime state of a single neuron in the fleet.
 #[derive(Debug, Clone)]
 pub struct NodeState {
    pub name: String,
+    /// Base URL of the neuron daemon (e.g. "http://beast.internal:9090").
    pub endpoint: String,
-    pub vram_mb: u64,
-    pub pinned: Vec<String>,
    pub healthy: bool,
    pub models: HashMap<String, ModelEntry>,
    /// Number of load/unload cycles since last process restart.
@@ -27,7 +26,7 @@ pub struct ModelEntry {
    pub vram_estimate_mb: Option<u64>,
 }

-/// Model lifecycle status, matching the mistral.rs API.
+/// Model lifecycle status.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(rename_all = "lowercase")]
 pub enum ModelStatus {
@@ -52,23 +51,3 @@ pub struct ModelLocation {
    pub status: ModelStatus,
    pub vram_estimate_mb: Option<u64>,
 }
-
-/// Response from mistral.rs `GET /v1/models`.
-/// This is the upstream format we parse when polling nodes.
-#[derive(Debug, Clone, Deserialize)]
-pub struct MistralModelsResponse {
-    pub data: Vec<MistralModelEntry>,
-}
-
-#[derive(Debug, Clone, Deserialize)]
-pub struct MistralModelEntry {
-    pub id: String,
-    #[serde(default)]
-    pub status: Option<String>,
-}
-
-/// Request body for mistral.rs model lifecycle endpoints.
-#[derive(Debug, Clone, Serialize)]
-pub struct ModelLifecycleRequest {
-    pub model_id: String,
-}