refactor: cortex talks to neurons instead of mistral.rs directly

Replace NodeConfig (static vram_mb, pinned) with NeuronEndpoint. Hardware discovery and model pinning now come from the neuron API and the models.toml catalogue, respectively.

- config.rs: nodes -> neurons, add models_config path
- catalogue.rs: ModelProfile with pinned_on, ModelCatalogue
- poller.rs: poll neuron GET /models (ModelInfo format)
- router.rs: resolve inference endpoint via neuron GET /models/{id}/endpoint
- evictor.rs: call neuron POST /models/unload
- node.rs: remove vram_mb, pinned fields (come from discovery/catalogue)
- All 22 gateway tests updated to mock neuron API
- Remove MistralModelsResponse, ModelLifecycleRequest (no longer needed)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 14:42:52 +03:00
parent 26e5e7ead8
commit e42e8ee81f
19 changed files with 385 additions and 437 deletions
--- a/crates/cortex-gateway/src/evictor.rs
+++ b/crates/cortex-gateway/src/evictor.rs
@@ -1,29 +1,19 @@
 //! Model eviction logic.
 //!
-//! The evictor runs as a background task. When the router determines that a
-//! model needs to be loaded on a node but VRAM is tight, it can request
-//! eviction via a channel. The evictor then:
-//!   1. Identifies the LRU model on that node (excluding pinned models)
-//!   2. Calls `POST /v1/models/unload` on the node
-//!   3. Increments the lifecycle cycle counter (for defrag tracking)
+//! The evictor identifies the LRU model on a node (excluding pinned models),
+//! calls neuron's `POST /models/unload` to free the model, and updates
+//! local state.

 use crate::state::CortexState;
-use cortex_core::node::{ModelLifecycleRequest, ModelStatus};
+use cortex_core::node::ModelStatus;
 use std::sync::Arc;
 use std::time::Duration;

-/// Runs forever. Currently a placeholder that periodically checks for
-/// eviction opportunities. In the future, this will be driven by a
-/// channel from the router when VRAM pressure is detected.
+/// Runs forever. Placeholder for future channel-driven eviction.
 pub async fn eviction_loop(fleet: Arc<CortexState>) {
-    // TODO: Replace this polling approach with a channel-driven design
-    // where the router sends eviction requests when it detects that a
-    // model load would exceed available VRAM.
    loop {
        tokio::time::sleep(Duration::from_secs(30)).await;
-        // Placeholder: the actual eviction logic is in `evict_lru_on_node`,
-        // called on demand by the router.
-        let _ = &fleet; // suppress unused warning
+        let _ = &fleet;
    }
 }

@@ -33,18 +23,19 @@ pub async fn evict_lru_on_node(
    fleet: &CortexState,
    node_name: &str,
 ) -> anyhow::Result<Option<String>> {
-    let (endpoint, candidate) = {
+    let (neuron_endpoint, candidate) = {
        let nodes = fleet.nodes.read().await;
        let Some(node) = nodes.get(node_name) else {
            anyhow::bail!("node '{node_name}' not found");
        };

-        // Find the loaded model with the oldest last_accessed, excluding pinned.
+        // Find the loaded model with the oldest last_accessed,
+        // excluding models pinned on this neuron (from catalogue).
        let candidate = node
            .models
            .values()
            .filter(|m| m.status == ModelStatus::Loaded)
-            .filter(|m| !node.pinned.contains(&m.id))
+            .filter(|m| !fleet.catalogue.is_pinned(&m.id, node_name))
            .min_by_key(|m| m.last_accessed)
            .map(|m| m.id.clone());

@@ -58,18 +49,16 @@ pub async fn evict_lru_on_node(

    tracing::info!(node = node_name, model = %model_id, "evicting model");

-    let url = format!("{endpoint}/v1/models/unload");
+    // Call neuron's unload endpoint.
+    let url = format!("{neuron_endpoint}/models/unload");
    let resp = fleet
        .http_client
        .post(&url)
-        .json(&ModelLifecycleRequest {
-            model_id: model_id.clone(),
-        })
+        .json(&serde_json::json!({ "model_id": model_id }))
        .send()
        .await?;

    if resp.status().is_success() {
-        // Update local state.
        let mut nodes = fleet.nodes.write().await;
        if let Some(node) = nodes.get_mut(node_name) {
            if let Some(entry) = node.models.get_mut(&model_id) {
@@ -77,14 +66,13 @@ pub async fn evict_lru_on_node(
            }
            node.lifecycle_cycles += 1;

-            // Check if we should flag for defrag.
            if fleet.eviction.defrag_after_cycles > 0
                && node.lifecycle_cycles >= fleet.eviction.defrag_after_cycles
            {
                tracing::warn!(
                    node = node_name,
                    cycles = node.lifecycle_cycles,
-                    "VRAM fragmentation threshold reached — consider restarting mistralrs"
+                    "VRAM fragmentation threshold reached — consider restarting harness"
                );
            }
        }