feat(cortex): unified /v1/models — catalogue × topology feasibility + cold-load

Realises [project-unified-models-endpoint]: cortex now surfaces every model the operator has provisioned in the catalogue, transparently cold-loads on the first request, and routes the request once the load is done — without per-node configuration or client awareness of which neuron hosts what. cortex-core changes: - NodeState gains `discovery: Option<DiscoveryResponse>` — populated once per neuron on first successful poll, cached forever after (topology is invariant for a neuron process). - ModelProfile gains `is_feasible_on(neuron, devices)` with the pinned_on / min_devices / min_device_vram_mb logic + 5 unit tests. - CortexModelEntry expanded with OpenAI-compatible (`id`, `object`, `created`, `owned_by`) plus helexa-specific extension fields (`loaded`, `feasible_on`, `locations`). cortex-gateway changes: - poller.rs: `maybe_poll_discovery` fetches `GET /discovery` once per neuron and caches on NodeState. - handlers.rs::list_models rewritten as union of (catalogue × topology feasibility) + (currently loaded somewhere). Catalogue-defined models surface even when not yet loaded. - router.rs::resolve gains priority 3 (catalogue cold-load): 1. loaded somewhere → route there 2. unloaded somewhere → route + lazy load via neuron 3. in catalogue → pick feasible neuron, POST /models/load, wait, route. Cache the new entry locally so subsequent requests skip the poll wait. 4. else 404 - pick_feasible_neuron prefers pinned_on neurons, falls back to any feasible one (stable by name). - profile_to_spec translates ModelProfile → ModelSpec, picking devices by VRAM floor and setting tensor_parallel = min_devices for multi-device profiles. - "already loaded" responses from neuron are tolerated (two concurrent requests racing the same cold-load is a benign outcome).
models.example.toml rewritten to reflect the canonical helexa fleet (beast = 2x RTX 5090, benjy = RTX 4090, quadbrat = RTX 3060) with a working TP example (Qwen3.6-27B pinned on beast) plus single-GPU profiles for the smaller models. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 07:39:04 +03:00
parent f72dee094f
commit 735945ee81
7 changed files with 528 additions and 54 deletions
--- a/crates/cortex-core/src/catalogue.rs
+++ b/crates/cortex-core/src/catalogue.rs
@@ -1,5 +1,6 @@
 //! Model catalogue — profiles describing how to serve each model.

+use crate::discovery::DeviceInfo;
 use serde::{Deserialize, Serialize};
 use std::path::Path;

@@ -64,4 +65,103 @@ impl ModelCatalogue {
            .iter()
            .any(|p| p.id == model_id && p.pinned_on.contains(&neuron_name.to_string()))
    }
+
+    /// Returns the first catalogue profile whose `id` equals `model_id`.
+    pub fn get(&self, model_id: &str) -> Option<&ModelProfile> {
+        self.models.iter().find(|profile| profile.id == model_id)
+    }
+}
+
+impl ModelProfile {
+    /// Reports whether the neuron `neuron_name`, exposing `devices`, can
+    /// satisfy this profile's placement constraints.
+    ///
+    /// Both conditions must hold:
+    /// - `pinned_on` is empty, or it names `neuron_name`.
+    /// - At least `min_devices` devices qualify. With `min_device_vram_mb`
+    ///   set, a device qualifies when its `vram_total_mb` meets that
+    ///   floor; without it, every device qualifies.
+    pub fn is_feasible_on(&self, neuron_name: &str, devices: &[DeviceInfo]) -> bool {
+        if !self.pinned_on.is_empty() && !self.pinned_on.iter().any(|n| n == neuron_name) {
+            return false;
+        }
+        // Compare in `usize` rather than narrowing `devices.len()` to
+        // `u32`, which could wrap. A requirement that does not fit in
+        // `usize` saturates to `usize::MAX`, so it is effectively unmeetable.
+        let required = usize::try_from(self.min_devices).unwrap_or(usize::MAX);
+        let qualifying = match self.min_device_vram_mb {
+            Some(floor) => devices
+                .iter()
+                .filter(|d| d.vram_total_mb >= floor)
+                .count(),
+            // No VRAM floor: the device count alone decides.
+            None => devices.len(),
+        };
+        qualifying >= required
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::discovery::DeviceInfo;
+
+    fn device(index: u32, vram_total_mb: u64) -> DeviceInfo {
+        DeviceInfo {
+            index,
+            name: format!("DEV-{index}"),
+            vram_total_mb,
+            compute_capability: "8.6".into(),
+        }
+    }
+
+    fn profile() -> ModelProfile {
+        ModelProfile {
+            id: "Qwen/Qwen3.6-27B".into(),
+            harness: "candle".into(),
+            quant: None,
+            vram_mb: Some(45_000),
+            min_devices: 2,
+            min_device_vram_mb: Some(24_000),
+            pinned_on: Vec::new(),
+        }
+    }
+
+    #[test]
+    fn feasible_when_two_devices_meet_vram_floor() {
+        // Both devices clear the 24_000 MB floor and the count of 2.
+        let gpus = [device(0, 32_000), device(1, 32_000)];
+        assert!(profile().is_feasible_on("beast", &gpus));
+    }
+
+    #[test]
+    fn infeasible_when_only_one_device() {
+        // Ample VRAM cannot make up for a missing second device.
+        let gpus = [device(0, 64_000)];
+        assert!(!profile().is_feasible_on("benjy", &gpus));
+    }
+
+    #[test]
+    fn infeasible_when_one_device_underspec() {
+        // Two devices, but only one reaches the 24_000 MB floor.
+        let gpus = [device(0, 32_000), device(1, 12_000)];
+        assert!(!profile().is_feasible_on("mixed", &gpus));
+    }
+
+    #[test]
+    fn pinned_on_excludes_other_neurons() {
+        let mut pinned = profile();
+        pinned.pinned_on = vec!["beast".into()];
+        let gpus = [device(0, 32_000), device(1, 32_000)];
+        assert!(pinned.is_feasible_on("beast", &gpus));
+        assert!(!pinned.is_feasible_on("benjy", &gpus));
+    }
+
+    #[test]
+    fn no_vram_floor_just_needs_min_devices() {
+        let mut unfloored = profile();
+        unfloored.min_device_vram_mb = None;
+        let gpus = [device(0, 1_000), device(1, 1_000)];
+        assert!(unfloored.is_feasible_on("anywhere", &gpus));
+    }
 }
--- a/crates/cortex-core/src/node.rs
+++ b/crates/cortex-core/src/node.rs
@@ -1,3 +1,4 @@
+use crate::discovery::DiscoveryResponse;
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
@@ -13,6 +14,12 @@ pub struct NodeState {
    /// Number of load/unload cycles since last process restart.
    pub lifecycle_cycles: u32,
    pub last_poll: Option<DateTime<Utc>>,
+    /// Topology reported by this neuron's `GET /discovery` endpoint.
+    /// `None` until the first successful fetch; after that the value is
+    /// cached and never refreshed, because device topology is invariant
+    /// for a given neuron process. The router and `/v1/models` read it
+    /// for catalogue × topology feasibility checks.
+    pub discovery: Option<DiscoveryResponse>,
 }

 /// A model registered on a node, with its runtime status.
@@ -36,12 +43,32 @@ pub enum ModelStatus {
 }

 /// Unified model entry as exposed by the gateway's `/v1/models` endpoint.
-/// Includes which node(s) host this model and their status.
+///
+/// The first four fields (`id`, `object`, `created`, `owned_by`) match
+/// OpenAI's `/v1/models` shape verbatim, so existing OpenAI-aware
+/// tooling deserialises this without custom code. The remaining fields
+/// are helexa-specific extensions — OpenAI clients ignore unknown
+/// fields and other consumers can read them for placement / debugging.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct CortexModelEntry {
    pub id: String,
+    /// Always `"model"` per OpenAI's contract.
    pub object: String,
-    /// Which nodes have this model (and their status).
+    /// Unix timestamp in seconds; cortex stamps this at response time.
+    pub created: u64,
+    /// OpenAI's "publisher" field — `"helexa"` for everything we serve.
+    pub owned_by: String,
+    /// True if any neuron currently has this model loaded. False for
+    /// catalogue entries that are feasible but not yet loaded.
+    pub loaded: bool,
+    /// Neurons whose discovered topology can satisfy this model's
+    /// catalogue placement constraints. Empty for models that are
+    /// loaded somewhere but not present in the catalogue (cortex has
+    /// no feasibility opinion on those).
+    pub feasible_on: Vec<String>,
+    /// Where this model is actually loaded right now. For a catalogue
+    /// model these neurons are expected to be listed in `feasible_on`
+    /// (TODO confirm); for an uncatalogued model `feasible_on` is empty.
    pub locations: Vec<ModelLocation>,
 }