From 735945ee810d3ef2f205059e533e1280c5577a33 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Wed, 20 May 2026 07:39:04 +0300 Subject: [PATCH] =?UTF-8?q?feat(cortex):=20unified=20/v1/models=20?= =?UTF-8?q?=E2=80=94=20catalogue=20=C3=97=20topology=20feasibility=20+=20c?= =?UTF-8?q?old-load?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Realises [project-unified-models-endpoint]: cortex now surfaces every model the operator has provisioned in the catalogue, transparently cold-loads on the first request, and routes the request once the load is done — without per-node configuration or client awareness of which neuron hosts what. cortex-core changes: - NodeState gains `discovery: Option` — populated once per neuron on first successful poll, cached forever after (topology is invariant for a neuron process). - ModelProfile gains `is_feasible_on(neuron, devices)` with the pinned_on / min_devices / min_device_vram_mb logic + 5 unit tests. - CortexModelEntry expanded with OpenAI-compatible (`id`, `object`, `created`, `owned_by`) plus helexa-specific extension fields (`loaded`, `feasible_on`, `locations`). cortex-gateway changes: - poller.rs: `maybe_poll_discovery` fetches `GET /discovery` once per neuron and caches on NodeState. - handlers.rs::list_models rewritten as union of (catalogue × topology feasibility) + (currently loaded somewhere). Catalogue-defined models surface even when not yet loaded. - router.rs::resolve gains priority 3 (catalogue cold-load): 1. loaded somewhere → route there 2. unloaded somewhere → route + lazy load via neuron 3. in catalogue → pick feasible neuron, POST /models/load, wait, route. Cache the new entry locally so subsequent requests skip the poll wait. 4. else 404 - pick_feasible_neuron prefers pinned_on neurons, falls back to any feasible one (stable by name). - profile_to_spec translates ModelProfile → ModelSpec, picking devices by VRAM floor and setting tensor_parallel = min_devices for multi- device profiles. - "already loaded" responses from neuron are tolerated (two concurrent requests racing the same cold-load is a benign outcome). models.example.toml rewritten to reflect the canonical helexa fleet (beast = 2x RTX 5090, benjy = RTX 4090, quadbrat = RTX 3060) with a working TP example (Qwen3.6-27B pinned on beast) plus single-GPU profiles for the smaller models. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/cortex-core/src/catalogue.rs | 100 ++++++++++ crates/cortex-core/src/node.rs | 31 ++- crates/cortex-gateway/src/handlers.rs | 75 +++++++- crates/cortex-gateway/src/poller.rs | 53 ++++++ crates/cortex-gateway/src/router.rs | 260 +++++++++++++++++++++++--- crates/cortex-gateway/src/state.rs | 1 + models.example.toml | 62 ++++-- 7 files changed, 528 insertions(+), 54 deletions(-) diff --git a/crates/cortex-core/src/catalogue.rs b/crates/cortex-core/src/catalogue.rs index daefc85..1656e4f 100644 --- a/crates/cortex-core/src/catalogue.rs +++ b/crates/cortex-core/src/catalogue.rs @@ -1,5 +1,6 @@ //! Model catalogue — profiles describing how to serve each model. +use crate::discovery::DeviceInfo; use serde::{Deserialize, Serialize}; use std::path::Path; @@ -64,4 +65,103 @@ impl ModelCatalogue { .iter() .any(|p| p.id == model_id && p.pinned_on.contains(&neuron_name.to_string())) } + + /// Find a profile by model id. + pub fn get(&self, model_id: &str) -> Option<&ModelProfile> { + self.models.iter().find(|p| p.id == model_id) + } +} + +impl ModelProfile { + /// True iff this profile's placement constraints can be satisfied + /// by the named neuron with the given device topology. + /// + /// Constraints checked: + /// - `pinned_on`: non-empty → neuron must be on the list. + /// - `min_devices`: neuron must have at least this many devices. + /// - `min_device_vram_mb`: at least `min_devices` of the neuron's + /// devices must each meet this VRAM floor. + pub fn is_feasible_on(&self, neuron_name: &str, devices: &[DeviceInfo]) -> bool { + if !self.pinned_on.is_empty() && !self.pinned_on.iter().any(|n| n == neuron_name) { + return false; + } + if (devices.len() as u32) < self.min_devices { + return false; + } + if let Some(min_vram) = self.min_device_vram_mb { + let big_enough = devices + .iter() + .filter(|d| d.vram_total_mb >= min_vram) + .count() as u32; + if big_enough < self.min_devices { + return false; + } + } + true + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::discovery::DeviceInfo; + + fn device(idx: u32, vram_mb: u64) -> DeviceInfo { + DeviceInfo { + index: idx, + name: format!("DEV-{idx}"), + vram_total_mb: vram_mb, + compute_capability: "8.6".into(), + } + } + + fn profile() -> ModelProfile { + ModelProfile { + id: "Qwen/Qwen3.6-27B".into(), + harness: "candle".into(), + quant: None, + vram_mb: Some(45_000), + min_devices: 2, + min_device_vram_mb: Some(24_000), + pinned_on: vec![], + } + } + + #[test] + fn feasible_when_two_devices_meet_vram_floor() { + let p = profile(); + let devices = [device(0, 32_000), device(1, 32_000)]; + assert!(p.is_feasible_on("beast", &devices)); + } + + #[test] + fn infeasible_when_only_one_device() { + let p = profile(); + let devices = [device(0, 64_000)]; + assert!(!p.is_feasible_on("benjy", &devices)); + } + + #[test] + fn infeasible_when_one_device_underspec() { + let p = profile(); + let devices = [device(0, 32_000), device(1, 12_000)]; + assert!(!p.is_feasible_on("mixed", &devices)); + } + + #[test] + fn pinned_on_excludes_other_neurons() { + let mut p = profile(); + p.pinned_on = vec!["beast".into()]; + let devices = [device(0, 32_000), device(1, 32_000)]; + assert!(p.is_feasible_on("beast", &devices)); + assert!(!p.is_feasible_on("benjy", &devices)); + } + + #[test] + fn no_vram_floor_just_needs_min_devices() { + let mut p = profile(); + p.min_device_vram_mb = None; + let devices = [device(0, 1_000), device(1, 1_000)]; + assert!(p.is_feasible_on("anywhere", &devices)); + } } diff --git a/crates/cortex-core/src/node.rs b/crates/cortex-core/src/node.rs index 860926a..e67ab89 100644 --- a/crates/cortex-core/src/node.rs +++ b/crates/cortex-core/src/node.rs @@ -1,3 +1,4 @@ +use crate::discovery::DiscoveryResponse; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; @@ -13,6 +14,12 @@ pub struct NodeState { /// Number of load/unload cycles since last process restart. pub lifecycle_cycles: u32, pub last_poll: Option>, + /// Result of the most recent successful `GET /discovery` against + /// this neuron. Cached forever once obtained — device topology is + /// invariant for a given neuron process. `None` until the first + /// successful poll. Used by the router and `/v1/models` to do + /// catalogue × topology feasibility checks. + pub discovery: Option, } /// A model registered on a node, with its runtime status. @@ -36,12 +43,32 @@ pub enum ModelStatus { } /// Unified model entry as exposed by the gateway's `/v1/models` endpoint. -/// Includes which node(s) host this model and their status. +/// +/// The first four fields (`id`, `object`, `created`, `owned_by`) match +/// OpenAI's `/v1/models` shape verbatim, so existing OpenAI-aware +/// tooling deserialises this without custom code. The remaining fields +/// are helexa-specific extensions — OpenAI clients ignore unknown +/// fields and other consumers can read them for placement / debugging. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CortexModelEntry { pub id: String, + /// Always `"model"` per OpenAI's contract. pub object: String, - /// Which nodes have this model (and their status). + /// Unix-second timestamp; cortex stamps this at response time. + pub created: u64, + /// OpenAI's "publisher" field — `"helexa"` for everything we serve. + pub owned_by: String, + /// True if any neuron currently has this model loaded. False for + /// catalogue entries that are feasible but not yet loaded. + pub loaded: bool, + /// Neurons whose discovered topology can satisfy this model's + /// catalogue placement constraints. Empty for models that are + /// loaded somewhere but not present in the catalogue (cortex has + /// no feasibility opinion on those). + pub feasible_on: Vec, + /// Where this model is actually loaded right now. Subset of (or + /// disjoint from) `feasible_on` depending on whether the catalogue + /// covers this model. pub locations: Vec, } diff --git a/crates/cortex-gateway/src/handlers.rs b/crates/cortex-gateway/src/handlers.rs index 294dd3d..af61a49 100644 --- a/crates/cortex-gateway/src/handlers.rs +++ b/crates/cortex-gateway/src/handlers.rs @@ -185,12 +185,62 @@ async fn anthropic_messages( } } -/// `GET /v1/models` — aggregate models from all nodes. +/// `GET /v1/models` — union of (catalogue × topology feasibility) and +/// (currently loaded somewhere). The result is what the fleet *could* +/// serve, not just what's already loaded — so OpenAI-compatible tools +/// see every model the operator has provisioned, and cortex +/// transparently cold-loads the first time one is requested. async fn list_models(State(fleet): State>) -> Json { + use std::collections::HashMap; + let now = Utc::now().timestamp() as u64; let nodes = fleet.nodes.read().await; - let mut model_map: std::collections::HashMap = - std::collections::HashMap::new(); + let catalogue = &fleet.catalogue; + let mut entries: HashMap = HashMap::new(); + + // Pass 1: catalogue × topology. For every catalogue profile, find + // healthy neurons whose discovered devices satisfy the profile. + // Catalogue-defined models surface here even if nothing has loaded + // them yet — that's the point of the unified endpoint. + for profile in &catalogue.models { + let mut feasible_on = Vec::new(); + for node in nodes.values() { + if !node.healthy { + continue; + } + let Some(disc) = node.discovery.as_ref() else { + continue; + }; + if profile.is_feasible_on(&node.name, &disc.devices) { + feasible_on.push(node.name.clone()); + } + } + if feasible_on.is_empty() { + // The catalogue lists this model but no neuron's topology + // matches — surface it as not-loaded with no feasible + // location. Hides nothing; lets operators see why a + // configured model isn't reachable. + feasible_on.clear(); + } + entries.insert( + profile.id.clone(), + CortexModelEntry { + id: profile.id.clone(), + object: "model".into(), + created: now, + owned_by: "helexa".into(), + loaded: false, + feasible_on, + locations: Vec::new(), + }, + ); + } + + // Pass 2: layer the actually-loaded state on top. For each + // (node, model) entry, attach a ModelLocation. If the model isn't + // in the catalogue, create a new CortexModelEntry from scratch — + // cortex doesn't refuse to surface a manually-loaded model just + // because the operator didn't enumerate it in models.toml. for node in nodes.values() { for (model_id, entry) in &node.models { let location = ModelLocation { @@ -198,19 +248,30 @@ async fn list_models(State(fleet): State>) -> Json { status: entry.status, vram_estimate_mb: entry.vram_estimate_mb, }; - model_map + let was_loaded = matches!(entry.status, cortex_core::node::ModelStatus::Loaded); + entries .entry(model_id.clone()) - .and_modify(|e| e.locations.push(location.clone())) + .and_modify(|e| { + e.locations.push(location.clone()); + if was_loaded { + e.loaded = true; + } + }) .or_insert_with(|| CortexModelEntry { id: model_id.clone(), object: "model".into(), + created: now, + owned_by: "helexa".into(), + loaded: was_loaded, + // Not in catalogue — cortex has no opinion on + // feasibility; leave empty. + feasible_on: Vec::new(), locations: vec![location], }); } } - let data: Vec = model_map.values().map(|e| json!(e)).collect(); - + let data: Vec = entries.values().map(|e| json!(e)).collect(); Json(json!({ "object": "list", "data": data, diff --git a/crates/cortex-gateway/src/poller.rs b/crates/cortex-gateway/src/poller.rs index 28340fb..2dbb308 100644 --- a/crates/cortex-gateway/src/poller.rs +++ b/crates/cortex-gateway/src/poller.rs @@ -3,6 +3,7 @@ use crate::state::CortexState; use chrono::Utc; +use cortex_core::discovery::DiscoveryResponse; use cortex_core::harness::ModelInfo; use cortex_core::node::{ModelEntry, ModelStatus}; use std::sync::Arc; @@ -25,7 +26,59 @@ pub async fn poll_once(fleet: &CortexState) { } } +/// One-shot fetch of `GET /discovery`. Cached on the NodeState forever +/// after the first success — topology is invariant for a given neuron +/// process. Skipped when the cache is already populated. +async fn maybe_poll_discovery(fleet: &CortexState, name: &str, endpoint: &str) { + { + let nodes = fleet.nodes.read().await; + match nodes.get(name) { + Some(n) if n.discovery.is_some() => return, + _ => {} + } + } + let url = format!("{endpoint}/discovery"); + let resp = match fleet + .http_client + .get(&url) + .timeout(Duration::from_secs(5)) + .send() + .await + { + Ok(r) if r.status().is_success() => r, + Ok(r) => { + tracing::debug!(node = name, status = %r.status(), "discovery probe non-success"); + return; + } + Err(e) => { + tracing::debug!(node = name, error = %e, "discovery probe unreachable"); + return; + } + }; + match resp.json::().await { + Ok(d) => { + let mut nodes = fleet.nodes.write().await; + if let Some(node) = nodes.get_mut(name) { + tracing::info!( + node = name, + hostname = %d.hostname, + devices = d.devices.len(), + "discovery cached" + ); + node.discovery = Some(d); + } + } + Err(e) => { + tracing::warn!(node = name, error = %e, "failed to parse /discovery response"); + } + } +} + async fn poll_neuron(fleet: &CortexState, name: &str, endpoint: &str) { + // Topology first — cheap once cached, and the router needs it to + // route requests against catalogue entries that aren't loaded yet. + maybe_poll_discovery(fleet, name, endpoint).await; + let url = format!("{endpoint}/models"); let result = fleet diff --git a/crates/cortex-gateway/src/router.rs b/crates/cortex-gateway/src/router.rs index 7962871..4b90a7b 100644 --- a/crates/cortex-gateway/src/router.rs +++ b/crates/cortex-gateway/src/router.rs @@ -2,13 +2,21 @@ //! //! Given a model ID from an inbound request, determine which node should //! handle it. Priority: -//! 1. Node where the model is currently `Loaded` -//! 2. Node where the model is `Unloaded` (will lazy-load on request) -//! 3. Error: model not found on any node +//! 1. Node where the model is currently `Loaded` → use it. +//! 2. Node where the model is `Unloaded` → use it; neuron's existing +//! lazy-load behaviour will reload before serving the request. +//! 3. Model is in the catalogue → pick a feasible neuron, call +//! `POST /models/load`, wait for the load to complete, then +//! proxy. First-request cold-load latency is acceptable per the +//! unified-endpoint contract. +//! 4. Not in catalogue, not loaded anywhere → 404. use crate::state::CortexState; +use cortex_core::catalogue::ModelProfile; +use cortex_core::harness::ModelSpec; use cortex_core::node::ModelStatus; use std::sync::Arc; +use std::time::Duration; /// The routing decision: which node endpoint to proxy the request to. #[derive(Debug, Clone)] @@ -16,18 +24,31 @@ pub struct RouteDecision { pub node_name: String, /// The inference endpoint to proxy to (from neuron's /models/{id}/endpoint). pub endpoint: String, - /// Whether the model will need to load (cold start). + /// Whether the model will need to load (cold start). Set to true + /// when we proxied to an `Unloaded` node (lazy load on neuron) or + /// when we just triggered an explicit cold-load via the catalogue + /// path. pub cold_start: bool, } #[derive(Debug, thiserror::Error)] pub enum RouteError { - #[error("model '{0}' not found on any node")] + #[error("model '{0}' not found on any node and not in catalogue")] ModelNotFound(String), #[error("no healthy nodes available")] NoHealthyNodes, #[error("failed to resolve inference endpoint for model '{0}' on node '{1}'")] EndpointResolveFailed(String, String), + #[error( + "model '{model_id}' is in the catalogue but no healthy neuron's topology satisfies its constraints" + )] + NoFeasibleNeuron { model_id: String }, + #[error("cold-load of '{model_id}' on '{node}' failed: {message}")] + ColdLoadFailed { + model_id: String, + node: String, + message: String, + }, } /// Resolve which node should serve a request for the given model. @@ -36,42 +57,231 @@ pub async fn resolve( fleet: &Arc, model_id: &str, ) -> Result { - let (node_name, neuron_endpoint, cold_start) = { + // Snapshot loaded / unloaded state from the poller cache. + let (loaded_route, unloaded_route, any_healthy) = { let nodes = fleet.nodes.read().await; - - let mut loaded_candidate = None; - let mut unloaded_candidate = None; - + let mut loaded_route = None; + let mut unloaded_route = None; + let mut any_healthy = false; for node in nodes.values() { if !node.healthy { continue; } + any_healthy = true; if let Some(entry) = node.models.get(model_id) { match entry.status { ModelStatus::Loaded | ModelStatus::Reloading => { - loaded_candidate = Some((node.name.clone(), node.endpoint.clone(), false)); + loaded_route = Some((node.name.clone(), node.endpoint.clone(), false)); break; } ModelStatus::Unloaded => { - if unloaded_candidate.is_none() { - unloaded_candidate = - Some((node.name.clone(), node.endpoint.clone(), true)); + if unloaded_route.is_none() { + unloaded_route = Some((node.name.clone(), node.endpoint.clone(), true)); } } } } } - - loaded_candidate.or(unloaded_candidate).ok_or_else(|| { - if nodes.values().any(|n| n.healthy) { - RouteError::ModelNotFound(model_id.to_string()) - } else { - RouteError::NoHealthyNodes - } - })? + (loaded_route, unloaded_route, any_healthy) }; - // Ask the neuron for the inference endpoint for this model. + if !any_healthy { + return Err(RouteError::NoHealthyNodes); + } + + // Priority 1: already loaded. + if let Some((node_name, neuron_endpoint, cold_start)) = loaded_route { + return finish(fleet, &node_name, &neuron_endpoint, model_id, cold_start).await; + } + + // Priority 2: known to neuron but unloaded (neuron's lazy load). + if let Some((node_name, neuron_endpoint, cold_start)) = unloaded_route { + return finish(fleet, &node_name, &neuron_endpoint, model_id, cold_start).await; + } + + // Priority 3: catalogue × topology cold-load. + if let Some(profile) = fleet.catalogue.get(model_id) { + let (node_name, neuron_endpoint) = pick_feasible_neuron(fleet, profile).await?; + cold_load(fleet, &node_name, &neuron_endpoint, profile).await?; + return finish(fleet, &node_name, &neuron_endpoint, model_id, true).await; + } + + Err(RouteError::ModelNotFound(model_id.to_string())) +} + +/// Pick a healthy neuron whose discovered topology satisfies the +/// profile. Preference order: +/// 1. A neuron from `profile.pinned_on` that is healthy + feasible. +/// 2. Otherwise, any healthy + feasible neuron, stable by name. +async fn pick_feasible_neuron( + fleet: &Arc, + profile: &ModelProfile, +) -> Result<(String, String), RouteError> { + let nodes = fleet.nodes.read().await; + let mut candidates: Vec<(String, String, bool)> = Vec::new(); + for node in nodes.values() { + if !node.healthy { + continue; + } + let Some(disc) = node.discovery.as_ref() else { + continue; + }; + if !profile.is_feasible_on(&node.name, &disc.devices) { + continue; + } + let pinned = profile.pinned_on.iter().any(|n| n == &node.name); + candidates.push((node.name.clone(), node.endpoint.clone(), pinned)); + } + candidates.sort_by(|a, b| { + b.2.cmp(&a.2) // pinned first (true > false) + .then(a.0.cmp(&b.0)) + }); + let pick = candidates.into_iter().next(); + pick.map(|(n, e, _)| (n, e)) + .ok_or_else(|| RouteError::NoFeasibleNeuron { + model_id: profile.id.clone(), + }) +} + +/// Issue `POST {endpoint}/models/load` for this profile on this neuron, +/// blocking until the load completes (neuron's load endpoint is +/// synchronous — it returns 200 once VRAM is materialised). On success +/// also inserts a `Loaded` entry into the local NodeState cache so the +/// caller's subsequent endpoint lookup sees the new model without +/// waiting for the next poll cycle. +async fn cold_load( + fleet: &Arc, + node_name: &str, + neuron_endpoint: &str, + profile: &ModelProfile, +) -> Result<(), RouteError> { + let spec = profile_to_spec(fleet, node_name, profile).await; + let url = format!("{neuron_endpoint}/models/load"); + tracing::info!(model = %profile.id, node = node_name, "cold-loading via /models/load"); + + // Generous timeout: a fresh download + safetensors mmap + device + // copy for a 30B-class dense model can comfortably exceed 5 min on + // a slow link. The HTTP client's own default already covers most + // of this; pin a longer per-request bound just here. + let resp = match fleet + .http_client + .post(&url) + .timeout(Duration::from_secs(1800)) + .json(&spec) + .send() + .await + { + Ok(r) => r, + Err(e) => { + return Err(RouteError::ColdLoadFailed { + model_id: profile.id.clone(), + node: node_name.to_string(), + message: format!("HTTP request failed: {e}"), + }); + } + }; + + let status = resp.status(); + if !status.is_success() { + let body = resp.text().await.unwrap_or_default(); + // Neuron returns 400 "already loaded" when two concurrent + // requests race the same model. Treat that as success — both + // requests effectively achieved the same end state. + if body.contains("already loaded") { + tracing::info!( + model = %profile.id, + node = node_name, + "cold-load saw 'already loaded' — treating as success" + ); + } else { + return Err(RouteError::ColdLoadFailed { + model_id: profile.id.clone(), + node: node_name.to_string(), + message: format!("HTTP {status}: {body}"), + }); + } + } else { + tracing::info!(model = %profile.id, node = node_name, "cold-load returned 200"); + } + + // Warm the cache: insert a Loaded ModelEntry so the next + // resolve() finds the model without waiting for the poll loop. + { + let mut nodes = fleet.nodes.write().await; + if let Some(node) = nodes.get_mut(node_name) { + node.models.insert( + profile.id.clone(), + cortex_core::node::ModelEntry { + id: profile.id.clone(), + status: ModelStatus::Loaded, + last_accessed: Some(chrono::Utc::now()), + vram_estimate_mb: profile.vram_mb, + }, + ); + } + } + Ok(()) +} + +/// Translate a `ModelProfile` to a `ModelSpec` neuron's /models/load +/// accepts. Devices are picked from the neuron's discovered topology — +/// the first `min_devices` indices that meet `min_device_vram_mb`. +async fn profile_to_spec( + fleet: &Arc, + node_name: &str, + profile: &ModelProfile, +) -> ModelSpec { + let devices = { + let nodes = fleet.nodes.read().await; + let mut picked: Vec = Vec::new(); + if let Some(node) = nodes.get(node_name) + && let Some(disc) = &node.discovery + { + let min_vram = profile.min_device_vram_mb.unwrap_or(0); + for d in &disc.devices { + if d.vram_total_mb >= min_vram { + picked.push(d.index); + if picked.len() as u32 >= profile.min_devices { + break; + } + } + } + } + if picked.is_empty() { + // Fall back to a 0..min_devices default; pick_feasible_neuron + // already verified the topology satisfies the constraints, + // so this only fires if discovery raced or was lost. + (0..profile.min_devices).collect() + } else { + picked + } + }; + + let tensor_parallel = if profile.min_devices > 1 { + Some(profile.min_devices) + } else { + None + }; + + ModelSpec { + model_id: profile.id.clone(), + harness: profile.harness.clone(), + quant: profile.quant.clone(), + tensor_parallel, + devices: Some(devices), + } +} + +/// Resolve neuron's `/models/{id}/endpoint` to its inference URL and +/// build the final `RouteDecision`. Shared by all three priority +/// branches above. +async fn finish( + fleet: &Arc, + node_name: &str, + neuron_endpoint: &str, + model_id: &str, + cold_start: bool, +) -> Result { let endpoint_url = format!( "{}/models/{}/endpoint", neuron_endpoint, @@ -90,11 +300,11 @@ pub async fn resolve( }; let endpoint = inference_endpoint.ok_or_else(|| { - RouteError::EndpointResolveFailed(model_id.to_string(), node_name.clone()) + RouteError::EndpointResolveFailed(model_id.to_string(), node_name.to_string()) })?; Ok(RouteDecision { - node_name, + node_name: node_name.to_string(), endpoint, cold_start, }) diff --git a/crates/cortex-gateway/src/state.rs b/crates/cortex-gateway/src/state.rs index b5bec20..6699889 100644 --- a/crates/cortex-gateway/src/state.rs +++ b/crates/cortex-gateway/src/state.rs @@ -26,6 +26,7 @@ impl CortexState { models: HashMap::new(), lifecycle_cycles: 0, last_poll: None, + discovery: None, }, ); } diff --git a/models.example.toml b/models.example.toml index 0f2c9c3..cd9e3d5 100644 --- a/models.example.toml +++ b/models.example.toml @@ -2,28 +2,50 @@ # # Copy to /etc/cortex/models.toml and adjust for your environment. # Describes how to serve each model. Cortex matches these profiles -# against discovered neuron topologies for placement decisions. +# against discovered neuron topologies for placement decisions; the +# resulting `(catalogue × topology)` set is what `GET /v1/models` +# returns and what the router can cold-load on demand. +# +# Field reference: +# id - HuggingFace model id, exact match. +# harness - which engine handles inference (currently "candle"). +# quant - GGUF quantisation tag for the file in the HF repo +# (e.g. "Q4_K_M"). Omit/empty for the dense +# safetensors path. TP requires dense. +# vram_mb - rough estimate; advisory only, not enforced. +# min_devices - GPU count this profile needs. TP profiles use +# the same value as the tensor-parallel size. +# min_device_vram_mb - each device must meet this VRAM floor for the +# neuron to be considered "feasible". +# pinned_on - optional whitelist of neuron names. Non-empty +# narrows feasibility to just those neurons and +# protects the model from LRU eviction there. +# +# The examples below match the canonical helexa fleet +# (beast = 2x RTX 5090, benjy = RTX 4090, quadbrat = RTX 3060). +# Tensor-parallel target — only beast has two big GPUs. [[models]] -id = "your-org/large-model" +id = "Qwen/Qwen3.6-27B" +harness = "candle" +vram_mb = 54000 +min_devices = 2 +min_device_vram_mb = 24000 +pinned_on = ["beast"] + +# Mid-size dense model — fits on benjy or beast. +[[models]] +id = "Qwen/Qwen3-8B" +harness = "candle" +vram_mb = 18000 +min_devices = 1 +min_device_vram_mb = 16000 + +# Small GGUF quantised — runs on the smallest neuron (quadbrat). +[[models]] +id = "unsloth/Qwen3-0.6B-GGUF" harness = "candle" quant = "Q4_K_M" -vram_mb = 19000 -min_devices = 2 -min_device_vram_mb = 10000 -pinned_on = ["gpu-large"] - -[[models]] -id = "your-org/medium-model" -harness = "candle" -quant = "Q6_K" -vram_mb = 12000 -min_devices = 1 -pinned_on = ["gpu-medium"] - -[[models]] -id = "your-org/embedding-model" -harness = "candle" -quant = "Q8_0" -vram_mb = 8000 +vram_mb = 500 min_devices = 1 +min_device_vram_mb = 4000