refactor: cortex talks to neurons instead of mistral.rs directly
Replace NodeConfig (static vram_mb, pinned) with NeuronEndpoint.
Hardware discovery and model pinning now come from the neuron API and
the models.toml catalogue, respectively.
- config.rs: nodes -> neurons, add models_config path
- catalogue.rs: ModelProfile with pinned_on, ModelCatalogue
- poller.rs: poll neuron GET /models (ModelInfo format)
- router.rs: resolve inference endpoint via neuron GET /models/{id}/endpoint
- evictor.rs: call neuron POST /models/unload
- node.rs: remove vram_mb, pinned fields (these now come from discovery/catalogue)
- All 22 gateway tests updated to mock neuron API
- Remove MistralModelsResponse, ModelLifecycleRequest (no longer needed)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,15 +1,16 @@
|
||||
//! Background poller that periodically queries each node's `/v1/models`
|
||||
//! endpoint to refresh the fleet state.
|
||||
//! Background poller that periodically queries each neuron's API
|
||||
//! to refresh the fleet state.
|
||||
|
||||
use crate::state::CortexState;
|
||||
use chrono::Utc;
|
||||
use cortex_core::node::{MistralModelsResponse, ModelEntry, ModelStatus};
|
||||
use cortex_core::harness::ModelInfo;
|
||||
use cortex_core::node::{ModelEntry, ModelStatus};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
const POLL_INTERVAL: Duration = Duration::from_secs(10);
|
||||
|
||||
/// Runs forever, polling all nodes on a fixed interval.
|
||||
/// Runs forever, polling all neurons on a fixed interval.
|
||||
pub async fn poll_loop(fleet: Arc<CortexState>) {
|
||||
loop {
|
||||
poll_once(&fleet).await;
|
||||
@@ -17,15 +18,15 @@ pub async fn poll_loop(fleet: Arc<CortexState>) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Poll all nodes once. Used by `poll_loop` and available for testing.
|
||||
/// Poll all neurons once. Used by `poll_loop` and available for testing.
|
||||
pub async fn poll_once(fleet: &CortexState) {
|
||||
for nc in &fleet.node_configs {
|
||||
poll_node(fleet, &nc.name, &nc.endpoint).await;
|
||||
for nc in &fleet.neuron_configs {
|
||||
poll_neuron(fleet, &nc.name, &nc.endpoint).await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn poll_node(fleet: &CortexState, name: &str, endpoint: &str) {
|
||||
let url = format!("{endpoint}/v1/models");
|
||||
async fn poll_neuron(fleet: &CortexState, name: &str, endpoint: &str) {
|
||||
let url = format!("{endpoint}/models");
|
||||
|
||||
let result = fleet
|
||||
.http_client
|
||||
@@ -41,38 +42,36 @@ async fn poll_node(fleet: &CortexState, name: &str, endpoint: &str) {
|
||||
|
||||
match result {
|
||||
Ok(resp) if resp.status().is_success() => {
|
||||
match resp.json::<MistralModelsResponse>().await {
|
||||
Ok(models_resp) => {
|
||||
// Merge upstream model list into our state, preserving
|
||||
// our local metadata (last_accessed, vram_estimate).
|
||||
match resp.json::<Vec<ModelInfo>>().await {
|
||||
Ok(models) => {
|
||||
let mut seen = std::collections::HashSet::new();
|
||||
for upstream in &models_resp.data {
|
||||
for upstream in &models {
|
||||
seen.insert(upstream.id.clone());
|
||||
let status = parse_status(upstream.status.as_deref());
|
||||
let status = parse_status(&upstream.status);
|
||||
|
||||
node.models
|
||||
.entry(upstream.id.clone())
|
||||
.and_modify(|e| {
|
||||
e.status = status;
|
||||
e.vram_estimate_mb = upstream.vram_used_mb;
|
||||
})
|
||||
.or_insert_with(|| ModelEntry {
|
||||
id: upstream.id.clone(),
|
||||
status,
|
||||
last_accessed: None,
|
||||
vram_estimate_mb: None,
|
||||
vram_estimate_mb: upstream.vram_used_mb,
|
||||
});
|
||||
}
|
||||
|
||||
// Remove models that are no longer reported by the node
|
||||
// (e.g. after a config change / restart).
|
||||
// Remove models no longer reported by the neuron.
|
||||
node.models.retain(|id, _| seen.contains(id));
|
||||
|
||||
node.healthy = true;
|
||||
node.last_poll = Some(Utc::now());
|
||||
tracing::debug!(node = name, models = models_resp.data.len(), "poll ok");
|
||||
tracing::debug!(node = name, models = models.len(), "poll ok");
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(node = name, error = %e, "failed to parse /v1/models response");
|
||||
tracing::warn!(node = name, error = %e, "failed to parse /models response");
|
||||
node.healthy = false;
|
||||
}
|
||||
}
|
||||
@@ -81,24 +80,22 @@ async fn poll_node(fleet: &CortexState, name: &str, endpoint: &str) {
|
||||
tracing::warn!(
|
||||
node = name,
|
||||
status = %resp.status(),
|
||||
"node returned non-success status"
|
||||
"neuron returned non-success status"
|
||||
);
|
||||
node.healthy = false;
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(node = name, error = %e, "failed to reach node");
|
||||
tracing::warn!(node = name, error = %e, "failed to reach neuron");
|
||||
node.healthy = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_status(s: Option<&str>) -> ModelStatus {
|
||||
fn parse_status(s: &str) -> ModelStatus {
|
||||
match s {
|
||||
Some("loaded") => ModelStatus::Loaded,
|
||||
Some("unloaded") => ModelStatus::Unloaded,
|
||||
Some("reloading") => ModelStatus::Reloading,
|
||||
// If the status field is absent, assume loaded (older mistral.rs versions
|
||||
// may not include it).
|
||||
"loaded" => ModelStatus::Loaded,
|
||||
"unloaded" => ModelStatus::Unloaded,
|
||||
"reloading" => ModelStatus::Reloading,
|
||||
_ => ModelStatus::Loaded,
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user