Files
cortex/crates/cortex-gateway/src/evictor.rs
rob thijssen e42e8ee81f
All checks were successful
CI / Format, lint, build, test (push) Successful in 2m46s
CI / Build SRPM (push) Has been skipped
CI / Publish to COPR (push) Has been skipped
refactor: cortex talks to neurons instead of mistral.rs directly
Replace NodeConfig (static vram_mb, pinned) with NeuronEndpoint.
Hardware discovery and model pinning now come from neuron API and
models.toml catalogue respectively.

- config.rs: nodes -> neurons, add models_config path
- catalogue.rs: ModelProfile with pinned_on, ModelCatalogue
- poller.rs: poll neuron GET /models (ModelInfo format)
- router.rs: resolve inference endpoint via neuron GET /models/{id}/endpoint
- evictor.rs: call neuron POST /models/unload
- node.rs: remove vram_mb, pinned fields (come from discovery/catalogue)
- All 22 gateway tests updated to mock neuron API
- Remove MistralModelsResponse, ModelLifecycleRequest (no longer needed)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 14:42:52 +03:00

95 lines
3.1 KiB
Rust

//! Model eviction logic.
//!
//! The evictor identifies the LRU model on a node (excluding pinned models),
//! calls neuron's `POST /models/unload` to free the model, and updates
//! local state.
use crate::state::CortexState;
use cortex_core::node::ModelStatus;
use std::sync::Arc;
use std::time::Duration;
/// Background eviction task; never returns.
///
/// This is currently a placeholder for future channel-driven eviction: it
/// wakes up every 30 seconds and performs no work.
pub async fn eviction_loop(fleet: Arc<CortexState>) {
    let idle_period = Duration::from_secs(30);
    loop {
        tokio::time::sleep(idle_period).await;
        // No-op borrow so `fleet` is referenced until real eviction logic lands.
        let _ = &fleet;
    }
}
/// Evict the least-recently-used model on a given node.
/// Returns the model ID that was evicted, or None if nothing could be evicted.
pub async fn evict_lru_on_node(
fleet: &CortexState,
node_name: &str,
) -> anyhow::Result<Option<String>> {
let (neuron_endpoint, candidate) = {
let nodes = fleet.nodes.read().await;
let Some(node) = nodes.get(node_name) else {
anyhow::bail!("node '{node_name}' not found");
};
// Find the loaded model with the oldest last_accessed,
// excluding models pinned on this neuron (from catalogue).
let candidate = node
.models
.values()
.filter(|m| m.status == ModelStatus::Loaded)
.filter(|m| !fleet.catalogue.is_pinned(&m.id, node_name))
.min_by_key(|m| m.last_accessed)
.map(|m| m.id.clone());
(node.endpoint.clone(), candidate)
};
let Some(model_id) = candidate else {
tracing::info!(node = node_name, "no evictable models found");
return Ok(None);
};
tracing::info!(node = node_name, model = %model_id, "evicting model");
// Call neuron's unload endpoint.
let url = format!("{neuron_endpoint}/models/unload");
let resp = fleet
.http_client
.post(&url)
.json(&serde_json::json!({ "model_id": model_id }))
.send()
.await?;
if resp.status().is_success() {
let mut nodes = fleet.nodes.write().await;
if let Some(node) = nodes.get_mut(node_name) {
if let Some(entry) = node.models.get_mut(&model_id) {
entry.status = ModelStatus::Unloaded;
}
node.lifecycle_cycles += 1;
if fleet.eviction.defrag_after_cycles > 0
&& node.lifecycle_cycles >= fleet.eviction.defrag_after_cycles
{
tracing::warn!(
node = node_name,
cycles = node.lifecycle_cycles,
"VRAM fragmentation threshold reached — consider restarting harness"
);
}
}
tracing::info!(node = node_name, model = %model_id, "model evicted");
Ok(Some(model_id))
} else {
let status = resp.status();
let body = resp.text().await.unwrap_or_default();
tracing::error!(
node = node_name,
model = %model_id,
status = %status,
body = %body,
"failed to evict model"
);
anyhow::bail!("eviction failed: {status} {body}");
}
}