Files
cortex/crates/cortex-gateway/tests/aliases.rs
rob thijssen 24e20dcb5c
All checks were successful
build-prerelease / Resolve version stamps (push) Successful in 39s
CI / Format (push) Successful in 40s
CI / Clippy (push) Successful in 2m21s
CI / Test (push) Successful in 4m40s
build-prerelease / Build neuron-blackwell (push) Successful in 3m38s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m19s
build-prerelease / Package cortex RPM (push) Successful in 1m21s
build-prerelease / Build neuron-ampere (push) Successful in 5m20s
build-prerelease / Build neuron-ada (push) Successful in 4m45s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 2m59s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 3m10s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 9m40s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m3s
feat(catalogue,gateway): model aliases (helexa/small, helexa/balanced, helexa/large)
Operators can now define tier aliases in models.toml:

  [aliases]
  "helexa/small" = "Qwen/Qwen3-1.7B"
  "helexa/balanced" = "Qwen/Qwen3-8B"
  "helexa/large" = "Qwen/Qwen3.6-27B"

A client request for `model: "helexa/small"` is resolved to the concrete
model id at routing time. The gateway also rewrites the proxied body's
`model` field to the concrete id so neuron sees a name that matches its
loaded handle (otherwise the harness rejects the request).

Motivated by the finger-in-the-wind benchmark: same "what's the capital
of Georgia" probe runs in 2.5s on the 1.7B vs 6.7s on the 27B with
identical correctness. Aliases let clients pick a latency tier without
hardcoding model ids, and let operators swap targets without changing
client code.

Changes:
  * cortex-core: `ModelCatalogue` gains `aliases: HashMap<String, String>`
    + `resolve_alias(&str) -> &str`. Unit tests cover the basic
    resolution + TOML round-trip.
  * cortex-gateway:
    * `RouteDecision` gains `resolved_model_id: String`. `router::resolve`
      consumes aliases at entry and threads the concrete id through.
    * Handlers (chat_completions, completions, anthropic_messages
      streaming + non-streaming) rewrite the body's `model` field with
      `rewrite_model_in_body` before proxying, using the resolved id
      for metrics labels, LRU touch, and the body itself.
    * `/v1/models` (Pass 4) emits each alias as its own entry mirroring
      the target's `loaded` flag, feasible_on, and locations — clients
      browsing the endpoint see both names and can pick either.
  * `models.toml` declares the three tier aliases; `models.example.toml`
    documents the section as opt-in.
  * Integration tests verify: end-to-end alias→concrete request flow,
    alias surfacing in /v1/models, and no-op fall-through for
    non-alias model ids.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 16:10:41 +03:00

266 lines
9.0 KiB
Rust

//! Alias resolution: a client request with `model: "helexa/small"`
//! routes to the concrete model id (e.g. `Qwen/Qwen3-1.7B`), with the
//! proxied request body rewritten so the upstream neuron sees a model
//! name that matches its loaded handle.
mod common;
use cortex_core::config::{
EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
};
use cortex_core::node::{ModelEntry, ModelStatus};
use cortex_gateway::state::CortexState;
use serde_json::json;
use std::path::PathBuf;
use std::sync::Arc;
use tokio::net::TcpListener;
/// Write a `models.toml` with one alias to a unique temp path. Returns
/// the path; the file persists for the test process and gets reaped by
/// the OS at exit. Using $XDG_RUNTIME_DIR fallback for the temp dir
/// keeps the file off shared /tmp on CI without pulling in tempfile.
fn write_models_toml(alias: &str, target: &str) -> PathBuf {
let contents = format!(
r#"
[aliases]
"{alias}" = "{target}"
"#
);
let mut path = std::env::temp_dir();
let pid = std::process::id();
let now = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_nanos();
path.push(format!("cortex-test-models-{pid}-{now}.toml"));
std::fs::write(&path, contents).expect("write temp models.toml");
path
}
#[tokio::test]
async fn test_alias_resolves_in_chat_completions() {
let mock_url = common::spawn_mock_neuron().await;
let models_path = write_models_toml("helexa/small", "test-model");
let config = GatewayConfig {
gateway: GatewaySettings {
listen: "127.0.0.1:0".into(),
metrics_listen: "127.0.0.1:0".into(),
},
eviction: EvictionSettings {
strategy: EvictionStrategy::Lru,
defrag_after_cycles: 0,
},
neurons: vec![NeuronEndpoint {
name: "mock-node".into(),
endpoint: mock_url,
}],
models_config: models_path.to_string_lossy().to_string(),
};
let fleet = Arc::new(CortexState::from_config(&config));
// Seed the node as healthy with the concrete model loaded under
// the target id. The poller doesn't run in this test; we just
// populate state manually.
{
let mut nodes = fleet.nodes.write().await;
let node = nodes.get_mut("mock-node").expect("node must exist");
node.healthy = true;
node.models.insert(
"test-model".into(),
ModelEntry {
id: "test-model".into(),
status: ModelStatus::Loaded,
last_accessed: None,
vram_estimate_mb: None,
},
);
}
// Sanity: the catalogue actually picked up the alias.
assert_eq!(
fleet.catalogue.resolve_alias("helexa/small"),
"test-model",
"alias should resolve to target id"
);
// Spawn the gateway against this fleet.
let app = cortex_gateway::build_app(Arc::clone(&fleet));
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
let gateway_addr = listener.local_addr().unwrap();
tokio::spawn(async move {
axum::serve(listener, app).await.unwrap();
});
let gateway_url = format!("http://{gateway_addr}");
// Send a chat completion against the alias. The mock backend
// echoes back the `model` field it received — so a body whose
// model wasn't rewritten would come back as "helexa/small", and a
// properly-rewritten one as "test-model".
let client = reqwest::Client::new();
let resp = client
.post(format!("{gateway_url}/v1/chat/completions"))
.json(&json!({
"model": "helexa/small",
"messages": [{"role": "user", "content": "hi"}],
}))
.send()
.await
.expect("gateway should respond");
assert!(resp.status().is_success(), "gateway returned non-2xx");
let body: serde_json::Value = resp.json().await.expect("response is JSON");
assert_eq!(
body.get("model").and_then(|m| m.as_str()),
Some("test-model"),
"mock backend should have seen the resolved model id, not the alias"
);
}
#[tokio::test]
async fn test_aliases_surface_in_v1_models() {
let mock_url = common::spawn_mock_neuron().await;
let models_path = write_models_toml("helexa/small", "test-model");
let config = GatewayConfig {
gateway: GatewaySettings {
listen: "127.0.0.1:0".into(),
metrics_listen: "127.0.0.1:0".into(),
},
eviction: EvictionSettings {
strategy: EvictionStrategy::Lru,
defrag_after_cycles: 0,
},
neurons: vec![NeuronEndpoint {
name: "mock-node".into(),
endpoint: mock_url,
}],
models_config: models_path.to_string_lossy().to_string(),
};
let fleet = Arc::new(CortexState::from_config(&config));
// Seed the target as loaded so the alias's mirrored entry shows
// loaded=true.
{
let mut nodes = fleet.nodes.write().await;
let node = nodes.get_mut("mock-node").expect("node must exist");
node.healthy = true;
node.models.insert(
"test-model".into(),
ModelEntry {
id: "test-model".into(),
status: ModelStatus::Loaded,
last_accessed: None,
vram_estimate_mb: Some(2000),
},
);
}
let app = cortex_gateway::build_app(Arc::clone(&fleet));
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
let gateway_addr = listener.local_addr().unwrap();
tokio::spawn(async move {
axum::serve(listener, app).await.unwrap();
});
let gateway_url = format!("http://{gateway_addr}");
let resp = reqwest::get(format!("{gateway_url}/v1/models"))
.await
.expect("gateway should respond");
let body: serde_json::Value = resp.json().await.unwrap();
let entries = body
.get("data")
.and_then(|d| d.as_array())
.expect("data array");
// Both the alias and the target should be present.
let ids: Vec<&str> = entries
.iter()
.filter_map(|e| e.get("id").and_then(|v| v.as_str()))
.collect();
assert!(ids.contains(&"test-model"), "target should be listed");
assert!(ids.contains(&"helexa/small"), "alias should be listed");
// The alias's `loaded` flag and locations should mirror the target.
let alias_entry = entries
.iter()
.find(|e| e.get("id").and_then(|v| v.as_str()) == Some("helexa/small"))
.expect("alias entry");
assert_eq!(alias_entry.get("loaded"), Some(&json!(true)));
let locations = alias_entry
.get("locations")
.and_then(|l| l.as_array())
.expect("locations array");
assert_eq!(locations.len(), 1);
assert_eq!(
locations[0].get("node").and_then(|n| n.as_str()),
Some("mock-node")
);
}
#[tokio::test]
async fn test_alias_falls_through_for_unmapped_model() {
// Catalogue has an alias for some-other-thing but the request
// model "test-model" isn't an alias; resolution should be a no-op.
let mock_url = common::spawn_mock_neuron().await;
let models_path = write_models_toml("helexa/large", "definitely-not-loaded");
let config = GatewayConfig {
gateway: GatewaySettings {
listen: "127.0.0.1:0".into(),
metrics_listen: "127.0.0.1:0".into(),
},
eviction: EvictionSettings {
strategy: EvictionStrategy::Lru,
defrag_after_cycles: 0,
},
neurons: vec![NeuronEndpoint {
name: "mock-node".into(),
endpoint: mock_url,
}],
models_config: models_path.to_string_lossy().to_string(),
};
let fleet = Arc::new(CortexState::from_config(&config));
{
let mut nodes = fleet.nodes.write().await;
let node = nodes.get_mut("mock-node").expect("node must exist");
node.healthy = true;
node.models.insert(
"test-model".into(),
ModelEntry {
id: "test-model".into(),
status: ModelStatus::Loaded,
last_accessed: None,
vram_estimate_mb: None,
},
);
}
let app = cortex_gateway::build_app(Arc::clone(&fleet));
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
let gateway_addr = listener.local_addr().unwrap();
tokio::spawn(async move {
axum::serve(listener, app).await.unwrap();
});
let gateway_url = format!("http://{gateway_addr}");
let resp = reqwest::Client::new()
.post(format!("{gateway_url}/v1/chat/completions"))
.json(&json!({
"model": "test-model",
"messages": [{"role": "user", "content": "hi"}],
}))
.send()
.await
.unwrap();
assert!(resp.status().is_success());
let body: serde_json::Value = resp.json().await.unwrap();
assert_eq!(
body.get("model").and_then(|m| m.as_str()),
Some("test-model")
);
}