feat(catalogue,gateway): model aliases (helexa/small, helexa/balanced, helexa/large)

Operators can now define tier aliases in models.toml:

    [aliases]
    "helexa/small" = "Qwen/Qwen3-1.7B"
    "helexa/balanced" = "Qwen/Qwen3-8B"
    "helexa/large" = "Qwen/Qwen3.6-27B"

A client request for `model: "helexa/small"` is resolved to the concrete model id at routing time. The gateway also rewrites the proxied body's `model` field to the concrete id so neuron sees a name that matches its loaded handle (otherwise the harness rejects the request).

Motivated by the finger-in-the-wind benchmark: the same "what's the capital of Georgia" probe runs in 2.5s on the 1.7B vs 6.7s on the 27B with identical correctness. Aliases let clients pick a latency tier without hardcoding model ids, and let operators swap targets without changing client code.

Changes:

* cortex-core: `ModelCatalogue` gains `aliases: HashMap<String, String>` + `resolve_alias(&str) -> &str`. Unit tests cover the basic resolution + TOML round-trip.
* cortex-gateway:
  * `RouteDecision` gains `resolved_model_id: String`. `router::resolve` consumes aliases at entry and threads the concrete id through.
  * Handlers (chat_completions, completions, anthropic_messages streaming + non-streaming) rewrite the body's `model` field with `rewrite_model_in_body` before proxying, using the resolved id for metrics labels, LRU touch, and the body itself.
  * `/v1/models` (Pass 4) emits each alias as its own entry mirroring the target's `loaded` flag, feasible_on, and locations — clients browsing the endpoint see both names and can pick either.
* `models.toml` declares the three tier aliases; `models.example.toml` documents the section as opt-in.
* Integration tests verify: end-to-end alias→concrete request flow, alias surfacing in /v1/models, and no-op fall-through for non-alias model ids.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 16:10:41 +03:00
parent becf61b9c1
commit 24e20dcb5c
5 changed files with 426 additions and 7 deletions
--- a/crates/cortex-gateway/src/handlers.rs
+++ b/crates/cortex-gateway/src/handlers.rs
@@ -60,15 +60,16 @@ async fn chat_completions(
        }
    };

-    touch_model(&fleet, &route.node_name, &model_id).await;
+    touch_model(&fleet, &route.node_name, &route.resolved_model_id).await;

+    let body = rewrite_model_in_body(body, &route.resolved_model_id);
    proxy_with_metrics(
        &fleet,
        &route,
        "/v1/chat/completions",
        headers,
        body,
-        &model_id,
+        &route.resolved_model_id,
    )
    .await
 }
@@ -107,9 +108,18 @@ async fn completions(
        }
    };

-    touch_model(&fleet, &route.node_name, &model_id).await;
+    touch_model(&fleet, &route.node_name, &route.resolved_model_id).await;

-    proxy_with_metrics(&fleet, &route, "/v1/completions", headers, body, &model_id).await
+    let body = rewrite_model_in_body(body, &route.resolved_model_id);
+    proxy_with_metrics(
+        &fleet,
+        &route,
+        "/v1/completions",
+        headers,
+        body,
+        &route.resolved_model_id,
+    )
+    .await
 }

 /// `POST /v1/messages` — accept Anthropic format, translate, proxy, translate back.
@@ -166,10 +176,15 @@ async fn anthropic_messages(
        }
    };

-    touch_model(&fleet, &route.node_name, &model_id).await;
+    touch_model(&fleet, &route.node_name, &route.resolved_model_id).await;
+
+    // Swap the alias for the concrete id in the translated body so
+    // neuron's harness sees a model name that matches what it has
+    // loaded.
+    let openai_body = rewrite_model_in_body(openai_body, &route.resolved_model_id);

    let labels = [
-        ("model", model_id.clone()),
+        ("model", route.resolved_model_id.clone()),
        ("node", route.node_name.clone()),
    ];
    metrics::counter!("cortex_requests_total", &labels).increment(1);
@@ -434,6 +449,35 @@ async fn list_models(State(fleet): State<Arc<CortexState>>) -> Json<Value> {
        }
    }

+    // Pass 4: surface aliases as their own entries pointing at the
+    // same locations as the target id, so a client browsing /v1/models
+    // sees "helexa/small" / "helexa/balanced" / "helexa/large" (or
+    // whatever the operator defined) and can request inference
+    // against them directly. Aliases that point at unknown targets
+    // are skipped — surfacing a dead alias would be misleading.
+    for (alias, target) in &catalogue.aliases {
+        let Some(target_entry) = entries.get(target).cloned() else {
+            tracing::warn!(
+                alias = alias,
+                target = target,
+                "alias points at a model not present in catalogue or fleet; skipping"
+            );
+            continue;
+        };
+        entries.insert(
+            alias.clone(),
+            CortexModelEntry {
+                id: alias.clone(),
+                object: "model".into(),
+                created: now,
+                owned_by: "helexa".into(),
+                loaded: target_entry.loaded,
+                feasible_on: target_entry.feasible_on,
+                locations: target_entry.locations,
+            },
+        );
+    }
+
    let data: Vec<Value> = entries.values().map(|e| json!(e)).collect();
    Json(json!({
        "object": "list",
@@ -512,6 +556,38 @@ fn extract_model(body: &[u8]) -> Option<String> {
    v.get("model")?.as_str().map(|s| s.to_string())
 }

+/// Rewrite the top-level `model` field of an OpenAI-style JSON request
+/// body to `new_model` (the concrete id an alias resolved to).
+///
+/// Returns `body` unchanged when it does not parse as JSON, when `model`
+/// is absent or not a string, when it already equals `new_model`, or when
+/// re-serialization fails; proxying an odd body untouched beats a 500.
+/// Otherwise returns a fresh serialization of the parsed value, so the
+/// bytes are not guaranteed to match the client's original formatting.
+///
+/// Needed because neuron rejects requests whose `model` doesn't match a
+/// loaded model: `model: "helexa/small"` would 404 at the harness.
+fn rewrite_model_in_body(body: Bytes, new_model: &str) -> Bytes {
+    let Ok(mut v) = serde_json::from_slice::<Value>(&body) else {
+        return body; // not valid JSON: pass the bytes through untouched
+    };
+    let needs_rewrite = v
+        .get("model")
+        .and_then(|m| m.as_str())
+        .map(|m| m != new_model)
+        .unwrap_or(false); // missing or non-string `model` => no rewrite
+    if !needs_rewrite {
+        return body; // skip re-serializing when nothing would change
+    }
+    if let Value::Object(obj) = &mut v {
+        obj.insert("model".into(), Value::String(new_model.to_string()));
+    }
+    match serde_json::to_vec(&v) {
+        Ok(bytes) => Bytes::from(bytes),
+        Err(_) => body, // fall back to the original bytes on encode error
+    }
+}
+
 fn error_response(status: u16, message: &str) -> Response {
    let code = axum::http::StatusCode::from_u16(status)
        .unwrap_or(axum::http::StatusCode::INTERNAL_SERVER_ERROR);
--- a/crates/cortex-gateway/src/router.rs
+++ b/crates/cortex-gateway/src/router.rs
@@ -29,6 +29,13 @@ pub struct RouteDecision {
    /// when we just triggered an explicit cold-load via the catalogue
    /// path.
    pub cold_start: bool,
+    /// The concrete model id we actually routed to. Equal to the
+    /// caller's requested id unless an alias was resolved (e.g. caller
+    /// asked for `helexa/small`, this carries `Qwen/Qwen3-1.7B`). The
+    /// handler uses this to rewrite the request body's `model` field
+    /// before proxying — neurons reject requests where the body's
+    /// model name doesn't match a loaded model.
+    pub resolved_model_id: String,
 }

 #[derive(Debug, thiserror::Error)]
@@ -55,8 +62,20 @@ pub enum RouteError {
 /// Asks the neuron for the inference endpoint after selecting a node.
 pub async fn resolve(
    fleet: &Arc<CortexState>,
-    model_id: &str,
+    requested_model_id: &str,
 ) -> Result<RouteDecision, RouteError> {
+    // Alias resolution first — swap `helexa/small` (etc.) for the
+    // concrete id before any node lookups so the rest of routing,
+    // loading, and metrics deal in concrete ids only. `resolve_alias`
+    // returns the input verbatim when it isn't an alias.
+    let model_id = fleet.catalogue.resolve_alias(requested_model_id);
+    if model_id != requested_model_id {
+        tracing::debug!(
+            requested = requested_model_id,
+            resolved = model_id,
+            "alias resolved"
+        );
+    }
    // Snapshot loaded / unloaded state from the poller cache.
    let (loaded_route, unloaded_route, any_healthy) = {
        let nodes = fleet.nodes.read().await;
@@ -326,6 +345,7 @@ async fn finish(
        node_name: node_name.to_string(),
        endpoint,
        cold_start,
+        resolved_model_id: model_id.to_string(),
    })
 }