feat(cortex-gateway): proxy /v1/responses to neuron
Some checks failed
CI / CUDA type-check (push) Failing after 12s
build-prerelease / Resolve version stamps (push) Successful in 33s
CI / Format (push) Successful in 37s
CI / Clippy (push) Failing after 1m5s
build-prerelease / Build cortex binary (push) Successful in 4m26s
CI / Test (push) Successful in 5m17s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build neuron-blackwell (push) Successful in 5m39s
build-prerelease / Package cortex RPM (push) Successful in 1m24s
build-prerelease / Package helexa-neuron-ada RPM (push) Has been cancelled
build-prerelease / Package helexa-neuron-ampere RPM (push) Has been cancelled
build-prerelease / Package helexa-neuron-blackwell RPM (push) Has been cancelled
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Has been cancelled
build-prerelease / Build neuron-ada (push) Has been cancelled
build-prerelease / Build neuron-ampere (push) Has been cancelled
Some checks failed
CI / CUDA type-check (push) Failing after 12s
build-prerelease / Resolve version stamps (push) Successful in 33s
CI / Format (push) Successful in 37s
CI / Clippy (push) Failing after 1m5s
build-prerelease / Build cortex binary (push) Successful in 4m26s
CI / Test (push) Successful in 5m17s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build neuron-blackwell (push) Successful in 5m39s
build-prerelease / Package cortex RPM (push) Successful in 1m24s
build-prerelease / Package helexa-neuron-ada RPM (push) Has been cancelled
build-prerelease / Package helexa-neuron-ampere RPM (push) Has been cancelled
build-prerelease / Package helexa-neuron-blackwell RPM (push) Has been cancelled
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Has been cancelled
build-prerelease / Build neuron-ada (push) Has been cancelled
build-prerelease / Build neuron-ampere (push) Has been cancelled
Step 3 of the Responses rollout: plain proxy route on the gateway,
no translation. Neuron speaks the Responses API natively after Step
2 (commit 957f704), so the gateway just needs the same routing
shape it uses for /v1/chat/completions — extract `model`, resolve
via router::resolve, forward verbatim.
- New `POST /v1/responses` handler in handlers.rs::responses.
- Mock neuron under tests/common/mod.rs gains a `/v1/responses`
endpoint that mirrors the ResponsesResponse shape neuron emits.
- New integration test file `tests/responses.rs` exercises:
- Happy path (200, body round-trips, ResponsesUsage shape).
- Unknown model → 404 (matches chat-completions error shape).
- Missing `model` field → 400 (same extract_model helper).
Streaming proxy works through the same path as chat completions —
the upstream Content-Type (`text/event-stream` for stream:true,
`application/json` otherwise) propagates through proxy_with_metrics
unchanged. Live-stream integration tests against a streaming mock
deferred until we exercise the path against a real neuron, since
the chat-completions streaming test already covers the proxy's
SSE forwarding mechanics.
Three new tests; clippy + fmt clean across the workspace.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -20,6 +20,7 @@ pub fn api_routes() -> Router<Arc<CortexState>> {
|
||||
Router::new()
|
||||
.route("/v1/chat/completions", post(chat_completions))
|
||||
.route("/v1/completions", post(completions))
|
||||
.route("/v1/responses", post(responses))
|
||||
.route("/v1/models", get(list_models))
|
||||
.route("/v1/messages", post(anthropic_messages))
|
||||
.route("/health", get(health))
|
||||
@@ -74,6 +75,58 @@ async fn chat_completions(
|
||||
.await
|
||||
}
|
||||
|
||||
/// `POST /v1/responses` — proxy to the appropriate backend node.
|
||||
///
|
||||
/// Same routing shape as [`chat_completions`]: extract `model` from
|
||||
/// the body, resolve to a node, forward verbatim. No translation —
|
||||
/// neuron speaks the Responses API natively (see
|
||||
/// `crates/neuron/src/wire/openai_responses.rs`), so the gateway is
|
||||
/// a pass-through. Streaming and non-streaming are handled
|
||||
/// identically; the upstream `Content-Type` (text/event-stream vs.
|
||||
/// application/json) propagates through the proxy.
|
||||
async fn responses(
|
||||
State(fleet): State<Arc<CortexState>>,
|
||||
headers: HeaderMap,
|
||||
body: Bytes,
|
||||
) -> Response {
|
||||
let model_id = match extract_model(&body) {
|
||||
Some(m) => m,
|
||||
None => {
|
||||
tracing::warn!(
|
||||
handler = "responses",
|
||||
"rejected: missing 'model' field in request body"
|
||||
);
|
||||
return error_response(400, "missing 'model' field in request body");
|
||||
}
|
||||
};
|
||||
|
||||
let route = match router::resolve(&fleet, &model_id).await {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
handler = "responses",
|
||||
model = %model_id,
|
||||
error = %e,
|
||||
"route resolve failed"
|
||||
);
|
||||
return error_response(404, &e.to_string());
|
||||
}
|
||||
};
|
||||
|
||||
touch_model(&fleet, &route.node_name, &route.resolved_model_id).await;
|
||||
|
||||
let body = rewrite_model_in_body(body, &route.resolved_model_id);
|
||||
proxy_with_metrics(
|
||||
&fleet,
|
||||
&route,
|
||||
"/v1/responses",
|
||||
headers,
|
||||
body,
|
||||
&route.resolved_model_id,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// `POST /v1/completions` — proxy completions endpoint.
|
||||
async fn completions(
|
||||
State(fleet): State<Arc<CortexState>>,
|
||||
|
||||
@@ -44,6 +44,7 @@ pub async fn spawn_mock_neuron() -> String {
|
||||
post(|Json(_body): Json<Value>| async { Json(json!({"status": "unloaded"})) }),
|
||||
)
|
||||
.route("/v1/chat/completions", post(mock_chat_completions))
|
||||
.route("/v1/responses", post(mock_responses))
|
||||
.route("/v1/models", get(mock_v1_models));
|
||||
|
||||
tokio::spawn(async move {
|
||||
@@ -93,6 +94,39 @@ async fn mock_chat_completions(Json(body): Json<Value>) -> Json<Value> {
|
||||
}))
|
||||
}
|
||||
|
||||
async fn mock_responses(Json(body): Json<Value>) -> Json<Value> {
|
||||
let model = body
|
||||
.get("model")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("unknown");
|
||||
// Echo the model field back and synthesise a tiny ResponsesResponse.
|
||||
// Mirrors the shape neuron's /v1/responses handler emits so the
|
||||
// gateway test only needs to assert the proxy round-tripped it.
|
||||
Json(json!({
|
||||
"id": "resp-test-001",
|
||||
"object": "response",
|
||||
"created_at": 1700000000_u64,
|
||||
"status": "completed",
|
||||
"model": model,
|
||||
"output": [{
|
||||
"type": "message",
|
||||
"id": "msg-test-001",
|
||||
"role": "assistant",
|
||||
"content": [{
|
||||
"type": "output_text",
|
||||
"text": "Hello from mock backend",
|
||||
"annotations": []
|
||||
}],
|
||||
"status": "completed"
|
||||
}],
|
||||
"usage": {
|
||||
"input_tokens": 5,
|
||||
"output_tokens": 5,
|
||||
"total_tokens": 10
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
/// Spawns a mock neuron that returns SSE streaming responses for chat completions.
|
||||
pub async fn spawn_streaming_mock_neuron(chunk_count: usize, chunk_delay: Duration) -> String {
|
||||
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||
|
||||
91
crates/cortex-gateway/tests/responses.rs
Normal file
91
crates/cortex-gateway/tests/responses.rs
Normal file
@@ -0,0 +1,91 @@
|
||||
//! Integration tests for the `/v1/responses` proxy route.
|
||||
//!
|
||||
//! The gateway forwards the request body to whichever neuron has the
|
||||
//! model loaded. These tests exercise the routing decision (200 on a
|
||||
//! known model, 404 on an unknown model, 400 on a missing model
|
||||
//! field) and confirm the response body round-trips verbatim.
|
||||
|
||||
mod common;
|
||||
|
||||
use serde_json::json;
|
||||
|
||||
/// Happy path: gateway routes a `/v1/responses` request to the neuron
|
||||
/// that has the model loaded, and the neuron's response body
|
||||
/// arrives at the client unchanged.
|
||||
#[tokio::test]
|
||||
async fn test_responses_proxy() {
|
||||
let mock_url = common::spawn_mock_neuron().await;
|
||||
let gw_url = common::spawn_gateway(&mock_url).await;
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let resp = client
|
||||
.post(format!("{gw_url}/v1/responses"))
|
||||
.header("content-type", "application/json")
|
||||
.json(&json!({
|
||||
"model": "test-model",
|
||||
"input": "Hi"
|
||||
}))
|
||||
.send()
|
||||
.await
|
||||
.expect("request should succeed");
|
||||
|
||||
assert_eq!(resp.status(), 200);
|
||||
|
||||
let body: serde_json::Value = resp.json().await.expect("valid JSON response");
|
||||
assert_eq!(body["id"], "resp-test-001");
|
||||
assert_eq!(body["object"], "response");
|
||||
assert_eq!(body["model"], "test-model");
|
||||
assert_eq!(body["status"], "completed");
|
||||
assert_eq!(
|
||||
body["output"][0]["content"][0]["text"],
|
||||
"Hello from mock backend"
|
||||
);
|
||||
// Usage shape is the Responses-specific (input/output_tokens),
|
||||
// not the chat-completions one (prompt/completion_tokens). Asserts
|
||||
// the proxy didn't accidentally route through the wrong handler.
|
||||
assert_eq!(body["usage"]["total_tokens"], 10);
|
||||
assert!(body["usage"].get("input_tokens").is_some());
|
||||
}
|
||||
|
||||
/// A request that targets a model not present in the catalogue gets
|
||||
/// 404 from the router. This matches the chat-completions handler's
|
||||
/// behaviour — same error path, same status code, so a client can
|
||||
/// share retry logic across the two routes.
|
||||
#[tokio::test]
|
||||
async fn test_responses_model_not_found() {
|
||||
let mock_url = common::spawn_mock_neuron().await;
|
||||
let gw_url = common::spawn_gateway(&mock_url).await;
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let resp = client
|
||||
.post(format!("{gw_url}/v1/responses"))
|
||||
.json(&json!({
|
||||
"model": "not-in-catalogue",
|
||||
"input": "Hi"
|
||||
}))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), 404);
|
||||
}
|
||||
|
||||
/// A request body without a `model` field can't be routed; the
|
||||
/// gateway returns 400 before reaching a backend. Same as the
|
||||
/// chat-completions handler — extracted via the same `extract_model`
|
||||
/// helper.
|
||||
#[tokio::test]
|
||||
async fn test_responses_missing_model_field() {
|
||||
let mock_url = common::spawn_mock_neuron().await;
|
||||
let gw_url = common::spawn_gateway(&mock_url).await;
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let resp = client
|
||||
.post(format!("{gw_url}/v1/responses"))
|
||||
.json(&json!({
|
||||
"input": "Hi"
|
||||
}))
|
||||
.send()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(resp.status(), 400);
|
||||
}
|
||||
Reference in New Issue
Block a user