From 5ed1140c9747c7e23cc09ed27d711acd1e76dc6b Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Sun, 31 May 2026 11:21:43 +0300 Subject: [PATCH] feat(cortex-gateway): proxy /v1/responses to neuron MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Step 3 of the Responses rollout: plain proxy route on the gateway, no translation. Neuron speaks the Responses API natively after Step 2 (commit 957f704), so the gateway just needs the same routing shape it uses for /v1/chat/completions — extract `model`, resolve via router::resolve, forward verbatim. - New `POST /v1/responses` handler in handlers.rs::responses. - Mock neuron under tests/common/mod.rs gains a `/v1/responses` endpoint that mirrors the ResponsesResponse shape neuron emits. - New integration test file `tests/responses.rs` exercises: - Happy path (200, body round-trips, ResponsesUsage shape). - Unknown model → 404 (matches chat-completions error shape). - Missing `model` field → 400 (same extract_model helper). Streaming proxy works through the same path as chat completions — the upstream Content-Type (`text/event-stream` for stream:true, `application/json` otherwise) propagates through proxy_with_metrics unchanged. Live-stream integration tests against a streaming mock deferred until we exercise the path against a real neuron, since the chat-completions streaming test already covers the proxy's SSE forwarding mechanics. Three new tests; clippy + fmt clean across the workspace. Co-Authored-By: Claude Opus 4.7 --- crates/cortex-gateway/src/handlers.rs | 53 +++++++++++++ crates/cortex-gateway/tests/common/mod.rs | 34 +++++++++ crates/cortex-gateway/tests/responses.rs | 91 +++++++++++++++++++++++ 3 files changed, 178 insertions(+) create mode 100644 crates/cortex-gateway/tests/responses.rs diff --git a/crates/cortex-gateway/src/handlers.rs b/crates/cortex-gateway/src/handlers.rs index a6df990..fe6e294 100644 --- a/crates/cortex-gateway/src/handlers.rs +++ b/crates/cortex-gateway/src/handlers.rs @@ -20,6 +20,7 @@ pub fn api_routes() -> Router> { Router::new() .route("/v1/chat/completions", post(chat_completions)) .route("/v1/completions", post(completions)) + .route("/v1/responses", post(responses)) .route("/v1/models", get(list_models)) .route("/v1/messages", post(anthropic_messages)) .route("/health", get(health)) @@ -74,6 +75,58 @@ async fn chat_completions( .await } +/// `POST /v1/responses` — proxy to the appropriate backend node. +/// +/// Same routing shape as [`chat_completions`]: extract `model` from +/// the body, resolve to a node, forward verbatim. No translation — +/// neuron speaks the Responses API natively (see +/// `crates/neuron/src/wire/openai_responses.rs`), so the gateway is +/// a pass-through. Streaming and non-streaming are handled +/// identically; the upstream `Content-Type` (text/event-stream vs. +/// application/json) propagates through the proxy. +async fn responses( + State(fleet): State>, + headers: HeaderMap, + body: Bytes, +) -> Response { + let model_id = match extract_model(&body) { + Some(m) => m, + None => { + tracing::warn!( + handler = "responses", + "rejected: missing 'model' field in request body" + ); + return error_response(400, "missing 'model' field in request body"); + } + }; + + let route = match router::resolve(&fleet, &model_id).await { + Ok(r) => r, + Err(e) => { + tracing::warn!( + handler = "responses", + model = %model_id, + error = %e, + "route resolve failed" + ); + return error_response(404, &e.to_string()); + } + }; + + touch_model(&fleet, &route.node_name, &route.resolved_model_id).await; + + let body = rewrite_model_in_body(body, &route.resolved_model_id); + proxy_with_metrics( + &fleet, + &route, + "/v1/responses", + headers, + body, + &route.resolved_model_id, + ) + .await +} + /// `POST /v1/completions` — proxy completions endpoint. async fn completions( State(fleet): State>, diff --git a/crates/cortex-gateway/tests/common/mod.rs b/crates/cortex-gateway/tests/common/mod.rs index a1911a4..5a12c9b 100644 --- a/crates/cortex-gateway/tests/common/mod.rs +++ b/crates/cortex-gateway/tests/common/mod.rs @@ -44,6 +44,7 @@ pub async fn spawn_mock_neuron() -> String { post(|Json(_body): Json| async { Json(json!({"status": "unloaded"})) }), ) .route("/v1/chat/completions", post(mock_chat_completions)) + .route("/v1/responses", post(mock_responses)) .route("/v1/models", get(mock_v1_models)); tokio::spawn(async move { @@ -93,6 +94,39 @@ async fn mock_chat_completions(Json(body): Json) -> Json { })) } +async fn mock_responses(Json(body): Json) -> Json { + let model = body + .get("model") + .and_then(|v| v.as_str()) + .unwrap_or("unknown"); + // Echo the model field back and synthesise a tiny ResponsesResponse. + // Mirrors the shape neuron's /v1/responses handler emits so the + // gateway test only needs to assert the proxy round-tripped it. + Json(json!({ + "id": "resp-test-001", + "object": "response", + "created_at": 1700000000_u64, + "status": "completed", + "model": model, + "output": [{ + "type": "message", + "id": "msg-test-001", + "role": "assistant", + "content": [{ + "type": "output_text", + "text": "Hello from mock backend", + "annotations": [] + }], + "status": "completed" + }], + "usage": { + "input_tokens": 5, + "output_tokens": 5, + "total_tokens": 10 + } + })) +} + /// Spawns a mock neuron that returns SSE streaming responses for chat completions. pub async fn spawn_streaming_mock_neuron(chunk_count: usize, chunk_delay: Duration) -> String { let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); diff --git a/crates/cortex-gateway/tests/responses.rs b/crates/cortex-gateway/tests/responses.rs new file mode 100644 index 0000000..9e83a56 --- /dev/null +++ b/crates/cortex-gateway/tests/responses.rs @@ -0,0 +1,91 @@ +//! Integration tests for the `/v1/responses` proxy route. +//! +//! The gateway forwards the request body to whichever neuron has the +//! model loaded. These tests exercise the routing decision (200 on a +//! known model, 404 on an unknown model, 400 on a missing model +//! field) and confirm the response body round-trips verbatim. + +mod common; + +use serde_json::json; + +/// Happy path: gateway routes a `/v1/responses` request to the neuron +/// that has the model loaded, and the neuron's response body +/// arrives at the client unchanged. +#[tokio::test] +async fn test_responses_proxy() { + let mock_url = common::spawn_mock_neuron().await; + let gw_url = common::spawn_gateway(&mock_url).await; + + let client = reqwest::Client::new(); + let resp = client + .post(format!("{gw_url}/v1/responses")) + .header("content-type", "application/json") + .json(&json!({ + "model": "test-model", + "input": "Hi" + })) + .send() + .await + .expect("request should succeed"); + + assert_eq!(resp.status(), 200); + + let body: serde_json::Value = resp.json().await.expect("valid JSON response"); + assert_eq!(body["id"], "resp-test-001"); + assert_eq!(body["object"], "response"); + assert_eq!(body["model"], "test-model"); + assert_eq!(body["status"], "completed"); + assert_eq!( + body["output"][0]["content"][0]["text"], + "Hello from mock backend" + ); + // Usage shape is the Responses-specific (input/output_tokens), + // not the chat-completions one (prompt/completion_tokens). Asserts + // the proxy didn't accidentally route through the wrong handler. + assert_eq!(body["usage"]["total_tokens"], 10); + assert!(body["usage"].get("input_tokens").is_some()); +} + +/// A request that targets a model not present in the catalogue gets +/// 404 from the router. This matches the chat-completions handler's +/// behaviour — same error path, same status code, so a client can +/// share retry logic across the two routes. +#[tokio::test] +async fn test_responses_model_not_found() { + let mock_url = common::spawn_mock_neuron().await; + let gw_url = common::spawn_gateway(&mock_url).await; + + let client = reqwest::Client::new(); + let resp = client + .post(format!("{gw_url}/v1/responses")) + .json(&json!({ + "model": "not-in-catalogue", + "input": "Hi" + })) + .send() + .await + .unwrap(); + assert_eq!(resp.status(), 404); +} + +/// A request body without a `model` field can't be routed; the +/// gateway returns 400 before reaching a backend. Same as the +/// chat-completions handler — extracted via the same `extract_model` +/// helper. +#[tokio::test] +async fn test_responses_missing_model_field() { + let mock_url = common::spawn_mock_neuron().await; + let gw_url = common::spawn_gateway(&mock_url).await; + + let client = reqwest::Client::new(); + let resp = client + .post(format!("{gw_url}/v1/responses")) + .json(&json!({ + "input": "Hi" + })) + .send() + .await + .unwrap(); + assert_eq!(resp.status(), 400); +}