From 5ed1140c9747c7e23cc09ed27d711acd1e76dc6b Mon Sep 17 00:00:00 2001
From: rob thijssen <grenade@rob.tn>
Date: Sun, 31 May 2026 11:21:43 +0300
Subject: [PATCH] feat(cortex-gateway): proxy /v1/responses to neuron
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Step 3 of the Responses rollout: plain proxy route on the gateway,
no translation. Neuron speaks the Responses API natively after Step
2 (commit 957f704), so the gateway just needs the same routing
shape it uses for /v1/chat/completions — extract `model`, resolve
via router::resolve, forward verbatim.

- New `POST /v1/responses` handler in handlers.rs::responses.
- Mock neuron under tests/common/mod.rs gains a `/v1/responses`
  endpoint that mirrors the ResponsesResponse shape neuron emits.
- New integration test file `tests/responses.rs` exercises:
  - Happy path (200, body round-trips, ResponsesUsage shape).
  - Unknown model → 404 (matches chat-completions error shape).
  - Missing `model` field → 400 (same extract_model helper).

Streaming proxy works through the same path as chat completions —
the upstream Content-Type (`text/event-stream` for stream:true,
`application/json` otherwise) propagates through proxy_with_metrics
unchanged. Live-stream integration tests against a streaming mock
deferred until we exercise the path against a real neuron, since
the chat-completions streaming test already covers the proxy's
SSE forwarding mechanics.

Three new tests; clippy + fmt clean across the workspace.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 crates/cortex-gateway/src/handlers.rs     | 53 +++++++++++++
 crates/cortex-gateway/tests/common/mod.rs | 34 +++++++++
 crates/cortex-gateway/tests/responses.rs  | 91 +++++++++++++++++++++++
 3 files changed, 178 insertions(+)
 create mode 100644 crates/cortex-gateway/tests/responses.rs
diff --git a/crates/cortex-gateway/src/handlers.rs b/crates/cortex-gateway/src/handlers.rs
index a6df990..fe6e294 100644
--- a/crates/cortex-gateway/src/handlers.rs
+++ b/crates/cortex-gateway/src/handlers.rs
@@ -20,6 +20,7 @@ pub fn api_routes() -> Router<Arc<CortexState>> {
     Router::new()
         .route("/v1/chat/completions", post(chat_completions))
         .route("/v1/completions", post(completions))
+        .route("/v1/responses", post(responses))
         .route("/v1/models", get(list_models))
         .route("/v1/messages", post(anthropic_messages))
         .route("/health", get(health))
@@ -74,6 +75,58 @@ async fn chat_completions(
     .await
 }
 
+/// `POST /v1/responses` — proxy to the appropriate backend node.
+///
+/// Same routing shape as [`chat_completions`]: extract `model` from
+/// the body, resolve to a node, forward verbatim. No translation —
+/// neuron speaks the Responses API natively (see
+/// `crates/neuron/src/wire/openai_responses.rs`), so the gateway is
+/// a pass-through. Streaming and non-streaming are handled
+/// identically; the upstream `Content-Type` (text/event-stream vs.
+/// application/json) propagates through the proxy.
+async fn responses(
+    State(fleet): State<Arc<CortexState>>,
+    headers: HeaderMap,
+    body: Bytes,
+) -> Response {
+    let model_id = match extract_model(&body) {
+        Some(m) => m,
+        None => {
+            tracing::warn!(
+                handler = "responses",
+                "rejected: missing 'model' field in request body"
+            );
+            return error_response(400, "missing 'model' field in request body");
+        }
+    };
+
+    let route = match router::resolve(&fleet, &model_id).await {
+        Ok(r) => r,
+        Err(e) => {
+            tracing::warn!(
+                handler = "responses",
+                model = %model_id,
+                error = %e,
+                "route resolve failed"
+            );
+            return error_response(404, &e.to_string());
+        }
+    };
+
+    touch_model(&fleet, &route.node_name, &route.resolved_model_id).await;
+
+    let body = rewrite_model_in_body(body, &route.resolved_model_id);
+    proxy_with_metrics(
+        &fleet,
+        &route,
+        "/v1/responses",
+        headers,
+        body,
+        &route.resolved_model_id,
+    )
+    .await
+}
+
 /// `POST /v1/completions` — proxy completions endpoint.
 async fn completions(
     State(fleet): State<Arc<CortexState>>,
diff --git a/crates/cortex-gateway/tests/common/mod.rs b/crates/cortex-gateway/tests/common/mod.rs
index a1911a4..5a12c9b 100644
--- a/crates/cortex-gateway/tests/common/mod.rs
+++ b/crates/cortex-gateway/tests/common/mod.rs
@@ -44,6 +44,7 @@ pub async fn spawn_mock_neuron() -> String {
             post(|Json(_body): Json<Value>| async { Json(json!({"status": "unloaded"})) }),
         )
         .route("/v1/chat/completions", post(mock_chat_completions))
+        .route("/v1/responses", post(mock_responses))
         .route("/v1/models", get(mock_v1_models));
 
     tokio::spawn(async move {
@@ -93,6 +94,39 @@ async fn mock_chat_completions(Json(body): Json<Value>) -> Json<Value> {
     }))
 }
 
+async fn mock_responses(Json(body): Json<Value>) -> Json<Value> {
+    let model = body
+        .get("model")
+        .and_then(|v| v.as_str())
+        .unwrap_or("unknown");
+    // Echo the model field back and synthesise a tiny ResponsesResponse.
+    // Mirrors the shape neuron's /v1/responses handler emits so the
+    // gateway test only needs to assert the proxy round-tripped it.
+    Json(json!({
+        "id": "resp-test-001",
+        "object": "response",
+        "created_at": 1700000000_u64,
+        "status": "completed",
+        "model": model,
+        "output": [{
+            "type": "message",
+            "id": "msg-test-001",
+            "role": "assistant",
+            "content": [{
+                "type": "output_text",
+                "text": "Hello from mock backend",
+                "annotations": []
+            }],
+            "status": "completed"
+        }],
+        "usage": {
+            "input_tokens": 5,
+            "output_tokens": 5,
+            "total_tokens": 10
+        }
+    }))
+}
+
 /// Spawns a mock neuron that returns SSE streaming responses for chat completions.
 pub async fn spawn_streaming_mock_neuron(chunk_count: usize, chunk_delay: Duration) -> String {
     let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
diff --git a/crates/cortex-gateway/tests/responses.rs b/crates/cortex-gateway/tests/responses.rs
new file mode 100644
index 0000000..9e83a56
--- /dev/null
+++ b/crates/cortex-gateway/tests/responses.rs
@@ -0,0 +1,91 @@
+//! Integration tests for the `/v1/responses` proxy route.
+//!
+//! The gateway forwards the request body to whichever neuron has the
+//! model loaded. These tests exercise the routing decision (200 on a
+//! known model, 404 on an unknown model, 400 on a missing model
+//! field) and confirm the response body round-trips verbatim.
+
+mod common;
+
+use serde_json::json;
+
+/// Happy path: gateway routes a `/v1/responses` request to the neuron
+/// that has the model loaded, and the neuron's response body
+/// arrives at the client unchanged.
+#[tokio::test]
+async fn test_responses_proxy() {
+    let mock_url = common::spawn_mock_neuron().await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gw_url}/v1/responses"))
+        .header("content-type", "application/json")
+        .json(&json!({
+            "model": "test-model",
+            "input": "Hi"
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), 200);
+
+    let body: serde_json::Value = resp.json().await.expect("valid JSON response");
+    assert_eq!(body["id"], "resp-test-001");
+    assert_eq!(body["object"], "response");
+    assert_eq!(body["model"], "test-model");
+    assert_eq!(body["status"], "completed");
+    assert_eq!(
+        body["output"][0]["content"][0]["text"],
+        "Hello from mock backend"
+    );
+    // Usage shape is the Responses-specific (input/output_tokens),
+    // not the chat-completions one (prompt/completion_tokens). Asserts
+    // the proxy didn't accidentally route through the wrong handler.
+    assert_eq!(body["usage"]["total_tokens"], 10);
+    assert!(body["usage"].get("input_tokens").is_some());
+}
+
+/// A request that targets a model not present in the catalogue gets
+/// 404 from the router. This matches the chat-completions handler's
+/// behaviour — same error path, same status code, so a client can
+/// share retry logic across the two routes.
+#[tokio::test]
+async fn test_responses_model_not_found() {
+    let mock_url = common::spawn_mock_neuron().await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gw_url}/v1/responses"))
+        .json(&json!({
+            "model": "not-in-catalogue",
+            "input": "Hi"
+        }))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 404);
+}
+
+/// A request body without a `model` field can't be routed; the
+/// gateway returns 400 before reaching a backend. Same as the
+/// chat-completions handler — extracted via the same `extract_model`
+/// helper.
+#[tokio::test]
+async fn test_responses_missing_model_field() {
+    let mock_url = common::spawn_mock_neuron().await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gw_url}/v1/responses"))
+        .json(&json!({
+            "input": "Hi"
+        }))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 400);
+}