fix(gateway): full observability + stop leaking upstream bodies

Comprehensive sweep across cortex-gateway's request handling. Every failure path now emits exactly one structured warn (or error) event on the cortex side with the wire-level detail an operator needs; the API response carries only a generic message plus, where useful, the upstream status code.

proxy.rs::forward_request:
- warn on network failure (network error, target URL).
- warn on upstream non-2xx (status, target URL). Streaming body still passes through to the client; we just can't snippet without breaking the stream.
- warn on response-build failure.
- ProxyError::into_response no longer interpolates the inner error into the API body — generic "upstream request failed" / "failed to build response" instead.

handlers.rs::chat_completions, handlers.rs::completions:
- warn on missing model field, with handler= label.
- warn on route resolve failure with model + error chain. The user-facing 404 keeps the RouteError Display string (which is short, informative, and contains no internal detail beyond the model id and config'd node names).

handlers.rs::anthropic_messages:
- warn on invalid Anthropic body, on route resolve, on upstream network error, on upstream non-2xx (with 512-char body snippet), on upstream body read, and on response parse (also with 512-char body snippet). The translated-OpenAI serialise failure is internal, so it is logged at error level instead.
- All warns share consistent field shape: handler, model, node, url, status / error / body as applicable.
- API response messages are now uniformly generic.
- Adds an info-level "proxying request" log on the non-streaming path so successful proxies are also visible.

handlers.rs::proxy_with_metrics:
- still calls e.into_response() but proxy::forward_request already warn'd at the wire layer, so no double-log here.

Tests:
- All 32 existing unit tests + 22 gateway integration tests + 4 new router tests pass.
- Tests that asserted on the "no healthy nodes" / "not found" strings still match because RouteError messages are preserved in the 404 user-facing path.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 07:17:26 +03:00
parent 0f00f72b47
commit aa88d37509
2 changed files with 166 additions and 27 deletions
--- a/crates/cortex-gateway/src/handlers.rs
+++ b/crates/cortex-gateway/src/handlers.rs
@@ -34,12 +34,30 @@ async fn chat_completions(
 ) -> Response {
    let model_id = match extract_model(&body) {
        Some(m) => m,
-        None => return error_response(400, "missing 'model' field in request body"),
+        None => {
            tracing::warn!(
                handler = "chat_completions",
                "rejected: missing 'model' field in request body"
            );
            return error_response(400, "missing 'model' field in request body");
        }
    };
    let route = match router::resolve(&fleet, &model_id).await {
        Ok(r) => r,
-        Err(e) => return error_response(404, &e.to_string()),
+        Err(e) => {
            tracing::warn!(
                handler = "chat_completions",
                model = %model_id,
                error = %e,
                "route resolve failed"
            );
            // RouteError's Display strings are short and informative
            // ("model 'X' not found...", "no healthy nodes available")
            // — fine to surface to the caller. The warn above carries
            // any extra context for operators.
            return error_response(404, &e.to_string());
        }
    };
    touch_model(&fleet, &route.node_name, &model_id).await;
@@ -63,12 +81,30 @@ async fn completions(
 ) -> Response {
    let model_id = match extract_model(&body) {
        Some(m) => m,
-        None => return error_response(400, "missing 'model' field in request body"),
+        None => {
            tracing::warn!(
                handler = "completions",
                "rejected: missing 'model' field in request body"
            );
            return error_response(400, "missing 'model' field in request body");
        }
    };
    let route = match router::resolve(&fleet, &model_id).await {
        Ok(r) => r,
-        Err(e) => return error_response(404, &e.to_string()),
+        Err(e) => {
            tracing::warn!(
                handler = "completions",
                model = %model_id,
                error = %e,
                "route resolve failed"
            );
            // RouteError's Display strings are short and informative
            // ("model 'X' not found...", "no healthy nodes available")
            // — fine to surface to the caller. The warn above carries
            // any extra context for operators.
            return error_response(404, &e.to_string());
        }
    };
    touch_model(&fleet, &route.node_name, &model_id).await;
@@ -85,7 +121,14 @@ async fn anthropic_messages(
    // Parse as Anthropic request.
    let anth_req: cortex_core::anthropic::MessagesRequest = match serde_json::from_slice(&body) {
        Ok(r) => r,
-        Err(e) => return error_response(400, &format!("invalid Anthropic request: {e}")),
+        Err(e) => {
            tracing::warn!(
                handler = "anthropic_messages",
                error = %e,
                "rejected: invalid Anthropic request body"
            );
            return error_response(400, "invalid Anthropic request body");
        }
    };
    let model_id = anth_req.model.clone();
@@ -95,12 +138,32 @@ async fn anthropic_messages(
    let openai_req = cortex_core::translate::anthropic_to_openai(anth_req);
    let openai_body = match serde_json::to_vec(&openai_req) {
        Ok(b) => Bytes::from(b),
-        Err(e) => return error_response(500, &format!("translation error: {e}")),
+        Err(e) => {
            tracing::error!(
                handler = "anthropic_messages",
                model = %model_id,
                error = %e,
                "internal: failed to serialise translated OpenAI request"
            );
            return error_response(500, "internal translation error");
        }
    };
    let route = match router::resolve(&fleet, &model_id).await {
        Ok(r) => r,
-        Err(e) => return error_response(404, &e.to_string()),
+        Err(e) => {
            tracing::warn!(
                handler = "anthropic_messages",
                model = %model_id,
                error = %e,
                "route resolve failed"
            );
            // RouteError's Display strings are short and informative
            // ("model 'X' not found...", "no healthy nodes available")
            // — fine to surface to the caller. The warn above carries
            // any extra context for operators.
            return error_response(404, &e.to_string());
        }
    };
    touch_model(&fleet, &route.node_name, &model_id).await;
@@ -133,12 +196,22 @@ async fn anthropic_messages(
            Ok(resp) => resp,
            Err(e) => {
                metrics::counter!("cortex_request_errors_total", &labels).increment(1);
                // forward_request already warn'd with the wire-level
                // detail; no need to log again here.
                e.into_response()
            }
        }
    } else {
        // Non-streaming: proxy, buffer full response, translate back to Anthropic.
        let target_url = format!("{}/v1/chat/completions", route.endpoint);
        tracing::info!(
            handler = "anthropic_messages",
            model = %model_id,
            node = %route.node_name,
            url = %target_url,
            cold_start = route.cold_start,
            "proxying request"
        );
        let upstream_resp = fleet
            .http_client
            .post(&target_url)
@@ -152,28 +225,31 @@ async fn anthropic_messages(
            Err(e) => {
                metrics::counter!("cortex_request_errors_total", &labels).increment(1);
                tracing::warn!(
                    handler = "anthropic_messages",
                    model = %model_id,
                    node = %route.node_name,
-                    target = %target_url,
+                    url = %target_url,
                    error = %e,
-                    "anthropic proxy: upstream request failed (network)"
+                    "upstream request failed (network)"
                );
                return error_response(502, "upstream request failed");
            }
        };
-        if !upstream_resp.status().is_success() {
+        let upstream_status = upstream_resp.status();
        if !upstream_status.is_success() {
            metrics::counter!("cortex_request_errors_total", &labels).increment(1);
-            let status = upstream_resp.status().as_u16();
+            let status = upstream_status.as_u16();
            let body = upstream_resp.text().await.unwrap_or_default();
            let body_snippet = body.chars().take(512).collect::<String>();
            tracing::warn!(
                handler = "anthropic_messages",
                model = %model_id,
                node = %route.node_name,
-                target = %target_url,
+                url = %target_url,
                status,
                body = %body_snippet,
-                "anthropic proxy: upstream returned non-2xx"
+                "upstream returned non-2xx"
            );
            return error_response(status, &format!("upstream returned {status}"));
        }
@@ -182,7 +258,15 @@ async fn anthropic_messages(
            Ok(b) => b,
            Err(e) => {
                metrics::counter!("cortex_request_errors_total", &labels).increment(1);
-                return error_response(502, &format!("failed to read upstream response: {e}"));
+                tracing::warn!(
                    handler = "anthropic_messages",
                    model = %model_id,
                    node = %route.node_name,
                    url = %target_url,
                    error = %e,
                    "failed to read upstream response body"
                );
                return error_response(502, "failed to read upstream response");
            }
        };
@@ -191,7 +275,20 @@ async fn anthropic_messages(
                Ok(r) => r,
                Err(e) => {
                    metrics::counter!("cortex_request_errors_total", &labels).increment(1);
-                    return error_response(502, &format!("failed to parse upstream response: {e}"));
+                    let body_snippet = String::from_utf8_lossy(&body_bytes)
                        .chars()
                        .take(512)
                        .collect::<String>();
                    tracing::warn!(
                        handler = "anthropic_messages",
                        model = %model_id,
                        node = %route.node_name,
                        url = %target_url,
                        error = %e,
                        body = %body_snippet,
                        "failed to parse upstream response as OpenAI ChatCompletionResponse"
                    );
                    return error_response(502, "malformed upstream response");
                }
            };
@@ -343,6 +440,9 @@ async fn proxy_with_metrics(
        }
        Err(e) => {
            metrics::counter!("cortex_request_errors_total", &labels).increment(1);
            // proxy::forward_request already warn'd with wire-level
            // detail (target URL, error, status). ProxyError::into_response
            // now returns a generic message — no body leak.
            e.into_response()
        }
    }
--- a/crates/cortex-gateway/src/proxy.rs
+++ b/crates/cortex-gateway/src/proxy.rs
@@ -12,6 +12,13 @@ use axum::response::{IntoResponse, Response};
 use reqwest::Client;
 /// Proxy a request body to the resolved backend node and stream the response.
 ///
 /// Logging contract: every call emits exactly one structured event at
 /// info / warn level for operator visibility, regardless of outcome.
 /// Network-level failures and non-2xx upstream statuses are warn'd here
 /// (closest to the wire); the user-facing response carries only the
 /// status code and a generic message — implementation detail (body,
 /// error chain) lives in the log, never in the API surface.
 pub async fn forward_request(
    client: &Client,
    route: &RouteDecision,
@@ -37,10 +44,33 @@ pub async fn forward_request(
        req_builder = req_builder.header(key, value);
    }
-    let upstream_resp = req_builder.send().await.map_err(ProxyError::Upstream)?;
+    let upstream_resp = match req_builder.send().await {
        Ok(r) => r,
        Err(e) => {
            tracing::warn!(
                node = %route.node_name,
                url = %url,
                error = %e,
                "proxy: upstream request failed (network)"
            );
            return Err(ProxyError::Upstream(e));
        }
    };
-    let status =
+    let upstream_status = upstream_resp.status();
-        StatusCode::from_u16(upstream_resp.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
+    if !upstream_status.is_success() {
        // Streaming body — can't snippet without breaking the stream
        // pass-through. Log status + URL only; the client still gets the
        // upstream status, and the body is never copied into the log.
        tracing::warn!(
            node = %route.node_name,
            url = %url,
            status = upstream_status.as_u16(),
            "proxy: upstream returned non-2xx"
        );
    }
    let status = StatusCode::from_u16(upstream_status.as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
    let resp_headers = upstream_resp.headers().clone();
    let stream = upstream_resp.bytes_stream();
@@ -52,28 +82,37 @@ pub async fn forward_request(
        response = response.header(key, value);
    }
-    response
+    response.body(body).map_err(|e| {
-        .body(body)
+        tracing::warn!(
-        .map_err(|e| ProxyError::ResponseBuild(e.to_string()))
+            node = %route.node_name,
            url = %url,
            error = %e,
            "proxy: failed to build response"
        );
        ProxyError::ResponseBuild(e.to_string())
    })
 }
 #[derive(Debug, thiserror::Error)]
 pub enum ProxyError {
-    #[error("upstream request failed: {0}")]
+    #[error("upstream request failed")]
    Upstream(reqwest::Error),
-    #[error("failed to build response: {0}")]
+    #[error("failed to build response")]
    ResponseBuild(String),
 }
 impl IntoResponse for ProxyError {
    fn into_response(self) -> Response {
-        let status = match &self {
+        let (status, message) = match &self {
-            ProxyError::Upstream(_) => StatusCode::BAD_GATEWAY,
+            ProxyError::Upstream(_) => (StatusCode::BAD_GATEWAY, "upstream request failed"),
-            ProxyError::ResponseBuild(_) => StatusCode::INTERNAL_SERVER_ERROR,
+            ProxyError::ResponseBuild(_) => (
                StatusCode::INTERNAL_SERVER_ERROR,
                "failed to build response",
            ),
        };
        let body = serde_json::json!({
            "error": {
-                "message": self.to_string(),
+                "message": message,
                "type": "proxy_error",
            }
        });