fix(gateway): full observability + stop leaking upstream bodies
All checks were successful
build-prerelease / Resolve version stamps (push) Successful in 39s
CI / Format (push) Successful in 42s
CI / Clippy (push) Successful in 2m27s
build-prerelease / Build neuron-blackwell (push) Successful in 3m39s
CI / Test (push) Successful in 4m42s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m31s
build-prerelease / Package cortex RPM (push) Successful in 1m21s
build-prerelease / Build neuron-ampere (push) Successful in 4m53s
build-prerelease / Build neuron-ada (push) Successful in 5m7s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m58s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m3s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m43s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m3s

Comprehensive sweep across cortex-gateway's request handling. Every
failure path now emits exactly one structured warn (or error) event
on the cortex side with the wire-level detail an operator needs;
the API response carries only a generic message plus, where useful,
the upstream status code.

proxy.rs::forward_request:
- warn on network failure (network error, target URL).
- warn on upstream non-2xx (status, target URL). Streaming body still
  passes through to the client; we just can't snippet without
  breaking the stream.
- warn on response-build failure.
- ProxyError::into_response no longer interpolates the inner error
  into the API body — generic "upstream request failed" / "failed to
  build response" instead.

handlers.rs::chat_completions, handlers.rs::completions:
- warn on missing model field, with handler= label.
- warn on route resolve failure with model + error chain. The
  user-facing 404 keeps the RouteError Display string (which is
  short, informative, and contains no internal detail beyond the
  model id and config'd node names).

handlers.rs::anthropic_messages:
- warn on invalid Anthropic body, on route resolve, on upstream network
  error, on upstream non-2xx (with a 512-char body snippet), on
  upstream body read, and on response parse (also with a 512-char body
  snippet); error-level event on translated-OpenAI serialise failure
  (which is internal).
- All warns share consistent field shape: handler, model, node, url,
  status / error / body as applicable.
- API response messages are now uniformly generic.
- Adds an info-level "proxying request" log on the non-streaming
  path so successful proxies are also visible.

handlers.rs::proxy_with_metrics:
- still calls e.into_response() but proxy::forward_request already
  warn'd at the wire layer, so no double-log here.

Tests:
- All 32 existing unit tests + 22 gateway integration tests + 4
  new router tests pass.
- Tests that asserted on the "no healthy nodes" / "not found"
  strings still match because RouteError messages are preserved
  in the 404 user-facing path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-22 07:17:26 +03:00
parent 0f00f72b47
commit aa88d37509
2 changed files with 166 additions and 27 deletions

View File

@@ -34,12 +34,30 @@ async fn chat_completions(
) -> Response { ) -> Response {
let model_id = match extract_model(&body) { let model_id = match extract_model(&body) {
Some(m) => m, Some(m) => m,
None => return error_response(400, "missing 'model' field in request body"), None => {
tracing::warn!(
handler = "chat_completions",
"rejected: missing 'model' field in request body"
);
return error_response(400, "missing 'model' field in request body");
}
}; };
let route = match router::resolve(&fleet, &model_id).await { let route = match router::resolve(&fleet, &model_id).await {
Ok(r) => r, Ok(r) => r,
Err(e) => return error_response(404, &e.to_string()), Err(e) => {
tracing::warn!(
handler = "chat_completions",
model = %model_id,
error = %e,
"route resolve failed"
);
// RouteError's Display strings are short and informative
// ("model 'X' not found...", "no healthy nodes available")
// — fine to surface to the caller. The warn above carries
// any extra context for operators.
return error_response(404, &e.to_string());
}
}; };
touch_model(&fleet, &route.node_name, &model_id).await; touch_model(&fleet, &route.node_name, &model_id).await;
@@ -63,12 +81,30 @@ async fn completions(
) -> Response { ) -> Response {
let model_id = match extract_model(&body) { let model_id = match extract_model(&body) {
Some(m) => m, Some(m) => m,
None => return error_response(400, "missing 'model' field in request body"), None => {
tracing::warn!(
handler = "completions",
"rejected: missing 'model' field in request body"
);
return error_response(400, "missing 'model' field in request body");
}
}; };
let route = match router::resolve(&fleet, &model_id).await { let route = match router::resolve(&fleet, &model_id).await {
Ok(r) => r, Ok(r) => r,
Err(e) => return error_response(404, &e.to_string()), Err(e) => {
tracing::warn!(
handler = "completions",
model = %model_id,
error = %e,
"route resolve failed"
);
// RouteError's Display strings are short and informative
// ("model 'X' not found...", "no healthy nodes available")
// — fine to surface to the caller. The warn above carries
// any extra context for operators.
return error_response(404, &e.to_string());
}
}; };
touch_model(&fleet, &route.node_name, &model_id).await; touch_model(&fleet, &route.node_name, &model_id).await;
@@ -85,7 +121,14 @@ async fn anthropic_messages(
// Parse as Anthropic request. // Parse as Anthropic request.
let anth_req: cortex_core::anthropic::MessagesRequest = match serde_json::from_slice(&body) { let anth_req: cortex_core::anthropic::MessagesRequest = match serde_json::from_slice(&body) {
Ok(r) => r, Ok(r) => r,
Err(e) => return error_response(400, &format!("invalid Anthropic request: {e}")), Err(e) => {
tracing::warn!(
handler = "anthropic_messages",
error = %e,
"rejected: invalid Anthropic request body"
);
return error_response(400, "invalid Anthropic request body");
}
}; };
let model_id = anth_req.model.clone(); let model_id = anth_req.model.clone();
@@ -95,12 +138,32 @@ async fn anthropic_messages(
let openai_req = cortex_core::translate::anthropic_to_openai(anth_req); let openai_req = cortex_core::translate::anthropic_to_openai(anth_req);
let openai_body = match serde_json::to_vec(&openai_req) { let openai_body = match serde_json::to_vec(&openai_req) {
Ok(b) => Bytes::from(b), Ok(b) => Bytes::from(b),
Err(e) => return error_response(500, &format!("translation error: {e}")), Err(e) => {
tracing::error!(
handler = "anthropic_messages",
model = %model_id,
error = %e,
"internal: failed to serialise translated OpenAI request"
);
return error_response(500, "internal translation error");
}
}; };
let route = match router::resolve(&fleet, &model_id).await { let route = match router::resolve(&fleet, &model_id).await {
Ok(r) => r, Ok(r) => r,
Err(e) => return error_response(404, &e.to_string()), Err(e) => {
tracing::warn!(
handler = "anthropic_messages",
model = %model_id,
error = %e,
"route resolve failed"
);
// RouteError's Display strings are short and informative
// ("model 'X' not found...", "no healthy nodes available")
// — fine to surface to the caller. The warn above carries
// any extra context for operators.
return error_response(404, &e.to_string());
}
}; };
touch_model(&fleet, &route.node_name, &model_id).await; touch_model(&fleet, &route.node_name, &model_id).await;
@@ -133,12 +196,22 @@ async fn anthropic_messages(
Ok(resp) => resp, Ok(resp) => resp,
Err(e) => { Err(e) => {
metrics::counter!("cortex_request_errors_total", &labels).increment(1); metrics::counter!("cortex_request_errors_total", &labels).increment(1);
// forward_request already warn'd with the wire-level
// detail; no need to log again here.
e.into_response() e.into_response()
} }
} }
} else { } else {
// Non-streaming: proxy, buffer full response, translate back to Anthropic. // Non-streaming: proxy, buffer full response, translate back to Anthropic.
let target_url = format!("{}/v1/chat/completions", route.endpoint); let target_url = format!("{}/v1/chat/completions", route.endpoint);
tracing::info!(
handler = "anthropic_messages",
model = %model_id,
node = %route.node_name,
url = %target_url,
cold_start = route.cold_start,
"proxying request"
);
let upstream_resp = fleet let upstream_resp = fleet
.http_client .http_client
.post(&target_url) .post(&target_url)
@@ -152,28 +225,31 @@ async fn anthropic_messages(
Err(e) => { Err(e) => {
metrics::counter!("cortex_request_errors_total", &labels).increment(1); metrics::counter!("cortex_request_errors_total", &labels).increment(1);
tracing::warn!( tracing::warn!(
handler = "anthropic_messages",
model = %model_id, model = %model_id,
node = %route.node_name, node = %route.node_name,
target = %target_url, url = %target_url,
error = %e, error = %e,
"anthropic proxy: upstream request failed (network)" "upstream request failed (network)"
); );
return error_response(502, "upstream request failed"); return error_response(502, "upstream request failed");
} }
}; };
if !upstream_resp.status().is_success() { let upstream_status = upstream_resp.status();
if !upstream_status.is_success() {
metrics::counter!("cortex_request_errors_total", &labels).increment(1); metrics::counter!("cortex_request_errors_total", &labels).increment(1);
let status = upstream_resp.status().as_u16(); let status = upstream_status.as_u16();
let body = upstream_resp.text().await.unwrap_or_default(); let body = upstream_resp.text().await.unwrap_or_default();
let body_snippet = body.chars().take(512).collect::<String>(); let body_snippet = body.chars().take(512).collect::<String>();
tracing::warn!( tracing::warn!(
handler = "anthropic_messages",
model = %model_id, model = %model_id,
node = %route.node_name, node = %route.node_name,
target = %target_url, url = %target_url,
status, status,
body = %body_snippet, body = %body_snippet,
"anthropic proxy: upstream returned non-2xx" "upstream returned non-2xx"
); );
return error_response(status, &format!("upstream returned {status}")); return error_response(status, &format!("upstream returned {status}"));
} }
@@ -182,7 +258,15 @@ async fn anthropic_messages(
Ok(b) => b, Ok(b) => b,
Err(e) => { Err(e) => {
metrics::counter!("cortex_request_errors_total", &labels).increment(1); metrics::counter!("cortex_request_errors_total", &labels).increment(1);
return error_response(502, &format!("failed to read upstream response: {e}")); tracing::warn!(
handler = "anthropic_messages",
model = %model_id,
node = %route.node_name,
url = %target_url,
error = %e,
"failed to read upstream response body"
);
return error_response(502, "failed to read upstream response");
} }
}; };
@@ -191,7 +275,20 @@ async fn anthropic_messages(
Ok(r) => r, Ok(r) => r,
Err(e) => { Err(e) => {
metrics::counter!("cortex_request_errors_total", &labels).increment(1); metrics::counter!("cortex_request_errors_total", &labels).increment(1);
return error_response(502, &format!("failed to parse upstream response: {e}")); let body_snippet = String::from_utf8_lossy(&body_bytes)
.chars()
.take(512)
.collect::<String>();
tracing::warn!(
handler = "anthropic_messages",
model = %model_id,
node = %route.node_name,
url = %target_url,
error = %e,
body = %body_snippet,
"failed to parse upstream response as OpenAI ChatCompletionResponse"
);
return error_response(502, "malformed upstream response");
} }
}; };
@@ -343,6 +440,9 @@ async fn proxy_with_metrics(
} }
Err(e) => { Err(e) => {
metrics::counter!("cortex_request_errors_total", &labels).increment(1); metrics::counter!("cortex_request_errors_total", &labels).increment(1);
// proxy::forward_request already warn'd with wire-level
// detail (target URL, error, status). ProxyError::into_response
// now returns a generic message — no body leak.
e.into_response() e.into_response()
} }
} }

View File

@@ -12,6 +12,13 @@ use axum::response::{IntoResponse, Response};
use reqwest::Client; use reqwest::Client;
/// Proxy a request body to the resolved backend node and stream the response. /// Proxy a request body to the resolved backend node and stream the response.
///
/// Logging contract: every call emits exactly one structured event at
/// info / warn level for operator visibility, regardless of outcome.
/// Network-level failures and non-2xx upstream statuses are warn'd here
/// (closest to the wire); the user-facing response carries only the
/// status code and a generic message — implementation detail (body,
/// error chain) lives in the log, never in the API surface.
pub async fn forward_request( pub async fn forward_request(
client: &Client, client: &Client,
route: &RouteDecision, route: &RouteDecision,
@@ -37,10 +44,33 @@ pub async fn forward_request(
req_builder = req_builder.header(key, value); req_builder = req_builder.header(key, value);
} }
let upstream_resp = req_builder.send().await.map_err(ProxyError::Upstream)?; let upstream_resp = match req_builder.send().await {
Ok(r) => r,
Err(e) => {
tracing::warn!(
node = %route.node_name,
url = %url,
error = %e,
"proxy: upstream request failed (network)"
);
return Err(ProxyError::Upstream(e));
}
};
let status = let upstream_status = upstream_resp.status();
StatusCode::from_u16(upstream_resp.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY); if !upstream_status.is_success() {
// Streaming body — can't snippet without breaking the stream
// pass-through. Log status + URL; the client still gets the
// upstream status and the upstream body, streamed through as-is.
tracing::warn!(
node = %route.node_name,
url = %url,
status = upstream_status.as_u16(),
"proxy: upstream returned non-2xx"
);
}
let status = StatusCode::from_u16(upstream_status.as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
let resp_headers = upstream_resp.headers().clone(); let resp_headers = upstream_resp.headers().clone();
let stream = upstream_resp.bytes_stream(); let stream = upstream_resp.bytes_stream();
@@ -52,28 +82,37 @@ pub async fn forward_request(
response = response.header(key, value); response = response.header(key, value);
} }
response response.body(body).map_err(|e| {
.body(body) tracing::warn!(
.map_err(|e| ProxyError::ResponseBuild(e.to_string())) node = %route.node_name,
url = %url,
error = %e,
"proxy: failed to build response"
);
ProxyError::ResponseBuild(e.to_string())
})
} }
#[derive(Debug, thiserror::Error)] #[derive(Debug, thiserror::Error)]
pub enum ProxyError { pub enum ProxyError {
#[error("upstream request failed: {0}")] #[error("upstream request failed")]
Upstream(reqwest::Error), Upstream(reqwest::Error),
#[error("failed to build response: {0}")] #[error("failed to build response")]
ResponseBuild(String), ResponseBuild(String),
} }
impl IntoResponse for ProxyError { impl IntoResponse for ProxyError {
fn into_response(self) -> Response { fn into_response(self) -> Response {
let status = match &self { let (status, message) = match &self {
ProxyError::Upstream(_) => StatusCode::BAD_GATEWAY, ProxyError::Upstream(_) => (StatusCode::BAD_GATEWAY, "upstream request failed"),
ProxyError::ResponseBuild(_) => StatusCode::INTERNAL_SERVER_ERROR, ProxyError::ResponseBuild(_) => (
StatusCode::INTERNAL_SERVER_ERROR,
"failed to build response",
),
}; };
let body = serde_json::json!({ let body = serde_json::json!({
"error": { "error": {
"message": self.to_string(), "message": message,
"type": "proxy_error", "type": "proxy_error",
} }
}); });