feat: add per-request Prometheus metrics instrumentation

Emit cortex_requests_total, cortex_request_duration_seconds, cortex_request_errors_total, and cortex_cold_starts_total with model and node labels on every proxied request. Add install_test_recorder() for testing metrics without HTTP listener. Integration test verifies counters and histograms appear after proxy. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 19:42:09 +03:00
parent 29c8f10761
commit 67b9b044d3
4 changed files with 152 additions and 40 deletions
--- a/crates/cortex-gateway/src/handlers.rs
+++ b/crates/cortex-gateway/src/handlers.rs
@@ -2,6 +2,7 @@

 use crate::proxy;
 use crate::router;
+use crate::router::RouteDecision;
 use crate::state::CortexState;
 use axum::Router;
 use axum::body::Bytes;
@@ -13,6 +14,7 @@ use chrono::Utc;
 use cortex_core::node::{CortexModelEntry, ModelLocation};
 use serde_json::{Value, json};
 use std::sync::Arc;
+use std::time::Instant;

 pub fn api_routes() -> Router<Arc<CortexState>> {
    Router::new()
@@ -42,18 +44,15 @@ async fn chat_completions(

    touch_model(&fleet, &route.node_name, &model_id).await;

-    match proxy::forward_request(
-        &fleet.http_client,
+    proxy_with_metrics(
+        &fleet,
        &route,
        "/v1/chat/completions",
        headers,
        body,
+        &model_id,
    )
    .await
-    {
-        Ok(resp) => resp,
-        Err(e) => e.into_response(),
-    }
 }

 /// `POST /v1/completions` — proxy completions endpoint.
@@ -74,11 +73,7 @@ async fn completions(

    touch_model(&fleet, &route.node_name, &model_id).await;

-    match proxy::forward_request(&fleet.http_client, &route, "/v1/completions", headers, body).await
-    {
-        Ok(resp) => resp,
-        Err(e) => e.into_response(),
-    }
+    proxy_with_metrics(&fleet, &route, "/v1/completions", headers, body, &model_id).await
 }

 /// `POST /v1/messages` — accept Anthropic format, translate, proxy, translate back.
@@ -110,21 +105,36 @@ async fn anthropic_messages(

    touch_model(&fleet, &route.node_name, &model_id).await;

+    let labels = [
+        ("model", model_id.clone()),
+        ("node", route.node_name.clone()),
+    ];
+    metrics::counter!("cortex_requests_total", &labels).increment(1);
+    if route.cold_start {
+        metrics::counter!("cortex_cold_starts_total", &labels).increment(1);
+    }
+    let start = Instant::now();
+
    if is_streaming {
        // TODO: streaming Anthropic translation requires converting SSE format.
        // For now, proxy the OpenAI SSE stream directly (clients that can handle
        // OpenAI SSE will work; full Anthropic SSE translation is a follow-up).
-        match proxy::forward_request(
+        let result = proxy::forward_request(
            &fleet.http_client,
            &route,
            "/v1/chat/completions",
            headers,
            openai_body,
        )
-        .await
-        {
+        .await;
+        metrics::histogram!("cortex_request_duration_seconds", &labels)
+            .record(start.elapsed().as_secs_f64());
+        match result {
            Ok(resp) => resp,
-            Err(e) => e.into_response(),
+            Err(e) => {
+                metrics::counter!("cortex_request_errors_total", &labels).increment(1);
+                e.into_response()
+            }
        }
    } else {
        // Non-streaming: proxy, buffer full response, translate back to Anthropic.
@@ -138,10 +148,14 @@ async fn anthropic_messages(

        let upstream_resp = match upstream_resp {
            Ok(r) => r,
-            Err(e) => return error_response(502, &format!("upstream request failed: {e}")),
+            Err(e) => {
+                metrics::counter!("cortex_request_errors_total", &labels).increment(1);
+                return error_response(502, &format!("upstream request failed: {e}"));
+            }
        };

        if !upstream_resp.status().is_success() {
+            metrics::counter!("cortex_request_errors_total", &labels).increment(1);
            let status = upstream_resp.status().as_u16();
            let body = upstream_resp.text().await.unwrap_or_default();
            return error_response(status, &format!("upstream error: {body}"));
@@ -150,6 +164,7 @@ async fn anthropic_messages(
        let body_bytes = match upstream_resp.bytes().await {
            Ok(b) => b,
            Err(e) => {
+                metrics::counter!("cortex_request_errors_total", &labels).increment(1);
                return error_response(502, &format!("failed to read upstream response: {e}"));
            }
        };
@@ -158,10 +173,13 @@ async fn anthropic_messages(
            match serde_json::from_slice(&body_bytes) {
                Ok(r) => r,
                Err(e) => {
+                    metrics::counter!("cortex_request_errors_total", &labels).increment(1);
                    return error_response(502, &format!("failed to parse upstream response: {e}"));
                }
            };

+        metrics::histogram!("cortex_request_duration_seconds", &labels)
+            .record(start.elapsed().as_secs_f64());
        let anthropic_resp = cortex_core::translate::openai_to_anthropic(openai_resp);
        Json(json!(anthropic_resp)).into_response()
    }
@@ -216,6 +234,42 @@ async fn health(State(fleet): State<Arc<CortexState>>) -> Json<Value> {

 // ── Helpers ──────────────────────────────────────────────────────────

+/// Proxy a request with metrics instrumentation.
+async fn proxy_with_metrics(
+    fleet: &CortexState,
+    route: &RouteDecision,
+    path: &str,
+    headers: HeaderMap,
+    body: Bytes,
+    model_id: &str,
+) -> Response {
+    let labels = [
+        ("model", model_id.to_string()),
+        ("node", route.node_name.clone()),
+    ];
+
+    metrics::counter!("cortex_requests_total", &labels).increment(1);
+    if route.cold_start {
+        metrics::counter!("cortex_cold_starts_total", &labels).increment(1);
+    }
+
+    let start = Instant::now();
+    let result = proxy::forward_request(&fleet.http_client, route, path, headers, body).await;
+    let duration = start.elapsed();
+
+    match result {
+        Ok(resp) => {
+            metrics::histogram!("cortex_request_duration_seconds", &labels)
+                .record(duration.as_secs_f64());
+            resp
+        }
+        Err(e) => {
+            metrics::counter!("cortex_request_errors_total", &labels).increment(1);
+            e.into_response()
+        }
+    }
+}
+
 /// Update `last_accessed` timestamp for a model on a node (drives LRU eviction).
 async fn touch_model(fleet: &CortexState, node_name: &str, model_id: &str) {
    let mut nodes = fleet.nodes.write().await;
--- a/crates/cortex-gateway/src/metrics.rs
+++ b/crates/cortex-gateway/src/metrics.rs
@@ -18,10 +18,21 @@ pub fn install(listen: &str) -> Result<()> {
        .map_err(|e| anyhow::anyhow!("failed to install Prometheus exporter: {e}"))?;

    tracing::info!("prometheus metrics exporter on {addr}");
+    describe_metrics();
+    Ok(())
+}

-    // Register histograms and counters used by the proxy layer.
-    // The `metrics` crate lazily creates metrics on first use, but
-    // describing them up front gives Prometheus proper HELP/TYPE lines.
+/// Install a recorder for testing (no HTTP listener). Returns a handle
+/// that can render the current metrics as Prometheus text.
+pub fn install_test_recorder() -> Result<metrics_exporter_prometheus::PrometheusHandle> {
+    let handle = PrometheusBuilder::new()
+        .install_recorder()
+        .map_err(|e| anyhow::anyhow!("failed to install test recorder: {e}"))?;
+    describe_metrics();
+    Ok(handle)
+}
+
+fn describe_metrics() {
    metrics::describe_histogram!(
        "cortex_request_duration_seconds",
        "Total request latency in seconds"
@@ -44,6 +55,4 @@ pub fn install(listen: &str) -> Result<()> {
        "cortex_cold_starts_total",
        "Total number of cold-start model loads"
    );
-
-    Ok(())
 }