//! Axum HTTP handlers for the gateway API surface. use crate::proxy; use crate::router; use crate::router::RouteDecision; use crate::state::CortexState; use axum::Router; use axum::body::Bytes; use axum::extract::State; use axum::http::HeaderMap; use axum::response::{IntoResponse, Json, Response}; use axum::routing::{get, post}; use chrono::Utc; use cortex_core::node::{CortexModelEntry, ModelLocation}; use serde_json::{Value, json}; use std::sync::Arc; use std::time::Instant; pub fn api_routes() -> Router> { Router::new() .route("/v1/chat/completions", post(chat_completions)) .route("/v1/completions", post(completions)) .route("/v1/models", get(list_models)) .route("/v1/messages", post(anthropic_messages)) .route("/health", get(health)) .route("/", get(health)) } /// `POST /v1/chat/completions` — proxy to the appropriate backend node. async fn chat_completions( State(fleet): State>, headers: HeaderMap, body: Bytes, ) -> Response { let model_id = match extract_model(&body) { Some(m) => m, None => { tracing::warn!( handler = "chat_completions", "rejected: missing 'model' field in request body" ); return error_response(400, "missing 'model' field in request body"); } }; let route = match router::resolve(&fleet, &model_id).await { Ok(r) => r, Err(e) => { tracing::warn!( handler = "chat_completions", model = %model_id, error = %e, "route resolve failed" ); // RouteError's Display strings are short and informative // ("model 'X' not found...", "no healthy nodes available") // — fine to surface to the caller. The warn above carries // any extra context for operators. return error_response(404, &e.to_string()); } }; touch_model(&fleet, &route.node_name, &model_id).await; proxy_with_metrics( &fleet, &route, "/v1/chat/completions", headers, body, &model_id, ) .await } /// `POST /v1/completions` — proxy completions endpoint. async fn completions( State(fleet): State>, headers: HeaderMap, body: Bytes, ) -> Response { let model_id = match extract_model(&body) { Some(m) => m, None => { tracing::warn!( handler = "completions", "rejected: missing 'model' field in request body" ); return error_response(400, "missing 'model' field in request body"); } }; let route = match router::resolve(&fleet, &model_id).await { Ok(r) => r, Err(e) => { tracing::warn!( handler = "completions", model = %model_id, error = %e, "route resolve failed" ); // RouteError's Display strings are short and informative // ("model 'X' not found...", "no healthy nodes available") // — fine to surface to the caller. The warn above carries // any extra context for operators. return error_response(404, &e.to_string()); } }; touch_model(&fleet, &route.node_name, &model_id).await; proxy_with_metrics(&fleet, &route, "/v1/completions", headers, body, &model_id).await } /// `POST /v1/messages` — accept Anthropic format, translate, proxy, translate back. async fn anthropic_messages( State(fleet): State>, headers: HeaderMap, body: Bytes, ) -> Response { // Parse as Anthropic request. let anth_req: cortex_core::anthropic::MessagesRequest = match serde_json::from_slice(&body) { Ok(r) => r, Err(e) => { tracing::warn!( handler = "anthropic_messages", error = %e, "rejected: invalid Anthropic request body" ); return error_response(400, "invalid Anthropic request body"); } }; let model_id = anth_req.model.clone(); let is_streaming = anth_req.stream.unwrap_or(false); // Translate to OpenAI format. let openai_req = cortex_core::translate::anthropic_to_openai(anth_req); let openai_body = match serde_json::to_vec(&openai_req) { Ok(b) => Bytes::from(b), Err(e) => { tracing::error!( handler = "anthropic_messages", model = %model_id, error = %e, "internal: failed to serialise translated OpenAI request" ); return error_response(500, "internal translation error"); } }; let route = match router::resolve(&fleet, &model_id).await { Ok(r) => r, Err(e) => { tracing::warn!( handler = "anthropic_messages", model = %model_id, error = %e, "route resolve failed" ); // RouteError's Display strings are short and informative // ("model 'X' not found...", "no healthy nodes available") // — fine to surface to the caller. The warn above carries // any extra context for operators. return error_response(404, &e.to_string()); } }; touch_model(&fleet, &route.node_name, &model_id).await; let labels = [ ("model", model_id.clone()), ("node", route.node_name.clone()), ]; metrics::counter!("cortex_requests_total", &labels).increment(1); if route.cold_start { metrics::counter!("cortex_cold_starts_total", &labels).increment(1); } let start = Instant::now(); if is_streaming { // TODO: streaming Anthropic translation requires converting SSE format. // For now, proxy the OpenAI SSE stream directly (clients that can handle // OpenAI SSE will work; full Anthropic SSE translation is a follow-up). let result = proxy::forward_request( &fleet.http_client, &route, "/v1/chat/completions", headers, openai_body, ) .await; metrics::histogram!("cortex_request_duration_seconds", &labels) .record(start.elapsed().as_secs_f64()); match result { Ok(resp) => resp, Err(e) => { metrics::counter!("cortex_request_errors_total", &labels).increment(1); // forward_request already warn'd with the wire-level // detail; no need to log again here. e.into_response() } } } else { // Non-streaming: proxy, buffer full response, translate back to Anthropic. let target_url = format!("{}/v1/chat/completions", route.endpoint); tracing::info!( handler = "anthropic_messages", model = %model_id, node = %route.node_name, url = %target_url, cold_start = route.cold_start, "proxying request" ); let upstream_resp = fleet .http_client .post(&target_url) .body(openai_body) .header("content-type", "application/json") .send() .await; let upstream_resp = match upstream_resp { Ok(r) => r, Err(e) => { metrics::counter!("cortex_request_errors_total", &labels).increment(1); tracing::warn!( handler = "anthropic_messages", model = %model_id, node = %route.node_name, url = %target_url, error = %e, "upstream request failed (network)" ); return error_response(502, "upstream request failed"); } }; let upstream_status = upstream_resp.status(); if !upstream_status.is_success() { metrics::counter!("cortex_request_errors_total", &labels).increment(1); let status = upstream_status.as_u16(); let body = upstream_resp.text().await.unwrap_or_default(); let body_snippet = body.chars().take(512).collect::(); tracing::warn!( handler = "anthropic_messages", model = %model_id, node = %route.node_name, url = %target_url, status, body = %body_snippet, "upstream returned non-2xx" ); return error_response(status, &format!("upstream returned {status}")); } let body_bytes = match upstream_resp.bytes().await { Ok(b) => b, Err(e) => { metrics::counter!("cortex_request_errors_total", &labels).increment(1); tracing::warn!( handler = "anthropic_messages", model = %model_id, node = %route.node_name, url = %target_url, error = %e, "failed to read upstream response body" ); return error_response(502, "failed to read upstream response"); } }; let openai_resp: cortex_core::openai::ChatCompletionResponse = match serde_json::from_slice(&body_bytes) { Ok(r) => r, Err(e) => { metrics::counter!("cortex_request_errors_total", &labels).increment(1); let body_snippet = String::from_utf8_lossy(&body_bytes) .chars() .take(512) .collect::(); tracing::warn!( handler = "anthropic_messages", model = %model_id, node = %route.node_name, url = %target_url, error = %e, body = %body_snippet, "failed to parse upstream response as OpenAI ChatCompletionResponse" ); return error_response(502, "malformed upstream response"); } }; metrics::histogram!("cortex_request_duration_seconds", &labels) .record(start.elapsed().as_secs_f64()); let anthropic_resp = cortex_core::translate::openai_to_anthropic(openai_resp); Json(json!(anthropic_resp)).into_response() } } /// `GET /v1/models` — union of (catalogue × topology feasibility) and /// (currently loaded somewhere). The result is what the fleet *could* /// serve, not just what's already loaded — so OpenAI-compatible tools /// see every model the operator has provisioned, and cortex /// transparently cold-loads the first time one is requested. async fn list_models(State(fleet): State>) -> Json { use std::collections::HashMap; let now = Utc::now().timestamp() as u64; let nodes = fleet.nodes.read().await; let catalogue = &fleet.catalogue; let mut entries: HashMap = HashMap::new(); // Pass 1: catalogue × topology. For every catalogue profile, find // healthy neurons whose discovered devices satisfy the profile. // Catalogue-defined models surface here even if nothing has loaded // them yet — that's the point of the unified endpoint. for profile in &catalogue.models { let mut feasible_on = Vec::new(); for node in nodes.values() { if !node.healthy { continue; } let Some(disc) = node.discovery.as_ref() else { continue; }; if profile.is_feasible_on(&node.name, &disc.devices) { feasible_on.push(node.name.clone()); } } if feasible_on.is_empty() { // The catalogue lists this model but no neuron's topology // matches — surface it as not-loaded with no feasible // location. Hides nothing; lets operators see why a // configured model isn't reachable. feasible_on.clear(); } entries.insert( profile.id.clone(), CortexModelEntry { id: profile.id.clone(), object: "model".into(), created: now, owned_by: "helexa".into(), loaded: false, feasible_on, locations: Vec::new(), }, ); } // Pass 2: layer the actually-loaded state on top. For each // (node, model) entry, attach a ModelLocation. If the model isn't // in the catalogue, create a new CortexModelEntry from scratch — // cortex doesn't refuse to surface a manually-loaded model just // because the operator didn't enumerate it in models.toml. for node in nodes.values() { for (model_id, entry) in &node.models { let location = ModelLocation { node: node.name.clone(), status: entry.status, vram_estimate_mb: entry.vram_estimate_mb, }; let was_loaded = matches!(entry.status, cortex_core::node::ModelStatus::Loaded); entries .entry(model_id.clone()) .and_modify(|e| { e.locations.push(location.clone()); if was_loaded { e.loaded = true; } }) .or_insert_with(|| CortexModelEntry { id: model_id.clone(), object: "model".into(), created: now, owned_by: "helexa".into(), loaded: was_loaded, // Not in catalogue — cortex has no opinion on // feasibility; leave empty. feasible_on: Vec::new(), locations: vec![location], }); } } // Pass 3: surface pre-warming models. Each neuron's `/health` // activation snapshot (polled separately from /models) reports // `in_progress` (the model currently materialising) and `pending` // (queued behind it). Neither appears on the neuron's `/models` // yet — that endpoint only knows about fully-loaded handles — so // without this pass a client polling `/v1/models` during pre-warm // sees Qwen3.6-27B with no location and concludes "not there". // Synthesising a Loading location instead tells clients the model // is on its way. Idempotent against Pass 2: if a Loading location // for this node already exists (shouldn't, but be safe) we skip. for node in nodes.values() { let Some(activation) = node.activation.as_ref() else { continue; }; let mut loading_ids: Vec<&str> = Vec::new(); if let Some(id) = activation.in_progress.as_deref() { loading_ids.push(id); } for id in &activation.pending { loading_ids.push(id.as_str()); } for model_id in loading_ids { let location = ModelLocation { node: node.name.clone(), status: cortex_core::node::ModelStatus::Loading, vram_estimate_mb: None, }; entries .entry(model_id.to_string()) .and_modify(|e| { let already = e.locations.iter().any(|l| { l.node == node.name && l.status == cortex_core::node::ModelStatus::Loading }); if !already { e.locations.push(location.clone()); } }) .or_insert_with(|| CortexModelEntry { id: model_id.to_string(), object: "model".into(), created: now, owned_by: "helexa".into(), loaded: false, feasible_on: Vec::new(), locations: vec![location], }); } } let data: Vec = entries.values().map(|e| json!(e)).collect(); Json(json!({ "object": "list", "data": data, })) } /// `GET /health` async fn health(State(fleet): State>) -> Json { let nodes = fleet.nodes.read().await; let healthy_count = nodes.values().filter(|n| n.healthy).count(); let total_count = nodes.len(); Json(json!({ "status": if healthy_count > 0 { "ok" } else { "degraded" }, "nodes": { "healthy": healthy_count, "total": total_count, } })) } // ── Helpers ────────────────────────────────────────────────────────── /// Proxy a request with metrics instrumentation. async fn proxy_with_metrics( fleet: &CortexState, route: &RouteDecision, path: &str, headers: HeaderMap, body: Bytes, model_id: &str, ) -> Response { let labels = [ ("model", model_id.to_string()), ("node", route.node_name.clone()), ]; metrics::counter!("cortex_requests_total", &labels).increment(1); if route.cold_start { metrics::counter!("cortex_cold_starts_total", &labels).increment(1); } let start = Instant::now(); let result = proxy::forward_request(&fleet.http_client, route, path, headers, body).await; let duration = start.elapsed(); match result { Ok(resp) => { metrics::histogram!("cortex_request_duration_seconds", &labels) .record(duration.as_secs_f64()); resp } Err(e) => { metrics::counter!("cortex_request_errors_total", &labels).increment(1); // proxy::forward_request already warn'd with wire-level // detail (target URL, error, status). ProxyError::into_response // now returns a generic message — no body leak. e.into_response() } } } /// Update `last_accessed` timestamp for a model on a node (drives LRU eviction). async fn touch_model(fleet: &CortexState, node_name: &str, model_id: &str) { let mut nodes = fleet.nodes.write().await; if let Some(node) = nodes.get_mut(node_name) && let Some(entry) = node.models.get_mut(model_id) { entry.last_accessed = Some(Utc::now()); } } fn extract_model(body: &[u8]) -> Option { let v: Value = serde_json::from_slice(body).ok()?; v.get("model")?.as_str().map(|s| s.to_string()) } fn error_response(status: u16, message: &str) -> Response { let code = axum::http::StatusCode::from_u16(status) .unwrap_or(axum::http::StatusCode::INTERNAL_SERVER_ERROR); let body = json!({ "error": { "message": message, "type": "gateway_error", } }); (code, Json(body)).into_response() }