feat: scaffold cortex workspace

Rust reverse-proxy for multi-node mistral.rs inference clusters. Includes crate structure (cortex-core, cortex-gateway, cortex-agent, cortex-cli), config loading, OpenAI/Anthropic translation stubs, model routing, eviction, polling, and streaming proxy scaffolding.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 18:13:30 +03:00
commit 0da68833af
28 changed files with 4659 additions and 0 deletions
--- a/crates/cortex-core/Cargo.toml
+++ b/crates/cortex-core/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "cortex-core"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+serde.workspace = true
+serde_json.workspace = true
+toml.workspace = true
+figment.workspace = true
+chrono.workspace = true
+anyhow.workspace = true
+thiserror.workspace = true
+tracing.workspace = true
--- a/crates/cortex-core/src/anthropic.rs
+++ b/crates/cortex-core/src/anthropic.rs
@@ -0,0 +1,87 @@
+//! Anthropic Messages API request and response types.
+//!
+//! These mirror the `/v1/messages` format used by the Anthropic API.
+//! The gateway accepts these, translates to OpenAI format, proxies to
+//! mistral.rs, then translates the response back.
+
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+
+// ── Messages request ─────────────────────────────────────────────────
+
+/// Request body for the Anthropic-style `/v1/messages` endpoint.
+///
+/// `model`, `messages` and `max_tokens` have no serde default, so they must be
+/// present when deserializing. The `Option` fields are left out of serialized
+/// output when they are `None`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct MessagesRequest {
+    /// Model identifier named by the client.
+    pub model: String,
+    /// Messages supplied by the client.
+    pub messages: Vec<AnthropicMessage>,
+    /// Maximum number of tokens to generate.
+    pub max_tokens: u64,
+    /// Optional system prompt, either plain text or a list of blocks.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub system: Option<SystemPrompt>,
+    /// Optional sampling temperature.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f64>,
+    /// Optional nucleus-sampling parameter.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_p: Option<f64>,
+    /// Optional streaming flag.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stream: Option<bool>,
+    /// Every other top-level key, collected by `#[serde(flatten)]`.
+    #[serde(flatten)]
+    pub extra: Value,
+}
+
+/// System prompt in either of its two accepted shapes.
+///
+/// The enum is untagged: a JSON string deserializes to `Text`, a JSON array
+/// to `Blocks`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum SystemPrompt {
+    /// Plain-text prompt.
+    Text(String),
+    /// Prompt given as an array; the elements are kept as raw JSON values.
+    Blocks(Vec<Value>),
+}
+
+/// One entry of `MessagesRequest::messages`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnthropicMessage {
+    /// Role string as sent by the client; it is not validated by this type.
+    pub role: String,
+    /// Message body, either plain text or a list of content blocks.
+    pub content: AnthropicContent,
+}
+
+/// Message content in either of its two accepted shapes.
+///
+/// The enum is untagged: a JSON string deserializes to `Text`, a JSON array
+/// of block objects to `Blocks`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum AnthropicContent {
+    /// Plain-text content.
+    Text(String),
+    /// Content given as a list of typed blocks.
+    Blocks(Vec<ContentBlock>),
+}
+
+/// A single content block: a `type` discriminator plus whatever other keys
+/// the block carries.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ContentBlock {
+    /// Value of the JSON `type` key (renamed because `type` is a Rust keyword).
+    #[serde(rename = "type")]
+    pub block_type: String,
+    /// All remaining keys of the block, collected by `#[serde(flatten)]`.
+    #[serde(flatten)]
+    pub data: Value,
+}
+
+// ── Messages response ────────────────────────────────────────────────
+
+/// Response body in the Anthropic `/v1/messages` shape.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct MessagesResponse {
+    /// Response identifier.
+    pub id: String,
+    /// Value of the JSON `type` key (renamed because `type` is a Rust keyword).
+    #[serde(rename = "type")]
+    pub response_type: String,
+    /// Role string of the responding party.
+    pub role: String,
+    /// Content blocks making up the reply.
+    pub content: Vec<ContentBlock>,
+    /// Model identifier.
+    pub model: String,
+    /// Why generation stopped. There is no `skip_serializing_if`, so `None`
+    /// is written out as JSON `null`.
+    pub stop_reason: Option<String>,
+    /// Token accounting for the request.
+    pub usage: AnthropicUsage,
+    /// Every other top-level key, collected by `#[serde(flatten)]`.
+    #[serde(flatten)]
+    pub extra: Value,
+}
+
+/// Token counts reported in a `MessagesResponse`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnthropicUsage {
+    /// Number of input (prompt) tokens.
+    pub input_tokens: u64,
+    /// Number of output (generated) tokens.
+    pub output_tokens: u64,
+}
+
+// ── Streaming events ─────────────────────────────────────────────────
+
+/// A streaming event: a `type` discriminator plus an untyped payload.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct StreamEvent {
+    /// Value of the JSON `type` key (renamed because `type` is a Rust keyword).
+    #[serde(rename = "type")]
+    pub event_type: String,
+    /// All remaining keys of the event, collected by `#[serde(flatten)]`.
+    #[serde(flatten)]
+    pub data: Value,
+}
--- a/crates/cortex-core/src/config.rs
+++ b/crates/cortex-core/src/config.rs
@@ -0,0 +1,79 @@
+use figment::{
+    Figment,
+    providers::{Env, Format, Toml},
+};
+use serde::{Deserialize, Serialize};
+use std::path::Path;
+
+/// Top-level gateway configuration.
+///
+/// None of the three sections carries a serde default, so each one must be
+/// supplied by the configuration source when deserializing.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct GatewayConfig {
+    /// Listen addresses for the gateway itself.
+    pub gateway: GatewaySettings,
+    /// Eviction behaviour.
+    pub eviction: EvictionSettings,
+    /// Nodes the gateway knows about.
+    pub nodes: Vec<NodeConfig>,
+}
+
+/// Listen addresses for the gateway.
+///
+/// Both addresses are stored as plain strings; this type does not parse or
+/// validate them.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct GatewaySettings {
+    /// Address to listen on for API requests (e.g. "0.0.0.0:8000")
+    pub listen: String,
+    /// Address to listen on for Prometheus metrics (e.g. "0.0.0.0:9100")
+    pub metrics_listen: String,
+}
+
+/// Settings that control model eviction.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EvictionSettings {
+    /// Eviction strategy: "lru" or "priority"
+    pub strategy: EvictionStrategy,
+    /// Restart the mistralrs process after this many load/unload cycles
+    /// to reclaim fragmented VRAM. 0 = never.
+    ///
+    /// When the key is absent from the config source, `#[serde(default)]`
+    /// yields 0. NOTE(review): `GatewayConfig::default()` uses 50 instead;
+    /// confirm which default is intended.
+    #[serde(default)]
+    pub defrag_after_cycles: u32,
+}
+
+/// Available eviction strategies.
+///
+/// `rename_all = "lowercase"` makes the serialized names `"lru"` and
+/// `"priority"`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum EvictionStrategy {
+    /// Serialized as `"lru"`.
+    Lru,
+    /// Serialized as `"priority"`.
+    Priority,
+}
+
+/// Static configuration for a single node.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct NodeConfig {
+    /// Human-readable node name (e.g. "gpu-large")
+    pub name: String,
+    /// Base URL of the mistralrs HTTP server (e.g. "http://gpu-large.internal:8080")
+    pub endpoint: String,
+    /// Total VRAM in MB across all GPUs on this node
+    pub vram_mb: u64,
+    /// Model IDs that should never be evicted from this node.
+    /// Defaults to an empty list when the key is absent.
+    #[serde(default)]
+    pub pinned: Vec<String>,
+}
+
+impl GatewayConfig {
+    /// Build a `GatewayConfig` from the TOML file at `path`, then layer
+    /// environment variables on top of it.
+    ///
+    /// Only variables starting with `CORTEX_` are considered, and `__` splits
+    /// a variable name into nested keys
+    /// (e.g. `CORTEX_GATEWAY__LISTEN=0.0.0.0:9000`).
+    ///
+    /// # Errors
+    ///
+    /// Returns the `figment::Error` produced when the merged data cannot be
+    /// extracted into a `GatewayConfig`.
+    pub fn load(path: impl AsRef<Path>) -> Result<Self, figment::Error> {
+        let file_layer = Toml::file(path);
+        let env_layer = Env::prefixed("CORTEX_").split("__");
+
+        // The environment is merged last so that it overrides the file.
+        let layered = Figment::new().merge(file_layer).merge(env_layer);
+        layered.extract()
+    }
+}
+
+impl Default for GatewayConfig {
+    /// Defaults: API on `0.0.0.0:8000`, metrics on `0.0.0.0:9100`, LRU
+    /// eviction with a defrag restart every 50 cycles, and no nodes.
+    fn default() -> Self {
+        let gateway = GatewaySettings {
+            listen: String::from("0.0.0.0:8000"),
+            metrics_listen: String::from("0.0.0.0:9100"),
+        };
+        let eviction = EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 50,
+        };
+
+        Self {
+            gateway,
+            eviction,
+            nodes: Vec::new(),
+        }
+    }
+}
--- a/crates/cortex-core/src/lib.rs
+++ b/crates/cortex-core/src/lib.rs
@@ -0,0 +1,6 @@
+pub mod anthropic;
+pub mod config;
+pub mod metrics;
+pub mod node;
+pub mod openai;
+pub mod translate;
--- a/crates/cortex-core/src/metrics.rs
+++ b/crates/cortex-core/src/metrics.rs
@@ -0,0 +1,23 @@
+//! Request-level metrics captured by the gateway proxy layer.
+
+use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize};
+
+/// Metrics captured for a single proxied request.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RequestMetrics {
+    /// UTC timestamp recorded for the request. Which moment of the request
+    /// it marks is not fixed by this type.
+    pub timestamp: DateTime<Utc>,
+    /// Model identifier the request was for.
+    pub model: String,
+    /// Node name associated with the request (presumably the node that
+    /// served it; confirm against the gateway code).
+    pub node: String,
+    /// Number of prompt tokens.
+    pub prompt_tokens: u64,
+    /// Number of completion tokens.
+    pub completion_tokens: u64,
+    /// Total token count. Stored as its own field; this type does not derive
+    /// it from the two counts above.
+    pub total_tokens: u64,
+    /// Tokens per second for the generation phase.
+    pub tok_per_sec: f64,
+    /// Time from request start to first SSE chunk (streaming) or full response.
+    pub time_to_first_token_ms: u64,
+    /// Total request latency including proxy overhead.
+    pub total_latency_ms: u64,
+    /// Whether this request triggered a model load (cold start).
+    pub cold_start: bool,
+}
--- a/crates/cortex-core/src/node.rs
+++ b/crates/cortex-core/src/node.rs
@@ -0,0 +1,74 @@
+use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+/// Runtime state of a single node in the fleet.
+///
+/// Has no serde derives, so it is not (de)serializable. The first four
+/// fields share names and types with `crate::config::NodeConfig`
+/// (presumably copied from it; confirm at the construction site).
+#[derive(Debug, Clone)]
+pub struct NodeState {
+    /// Node name.
+    pub name: String,
+    /// Node endpoint.
+    pub endpoint: String,
+    /// VRAM figure for the node, in MB.
+    pub vram_mb: u64,
+    /// Pinned model IDs.
+    pub pinned: Vec<String>,
+    /// Health flag for the node.
+    pub healthy: bool,
+    /// Models known on this node, keyed by a string (presumably the model
+    /// id; confirm against the code that fills the map).
+    pub models: HashMap<String, ModelEntry>,
+    /// Number of load/unload cycles since last process restart.
+    pub lifecycle_cycles: u32,
+    /// UTC time of the most recent poll; `None` until one is recorded.
+    pub last_poll: Option<DateTime<Utc>>,
+}
+
+/// A model registered on a node, with its runtime status.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelEntry {
+    /// Model identifier.
+    pub id: String,
+    /// Current lifecycle status.
+    pub status: ModelStatus,
+    /// When this model was last used (for LRU eviction).
+    pub last_accessed: Option<DateTime<Utc>>,
+    /// Estimated VRAM usage in MB when loaded.
+    pub vram_estimate_mb: Option<u64>,
+}
+
+/// Model lifecycle status, matching the mistral.rs API.
+///
+/// `rename_all = "lowercase"` makes the serialized names `"loaded"`,
+/// `"unloaded"` and `"reloading"`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum ModelStatus {
+    /// Serialized as `"loaded"`.
+    Loaded,
+    /// Serialized as `"unloaded"`.
+    Unloaded,
+    /// Serialized as `"reloading"`.
+    Reloading,
+}
+
+/// Unified model entry as exposed by the gateway's `/v1/models` endpoint.
+/// Includes which node(s) host this model and their status.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CortexModelEntry {
+    /// Model identifier.
+    pub id: String,
+    /// Value of the JSON `object` key; this type does not constrain it.
+    pub object: String,
+    /// Which nodes have this model (and their status).
+    pub locations: Vec<ModelLocation>,
+}
+
+/// Placement of a model on one node.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelLocation {
+    /// Name of the node.
+    pub node: String,
+    /// Status of the model on that node.
+    pub status: ModelStatus,
+    /// VRAM estimate in MB, when one is available.
+    pub vram_estimate_mb: Option<u64>,
+}
+
+/// Response from mistral.rs `GET /v1/models`.
+/// This is the upstream format we parse when polling nodes.
+///
+/// Deserialize-only; only the `data` array is read from the response.
+#[derive(Debug, Clone, Deserialize)]
+pub struct MistralModelsResponse {
+    /// Model entries listed by the upstream server.
+    pub data: Vec<MistralModelEntry>,
+}
+
+/// One element of `MistralModelsResponse::data`. Deserialize-only.
+#[derive(Debug, Clone, Deserialize)]
+pub struct MistralModelEntry {
+    /// Model identifier.
+    pub id: String,
+    /// Raw status string; `None` when the key is absent. It is kept as a
+    /// string here and is not parsed into `ModelStatus` by this type.
+    #[serde(default)]
+    pub status: Option<String>,
+}
+
+/// Request body for mistral.rs model lifecycle endpoints.
+///
+/// Serialize-only; it is written as a JSON object with a single
+/// `model_id` key.
+#[derive(Debug, Clone, Serialize)]
+pub struct ModelLifecycleRequest {
+    /// Identifier of the model the lifecycle call targets.
+    pub model_id: String,
+}
--- a/crates/cortex-core/src/openai.rs
+++ b/crates/cortex-core/src/openai.rs
@@ -0,0 +1,122 @@
+//! OpenAI-compatible request and response types.
+//!
+//! These are a subset sufficient for chat completions (streaming + non-streaming).
+//! Fields not relevant to proxying are captured as `serde_json::Value` via
+//! `#[serde(flatten)]` so we forward them without needing to enumerate every
+//! extension field mistral.rs supports.
+
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+
+// ── Chat completion request ──────────────────────────────────────────
+
+/// OpenAI-style chat completion request.
+///
+/// Only `model` and `messages` are required when deserializing. The `Option`
+/// fields are left out of serialized output when they are `None`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChatCompletionRequest {
+    /// Model identifier named by the client.
+    pub model: String,
+    /// Chat messages supplied by the client.
+    pub messages: Vec<ChatMessage>,
+    /// Optional sampling temperature.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f64>,
+    /// Optional nucleus-sampling parameter.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_p: Option<f64>,
+    /// Optional cap on generated tokens.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_tokens: Option<u64>,
+    /// Optional streaming flag.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stream: Option<bool>,
+    /// All other fields (tools, response_format, mistral.rs extensions, etc.)
+    #[serde(flatten)]
+    pub extra: Value,
+}
+
+/// A single chat message, used in both requests and responses.
+///
+/// NOTE(review): `content` is required and must be a JSON string or array.
+/// A message whose `content` is JSON `null` (as OpenAI-style tool-call
+/// responses may send) matches neither `MessageContent` variant; confirm
+/// whether that case needs handling.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChatMessage {
+    /// Role string; it is not validated by this type.
+    pub role: String,
+    /// Message body, either plain text or a list of parts.
+    pub content: MessageContent,
+    /// All remaining keys of the message, collected by `#[serde(flatten)]`.
+    #[serde(flatten)]
+    pub extra: Value,
+}
+
+/// Content can be a simple string or an array of content parts (for vision).
+///
+/// The enum is untagged: a JSON string deserializes to `Text`, a JSON array
+/// to `Parts`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum MessageContent {
+    /// Plain-text content.
+    Text(String),
+    /// Content parts, kept as raw JSON values.
+    Parts(Vec<Value>),
+}
+
+// ── Chat completion response (non-streaming) ─────────────────────────
+
+/// OpenAI-style chat completion response (non-streaming).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChatCompletionResponse {
+    /// Response identifier.
+    pub id: String,
+    /// Value of the JSON `object` key; this type does not constrain it.
+    pub object: String,
+    /// Value of the JSON `created` key, as an unsigned integer.
+    pub created: u64,
+    /// Model identifier.
+    pub model: String,
+    /// Completion choices.
+    pub choices: Vec<ChatCompletionChoice>,
+    /// Token accounting; omitted from serialized output when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub usage: Option<Usage>,
+    /// Every other top-level key, collected by `#[serde(flatten)]`.
+    #[serde(flatten)]
+    pub extra: Value,
+}
+
+/// One choice of a non-streaming chat completion response.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChatCompletionChoice {
+    /// Index of the choice.
+    pub index: u32,
+    /// The generated message.
+    pub message: ChatMessage,
+    /// Why generation stopped. There is no `skip_serializing_if`, so `None`
+    /// is written out as JSON `null`.
+    pub finish_reason: Option<String>,
+    /// All remaining keys of the choice, collected by `#[serde(flatten)]`.
+    #[serde(flatten)]
+    pub extra: Value,
+}
+
+// ── Streaming chunk ──────────────────────────────────────────────────
+
+/// One chunk of a streaming chat completion.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChatCompletionChunk {
+    /// Response identifier.
+    pub id: String,
+    /// Value of the JSON `object` key; this type does not constrain it.
+    pub object: String,
+    /// Value of the JSON `created` key, as an unsigned integer.
+    pub created: u64,
+    /// Model identifier.
+    pub model: String,
+    /// Per-choice deltas carried by this chunk.
+    pub choices: Vec<ChunkChoice>,
+    /// Token accounting; omitted from serialized output when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub usage: Option<Usage>,
+    /// Every other top-level key, collected by `#[serde(flatten)]`.
+    #[serde(flatten)]
+    pub extra: Value,
+}
+
+/// One choice inside a streaming chunk.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChunkChoice {
+    /// Index of the choice.
+    pub index: u32,
+    /// Incremental payload, kept as raw JSON.
+    pub delta: Value,
+    /// Why generation stopped. There is no `skip_serializing_if`, so `None`
+    /// is written out as JSON `null`.
+    pub finish_reason: Option<String>,
+    /// All remaining keys of the choice, collected by `#[serde(flatten)]`.
+    #[serde(flatten)]
+    pub extra: Value,
+}
+
+// ── Usage ────────────────────────────────────────────────────────────
+
+/// Token counts reported in OpenAI-style responses.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Usage {
+    /// Number of prompt tokens.
+    pub prompt_tokens: u64,
+    /// Number of completion tokens.
+    pub completion_tokens: u64,
+    /// Total token count. Stored as its own field; this type does not derive
+    /// it from the two counts above.
+    pub total_tokens: u64,
+}
+
+// ── Models list response ─────────────────────────────────────────────
+
+/// OpenAI-style models list response.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelsResponse {
+    /// Value of the JSON `object` key; this type does not constrain it.
+    pub object: String,
+    /// The listed models.
+    pub data: Vec<ModelObject>,
+}
+
+/// One entry of a models list response.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelObject {
+    /// Model identifier.
+    pub id: String,
+    /// Value of the JSON `object` key; this type does not constrain it.
+    pub object: String,
+    /// Owner string; omitted from serialized output when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub owned_by: Option<String>,
+    /// Gateway extensions: which node(s) host this model.
+    /// Omitted from serialized output when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub locations: Option<Vec<super::node::ModelLocation>>,
+    /// All remaining keys of the entry, collected by `#[serde(flatten)]`.
+    #[serde(flatten)]
+    pub extra: Value,
+}
--- a/crates/cortex-core/src/translate.rs
+++ b/crates/cortex-core/src/translate.rs
@@ -0,0 +1,114 @@
+//! Translation between OpenAI and Anthropic request/response envelopes.
+//!
+//! This is a stateless transformation — no context is carried between requests.
+
+use crate::anthropic::{
+    AnthropicContent, AnthropicMessage, AnthropicUsage, ContentBlock, MessagesRequest,
+    MessagesResponse, SystemPrompt,
+};
+use crate::openai::{
+    ChatCompletionChoice, ChatCompletionRequest, ChatCompletionResponse, ChatMessage, Usage,
+    MessageContent,
+};
+use serde_json::{json, Value};
+
+/// Convert an Anthropic Messages request into an OpenAI ChatCompletion request.
+///
+/// Mapping:
+///
+/// * `system` becomes a leading `"system"` message. A plain-text prompt is
+///   used verbatim. A block-form prompt contributes the string `text` field
+///   of each block, joined with newlines, so the model receives the prompt
+///   text rather than a JSON dump of the blocks. If no block carries a string
+///   `text` field, the blocks are forwarded as serialized JSON so nothing is
+///   silently dropped. An empty block list produces no system message.
+/// * Each message keeps its role. Plain-text content and a lone `text` block
+///   become plain text; any other block list is forwarded as content parts,
+///   one part per block, unchanged.
+/// * `max_tokens` is always set; `temperature`, `top_p` and `stream` are
+///   copied through.
+/// * `req.extra` is forwarded as-is.
+///   NOTE(review): Anthropic-only keys in `extra` are not renamed or removed
+///   here; confirm the upstream server tolerates them.
+///
+/// Synthesized messages carry an empty JSON object in `extra`, which is the
+/// shape a flattened `Value` has after deserialization.
+pub fn anthropic_to_openai(req: MessagesRequest) -> ChatCompletionRequest {
+    // One slot per incoming message plus one for a possible system message.
+    let mut messages = Vec::with_capacity(req.messages.len() + 1);
+
+    // Anthropic `system` field becomes a system message.
+    let system_text = req.system.and_then(|system| match system {
+        SystemPrompt::Text(t) => Some(t),
+        SystemPrompt::Blocks(blocks) if blocks.is_empty() => None,
+        SystemPrompt::Blocks(blocks) => {
+            let texts: Vec<&str> = blocks
+                .iter()
+                .filter_map(|b| b.get("text").and_then(Value::as_str))
+                .collect();
+            if texts.is_empty() {
+                // No recognisable text: keep the previous behaviour and
+                // forward the raw JSON instead of discarding the prompt.
+                Some(serde_json::to_string(&blocks).unwrap_or_default())
+            } else {
+                Some(texts.join("\n"))
+            }
+        }
+    });
+    if let Some(content) = system_text {
+        messages.push(ChatMessage {
+            role: "system".into(),
+            content: MessageContent::Text(content),
+            extra: json!({}),
+        });
+    }
+
+    // Convert message roles and content.
+    for msg in req.messages {
+        let content = match msg.content {
+            AnthropicContent::Text(t) => MessageContent::Text(t),
+            AnthropicContent::Blocks(blocks) => {
+                // For simple text-only blocks, extract the text.
+                // For mixed content (images, etc.), pass as parts.
+                if blocks.len() == 1 && blocks[0].block_type == "text" {
+                    let text = blocks[0]
+                        .data
+                        .get("text")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("")
+                        .to_string();
+                    MessageContent::Text(text)
+                } else {
+                    MessageContent::Parts(
+                        blocks.into_iter().map(|b| json!(b)).collect(),
+                    )
+                }
+            }
+        };
+        messages.push(ChatMessage {
+            role: msg.role,
+            content,
+            extra: json!({}),
+        });
+    }
+
+    ChatCompletionRequest {
+        model: req.model,
+        messages,
+        temperature: req.temperature,
+        top_p: req.top_p,
+        max_tokens: Some(req.max_tokens),
+        stream: req.stream,
+        extra: req.extra,
+    }
+}
+
+/// Convert an OpenAI ChatCompletion response into an Anthropic Messages response.
+///
+/// Mapping:
+///
+/// * Only the first choice is used; with no choices the reply text is empty
+///   and `stop_reason` is `None`.
+/// * Plain-text content is used verbatim. Array-form content contributes the
+///   string `text` field of each part, concatenated in order, so the client
+///   receives the reply text rather than a JSON dump of the parts. If a
+///   non-empty array has no part with a string `text` field, the parts are
+///   forwarded as serialized JSON so nothing is silently dropped.
+/// * `finish_reason` `"stop"` maps to `"end_turn"` and `"length"` to
+///   `"max_tokens"`; any other value is passed through unchanged.
+/// * Missing `usage` is reported as zero tokens.
+///
+/// The result always contains exactly one `text` content block, and its
+/// `extra` is an empty JSON object, which is the shape a flattened `Value`
+/// has after deserialization.
+pub fn openai_to_anthropic(resp: ChatCompletionResponse) -> MessagesResponse {
+    let choice = resp.choices.into_iter().next();
+
+    let (content_text, stop_reason) = match choice {
+        Some(c) => {
+            let text = match c.message.content {
+                MessageContent::Text(t) => t,
+                MessageContent::Parts(parts) => {
+                    let texts: Vec<&str> = parts
+                        .iter()
+                        .filter_map(|p| p.get("text").and_then(Value::as_str))
+                        .collect();
+                    if texts.is_empty() && !parts.is_empty() {
+                        // No recognisable text: keep the previous behaviour
+                        // and forward the raw JSON.
+                        serde_json::to_string(&parts).unwrap_or_default()
+                    } else {
+                        texts.concat()
+                    }
+                }
+            };
+            let stop = c.finish_reason.map(|r| match r.as_str() {
+                "stop" => "end_turn".to_string(),
+                "length" => "max_tokens".to_string(),
+                other => other.to_string(),
+            });
+            (text, stop)
+        }
+        None => (String::new(), None),
+    };
+
+    let usage = resp.usage.unwrap_or(Usage {
+        prompt_tokens: 0,
+        completion_tokens: 0,
+        total_tokens: 0,
+    });
+
+    MessagesResponse {
+        id: resp.id,
+        response_type: "message".into(),
+        role: "assistant".into(),
+        content: vec![ContentBlock {
+            block_type: "text".into(),
+            data: json!({ "text": content_text }),
+        }],
+        model: resp.model,
+        stop_reason,
+        usage: AnthropicUsage {
+            input_tokens: usage.prompt_tokens,
+            output_tokens: usage.completion_tokens,
+        },
+        extra: json!({}),
+    }
+}