feat: scaffold cortex workspace

Rust reverse-proxy for multi-node mistral.rs inference clusters.
Includes crate structure (cortex-core, cortex-gateway, cortex-agent,
cortex-cli), config loading, OpenAI/Anthropic translation stubs,
model routing, eviction, polling, and streaming proxy scaffolding.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-14 18:13:30 +03:00
commit 0da68833af
28 changed files with 4659 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
[package]
name = "cortex-core"
version.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
serde.workspace = true
serde_json.workspace = true
toml.workspace = true
figment.workspace = true
chrono.workspace = true
anyhow.workspace = true
thiserror.workspace = true
tracing.workspace = true

View File

@@ -0,0 +1,87 @@
//! Anthropic Messages API request and response types.
//!
//! These mirror the `/v1/messages` format used by the Anthropic API.
//! The gateway accepts these, translates to OpenAI format, proxies to
//! mistral.rs, then translates the response back.
use serde::{Deserialize, Serialize};
use serde_json::Value;
// ── Messages request ─────────────────────────────────────────────────
/// Request body for the Anthropic-style `/v1/messages` endpoint.
///
/// Optional fields are left out of the serialized form when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MessagesRequest {
/// Identifier of the model the client asked for.
pub model: String,
/// Conversation messages.
pub messages: Vec<AnthropicMessage>,
/// Token limit for the reply. Not an `Option`, so deserialization fails without it.
pub max_tokens: u64,
/// System prompt, either plain text or a list of blocks.
#[serde(skip_serializing_if = "Option::is_none")]
pub system: Option<SystemPrompt>,
/// Sampling temperature, if supplied.
#[serde(skip_serializing_if = "Option::is_none")]
pub temperature: Option<f64>,
/// `top_p` sampling parameter, if supplied.
#[serde(skip_serializing_if = "Option::is_none")]
pub top_p: Option<f64>,
/// Streaming flag, if supplied.
#[serde(skip_serializing_if = "Option::is_none")]
pub stream: Option<bool>,
/// Every top-level field not named above, kept as raw JSON via `flatten`.
#[serde(flatten)]
pub extra: Value,
}
/// System prompt in either of the two JSON shapes this type accepts.
///
/// The enum is `untagged`: serde tries the variants in declaration order, so a
/// JSON string becomes `Text` and a JSON array becomes `Blocks`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum SystemPrompt {
/// Prompt given as a single string.
Text(String),
/// Prompt given as an array; elements are kept as raw JSON values.
Blocks(Vec<Value>),
}
/// One entry of [`MessagesRequest::messages`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessage {
/// Role string as sent by the client; not validated by this type.
pub role: String,
/// Message body, either plain text or a list of content blocks.
pub content: AnthropicContent,
}
/// Message content in either of its two JSON shapes.
///
/// The enum is `untagged`: a JSON string deserializes to `Text`, a JSON array
/// of block objects to `Blocks`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AnthropicContent {
/// Content given as a single string.
Text(String),
/// Content given as an array of typed blocks.
Blocks(Vec<ContentBlock>),
}
/// A single content block: its `type` discriminator plus every other field
/// of the JSON object, kept untyped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentBlock {
/// Value of the JSON `type` key (e.g. `"text"`).
#[serde(rename = "type")]
pub block_type: String,
/// All remaining keys of the block, flattened into one JSON value.
#[serde(flatten)]
pub data: Value,
}
// ── Messages response ────────────────────────────────────────────────
/// Non-streaming response body in the Anthropic Messages format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MessagesResponse {
/// Response identifier.
pub id: String,
/// Value of the JSON `type` key.
#[serde(rename = "type")]
pub response_type: String,
/// Role of the responding party.
pub role: String,
/// Content blocks making up the reply.
pub content: Vec<ContentBlock>,
/// Model identifier reported in the response.
pub model: String,
/// Why generation stopped. Serialized as `null` when `None` (there is no skip attribute).
pub stop_reason: Option<String>,
/// Token accounting for the request.
pub usage: AnthropicUsage,
/// Every top-level field not named above, kept as raw JSON via `flatten`.
#[serde(flatten)]
pub extra: Value,
}
/// Token counts in the Anthropic response format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicUsage {
/// Tokens consumed by the input.
pub input_tokens: u64,
/// Tokens produced in the output.
pub output_tokens: u64,
}
// ── Streaming events ─────────────────────────────────────────────────
/// Generic streaming event: the `type` tag plus the rest of the payload untyped.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StreamEvent {
/// Value of the JSON `type` key.
#[serde(rename = "type")]
pub event_type: String,
/// All remaining keys of the event, flattened into one JSON value.
#[serde(flatten)]
pub data: Value,
}

View File

@@ -0,0 +1,79 @@
use figment::{
Figment,
providers::{Env, Format, Toml},
};
use serde::{Deserialize, Serialize};
use std::path::Path;
/// Top-level gateway configuration.
///
/// None of the three sections carries `#[serde(default)]`, so each must be
/// present in the merged configuration for deserialization to succeed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GatewayConfig {
/// Listener addresses.
pub gateway: GatewaySettings,
/// Eviction policy settings.
pub eviction: EvictionSettings,
/// Backend nodes.
pub nodes: Vec<NodeConfig>,
}
/// Socket addresses the gateway binds to.
///
/// Both values are plain strings here; this type does not parse or validate them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GatewaySettings {
/// Address to listen on for API requests (e.g. "0.0.0.0:8000")
pub listen: String,
/// Address to listen on for Prometheus metrics (e.g. "0.0.0.0:9100")
pub metrics_listen: String,
}
/// Settings for model eviction and process defragmentation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvictionSettings {
/// Eviction strategy: "lru" or "priority"
pub strategy: EvictionStrategy,
/// Restart the mistralrs process after this many load/unload cycles
/// to reclaim fragmented VRAM. 0 = never.
///
/// Omitting the key yields 0 (the `u32` default).
/// NOTE(review): `GatewayConfig::default()` uses 50 instead — confirm the
/// two defaults are meant to differ.
#[serde(default)]
pub defrag_after_cycles: u32,
}
/// Policy used to choose an eviction candidate.
///
/// Serialized in lowercase (`"lru"`, `"priority"`). The type is a fieldless
/// enum, so it also derives `Copy`, `PartialEq` and `Eq`; this lets callers
/// compare strategies with `==` and pass them by value, matching `ModelStatus`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum EvictionStrategy {
    /// Configured as `"lru"`.
    Lru,
    /// Configured as `"priority"`.
    /// NOTE(review): how priorities are assigned is not defined in this file.
    Priority,
}
/// Static description of one backend node, as listed under `nodes` in
/// [`GatewayConfig`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeConfig {
/// Human-readable node name (e.g. "gpu-large")
pub name: String,
/// Base URL of the mistralrs HTTP server (e.g. "http://gpu-large.internal:8080")
pub endpoint: String,
/// Total VRAM in MB across all GPUs on this node
pub vram_mb: u64,
/// Model IDs that should never be evicted from this node.
/// Defaults to an empty list when the key is omitted.
#[serde(default)]
pub pinned: Vec<String>,
}
impl GatewayConfig {
    /// Build a [`GatewayConfig`] from a TOML file layered under environment
    /// variables.
    ///
    /// Layers are merged in this order, later ones overriding earlier ones:
    ///
    /// 1. the TOML file at `path`;
    /// 2. environment variables prefixed with `CORTEX_`, where `__` separates
    ///    nesting levels (e.g. `CORTEX_GATEWAY__LISTEN=0.0.0.0:9000`).
    ///
    /// # Errors
    ///
    /// Returns the `figment::Error` produced when the merged data cannot be
    /// extracted into a `GatewayConfig`.
    pub fn load(path: impl AsRef<Path>) -> Result<Self, figment::Error> {
        let file_layer = Toml::file(path);
        let env_layer = Env::prefixed("CORTEX_").split("__");

        let layered = Figment::new().merge(file_layer).merge(env_layer);
        layered.extract()
    }
}
impl Default for GatewayConfig {
    /// Built-in configuration: API on `0.0.0.0:8000`, metrics on
    /// `0.0.0.0:9100`, LRU eviction with a defrag restart every 50 cycles,
    /// and no nodes.
    fn default() -> Self {
        let gateway = GatewaySettings {
            listen: String::from("0.0.0.0:8000"),
            metrics_listen: String::from("0.0.0.0:9100"),
        };
        let eviction = EvictionSettings {
            strategy: EvictionStrategy::Lru,
            defrag_after_cycles: 50,
        };

        Self {
            gateway,
            eviction,
            nodes: Vec::new(),
        }
    }
}

View File

@@ -0,0 +1,6 @@
pub mod anthropic;
pub mod config;
pub mod metrics;
pub mod node;
pub mod openai;
pub mod translate;

View File

@@ -0,0 +1,23 @@
//! Request-level metrics captured by the gateway proxy layer.
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
/// Metrics captured for a single proxied request.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RequestMetrics {
/// UTC timestamp attached to the request.
/// NOTE(review): whether this is request start or completion is not fixed here.
pub timestamp: DateTime<Utc>,
/// Model identifier the request targeted.
pub model: String,
/// Name of the node that served the request.
pub node: String,
/// Tokens in the prompt.
pub prompt_tokens: u64,
/// Tokens generated in the completion.
pub completion_tokens: u64,
/// Total token count.
/// NOTE(review): presumably prompt + completion; not enforced by this type.
pub total_tokens: u64,
/// Tokens per second for the generation phase.
pub tok_per_sec: f64,
/// Time from request start to first SSE chunk (streaming) or full response.
pub time_to_first_token_ms: u64,
/// Total request latency including proxy overhead.
pub total_latency_ms: u64,
/// Whether this request triggered a model load (cold start).
pub cold_start: bool,
}

View File

@@ -0,0 +1,74 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Runtime state of a single node in the fleet.
///
/// Derives only `Debug` and `Clone`; it has no serde support.
#[derive(Debug, Clone)]
pub struct NodeState {
/// Node name.
pub name: String,
/// Base URL of the node's HTTP server.
pub endpoint: String,
/// Total VRAM on the node, in MB.
pub vram_mb: u64,
/// Model IDs that must not be evicted from this node.
pub pinned: Vec<String>,
/// Current health flag for the node.
pub healthy: bool,
/// Models known on this node.
/// NOTE(review): the map key is presumably the model ID — confirm against the poller.
pub models: HashMap<String, ModelEntry>,
/// Number of load/unload cycles since last process restart.
pub lifecycle_cycles: u32,
/// Time of the most recent poll; `None` until one has been recorded.
pub last_poll: Option<DateTime<Utc>>,
}
/// A model registered on a node, with its runtime status.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelEntry {
/// Model identifier.
pub id: String,
/// Current lifecycle status.
pub status: ModelStatus,
/// When this model was last used (for LRU eviction).
pub last_accessed: Option<DateTime<Utc>>,
/// Estimated VRAM usage in MB when loaded.
pub vram_estimate_mb: Option<u64>,
}
/// Model lifecycle status, matching the mistral.rs API.
///
/// Variants are (de)serialized in lowercase: `"loaded"`, `"unloaded"`,
/// `"reloading"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ModelStatus {
/// Serialized as `"loaded"`.
Loaded,
/// Serialized as `"unloaded"`.
Unloaded,
/// Serialized as `"reloading"`.
Reloading,
}
/// Unified model entry as exposed by the gateway's `/v1/models` endpoint.
/// Includes which node(s) host this model and their status.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CortexModelEntry {
/// Model identifier.
pub id: String,
/// Value of the JSON `object` field.
pub object: String,
/// Which nodes have this model (and their status).
pub locations: Vec<ModelLocation>,
}
/// One node on which a model is present, with the model's status there.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelLocation {
/// Name of the hosting node.
pub node: String,
/// Status of the model on that node.
pub status: ModelStatus,
/// Estimated VRAM usage in MB, when known.
pub vram_estimate_mb: Option<u64>,
}
/// Response from mistral.rs `GET /v1/models`.
/// This is the upstream format we parse when polling nodes.
///
/// Deserialize-only; only the `data` array is read.
#[derive(Debug, Clone, Deserialize)]
pub struct MistralModelsResponse {
/// Models reported by the node.
pub data: Vec<MistralModelEntry>,
}
/// One element of [`MistralModelsResponse::data`].
#[derive(Debug, Clone, Deserialize)]
pub struct MistralModelEntry {
/// Model identifier.
pub id: String,
/// Raw status string from upstream; `None` when the key is absent.
/// Kept as a string here rather than parsed into `ModelStatus`.
#[serde(default)]
pub status: Option<String>,
}
/// Request body for mistral.rs model lifecycle endpoints.
///
/// Serialize-only; produces `{"model_id": "..."}`.
#[derive(Debug, Clone, Serialize)]
pub struct ModelLifecycleRequest {
/// Identifier of the model the lifecycle call applies to.
pub model_id: String,
}

View File

@@ -0,0 +1,122 @@
//! OpenAI-compatible request and response types.
//!
//! These are a subset sufficient for chat completions (streaming + non-streaming).
//! Fields not relevant to proxying are captured as `serde_json::Value` via
//! `#[serde(flatten)]` so we forward them without needing to enumerate every
//! extension field mistral.rs supports.
use serde::{Deserialize, Serialize};
use serde_json::Value;
// ── Chat completion request ──────────────────────────────────────────
/// OpenAI-style chat completion request body.
///
/// Optional fields are left out of the serialized form when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatCompletionRequest {
/// Identifier of the requested model.
pub model: String,
/// Conversation messages.
pub messages: Vec<ChatMessage>,
/// Sampling temperature, if supplied.
#[serde(skip_serializing_if = "Option::is_none")]
pub temperature: Option<f64>,
/// `top_p` sampling parameter, if supplied.
#[serde(skip_serializing_if = "Option::is_none")]
pub top_p: Option<f64>,
/// Token limit for the reply, if supplied.
#[serde(skip_serializing_if = "Option::is_none")]
pub max_tokens: Option<u64>,
/// Streaming flag, if supplied.
#[serde(skip_serializing_if = "Option::is_none")]
pub stream: Option<bool>,
/// All other fields (tools, response_format, mistral.rs extensions, etc.)
#[serde(flatten)]
pub extra: Value,
}
/// A single chat message, used in both requests and response choices.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatMessage {
/// Role string; not validated by this type.
pub role: String,
/// Message body.
/// NOTE(review): this field is required and non-null, so a message with
/// `"content": null` or no `content` key will fail to deserialize — confirm
/// upstream never sends that shape (e.g. on tool-call replies).
pub content: MessageContent,
/// Every other key of the message object, kept as raw JSON via `flatten`.
#[serde(flatten)]
pub extra: Value,
}
/// Content can be a simple string or an array of content parts (for vision).
///
/// The enum is `untagged`: a JSON string deserializes to `Text`, a JSON array
/// to `Parts`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum MessageContent {
/// Content given as a single string.
Text(String),
/// Content given as an array; elements are kept as raw JSON values.
Parts(Vec<Value>),
}
// ── Chat completion response (non-streaming) ─────────────────────────
/// OpenAI-style non-streaming chat completion response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatCompletionResponse {
/// Response identifier.
pub id: String,
/// Value of the JSON `object` field.
pub object: String,
/// Value of the JSON `created` field (numeric timestamp).
pub created: u64,
/// Model identifier reported in the response.
pub model: String,
/// Generated choices.
pub choices: Vec<ChatCompletionChoice>,
/// Token accounting; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub usage: Option<Usage>,
/// Every top-level field not named above, kept as raw JSON via `flatten`.
#[serde(flatten)]
pub extra: Value,
}
/// One choice of a non-streaming chat completion response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatCompletionChoice {
/// Position of this choice in the response.
pub index: u32,
/// The generated message.
pub message: ChatMessage,
/// Why generation stopped. Serialized as `null` when `None`.
pub finish_reason: Option<String>,
/// Every other key of the choice object, kept as raw JSON via `flatten`.
#[serde(flatten)]
pub extra: Value,
}
// ── Streaming chunk ──────────────────────────────────────────────────
/// One chunk of a streamed chat completion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatCompletionChunk {
/// Response identifier.
pub id: String,
/// Value of the JSON `object` field.
pub object: String,
/// Value of the JSON `created` field (numeric timestamp).
pub created: u64,
/// Model identifier reported in the chunk.
pub model: String,
/// Per-choice deltas carried by this chunk.
pub choices: Vec<ChunkChoice>,
/// Token accounting; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub usage: Option<Usage>,
/// Every top-level field not named above, kept as raw JSON via `flatten`.
#[serde(flatten)]
pub extra: Value,
}
/// One choice inside a streamed chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkChoice {
/// Position of this choice in the response.
pub index: u32,
/// Incremental payload, kept as untyped JSON.
pub delta: Value,
/// Why generation stopped. Serialized as `null` when `None`.
pub finish_reason: Option<String>,
/// Every other key of the choice object, kept as raw JSON via `flatten`.
#[serde(flatten)]
pub extra: Value,
}
// ── Usage ────────────────────────────────────────────────────────────
/// Token counts in the OpenAI response format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Usage {
/// Tokens in the prompt.
pub prompt_tokens: u64,
/// Tokens generated in the completion.
pub completion_tokens: u64,
/// Total token count as reported; not recomputed by this type.
pub total_tokens: u64,
}
// ── Models list response ─────────────────────────────────────────────
/// OpenAI-style models list response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelsResponse {
/// Value of the JSON `object` field.
pub object: String,
/// The listed models.
pub data: Vec<ModelObject>,
}
/// One entry of [`ModelsResponse::data`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelObject {
/// Model identifier.
pub id: String,
/// Value of the JSON `object` field.
pub object: String,
/// Owner label; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub owned_by: Option<String>,
/// Gateway extensions: which node(s) host this model.
#[serde(skip_serializing_if = "Option::is_none")]
pub locations: Option<Vec<super::node::ModelLocation>>,
/// Every other key of the model object, kept as raw JSON via `flatten`.
#[serde(flatten)]
pub extra: Value,
}

View File

@@ -0,0 +1,114 @@
//! Translation between OpenAI and Anthropic request/response envelopes.
//!
//! This is a stateless transformation — no context is carried between requests.
use crate::anthropic::{
AnthropicContent, AnthropicMessage, AnthropicUsage, ContentBlock, MessagesRequest,
MessagesResponse, SystemPrompt,
};
use crate::openai::{
ChatCompletionChoice, ChatCompletionRequest, ChatCompletionResponse, ChatMessage, Usage,
MessageContent,
};
use serde_json::{json, Value};
/// Convert an Anthropic Messages request into an OpenAI ChatCompletion request.
///
/// * `system`, when present, becomes a leading `"system"` message. A
///   block-form system prompt is reduced to the `text` of its blocks, joined
///   with newlines; blocks without a string `text` field are ignored. It is
///   not forwarded as a raw JSON dump, which the model would otherwise see
///   verbatim as its system prompt.
/// * Each message keeps its role unchanged. Content consisting of exactly one
///   `"text"` block is collapsed to a plain string; any other block list is
///   forwarded as an array of parts, with each block serialized as-is.
/// * `max_tokens` is always set, since it is mandatory on the Anthropic side.
/// * `extra` is forwarded untouched.
///
/// # Panics
///
/// Panics if a multi-block message contains a block that cannot be serialized
/// to JSON. That happens only when its `data` is not a JSON object, which
/// blocks deserialized from a request never are.
pub fn anthropic_to_openai(req: MessagesRequest) -> ChatCompletionRequest {
    // One extra slot for the optional system message.
    let mut messages = Vec::with_capacity(req.messages.len() + 1);

    // Anthropic `system` field becomes a system message.
    if let Some(system) = req.system {
        let content = match system {
            SystemPrompt::Text(text) => text,
            SystemPrompt::Blocks(blocks) => blocks
                .iter()
                .filter_map(|block| block.get("text").and_then(Value::as_str))
                .collect::<Vec<_>>()
                .join("\n"),
        };
        messages.push(ChatMessage {
            role: "system".into(),
            content: MessageContent::Text(content),
            extra: Value::Null,
        });
    }

    // Roles are passed through unchanged; only the content shape is adapted.
    for msg in req.messages {
        let content = match msg.content {
            AnthropicContent::Text(text) => MessageContent::Text(text),
            AnthropicContent::Blocks(blocks) => {
                // A lone text block is sent as a plain string.
                // Anything else (several blocks, images, ...) is passed as parts.
                if blocks.len() == 1 && blocks[0].block_type == "text" {
                    let text = blocks[0]
                        .data
                        .get("text")
                        .and_then(Value::as_str)
                        .unwrap_or("")
                        .to_string();
                    MessageContent::Text(text)
                } else {
                    MessageContent::Parts(blocks.into_iter().map(|b| json!(b)).collect())
                }
            }
        };
        messages.push(ChatMessage {
            role: msg.role,
            content,
            extra: Value::Null,
        });
    }

    ChatCompletionRequest {
        model: req.model,
        messages,
        temperature: req.temperature,
        top_p: req.top_p,
        max_tokens: Some(req.max_tokens),
        stream: req.stream,
        extra: req.extra,
    }
}
/// Convert an OpenAI ChatCompletion response into an Anthropic Messages response.
///
/// Only the first choice is used; with no choices the result carries a single
/// empty text block and no stop reason. The reply is always emitted as one
/// `"text"` content block. If the upstream content is an array of parts, it
/// is serialized to a JSON string and used as that text.
///
/// `finish_reason` is mapped to an Anthropic `stop_reason`:
///
/// | OpenAI                         | Anthropic    |
/// |--------------------------------|--------------|
/// | `stop`                         | `end_turn`   |
/// | `length`                       | `max_tokens` |
/// | `tool_calls` / `function_call` | `tool_use`   |
/// | anything else                  | passed through unchanged |
///
/// Missing `usage` is reported as zero tokens. `extra` fields from the
/// upstream response are not carried over.
pub fn openai_to_anthropic(resp: ChatCompletionResponse) -> MessagesResponse {
    let first_choice = resp.choices.into_iter().next();

    let (content_text, stop_reason) = match first_choice {
        Some(choice) => {
            let text = match choice.message.content {
                MessageContent::Text(text) => text,
                MessageContent::Parts(parts) => {
                    serde_json::to_string(&parts).unwrap_or_default()
                }
            };
            let stop = choice.finish_reason.map(|reason| match reason.as_str() {
                "stop" => "end_turn".to_string(),
                "length" => "max_tokens".to_string(),
                // Both OpenAI spellings mean the model requested a tool call.
                "tool_calls" | "function_call" => "tool_use".to_string(),
                other => other.to_string(),
            });
            (text, stop)
        }
        None => (String::new(), None),
    };

    let usage = resp.usage.unwrap_or(Usage {
        prompt_tokens: 0,
        completion_tokens: 0,
        total_tokens: 0,
    });

    MessagesResponse {
        id: resp.id,
        response_type: "message".into(),
        role: "assistant".into(),
        content: vec![ContentBlock {
            block_type: "text".into(),
            data: json!({ "text": content_text }),
        }],
        model: resp.model,
        stop_reason,
        usage: AnthropicUsage {
            input_tokens: usage.prompt_tokens,
            output_tokens: usage.completion_tokens,
        },
        extra: Value::Null,
    }
}