feat: scaffold cortex workspace
Rust reverse proxy for multi-node mistral.rs inference clusters. Includes the crate structure (cortex-core, cortex-gateway, cortex-agent, cortex-cli), config loading, OpenAI/Anthropic translation stubs, model routing, eviction, polling, and streaming proxy scaffolding.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
15
crates/cortex-core/Cargo.toml
Normal file
15
crates/cortex-core/Cargo.toml
Normal file
@@ -0,0 +1,15 @@
|
||||
[package]
|
||||
name = "cortex-core"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
toml.workspace = true
|
||||
figment.workspace = true
|
||||
chrono.workspace = true
|
||||
anyhow.workspace = true
|
||||
thiserror.workspace = true
|
||||
tracing.workspace = true
|
||||
87
crates/cortex-core/src/anthropic.rs
Normal file
87
crates/cortex-core/src/anthropic.rs
Normal file
@@ -0,0 +1,87 @@
|
||||
//! Anthropic Messages API request and response types.
|
||||
//!
|
||||
//! These mirror the `/v1/messages` format used by the Anthropic API.
|
||||
//! The gateway accepts these, translates to OpenAI format, proxies to
|
||||
//! mistral.rs, then translates the response back.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
// ── Messages request ─────────────────────────────────────────────────
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MessagesRequest {
|
||||
pub model: String,
|
||||
pub messages: Vec<AnthropicMessage>,
|
||||
pub max_tokens: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub system: Option<SystemPrompt>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub temperature: Option<f64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub top_p: Option<f64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub stream: Option<bool>,
|
||||
#[serde(flatten)]
|
||||
pub extra: Value,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum SystemPrompt {
|
||||
Text(String),
|
||||
Blocks(Vec<Value>),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct AnthropicMessage {
|
||||
pub role: String,
|
||||
pub content: AnthropicContent,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum AnthropicContent {
|
||||
Text(String),
|
||||
Blocks(Vec<ContentBlock>),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ContentBlock {
|
||||
#[serde(rename = "type")]
|
||||
pub block_type: String,
|
||||
#[serde(flatten)]
|
||||
pub data: Value,
|
||||
}
|
||||
|
||||
// ── Messages response ────────────────────────────────────────────────
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MessagesResponse {
|
||||
pub id: String,
|
||||
#[serde(rename = "type")]
|
||||
pub response_type: String,
|
||||
pub role: String,
|
||||
pub content: Vec<ContentBlock>,
|
||||
pub model: String,
|
||||
pub stop_reason: Option<String>,
|
||||
pub usage: AnthropicUsage,
|
||||
#[serde(flatten)]
|
||||
pub extra: Value,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct AnthropicUsage {
|
||||
pub input_tokens: u64,
|
||||
pub output_tokens: u64,
|
||||
}
|
||||
|
||||
// ── Streaming events ─────────────────────────────────────────────────
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct StreamEvent {
|
||||
#[serde(rename = "type")]
|
||||
pub event_type: String,
|
||||
#[serde(flatten)]
|
||||
pub data: Value,
|
||||
}
|
||||
79
crates/cortex-core/src/config.rs
Normal file
79
crates/cortex-core/src/config.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
use figment::{
|
||||
Figment,
|
||||
providers::{Env, Format, Toml},
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::Path;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct GatewayConfig {
|
||||
pub gateway: GatewaySettings,
|
||||
pub eviction: EvictionSettings,
|
||||
pub nodes: Vec<NodeConfig>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct GatewaySettings {
|
||||
/// Address to listen on for API requests (e.g. "0.0.0.0:8000")
|
||||
pub listen: String,
|
||||
/// Address to listen on for Prometheus metrics (e.g. "0.0.0.0:9100")
|
||||
pub metrics_listen: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct EvictionSettings {
|
||||
/// Eviction strategy: "lru" or "priority"
|
||||
pub strategy: EvictionStrategy,
|
||||
/// Restart the mistralrs process after this many load/unload cycles
|
||||
/// to reclaim fragmented VRAM. 0 = never.
|
||||
#[serde(default)]
|
||||
pub defrag_after_cycles: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum EvictionStrategy {
|
||||
Lru,
|
||||
Priority,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct NodeConfig {
|
||||
/// Human-readable node name (e.g. "gpu-large")
|
||||
pub name: String,
|
||||
/// Base URL of the mistralrs HTTP server (e.g. "http://gpu-large.internal:8080")
|
||||
pub endpoint: String,
|
||||
/// Total VRAM in MB across all GPUs on this node
|
||||
pub vram_mb: u64,
|
||||
/// Model IDs that should never be evicted from this node
|
||||
#[serde(default)]
|
||||
pub pinned: Vec<String>,
|
||||
}
|
||||
|
||||
impl GatewayConfig {
|
||||
/// Load configuration from a TOML file, with environment variable overrides.
|
||||
/// Env vars are prefixed with `CORTEX_` and use `__` as a separator
|
||||
/// (e.g. `CORTEX_GATEWAY__LISTEN=0.0.0.0:9000`).
|
||||
pub fn load(path: impl AsRef<Path>) -> Result<Self, figment::Error> {
|
||||
Figment::new()
|
||||
.merge(Toml::file(path))
|
||||
.merge(Env::prefixed("CORTEX_").split("__"))
|
||||
.extract()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for GatewayConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
gateway: GatewaySettings {
|
||||
listen: "0.0.0.0:8000".into(),
|
||||
metrics_listen: "0.0.0.0:9100".into(),
|
||||
},
|
||||
eviction: EvictionSettings {
|
||||
strategy: EvictionStrategy::Lru,
|
||||
defrag_after_cycles: 50,
|
||||
},
|
||||
nodes: vec![],
|
||||
}
|
||||
}
|
||||
}
|
||||
6
crates/cortex-core/src/lib.rs
Normal file
6
crates/cortex-core/src/lib.rs
Normal file
@@ -0,0 +1,6 @@
|
||||
pub mod anthropic;
|
||||
pub mod config;
|
||||
pub mod metrics;
|
||||
pub mod node;
|
||||
pub mod openai;
|
||||
pub mod translate;
|
||||
23
crates/cortex-core/src/metrics.rs
Normal file
23
crates/cortex-core/src/metrics.rs
Normal file
@@ -0,0 +1,23 @@
|
||||
//! Request-level metrics captured by the gateway proxy layer.
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Metrics captured for a single proxied request.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct RequestMetrics {
|
||||
pub timestamp: DateTime<Utc>,
|
||||
pub model: String,
|
||||
pub node: String,
|
||||
pub prompt_tokens: u64,
|
||||
pub completion_tokens: u64,
|
||||
pub total_tokens: u64,
|
||||
/// Tokens per second for the generation phase.
|
||||
pub tok_per_sec: f64,
|
||||
/// Time from request start to first SSE chunk (streaming) or full response.
|
||||
pub time_to_first_token_ms: u64,
|
||||
/// Total request latency including proxy overhead.
|
||||
pub total_latency_ms: u64,
|
||||
/// Whether this request triggered a model load (cold start).
|
||||
pub cold_start: bool,
|
||||
}
|
||||
74
crates/cortex-core/src/node.rs
Normal file
74
crates/cortex-core/src/node.rs
Normal file
@@ -0,0 +1,74 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Runtime state of a single node in the fleet.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NodeState {
|
||||
pub name: String,
|
||||
pub endpoint: String,
|
||||
pub vram_mb: u64,
|
||||
pub pinned: Vec<String>,
|
||||
pub healthy: bool,
|
||||
pub models: HashMap<String, ModelEntry>,
|
||||
/// Number of load/unload cycles since last process restart.
|
||||
pub lifecycle_cycles: u32,
|
||||
pub last_poll: Option<DateTime<Utc>>,
|
||||
}
|
||||
|
||||
/// A model registered on a node, with its runtime status.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ModelEntry {
|
||||
pub id: String,
|
||||
pub status: ModelStatus,
|
||||
/// When this model was last used (for LRU eviction).
|
||||
pub last_accessed: Option<DateTime<Utc>>,
|
||||
/// Estimated VRAM usage in MB when loaded.
|
||||
pub vram_estimate_mb: Option<u64>,
|
||||
}
|
||||
|
||||
/// Model lifecycle status, matching the mistral.rs API.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum ModelStatus {
|
||||
Loaded,
|
||||
Unloaded,
|
||||
Reloading,
|
||||
}
|
||||
|
||||
/// Unified model entry as exposed by the gateway's `/v1/models` endpoint.
|
||||
/// Includes which node(s) host this model and their status.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CortexModelEntry {
|
||||
pub id: String,
|
||||
pub object: String,
|
||||
/// Which nodes have this model (and their status).
|
||||
pub locations: Vec<ModelLocation>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ModelLocation {
|
||||
pub node: String,
|
||||
pub status: ModelStatus,
|
||||
pub vram_estimate_mb: Option<u64>,
|
||||
}
|
||||
|
||||
/// Response from mistral.rs `GET /v1/models`.
|
||||
/// This is the upstream format we parse when polling nodes.
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct MistralModelsResponse {
|
||||
pub data: Vec<MistralModelEntry>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct MistralModelEntry {
|
||||
pub id: String,
|
||||
#[serde(default)]
|
||||
pub status: Option<String>,
|
||||
}
|
||||
|
||||
/// Request body for mistral.rs model lifecycle endpoints.
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct ModelLifecycleRequest {
|
||||
pub model_id: String,
|
||||
}
|
||||
122
crates/cortex-core/src/openai.rs
Normal file
122
crates/cortex-core/src/openai.rs
Normal file
@@ -0,0 +1,122 @@
|
||||
//! OpenAI-compatible request and response types.
|
||||
//!
|
||||
//! These are a subset sufficient for chat completions (streaming + non-streaming).
|
||||
//! Fields not relevant to proxying are captured as `serde_json::Value` via
|
||||
//! `#[serde(flatten)]` so we forward them without needing to enumerate every
|
||||
//! extension field mistral.rs supports.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
// ── Chat completion request ──────────────────────────────────────────
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ChatCompletionRequest {
|
||||
pub model: String,
|
||||
pub messages: Vec<ChatMessage>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub temperature: Option<f64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub top_p: Option<f64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub max_tokens: Option<u64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub stream: Option<bool>,
|
||||
/// All other fields (tools, response_format, mistral.rs extensions, etc.)
|
||||
#[serde(flatten)]
|
||||
pub extra: Value,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ChatMessage {
|
||||
pub role: String,
|
||||
pub content: MessageContent,
|
||||
#[serde(flatten)]
|
||||
pub extra: Value,
|
||||
}
|
||||
|
||||
/// Content can be a simple string or an array of content parts (for vision).
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum MessageContent {
|
||||
Text(String),
|
||||
Parts(Vec<Value>),
|
||||
}
|
||||
|
||||
// ── Chat completion response (non-streaming) ─────────────────────────
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ChatCompletionResponse {
|
||||
pub id: String,
|
||||
pub object: String,
|
||||
pub created: u64,
|
||||
pub model: String,
|
||||
pub choices: Vec<ChatCompletionChoice>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub usage: Option<Usage>,
|
||||
#[serde(flatten)]
|
||||
pub extra: Value,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ChatCompletionChoice {
|
||||
pub index: u32,
|
||||
pub message: ChatMessage,
|
||||
pub finish_reason: Option<String>,
|
||||
#[serde(flatten)]
|
||||
pub extra: Value,
|
||||
}
|
||||
|
||||
// ── Streaming chunk ──────────────────────────────────────────────────
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ChatCompletionChunk {
|
||||
pub id: String,
|
||||
pub object: String,
|
||||
pub created: u64,
|
||||
pub model: String,
|
||||
pub choices: Vec<ChunkChoice>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub usage: Option<Usage>,
|
||||
#[serde(flatten)]
|
||||
pub extra: Value,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ChunkChoice {
|
||||
pub index: u32,
|
||||
pub delta: Value,
|
||||
pub finish_reason: Option<String>,
|
||||
#[serde(flatten)]
|
||||
pub extra: Value,
|
||||
}
|
||||
|
||||
// ── Usage ────────────────────────────────────────────────────────────
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Usage {
|
||||
pub prompt_tokens: u64,
|
||||
pub completion_tokens: u64,
|
||||
pub total_tokens: u64,
|
||||
}
|
||||
|
||||
// ── Models list response ─────────────────────────────────────────────
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ModelsResponse {
|
||||
pub object: String,
|
||||
pub data: Vec<ModelObject>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ModelObject {
|
||||
pub id: String,
|
||||
pub object: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub owned_by: Option<String>,
|
||||
/// Gateway extensions: which node(s) host this model.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub locations: Option<Vec<super::node::ModelLocation>>,
|
||||
#[serde(flatten)]
|
||||
pub extra: Value,
|
||||
}
|
||||
114
crates/cortex-core/src/translate.rs
Normal file
114
crates/cortex-core/src/translate.rs
Normal file
@@ -0,0 +1,114 @@
|
||||
//! Translation between OpenAI and Anthropic request/response envelopes.
|
||||
//!
|
||||
//! This is a stateless transformation — no context is carried between requests.
|
||||
|
||||
use crate::anthropic::{
|
||||
AnthropicContent, AnthropicMessage, AnthropicUsage, ContentBlock, MessagesRequest,
|
||||
MessagesResponse, SystemPrompt,
|
||||
};
|
||||
use crate::openai::{
|
||||
ChatCompletionChoice, ChatCompletionRequest, ChatCompletionResponse, ChatMessage, Usage,
|
||||
MessageContent,
|
||||
};
|
||||
use serde_json::{json, Value};
|
||||
|
||||
/// Convert an Anthropic Messages request into an OpenAI ChatCompletion request.
|
||||
pub fn anthropic_to_openai(req: MessagesRequest) -> ChatCompletionRequest {
|
||||
let mut messages = Vec::new();
|
||||
|
||||
// Anthropic `system` field becomes a system message.
|
||||
if let Some(system) = req.system {
|
||||
let content = match system {
|
||||
SystemPrompt::Text(t) => t,
|
||||
SystemPrompt::Blocks(blocks) => serde_json::to_string(&blocks).unwrap_or_default(),
|
||||
};
|
||||
messages.push(ChatMessage {
|
||||
role: "system".into(),
|
||||
content: MessageContent::Text(content),
|
||||
extra: Value::Null,
|
||||
});
|
||||
}
|
||||
|
||||
// Convert message roles and content.
|
||||
for msg in req.messages {
|
||||
let content = match msg.content {
|
||||
AnthropicContent::Text(t) => MessageContent::Text(t),
|
||||
AnthropicContent::Blocks(blocks) => {
|
||||
// For simple text-only blocks, extract the text.
|
||||
// For mixed content (images, etc.), pass as parts.
|
||||
if blocks.len() == 1 && blocks[0].block_type == "text" {
|
||||
let text = blocks[0]
|
||||
.data
|
||||
.get("text")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
MessageContent::Text(text)
|
||||
} else {
|
||||
MessageContent::Parts(
|
||||
blocks.into_iter().map(|b| json!(b)).collect(),
|
||||
)
|
||||
}
|
||||
}
|
||||
};
|
||||
messages.push(ChatMessage {
|
||||
role: msg.role,
|
||||
content,
|
||||
extra: Value::Null,
|
||||
});
|
||||
}
|
||||
|
||||
ChatCompletionRequest {
|
||||
model: req.model,
|
||||
messages,
|
||||
temperature: req.temperature,
|
||||
top_p: req.top_p,
|
||||
max_tokens: Some(req.max_tokens),
|
||||
stream: req.stream,
|
||||
extra: req.extra,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert an OpenAI ChatCompletion response into an Anthropic Messages response.
|
||||
pub fn openai_to_anthropic(resp: ChatCompletionResponse) -> MessagesResponse {
|
||||
let choice = resp.choices.into_iter().next();
|
||||
|
||||
let (content_text, stop_reason) = match choice {
|
||||
Some(c) => {
|
||||
let text = match c.message.content {
|
||||
MessageContent::Text(t) => t,
|
||||
MessageContent::Parts(parts) => serde_json::to_string(&parts).unwrap_or_default(),
|
||||
};
|
||||
let stop = c.finish_reason.map(|r| match r.as_str() {
|
||||
"stop" => "end_turn".to_string(),
|
||||
"length" => "max_tokens".to_string(),
|
||||
other => other.to_string(),
|
||||
});
|
||||
(text, stop)
|
||||
}
|
||||
None => (String::new(), None),
|
||||
};
|
||||
|
||||
let usage = resp.usage.unwrap_or(Usage {
|
||||
prompt_tokens: 0,
|
||||
completion_tokens: 0,
|
||||
total_tokens: 0,
|
||||
});
|
||||
|
||||
MessagesResponse {
|
||||
id: resp.id,
|
||||
response_type: "message".into(),
|
||||
role: "assistant".into(),
|
||||
content: vec![ContentBlock {
|
||||
block_type: "text".into(),
|
||||
data: json!({ "text": content_text }),
|
||||
}],
|
||||
model: resp.model,
|
||||
stop_reason,
|
||||
usage: AnthropicUsage {
|
||||
input_tokens: usage.prompt_tokens,
|
||||
output_tokens: usage.completion_tokens,
|
||||
},
|
||||
extra: Value::Null,
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user