feat: scaffold cortex workspace
Rust reverse-proxy for multi-node mistral.rs inference clusters. Includes crate structure (cortex-core, cortex-gateway, cortex-agent, cortex-cli), config loading, OpenAI/Anthropic translation stubs, model routing, eviction, polling, and streaming proxy scaffolding. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
14
crates/cortex-agent/Cargo.toml
Normal file
14
crates/cortex-agent/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "cortex-agent"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
cortex-core.workspace = true
|
||||
tokio.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
reqwest.workspace = true
|
||||
tracing.workspace = true
|
||||
anyhow.workspace = true
|
||||
72
crates/cortex-agent/src/agent.rs
Normal file
72
crates/cortex-agent/src/agent.rs
Normal file
@@ -0,0 +1,72 @@
|
||||
//! Per-node agent sidecar.
|
||||
//!
|
||||
//! This is a future component that runs on each GPU node alongside mistralrs.
|
||||
//! It handles:
|
||||
//! - VRAM defragmentation (restarting the mistralrs systemd unit when the
|
||||
//! gateway signals that lifecycle_cycles has exceeded the threshold)
|
||||
//! - Local nvidia-smi polling for actual VRAM usage reporting
|
||||
//! - Systemd unit management for mistralrs process restarts
|
||||
//!
|
||||
//! For now this is a stub. The gateway's poller + evictor handle the critical
|
||||
//! path (model lifecycle via the mistralrs HTTP API). The agent adds
|
||||
//! operational niceties that can be built incrementally.
|
||||
|
||||
/// Placeholder for agent configuration.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct AgentConfig {
|
||||
/// The local mistralrs endpoint to monitor.
|
||||
pub mistralrs_endpoint: String,
|
||||
/// The systemd unit name for mistralrs (e.g. "mistralrs.service").
|
||||
pub systemd_unit: String,
|
||||
}
|
||||
|
||||
/// Restart the local mistralrs process via systemd.
|
||||
/// This is the nuclear option for VRAM defragmentation.
|
||||
pub async fn restart_mistralrs(config: &AgentConfig) -> anyhow::Result<()> {
|
||||
tracing::warn!(
|
||||
unit = %config.systemd_unit,
|
||||
"restarting mistralrs for VRAM defragmentation"
|
||||
);
|
||||
|
||||
let output = tokio::process::Command::new("systemctl")
|
||||
.args(["restart", &config.systemd_unit])
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
if output.status.success() {
|
||||
tracing::info!(unit = %config.systemd_unit, "mistralrs restarted successfully");
|
||||
Ok(())
|
||||
} else {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
anyhow::bail!("systemctl restart failed: {stderr}");
|
||||
}
|
||||
}
|
||||
|
||||
/// Query nvidia-smi for current VRAM usage on this node.
|
||||
/// Returns (used_mb, total_mb) for each GPU.
|
||||
pub async fn query_vram() -> anyhow::Result<Vec<(u64, u64)>> {
|
||||
let output = tokio::process::Command::new("nvidia-smi")
|
||||
.args([
|
||||
"--query-gpu=memory.used,memory.total",
|
||||
"--format=csv,noheader,nounits",
|
||||
])
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
anyhow::bail!("nvidia-smi failed: {stderr}");
|
||||
}
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
let mut gpus = Vec::new();
|
||||
for line in stdout.lines() {
|
||||
let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
|
||||
if parts.len() == 2 {
|
||||
let used: u64 = parts[0].parse().unwrap_or(0);
|
||||
let total: u64 = parts[1].parse().unwrap_or(0);
|
||||
gpus.push((used, total));
|
||||
}
|
||||
}
|
||||
Ok(gpus)
|
||||
}
|
||||
1
crates/cortex-agent/src/lib.rs
Normal file
1
crates/cortex-agent/src/lib.rs
Normal file
@@ -0,0 +1 @@
|
||||
pub mod agent;
|
||||
Reference in New Issue
Block a user