feat: add neuron daemon with GPU discovery and health endpoints
Replace cortex-agent stub with neuron (cortex-neuron binary). cortex-core additions: - discovery.rs: DeviceInfo, DiscoveryResponse, DeviceHealth, HealthResponse - harness.rs: Harness async trait, HarnessConfig, ModelSpec, ModelInfo neuron crate (crates/neuron/): - discovery.rs: nvidia-smi CSV parsing (pure functions) + system discovery via uname/nvidia-smi/nvcc - health.rs: cached GPU health polling every 5s - api.rs: GET /discovery and GET /health axum handlers - main.rs: CLI entrypoint with --port flag (default 9090) - harness stubs for mistralrs (Phase 8) and llamacpp (Phase 11) 12 new tests (9 unit + 3 integration), 35 total. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -13,3 +13,4 @@ chrono.workspace = true
|
||||
anyhow.workspace = true
|
||||
thiserror.workspace = true
|
||||
tracing.workspace = true
|
||||
async-trait.workspace = true
|
||||
|
||||
43
crates/cortex-core/src/discovery.rs
Normal file
43
crates/cortex-core/src/discovery.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
//! Hardware discovery and health types shared between cortex and neuron.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Information about a single GPU device discovered on a node.
///
/// Plain serde-serializable record; instances are carried in
/// `DiscoveryResponse::devices`.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DeviceInfo {
|
||||
/// Numeric index identifying the device.
/// NOTE(review): presumably the GPU index reported by nvidia-smi — confirm
/// against the neuron discovery code.
pub index: u32,
|
||||
/// Device name string (presumably the product name reported by the
/// driver tooling — confirm).
pub name: String,
|
||||
/// Total device memory. The `_mb` suffix suggests megabytes.
/// NOTE(review): confirm whether the stored value is MB or MiB.
pub vram_total_mb: u64,
|
||||
/// Compute capability, kept as a string rather than a number.
/// NOTE(review): presumably "major.minor" form — confirm the exact format.
pub compute_capability: String,
|
||||
}
|
||||
|
||||
/// Full discovery response from a neuron endpoint.
|
||||
/// Returned by `GET /discovery`.
///
/// Serialized and deserialized with serde, so the field names below are
/// also the keys of the wire format.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DiscoveryResponse {
|
||||
/// Hostname of the node that produced this response.
pub hostname: String,
|
||||
/// Operating system identifier (presumably taken from `uname` — confirm).
pub os: String,
|
||||
/// Kernel identifier (presumably the kernel release from `uname` — confirm).
pub kernel: String,
|
||||
/// CUDA version, if one was found. `None` when it is not available.
pub cuda_version: Option<String>,
|
||||
/// GPU driver version, if one was found. `None` when it is not available.
pub driver_version: Option<String>,
|
||||
/// One entry per discovered GPU device.
pub devices: Vec<DeviceInfo>,
|
||||
/// Harness identifiers as plain strings.
/// NOTE(review): presumably the names of harnesses available on this
/// node — confirm against the neuron implementation.
pub harnesses: Vec<String>,
|
||||
}
|
||||
|
||||
/// Runtime health metrics for a single GPU device.
///
/// Instances are carried in `HealthResponse::devices`.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DeviceHealth {
|
||||
/// Numeric index identifying the device (presumably the same numbering
/// as `DeviceInfo::index` — confirm).
pub index: u32,
|
||||
/// Device memory currently in use. The `_mb` suffix suggests megabytes;
/// NOTE(review): confirm MB vs MiB.
pub vram_used_mb: u64,
|
||||
/// Device memory currently free, in the same unit as `vram_used_mb`
/// (assumed from the matching suffix — confirm).
pub vram_free_mb: u64,
|
||||
/// Utilization as a percentage (presumably 0-100 — confirm).
pub utilization_pct: u32,
|
||||
/// Temperature; the `_c` suffix suggests degrees Celsius — confirm.
pub temp_c: u32,
|
||||
}
|
||||
|
||||
/// Runtime health response from a neuron endpoint.
|
||||
/// Returned by `GET /health`.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct HealthResponse {
|
||||
/// Uptime in seconds.
/// NOTE(review): presumably the uptime of the neuron daemon process, not
/// of the host — confirm.
pub uptime_secs: u64,
|
||||
/// Health metrics, one entry per reported GPU device.
pub devices: Vec<DeviceHealth>,
|
||||
}
|
||||
76
crates/cortex-core/src/harness.rs
Normal file
76
crates/cortex-core/src/harness.rs
Normal file
@@ -0,0 +1,76 @@
|
||||
//! Harness trait and supporting types for inference engine management.
|
||||
//!
|
||||
//! Defined in cortex-core so both cortex (control plane) and neuron
|
||||
//! (node plane) share the type definitions. neuron provides the
|
||||
//! runtime implementations.
|
||||
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Configuration for a harness instance on a neuron.
///
/// Passed by reference to `Harness::start`. Both `endpoint` and
/// `systemd_unit` are optional.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct HarnessConfig {
|
||||
/// Name of the harness this configuration applies to (presumably matches
/// the value returned by `Harness::name` — confirm).
pub name: String,
|
||||
/// Base URL of the harness (e.g. "http://localhost:8080" for mistral.rs).
|
||||
pub endpoint: Option<String>,
|
||||
/// Systemd unit name, if the harness is managed via systemd.
|
||||
pub systemd_unit: Option<String>,
|
||||
}
|
||||
|
||||
/// Health status of a harness process.
///
/// Returned directly (not wrapped in a `Result`) by `Harness::health`.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct HarnessHealth {
|
||||
/// Name of the harness this status describes.
pub name: String,
|
||||
/// Whether the harness process is running.
pub running: bool,
|
||||
/// Uptime of the harness in seconds, when known.
/// NOTE(review): presumably `None` when the harness is not running or the
/// uptime cannot be determined — confirm with the implementations.
pub uptime_secs: Option<u64>,
|
||||
}
|
||||
|
||||
/// Specification for loading a model through a harness.
///
/// Passed by reference to `Harness::load_model`. The three trailing fields
/// are optional; what a harness does when they are `None` is not defined
/// here.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ModelSpec {
|
||||
/// Identifier of the model to load.
pub model_id: String,
|
||||
/// Name of the harness that should load the model.
pub harness: String,
|
||||
/// Quantization to use, if any (format of the string is harness-specific
/// as far as this file shows — confirm).
pub quant: Option<String>,
|
||||
/// Tensor-parallel setting, if any (presumably the number of devices to
/// shard across — confirm).
pub tensor_parallel: Option<u32>,
|
||||
/// Explicit device assignment, if any (presumably device indices as in
/// `DeviceInfo::index` — confirm).
pub devices: Option<Vec<u32>>,
|
||||
}
|
||||
|
||||
/// A model as reported by a harness.
///
/// Returned in bulk by `Harness::list_models`.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ModelInfo {
|
||||
/// Model identifier (presumably the same identifier space as
/// `ModelSpec::model_id` — confirm).
pub id: String,
|
||||
/// Name of the harness reporting this model.
pub harness: String,
|
||||
/// Model status as a free-form string.
/// NOTE(review): the set of valid values is not defined in this file;
/// consider documenting it or replacing it with an enum.
pub status: String,
|
||||
/// Devices associated with the model (presumably device indices — confirm).
pub devices: Vec<u32>,
|
||||
/// Device memory used by the model, when known. The `_mb` suffix
/// suggests megabytes — confirm MB vs MiB.
pub vram_used_mb: Option<u64>,
|
||||
}
|
||||
|
||||
/// What an inference harness must do, from neuron's perspective.
///
/// Implementors must be `Send + Sync`. The async methods are enabled by the
/// `#[async_trait]` attribute. Methods returning `Result` use
/// `anyhow::Result`; the concrete error conditions are defined by each
/// implementation, not by this trait.
|
||||
#[async_trait]
|
||||
pub trait Harness: Send + Sync {
|
||||
/// Human-readable name (e.g. "mistralrs", "llamacpp", "comfyui").
/// Synchronous, and borrows the returned string from `self`.
|
||||
fn name(&self) -> &str;
|
||||
|
||||
/// Start the harness process if it is not already running.
/// `config` supplies the harness configuration for this start.
|
||||
async fn start(&self, config: &HarnessConfig) -> Result<()>;
|
||||
|
||||
/// Stop the harness process gracefully.
|
||||
async fn stop(&self) -> Result<()>;
|
||||
|
||||
/// Health check. Returns the harness process status.
/// Unlike the other async methods, this one is infallible by signature:
/// it returns `HarnessHealth` directly rather than a `Result`.
|
||||
async fn health(&self) -> HarnessHealth;
|
||||
|
||||
/// List models the harness knows about (loaded + unloaded).
|
||||
async fn list_models(&self) -> Result<Vec<ModelInfo>>;
|
||||
|
||||
/// Load a model with the given spec (quant, TP, device assignment).
|
||||
async fn load_model(&self, spec: &ModelSpec) -> Result<()>;
|
||||
|
||||
/// Unload a model, freeing device memory.
/// `model_id` identifies the model to unload.
|
||||
async fn unload_model(&self, model_id: &str) -> Result<()>;
|
||||
|
||||
/// Return the URL where inference requests for this model should
|
||||
/// be sent. None if the model is not loaded.
/// Returns an owned `String`, so the result does not borrow from `self`.
|
||||
async fn inference_endpoint(&self, model_id: &str) -> Option<String>;
|
||||
}
|
||||
@@ -1,5 +1,7 @@
|
||||
pub mod anthropic;
|
||||
pub mod config;
|
||||
// Hardware discovery and health types shared between cortex and neuron.
pub mod discovery;
|
||||
// Harness trait and supporting types for inference engine management.
pub mod harness;
|
||||
pub mod metrics;
|
||||
pub mod node;
|
||||
pub mod openai;
|
||||
|
||||
Reference in New Issue
Block a user