feat: add neuron daemon with GPU discovery and health endpoints
Replace cortex-agent stub with neuron (cortex-neuron binary).

cortex-core additions:
- discovery.rs: DeviceInfo, DiscoveryResponse, DeviceHealth, HealthResponse
- harness.rs: Harness async trait, HarnessConfig, ModelSpec, ModelInfo

neuron crate (crates/neuron/):
- discovery.rs: nvidia-smi CSV parsing (pure functions) + system discovery via uname/nvidia-smi/nvcc
- health.rs: cached GPU health polling every 5s
- api.rs: GET /discovery and GET /health axum handlers
- main.rs: CLI entrypoint with --port flag (default 9090)
- harness stubs for mistralrs (Phase 8) and llamacpp (Phase 11)

12 new tests (9 unit + 3 integration), 35 total.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
43
crates/cortex-core/src/discovery.rs
Normal file
43
crates/cortex-core/src/discovery.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
//! Hardware discovery and health types shared between cortex and neuron.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Information about a single GPU device discovered on a node.
///
/// Plain data carrier: derives `Serialize`/`Deserialize`, so field names
/// double as the serialized keys. Renaming a field changes the wire format.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DeviceInfo {
|
||||
/// Numeric index identifying this device on the node.
// NOTE(review): presumably the index reported by nvidia-smi — confirm
// against the neuron discovery code.
pub index: u32,
|
||||
/// Human-readable device name.
pub name: String,
|
||||
/// Total video memory of the device.
// NOTE(review): the `_mb` suffix suggests megabytes; whether this is MB or
// MiB depends on the producer — confirm.
pub vram_total_mb: u64,
|
||||
/// Compute capability of the device, carried as free-form text.
// NOTE(review): presumably a "major.minor" string such as "8.9" — confirm.
pub compute_capability: String,
|
||||
}
|
||||
|
||||
/// Full discovery response from a neuron endpoint.
|
||||
/// Returned by `GET /discovery`.
///
/// Derives `Serialize`/`Deserialize`; field names double as the serialized
/// keys, so renaming a field changes the wire format.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DiscoveryResponse {
|
||||
/// Hostname of the reporting node.
pub hostname: String,
|
||||
/// Operating system identifier of the node.
// NOTE(review): exact format is not visible here (presumably derived from
// `uname`) — confirm against the producer.
pub os: String,
|
||||
/// Kernel identifier of the node.
// NOTE(review): presumably the kernel release string from `uname` — confirm.
pub kernel: String,
|
||||
/// CUDA version, if one is available.
// NOTE(review): presumably `None` when no CUDA toolkit is detected — confirm.
pub cuda_version: Option<String>,
|
||||
/// GPU driver version, if one is available.
// NOTE(review): presumably `None` when no driver is detected — confirm.
pub driver_version: Option<String>,
|
||||
/// One entry per discovered GPU device; may be empty.
pub devices: Vec<DeviceInfo>,
|
||||
/// Names of the harnesses reported by the node.
// NOTE(review): presumably identifiers of available inference harnesses
// (e.g. "mistralrs", "llamacpp") — confirm.
pub harnesses: Vec<String>,
|
||||
}
|
||||
|
||||
/// Runtime health metrics for a single GPU device.
///
/// Derives `Serialize`/`Deserialize`; field names double as the serialized
/// keys, so renaming a field changes the wire format.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct DeviceHealth {
|
||||
/// Numeric index identifying the device these metrics belong to.
// NOTE(review): presumably matches `DeviceInfo::index` for the same
// device — confirm.
pub index: u32,
|
||||
/// Video memory currently in use.
// NOTE(review): `_mb` suffix suggests megabytes (MB vs MiB unverified).
pub vram_used_mb: u64,
|
||||
/// Video memory currently free.
// NOTE(review): `_mb` suffix suggests megabytes (MB vs MiB unverified).
pub vram_free_mb: u64,
|
||||
/// GPU utilization.
// NOTE(review): `_pct` suffix suggests a percentage, presumably 0-100;
// the type itself does not enforce a range — confirm.
pub utilization_pct: u32,
|
||||
/// GPU temperature.
// NOTE(review): `_c` suffix suggests degrees Celsius — confirm.
pub temp_c: u32,
|
||||
}
|
||||
|
||||
/// Runtime health response from a neuron endpoint.
|
||||
/// Returned by `GET /health`.
///
/// Derives `Serialize`/`Deserialize`; field names double as the serialized
/// keys, so renaming a field changes the wire format.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct HealthResponse {
|
||||
/// Uptime in seconds.
// NOTE(review): presumably the uptime of the neuron process rather than
// of the host — confirm against the producer.
pub uptime_secs: u64,
|
||||
/// One health entry per GPU device; may be empty.
// NOTE(review): values may come from a periodically refreshed cache rather
// than a live query — confirm.
pub devices: Vec<DeviceHealth>,
|
||||
}
|
||||
Reference in New Issue
Block a user