feat: add neuron daemon with GPU discovery and health endpoints
Replace cortex-agent stub with neuron (cortex-neuron binary).

cortex-core additions:
- discovery.rs: DeviceInfo, DiscoveryResponse, DeviceHealth, HealthResponse
- harness.rs: Harness async trait, HarnessConfig, ModelSpec, ModelInfo

neuron crate (crates/neuron/):
- discovery.rs: nvidia-smi CSV parsing (pure functions) + system discovery via uname/nvidia-smi/nvcc
- health.rs: cached GPU health polling every 5s
- api.rs: GET /discovery and GET /health axum handlers
- main.rs: CLI entrypoint with --port flag (default 9090)
- harness stubs for mistralrs (Phase 8) and llamacpp (Phase 11)

12 new tests (9 unit + 3 integration), 35 total.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
70
crates/neuron/src/health.rs
Normal file
70
crates/neuron/src/health.rs
Normal file
@@ -0,0 +1,70 @@
|
||||
//! Cached GPU health monitoring via periodic nvidia-smi polling.
|
||||
|
||||
use cortex_core::discovery::HealthResponse;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
/// Time between successive health polls in `HealthCache::poll_loop`.
const POLL_INTERVAL: Duration = Duration::from_secs(5);
|
||||
|
||||
/// Thread-safe cache for the latest GPU health reading.
|
||||
pub struct HealthCache {
|
||||
inner: RwLock<HealthResponse>,
|
||||
has_gpus: RwLock<bool>,
|
||||
}
|
||||
|
||||
impl Default for HealthCache {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl HealthCache {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
inner: RwLock::new(HealthResponse {
|
||||
uptime_secs: 0,
|
||||
devices: vec![],
|
||||
}),
|
||||
has_gpus: RwLock::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
/// Mark whether this node has GPUs (set after discovery).
|
||||
pub async fn set_has_gpus(&self, has_gpus: bool) {
|
||||
*self.has_gpus.write().await = has_gpus;
|
||||
}
|
||||
|
||||
/// Get a snapshot of the current health state.
|
||||
pub async fn snapshot(&self) -> HealthResponse {
|
||||
self.inner.read().await.clone()
|
||||
}
|
||||
|
||||
/// Run forever, polling nvidia-smi every 5 seconds and updating the cache.
|
||||
pub async fn poll_loop(&self, start_time: Instant) {
|
||||
loop {
|
||||
tokio::time::sleep(POLL_INTERVAL).await;
|
||||
|
||||
let uptime = start_time.elapsed().as_secs();
|
||||
|
||||
if !*self.has_gpus.read().await {
|
||||
let mut health = self.inner.write().await;
|
||||
health.uptime_secs = uptime;
|
||||
continue;
|
||||
}
|
||||
|
||||
match crate::discovery::query_health().await {
|
||||
Ok(devices) => {
|
||||
let mut health = self.inner.write().await;
|
||||
health.uptime_secs = uptime;
|
||||
health.devices = devices;
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(error = %e, "failed to poll GPU health");
|
||||
// Keep last known reading, just update uptime.
|
||||
let mut health = self.inner.write().await;
|
||||
health.uptime_secs = uptime;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user