Replace cortex-agent stub with neuron (cortex-neuron binary). cortex-core additions: - discovery.rs: DeviceInfo, DiscoveryResponse, DeviceHealth, HealthResponse - harness.rs: Harness async trait, HarnessConfig, ModelSpec, ModelInfo neuron crate (crates/neuron/): - discovery.rs: nvidia-smi CSV parsing (pure functions) + system discovery via uname/nvidia-smi/nvcc - health.rs: cached GPU health polling every 5s - api.rs: GET /discovery and GET /health axum handlers - main.rs: CLI entrypoint with --port flag (default 9090) - harness stubs for mistralrs (Phase 8) and llamacpp (Phase 11) 12 new tests (9 unit + 3 integration), 35 total. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
71 lines
2.0 KiB
Rust
71 lines
2.0 KiB
Rust
//! Cached GPU health monitoring via periodic nvidia-smi polling.
|
|
|
|
use cortex_core::discovery::HealthResponse;
|
|
use std::time::{Duration, Instant};
|
|
use tokio::sync::RwLock;
|
|
|
|
/// Delay between successive iterations of `HealthCache::poll_loop`.
const POLL_INTERVAL: Duration = Duration::from_secs(5);
|
|
|
|
/// Thread-safe cache for the latest GPU health reading.
|
|
pub struct HealthCache {
|
|
inner: RwLock<HealthResponse>,
|
|
has_gpus: RwLock<bool>,
|
|
}
|
|
|
|
impl Default for HealthCache {
|
|
fn default() -> Self {
|
|
Self::new()
|
|
}
|
|
}
|
|
|
|
impl HealthCache {
|
|
pub fn new() -> Self {
|
|
Self {
|
|
inner: RwLock::new(HealthResponse {
|
|
uptime_secs: 0,
|
|
devices: vec![],
|
|
}),
|
|
has_gpus: RwLock::new(false),
|
|
}
|
|
}
|
|
|
|
/// Mark whether this node has GPUs (set after discovery).
|
|
pub async fn set_has_gpus(&self, has_gpus: bool) {
|
|
*self.has_gpus.write().await = has_gpus;
|
|
}
|
|
|
|
/// Get a snapshot of the current health state.
|
|
pub async fn snapshot(&self) -> HealthResponse {
|
|
self.inner.read().await.clone()
|
|
}
|
|
|
|
/// Run forever, polling nvidia-smi every 5 seconds and updating the cache.
|
|
pub async fn poll_loop(&self, start_time: Instant) {
|
|
loop {
|
|
tokio::time::sleep(POLL_INTERVAL).await;
|
|
|
|
let uptime = start_time.elapsed().as_secs();
|
|
|
|
if !*self.has_gpus.read().await {
|
|
let mut health = self.inner.write().await;
|
|
health.uptime_secs = uptime;
|
|
continue;
|
|
}
|
|
|
|
match crate::discovery::query_health().await {
|
|
Ok(devices) => {
|
|
let mut health = self.inner.write().await;
|
|
health.uptime_secs = uptime;
|
|
health.devices = devices;
|
|
}
|
|
Err(e) => {
|
|
tracing::warn!(error = %e, "failed to poll GPU health");
|
|
// Keep last known reading, just update uptime.
|
|
let mut health = self.inner.write().await;
|
|
health.uptime_secs = uptime;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|