//! Cached GPU health monitoring via periodic nvidia-smi polling. use cortex_core::discovery::HealthResponse; use std::time::{Duration, Instant}; use tokio::sync::RwLock; const POLL_INTERVAL: Duration = Duration::from_secs(5); /// Thread-safe cache for the latest GPU health reading. pub struct HealthCache { inner: RwLock, has_gpus: RwLock, } impl Default for HealthCache { fn default() -> Self { Self::new() } } impl HealthCache { pub fn new() -> Self { Self { inner: RwLock::new(HealthResponse { uptime_secs: 0, devices: vec![], }), has_gpus: RwLock::new(false), } } /// Mark whether this node has GPUs (set after discovery). pub async fn set_has_gpus(&self, has_gpus: bool) { *self.has_gpus.write().await = has_gpus; } /// Get a snapshot of the current health state. pub async fn snapshot(&self) -> HealthResponse { self.inner.read().await.clone() } /// Run forever, polling nvidia-smi every 5 seconds and updating the cache. pub async fn poll_loop(&self, start_time: Instant) { loop { tokio::time::sleep(POLL_INTERVAL).await; let uptime = start_time.elapsed().as_secs(); if !*self.has_gpus.read().await { let mut health = self.inner.write().await; health.uptime_secs = uptime; continue; } match crate::discovery::query_health().await { Ok(devices) => { let mut health = self.inner.write().await; health.uptime_secs = uptime; health.devices = devices; } Err(e) => { tracing::warn!(error = %e, "failed to poll GPU health"); // Keep last known reading, just update uptime. let mut health = self.inner.write().await; health.uptime_secs = uptime; } } } } }