feat: add neuron daemon with GPU discovery and health endpoints

Replace cortex-agent stub with neuron (cortex-neuron binary).

cortex-core additions:
- discovery.rs: DeviceInfo, DiscoveryResponse, DeviceHealth, HealthResponse
- harness.rs: Harness async trait, HarnessConfig, HarnessHealth, ModelSpec, ModelInfo

neuron crate (crates/neuron/):
- discovery.rs: nvidia-smi CSV parsing (pure functions) + system discovery via uname/nvidia-smi/nvcc
- health.rs: cached GPU health polling every 5s
- api.rs: GET /discovery and GET /health axum handlers
- main.rs: CLI entrypoint with --port flag (default 9090)
- harness stubs for mistralrs (Phase 8) and llamacpp (Phase 11)

12 new tests (9 unit + 3 integration), 35 total.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 14:23:42 +03:00
parent 67b9b044d3
commit 6dc717ebcd
22 changed files with 1239 additions and 112 deletions
--- a/crates/cortex-core/Cargo.toml
+++ b/crates/cortex-core/Cargo.toml
@@ -13,3 +13,4 @@ chrono.workspace = true
 anyhow.workspace = true
 thiserror.workspace = true
 tracing.workspace = true
+async-trait.workspace = true
--- a/crates/cortex-core/src/discovery.rs
+++ b/crates/cortex-core/src/discovery.rs
@@ -0,0 +1,43 @@
+//! Hardware discovery and health types shared between cortex and neuron.
+
+use serde::{Deserialize, Serialize};
+
+/// Information about a single GPU device discovered on a node.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DeviceInfo {
+    pub index: u32, // presumably the nvidia-smi device index — confirm in neuron
+    pub name: String, // presumably the GPU product name — confirm
+    pub vram_total_mb: u64, // total VRAM; MB per the name (MB vs MiB: TODO confirm)
+    pub compute_capability: String, // a string, not a number; likely "major.minor" — confirm
+}
+
+/// Full discovery response from a neuron endpoint.
+/// Returned by `GET /discovery`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DiscoveryResponse {
+    pub hostname: String,
+    pub os: String, // NOTE(review): os/kernel presumably come from uname — confirm
+    pub kernel: String,
+    pub cuda_version: Option<String>, // optional; presumably None when CUDA is not detected
+    pub driver_version: Option<String>, // optional; presumably None when no driver is found
+    pub devices: Vec<DeviceInfo>, // one entry per discovered GPU
+    pub harnesses: Vec<String>, // harness names; presumably match `Harness::name()` — confirm
+}
+
+/// Runtime health metrics for a single GPU device.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DeviceHealth {
+    pub index: u32, // presumably matches `DeviceInfo::index` — confirm
+    pub vram_used_mb: u64, // MB per the field name
+    pub vram_free_mb: u64, // MB per the field name
+    pub utilization_pct: u32, // percent per the name; presumably 0–100 — confirm
+    pub temp_c: u32, // degrees Celsius per the name; unsigned, so sub-zero is unrepresentable
+}
+
+/// Runtime health response from a neuron endpoint.
+/// Returned by `GET /health`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HealthResponse {
+    pub uptime_secs: u64, // seconds; whether this is daemon or host uptime is not shown here
+    pub devices: Vec<DeviceHealth>, // one entry per GPU; presumably from a cached poll — confirm
+}
--- a/crates/cortex-core/src/harness.rs
+++ b/crates/cortex-core/src/harness.rs
@@ -0,0 +1,76 @@
+//! Harness trait and supporting types for inference engine management.
+//!
+//! Defined in cortex-core so both cortex (control plane) and neuron
+//! (node plane) share the type definitions. neuron provides the
+//! runtime implementations.
+
+use anyhow::Result;
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+
+/// Configuration for a harness instance on a neuron.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HarnessConfig {
+    pub name: String, // harness name; presumably the value of `Harness::name()` — confirm
+    /// Base URL of the harness (e.g. "http://localhost:8080" for mistral.rs).
+    pub endpoint: Option<String>,
+    /// Systemd unit name, if the harness is managed via systemd.
+    pub systemd_unit: Option<String>,
+}
+
+/// Health status of a harness process.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HarnessHealth {
+    pub name: String, // harness name; presumably the value of `Harness::name()` — confirm
+    pub running: bool,
+    pub uptime_secs: Option<u64>, // optional; presumably None when the harness is not running
+}
+
+/// Specification for loading a model through a harness.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelSpec {
+    pub model_id: String,
+    pub harness: String, // presumably the name of the harness that should load it — confirm
+    pub quant: Option<String>, // quantization; free-form string, accepted values not shown here
+    pub tensor_parallel: Option<u32>, // TP setting; how None is treated is not shown — confirm
+    pub devices: Option<Vec<u32>>, // device assignment; presumably `DeviceInfo::index` values
+}
+
+/// A model as reported by a harness.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelInfo {
+    pub id: String, // presumably the same identifier as `ModelSpec::model_id` — confirm
+    pub harness: String, // presumably the name of the reporting harness — confirm
+    pub status: String, // free-form; the set of status values is not defined in this file
+    pub devices: Vec<u32>, // presumably `DeviceInfo::index` values — confirm
+    pub vram_used_mb: Option<u64>, // optional; MB per the field name
+}
+
+/// What an inference harness must do, from neuron's perspective. Implementors are `Send + Sync`.
+#[async_trait]
+pub trait Harness: Send + Sync {
+    /// Human-readable name (e.g. "mistralrs", "llamacpp", "comfyui").
+    fn name(&self) -> &str;
+
+    /// Start the harness process if it is not already running. Returns `Err` on failure.
+    async fn start(&self, config: &HarnessConfig) -> Result<()>;
+
+    /// Stop the harness process gracefully. Returns `Err` on failure.
+    async fn stop(&self) -> Result<()>;
+
+    /// Health check. Returns the harness process status; infallible by signature (no `Result`).
+    async fn health(&self) -> HarnessHealth;
+
+    /// List models the harness knows about (loaded + unloaded). Returns `Err` on failure.
+    async fn list_models(&self) -> Result<Vec<ModelInfo>>;
+
+    /// Load a model with the given spec (quant, TP, device assignment). Returns `Err` on failure.
+    async fn load_model(&self, spec: &ModelSpec) -> Result<()>;
+
+    /// Unload a model, freeing device memory. Returns `Err` on failure.
+    async fn unload_model(&self, model_id: &str) -> Result<()>;
+
+    /// Return the URL where inference requests for this model should
+    /// be sent. `None` if the model is not loaded.
+    async fn inference_endpoint(&self, model_id: &str) -> Option<String>;
+}
--- a/crates/cortex-core/src/lib.rs
+++ b/crates/cortex-core/src/lib.rs
@@ -1,5 +1,7 @@
 pub mod anthropic;
 pub mod config;
+pub mod discovery;
+pub mod harness;
 pub mod metrics;
 pub mod node;
 pub mod openai;