[package]
name = "neuron"
version.workspace = true
edition.workspace = true
license.workspace = true

[lib]
name = "neuron"
path = "src/lib.rs"

[[bin]]
name = "neuron"
path = "src/main.rs"

[features]
default = []

# CUDA acceleration: turns on candle's CUDA backends together with the
# cudarc/nccl bindings used by the TP worker pool. With this feature off,
# candle is built for CPU only, Device::new_cuda calls fall back to CPU,
# and TP Init/sanity requests return
# Error{kind="cuda_feature_not_enabled"}.
cuda = [
    "candle-core/cuda",
    "candle-core/nccl",
    "candle-nn/cuda",
    "candle-transformers/cuda",
    "dep:cudarc",
    "dep:half",
    "dep:cudaforge",
]

# cuDNN-backed convolution / attention kernels. Implies `cuda`.
cudnn = [
    "cuda",
    "candle-core/cudnn",
    "candle-nn/cudnn",
    "candle-transformers/cudnn",
]

# FlashAttention kernels. Implies `cuda`.
flash-attn = [
    "cuda",
    "candle-transformers/flash-attn",
]

# Placeholder for GPU-only integration tests planned for later stages.
cuda-integration = ["cuda"]

[dependencies]
# Inherited from the workspace manifest.
anyhow.workspace = true
async-trait.workspace = true
axum.workspace = true
clap.workspace = true
cortex-core.workspace = true
figment.workspace = true
futures.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-stream.workspace = true
toml.workspace = true
tracing.workspace = true
tracing-subscriber.workspace = true

# In-process inference via candle. CUDA support sits behind this crate's
# `cuda` feature (off by default) so the workspace still builds on hosts
# and CI runners without CUDA.
candle-core = "0.10.2"
candle-nn = "0.10.2"
candle-transformers = "0.10.2"

# Direct cudarc dependency (kept in step with the version candle pulls in
# transitively) so the TP worker pool can call cudarc::nccl::{Comm, Id}
# itself. Only enabled by the `cuda` feature and needs the same toolchain
# as candle's CUDA path.
cudarc = { version = "0.19", optional = true, default-features = false, features = ["nccl", "cuda-version-from-build-system"] }

# Lets the AllReduce CustomOp1 type-dispatch on bf16/f16 candle storages.
# Kept on the same major version candle-core pins so the `half` crate is
# not compiled twice at conflicting versions.
half = { version = "2.5", optional = true }

hf-hub = { version = "0.4", features = ["tokio"] }
tokenizers = { version = "0.22", default-features = false, features = ["onig"] }

[dev-dependencies]
reqwest.workspace = true
tokio = { workspace = true, features = ["test-util"] }

[build-dependencies]
# `build.rs` uses this to compile `src/cuda/*.cu` into `libneuroncuda.a`
# when the `cuda` feature is on. Mirrors mistralrs's upstream build setup
# (their `mistralrs-core/build.rs` uses the same constructor).
cudaforge = { version = "0.1", optional = true }

[package.metadata.docs.rs]
# docs.rs has no nvcc, so keep the CUDA path out of documentation builds.
no-default-features = true