/// Resolve the effective HuggingFace cache directory for the candle
/// harness.
///
/// Precedence (first hit wins):
///
/// 1. Explicit `hf_cache` from `[harness.candle]` in `neuron.toml`.
///    The operator's wishes always win.
/// 2. `HF_HUB_CACHE` env var. The Python `huggingface_hub` library
///    points at the cache root directly with this var; the Rust
///    `hf-hub` crate doesn't read it natively, so we bridge here.
///    Honouring it lets a neuron host share a cache directory with
///    Python tooling and other harnesses without per-tool config.
/// 3. `HF_HOME` env var. Canonical HuggingFace base directory; the
///    cache lives at `$HF_HOME/hub`. `hf-hub` respects this on its own,
///    but we resolve it here too so the resulting path shows up in
///    logs alongside the explicit/`HF_HUB_CACHE` cases.
/// 4. `None`. Falls through to `hf-hub`'s default
///    (`~/.cache/huggingface/hub`).
///
/// An env var that is set to the empty string is treated as unset.
/// Values are read with `var_os`, so a cache path that is not valid
/// UTF-8 is still honoured instead of being silently skipped.
fn resolve_hf_cache(explicit: Option<PathBuf>) -> Option<PathBuf> {
    // Read `name` as a path; an empty value counts the same as "not set".
    let env_path = |name: &str| {
        std::env::var_os(name)
            .filter(|value| !value.is_empty())
            .map(PathBuf::from)
    };

    // `or_else` keeps the lookups lazy: later sources are only consulted
    // when every earlier one came up empty.
    explicit
        .or_else(|| env_path("HF_HUB_CACHE"))
        .or_else(|| env_path("HF_HOME").map(|home| home.join("hub")))
}
@@ -147,6 +180,10 @@ fn sample_with_penalty( impl CandleHarness { pub fn new(bind_url: String, hf_cache: Option) -> Self { + let hf_cache = resolve_hf_cache(hf_cache); + if let Some(p) = &hf_cache { + tracing::info!(path = %p.display(), "candle harness using HuggingFace cache"); + } Self { models: Arc::new(RwLock::new(HashMap::new())), hf_cache, diff --git a/neuron.example.toml b/neuron.example.toml index 46108a6..a842f5b 100644 --- a/neuron.example.toml +++ b/neuron.example.toml @@ -19,8 +19,21 @@ name = "candle" # Optional tuning for the candle harness. [harness.candle] -# HuggingFace cache directory for model weights. When unset, hf-hub's -# default (~/.cache/huggingface) is used. +# HuggingFace cache directory for model weights. +# +# Resolution order (first hit wins): +# 1. `hf_cache` here in this file. +# 2. `HF_HUB_CACHE` env var — same convention as the Python +# `huggingface_hub` library, so an existing cache directory shared +# with other tooling can be reused without per-tool config. +# 3. `HF_HOME` env var (cache appended as `$HF_HOME/hub`). +# 4. hf-hub's default (`~/.cache/huggingface/hub`). +# +# For per-host overrides (e.g. one neuron has an SSD with prefetched +# weights), prefer a systemd drop-in over editing this file: +# /etc/systemd/system/neuron.service.d/local.conf: +# [Service] +# Environment=HF_HUB_CACHE=/archive/hf-cache # hf_cache = "/var/lib/neuron/hf-cache" # -- Default models ---------------------------------------------------------- diff --git a/script/validate-neuron.sh b/script/validate-neuron.sh index ceacd0c..bdcaca6 100755 --- a/script/validate-neuron.sh +++ b/script/validate-neuron.sh @@ -45,11 +45,15 @@ EXPECT_SUBSTR='Tbilisi' MAX_TOKENS=256 # /models/load is synchronous — neuron blocks the response until the -# hf-hub download + GGUF parse + tensor materialisation is done. A -# fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's -# easily a minute. Pick a generous ceiling. 
# hf-hub download + (GGUF parse or safetensors mmap) + tensor
# materialisation is done. A small GGUF (0.6B-Q4_K_M, ~400 MB) is
# typically a minute on a warm cache, several on a cold one. A
# Qwen3.6-class dense model is ~54 GB and can easily take an hour to
# download cold over a residential link, so the load timeout defaults
# high. Override with NEURON_LOAD_TIMEOUT=N (seconds) for smaller
# targets if you'd rather fail fast; NEURON_INFER_TIMEOUT=N overrides
# INFER_TIMEOUT the same way.
#
# `${VAR:-default}` falls back to the default when VAR is unset or empty.
LOAD_TIMEOUT="${NEURON_LOAD_TIMEOUT:-3600}"
INFER_TIMEOUT="${NEURON_INFER_TIMEOUT:-120}"

# Status messages go to stderr so command substitutions like
# `raw=$(run_probe)` capture only the function's intended return value