From b400e8b704983a72bdf9b3aa3050cf42e7716497 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Wed, 20 May 2026 07:52:50 +0300 Subject: [PATCH] feat(neuron): honour HF_HUB_CACHE / HF_HOME for the candle harness cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves the candle harness's HuggingFace cache directory with the following precedence (first hit wins): 1. Explicit `hf_cache` in `[harness.candle]` from neuron.toml. 2. `HF_HUB_CACHE` env var — the Python `huggingface_hub` convention. The Rust hf-hub crate doesn't read this natively, so we bridge here. 3. `HF_HOME` env var (`$HF_HOME/hub` per the canonical layout). 4. None — falls through to hf-hub's own default. Honouring HF_HUB_CACHE lets a neuron host reuse an existing cache directory shared with Python tooling or other harnesses on the same host without per-tool config. The canonical per-host setup is a systemd drop-in: /etc/systemd/system/neuron.service.d/local.conf [Service] Environment=HF_HUB_CACHE=/archive/hf-cache neuron.example.toml documents the resolution chain inline. script/validate-neuron.sh: bump LOAD_TIMEOUT from 600s to 3600s and expose both load/infer timeouts via env (NEURON_LOAD_TIMEOUT, NEURON_INFER_TIMEOUT). A Qwen3.6-class dense model is ~54 GB and was hitting the 10-min ceiling cold-downloading on a residential link. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/neuron/src/harness/candle.rs | 37 +++++++++++++++++++++++++++++ neuron.example.toml | 17 +++++++++++-- script/validate-neuron.sh | 14 +++++++---- 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/crates/neuron/src/harness/candle.rs b/crates/neuron/src/harness/candle.rs index f87d74b..ac12e95 100644 --- a/crates/neuron/src/harness/candle.rs +++ b/crates/neuron/src/harness/candle.rs @@ -128,6 +128,39 @@ const REPEAT_PENALTY: f32 = 1.1; /// penalty. Matches the candle quantized-qwen3 example default. 
const REPEAT_LAST_N: usize = 64; +/// Resolve the effective HuggingFace cache directory for the candle +/// harness. Precedence (first hit wins): +/// +/// 1. Explicit `hf_cache` from `[harness.candle]` in `neuron.toml`. +/// Operator's wishes always win. +/// 2. `HF_HUB_CACHE` env var. The Python `huggingface_hub` library +/// points at the cache root directly with this var; the Rust +/// `hf-hub` crate doesn't read it natively, so we bridge here. +/// Honouring it lets a neuron host share a cache directory with +/// Python tooling and other harnesses without per-tool config. +/// 3. `HF_HOME` env var. Canonical HuggingFace base directory; the +/// cache lives at `$HF_HOME/hub`. Hf-hub respects this on its own, +/// but we resolve it here too so the resulting path shows up in +/// logs alongside the explicit/HF_HUB_CACHE cases. +/// 4. `None`. Falls through to `hf-hub`'s default +/// (`~/.cache/huggingface/hub`). +fn resolve_hf_cache(explicit: Option<PathBuf>) -> Option<PathBuf> { + if let Some(p) = explicit { + return Some(p); + } + if let Ok(v) = std::env::var("HF_HUB_CACHE") + && !v.is_empty() + { + return Some(PathBuf::from(v)); + } + if let Ok(v) = std::env::var("HF_HOME") + && !v.is_empty() + { + return Some(PathBuf::from(v).join("hub")); + } + None +} + /// Apply the repetition penalty (if any) to the prediction logits and /// then sample. Centralises the prefill / generation-loop call sites /// so they share identical sampling behaviour. 
@@ -147,6 +180,10 @@ fn sample_with_penalty( impl CandleHarness { pub fn new(bind_url: String, hf_cache: Option<PathBuf>) -> Self { + let hf_cache = resolve_hf_cache(hf_cache); + if let Some(p) = &hf_cache { + tracing::info!(path = %p.display(), "candle harness using HuggingFace cache"); + } Self { models: Arc::new(RwLock::new(HashMap::new())), hf_cache, diff --git a/neuron.example.toml b/neuron.example.toml index 46108a6..a842f5b 100644 --- a/neuron.example.toml +++ b/neuron.example.toml @@ -19,8 +19,21 @@ name = "candle" # Optional tuning for the candle harness. [harness.candle] -# HuggingFace cache directory for model weights. When unset, hf-hub's -# default (~/.cache/huggingface) is used. +# HuggingFace cache directory for model weights. +# +# Resolution order (first hit wins): +# 1. `hf_cache` here in this file. +# 2. `HF_HUB_CACHE` env var — same convention as the Python +# `huggingface_hub` library, so an existing cache directory shared +# with other tooling can be reused without per-tool config. +# 3. `HF_HOME` env var (cache appended as `$HF_HOME/hub`). +# 4. hf-hub's default (`~/.cache/huggingface/hub`). +# +# For per-host overrides (e.g. one neuron has an SSD with prefetched +# weights), prefer a systemd drop-in over editing this file: +# /etc/systemd/system/neuron.service.d/local.conf: +# [Service] +# Environment=HF_HUB_CACHE=/archive/hf-cache # hf_cache = "/var/lib/neuron/hf-cache" # -- Default models ---------------------------------------------------------- diff --git a/script/validate-neuron.sh b/script/validate-neuron.sh index ceacd0c..bdcaca6 100755 --- a/script/validate-neuron.sh +++ b/script/validate-neuron.sh @@ -45,11 +45,15 @@ EXPECT_SUBSTR='Tbilisi' MAX_TOKENS=256 # /models/load is synchronous — neuron blocks the response until the -# hf-hub download + GGUF parse + tensor materialisation is done. A -# fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's -# easily a minute. Pick a generous ceiling. 
-LOAD_TIMEOUT=600 -INFER_TIMEOUT=120 +# hf-hub download + (GGUF parse or safetensors mmap) + tensor +# materialisation is done. Small GGUF (0.6B-Q4_K_M, ~400 MB) is +# typically a minute on a warm cache, several on a cold one. A +# Qwen3.6-class dense model is ~54 GB and can easily take an hour to +# download cold over a residential link, so default high. Override +# with NEURON_LOAD_TIMEOUT=N (seconds) for smaller targets if you'd +# rather fail fast. +LOAD_TIMEOUT="${NEURON_LOAD_TIMEOUT:-3600}" +INFER_TIMEOUT="${NEURON_INFER_TIMEOUT:-120}" # Status messages go to stderr so command substitutions like # `raw=$(run_probe)` capture only the function's intended return value