feat(neuron): honour HF_HUB_CACHE / HF_HOME for the candle harness cache

Resolves the candle harness's HuggingFace cache directory with the following precedence (first hit wins):

1. Explicit `hf_cache` in `[harness.candle]` from neuron.toml.
2. `HF_HUB_CACHE` env var — the Python `huggingface_hub` convention. The Rust hf-hub crate doesn't read this natively, so we bridge here.
3. `HF_HOME` env var (`$HF_HOME/hub` per the canonical layout).
4. None — falls through to hf-hub's own default.

Honouring HF_HUB_CACHE lets a neuron host reuse an existing cache directory shared with Python tooling or other harnesses on the same host without per-tool config. The canonical per-host setup is a systemd drop-in:

    /etc/systemd/system/neuron.service.d/local.conf
    [Service]
    Environment=HF_HUB_CACHE=/archive/hf-cache

neuron.example.toml documents the resolution chain inline.

script/validate-neuron.sh: bump LOAD_TIMEOUT from 600s to 3600s and expose both load/infer timeouts via env (NEURON_LOAD_TIMEOUT, NEURON_INFER_TIMEOUT). A Qwen3.6-class dense model is ~54 GB and was hitting the 10-min ceiling cold-downloading on a residential link.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 07:52:50 +03:00
parent 62ca125a68
commit b400e8b704
3 changed files with 61 additions and 7 deletions
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -128,6 +128,39 @@ const REPEAT_PENALTY: f32 = 1.1;
 /// penalty. Matches the candle quantized-qwen3 example default.
 const REPEAT_LAST_N: usize = 64;

+/// Resolve the effective HuggingFace cache directory for the candle
+/// harness. Precedence (first hit wins):
+///
+/// 1. Explicit `hf_cache` from `[harness.candle]` in `neuron.toml`.
+///    Operator's wishes always win.
+/// 2. `HF_HUB_CACHE` env var. Python's `huggingface_hub` points this
+///    straight at the cache root; the Rust `hf-hub` crate doesn't read
+///    it, so we bridge here, letting a host share one cache with
+///    Python tooling and other harnesses without per-tool config.
+/// 3. `HF_HOME` env var. Canonical HuggingFace base directory; the
+///    cache lives at `$HF_HOME/hub`. `hf-hub` respects this on its own,
+///    but we resolve it here too so the caller can log the path.
+/// 4. `None`, deferring to `hf-hub`'s default (`~/.cache/huggingface/hub`).
+///
+/// An env var that is unset or empty is skipped. Values are read as
+/// `OsString`s, so a path that is not valid UTF-8 is still honoured.
+fn resolve_hf_cache(explicit: Option<PathBuf>) -> Option<PathBuf> {
+    // `var_os` instead of `var`: a Unix path need not be valid UTF-8,
+    // and `var` would make such a value look unset. An empty value is
+    // treated as unset.
+    fn env_dir(key: &str) -> Option<PathBuf> {
+        std::env::var_os(key)
+            .filter(|raw| !raw.is_empty())
+            .map(PathBuf::from)
+    }
+
+    // `or_else` keeps the lookups lazy, so a later source is only
+    // consulted when every earlier one came up empty.
+    explicit
+        .or_else(|| env_dir("HF_HUB_CACHE"))
+        .or_else(|| env_dir("HF_HOME").map(|home| home.join("hub")))
+}
+
 /// Apply the repetition penalty (if any) to the prediction logits and
 /// then sample. Centralises the prefill / generation-loop call sites
 /// so they share identical sampling behaviour.
@@ -147,6 +180,10 @@ fn sample_with_penalty(

 impl CandleHarness {
    pub fn new(bind_url: String, hf_cache: Option<PathBuf>) -> Self {
+        let hf_cache = resolve_hf_cache(hf_cache);
+        if let Some(p) = &hf_cache {
+            tracing::info!(path = %p.display(), "candle harness using HuggingFace cache");
+        }
        Self {
            models: Arc::new(RwLock::new(HashMap::new())),
            hf_cache,
--- a/neuron.example.toml
+++ b/neuron.example.toml
@@ -19,8 +19,21 @@ name = "candle"
 # Optional tuning for the candle harness.

 [harness.candle]
-# HuggingFace cache directory for model weights. When unset, hf-hub's
-# default (~/.cache/huggingface) is used.
+# HuggingFace cache directory for model weights.
+#
+# Resolution order (first hit wins):
+#   1. `hf_cache` here in this file.
+#   2. `HF_HUB_CACHE` env var — same convention as the Python
+#      `huggingface_hub` library, so an existing cache directory shared
+#      with other tooling can be reused without per-tool config.
+#   3. `HF_HOME` env var (cache appended as `$HF_HOME/hub`).
+#   4. hf-hub's default (`~/.cache/huggingface/hub`).
+#
+# For per-host overrides (e.g. one neuron has an SSD with prefetched
+# weights), leave `hf_cache` unset and use a systemd drop-in instead:
+#   /etc/systemd/system/neuron.service.d/local.conf:
+#     [Service]
+#     Environment=HF_HUB_CACHE=/archive/hf-cache
 # hf_cache = "/var/lib/neuron/hf-cache"

 # -- Default models ----------------------------------------------------------
--- a/script/validate-neuron.sh
+++ b/script/validate-neuron.sh
@@ -45,11 +45,15 @@ EXPECT_SUBSTR='Tbilisi'
 MAX_TOKENS=256

 # /models/load is synchronous — neuron blocks the response until the
-# hf-hub download + GGUF parse + tensor materialisation is done. A
-# fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's
-# easily a minute. Pick a generous ceiling.
-LOAD_TIMEOUT=600
-INFER_TIMEOUT=120
+# hf-hub download + (GGUF parse or safetensors mmap) + tensor
+# materialisation is done. A small GGUF (0.6B-Q4_K_M, ~400 MB)
+# typically loads in a minute on a warm cache, several on a cold one. A
+# Qwen3.6-class dense model is ~54 GB and can easily take an hour to
+# download cold over a residential link, so default high. Override
+# with NEURON_LOAD_TIMEOUT=N (seconds) for smaller targets if you'd
+# rather fail fast.
+LOAD_TIMEOUT="${NEURON_LOAD_TIMEOUT:-3600}"
+INFER_TIMEOUT="${NEURON_INFER_TIMEOUT:-120}"

 # Status messages go to stderr so command substitutions like
 # `raw=$(run_probe)` capture only the function's intended return value