feat(neuron): honour HF_HUB_CACHE / HF_HOME for the candle harness cache

Resolves the candle harness's HuggingFace cache directory with the following precedence (first hit wins): 1. Explicit `hf_cache` in `[harness.candle]` from neuron.toml. 2. `HF_HUB_CACHE` env var — the Python `huggingface_hub` convention. The Rust hf-hub crate doesn't read this natively, so we bridge here. 3. `HF_HOME` env var (`$HF_HOME/hub` per the canonical layout). 4. None — falls through to hf-hub's own default. Honouring HF_HUB_CACHE lets a neuron host reuse an existing cache directory shared with Python tooling or other harnesses on the same host without per-tool config. The canonical per-host setup is a systemd drop-in: /etc/systemd/system/neuron.service.d/local.conf [Service] Environment=HF_HUB_CACHE=/archive/hf-cache neuron.example.toml documents the resolution chain inline. script/validate-neuron.sh: bump LOAD_TIMEOUT from 600s to 3600s and expose both load/infer timeouts via env (NEURON_LOAD_TIMEOUT, NEURON_INFER_TIMEOUT). A Qwen3.6-class dense model is ~54 GB and was hitting the 10-min ceiling cold-downloading on a residential link. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 07:52:50 +03:00
parent 62ca125a68
commit b400e8b704
3 changed files with 61 additions and 7 deletions
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -128,6 +128,39 @@ const REPEAT_PENALTY: f32 = 1.1;
 /// penalty. Matches the candle quantized-qwen3 example default.
 const REPEAT_LAST_N: usize = 64;
 /// Resolve the effective HuggingFace cache directory for the candle
 /// harness. Precedence (first hit wins):
 ///
 /// 1. Explicit `hf_cache` from `[harness.candle]` in `neuron.toml`.
 ///    Operator's wishes always win; the value is returned untouched.
 /// 2. `HF_HUB_CACHE` env var. The Python `huggingface_hub` library
 ///    points at the cache root directly with this var; the Rust
 ///    `hf-hub` crate doesn't read it natively, so we bridge here.
 ///    Honouring it lets a neuron host share a cache directory with
 ///    Python tooling and other harnesses without per-tool config.
 /// 3. `HF_HOME` env var. Canonical HuggingFace base directory; the
 ///    cache lives at `$HF_HOME/hub`. `hf-hub` respects this on its own,
 ///    but we resolve it here too so the resulting path shows up in
 ///    logs alongside the explicit/`HF_HUB_CACHE` cases.
 /// 4. `None`. Falls through to `hf-hub`'s default
 ///    (`~/.cache/huggingface/hub`).
 ///
 /// An env var that is set but empty is treated exactly like an unset
 /// one. Values are read with `std::env::var_os`, so a path that is not
 /// valid UTF-8 is still honoured rather than silently skipped.
 fn resolve_hf_cache(explicit: Option<PathBuf>) -> Option<PathBuf> {
    // `var_os` (unlike `var`) does not reject non-Unicode values; paths
    // are OS strings, not `str`, so there is no reason to require UTF-8.
    let non_empty_env = |name: &str| std::env::var_os(name).filter(|v| !v.is_empty());

    // `or_else` keeps the lookups lazy: the environment is only
    // consulted when every higher-precedence source came up empty.
    explicit
        .or_else(|| non_empty_env("HF_HUB_CACHE").map(PathBuf::from))
        .or_else(|| non_empty_env("HF_HOME").map(|home| PathBuf::from(home).join("hub")))
 }
 /// Apply the repetition penalty (if any) to the prediction logits and
 /// then sample. Centralises the prefill / generation-loop call sites
 /// so they share identical sampling behaviour.
@@ -147,6 +180,10 @@ fn sample_with_penalty(
 impl CandleHarness {
    pub fn new(bind_url: String, hf_cache: Option<PathBuf>) -> Self {
        let hf_cache = resolve_hf_cache(hf_cache);
        if let Some(p) = &hf_cache {
            tracing::info!(path = %p.display(), "candle harness using HuggingFace cache");
        }
        Self {
            models: Arc::new(RwLock::new(HashMap::new())),
            hf_cache,
--- a/neuron.example.toml
+++ b/neuron.example.toml
@@ -19,8 +19,21 @@ name = "candle"
 # Optional tuning for the candle harness.
 [harness.candle]
-# HuggingFace cache directory for model weights. When unset, hf-hub's
+# HuggingFace cache directory for model weights.
-# default (~/.cache/huggingface) is used.
+#
 # Resolution order (first hit wins):
 #   1. `hf_cache` here in this file.
 #   2. `HF_HUB_CACHE` env var — same convention as the Python
 #      `huggingface_hub` library, so an existing cache directory shared
 #      with other tooling can be reused without per-tool config.
 #   3. `HF_HOME` env var (the cache then lives at `$HF_HOME/hub`).
 #   4. hf-hub's default (`~/.cache/huggingface/hub`).
 #
 # For per-host overrides (e.g. one neuron has an SSD with prefetched
 # weights), prefer a systemd drop-in over editing this file:
 #   /etc/systemd/system/neuron.service.d/local.conf:
 #     [Service]
 #     Environment=HF_HUB_CACHE=/archive/hf-cache
 # hf_cache = "/var/lib/neuron/hf-cache"
 # -- Default models ----------------------------------------------------------
--- a/script/validate-neuron.sh
+++ b/script/validate-neuron.sh
@@ -45,11 +45,15 @@ EXPECT_SUBSTR='Tbilisi'
 MAX_TOKENS=256
 # /models/load is synchronous — neuron blocks the response until the
-# hf-hub download + GGUF parse + tensor materialisation is done. A
+# hf-hub download + (GGUF parse or safetensors mmap) + tensor
-# fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's
+# materialisation is done. Small GGUF (0.6B-Q4_K_M, ~400 MB) is
-# easily a minute. Pick a generous ceiling.
+# typically a minute on a warm cache, several on a cold one. A
-LOAD_TIMEOUT=600
+# Qwen3.6-class dense model is ~54 GB and can easily take an hour to
-INFER_TIMEOUT=120
+# download cold over a residential link, so default high. Override
 # with NEURON_LOAD_TIMEOUT=N (seconds) for smaller targets if you'd
 # rather fail fast.
 # Each ceiling takes its value from the matching NEURON_* env var when
 # that var is set and non-empty; otherwise the default after `:-` is used.
 LOAD_TIMEOUT="${NEURON_LOAD_TIMEOUT:-3600}"
 INFER_TIMEOUT="${NEURON_INFER_TIMEOUT:-120}"
 # Status messages go to stderr so command substitutions like
 # `raw=$(run_probe)` capture only the function's intended return value