fix(neuron): surface full anyhow chain + ensure $HOME exists at start

Two fixes uncovered by the live validation against beast/benjy/quadbrat: 1. api.rs swallowed everything beyond the outermost anyhow context. The validation script reported '{"error":"fetch GGUF ...gguf"}' but the actual underlying hf-hub failure (cache dir creation, network, auth, etc.) was hidden. Switching every error response to format!("{e:#}") expands the full cause chain via anyhow's alternate Display format. 2. The neuron systemd unit declared the service user but never ensured /var/lib/neuron (its $HOME) existed. hf-hub defaults its cache to ~/.cache/huggingface/hub — when $HOME is absent the cache dir creation fails and the download aborts. Adding `StateDirectory=neuron` makes systemd create + chown that directory at activation; no spec change needed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
fix(validate-neuron): default to unsloth GGUF + capture curl errors
2026-05-19 08:17:37 +03:00 · 2026-05-19 08:14:31 +03:00 · 2026-05-19 07:58:05 +03:00
3 changed files with 156 additions and 5 deletions
--- a/crates/neuron/src/api.rs
+++ b/crates/neuron/src/api.rs
@@ -56,7 +56,7 @@ async fn list_models(State(state): State<Arc<NeuronState>>) -> impl IntoResponse
        Ok(models) => Json(json!(models)).into_response(),
        Err(e) => (
            StatusCode::INTERNAL_SERVER_ERROR,
-            Json(json!({"error": e.to_string()})),
+            Json(json!({"error": format!("{e:#}")})),
        )
            .into_response(),
    }
@@ -71,7 +71,7 @@ async fn load_model(
        Ok(()) => Json(json!({"status": "loaded"})).into_response(),
        Err(e) => (
            StatusCode::BAD_REQUEST,
-            Json(json!({"error": e.to_string()})),
+            Json(json!({"error": format!("{e:#}")})),
        )
            .into_response(),
    }
@@ -95,7 +95,11 @@ async fn unload_model(
    let registry = state.registry.read().await;
    match registry.unload_model(&model_id).await {
        Ok(()) => Json(json!({"status": "unloaded"})).into_response(),
-        Err(e) => (StatusCode::NOT_FOUND, Json(json!({"error": e.to_string()}))).into_response(),
+        Err(e) => (
+            StatusCode::NOT_FOUND,
+            Json(json!({"error": format!("{e:#}")})),
+        )
+            .into_response(),
    }
 }

@@ -151,7 +155,7 @@ async fn chat_completions(
                .into_response(),
            Err(InferenceError::Other(e)) => (
                StatusCode::INTERNAL_SERVER_ERROR,
-                Json(json!({"error": e.to_string()})),
+                Json(json!({"error": format!("{e:#}")})),
            )
                .into_response(),
        }
@@ -165,7 +169,7 @@ async fn chat_completions(
                .into_response(),
            Err(InferenceError::Other(e)) => (
                StatusCode::INTERNAL_SERVER_ERROR,
-                Json(json!({"error": e.to_string()})),
+                Json(json!({"error": format!("{e:#}")})),
            )
                .into_response(),
        }
--- a/data/neuron.service
+++ b/data/neuron.service
@@ -10,6 +10,12 @@ Restart=on-failure
 RestartSec=5
 User=neuron
 Group=neuron
+# /var/lib/neuron is the neuron user's $HOME — hf-hub writes its
+# default cache there (~/.cache/huggingface/hub). Without this directive
+# systemd doesn't create the directory and hf-hub downloads fail with
+# "fetch GGUF <file>: failed to create cache dir".
+StateDirectory=neuron
+StateDirectoryMode=0755
 # Loading default_models from neuron.toml happens before the HTTP
 # listener binds; large models can take many minutes to download and
 # materialise on first activation. systemd's default TimeoutStartSec
--- a/script/validate-neuron.sh
+++ b/script/validate-neuron.sh
@@ -0,0 +1,141 @@
+#!/usr/bin/env bash
+#
+# End-to-end smoke test for a deployed neuron.
+#
+# Confirms the daemon is reachable, loads a small public Qwen3 GGUF,
+# fires a reasoning probe at /v1/chat/completions, and prints the
+# answer. Used to validate the candle harness on a real GPU host
+# before trusting it for production traffic, and as a regression test
+# after pushing new neuron builds.
+#
+# Usage:
+#   script/validate-neuron.sh [host] [model_id] [quant]
+#
+# Defaults:
+#   host     = beast.hanzalova.internal
+#   model_id = unsloth/Qwen3-0.6B-GGUF  (official Qwen3-*-GGUF repos
+#              ship Q8_0 only; unsloth's mirror ships the full Q-spectrum
+#              including Q4_K_M)
+#   quant    = Q4_K_M
+
+set -euo pipefail  # abort on command failure, on unset variables, and on a failure anywhere in a pipeline
+
+HOST="${1:-beast.hanzalova.internal}"      # positional argument 1
+MODEL_ID="${2:-unsloth/Qwen3-0.6B-GGUF}"   # positional argument 2
+QUANT="${3:-Q4_K_M}"                       # positional argument 3
+PORT="${NEURON_PORT:-13131}"               # overridable through the NEURON_PORT environment variable
+BASE="http://${HOST}:${PORT}"              # URL prefix shared by every request below
+
+# Reasoning probe — concrete, low-temperature answer that small models
+# can still get right. "Paris" is a strong signal of basic competence
+# beyond gibberish.
+PROBE_PROMPT='What is the capital of France? Respond with the city name only, no punctuation.'
+EXPECT_SUBSTR='Paris'   # matched case-insensitively against the assistant's reply
+MAX_TOKENS=32           # sent as max_tokens in the chat completion request
+
+# /models/load is synchronous — neuron blocks the response until the
+# hf-hub download + GGUF parse + tensor materialisation is done. A
+# fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's
+# easily a minute. Pick a generous ceiling.
+LOAD_TIMEOUT=600        # seconds; curl --max-time for POST /models/load
+INFER_TIMEOUT=120       # seconds; curl --max-time for POST /v1/chat/completions
+
+say() { printf '[%s] %s\n' "${HOST}" "$*"; }   # print one line on stdout, prefixed with the target host
+die() { say "FAIL: $*"; exit 1; }              # report a failure through say, then exit with status 1
+
+probe_health() {  # die unless GET /health completes without an HTTP error status within 5 seconds
+    curl --silent --fail --max-time 5 "${BASE}/health" >/dev/null \
+        || die "neuron not reachable at ${BASE}/health"
+}
+
+list_loaded_ids() {  # print the .id of every entry returned by GET /models, one per line; bounded so a stalled daemon cannot hang the script
+    curl --silent --fail --max-time 5 "${BASE}/models" | yq -r '.[].id'
+}
+
+is_loaded() {  # succeeds iff MODEL_ID is an exact line of the listing; captured first so pipefail + grep -q cannot misreport via SIGPIPE
+    local ids; ids=$(list_loaded_ids 2>/dev/null) && grep -Fxq -- "${MODEL_ID}" <<<"${ids}"
+}
+
+trigger_load() {  # POST /models/load for MODEL_ID/QUANT on device 0; dies unless the daemon answers HTTP 200
+    say "POST /models/load ${MODEL_ID} (quant=${QUANT}, device=[0])"
+    say "  (synchronous; may take a minute on first run while HF downloads)"
+    local payload  # built by yq so quotes/backslashes in the arguments are JSON-escaped
+    payload=$(yq -n -c \
+        --arg model_id "${MODEL_ID}" \
+        --arg quant "${QUANT}" \
+        '{
+            model_id: $model_id,
+            harness: "candle",
+            quant: $quant,
+            devices: [0]
+        }')
+    # --write-out captures the response code on a separate line so we
+    # can surface a real diagnostic instead of relying on --fail.
+    local resp http_code body
+    resp=$(curl --silent --show-error --max-time "${LOAD_TIMEOUT}" \
+        --write-out '\n__HTTP__%{http_code}' \
+        -X POST "${BASE}/models/load" \
+        -H 'content-type: application/json' \
+        --data "${payload}") || die "curl /models/load failed: $?"
+    http_code=$(echo "${resp}" | grep -oP '(?<=__HTTP__)\d+$' | tail -1)  # status code from the marker line
+    body=$(echo "${resp}" | sed '$ s/__HTTP__.*$//')                      # response with the marker stripped
+    if [[ "${http_code}" != "200" ]]; then
+        die "load returned HTTP ${http_code}: ${body}"
+    fi
+    say "load returned ${http_code}: ${body}"
+}
+
+run_probe() {  # POST the probe prompt; prints ONLY the response body on stdout, dies unless HTTP 200
+    say "POST /v1/chat/completions (probe: ${PROBE_PROMPT})" >&2  # stderr: stdout is captured by the caller's $(run_probe)
+    local payload
+    payload=$(yq -n -c \
+        --arg model "${MODEL_ID}" \
+        --arg content "${PROBE_PROMPT}" \
+        --argjson tokens "${MAX_TOKENS}" \
+        '{
+            model: $model,
+            messages: [{role: "user", content: $content}],
+            temperature: 0.1,
+            max_tokens: $tokens
+        }')
+    local resp http_code body
+    resp=$(curl --silent --show-error --max-time "${INFER_TIMEOUT}" \
+        --write-out '\n__HTTP__%{http_code}' \
+        -X POST "${BASE}/v1/chat/completions" \
+        -H 'content-type: application/json' \
+        --data "${payload}") || die "curl /v1/chat/completions failed: $?" >&2
+    http_code=$(echo "${resp}" | grep -oP '(?<=__HTTP__)\d+$' | tail -1)  # status code from the marker line
+    body=$(echo "${resp}" | sed '$ s/__HTTP__.*$//')                      # response with the marker stripped
+    if [[ "${http_code}" != "200" ]]; then
+        die "inference returned HTTP ${http_code}: ${body}" >&2  # would otherwise vanish into the capture
+    fi
+    echo "${body}"
+}
+
+say "validating neuron at ${BASE}"
+probe_health
+say "/health OK"
+
+# Only ask the daemon to load the model when it is not already listed.
+if ! is_loaded; then
+    trigger_load
+else
+    say "${MODEL_ID} already loaded"
+fi
+
+raw=$(run_probe)
+echo "---"
+yq -r '.' <<<"${raw}"
+echo "---"
+
+content=$(yq -r '.choices[0].message.content // empty' <<<"${raw}")
+[[ -n "${content}" ]] || die "no content in chat completion response"
+say "assistant said: ${content}"
+
+# Here-strings instead of `echo | …`: no option-like data lost to echo, no SIGPIPE under pipefail.
+if ! grep -qiF -- "${EXPECT_SUBSTR}" <<<"${content}"; then
+    die "response did not contain '${EXPECT_SUBSTR}'"
+fi
+
+say "PASS — response contains expected substring '${EXPECT_SUBSTR}'"
+exit 0