fix(validate-neuron): default to unsloth GGUF + capture curl errors

Two reasons the previous run silently bailed after POST /models/load: 1. Default model was Qwen/Qwen3-0.6B-GGUF (official). That repo ships ONLY Q8_0 — no Q4_K_M, no Q4_0, nothing else. The GGUF filename matcher in CandleHarness::resolve_files returned "no GGUF file matching quant Q4_K_M" and the load endpoint returned an error, but the script used `curl --silent --fail` and swallowed it. 2. /models/load is synchronous (it awaits the full HF download + GGUF parse). curl --max-time 30 was way too short for a 400 MB fresh download. Fixes: - Default model is now unsloth/Qwen3-0.6B-GGUF, which mirrors the full Q-spectrum (Q2_K through Q8_0 plus BF16) so Q4_K_M actually exists. - trigger_load / run_probe now use --write-out to capture HTTP code and emit the response body on non-2xx, so failures surface a real diagnostic instead of an opaque set -e abort. - LOAD_TIMEOUT bumped to 600s; INFER_TIMEOUT to 120s. - Probe payload built via `yq -n` so JSON quoting is reliable regardless of the prompt text. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 08:14:31 +03:00
parent 39010c779f
commit ed4d71db09
1 changed files with 54 additions and 54 deletions
--- a/script/validate-neuron.sh
+++ b/script/validate-neuron.sh
@@ -4,7 +4,7 @@
 #
 # Confirms the daemon is reachable, loads a small public Qwen3 GGUF,
 # fires a reasoning probe at /v1/chat/completions, and prints the
-# answer. Use this to validate the candle harness on a real GPU host
+# answer. Used to validate the candle harness on a real GPU host
 # before trusting it for production traffic, and as a regression test
 # after pushing new neuron builds.
 #
@@ -13,13 +13,15 @@
 #
 # Defaults:
 #   host     = beast.hanzalova.internal
-#   model_id = Qwen/Qwen3-1.7B-GGUF
+#   model_id = unsloth/Qwen3-0.6B-GGUF  (official Qwen3-*-GGUF repos
 #              ship Q8_0 only; unsloth's mirror ships the full Q-spectrum
 #              including Q4_K_M)
 #   quant    = Q4_K_M
 set -euo pipefail
 HOST="${1:-beast.hanzalova.internal}"
-MODEL_ID="${2:-Qwen/Qwen3-1.7B-GGUF}"
+MODEL_ID="${2:-unsloth/Qwen3-0.6B-GGUF}"
 QUANT="${3:-Q4_K_M}"
 PORT="${NEURON_PORT:-13131}"
 BASE="http://${HOST}:${PORT}"
@@ -31,13 +33,12 @@ PROBE_PROMPT='What is the capital of France? Respond with the city name only, no
 EXPECT_SUBSTR='Paris'
 MAX_TOKENS=32
-# Polling cadence while the model loads.
+# /models/load is synchronous — neuron blocks the response until the
-LOAD_POLL_INTERVAL=5
+# hf-hub download + GGUF parse + tensor materialisation is done. A
-LOAD_POLL_MAX=120   # 10 min worst-case for a fresh HF download
+# fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's
-
+# easily a minute. Pick a generous ceiling.
-# ---------------------------------------------------------------------------
+LOAD_TIMEOUT=600
-# helpers
+INFER_TIMEOUT=120
 # ---------------------------------------------------------------------------
 say() { printf '[%s] %s\n' "${HOST}" "$*"; }
 die() { say "FAIL: $*"; exit 1; }
@@ -48,20 +49,18 @@ probe_health() {
 }
 list_loaded_ids() {
-    curl --silent --fail "${BASE}/models" \
+    curl --silent --fail "${BASE}/models" | yq -r '.[].id'
        | yq -r '.[].id'
 }
 is_loaded() {
-    list_loaded_ids | grep -Fxq "${MODEL_ID}"
+    list_loaded_ids 2>/dev/null | grep -Fxq "${MODEL_ID}"
 }
 trigger_load() {
    say "POST /models/load ${MODEL_ID} (quant=${QUANT}, device=[0])"
-    curl --silent --fail --max-time 30 \
+    say "  (synchronous; may take a minute on first run while HF downloads)"
-        -X POST "${BASE}/models/load" \
+    local payload
-        -H 'content-type: application/json' \
+    payload=$(cat <<EOF
        --data-binary @- <<EOF >/dev/null
 {
    "model_id": "${MODEL_ID}",
    "harness": "candle",
@@ -69,43 +68,49 @@ trigger_load() {
    "devices": [0]
 }
 EOF
-}
+    )
-
+    # --write-out captures the response code on a separate line so we
-wait_for_load() {
+    # can surface a real diagnostic instead of relying on --fail.
-    local elapsed=0
+    local resp http_code body
-    while ! is_loaded; do
+    resp=$(curl --silent --show-error --max-time "${LOAD_TIMEOUT}" \
-        if (( elapsed >= LOAD_POLL_MAX )); then
+        --write-out '\n__HTTP__%{http_code}' \
-            die "model did not appear in /models after ${LOAD_POLL_MAX} polls"
+        -X POST "${BASE}/models/load" \
-        fi
+        -H 'content-type: application/json' \
-        sleep "${LOAD_POLL_INTERVAL}"
+        --data "${payload}") || die "curl /models/load failed: $?"
-        elapsed=$(( elapsed + 1 ))
+    http_code=$(echo "${resp}" | grep -oP '(?<=__HTTP__)\d+$' | tail -1)
-        say "still loading... (${elapsed}/${LOAD_POLL_MAX})"
+    body=$(echo "${resp}" | sed '$ s/__HTTP__.*$//')
-    done
+    if [[ "${http_code}" != "200" ]]; then
-    say "model loaded"
+        die "load returned HTTP ${http_code}: ${body}"
    fi
    say "load returned ${http_code}: ${body}"
 }
 run_probe() {
    say "POST /v1/chat/completions (probe: ${PROBE_PROMPT})"
-    local resp
+    local payload
-    resp=$(
+    payload=$(yq -n -c \
-        curl --silent --fail --max-time 120 \
+        --arg model "${MODEL_ID}" \
-            -X POST "${BASE}/v1/chat/completions" \
+        --arg content "${PROBE_PROMPT}" \
-            -H 'content-type: application/json' \
+        --argjson tokens "${MAX_TOKENS}" \
-            --data-binary @- <<EOF
+        '{
-{
+            model: $model,
-    "model": "${MODEL_ID}",
+            messages: [{role: "user", content: $content}],
-    "messages": [{"role": "user", "content": ${PROBE_PROMPT@Q}}],
+            temperature: 0.1,
-    "temperature": 0.1,
+            max_tokens: $tokens
-    "max_tokens": ${MAX_TOKENS}
+        }')
    local resp http_code body
    resp=$(curl --silent --show-error --max-time "${INFER_TIMEOUT}" \
        --write-out '\n__HTTP__%{http_code}' \
        -X POST "${BASE}/v1/chat/completions" \
        -H 'content-type: application/json' \
        --data "${payload}") || die "curl /v1/chat/completions failed: $?"
    http_code=$(echo "${resp}" | grep -oP '(?<=__HTTP__)\d+$' | tail -1)
    body=$(echo "${resp}" | sed '$ s/__HTTP__.*$//')
    if [[ "${http_code}" != "200" ]]; then
        die "inference returned HTTP ${http_code}: ${body}"
    fi
    echo "${body}"
 }
 EOF
    )
    echo "${resp}"
 }
 # ---------------------------------------------------------------------------
 # main
 # ---------------------------------------------------------------------------
 say "validating neuron at ${BASE}"
 probe_health
@@ -114,12 +119,7 @@ say "/health OK"
 if is_loaded; then
    say "${MODEL_ID} already loaded"
 else
    # Note: /models/load returns once the load is initiated. For large
    # models the actual materialisation continues asynchronously; the
    # registry only reflects success once it's complete, hence the
    # subsequent poll loop.
    trigger_load
    wait_for_load
 fi
 raw=$(run_probe)