fix(validate-neuron): default to unsloth GGUF + capture curl errors

Two reasons the previous run silently bailed after POST /models/load:

1. Default model was Qwen/Qwen3-1.7B-GGUF (official). That repo ships
   ONLY Q8_0 — no Q4_K_M, no Q4_0, nothing else. The GGUF filename
   matcher in CandleHarness::resolve_files returned "no GGUF file
   matching quant Q4_K_M" and the load endpoint returned an error,
   but the script used `curl --silent --fail` and swallowed it.

2. /models/load is synchronous (it awaits the full HF download + GGUF
   parse). curl --max-time 30 was way too short for a 400 MB fresh
   download.

Fixes:
- Default model is now unsloth/Qwen3-0.6B-GGUF, which mirrors the
  full Q-spectrum (Q2_K through Q8_0 plus BF16) so Q4_K_M actually
  exists.
- trigger_load / run_probe now use --write-out to capture the HTTP
  code and emit the response body on any non-200 response, so failures
  surface a real diagnostic instead of an opaque set -e abort.
- Load timeout raised from 30s to 600s (new LOAD_TIMEOUT); the
  inference timeout stays at 120s but is now the named INFER_TIMEOUT.
- Probe payload built via `yq -n` so JSON quoting is reliable
  regardless of the prompt text.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-19 08:14:31 +03:00
parent 39010c779f
commit ed4d71db09

View File

@@ -4,7 +4,7 @@
# #
# Confirms the daemon is reachable, loads a small public Qwen3 GGUF, # Confirms the daemon is reachable, loads a small public Qwen3 GGUF,
# fires a reasoning probe at /v1/chat/completions, and prints the # fires a reasoning probe at /v1/chat/completions, and prints the
# answer. Use this to validate the candle harness on a real GPU host # answer. Used to validate the candle harness on a real GPU host
# before trusting it for production traffic, and as a regression test # before trusting it for production traffic, and as a regression test
# after pushing new neuron builds. # after pushing new neuron builds.
# #
@@ -13,13 +13,15 @@
# #
# Defaults: # Defaults:
# host = beast.hanzalova.internal # host = beast.hanzalova.internal
# model_id = Qwen/Qwen3-1.7B-GGUF # model_id = unsloth/Qwen3-0.6B-GGUF (official Qwen3-*-GGUF repos
# ship Q8_0 only; unsloth's mirror ships the full Q-spectrum
# including Q4_K_M)
# quant = Q4_K_M # quant = Q4_K_M
set -euo pipefail set -euo pipefail
HOST="${1:-beast.hanzalova.internal}" HOST="${1:-beast.hanzalova.internal}"
MODEL_ID="${2:-Qwen/Qwen3-1.7B-GGUF}" MODEL_ID="${2:-unsloth/Qwen3-0.6B-GGUF}"
QUANT="${3:-Q4_K_M}" QUANT="${3:-Q4_K_M}"
PORT="${NEURON_PORT:-13131}" PORT="${NEURON_PORT:-13131}"
BASE="http://${HOST}:${PORT}" BASE="http://${HOST}:${PORT}"
@@ -31,13 +33,12 @@ PROBE_PROMPT='What is the capital of France? Respond with the city name only, no
EXPECT_SUBSTR='Paris' EXPECT_SUBSTR='Paris'
MAX_TOKENS=32 MAX_TOKENS=32
# Polling cadence while the model loads. # /models/load is synchronous — neuron blocks the response until the
LOAD_POLL_INTERVAL=5 # hf-hub download + GGUF parse + tensor materialisation is done. A
LOAD_POLL_MAX=120 # 10 min worst-case for a fresh HF download # fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's
# easily a minute. Pick a generous ceiling.
# --------------------------------------------------------------------------- LOAD_TIMEOUT=600
# helpers INFER_TIMEOUT=120
# ---------------------------------------------------------------------------
say() { printf '[%s] %s\n' "${HOST}" "$*"; } say() { printf '[%s] %s\n' "${HOST}" "$*"; }
die() { say "FAIL: $*"; exit 1; } die() { say "FAIL: $*"; exit 1; }
@@ -48,20 +49,18 @@ probe_health() {
} }
list_loaded_ids() { list_loaded_ids() {
curl --silent --fail "${BASE}/models" \ curl --silent --fail "${BASE}/models" | yq -r '.[].id'
| yq -r '.[].id'
} }
is_loaded() { is_loaded() {
list_loaded_ids | grep -Fxq "${MODEL_ID}" list_loaded_ids 2>/dev/null | grep -Fxq "${MODEL_ID}"
} }
trigger_load() { trigger_load() {
say "POST /models/load ${MODEL_ID} (quant=${QUANT}, device=[0])" say "POST /models/load ${MODEL_ID} (quant=${QUANT}, device=[0])"
curl --silent --fail --max-time 30 \ say " (synchronous; may take a minute on first run while HF downloads)"
-X POST "${BASE}/models/load" \ local payload
-H 'content-type: application/json' \ payload=$(cat <<EOF
--data-binary @- <<EOF >/dev/null
{ {
"model_id": "${MODEL_ID}", "model_id": "${MODEL_ID}",
"harness": "candle", "harness": "candle",
@@ -69,43 +68,49 @@ trigger_load() {
"devices": [0] "devices": [0]
} }
EOF EOF
} )
# --write-out captures the response code on a separate line so we
wait_for_load() { # can surface a real diagnostic instead of relying on --fail.
local elapsed=0 local resp http_code body
while ! is_loaded; do resp=$(curl --silent --show-error --max-time "${LOAD_TIMEOUT}" \
if (( elapsed >= LOAD_POLL_MAX )); then --write-out '\n__HTTP__%{http_code}' \
die "model did not appear in /models after ${LOAD_POLL_MAX} polls" -X POST "${BASE}/models/load" \
fi -H 'content-type: application/json' \
sleep "${LOAD_POLL_INTERVAL}" --data "${payload}") || die "curl /models/load failed: $?"
elapsed=$(( elapsed + 1 )) http_code=$(echo "${resp}" | grep -oP '(?<=__HTTP__)\d+$' | tail -1)
say "still loading... (${elapsed}/${LOAD_POLL_MAX})" body=$(echo "${resp}" | sed '$ s/__HTTP__.*$//')
done if [[ "${http_code}" != "200" ]]; then
say "model loaded" die "load returned HTTP ${http_code}: ${body}"
fi
say "load returned ${http_code}: ${body}"
} }
run_probe() { run_probe() {
say "POST /v1/chat/completions (probe: ${PROBE_PROMPT})" say "POST /v1/chat/completions (probe: ${PROBE_PROMPT})"
local resp local payload
resp=$( payload=$(yq -n -c \
curl --silent --fail --max-time 120 \ --arg model "${MODEL_ID}" \
-X POST "${BASE}/v1/chat/completions" \ --arg content "${PROBE_PROMPT}" \
-H 'content-type: application/json' \ --argjson tokens "${MAX_TOKENS}" \
--data-binary @- <<EOF '{
{ model: $model,
"model": "${MODEL_ID}", messages: [{role: "user", content: $content}],
"messages": [{"role": "user", "content": ${PROBE_PROMPT@Q}}], temperature: 0.1,
"temperature": 0.1, max_tokens: $tokens
"max_tokens": ${MAX_TOKENS} }')
local resp http_code body
resp=$(curl --silent --show-error --max-time "${INFER_TIMEOUT}" \
--write-out '\n__HTTP__%{http_code}' \
-X POST "${BASE}/v1/chat/completions" \
-H 'content-type: application/json' \
--data "${payload}") || die "curl /v1/chat/completions failed: $?"
http_code=$(echo "${resp}" | grep -oP '(?<=__HTTP__)\d+$' | tail -1)
body=$(echo "${resp}" | sed '$ s/__HTTP__.*$//')
if [[ "${http_code}" != "200" ]]; then
die "inference returned HTTP ${http_code}: ${body}"
fi
echo "${body}"
} }
EOF
)
echo "${resp}"
}
# ---------------------------------------------------------------------------
# main
# ---------------------------------------------------------------------------
say "validating neuron at ${BASE}" say "validating neuron at ${BASE}"
probe_health probe_health
@@ -114,12 +119,7 @@ say "/health OK"
if is_loaded; then if is_loaded; then
say "${MODEL_ID} already loaded" say "${MODEL_ID} already loaded"
else else
# Note: /models/load returns once the load is initiated. For large
# models the actual materialisation continues asynchronously; the
# registry only reflects success once it's complete, hence the
# subsequent poll loop.
trigger_load trigger_load
wait_for_load
fi fi
raw=$(run_probe) raw=$(run_probe)