From ed4d71db09dccf65e813a46018da433ecc26ca3d Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Tue, 19 May 2026 08:14:31 +0300 Subject: [PATCH] fix(validate-neuron): default to unsloth GGUF + capture curl errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two reasons the previous run silently bailed after POST /models/load: 1. Default model was Qwen/Qwen3-0.6B-GGUF (official). That repo ships ONLY Q8_0 — no Q4_K_M, no Q4_0, nothing else. The GGUF filename matcher in CandleHarness::resolve_files returned "no GGUF file matching quant Q4_K_M" and the load endpoint returned an error, but the script used `curl --silent --fail` and swallowed it. 2. /models/load is synchronous (it awaits the full HF download + GGUF parse). curl --max-time 30 was way too short for a 400 MB fresh download. Fixes: - Default model is now unsloth/Qwen3-0.6B-GGUF, which mirrors the full Q-spectrum (Q2_K through Q8_0 plus BF16) so Q4_K_M actually exists. - trigger_load / run_probe now use --write-out to capture HTTP code and emit the response body on non-2xx, so failures surface a real diagnostic instead of an opaque set -e abort. - LOAD_TIMEOUT bumped to 600s; INFER_TIMEOUT to 120s. - Probe payload built via `yq -n` so JSON quoting is reliable regardless of the prompt text. Co-Authored-By: Claude Opus 4.7 (1M context) --- script/validate-neuron.sh | 108 +++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/script/validate-neuron.sh b/script/validate-neuron.sh index 6666ef5..1252443 100755 --- a/script/validate-neuron.sh +++ b/script/validate-neuron.sh @@ -4,7 +4,7 @@ # # Confirms the daemon is reachable, loads a small public Qwen3 GGUF, # fires a reasoning probe at /v1/chat/completions, and prints the -# answer. Use this to validate the candle harness on a real GPU host +# answer. Used to validate the candle harness on a real GPU host # before trusting it for production traffic, and as a regression test # after pushing new neuron builds. # @@ -13,13 +13,15 @@ # # Defaults: # host = beast.hanzalova.internal -# model_id = Qwen/Qwen3-1.7B-GGUF +# model_id = unsloth/Qwen3-0.6B-GGUF (official Qwen3-*-GGUF repos +# ship Q8_0 only; unsloth's mirror ships the full Q-spectrum +# including Q4_K_M) # quant = Q4_K_M set -euo pipefail HOST="${1:-beast.hanzalova.internal}" -MODEL_ID="${2:-Qwen/Qwen3-1.7B-GGUF}" +MODEL_ID="${2:-unsloth/Qwen3-0.6B-GGUF}" QUANT="${3:-Q4_K_M}" PORT="${NEURON_PORT:-13131}" BASE="http://${HOST}:${PORT}" @@ -31,13 +33,12 @@ PROBE_PROMPT='What is the capital of France? Respond with the city name only, no EXPECT_SUBSTR='Paris' MAX_TOKENS=32 -# Polling cadence while the model loads. -LOAD_POLL_INTERVAL=5 -LOAD_POLL_MAX=120 # 10 min worst-case for a fresh HF download - -# --------------------------------------------------------------------------- -# helpers -# --------------------------------------------------------------------------- +# /models/load is synchronous — neuron blocks the response until the +# hf-hub download + GGUF parse + tensor materialisation is done. A +# fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's +# easily a minute. Pick a generous ceiling. +LOAD_TIMEOUT=600 +INFER_TIMEOUT=120 say() { printf '[%s] %s\n' "${HOST}" "$*"; } die() { say "FAIL: $*"; exit 1; } @@ -48,20 +49,18 @@ probe_health() { } list_loaded_ids() { - curl --silent --fail "${BASE}/models" \ - | yq -r '.[].id' + curl --silent --fail "${BASE}/models" | yq -r '.[].id' } is_loaded() { - list_loaded_ids | grep -Fxq "${MODEL_ID}" + list_loaded_ids 2>/dev/null | grep -Fxq "${MODEL_ID}" } trigger_load() { say "POST /models/load ${MODEL_ID} (quant=${QUANT}, device=[0])" - curl --silent --fail --max-time 30 \ - -X POST "${BASE}/models/load" \ - -H 'content-type: application/json' \ - --data-binary @- </dev/null + say " (synchronous; may take a minute on first run while HF downloads)" + local payload + payload=$(cat <= LOAD_POLL_MAX )); then - die "model did not appear in /models after ${LOAD_POLL_MAX} polls" - fi - sleep "${LOAD_POLL_INTERVAL}" - elapsed=$(( elapsed + 1 )) - say "still loading... (${elapsed}/${LOAD_POLL_MAX})" - done - say "model loaded" + ) + # --write-out captures the response code on a separate line so we + # can surface a real diagnostic instead of relying on --fail. + local resp http_code body + resp=$(curl --silent --show-error --max-time "${LOAD_TIMEOUT}" \ + --write-out '\n__HTTP__%{http_code}' \ + -X POST "${BASE}/models/load" \ + -H 'content-type: application/json' \ + --data "${payload}") || die "curl /models/load failed: $?" + http_code=$(echo "${resp}" | grep -oP '(?<=__HTTP__)\d+$' | tail -1) + body=$(echo "${resp}" | sed '$ s/__HTTP__.*$//') + if [[ "${http_code}" != "200" ]]; then + die "load returned HTTP ${http_code}: ${body}" + fi + say "load returned ${http_code}: ${body}" } run_probe() { say "POST /v1/chat/completions (probe: ${PROBE_PROMPT})" - local resp - resp=$( - curl --silent --fail --max-time 120 \ - -X POST "${BASE}/v1/chat/completions" \ - -H 'content-type: application/json' \ - --data-binary @- <