fix(validate-neuron): default to unsloth GGUF + capture curl errors
Two reasons the previous run silently bailed after POST /models/load: 1. Default model was Qwen/Qwen3-1.7B-GGUF (official). That repo ships ONLY Q8_0 — no Q4_K_M, no Q4_0, nothing else. The GGUF filename matcher in CandleHarness::resolve_files returned "no GGUF file matching quant Q4_K_M" and the load endpoint returned an error, but the script used `curl --silent --fail` and swallowed it. 2. /models/load is synchronous (it awaits the full HF download + GGUF parse). curl --max-time 30 was way too short for a 400 MB fresh download. Fixes: - Default model is now unsloth/Qwen3-0.6B-GGUF, which mirrors the full Q-spectrum (Q2_K through Q8_0 plus BF16) so Q4_K_M actually exists. - trigger_load / run_probe now use --write-out to capture HTTP code and emit the response body on non-2xx, so failures surface a real diagnostic instead of an opaque set -e abort. - LOAD_TIMEOUT bumped to 600s; INFER_TIMEOUT to 120s. - Probe payload built via `yq -n` so JSON quoting is reliable regardless of the prompt text. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -4,7 +4,7 @@
|
|||||||
#
|
#
|
||||||
# Confirms the daemon is reachable, loads a small public Qwen3 GGUF,
|
# Confirms the daemon is reachable, loads a small public Qwen3 GGUF,
|
||||||
# fires a reasoning probe at /v1/chat/completions, and prints the
|
# fires a reasoning probe at /v1/chat/completions, and prints the
|
||||||
# answer. Use this to validate the candle harness on a real GPU host
|
# answer. Used to validate the candle harness on a real GPU host
|
||||||
# before trusting it for production traffic, and as a regression test
|
# before trusting it for production traffic, and as a regression test
|
||||||
# after pushing new neuron builds.
|
# after pushing new neuron builds.
|
||||||
#
|
#
|
||||||
@@ -13,13 +13,15 @@
|
|||||||
#
|
#
|
||||||
# Defaults:
|
# Defaults:
|
||||||
# host = beast.hanzalova.internal
|
# host = beast.hanzalova.internal
|
||||||
# model_id = Qwen/Qwen3-1.7B-GGUF
|
# model_id = unsloth/Qwen3-0.6B-GGUF (official Qwen3-*-GGUF repos
|
||||||
|
# ship Q8_0 only; unsloth's mirror ships the full Q-spectrum
|
||||||
|
# including Q4_K_M)
|
||||||
# quant = Q4_K_M
|
# quant = Q4_K_M
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
HOST="${1:-beast.hanzalova.internal}"
|
HOST="${1:-beast.hanzalova.internal}"
|
||||||
MODEL_ID="${2:-Qwen/Qwen3-1.7B-GGUF}"
|
MODEL_ID="${2:-unsloth/Qwen3-0.6B-GGUF}"
|
||||||
QUANT="${3:-Q4_K_M}"
|
QUANT="${3:-Q4_K_M}"
|
||||||
PORT="${NEURON_PORT:-13131}"
|
PORT="${NEURON_PORT:-13131}"
|
||||||
BASE="http://${HOST}:${PORT}"
|
BASE="http://${HOST}:${PORT}"
|
||||||
@@ -31,13 +33,12 @@ PROBE_PROMPT='What is the capital of France? Respond with the city name only, no
|
|||||||
EXPECT_SUBSTR='Paris'
|
EXPECT_SUBSTR='Paris'
|
||||||
MAX_TOKENS=32
|
MAX_TOKENS=32
|
||||||
|
|
||||||
# Polling cadence while the model loads.
|
# /models/load is synchronous — neuron blocks the response until the
|
||||||
LOAD_POLL_INTERVAL=5
|
# hf-hub download + GGUF parse + tensor materialisation is done. A
|
||||||
LOAD_POLL_MAX=120 # 10 min worst-case for a fresh HF download
|
# fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's
|
||||||
|
# easily a minute. Pick a generous ceiling.
|
||||||
# ---------------------------------------------------------------------------
|
LOAD_TIMEOUT=600
|
||||||
# helpers
|
INFER_TIMEOUT=120
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
say() { printf '[%s] %s\n' "${HOST}" "$*"; }
|
# Print a progress message to stdout, prefixed with the target host in
# square brackets. All arguments are folded into one string via "$*"
# (space-separated under the default IFS).
say() {
  local message="$*"
  printf '[%s] %s\n' "${HOST}" "${message}"
}
|
||||||
die() { say "FAIL: $*"; exit 1; }
|
# Report a fatal error and abort with exit status 1.
# The message is sent to stderr rather than stdout: die is reached from
# inside command substitutions (e.g. raw=$(run_probe)), where anything
# written to stdout is captured into the variable and the operator never
# sees the diagnostic.
die() {
  say "FAIL: $*" >&2
  exit 1
}
|
||||||
@@ -48,20 +49,18 @@ probe_health() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
list_loaded_ids() {
|
list_loaded_ids() {
|
||||||
curl --silent --fail "${BASE}/models" \
|
curl --silent --fail "${BASE}/models" | yq -r '.[].id'
|
||||||
| yq -r '.[].id'
|
|
||||||
}
|
}
|
||||||
|
|
||||||
is_loaded() {
|
is_loaded() {
|
||||||
list_loaded_ids | grep -Fxq "${MODEL_ID}"
|
list_loaded_ids 2>/dev/null | grep -Fxq "${MODEL_ID}"
|
||||||
}
|
}
|
||||||
|
|
||||||
trigger_load() {
|
trigger_load() {
|
||||||
say "POST /models/load ${MODEL_ID} (quant=${QUANT}, device=[0])"
|
say "POST /models/load ${MODEL_ID} (quant=${QUANT}, device=[0])"
|
||||||
curl --silent --fail --max-time 30 \
|
say " (synchronous; may take a minute on first run while HF downloads)"
|
||||||
-X POST "${BASE}/models/load" \
|
local payload
|
||||||
-H 'content-type: application/json' \
|
payload=$(cat <<EOF
|
||||||
--data-binary @- <<EOF >/dev/null
|
|
||||||
{
|
{
|
||||||
"model_id": "${MODEL_ID}",
|
"model_id": "${MODEL_ID}",
|
||||||
"harness": "candle",
|
"harness": "candle",
|
||||||
@@ -69,43 +68,49 @@ trigger_load() {
|
|||||||
"devices": [0]
|
"devices": [0]
|
||||||
}
|
}
|
||||||
EOF
|
EOF
|
||||||
}
|
)
|
||||||
|
# --write-out captures the response code on a separate line so we
|
||||||
wait_for_load() {
|
# can surface a real diagnostic instead of relying on --fail.
|
||||||
local elapsed=0
|
local resp http_code body
|
||||||
while ! is_loaded; do
|
resp=$(curl --silent --show-error --max-time "${LOAD_TIMEOUT}" \
|
||||||
if (( elapsed >= LOAD_POLL_MAX )); then
|
--write-out '\n__HTTP__%{http_code}' \
|
||||||
die "model did not appear in /models after ${LOAD_POLL_MAX} polls"
|
-X POST "${BASE}/models/load" \
|
||||||
fi
|
-H 'content-type: application/json' \
|
||||||
sleep "${LOAD_POLL_INTERVAL}"
|
--data "${payload}") || die "curl /models/load failed: $?"
|
||||||
elapsed=$(( elapsed + 1 ))
|
http_code=$(echo "${resp}" | grep -oP '(?<=__HTTP__)\d+$' | tail -1)
|
||||||
say "still loading... (${elapsed}/${LOAD_POLL_MAX})"
|
body=$(echo "${resp}" | sed '$ s/__HTTP__.*$//')
|
||||||
done
|
if [[ "${http_code}" != "200" ]]; then
|
||||||
say "model loaded"
|
die "load returned HTTP ${http_code}: ${body}"
|
||||||
|
fi
|
||||||
|
say "load returned ${http_code}: ${body}"
|
||||||
}
|
}
|
||||||
|
|
||||||
run_probe() {
|
run_probe() {
|
||||||
say "POST /v1/chat/completions (probe: ${PROBE_PROMPT})"
|
say "POST /v1/chat/completions (probe: ${PROBE_PROMPT})"
|
||||||
local resp
|
local payload
|
||||||
resp=$(
|
payload=$(yq -n -c \
|
||||||
curl --silent --fail --max-time 120 \
|
--arg model "${MODEL_ID}" \
|
||||||
-X POST "${BASE}/v1/chat/completions" \
|
--arg content "${PROBE_PROMPT}" \
|
||||||
-H 'content-type: application/json' \
|
--argjson tokens "${MAX_TOKENS}" \
|
||||||
--data-binary @- <<EOF
|
'{
|
||||||
{
|
model: $model,
|
||||||
"model": "${MODEL_ID}",
|
messages: [{role: "user", content: $content}],
|
||||||
"messages": [{"role": "user", "content": ${PROBE_PROMPT@Q}}],
|
temperature: 0.1,
|
||||||
"temperature": 0.1,
|
max_tokens: $tokens
|
||||||
"max_tokens": ${MAX_TOKENS}
|
}')
|
||||||
|
local resp http_code body
|
||||||
|
resp=$(curl --silent --show-error --max-time "${INFER_TIMEOUT}" \
|
||||||
|
--write-out '\n__HTTP__%{http_code}' \
|
||||||
|
-X POST "${BASE}/v1/chat/completions" \
|
||||||
|
-H 'content-type: application/json' \
|
||||||
|
--data "${payload}") || die "curl /v1/chat/completions failed: $?"
|
||||||
|
http_code=$(echo "${resp}" | grep -oP '(?<=__HTTP__)\d+$' | tail -1)
|
||||||
|
body=$(echo "${resp}" | sed '$ s/__HTTP__.*$//')
|
||||||
|
if [[ "${http_code}" != "200" ]]; then
|
||||||
|
die "inference returned HTTP ${http_code}: ${body}"
|
||||||
|
fi
|
||||||
|
echo "${body}"
|
||||||
}
|
}
|
||||||
EOF
|
|
||||||
)
|
|
||||||
echo "${resp}"
|
|
||||||
}
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# main
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
say "validating neuron at ${BASE}"
|
say "validating neuron at ${BASE}"
|
||||||
probe_health
|
probe_health
|
||||||
@@ -114,12 +119,7 @@ say "/health OK"
|
|||||||
if is_loaded; then
|
if is_loaded; then
|
||||||
say "${MODEL_ID} already loaded"
|
say "${MODEL_ID} already loaded"
|
||||||
else
|
else
|
||||||
# Note: /models/load returns once the load is initiated. For large
|
|
||||||
# models the actual materialisation continues asynchronously; the
|
|
||||||
# registry only reflects success once it's complete, hence the
|
|
||||||
# subsequent poll loop.
|
|
||||||
trigger_load
|
trigger_load
|
||||||
wait_for_load
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
raw=$(run_probe)
|
raw=$(run_probe)
|
||||||
|
|||||||
Reference in New Issue
Block a user