Files
cortex/script/validate-neuron.sh
rob thijssen 5436af9c73
All checks were successful
build-prerelease / Resolve version stamps (push) Successful in 33s
CI / Format (push) Successful in 38s
CI / Clippy (push) Successful in 2m19s
build-prerelease / Build neuron-blackwell (push) Successful in 3m32s
CI / Test (push) Successful in 4m34s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m16s
build-prerelease / Package cortex RPM (push) Successful in 1m18s
build-prerelease / Build neuron-ampere (push) Successful in 4m55s
build-prerelease / Build neuron-ada (push) Successful in 5m11s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 2m50s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m52s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m35s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m0s
fix(neuron/candle): dense Qwen3 returns rank-3 logits, double-squeeze
Caught by live validation against Qwen/Qwen3-1.7B on beast:
  HTTP 500 "unexpected rank, expected: 1, got: 2 ([1, 151936])"

Candle's qwen3::ModelForCausalLM::forward returns shape [B, 1, V]
(no final squeeze) while quantized_qwen3::ModelWeights::forward
returns [B, V] (with squeeze(1) at the end). My match arms applied
a single squeeze(0) uniformly, which is correct for the quantized
[1, V] → [V] but leaves the dense at [1, V] → which then trips
apply_repeat_penalty::to_vec1() expecting rank 1.

Dense match arms now strip both batch and seq dims:
  model.forward(&input, offset)?.squeeze(0)?.squeeze(0)?

Also fixes validate-neuron.sh's `${3:-Q4_K_M}` → `${3-Q4_K_M}`
(no colon) so passing an explicit empty third arg now drives the
dense path instead of falling back to Q4_K_M.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 17:49:43 +03:00

163 lines
5.9 KiB
Bash
Executable File

#!/bin/env bash
#
# End-to-end smoke test for a deployed neuron.
#
# Confirms the daemon is reachable, loads a small public Qwen3 GGUF,
# fires a reasoning probe at /v1/chat/completions, and prints the
# answer. Used to validate the candle harness on a real GPU host
# before trusting it for production traffic, and as a regression test
# after pushing new neuron builds.
#
# Usage:
# script/validate-neuron.sh [host] [model_id] [quant]
#
# Defaults:
# host = beast.hanzalova.internal
# model_id = unsloth/Qwen3-0.6B-GGUF (official Qwen3-*-GGUF repos
# ship Q8_0 only; unsloth's mirror ships the full Q-spectrum
# including Q4_K_M)
# quant = Q4_K_M
set -euo pipefail
HOST="${1:-beast.hanzalova.internal}"
MODEL_ID="${2:-unsloth/Qwen3-0.6B-GGUF}"
# `${3-Q4_K_M}` (no colon) only uses the default when the arg is
# UNSET — passing an explicit empty string drives the dense path.
QUANT="${3-Q4_K_M}"
PORT="${NEURON_PORT:-13131}"
BASE="http://${HOST}:${PORT}"
# Reasoning probe — concrete, low-temperature answer that small models
# can still get right. "Paris" is a strong signal of basic competence
# beyond gibberish.
PROBE_PROMPT='What is the capital of Georgia (Caucasus)? Respond with the city name only, no punctuation.'
EXPECT_SUBSTR='Tbilisi'
# Qwen3 prepends <think>...</think> reasoning before the answer when the
# chat template enables thinking mode, which eats most of a small token
# budget. 256 leaves enough room for thinking + final answer.
MAX_TOKENS=256
# /models/load is synchronous — neuron blocks the response until the
# hf-hub download + GGUF parse + tensor materialisation is done. A
# fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's
# easily a minute. Pick a generous ceiling.
LOAD_TIMEOUT=600
INFER_TIMEOUT=120
# Status messages go to stderr so command substitutions like
# `raw=$(run_probe)` capture only the function's intended return value
# (an HTTP body), not the progress chatter.
say() { printf '[%s] %s\n' "${HOST}" "$*" >&2; }
die() { say "FAIL: $*"; exit 1; }
probe_health() {
curl --silent --fail --max-time 5 "${BASE}/health" >/dev/null \
|| die "neuron not reachable at ${BASE}/health"
}
list_loaded_ids() {
# The manifest is YAML and uses yq; HTTP responses are JSON and use
# jq directly. pip-yq parses input as YAML by default, which trips
# on JSON content that happens to look like YAML aliases (chatcmpl
# ids, escaped quotes inside `<think>...</think>` blocks, etc.).
curl --silent --fail "${BASE}/models" | jq -r '.[].id'
}
is_loaded() {
list_loaded_ids 2>/dev/null | grep -Fxq "${MODEL_ID}"
}
trigger_load() {
say "POST /models/load ${MODEL_ID} (quant=${QUANT:-<dense>}, device=[0])"
say " (synchronous; may take a minute on first run while HF downloads)"
# Build the payload via jq so the optional `quant` field is
# omitted entirely when empty — that's the signal to the harness
# to take the dense safetensors load path rather than GGUF.
local payload
if [[ -z "${QUANT}" ]]; then
payload=$(jq -n -c \
--arg id "${MODEL_ID}" \
'{model_id: $id, harness: "candle", devices: [0]}')
else
payload=$(jq -n -c \
--arg id "${MODEL_ID}" \
--arg q "${QUANT}" \
'{model_id: $id, harness: "candle", quant: $q, devices: [0]}')
fi
# --write-out captures the response code on a separate line so we
# can surface a real diagnostic instead of relying on --fail.
local resp http_code body
resp=$(curl --silent --show-error --max-time "${LOAD_TIMEOUT}" \
--write-out '\n__HTTP__%{http_code}' \
-X POST "${BASE}/models/load" \
-H 'content-type: application/json' \
--data "${payload}") || die "curl /models/load failed: $?"
http_code=$(echo "${resp}" | grep -oP '(?<=__HTTP__)\d+$' | tail -1)
body=$(echo "${resp}" | sed '$ s/__HTTP__.*$//')
if [[ "${http_code}" != "200" ]]; then
die "load returned HTTP ${http_code}: ${body}"
fi
say "load returned ${http_code}: ${body}"
}
run_probe() {
say "POST /v1/chat/completions (probe: ${PROBE_PROMPT})"
local payload
payload=$(jq -n -c \
--arg model "${MODEL_ID}" \
--arg content "${PROBE_PROMPT}" \
--argjson tokens "${MAX_TOKENS}" \
'{
model: $model,
messages: [{role: "user", content: $content}],
temperature: 0.1,
max_tokens: $tokens
}')
local resp http_code body
resp=$(curl --silent --show-error --max-time "${INFER_TIMEOUT}" \
--write-out '\n__HTTP__%{http_code}' \
-X POST "${BASE}/v1/chat/completions" \
-H 'content-type: application/json' \
--data "${payload}") || die "curl /v1/chat/completions failed: $?"
http_code=$(echo "${resp}" | grep -oP '(?<=__HTTP__)\d+$' | tail -1)
body=$(echo "${resp}" | sed '$ s/__HTTP__.*$//')
if [[ "${http_code}" != "200" ]]; then
die "inference returned HTTP ${http_code}: ${body}"
fi
echo "${body}"
}
say "validating neuron at ${BASE}"
probe_health
say "/health OK"
if is_loaded; then
say "${MODEL_ID} already loaded"
else
trigger_load
fi
raw=$(run_probe)
echo "---"
# Dump the raw JSON. Don't pipe through `yq -r '.'` — yq's default
# YAML output mode chokes on JSON strings that contain `<` (and the
# `<think>` markers Qwen3 emits during reasoning are a perfect
# example). The targeted `yq -r '.path'` calls below work fine
# because jq's path filter mode bypasses the YAML re-emit.
echo "${raw}"
echo "---"
content=$(echo "${raw}" | jq -r '.choices[0].message.content // empty')
if [[ -z "${content}" ]]; then
die "no content in chat completion response"
fi
say "assistant said: ${content}"
if echo "${content}" | grep -qiF "${EXPECT_SUBSTR}"; then
say "PASS — response contains expected substring '${EXPECT_SUBSTR}'"
exit 0
else
die "response did not contain '${EXPECT_SUBSTR}'"
fi