#!/usr/bin/env bash
#
# End-to-end smoke test for a deployed neuron.
#
# Confirms the daemon is reachable, loads a small public Qwen3 GGUF,
# fires a reasoning probe at /v1/chat/completions, and prints the
# answer. Used to validate the candle harness on a real GPU host
# before trusting it for production traffic, and as a regression test
# after pushing new neuron builds.
#
# Usage:
#   script/validate-neuron.sh [host] [model_id] [quant]
#
# Defaults:
#   host     = beast.hanzalova.internal
#   model_id = unsloth/Qwen3-0.6B-GGUF (official Qwen3-*-GGUF repos
#              ship Q8_0 only; unsloth's mirror ships the full
#              Q-spectrum including Q4_K_M)
#   quant    = Q4_K_M
#
# Environment:
#   NEURON_PORT  port the neuron daemon listens on (default: 13131)

# Strict mode: abort on unhandled errors, on unset variables, and when
# any stage of a pipeline fails.
set -euo pipefail

# Positional arguments, each falling back to its documented default
# when omitted or empty.
HOST="${1:-beast.hanzalova.internal}"
MODEL_ID="${2:-unsloth/Qwen3-0.6B-GGUF}"
QUANT="${3:-Q4_K_M}"

# Base URL that every request in this script is built on.
PORT="${NEURON_PORT:-13131}"
BASE="http://${HOST}:${PORT}"

# Reasoning probe — concrete, low-temperature answer that small models
# can still get right. "Paris" is a strong signal of basic competence
# beyond gibberish.
PROBE_PROMPT='What is the capital of France? Respond with the city name only, no punctuation.'
EXPECT_SUBSTR='Paris'

# Qwen3 prepends a reasoning section before the answer when the chat
# template enables thinking mode, which eats most of a small token
# budget. 256 leaves enough room for thinking + final answer.
MAX_TOKENS=256

# /models/load is synchronous — neuron blocks the response until the
# hf-hub download + GGUF parse + tensor materialisation is done. A
# fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's
# easily a minute. Pick a generous ceiling. Both values are in seconds.
LOAD_TIMEOUT=600
INFER_TIMEOUT=120

# Status messages go to stderr so command substitutions like
# `raw=$(run_probe)` capture only the function's intended return value
# (an HTTP body), not the progress chatter.
say() { printf '[%s] %s\n' "${HOST}" "$*" >&2; } die() { say "FAIL: $*"; exit 1; } probe_health() { curl --silent --fail --max-time 5 "${BASE}/health" >/dev/null \ || die "neuron not reachable at ${BASE}/health" } list_loaded_ids() { # The manifest is YAML and uses yq; HTTP responses are JSON and use # jq directly. pip-yq parses input as YAML by default, which trips # on JSON content that happens to look like YAML aliases (chatcmpl # ids, escaped quotes inside `...` blocks, etc.). curl --silent --fail "${BASE}/models" | jq -r '.[].id' } is_loaded() { list_loaded_ids 2>/dev/null | grep -Fxq "${MODEL_ID}" } trigger_load() { say "POST /models/load ${MODEL_ID} (quant=${QUANT}, device=[0])" say " (synchronous; may take a minute on first run while HF downloads)" local payload payload=$(cat <` markers Qwen3 emits during reasoning are a perfect # example). The targeted `yq -r '.path'` calls below work fine # because jq's path filter mode bypasses the YAML re-emit. echo "${raw}" echo "---" content=$(echo "${raw}" | jq -r '.choices[0].message.content // empty') if [[ -z "${content}" ]]; then die "no content in chat completion response" fi say "assistant said: ${content}" if echo "${content}" | grep -qiF "${EXPECT_SUBSTR}"; then say "PASS — response contains expected substring '${EXPECT_SUBSTR}'" exit 0 else die "response did not contain '${EXPECT_SUBSTR}'" fi