#!/usr/bin/env bash
#
# End-to-end smoke test for a deployed neuron.
#
# Confirms the daemon is reachable, loads a small public Qwen3 GGUF,
# fires a reasoning probe at /v1/chat/completions, and prints the
# answer. Used to validate the candle harness on a real GPU host
# before trusting it for production traffic, and as a regression test
# after pushing new neuron builds.
#
# Usage:
#   script/validate-neuron.sh [host] [model_id] [quant]
#
# Defaults:
#   host     = beast.hanzalova.internal
#   model_id = unsloth/Qwen3-0.6B-GGUF (official Qwen3-*-GGUF repos
#              ship Q8_0 only; unsloth's mirror ships the full
#              Q-spectrum including Q4_K_M)
#   quant    = Q4_K_M
#
# Environment:
#   NEURON_PORT  TCP port of the neuron daemon (default 13131).

set -euo pipefail

HOST="${1:-beast.hanzalova.internal}"
MODEL_ID="${2:-unsloth/Qwen3-0.6B-GGUF}"
QUANT="${3:-Q4_K_M}"
PORT="${NEURON_PORT:-13131}"
BASE="http://${HOST}:${PORT}"

# Reasoning probe — concrete, low-temperature answer that small models
# can still get right. "Paris" is a strong signal of basic competence
# beyond gibberish.
PROBE_PROMPT='What is the capital of France? Respond with the city name only, no punctuation.'
EXPECT_SUBSTR='Paris'
MAX_TOKENS=32

# /models/load is synchronous — neuron blocks the response until the
# hf-hub download + GGUF parse + tensor materialisation is done. A
# fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's
# easily a minute. Pick a generous ceiling.
LOAD_TIMEOUT=600
INFER_TIMEOUT=120

#######################################
# Print a status line prefixed with the target host.
# Globals:   HOST (read)
# Arguments: message words; joined with single spaces
# Outputs:   "[<host>] <message>" on stdout
#######################################
say() {
  printf '[%s] %s\n' "${HOST}" "$*"
}

#######################################
# Report a fatal error and terminate the script with status 1.
# Globals:   HOST (read, via say)
# Arguments: message words
# Outputs:   "[<host>] FAIL: <message>" on stderr, so the diagnostic
#            is not mixed into captured stdout
#######################################
die() {
  say "FAIL: $*" >&2
  exit 1
}

#######################################
# Abort unless GET ${BASE}/health succeeds within 5 seconds.
# Globals:   BASE (read)
# Outputs:   nothing on success; the response body is discarded
#######################################
probe_health() {
  curl --silent --fail --max-time 5 "${BASE}/health" >/dev/null \
    || die "neuron not reachable at ${BASE}/health"
}

#######################################
# Print the `id` of every entry returned by GET ${BASE}/models, one
# per line. The yq filter reads `.id` from each top-level element.
# Globals:   BASE (read)
# Returns:   non-zero if curl or yq fails (pipefail is in effect)
#######################################
list_loaded_ids() {
  curl --silent --fail "${BASE}/models" \
    | yq -r '.[].id'
}

#######################################
# Succeed when MODEL_ID exactly matches one of the listed ids.
# Globals:   MODEL_ID (read)
# Returns:   0 on an exact whole-line match; non-zero when there is no
#            match or the listing itself failed
#######################################
is_loaded() {
  local ids
  # Capture the list before matching. Piping straight into `grep -q`
  # under pipefail can turn a match into a failure: grep exits on the
  # first hit and the upstream stage may then die of SIGPIPE.
  # Listing errors are deliberately silenced: "cannot list" is treated
  # the same as "not loaded".
  ids=$(list_loaded_ids 2>/dev/null) || return 1
  grep -Fxq -- "${MODEL_ID}" <<<"${ids}"
}

# NOTE(review): trigger_load() is truncated in the reviewed source (it
# ends mid-statement at `payload=$(cat <`), so its visible text is kept
# below as a comment rather than guessed at. Restore the complete
# definition from the full file before relying on this script.
#
# trigger_load() {
#   say "POST /models/load ${MODEL_ID} (quant=${QUANT}, device=[0])"
#   say " (synchronous; may take a minute on first run while HF downloads)"
#   local payload
#   payload=$(cat <