From 39010c779fdf2cf0e46bc3e563f93b2f91c5a4b3 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Tue, 19 May 2026 07:58:05 +0300 Subject: [PATCH] =?UTF-8?q?add=20script/validate-neuron.sh=20=E2=80=94=20e?= =?UTF-8?q?nd-to-end=20candle=20harness=20smoke=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Loads a small public Qwen3 GGUF on a target neuron host, fires a deterministic reasoning probe ("What is the capital of France?"), and asserts the response contains 'Paris'. Used to validate the candle harness on a real GPU host before the Stage 7 TP work begins, and as a regression check after future neuron builds. Defaults to beast.hanzalova.internal + Qwen/Qwen3-1.7B-GGUF + Q4_K_M; all three are positional args so the same script tests any node / model combination. Polls /models after triggering the load since /models/load returns once the materialisation is *queued*, not finished. Co-Authored-By: Claude Opus 4.7 (1M context) --- script/validate-neuron.sh | 141 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100755 script/validate-neuron.sh diff --git a/script/validate-neuron.sh b/script/validate-neuron.sh new file mode 100755 index 0000000..6666ef5 --- /dev/null +++ b/script/validate-neuron.sh @@ -0,0 +1,141 @@ +#!/bin/env bash +# +# End-to-end smoke test for a deployed neuron. +# +# Confirms the daemon is reachable, loads a small public Qwen3 GGUF, +# fires a reasoning probe at /v1/chat/completions, and prints the +# answer. Use this to validate the candle harness on a real GPU host +# before trusting it for production traffic, and as a regression test +# after pushing new neuron builds. 
#
# Usage:
#   script/validate-neuron.sh [host] [model_id] [quant]
#
# Defaults:
#   host     = beast.hanzalova.internal
#   model_id = Qwen/Qwen3-1.7B-GGUF
#   quant    = Q4_K_M
#
# Environment:
#   NEURON_PORT  port the daemon listens on (default 13131)

set -euo pipefail

HOST="${1:-beast.hanzalova.internal}"
MODEL_ID="${2:-Qwen/Qwen3-1.7B-GGUF}"
QUANT="${3:-Q4_K_M}"
PORT="${NEURON_PORT:-13131}"
BASE="http://${HOST}:${PORT}"

# Reasoning probe — concrete, low-temperature answer that small models
# can still get right. "Paris" is a strong signal of basic competence
# beyond gibberish.
PROBE_PROMPT='What is the capital of France? Respond with the city name only, no punctuation.'
EXPECT_SUBSTR='Paris'
MAX_TOKENS=32

# Polling cadence while the model loads.
LOAD_POLL_INTERVAL=5  # seconds slept between two /models polls
LOAD_POLL_MAX=120     # polls; 120 x 5 s = 10 min worst-case for a fresh HF download

# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------

# Print a progress line on stdout, prefixed with the target host.
#   $* - message text
say() { printf '[%s] %s\n' "${HOST}" "$*"; }

# Print a "FAIL: ..." line on stderr (so it never mixes with captured
# stdout) and terminate the script with status 1.
#   $* - failure reason
die() {
  say "FAIL: $*" >&2
  exit 1
}

# Abort via die unless GET ${BASE}/health succeeds within 5 seconds.
probe_health() {
  curl --silent --fail --max-time 5 "${BASE}/health" >/dev/null \
    || die "neuron not reachable at ${BASE}/health"
}

# Print the `id` field of every element returned by GET ${BASE}/models,
# one per line. The request is time-boxed so that a single stalled poll
# cannot hang wait_for_load past its poll budget.
list_loaded_ids() {
  curl --silent --fail --max-time 10 "${BASE}/models" \
    | yq -r '.[].id'
}

# Succeed (status 0) iff MODEL_ID is exactly one of the ids reported by
# list_loaded_ids. A failed listing counts as "not loaded".
is_loaded() {
  local ids
  # Capture first instead of piping straight into `grep -q`: grep exits on
  # the first match, and under `pipefail` a producer killed by SIGPIPE would
  # turn a positive match into a failure.
  ids=$(list_loaded_ids) || return 1
  grep -Fxq -- "${MODEL_ID}" <<<"${ids}"
}

# Ask the daemon to load MODEL_ID at QUANT with the candle harness on
# device 0. The response body is discarded; a non-2xx answer makes curl
# (and therefore this function) fail because of --fail.
trigger_load() {
  say "POST /models/load ${MODEL_ID} (quant=${QUANT}, device=[0])"
  curl --silent --fail --max-time 30 \
    -X POST "${BASE}/models/load" \
    -H 'content-type: application/json' \
    --data-binary @- <<EOF >/dev/null
{
  "model_id": "${MODEL_ID}",
  "harness": "candle",
  "quant": "${QUANT}",
  "devices": [0]
}
EOF
}

# Poll is_loaded until MODEL_ID is listed, sleeping LOAD_POLL_INTERVAL
# seconds between polls. Gives up through die once LOAD_POLL_MAX polls
# have gone by without the model appearing.
wait_for_load() {
  local polls=0
  while ! is_loaded; do
    if (( polls >= LOAD_POLL_MAX )); then
      die "model did not appear in /models after ${LOAD_POLL_MAX} polls"
    fi
    sleep "${LOAD_POLL_INTERVAL}"
    polls=$(( polls + 1 ))
    say "still loading... (${polls}/${LOAD_POLL_MAX})"
  done
  say "model loaded"
}

# NOTE(review): run_probe() is cut off in this copy of the file. The text
# stops right after `--data-binary @- <`, so the request body and everything
# that follows it are missing. The surviving fragment is preserved verbatim
# below, commented out so the definitions above still parse; restore the
# complete function from the original commit before running this script.
#
# run_probe() {
#   say "POST /v1/chat/completions (probe: ${PROBE_PROMPT})"
#   local resp
#   resp=$(
#     curl --silent --fail --max-time 120 \
#       -X POST "${BASE}/v1/chat/completions" \
#       -H 'content-type: application/json' \
#       --data-binary @- <