Compare commits
3 Commits
57d7ef8d3c
...
f9f5fa41b6
| Author | SHA1 | Date | |
|---|---|---|---|
|
f9f5fa41b6
|
|||
|
ed4d71db09
|
|||
|
39010c779f
|
@@ -56,7 +56,7 @@ async fn list_models(State(state): State<Arc<NeuronState>>) -> impl IntoResponse
|
||||
Ok(models) => Json(json!(models)).into_response(),
|
||||
Err(e) => (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({"error": e.to_string()})),
|
||||
Json(json!({"error": format!("{e:#}")})),
|
||||
)
|
||||
.into_response(),
|
||||
}
|
||||
@@ -71,7 +71,7 @@ async fn load_model(
|
||||
Ok(()) => Json(json!({"status": "loaded"})).into_response(),
|
||||
Err(e) => (
|
||||
StatusCode::BAD_REQUEST,
|
||||
Json(json!({"error": e.to_string()})),
|
||||
Json(json!({"error": format!("{e:#}")})),
|
||||
)
|
||||
.into_response(),
|
||||
}
|
||||
@@ -95,7 +95,11 @@ async fn unload_model(
|
||||
let registry = state.registry.read().await;
|
||||
match registry.unload_model(&model_id).await {
|
||||
Ok(()) => Json(json!({"status": "unloaded"})).into_response(),
|
||||
Err(e) => (StatusCode::NOT_FOUND, Json(json!({"error": e.to_string()}))).into_response(),
|
||||
Err(e) => (
|
||||
StatusCode::NOT_FOUND,
|
||||
Json(json!({"error": format!("{e:#}")})),
|
||||
)
|
||||
.into_response(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -151,7 +155,7 @@ async fn chat_completions(
|
||||
.into_response(),
|
||||
Err(InferenceError::Other(e)) => (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({"error": e.to_string()})),
|
||||
Json(json!({"error": format!("{e:#}")})),
|
||||
)
|
||||
.into_response(),
|
||||
}
|
||||
@@ -165,7 +169,7 @@ async fn chat_completions(
|
||||
.into_response(),
|
||||
Err(InferenceError::Other(e)) => (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({"error": e.to_string()})),
|
||||
Json(json!({"error": format!("{e:#}")})),
|
||||
)
|
||||
.into_response(),
|
||||
}
|
||||
|
||||
@@ -10,6 +10,12 @@ Restart=on-failure
|
||||
RestartSec=5
|
||||
User=neuron
|
||||
Group=neuron
|
||||
# /var/lib/neuron is the neuron user's $HOME — hf-hub writes its
|
||||
# default cache there (~/.cache/huggingface/hub). Without this directive
|
||||
# systemd doesn't create the directory and hf-hub downloads fail with
|
||||
# "fetch GGUF <file>: failed to create cache dir".
|
||||
StateDirectory=neuron
|
||||
StateDirectoryMode=0755
|
||||
# Loading default_models from neuron.toml happens before the HTTP
|
||||
# listener binds; large models can take many minutes to download and
|
||||
# materialise on first activation. systemd's default TimeoutStartSec
|
||||
|
||||
141
script/validate-neuron.sh
Executable file
141
script/validate-neuron.sh
Executable file
@@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# End-to-end smoke test for a deployed neuron.
|
||||
#
|
||||
# Confirms the daemon is reachable, loads a small public Qwen3 GGUF,
|
||||
# fires a reasoning probe at /v1/chat/completions, and prints the
|
||||
# answer. Used to validate the candle harness on a real GPU host
|
||||
# before trusting it for production traffic, and as a regression test
|
||||
# after pushing new neuron builds.
|
||||
#
|
||||
# Usage:
|
||||
# script/validate-neuron.sh [host] [model_id] [quant]
|
||||
#
|
||||
# Defaults:
|
||||
# host = beast.hanzalova.internal
|
||||
# model_id = unsloth/Qwen3-0.6B-GGUF (official Qwen3-*-GGUF repos
|
||||
# ship Q8_0 only; unsloth's mirror ships the full Q-spectrum
|
||||
# including Q4_K_M)
|
||||
# quant = Q4_K_M
|
||||
|
||||
# Strict mode: stop on a failing command, on an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Positional arguments; each one falls back to a default when omitted.
HOST=${1:-beast.hanzalova.internal}
MODEL_ID=${2:-unsloth/Qwen3-0.6B-GGUF}
QUANT=${3:-Q4_K_M}

# The port is taken from the environment, not from the command line.
PORT=${NEURON_PORT:-13131}
BASE="http://${HOST}:${PORT}"

# Reasoning probe: a concrete question that even a small model should
# answer correctly at low temperature. Seeing "Paris" in the reply is a
# strong sign that the model produces more than gibberish.
PROBE_PROMPT='What is the capital of France? Respond with the city name only, no punctuation.'
EXPECT_SUBSTR='Paris'
MAX_TOKENS=32

# curl ceilings, in seconds. /models/load is synchronous: neuron holds
# the response until the hf-hub download, GGUF parse and tensor
# materialisation have all finished. A fresh 0.6B-Q4_K_M is ~400 MB, so
# a slow link or a cold cache can easily take a minute; the load ceiling
# is deliberately generous.
LOAD_TIMEOUT=600
INFER_TIMEOUT=120
|
||||
|
||||
# Print one log line on stdout, prefixed with the target host in
# square brackets. All arguments are joined into a single message.
say() {
    printf '[%s] %s\n' "${HOST}" "$*"
}

# Log the arguments as a "FAIL: ..." line through say, then exit the
# current shell with status 1.
die() {
    say "FAIL: $*"
    exit 1
}
|
||||
|
||||
# Check that the daemon answers GET ${BASE}/health successfully within
# five seconds. The response body is discarded; on any curl failure the
# script aborts through die.
probe_health() {
    if ! curl --silent --fail --max-time 5 "${BASE}/health" >/dev/null; then
        die "neuron not reachable at ${BASE}/health"
    fi
}
|
||||
|
||||
# Print the `id` field of every entry returned by GET ${BASE}/models,
# one per line.
#
# The request is capped with --max-time like every other curl call in
# this script, so a stalled connection cannot hang the smoke test.
# A non-zero status from curl or yq propagates to the caller (pipefail).
list_loaded_ids() {
    curl --silent --fail --max-time 30 "${BASE}/models" | yq -r '.[].id'
}
|
||||
|
||||
# Succeed (status 0) when ${MODEL_ID} appears as an exact, whole-line
# match in the output of list_loaded_ids; fail otherwise, including when
# the listing itself fails.
#
# The listing is captured first and only then searched. Piping straight
# into `grep -q` under `set -o pipefail` can report failure when grep
# exits on the first match and the producer is killed by SIGPIPE.
# `--` stops a MODEL_ID that begins with '-' being parsed as an option.
is_loaded() {
    local ids
    ids=$(list_loaded_ids 2>/dev/null) || return 1
    grep -Fxq -- "${MODEL_ID}" <<<"${ids}"
}
|
||||
|
||||
# Ask the daemon to load ${MODEL_ID} at quantisation ${QUANT} on device 0
# with the candle harness, via POST ${BASE}/models/load.
#
# The call waits up to ${LOAD_TIMEOUT} seconds. A curl failure or any
# HTTP status other than 200 aborts the script through die, with the
# response body in the message. On success the status and body are logged.
trigger_load() {
    say "POST /models/load ${MODEL_ID} (quant=${QUANT}, device=[0])"
    say " (synchronous; may take a minute on first run while HF downloads)"

    # Build the JSON with yq (as run_probe does) so quotes or backslashes
    # in MODEL_ID / QUANT are escaped rather than pasted into the document.
    local payload
    payload=$(yq -n -c \
        --arg model "${MODEL_ID}" \
        --arg quant "${QUANT}" \
        '{
            model_id: $model,
            harness: "candle",
            quant: $quant,
            devices: [0]
        }')

    # --write-out appends the status code on a separate, marked line so a
    # real diagnostic can be shown instead of relying on --fail.
    local resp http_code body
    resp=$(curl --silent --show-error --max-time "${LOAD_TIMEOUT}" \
        --write-out '\n__HTTP__%{http_code}' \
        -X POST "${BASE}/models/load" \
        -H 'content-type: application/json' \
        --data "${payload}") || die "curl /models/load failed: $?"

    # Split on the marker with parameter expansion (no GNU-only grep -P):
    # the text after the last marker is the status, the text before it
    # (minus the separating newline) is the body.
    http_code=${resp##*__HTTP__}
    body=${resp%$'\n'__HTTP__*}

    if [[ "${http_code}" != "200" ]]; then
        die "load returned HTTP ${http_code}: ${body}"
    fi
    say "load returned ${http_code}: ${body}"
}
|
||||
|
||||
# Send ${PROBE_PROMPT} to POST ${BASE}/v1/chat/completions for
# ${MODEL_ID} (temperature 0.1, at most ${MAX_TOKENS} tokens) and print
# the raw response body on stdout.
#
# stdout carries ONLY the response body, because the caller captures it
# with $(run_probe) and parses it as JSON. Progress and failure messages
# therefore go to stderr; otherwise they would be mixed into the
# captured body or swallowed by the capture.
#
# A curl failure or any HTTP status other than 200 ends the (sub)shell
# with status 1 through die.
run_probe() {
    say "POST /v1/chat/completions (probe: ${PROBE_PROMPT})" >&2

    local payload
    payload=$(yq -n -c \
        --arg model "${MODEL_ID}" \
        --arg content "${PROBE_PROMPT}" \
        --argjson tokens "${MAX_TOKENS}" \
        '{
            model: $model,
            messages: [{role: "user", content: $content}],
            temperature: 0.1,
            max_tokens: $tokens
        }')

    # --write-out appends the status code on a separate, marked line.
    local resp http_code body
    resp=$(curl --silent --show-error --max-time "${INFER_TIMEOUT}" \
        --write-out '\n__HTTP__%{http_code}' \
        -X POST "${BASE}/v1/chat/completions" \
        -H 'content-type: application/json' \
        --data "${payload}") || die "curl /v1/chat/completions failed: $?" >&2

    # Split on the marker with parameter expansion (no GNU-only grep -P).
    http_code=${resp##*__HTTP__}
    body=${resp%$'\n'__HTTP__*}

    if [[ "${http_code}" != "200" ]]; then
        die "inference returned HTTP ${http_code}: ${body}" >&2
    fi
    printf '%s\n' "${body}"
}
|
||||
|
||||
# --- Main flow -------------------------------------------------------
# 1. Confirm the daemon is reachable.
say "validating neuron at ${BASE}"
probe_health
say "/health OK"

# 2. Load the model unless the daemon already lists it.
if ! is_loaded; then
    trigger_load
else
    say "${MODEL_ID} already loaded"
fi

# 3. Run the probe and show the full response between two rulers.
raw=$(run_probe)
echo "---"
echo "${raw}" | yq -r '.'
echo "---"

# 4. Extract the assistant's reply; a missing or empty reply is a failure.
content=$(echo "${raw}" | yq -r '.choices[0].message.content // empty')
[[ -n "${content}" ]] || die "no content in chat completion response"
say "assistant said: ${content}"

# 5. Pass only when the reply contains the expected substring
#    (case-insensitive, fixed-string match).
if ! echo "${content}" | grep -qiF "${EXPECT_SUBSTR}"; then
    die "response did not contain '${EXPECT_SUBSTR}'"
fi
say "PASS — response contains expected substring '${EXPECT_SUBSTR}'"
exit 0
|
||||
Reference in New Issue
Block a user