Compare commits
3 Commits
57d7ef8d3c
...
f9f5fa41b6
| Author | SHA1 | Date | |
|---|---|---|---|
|
f9f5fa41b6
|
|||
|
ed4d71db09
|
|||
|
39010c779f
|
@@ -56,7 +56,7 @@ async fn list_models(State(state): State<Arc<NeuronState>>) -> impl IntoResponse
|
|||||||
Ok(models) => Json(json!(models)).into_response(),
|
Ok(models) => Json(json!(models)).into_response(),
|
||||||
Err(e) => (
|
Err(e) => (
|
||||||
StatusCode::INTERNAL_SERVER_ERROR,
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
Json(json!({"error": e.to_string()})),
|
Json(json!({"error": format!("{e:#}")})),
|
||||||
)
|
)
|
||||||
.into_response(),
|
.into_response(),
|
||||||
}
|
}
|
||||||
@@ -71,7 +71,7 @@ async fn load_model(
|
|||||||
Ok(()) => Json(json!({"status": "loaded"})).into_response(),
|
Ok(()) => Json(json!({"status": "loaded"})).into_response(),
|
||||||
Err(e) => (
|
Err(e) => (
|
||||||
StatusCode::BAD_REQUEST,
|
StatusCode::BAD_REQUEST,
|
||||||
Json(json!({"error": e.to_string()})),
|
Json(json!({"error": format!("{e:#}")})),
|
||||||
)
|
)
|
||||||
.into_response(),
|
.into_response(),
|
||||||
}
|
}
|
||||||
@@ -95,7 +95,11 @@ async fn unload_model(
|
|||||||
let registry = state.registry.read().await;
|
let registry = state.registry.read().await;
|
||||||
match registry.unload_model(&model_id).await {
|
match registry.unload_model(&model_id).await {
|
||||||
Ok(()) => Json(json!({"status": "unloaded"})).into_response(),
|
Ok(()) => Json(json!({"status": "unloaded"})).into_response(),
|
||||||
Err(e) => (StatusCode::NOT_FOUND, Json(json!({"error": e.to_string()}))).into_response(),
|
Err(e) => (
|
||||||
|
StatusCode::NOT_FOUND,
|
||||||
|
Json(json!({"error": format!("{e:#}")})),
|
||||||
|
)
|
||||||
|
.into_response(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -151,7 +155,7 @@ async fn chat_completions(
|
|||||||
.into_response(),
|
.into_response(),
|
||||||
Err(InferenceError::Other(e)) => (
|
Err(InferenceError::Other(e)) => (
|
||||||
StatusCode::INTERNAL_SERVER_ERROR,
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
Json(json!({"error": e.to_string()})),
|
Json(json!({"error": format!("{e:#}")})),
|
||||||
)
|
)
|
||||||
.into_response(),
|
.into_response(),
|
||||||
}
|
}
|
||||||
@@ -165,7 +169,7 @@ async fn chat_completions(
|
|||||||
.into_response(),
|
.into_response(),
|
||||||
Err(InferenceError::Other(e)) => (
|
Err(InferenceError::Other(e)) => (
|
||||||
StatusCode::INTERNAL_SERVER_ERROR,
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
Json(json!({"error": e.to_string()})),
|
Json(json!({"error": format!("{e:#}")})),
|
||||||
)
|
)
|
||||||
.into_response(),
|
.into_response(),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,6 +10,12 @@ Restart=on-failure
|
|||||||
RestartSec=5
|
RestartSec=5
|
||||||
User=neuron
|
User=neuron
|
||||||
Group=neuron
|
Group=neuron
|
||||||
|
# /var/lib/neuron is the neuron user's $HOME — hf-hub writes its
|
||||||
|
# default cache there (~/.cache/huggingface/hub). Without this directive
|
||||||
|
# systemd doesn't create the directory and hf-hub downloads fail with
|
||||||
|
# "fetch GGUF <file>: failed to create cache dir".
|
||||||
|
StateDirectory=neuron
|
||||||
|
StateDirectoryMode=0755
|
||||||
# Loading default_models from neuron.toml happens before the HTTP
|
# Loading default_models from neuron.toml happens before the HTTP
|
||||||
# listener binds; large models can take many minutes to download and
|
# listener binds; large models can take many minutes to download and
|
||||||
# materialise on first activation. systemd's default TimeoutStartSec
|
# materialise on first activation. systemd's default TimeoutStartSec
|
||||||
|
|||||||
141
script/validate-neuron.sh
Executable file
141
script/validate-neuron.sh
Executable file
@@ -0,0 +1,141 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
#
|
||||||
|
# End-to-end smoke test for a deployed neuron.
|
||||||
|
#
|
||||||
|
# Confirms the daemon is reachable, loads a small public Qwen3 GGUF,
|
||||||
|
# fires a reasoning probe at /v1/chat/completions, and prints the
|
||||||
|
# answer. Used to validate the candle harness on a real GPU host
|
||||||
|
# before trusting it for production traffic, and as a regression test
|
||||||
|
# after pushing new neuron builds.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# script/validate-neuron.sh [host] [model_id] [quant]
|
||||||
|
#
|
||||||
|
# Defaults:
|
||||||
|
# host = beast.hanzalova.internal
|
||||||
|
# model_id = unsloth/Qwen3-0.6B-GGUF (official Qwen3-*-GGUF repos
|
||||||
|
# ship Q8_0 only; unsloth's mirror ships the full Q-spectrum
|
||||||
|
# including Q4_K_M)
|
||||||
|
# quant = Q4_K_M
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
HOST="${1:-beast.hanzalova.internal}"
|
||||||
|
MODEL_ID="${2:-unsloth/Qwen3-0.6B-GGUF}"
|
||||||
|
QUANT="${3:-Q4_K_M}"
|
||||||
|
PORT="${NEURON_PORT:-13131}"
|
||||||
|
BASE="http://${HOST}:${PORT}"
|
||||||
|
|
||||||
|
# Reasoning probe — concrete, low-temperature answer that small models
|
||||||
|
# can still get right. "Paris" is a strong signal of basic competence
|
||||||
|
# beyond gibberish.
|
||||||
|
PROBE_PROMPT='What is the capital of France? Respond with the city name only, no punctuation.'
|
||||||
|
EXPECT_SUBSTR='Paris'
|
||||||
|
MAX_TOKENS=32
|
||||||
|
|
||||||
|
# /models/load is synchronous — neuron blocks the response until the
|
||||||
|
# hf-hub download + GGUF parse + tensor materialisation is done. A
|
||||||
|
# fresh 0.6B-Q4_K_M is ~400 MB; on a slow link or cold cache that's
|
||||||
|
# easily a minute. Pick a generous ceiling.
|
||||||
|
LOAD_TIMEOUT=600
|
||||||
|
INFER_TIMEOUT=120
|
||||||
|
|
||||||
|
# Print one progress line on stdout, prefixed with the target host in
# square brackets. All arguments are joined into a single message.
say() {
  printf '[%s] %s\n' "${HOST}" "$*"
}
|
||||||
|
# Report a failure through say (message prefixed with "FAIL: ") and
# terminate the current shell with exit status 1.
die() {
  say "FAIL: $*"
  exit 1
}
|
||||||
|
|
||||||
|
# Abort the run unless GET ${BASE}/health succeeds within five seconds.
# The response body is discarded; only curl's exit status matters.
probe_health() {
  if ! curl --silent --fail --max-time 5 "${BASE}/health" >/dev/null; then
    die "neuron not reachable at ${BASE}/health"
  fi
}
|
||||||
|
|
||||||
|
# Fetch GET ${BASE}/models and print the .id field of every array
# element, one per line. curl's --fail makes an HTTP error status count
# as a command failure.
list_loaded_ids() {
  local models_url="${BASE}/models"
  curl --silent --fail "${models_url}" \
    | yq -r '.[].id'
}
|
||||||
|
|
||||||
|
# Succeed (status 0) when ${MODEL_ID} appears as an exact, whole-line
# match in the output of list_loaded_ids; fail otherwise, including when
# the listing itself cannot be fetched (its stderr is suppressed).
is_loaded() {
  # Capture the listing before matching instead of piping straight into
  # `grep -q`: the script runs with `set -o pipefail`, and grep -q exits
  # on the first hit, which can kill the producer with SIGPIPE and make
  # the pipeline report failure even though the model was found.
  local ids
  ids=$(list_loaded_ids 2>/dev/null) || return 1

  # `--` keeps a model id that starts with '-' from being read as an option.
  grep -Fxq -- "${MODEL_ID}" <<<"${ids}"
}
|
||||||
|
|
||||||
|
# Ask the daemon to load ${MODEL_ID} with POST ${BASE}/models/load and
# abort the script on a transport error or any non-200 status.
#
# Reads:  BASE, MODEL_ID, QUANT, LOAD_TIMEOUT.
# Output: progress lines via say; on failure die reports curl's exit
#         status or the HTTP status plus response body, then exits.
trigger_load() {
  say "POST /models/load ${MODEL_ID} (quant=${QUANT}, device=[0])"
  say " (synchronous; may take a minute on first run while HF downloads)"

  # Serialise the request with yq (the same invocation style run_probe
  # uses) rather than interpolating into a heredoc, so a model id or
  # quant containing quotes or backslashes is escaped into valid JSON
  # instead of corrupting the payload.
  local payload
  payload=$(yq -n -c \
    --arg model "${MODEL_ID}" \
    --arg quant "${QUANT}" \
    '{
      model_id: $model,
      harness: "candle",
      quant: $quant,
      devices: [0]
    }')

  # --write-out appends the status code on its own marker line so a real
  # diagnostic (status + body) can be surfaced instead of relying on --fail.
  local resp http_code body
  resp=$(curl --silent --show-error --max-time "${LOAD_TIMEOUT}" \
    --write-out '\n__HTTP__%{http_code}' \
    -X POST "${BASE}/models/load" \
    -H 'content-type: application/json' \
    --data "${payload}") || die "curl /models/load failed: $?"

  # Split on the last marker with parameter expansion; this avoids
  # `grep -P`, which BSD/macOS and busybox grep do not provide. The
  # command substitution trims the trailing newline left before the marker.
  http_code="${resp##*__HTTP__}"
  body=$(printf '%s' "${resp%__HTTP__*}")

  if [[ "${http_code}" != "200" ]]; then
    die "load returned HTTP ${http_code}: ${body}"
  fi
  say "load returned ${http_code}: ${body}"
}
|
||||||
|
|
||||||
|
# Send the reasoning probe to POST ${BASE}/v1/chat/completions and print
# the raw response body on stdout.
#
# Reads:  BASE, MODEL_ID, PROBE_PROMPT, MAX_TOKENS, INFER_TIMEOUT.
# Stdout: the response body only. The caller captures it with
#         raw=$(run_probe), so nothing else may be written there.
# Stderr: the progress line and any failure diagnostic.
# Exits (the capturing subshell) with status 1 on a transport error or a
# non-200 status.
run_probe() {
  # say/die print on stdout; inside the caller's command substitution
  # that text would be prepended to the JSON (breaking the later yq
  # parsing) and failure messages would be swallowed. Route both to stderr.
  say "POST /v1/chat/completions (probe: ${PROBE_PROMPT})" >&2

  local payload
  payload=$(yq -n -c \
    --arg model "${MODEL_ID}" \
    --arg content "${PROBE_PROMPT}" \
    --argjson tokens "${MAX_TOKENS}" \
    '{
      model: $model,
      messages: [{role: "user", content: $content}],
      temperature: 0.1,
      max_tokens: $tokens
    }')

  # --write-out appends the status code on its own marker line after the body.
  local resp http_code body
  resp=$(curl --silent --show-error --max-time "${INFER_TIMEOUT}" \
    --write-out '\n__HTTP__%{http_code}' \
    -X POST "${BASE}/v1/chat/completions" \
    -H 'content-type: application/json' \
    --data "${payload}") || die "curl /v1/chat/completions failed: $?" >&2

  # Split on the last marker with parameter expansion; this avoids
  # `grep -P`, which BSD/macOS and busybox grep do not provide. The
  # command substitution trims the trailing newline left before the marker.
  http_code="${resp##*__HTTP__}"
  body=$(printf '%s' "${resp%__HTTP__*}")

  if [[ "${http_code}" != "200" ]]; then
    die "inference returned HTTP ${http_code}: ${body}" >&2
  fi
  printf '%s\n' "${body}"
}
|
||||||
|
|
||||||
|
# --- main flow -------------------------------------------------------
# 1. Confirm the daemon answers on /health.
say "validating neuron at ${BASE}"
probe_health
say "/health OK"

# 2. Request a load only when the model is not already listed.
if ! is_loaded; then
  trigger_load
else
  say "${MODEL_ID} already loaded"
fi

# 3. Run the probe once and show the raw reply between separators.
raw=$(run_probe)
echo "---"
echo "${raw}" | yq -r '.'
echo "---"

# 4. Pull out the assistant's text; an absent or empty field is a failure.
content=$(echo "${raw}" | yq -r '.choices[0].message.content // empty')
[[ -n "${content}" ]] || die "no content in chat completion response"
say "assistant said: ${content}"

# 5. Case-insensitive fixed-string check for the expected answer.
if ! echo "${content}" | grep -qiF "${EXPECT_SUBSTR}"; then
  die "response did not contain '${EXPECT_SUBSTR}'"
fi
say "PASS — response contains expected substring '${EXPECT_SUBSTR}'"
exit 0
|
||||||
Reference in New Issue
Block a user