feat(neuron): fail loud (422) when a tools-bearing request can't render
All checks were successful
build-prerelease / Resolve version stamps + change detection (push) Successful in 30s
build-prerelease / Build cortex binary (push) Has been skipped
build-prerelease / Package cortex RPM (push) Has been skipped
build-prerelease / Build helexa-bench binary (push) Has been skipped
build-prerelease / Package helexa-bench RPM (push) Has been skipped
build-prerelease / Lint (fmt + clippy) (push) Successful in 2m18s
build-prerelease / Test (push) Successful in 4m12s
build-prerelease / Build neuron-blackwell (push) Successful in 1m38s
build-prerelease / Build neuron-ada (push) Successful in 2m10s
build-prerelease / Build neuron-ampere (push) Successful in 2m49s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 1m36s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 1m40s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 1m44s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 58s

Three of this session's bugs (system-message position, tool_call argument
shape, and the original tool rendering) all hid behind the same silent
behaviour: chat_template render fails → neuron falls back to
format_qwen3_prompt, which drops every tool → the request still returns
200 with degraded, tool-less output. Each cost real debugging time
because the failure was invisible on the wire.

build_prompt_for_request now returns Result<String, InferenceError>. On
a render failure it checks whether the request carried a non-empty
`tools` array: if so it returns the new
InferenceError::TemplateRenderFailed (mapped to 422 with a
template_render_failed code and the underlying Jinja error), instead of
silently degrading. A render failure with no tools still falls back to
`format_qwen3_prompt`, logging only a warning — there's nothing to lose,
and it is a reasonable text-only prompt. The paths that never render a
template (chat templates disabled, or no template loaded) are unchanged
and still return the fallback prompt. The four prompt-build call sites
propagate with `?`.

Now the next client/template incompatibility surfaces as a loud 422 the
operator sees immediately, not a mysteriously-degraded session.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-15 17:48:31 +03:00
parent 8880b2f8a6
commit a94dd55ab8
2 changed files with 84 additions and 11 deletions

View File

@@ -328,6 +328,14 @@ async fn chat_completions(
})), })),
) )
.into_response(), .into_response(),
Err(InferenceError::TemplateRenderFailed { detail }) => (
StatusCode::UNPROCESSABLE_ENTITY,
Json(json!({
"error": format!("chat template could not render this request: {detail}"),
"code": "template_render_failed",
})),
)
.into_response(),
Err(InferenceError::Other(e)) => ( Err(InferenceError::Other(e)) => (
StatusCode::INTERNAL_SERVER_ERROR, StatusCode::INTERNAL_SERVER_ERROR,
Json(json!({"error": format!("{e:#}")})), Json(json!({"error": format!("{e:#}")})),
@@ -379,6 +387,14 @@ async fn chat_completions(
})), })),
) )
.into_response(), .into_response(),
Err(InferenceError::TemplateRenderFailed { detail }) => (
StatusCode::UNPROCESSABLE_ENTITY,
Json(json!({
"error": format!("chat template could not render this request: {detail}"),
"code": "template_render_failed",
})),
)
.into_response(),
Err(InferenceError::Other(e)) => ( Err(InferenceError::Other(e)) => (
StatusCode::INTERNAL_SERVER_ERROR, StatusCode::INTERNAL_SERVER_ERROR,
Json(json!({"error": format!("{e:#}")})), Json(json!({"error": format!("{e:#}")})),
@@ -554,6 +570,14 @@ fn inference_error_response(err: InferenceError) -> axum::response::Response {
})), })),
) )
.into_response(), .into_response(),
InferenceError::TemplateRenderFailed { detail } => (
StatusCode::UNPROCESSABLE_ENTITY,
Json(json!({
"error": format!("chat template could not render this request: {detail}"),
"code": "template_render_failed",
})),
)
.into_response(),
InferenceError::Other(e) => ( InferenceError::Other(e) => (
StatusCode::INTERNAL_SERVER_ERROR, StatusCode::INTERNAL_SERVER_ERROR,
Json(json!({"error": format!("{e:#}")})), Json(json!({"error": format!("{e:#}")})),

View File

@@ -1904,7 +1904,7 @@ impl CandleHarness {
let _inference_guard = loaded.inference_lock.lock().await; let _inference_guard = loaded.inference_lock.lock().await;
let result = async { let result = async {
let prompt = build_prompt_for_request(loaded.chat_template.as_deref(), &request); let prompt = build_prompt_for_request(loaded.chat_template.as_deref(), &request)?;
let encoding = loaded let encoding = loaded
.tokenizer .tokenizer
@@ -2301,7 +2301,7 @@ impl CandleHarness {
} }
}; };
let prompt = build_prompt_for_request(loaded.chat_template.as_deref(), &request); let prompt = build_prompt_for_request(loaded.chat_template.as_deref(), &request)?;
let encoding = loaded let encoding = loaded
.tokenizer .tokenizer
.encode(prompt.as_str(), true) .encode(prompt.as_str(), true)
@@ -3346,7 +3346,7 @@ impl CandleHarness {
}); });
} }
let prompt = build_prompt_for_request(tp.chat_template.as_deref(), &request); let prompt = build_prompt_for_request(tp.chat_template.as_deref(), &request)?;
let encoding = tp let encoding = tp
.tokenizer .tokenizer
.encode(prompt.as_str(), true) .encode(prompt.as_str(), true)
@@ -3959,7 +3959,7 @@ async fn chat_completion_tp_inner(
let req_start = std::time::Instant::now(); let req_start = std::time::Instant::now();
let model_id = request.model.clone(); let model_id = request.model.clone();
let prompt = build_prompt_for_request(tp.chat_template.as_deref(), &request); let prompt = build_prompt_for_request(tp.chat_template.as_deref(), &request)?;
let encoding = tp let encoding = tp
.tokenizer .tokenizer
.encode(prompt.as_str(), true) .encode(prompt.as_str(), true)
@@ -4574,6 +4574,13 @@ pub enum InferenceError {
remove the image_url content parts from the request" remove the image_url content parts from the request"
)] )]
VisionUnsupported { model_id: String }, VisionUnsupported { model_id: String },
/// The loaded model's chat template could not render the request
/// (e.g. a message / tool-call structure it rejects). Returned only
/// when the request carried tools — silently degrading to a
/// tool-less prompt breaks tool calling invisibly, which is the
/// failure mode that hid several client-compat bugs. Maps to 422.
#[error("chat template could not render this request: {detail}")]
TemplateRenderFailed { detail: String },
#[error(transparent)] #[error(transparent)]
Other(#[from] anyhow::Error), Other(#[from] anyhow::Error),
} }
@@ -4602,12 +4609,12 @@ pub enum InferenceError {
fn build_prompt_for_request( fn build_prompt_for_request(
chat_template: Option<&str>, chat_template: Option<&str>,
request: &ChatCompletionRequest, request: &ChatCompletionRequest,
) -> String { ) -> Result<String, InferenceError> {
if !super::chat_template::chat_templates_enabled() { if !super::chat_template::chat_templates_enabled() {
return format_qwen3_prompt(&request.messages); return Ok(format_qwen3_prompt(&request.messages));
} }
let Some(tmpl) = chat_template else { let Some(tmpl) = chat_template else {
return format_qwen3_prompt(&request.messages); return Ok(format_qwen3_prompt(&request.messages));
}; };
// Pull `chat_template_kwargs` and `tools` from the request's // Pull `chat_template_kwargs` and `tools` from the request's
@@ -4641,15 +4648,30 @@ fn build_prompt_for_request(
prompt = %prompt, prompt = %prompt,
"chat_template: rendered prompt" "chat_template: rendered prompt"
); );
prompt Ok(prompt)
} }
Err(e) => { Err(e) => {
let detail = format!("{e:#}");
// A tools-bearing request the template can't render must NOT
// silently degrade to a tool-less fallback prompt — that
// strips every tool and breaks tool calling invisibly (the
// failure mode behind the system-message, arguments-format,
// and tool-render bugs). Surface it as an error instead.
let has_tools = tools.as_array().is_some_and(|a| !a.is_empty());
if has_tools {
tracing::warn!(
model = %request.model,
error = %detail,
"chat_template render failed on a tools-bearing request — returning 422 (refusing silent tool-less fallback)"
);
return Err(InferenceError::TemplateRenderFailed { detail });
}
tracing::warn!( tracing::warn!(
model = %request.model, model = %request.model,
error = %format!("{e:#}"), error = %detail,
"chat_template render failed; falling back to format_qwen3_prompt" "chat_template render failed; falling back to format_qwen3_prompt (no tools to drop)"
); );
format_qwen3_prompt(&request.messages) Ok(format_qwen3_prompt(&request.messages))
} }
} }
} }
@@ -6372,4 +6394,31 @@ mod tests {
// Non-reasoning model (no pair) → always false. // Non-reasoning model (no pair) → always false.
assert!(!prompt_opens_reasoning(&[100], None)); assert!(!prompt_opens_reasoning(&[100], None));
} }
#[test]
fn render_failure_with_tools_errors_instead_of_silent_fallback() {
    // Template whose only content is a `raise_exception` call, so rendering
    // it is expected to fail for every request. It stands in for the real
    // template/client incompatibilities (system-message position,
    // tool_call argument shape) that used to make neuron drop tools.
    let always_raises = "{{ raise_exception('boom') }}";

    // Deserialize a JSON body into a request; a malformed fixture is a bug
    // in this test, so panicking via `unwrap` is the right reaction.
    let parse_request = |body: serde_json::Value| -> ChatCompletionRequest {
        serde_json::from_value(body).unwrap()
    };

    // Case 1: the request carries a tool. A render failure must come back
    // as `TemplateRenderFailed`, never as an `Ok` fallback prompt.
    let tooled = parse_request(serde_json::json!({
        "model": "m",
        "messages": [{"role": "user", "content": "hi"}],
        "tools": [{"type": "function", "function": {"name": "x"}}]
    }));
    let tooled_outcome = build_prompt_for_request(Some(always_raises), &tooled);
    assert!(matches!(
        tooled_outcome,
        Err(InferenceError::TemplateRenderFailed { .. })
    ));

    // Case 2: same failing template, but no tools in the request. There is
    // nothing to lose by falling back, so the call must still succeed.
    let plain = parse_request(serde_json::json!({
        "model": "m",
        "messages": [{"role": "user", "content": "hi"}]
    }));
    let plain_outcome = build_prompt_for_request(Some(always_raises), &plain);
    assert!(plain_outcome.is_ok());
}
} }