feat(neuron): fail loud (422) when a tools-bearing request can't render

Three of this session's bugs (system-message position, tool_call argument shape, and the original tool rendering) all hid behind the same silent behaviour: chat_template render fails → neuron falls back to format_qwen3_prompt, which drops every tool → the request still returns 200 with degraded, tool-less output. Each cost real debugging time because the failure was invisible on the wire. build_prompt_for_request now returns Result. On a render failure it checks whether the request carried tools: if so it returns the new InferenceError::TemplateRenderFailed (mapped to 422 with a template_render_failed code and the underlying Jinja error), instead of silently degrading. A render failure with no tools still falls back quietly — there's nothing to lose, and `format_qwen3_prompt` is a reasonable text-only prompt. The four prompt-build call sites propagate with `?`. Now the next client/template incompatibility surfaces as a loud 422 the operator sees immediately, not a mysteriously-degraded session. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-15 17:48:31 +03:00
parent 8880b2f8a6
commit a94dd55ab8
2 changed files with 84 additions and 11 deletions
--- a/crates/neuron/src/api.rs
+++ b/crates/neuron/src/api.rs
@@ -328,6 +328,14 @@ async fn chat_completions(
                })),
            )
                .into_response(),
+            Err(InferenceError::TemplateRenderFailed { detail }) => (
+                StatusCode::UNPROCESSABLE_ENTITY,
+                Json(json!({
+                    "error": format!("chat template could not render this request: {detail}"),
+                    "code": "template_render_failed",
+                })),
+            )
+                .into_response(),
            Err(InferenceError::Other(e)) => (
                StatusCode::INTERNAL_SERVER_ERROR,
                Json(json!({"error": format!("{e:#}")})),
@@ -379,6 +387,14 @@ async fn chat_completions(
                })),
            )
                .into_response(),
+            Err(InferenceError::TemplateRenderFailed { detail }) => (
+                StatusCode::UNPROCESSABLE_ENTITY,
+                Json(json!({
+                    "error": format!("chat template could not render this request: {detail}"),
+                    "code": "template_render_failed",
+                })),
+            )
+                .into_response(),
            Err(InferenceError::Other(e)) => (
                StatusCode::INTERNAL_SERVER_ERROR,
                Json(json!({"error": format!("{e:#}")})),
@@ -554,6 +570,14 @@ fn inference_error_response(err: InferenceError) -> axum::response::Response {
            })),
        )
            .into_response(),
+        InferenceError::TemplateRenderFailed { detail } => (
+            StatusCode::UNPROCESSABLE_ENTITY,
+            Json(json!({
+                "error": format!("chat template could not render this request: {detail}"),
+                "code": "template_render_failed",
+            })),
+        )
+            .into_response(),
        InferenceError::Other(e) => (
            StatusCode::INTERNAL_SERVER_ERROR,
            Json(json!({"error": format!("{e:#}")})),
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -1904,7 +1904,7 @@ impl CandleHarness {
        let _inference_guard = loaded.inference_lock.lock().await;

        let result = async {
-            let prompt = build_prompt_for_request(loaded.chat_template.as_deref(), &request);
+            let prompt = build_prompt_for_request(loaded.chat_template.as_deref(), &request)?;

            let encoding = loaded
                .tokenizer
@@ -2301,7 +2301,7 @@ impl CandleHarness {
            }
        };

-        let prompt = build_prompt_for_request(loaded.chat_template.as_deref(), &request);
+        let prompt = build_prompt_for_request(loaded.chat_template.as_deref(), &request)?;
        let encoding = loaded
            .tokenizer
            .encode(prompt.as_str(), true)
@@ -3346,7 +3346,7 @@ impl CandleHarness {
            });
        }

-        let prompt = build_prompt_for_request(tp.chat_template.as_deref(), &request);
+        let prompt = build_prompt_for_request(tp.chat_template.as_deref(), &request)?;
        let encoding = tp
            .tokenizer
            .encode(prompt.as_str(), true)
@@ -3959,7 +3959,7 @@ async fn chat_completion_tp_inner(
    let req_start = std::time::Instant::now();
    let model_id = request.model.clone();

-    let prompt = build_prompt_for_request(tp.chat_template.as_deref(), &request);
+    let prompt = build_prompt_for_request(tp.chat_template.as_deref(), &request)?;
    let encoding = tp
        .tokenizer
        .encode(prompt.as_str(), true)
@@ -4574,6 +4574,13 @@ pub enum InferenceError {
         remove the image_url content parts from the request"
    )]
    VisionUnsupported { model_id: String },
+    /// The loaded model's chat template could not render the request
+    /// (e.g. a message / tool-call structure it rejects). Returned only
+    /// when the request carried tools — silently degrading to a
+    /// tool-less prompt breaks tool calling invisibly, which is the
+    /// failure mode that hid several client-compat bugs. Maps to 422.
+    #[error("chat template could not render this request: {detail}")]
+    TemplateRenderFailed { detail: String },
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
@@ -4602,12 +4609,12 @@ pub enum InferenceError {
 fn build_prompt_for_request(
    chat_template: Option<&str>,
    request: &ChatCompletionRequest,
-) -> String {
+) -> Result<String, InferenceError> {
    if !super::chat_template::chat_templates_enabled() {
-        return format_qwen3_prompt(&request.messages);
+        return Ok(format_qwen3_prompt(&request.messages));
    }
    let Some(tmpl) = chat_template else {
-        return format_qwen3_prompt(&request.messages);
+        return Ok(format_qwen3_prompt(&request.messages));
    };

    // Pull `chat_template_kwargs` and `tools` from the request's
@@ -4641,15 +4648,30 @@ fn build_prompt_for_request(
                prompt = %prompt,
                "chat_template: rendered prompt"
            );
-            prompt
+            Ok(prompt)
        }
        Err(e) => {
+            let detail = format!("{e:#}");
+            // A tools-bearing request the template can't render must NOT
+            // silently degrade to a tool-less fallback prompt — that
+            // strips every tool and breaks tool calling invisibly (the
+            // failure mode behind the system-message, arguments-format,
+            // and tool-render bugs). Surface it as an error instead.
+            let has_tools = tools.as_array().is_some_and(|a| !a.is_empty());
+            if has_tools {
+                tracing::warn!(
+                    model = %request.model,
+                    error = %detail,
+                    "chat_template render failed on a tools-bearing request — returning 422 (refusing silent tool-less fallback)"
+                );
+                return Err(InferenceError::TemplateRenderFailed { detail });
+            }
            tracing::warn!(
                model = %request.model,
-                error = %format!("{e:#}"),
-                "chat_template render failed; falling back to format_qwen3_prompt"
+                error = %detail,
+                "chat_template render failed; falling back to format_qwen3_prompt (no tools to drop)"
            );
-            format_qwen3_prompt(&request.messages)
+            Ok(format_qwen3_prompt(&request.messages))
        }
    }
 }
@@ -6372,4 +6394,31 @@ mod tests {
        // Non-reasoning model (no pair) → always false.
        assert!(!prompt_opens_reasoning(&[100], None));
    }
+
+    #[test]
+    fn render_failure_with_tools_errors_instead_of_silent_fallback() {
+        // A template that always raises — stands in for the real
+        // incompatibilities (system-message position, tool_call arg
+        // shape) that made neuron silently drop tools.
+        let bad = "{{ raise_exception('boom') }}";
+
+        // Tools present → must surface as an error, never a tool-less
+        // fallback prompt.
+        let with_tools: ChatCompletionRequest = serde_json::from_value(serde_json::json!({
+            "model": "m",
+            "messages": [{"role": "user", "content": "hi"}],
+            "tools": [{"type": "function", "function": {"name": "x"}}]
+        }))
+        .unwrap();
+        let err = build_prompt_for_request(Some(bad), &with_tools).unwrap_err();
+        assert!(matches!(err, InferenceError::TemplateRenderFailed { .. }));
+
+        // No tools → falling back is harmless, so it stays Ok.
+        let no_tools: ChatCompletionRequest = serde_json::from_value(serde_json::json!({
+            "model": "m",
+            "messages": [{"role": "user", "content": "hi"}]
+        }))
+        .unwrap();
+        assert!(build_prompt_for_request(Some(bad), &no_tools).is_ok());
+    }
 }