//! Chat-template rendering for the model-supplied Jinja templates
//! HuggingFace tokenizers ship in `tokenizer_config.json`.
//!
//! ## Background
//!
//! Every modern open-weight model bundles a `chat_template` field
//! in its `tokenizer_config.json` — a Jinja2 template string that
//! converts a sequence of `{role, content}` messages into the
//! exact prompt the model was trained on. Examples:
//!
//! - Qwen3-Coder: `<|im_start|>{role}\n{content}<|im_end|>\n…`
//!   with conditional `enable_thinking` handling that injects an
//!   empty `<think>\n\n</think>` block when set false.
//! - DeepSeek-R1: similar im_start framing with different special-
//!   token names.
//! - Mistral / Magistral: a `[INST]` / `[/INST]` framing.
//! - Claude / Llama: another shape again.
//!
//! Rendering the model's own template is the only way to get the
//! *exact* prompt format the model was trained on plus the
//! model-specific kwargs (`enable_thinking`, `tools`, …) without
//! hardcoding per-model logic. The alternative — neuron's previous
//! `format_qwen3_prompt` — was a hardcoded Qwen3 ChatML glue that
//! ignored kwargs entirely.
//!
//! ## Scope
//!
//! This module is request-side only: it builds the prompt string
//! the tokenizer ingests before inference. The reasoning- and
//! tool-call-marker token routing (issues #6, #8) is response-side
//! and stays in `wire::openai_chat` / the streaming inference
//! loops.
//!
//! ## Fallback
//!
//! When the model's `tokenizer_config.json` is missing, doesn't
//! parse, lacks a `chat_template`, or renders an error, the caller
//! falls back to `format_qwen3_prompt`. The
//! `NEURON_USE_CHAT_TEMPLATE=false` env var is a global kill
//! switch — if a deploy goes sideways and the renderer is to
//! blame, an operator can flip the env and restart neuron without
//! shipping a new build.
/// Environment variable that, when set to `false`/`0`/`no`/`off`
/// (ASCII case-insensitive, surrounding whitespace ignored), forces
/// every model to skip its `chat_template` and fall back to
/// `format_qwen3_prompt`. Unset — or set to anything else, including
/// the empty string — means "use chat templates where available".
pub const KILL_SWITCH_ENV: &str = "NEURON_USE_CHAT_TEMPLATE";

/// Read the global kill switch.
///
/// Returns `true` when chat templates are enabled and `false` when the
/// operator has forced the fallback path everywhere via
/// [`KILL_SWITCH_ENV`]. The variable is re-read on every call, so the
/// result reflects the process environment at call time.
pub fn chat_templates_enabled() -> bool {
    /// Values that flip the switch off; everything else leaves it on.
    const DISABLING_VALUES: [&str; 4] = ["false", "0", "no", "off"];

    // `Err` covers both "unset" and "not valid Unicode". A non-Unicode
    // value cannot spell any of the disabling words, so both cases keep
    // templates enabled.
    let Ok(raw) = std::env::var(KILL_SWITCH_ENV) else {
        return true;
    };

    // Compare in place instead of allocating a lowercased copy.
    let value = raw.trim();
    !DISABLING_VALUES
        .iter()
        .any(|word| value.eq_ignore_ascii_case(word))
}
let jinja_path = parent.join("chat_template.jinja"); match std::fs::read_to_string(&jinja_path) { Ok(text) if !text.trim().is_empty() => { tracing::info!( path = %jinja_path.display(), "chat_template: loaded standalone chat_template.jinja" ); return Some(text); } Ok(_) => { tracing::warn!( path = %jinja_path.display(), "chat_template: chat_template.jinja present but empty; trying other sources" ); } Err(_) => {} // absent — fall through, common case } // 2. Standalone JSON file — `{"chat_template": "..."}` form. let json_path = parent.join("chat_template.json"); if json_path.exists() && let Some(t) = load_chat_template_from(&json_path) { tracing::info!( path = %json_path.display(), "chat_template: loaded standalone chat_template.json" ); return Some(t); } // 3. The `chat_template` field inside tokenizer_config.json. let config_path = parent.join("tokenizer_config.json"); load_chat_template_from(&config_path) } /// Best-effort load of `chat_template` from a HuggingFace /// `tokenizer_config.json`. Returns `None` when the file is /// absent, doesn't parse, or lacks the `chat_template` field — /// in all of those cases the caller falls back to /// `format_qwen3_prompt`. Warnings are logged so an operator can /// see why the fallback fired. pub fn load_chat_template_from(path: &Path) -> Option { let text = match std::fs::read_to_string(path) { Ok(t) => t, Err(e) => { tracing::debug!( path = %path.display(), error = %e, "chat_template: tokenizer_config.json absent or unreadable; falling back" ); return None; } }; let value: Value = match serde_json::from_str(&text) { Ok(v) => v, Err(e) => { tracing::warn!( path = %path.display(), error = %e, "chat_template: tokenizer_config.json failed to parse; falling back" ); return None; } }; // Some tokenizer_config.json files carry `chat_template` as an // array of `{name, template}` objects (multi-template models — // tool-use variant, default variant). 
For now we pick the first // entry; future iterations could honour a name hint. match value.get("chat_template") { Some(Value::String(s)) => Some(s.clone()), Some(Value::Array(arr)) => { for entry in arr { if let Some(t) = entry.get("template").and_then(|v| v.as_str()) { return Some(t.to_string()); } } tracing::warn!( path = %path.display(), "chat_template: array form had no usable template entry; falling back" ); None } _ => None, } } /// Render the chat template into the prompt the model expects. /// /// `template` is the raw Jinja string from `tokenizer_config.json`. /// `messages` is the conversation in order. `kwargs` is the /// `chat_template_kwargs` object the client supplied on the /// request (or `Value::Null` when absent). The function expands /// the kwargs into the Jinja context alongside the standard /// `messages` and `add_generation_prompt` variables HF templates /// expect. /// /// `tools` is the request's `tools` array (or `Value::Null`). /// Some chat templates iterate it to emit native tool definitions /// (Qwen3-Coder's tool-use template, Mistral's [TOOL_DEFINITIONS] /// frame). We forward whatever the client sent without /// interpretation. pub fn render_chat_template( template: &str, messages: &[ChatMessage], tools: &Value, kwargs: &Value, ) -> Result { let mut env = Environment::new(); // HF chat templates are authored against Python's Jinja2 with its // string semantics. Bridge the two so real model templates render: // // - `pycompat::unknown_method_callback` supplies Python str/list/dict // methods minijinja lacks natively (`startswith`, `endswith`, // `split`, `rstrip`, `lstrip`, …) — the Qwen3.6 template uses // several in its think-block and tool-response handling. // - `raise_exception` is the global HF templates call to reject // malformed inputs (e.g. an image in a system message). Map it to // a render error so the caller falls back / surfaces it. 
env.set_unknown_method_callback(minijinja_contrib::pycompat::unknown_method_callback); env.add_function( "raise_exception", |msg: String| -> Result { Err(MjError::new(MjErrorKind::InvalidOperation, msg)) }, ); // Compile the template against a fixed name so error messages // surface "chat_template" rather than `