From 51e452b6076c68210f1abdbbb90a53240a115758 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Mon, 9 Mar 2026 18:44:41 +0200 Subject: [PATCH] feat: discover max_output_tokens from server at startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of hardcoding per-family token budgets, ClaudeClient queries the server at startup and sets max_output_tokens = context_length / 2. Two discovery strategies, tried in order: 1. LM Studio /api/v1/models — returns loaded_instances[].config.context_length (the actually-configured context, e.g. 64000) and max_context_length (theoretical max, e.g. 131072). We prefer the loaded value. 2. OpenAI-compat /v1/models/{id} — used as fallback for non-LM Studio backends that expose context_length on the model object. If both fail, the family default is kept (DeepSeekR1=32768, Generic=8192). lmstudio_context_length() matches model IDs with and without quantization suffixes (@q4_k_m etc.) so the --model flag doesn't need to be exact. For the current R1-32B setup: loaded context=64000 → max_output_tokens=32000, giving the thinking pass plenty of room while reserving half for input. 
Co-Authored-By: Claude Sonnet 4.6 --- src/agent.rs | 3 +- src/claude.rs | 108 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 3 deletions(-) diff --git a/src/agent.rs b/src/agent.rs index baf0514..4a9c454 100644 --- a/src/agent.rs +++ b/src/agent.rs @@ -132,7 +132,8 @@ pub async fn run(cli: &Cli) -> Result<()> { // Init clients let swym = SwymClient::new(&cli.swym_url)?; - let claude = ClaudeClient::new(&cli.anthropic_key, &cli.anthropic_url, &cli.model); + let mut claude = ClaudeClient::new(&cli.anthropic_key, &cli.anthropic_url, &cli.model); + claude.apply_server_limits().await; // Check candle coverage for all instruments info!( diff --git a/src/claude.rs b/src/claude.rs index 8b96a77..9c70bd7 100644 --- a/src/claude.rs +++ b/src/claude.rs @@ -2,6 +2,7 @@ use anyhow::{Context, Result}; use reqwest::Client; use serde::{Deserialize, Serialize}; use serde_json::Value; +use tracing::{info, warn}; use crate::config::ModelFamily; @@ -11,6 +12,10 @@ pub struct ClaudeClient { api_url: String, model: String, family: ModelFamily, + /// Effective max output tokens, initialised from the family default and + /// optionally updated by `apply_server_limits()` after querying the + /// server's model metadata. + max_output_tokens: u32, } #[derive(Serialize)] @@ -53,12 +58,14 @@ impl ClaudeClient { .timeout(std::time::Duration::from_secs(timeout_secs)) .build() .expect("build http client"); + let max_output_tokens = family.max_output_tokens(); Self { client, api_key: api_key.to_string(), api_url: api_url.to_string(), model: model.to_string(), family, + max_output_tokens, } } @@ -66,7 +73,71 @@ impl ClaudeClient { &self.family } - /// Send a conversation to Claude and get the text response. + /// Query the server for the loaded model's actual context length and + /// update `max_output_tokens` accordingly. + /// + /// Uses half the loaded context window for output, leaving the other + /// half for the system prompt and conversation history. 
Falls back to + /// the family default if the server does not expose the information. + /// + /// Tries two endpoints: + /// 1. LM Studio `/api/v1/models` — returns `loaded_instances[].config.context_length` + /// 2. OpenAI-compat `/v1/models/{id}` — returns `context_length` if present + pub async fn apply_server_limits(&mut self) { + match self.query_context_length().await { + Some(ctx_len) => { + // Reserve half the context for input (system prompt + history). + let budget = ctx_len / 2; + info!( + "server context_length={ctx_len} → max_output_tokens={budget} \ + (was {} from family default)", + self.max_output_tokens, + ); + self.max_output_tokens = budget; + } + None => { + info!( + "could not determine server context_length; \ + using family default max_output_tokens={}", + self.max_output_tokens, + ); + } + } + } + + /// Try to discover the loaded context length for the current model. + async fn query_context_length(&self) -> Option<u32> { + let base = self.api_url.trim_end_matches('/'); + + // --- Strategy 1: LM Studio proprietary /api/v1/models --- + let lmstudio_url = format!("{base}/api/v1/models"); + if let Ok(resp) = self.client.get(&lmstudio_url).send().await { + if resp.status().is_success() { + if let Ok(json) = resp.json::<Value>().await { + if let Some(ctx) = lmstudio_context_length(&json, &self.model) { + return Some(ctx); + } + } + } + } + + // --- Strategy 2: OpenAI-compat /v1/models/{id} --- + let oai_url = format!("{base}/v1/models/{}", self.model); + if let Ok(resp) = self.client.get(&oai_url).send().await { + if resp.status().is_success() { + if let Ok(json) = resp.json::<Value>().await { + if let Some(n) = json["context_length"].as_u64() { + return Some(n as u32); + } + } + } + } + + warn!("could not query context_length from server for model {}", self.model); + None + } + + /// Send a conversation to the model and get the text response. 
pub async fn chat( &self, system: &str, @@ -74,7 +145,7 @@ ) -> Result<(String, Option<String>)> { let body = MessagesRequest { model: self.model.clone(), - max_tokens: self.family.max_output_tokens(), + max_tokens: self.max_output_tokens, system: system.to_string(), messages: messages.to_vec(), }; @@ -109,6 +180,39 @@ } } +/// Extract the loaded context_length for a model from the LM Studio +/// `/api/v1/models` response. +/// +/// Matches on `key` or `id` fields (LM Studio uses `key`; some variants +/// append a quantization suffix like `@q4_k_m`, so we strip that too). +fn lmstudio_context_length(json: &Value, model_id: &str) -> Option<u32> { + let models = json["models"].as_array()?; + let model_base = model_id.split('@').next().unwrap_or(model_id); + + for entry in models { + let key = entry["key"].as_str().unwrap_or(""); + let key_base = key.split('@').next().unwrap_or(key); + + if key_base == model_base || key == model_id { + // Prefer the actually-loaded context (loaded_instances[0].config.context_length) + // over the theoretical max_context_length. + let loaded = entry["loaded_instances"] + .as_array() + .and_then(|a| a.first()) + .and_then(|inst| inst["config"]["context_length"].as_u64()) + .map(|n| n as u32); + if loaded.is_some() { + return loaded; + } + // Fall back to max_context_length if no loaded instance info + if let Some(n) = entry["max_context_length"].as_u64() { + return Some(n as u32); + } + } + } + None +} + /// Extract a JSON object from a model response text. /// Handles markdown code fences and R1-style `<think>...</think>` blocks. pub fn extract_json(text: &str) -> Result<Value> {