From 51e452b6076c68210f1abdbbb90a53240a115758 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Mon, 9 Mar 2026 18:44:41 +0200 Subject: [PATCH] feat: discover max_output_tokens from server at startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of hardcoding per-family token budgets, ClaudeClient queries the server at startup and sets max_output_tokens = context_length / 2. Two discovery strategies, tried in order: 1. LM Studio /api/v1/models — returns loaded_instances[].config.context_length (the actually-configured context, e.g. 64000) and max_context_length (theoretical max, e.g. 131072). We prefer the loaded value. 2. OpenAI-compat /v1/models/{id} — used as fallback for non-LM Studio backends that expose context_length on the model object. If both fail, the family default is kept (DeepSeekR1=32768, Generic=8192). lmstudio_context_length() matches model IDs with and without quantization suffixes (@q4_k_m etc.) so the --model flag doesn't need to be exact. For the current R1-32B setup: loaded context=64000 → max_output_tokens=32000, giving the thinking pass plenty of room while reserving half for input. 
Co-Authored-By: Claude Sonnet 4.6 --- src/agent.rs | 3 +- src/claude.rs | 108 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 3 deletions(-) diff --git a/src/agent.rs b/src/agent.rs index baf0514..4a9c454 100644 --- a/src/agent.rs +++ b/src/agent.rs @@ -132,7 +132,8 @@ pub async fn run(cli: &Cli) -> Result<()> { // Init clients let swym = SwymClient::new(&cli.swym_url)?; - let claude = ClaudeClient::new(&cli.anthropic_key, &cli.anthropic_url, &cli.model); + let mut claude = ClaudeClient::new(&cli.anthropic_key, &cli.anthropic_url, &cli.model); + claude.apply_server_limits().await; // Check candle coverage for all instruments info!( diff --git a/src/claude.rs b/src/claude.rs index 8b96a77..9c70bd7 100644 --- a/src/claude.rs +++ b/src/claude.rs @@ -2,6 +2,7 @@ use anyhow::{Context, Result}; use reqwest::Client; use serde::{Deserialize, Serialize}; use serde_json::Value; +use tracing::{info, warn}; use crate::config::ModelFamily; @@ -11,6 +12,10 @@ pub struct ClaudeClient { api_url: String, model: String, family: ModelFamily, + /// Effective max output tokens, initialised from the family default and + /// optionally updated by `apply_server_limits()` after querying the + /// server's model metadata. + max_output_tokens: u32, } #[derive(Serialize)] @@ -53,12 +58,14 @@ impl ClaudeClient { .timeout(std::time::Duration::from_secs(timeout_secs)) .build() .expect("build http client"); + let max_output_tokens = family.max_output_tokens(); Self { client, api_key: api_key.to_string(), api_url: api_url.to_string(), model: model.to_string(), family, + max_output_tokens, } } @@ -66,7 +73,71 @@ impl ClaudeClient { &self.family } - /// Send a conversation to Claude and get the text response. + /// Query the server for the loaded model's actual context length and + /// update `max_output_tokens` accordingly. + /// + /// Uses half the loaded context window for output, leaving the other + /// half for the system prompt and conversation history. 
Falls back to + /// the family default if the server does not expose the information. + /// + /// Tries two endpoints: + /// 1. LM Studio `/api/v1/models` — returns `loaded_instances[].config.context_length` + /// 2. OpenAI-compat `/v1/models/{id}` — returns `context_length` if present + pub async fn apply_server_limits(&mut self) { + match self.query_context_length().await { + Some(ctx_len) => { + // Reserve half the context for input (system prompt + history). + let budget = ctx_len / 2; + info!( + "server context_length={ctx_len} → max_output_tokens={budget} \ + (was {} from family default)", + self.max_output_tokens, + ); + self.max_output_tokens = budget; + } + None => { + info!( + "could not determine server context_length; \ + using family default max_output_tokens={}", + self.max_output_tokens, + ); + } + } + } + + /// Try to discover the loaded context length for the current model. + async fn query_context_length(&self) -> Option<u32> { + let base = self.api_url.trim_end_matches('/'); + + // --- Strategy 1: LM Studio proprietary /api/v1/models --- + let lmstudio_url = format!("{base}/api/v1/models"); + if let Ok(resp) = self.client.get(&lmstudio_url).send().await { + if resp.status().is_success() { + if let Ok(json) = resp.json::<Value>().await { + if let Some(ctx) = lmstudio_context_length(&json, &self.model) { + return Some(ctx); + } + } + } + } + + // --- Strategy 2: OpenAI-compat /v1/models/{id} --- + let oai_url = format!("{base}/v1/models/{}", self.model); + if let Ok(resp) = self.client.get(&oai_url).send().await { + if resp.status().is_success() { + if let Ok(json) = resp.json::<Value>().await { + if let Some(n) = json["context_length"].as_u64() { + return Some(n as u32); + } + } + } + } + + warn!("could not query context_length from server for model {}", self.model); + None + } + + /// Send a conversation to the model and get the text response. 
pub async fn chat( &self, system: &str, @@ -74,7 +145,7 @@ ) -> Result<(String, Option<String>)> { let body = MessagesRequest { model: self.model.clone(), - max_tokens: self.family.max_output_tokens(), + max_tokens: self.max_output_tokens, system: system.to_string(), messages: messages.to_vec(), }; @@ -109,6 +180,39 @@ } } +/// Extract the loaded context_length for a model from the LM Studio +/// `/api/v1/models` response. +/// +/// Matches on `key` or `id` fields (LM Studio uses `key`; some variants +/// append a quantization suffix like `@q4_k_m`, so we strip that too). +fn lmstudio_context_length(json: &Value, model_id: &str) -> Option<u32> { + let models = json["models"].as_array()?; + let model_base = model_id.split('@').next().unwrap_or(model_id); + + for entry in models { + let key = entry["key"].as_str().unwrap_or(""); + let key_base = key.split('@').next().unwrap_or(key); + + if key_base == model_base || key == model_id { + // Prefer the actually-loaded context (loaded_instances[0].config.context_length) + // over the theoretical max_context_length. + let loaded = entry["loaded_instances"] + .as_array() + .and_then(|a| a.first()) + .and_then(|inst| inst["config"]["context_length"].as_u64()) + .map(|n| n as u32); + if loaded.is_some() { + return loaded; + } + // Fall back to max_context_length if no loaded instance info + if let Some(n) = entry["max_context_length"].as_u64() { + return Some(n as u32); + } + } + } + None +} + /// Extract a JSON object from a model response text. /// Handles markdown code fences and R1-style `<think>...</think>` blocks. pub fn extract_json(text: &str) -> Result<Value> {