Compare commits

...

2 Commits

Author SHA1 Message Date
51e452b607 feat: discover max_output_tokens from server at startup
Instead of hardcoding per-family token budgets, ClaudeClient queries the
server at startup and sets max_output_tokens = context_length / 2.

Two discovery strategies, tried in order:
1. LM Studio /api/v1/models — returns loaded_instances[].config.context_length
   (the actually-configured context, e.g. 64000) and max_context_length
   (theoretical max, e.g. 131072). We prefer the loaded value.
2. OpenAI-compat /v1/models/{id} — used as fallback for non-LM Studio
   backends that expose context_length on the model object.

If both fail, the family default is kept (DeepSeekR1=32768, Generic=8192).

lmstudio_context_length() matches model IDs with and without quantization
suffixes (@q4_k_m etc.) so the --model flag doesn't need to be exact.

For the current R1-32B setup: loaded context=64000 → max_output_tokens=32000,
giving the thinking pass plenty of room while reserving half for input.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-09 18:44:41 +02:00
89f7ba66e0 feat: model-family-aware token budgets and prompt style
Add ModelFamily enum (config.rs) detected from the model name:
- DeepSeekR1: matched on "deepseek-r1", "r1-distill" — R1 thinking blocks
  consume thousands of output tokens before the JSON; max_output_tokens
  raised to 32768 and HTTP timeout to 300s; prompt tells the model its
  <think> output is stripped and only the bare JSON is used
- Generic: previous behaviour (8192 tokens, 120s timeout)

ClaudeClient stores the detected family and uses it for max_tokens and
the request timeout. family() accessor lets the caller (agent.rs) pass
it into system_prompt().

prompts::system_prompt() now accepts &ModelFamily and injects a
family-specific "output format" section in place of the hardcoded
"How to respond" block. New families can be added by extending the
enum and the match arms without touching prompt logic elsewhere.

Also: log full anyhow cause chain (:#) on JSON extraction failure and
show response length alongside the truncated preview, to make future
diagnosis easier.

Root cause of the 2026-03-09T18:29:22 run failure: R1's thinking tokens
counted against max_tokens:8192, leaving only ~500 chars for the actual
JSON, which was always truncated mid-object.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-09 18:39:51 +02:00
4 changed files with 195 additions and 21 deletions

View File

@@ -132,7 +132,8 @@ pub async fn run(cli: &Cli) -> Result<()> {
// Init clients // Init clients
let swym = SwymClient::new(&cli.swym_url)?; let swym = SwymClient::new(&cli.swym_url)?;
let claude = ClaudeClient::new(&cli.anthropic_key, &cli.anthropic_url, &cli.model); let mut claude = ClaudeClient::new(&cli.anthropic_key, &cli.anthropic_url, &cli.model);
claude.apply_server_limits().await;
// Check candle coverage for all instruments // Check candle coverage for all instruments
info!( info!(
@@ -189,7 +190,8 @@ pub async fn run(cli: &Cli) -> Result<()> {
// Load DSL schema for the system prompt // Load DSL schema for the system prompt
let schema = include_str!("dsl-schema.json"); let schema = include_str!("dsl-schema.json");
let system = prompts::system_prompt(schema); let system = prompts::system_prompt(schema, claude.family());
info!("model family: {}", claude.family().name());
// Agent state // Agent state
let mut history: Vec<IterationRecord> = Vec::new(); let mut history: Vec<IterationRecord> = Vec::new();
@@ -267,10 +269,11 @@ pub async fn run(cli: &Cli) -> Result<()> {
let strategy = match claude::extract_json(&response_text) { let strategy = match claude::extract_json(&response_text) {
Ok(s) => s, Ok(s) => s,
Err(e) => { Err(e) => {
warn!("failed to extract strategy JSON: {e}"); warn!("failed to extract strategy JSON: {e:#}");
warn!( warn!(
"raw response: {}", "raw response ({} chars): {}",
&response_text[..response_text.len().min(500)] response_text.len(),
&response_text[..response_text.len().min(800)]
); );
consecutive_failures += 1; consecutive_failures += 1;
if consecutive_failures >= 3 { if consecutive_failures >= 3 {

View File

@@ -2,12 +2,20 @@ use anyhow::{Context, Result};
use reqwest::Client; use reqwest::Client;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_json::Value; use serde_json::Value;
use tracing::{info, warn};
use crate::config::ModelFamily;
pub struct ClaudeClient { pub struct ClaudeClient {
client: Client, client: Client,
api_key: String, api_key: String,
api_url: String, api_url: String,
model: String, model: String,
family: ModelFamily,
/// Effective max output tokens, initialised from the family default and
/// optionally updated by `apply_server_limits()` after querying the
/// server's model metadata.
max_output_tokens: u32,
} }
#[derive(Serialize)] #[derive(Serialize)]
@@ -43,19 +51,93 @@ pub struct Usage {
impl ClaudeClient { impl ClaudeClient {
pub fn new(api_key: &str, api_url: &str, model: &str) -> Self { pub fn new(api_key: &str, api_url: &str, model: &str) -> Self {
let family = ModelFamily::detect(model);
// R1 thinking can take several minutes; use a generous timeout.
let timeout_secs = if family.has_thinking() { 300 } else { 120 };
let client = Client::builder() let client = Client::builder()
.timeout(std::time::Duration::from_secs(120)) .timeout(std::time::Duration::from_secs(timeout_secs))
.build() .build()
.expect("build http client"); .expect("build http client");
let max_output_tokens = family.max_output_tokens();
Self { Self {
client, client,
api_key: api_key.to_string(), api_key: api_key.to_string(),
api_url: api_url.to_string(), api_url: api_url.to_string(),
model: model.to_string(), model: model.to_string(),
family,
max_output_tokens,
} }
} }
/// Send a conversation to Claude and get the text response. pub fn family(&self) -> &ModelFamily {
&self.family
}
/// Query the server for the loaded model's actual context length and
/// update `max_output_tokens` accordingly.
///
/// Uses half the loaded context window for output, leaving the other
/// half for the system prompt and conversation history. Falls back to
/// the family default if the server does not expose the information.
///
/// Tries two endpoints:
/// 1. LM Studio `/api/v1/models` — returns `loaded_instances[].config.context_length`
/// 2. OpenAI-compat `/v1/models/{id}` — returns `context_length` if present
pub async fn apply_server_limits(&mut self) {
    if let Some(ctx_len) = self.query_context_length().await {
        // Split the window evenly: half for output, half reserved for the
        // system prompt plus the accumulated conversation history.
        let budget = ctx_len / 2;
        info!(
            "server context_length={ctx_len} → max_output_tokens={budget} \
            (was {} from family default)",
            self.max_output_tokens,
        );
        self.max_output_tokens = budget;
    } else {
        // Discovery failed on both endpoints — keep the family default
        // that was set in the constructor.
        info!(
            "could not determine server context_length; \
            using family default max_output_tokens={}",
            self.max_output_tokens,
        );
    }
}
/// Try to discover the loaded context length for the current model.
///
/// NOTE(review): `self.model` is interpolated into a URL path unescaped —
/// a model id containing `/` would change the request path. Confirm ids
/// are always path-safe here.
async fn query_context_length(&self) -> Option<u32> {
    let base = self.api_url.trim_end_matches('/');

    // --- Strategy 1: LM Studio proprietary /api/v1/models ---
    let lmstudio_url = format!("{base}/api/v1/models");
    let lmstudio_json = match self.client.get(&lmstudio_url).send().await {
        Ok(resp) if resp.status().is_success() => resp.json::<Value>().await.ok(),
        _ => None,
    };
    if let Some(ctx) = lmstudio_json
        .as_ref()
        .and_then(|json| lmstudio_context_length(json, &self.model))
    {
        return Some(ctx);
    }

    // --- Strategy 2: OpenAI-compat /v1/models/{id} ---
    let oai_url = format!("{base}/v1/models/{}", self.model);
    let oai_json = match self.client.get(&oai_url).send().await {
        Ok(resp) if resp.status().is_success() => resp.json::<Value>().await.ok(),
        _ => None,
    };
    if let Some(n) = oai_json.and_then(|json| json["context_length"].as_u64()) {
        return Some(n as u32);
    }

    warn!("could not query context_length from server for model {}", self.model);
    None
}
/// Send a conversation to the model and get the text response.
pub async fn chat( pub async fn chat(
&self, &self,
system: &str, system: &str,
@@ -63,7 +145,7 @@ impl ClaudeClient {
) -> Result<(String, Option<Usage>)> { ) -> Result<(String, Option<Usage>)> {
let body = MessagesRequest { let body = MessagesRequest {
model: self.model.clone(), model: self.model.clone(),
max_tokens: 8192, max_tokens: self.max_output_tokens,
system: system.to_string(), system: system.to_string(),
messages: messages.to_vec(), messages: messages.to_vec(),
}; };
@@ -98,6 +180,39 @@ impl ClaudeClient {
} }
} }
/// Extract the loaded context_length for a model from the LM Studio
/// `/api/v1/models` response.
///
/// Matches on `key` or `id` fields (LM Studio uses `key`; some variants
/// append a quantization suffix like `@q4_k_m`, so we strip that too).
///
/// Returns `None` when no entry matches or the matching entry exposes
/// neither a loaded-instance context nor a `max_context_length`.
fn lmstudio_context_length(json: &Value, model_id: &str) -> Option<u32> {
    let models = json["models"].as_array()?;
    // Strip any quantization suffix from the requested id so that
    // "foo@q4_k_m" and "foo" compare equal.
    let model_base = model_id.split('@').next().unwrap_or(model_id);
    for entry in models {
        // Fix: the doc promised matching on `key` OR `id`, but only `key`
        // was actually checked. Compare both fields, each exactly and with
        // its quantization suffix stripped.
        let matched = ["key", "id"].iter().any(|field| {
            let v = entry[*field].as_str().unwrap_or("");
            v == model_id || v.split('@').next().unwrap_or(v) == model_base
        });
        if matched {
            // Prefer the actually-loaded context (loaded_instances[0].config.context_length)
            // over the theoretical max_context_length.
            if let Some(n) = entry["loaded_instances"]
                .as_array()
                .and_then(|a| a.first())
                .and_then(|inst| inst["config"]["context_length"].as_u64())
            {
                return Some(n as u32);
            }
            // Fall back to max_context_length if no loaded instance info.
            if let Some(n) = entry["max_context_length"].as_u64() {
                return Some(n as u32);
            }
            // Matching entry carried no usable context info — keep scanning,
            // as the original did (loop falls through to the next entry).
        }
    }
    None
}
/// Extract a JSON object from a model response text. /// Extract a JSON object from a model response text.
/// Handles markdown code fences and R1-style `<think>...</think>` blocks. /// Handles markdown code fences and R1-style `<think>...</think>` blocks.
pub fn extract_json(text: &str) -> Result<Value> { pub fn extract_json(text: &str) -> Result<Value> {

View File

@@ -2,6 +2,50 @@ use std::path::PathBuf;
use clap::Parser; use clap::Parser;
/// Model family — controls token budgets and prompt style.
///
/// Fieldless two-variant enum, so it is trivially `Copy`; `Eq` and `Hash`
/// are derived as well per standard Rust practice for tag-like enums.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ModelFamily {
    /// DeepSeek-R1 and its distillations: emit `<think>` blocks that count
    /// against the output-token budget, so we need a much larger max_tokens.
    DeepSeekR1,
    /// General instruction-following models (Qwen, Llama, Mistral, …).
    Generic,
}

impl ModelFamily {
    /// Detect family from a model name string (case-insensitive).
    pub fn detect(model: &str) -> Self {
        let m = model.to_ascii_lowercase();
        // Cover both dash and underscore spellings of the distill variants.
        if ["deepseek-r1", "r1-distill", "r1_distill"]
            .iter()
            .any(|needle| m.contains(needle))
        {
            Self::DeepSeekR1
        } else {
            Self::Generic
        }
    }

    /// Display name for logging.
    pub fn name(&self) -> &'static str {
        match self {
            Self::DeepSeekR1 => "DeepSeek-R1",
            Self::Generic => "Generic",
        }
    }

    /// Maximum output tokens to request. R1 thinking blocks can be thousands
    /// of tokens; reserve enough headroom for the JSON after thinking.
    pub fn max_output_tokens(&self) -> u32 {
        match self {
            Self::DeepSeekR1 => 32768,
            Self::Generic => 8192,
        }
    }

    /// Whether this model family emits chain-of-thought before its response.
    pub fn has_thinking(&self) -> bool {
        matches!(self, Self::DeepSeekR1)
    }
}
/// Autonomous strategy search agent for the swym backtesting platform. /// Autonomous strategy search agent for the swym backtesting platform.
/// ///
/// Runs a loop: ask Claude to generate/refine strategies → submit backtests to swym → /// Runs a loop: ask Claude to generate/refine strategies → submit backtests to swym →

View File

@@ -1,9 +1,28 @@
/// System prompt for the strategy-generation Claude instance. use crate::config::ModelFamily;
/// System prompt for the strategy-generation model.
/// ///
/// This is the most important part of the agent — it defines how Claude /// Accepts a `ModelFamily` so each family can receive tailored guidance
/// thinks about strategy design, what it knows about the DSL, and how /// while sharing the common DSL schema and strategy evaluation rules.
/// it should interpret backtest results. pub fn system_prompt(dsl_schema: &str, family: &ModelFamily) -> String {
pub fn system_prompt(dsl_schema: &str) -> String { let output_instructions = match family {
ModelFamily::DeepSeekR1 => {
"## Output format\n\n\
Think through your strategy design carefully before committing to it. \
After your thinking, output ONLY a bare JSON object — no markdown fences, \
no commentary, no explanation. Start with `{` and end with `}`. \
Your thinking will be stripped automatically; only the JSON is used."
}
ModelFamily::Generic => {
"## How to respond\n\n\
You must respond with ONLY a valid JSON object — the strategy config.\n\
No prose, no markdown explanation, no commentary.\n\
Just the raw JSON starting with { and ending with }.\n\n\
The JSON must be a valid strategy with \"type\": \"rule_based\".\n\
Use \"usdc\" (not \"usdt\") as the quote asset for balance expressions."
}
};
format!( format!(
r##"You are a quantitative trading strategy researcher. Your task is to design, r##"You are a quantitative trading strategy researcher. Your task is to design,
evaluate, and iteratively refine trading strategies expressed in the swym JSON DSL. evaluate, and iteratively refine trading strategies expressed in the swym JSON DSL.
@@ -88,14 +107,7 @@ Every strategy MUST have:
- A time-based exit: use bars_since_entry to avoid holding losers indefinitely - A time-based exit: use bars_since_entry to avoid holding losers indefinitely
- Reasonable position sizing: prefer ATR-based or percent-of-balance over fixed quantity - Reasonable position sizing: prefer ATR-based or percent-of-balance over fixed quantity
## How to respond {output_instructions}
You must respond with ONLY a valid JSON object — the strategy config.
No prose, no markdown explanation, no commentary.
Just the raw JSON starting with {{ and ending with }}.
The JSON must be a valid strategy with "type": "rule_based".
Use "usdc" (not "usdt") as the quote asset for balance expressions.
## Interpreting backtest results ## Interpreting backtest results