Compare commits

...

2 Commits

Author SHA1 Message Date
51e452b607 feat: discover max_output_tokens from server at startup
Instead of hardcoding per-family token budgets, ClaudeClient queries the
server at startup and sets max_output_tokens = context_length / 2.

Two discovery strategies, tried in order:
1. LM Studio /api/v1/models — returns loaded_instances[].config.context_length
   (the actually-configured context, e.g. 64000) and max_context_length
   (theoretical max, e.g. 131072). We prefer the loaded value.
2. OpenAI-compat /v1/models/{id} — used as fallback for non-LM Studio
   backends that expose context_length on the model object.

If both fail, the family default is kept (DeepSeekR1=32768, Generic=8192).

lmstudio_context_length() matches model IDs with and without quantization
suffixes (@q4_k_m etc.) so the --model flag doesn't need to be exact.

For the current R1-32B setup: loaded context=64000 → max_output_tokens=32000,
giving the thinking pass plenty of room while reserving half for input.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-09 18:44:41 +02:00
89f7ba66e0 feat: model-family-aware token budgets and prompt style
Add ModelFamily enum (config.rs) detected from the model name:
- DeepSeekR1: matched on "deepseek-r1", "r1-distill" — R1 thinking blocks
  consume thousands of output tokens before the JSON; max_output_tokens
  raised to 32768 and HTTP timeout to 300s; prompt tells the model its
  <think> output is stripped and only the bare JSON is used
- Generic: previous behaviour (8192 tokens, 120s timeout)

ClaudeClient stores the detected family and uses it for max_tokens and
the request timeout. family() accessor lets the caller (agent.rs) pass
it into system_prompt().

prompts::system_prompt() now accepts &ModelFamily and injects a
family-specific "output format" section in place of the hardcoded
"How to respond" block. New families can be added by extending the
enum and the match arms without touching prompt logic elsewhere.

Also: log full anyhow cause chain (:#) on JSON extraction failure and
show response length alongside the truncated preview, to make future
diagnosis easier.

Root cause of the 2026-03-09T18:29:22 run failure: R1's thinking tokens
counted against max_tokens:8192, leaving only ~500 chars for the actual
JSON, which was always truncated mid-object.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-09 18:39:51 +02:00
4 changed files with 195 additions and 21 deletions

View File

@@ -132,7 +132,8 @@ pub async fn run(cli: &Cli) -> Result<()> {
// Init clients // Init clients
let swym = SwymClient::new(&cli.swym_url)?; let swym = SwymClient::new(&cli.swym_url)?;
let claude = ClaudeClient::new(&cli.anthropic_key, &cli.anthropic_url, &cli.model); let mut claude = ClaudeClient::new(&cli.anthropic_key, &cli.anthropic_url, &cli.model);
claude.apply_server_limits().await;
// Check candle coverage for all instruments // Check candle coverage for all instruments
info!( info!(
@@ -189,7 +190,8 @@ pub async fn run(cli: &Cli) -> Result<()> {
// Load DSL schema for the system prompt // Load DSL schema for the system prompt
let schema = include_str!("dsl-schema.json"); let schema = include_str!("dsl-schema.json");
let system = prompts::system_prompt(schema); let system = prompts::system_prompt(schema, claude.family());
info!("model family: {}", claude.family().name());
// Agent state // Agent state
let mut history: Vec<IterationRecord> = Vec::new(); let mut history: Vec<IterationRecord> = Vec::new();
@@ -267,10 +269,11 @@ pub async fn run(cli: &Cli) -> Result<()> {
let strategy = match claude::extract_json(&response_text) { let strategy = match claude::extract_json(&response_text) {
Ok(s) => s, Ok(s) => s,
Err(e) => { Err(e) => {
warn!("failed to extract strategy JSON: {e}"); warn!("failed to extract strategy JSON: {e:#}");
warn!( warn!(
"raw response: {}", "raw response ({} chars): {}",
&response_text[..response_text.len().min(500)] response_text.len(),
&response_text[..response_text.len().min(800)]
); );
consecutive_failures += 1; consecutive_failures += 1;
if consecutive_failures >= 3 { if consecutive_failures >= 3 {

View File

@@ -2,12 +2,20 @@ use anyhow::{Context, Result};
use reqwest::Client; use reqwest::Client;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_json::Value; use serde_json::Value;
use tracing::{info, warn};
use crate::config::ModelFamily;
pub struct ClaudeClient { pub struct ClaudeClient {
client: Client, client: Client,
api_key: String, api_key: String,
api_url: String, api_url: String,
model: String, model: String,
family: ModelFamily,
/// Effective max output tokens, initialised from the family default and
/// optionally updated by `apply_server_limits()` after querying the
/// server's model metadata.
max_output_tokens: u32,
} }
#[derive(Serialize)] #[derive(Serialize)]
@@ -43,19 +51,93 @@ pub struct Usage {
impl ClaudeClient { impl ClaudeClient {
pub fn new(api_key: &str, api_url: &str, model: &str) -> Self { pub fn new(api_key: &str, api_url: &str, model: &str) -> Self {
let family = ModelFamily::detect(model);
// R1 thinking can take several minutes; use a generous timeout.
let timeout_secs = if family.has_thinking() { 300 } else { 120 };
let client = Client::builder() let client = Client::builder()
.timeout(std::time::Duration::from_secs(120)) .timeout(std::time::Duration::from_secs(timeout_secs))
.build() .build()
.expect("build http client"); .expect("build http client");
let max_output_tokens = family.max_output_tokens();
Self { Self {
client, client,
api_key: api_key.to_string(), api_key: api_key.to_string(),
api_url: api_url.to_string(), api_url: api_url.to_string(),
model: model.to_string(), model: model.to_string(),
family,
max_output_tokens,
} }
} }
/// Send a conversation to Claude and get the text response. pub fn family(&self) -> &ModelFamily {
&self.family
}
/// Query the server for the loaded model's actual context length and
/// update `max_output_tokens` accordingly.
///
/// Uses half the loaded context window for output, leaving the other
/// half for the system prompt and conversation history. Falls back to
/// the family default if the server does not expose the information.
///
/// Tries two endpoints:
/// 1. LM Studio `/api/v1/models` — returns `loaded_instances[].config.context_length`
/// 2. OpenAI-compat `/v1/models/{id}` — returns `context_length` if present
pub async fn apply_server_limits(&mut self) {
    if let Some(ctx_len) = self.query_context_length().await {
        // Split the window evenly: half for output, half reserved for the
        // system prompt plus the accumulated conversation history.
        let budget = ctx_len / 2;
        info!(
            "server context_length={ctx_len} → max_output_tokens={budget} \
            (was {} from family default)",
            self.max_output_tokens,
        );
        self.max_output_tokens = budget;
    } else {
        // Discovery failed on both endpoints — keep the family default
        // that was set in the constructor.
        info!(
            "could not determine server context_length; \
            using family default max_output_tokens={}",
            self.max_output_tokens,
        );
    }
}
/// Try to discover the loaded context length for the current model.
///
/// NOTE(review): `self.model` is interpolated into a URL path unescaped —
/// a model id containing `/` would change the request path. Confirm ids
/// are always path-safe here.
async fn query_context_length(&self) -> Option<u32> {
    let base = self.api_url.trim_end_matches('/');

    // --- Strategy 1: LM Studio proprietary /api/v1/models ---
    let lmstudio_url = format!("{base}/api/v1/models");
    let lmstudio_json = match self.client.get(&lmstudio_url).send().await {
        Ok(resp) if resp.status().is_success() => resp.json::<Value>().await.ok(),
        _ => None,
    };
    if let Some(ctx) = lmstudio_json
        .as_ref()
        .and_then(|json| lmstudio_context_length(json, &self.model))
    {
        return Some(ctx);
    }

    // --- Strategy 2: OpenAI-compat /v1/models/{id} ---
    let oai_url = format!("{base}/v1/models/{}", self.model);
    let oai_json = match self.client.get(&oai_url).send().await {
        Ok(resp) if resp.status().is_success() => resp.json::<Value>().await.ok(),
        _ => None,
    };
    if let Some(n) = oai_json.and_then(|json| json["context_length"].as_u64()) {
        return Some(n as u32);
    }

    warn!("could not query context_length from server for model {}", self.model);
    None
}
/// Send a conversation to the model and get the text response.
pub async fn chat( pub async fn chat(
&self, &self,
system: &str, system: &str,
@@ -63,7 +145,7 @@ impl ClaudeClient {
) -> Result<(String, Option<Usage>)> { ) -> Result<(String, Option<Usage>)> {
let body = MessagesRequest { let body = MessagesRequest {
model: self.model.clone(), model: self.model.clone(),
max_tokens: 8192, max_tokens: self.max_output_tokens,
system: system.to_string(), system: system.to_string(),
messages: messages.to_vec(), messages: messages.to_vec(),
}; };
@@ -98,6 +180,39 @@ impl ClaudeClient {
} }
} }
/// Extract the loaded context_length for a model from the LM Studio
/// `/api/v1/models` response.
///
/// Matches on `key` or `id` fields (LM Studio uses `key`; some variants
/// append a quantization suffix like `@q4_k_m`, so we strip that too).
///
/// Returns `None` when no entry matches or the matching entry exposes
/// neither a loaded-instance context nor a `max_context_length`.
fn lmstudio_context_length(json: &Value, model_id: &str) -> Option<u32> {
    let models = json["models"].as_array()?;
    // Strip any quantization suffix from the requested id so that
    // "foo@q4_k_m" and "foo" compare equal.
    let model_base = model_id.split('@').next().unwrap_or(model_id);
    for entry in models {
        // Fix: the doc promised matching on `key` OR `id`, but only `key`
        // was actually checked. Compare both fields, each exactly and with
        // its quantization suffix stripped.
        let matched = ["key", "id"].iter().any(|field| {
            let v = entry[*field].as_str().unwrap_or("");
            v == model_id || v.split('@').next().unwrap_or(v) == model_base
        });
        if matched {
            // Prefer the actually-loaded context (loaded_instances[0].config.context_length)
            // over the theoretical max_context_length.
            if let Some(n) = entry["loaded_instances"]
                .as_array()
                .and_then(|a| a.first())
                .and_then(|inst| inst["config"]["context_length"].as_u64())
            {
                return Some(n as u32);
            }
            // Fall back to max_context_length if no loaded instance info.
            if let Some(n) = entry["max_context_length"].as_u64() {
                return Some(n as u32);
            }
            // Matching entry carried no usable context info — keep scanning,
            // as the original did (loop falls through to the next entry).
        }
    }
    None
}
/// Extract a JSON object from a model response text. /// Extract a JSON object from a model response text.
/// Handles markdown code fences and R1-style `<think>...</think>` blocks. /// Handles markdown code fences and R1-style `<think>...</think>` blocks.
pub fn extract_json(text: &str) -> Result<Value> { pub fn extract_json(text: &str) -> Result<Value> {

View File

@@ -2,6 +2,50 @@ use std::path::PathBuf;
use clap::Parser; use clap::Parser;
/// Model family — controls token budgets and prompt style.
///
/// Fieldless two-variant enum, so it is trivially `Copy`; `Eq` and `Hash`
/// are derived as well per standard Rust practice for tag-like enums.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ModelFamily {
    /// DeepSeek-R1 and its distillations: emit `<think>` blocks that count
    /// against the output-token budget, so we need a much larger max_tokens.
    DeepSeekR1,
    /// General instruction-following models (Qwen, Llama, Mistral, …).
    Generic,
}

impl ModelFamily {
    /// Detect family from a model name string (case-insensitive).
    pub fn detect(model: &str) -> Self {
        let m = model.to_ascii_lowercase();
        // Cover both dash and underscore spellings of the distill variants.
        if ["deepseek-r1", "r1-distill", "r1_distill"]
            .iter()
            .any(|needle| m.contains(needle))
        {
            Self::DeepSeekR1
        } else {
            Self::Generic
        }
    }

    /// Display name for logging.
    pub fn name(&self) -> &'static str {
        match self {
            Self::DeepSeekR1 => "DeepSeek-R1",
            Self::Generic => "Generic",
        }
    }

    /// Maximum output tokens to request. R1 thinking blocks can be thousands
    /// of tokens; reserve enough headroom for the JSON after thinking.
    pub fn max_output_tokens(&self) -> u32 {
        match self {
            Self::DeepSeekR1 => 32768,
            Self::Generic => 8192,
        }
    }

    /// Whether this model family emits chain-of-thought before its response.
    pub fn has_thinking(&self) -> bool {
        matches!(self, Self::DeepSeekR1)
    }
}
/// Autonomous strategy search agent for the swym backtesting platform. /// Autonomous strategy search agent for the swym backtesting platform.
/// ///
/// Runs a loop: ask Claude to generate/refine strategies → submit backtests to swym → /// Runs a loop: ask Claude to generate/refine strategies → submit backtests to swym →

View File

@@ -1,9 +1,28 @@
/// System prompt for the strategy-generation Claude instance. use crate::config::ModelFamily;
/// System prompt for the strategy-generation model.
/// ///
/// This is the most important part of the agent — it defines how Claude /// Accepts a `ModelFamily` so each family can receive tailored guidance
/// thinks about strategy design, what it knows about the DSL, and how /// while sharing the common DSL schema and strategy evaluation rules.
/// it should interpret backtest results. pub fn system_prompt(dsl_schema: &str, family: &ModelFamily) -> String {
pub fn system_prompt(dsl_schema: &str) -> String { let output_instructions = match family {
ModelFamily::DeepSeekR1 => {
"## Output format\n\n\
Think through your strategy design carefully before committing to it. \
After your thinking, output ONLY a bare JSON object — no markdown fences, \
no commentary, no explanation. Start with `{` and end with `}`. \
Your thinking will be stripped automatically; only the JSON is used."
}
ModelFamily::Generic => {
"## How to respond\n\n\
You must respond with ONLY a valid JSON object — the strategy config.\n\
No prose, no markdown explanation, no commentary.\n\
Just the raw JSON starting with { and ending with }.\n\n\
The JSON must be a valid strategy with \"type\": \"rule_based\".\n\
Use \"usdc\" (not \"usdt\") as the quote asset for balance expressions."
}
};
format!( format!(
r##"You are a quantitative trading strategy researcher. Your task is to design, r##"You are a quantitative trading strategy researcher. Your task is to design,
evaluate, and iteratively refine trading strategies expressed in the swym JSON DSL. evaluate, and iteratively refine trading strategies expressed in the swym JSON DSL.
@@ -88,14 +107,7 @@ Every strategy MUST have:
- A time-based exit: use bars_since_entry to avoid holding losers indefinitely - A time-based exit: use bars_since_entry to avoid holding losers indefinitely
- Reasonable position sizing: prefer ATR-based or percent-of-balance over fixed quantity - Reasonable position sizing: prefer ATR-based or percent-of-balance over fixed quantity
## How to respond {output_instructions}
You must respond with ONLY a valid JSON object — the strategy config.
No prose, no markdown explanation, no commentary.
Just the raw JSON starting with {{ and ending with }}.
The JSON must be a valid strategy with "type": "rule_based".
Use "usdc" (not "usdt") as the quote asset for balance expressions.
## Interpreting backtest results ## Interpreting backtest results