feat(helexa-acp): context compaction for small-context local models
All checks were successful
build-prerelease / Resolve version stamps (push) Successful in 26s
CI / Format (push) Successful in 29s
CI / Clippy (push) Successful in 2m26s
build-prerelease / Build cortex binary (push) Successful in 5m17s
build-prerelease / Build neuron-blackwell (push) Successful in 5m51s
CI / Test (push) Successful in 5m53s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Package cortex RPM (push) Successful in 1m21s
build-prerelease / Build neuron-ampere (push) Successful in 7m58s
build-prerelease / Build neuron-ada (push) Successful in 5m30s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m57s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m7s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m40s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m0s
All checks were successful
build-prerelease / Resolve version stamps (push) Successful in 26s
CI / Format (push) Successful in 29s
CI / Clippy (push) Successful in 2m26s
build-prerelease / Build cortex binary (push) Successful in 5m17s
build-prerelease / Build neuron-blackwell (push) Successful in 5m51s
CI / Test (push) Successful in 5m53s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Package cortex RPM (push) Successful in 1m21s
build-prerelease / Build neuron-ampere (push) Successful in 7m58s
build-prerelease / Build neuron-ada (push) Successful in 5m30s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m57s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m7s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m40s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m0s
A new src/compaction.rs module projects rolling conversation history into a token budget before each completion. Older tool results and assistant prose get elided to one-line markers; system prompts, user turns, and the last KEEP_TAIL=4 messages stay verbatim. tool_call_id pairing is preserved so OpenAI strict-schema providers keep working. Driven by a new per-endpoint `context_window` config field (also HELEXA_ACP_CONTEXT_WINDOW for the env-only single-endpoint case). When set, prompt budget = context_window - max_tokens - 512_safety; when unset, behaviour is unchanged. Without this, a 32 K Qwen3 dies with `prompt_too_long` after the first few read_file results pile up in history — the symptom seen in plan-mode dogfooding on beat. 10 new unit tests cover the compaction strategy and the prompt budget arithmetic. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -30,6 +30,7 @@ use futures::StreamExt;
|
|||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
|
|
||||||
|
use crate::compaction;
|
||||||
use crate::config::{Config, parse_model_selector};
|
use crate::config::{Config, parse_model_selector};
|
||||||
use crate::prompt::build_system_prompt;
|
use crate::prompt::build_system_prompt;
|
||||||
use crate::provider::{
|
use crate::provider::{
|
||||||
@@ -67,6 +68,11 @@ struct AgentInner {
|
|||||||
/// name after resolution. `None` (or an absent entry) means the
|
/// name after resolution. `None` (or an absent entry) means the
|
||||||
/// upstream picks its own default.
|
/// upstream picks its own default.
|
||||||
max_tokens: std::collections::HashMap<String, u64>,
|
max_tokens: std::collections::HashMap<String, u64>,
|
||||||
|
/// Per-endpoint model context window in tokens. When set, the
|
||||||
|
/// agent compacts history before each completion so the prompt
|
||||||
|
/// fits inside `context_window - max_tokens - safety` tokens.
|
||||||
|
/// Absent entry → no compaction (legacy behaviour).
|
||||||
|
context_window: std::collections::HashMap<String, usize>,
|
||||||
sessions: SessionStore,
|
sessions: SessionStore,
|
||||||
system_prompt_path: Option<PathBuf>,
|
system_prompt_path: Option<PathBuf>,
|
||||||
/// Monotonic counter for minting session ids. The wire format is
|
/// Monotonic counter for minting session ids. The wire format is
|
||||||
@@ -99,12 +105,18 @@ impl Agent {
|
|||||||
.iter()
|
.iter()
|
||||||
.filter_map(|ep| ep.max_tokens.map(|m| (ep.name.clone(), m)))
|
.filter_map(|ep| ep.max_tokens.map(|m| (ep.name.clone(), m)))
|
||||||
.collect();
|
.collect();
|
||||||
|
let context_window = cfg
|
||||||
|
.endpoints
|
||||||
|
.iter()
|
||||||
|
.filter_map(|ep| ep.context_window.map(|w| (ep.name.clone(), w)))
|
||||||
|
.collect();
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
inner: Arc::new(AgentInner {
|
inner: Arc::new(AgentInner {
|
||||||
providers,
|
providers,
|
||||||
default_endpoint_name: default.name.clone(),
|
default_endpoint_name: default.name.clone(),
|
||||||
default_model: default.default_model.clone(),
|
default_model: default.default_model.clone(),
|
||||||
max_tokens,
|
max_tokens,
|
||||||
|
context_window,
|
||||||
sessions: session::new_store(),
|
sessions: session::new_store(),
|
||||||
system_prompt_path: cfg.system_prompt_path.clone(),
|
system_prompt_path: cfg.system_prompt_path.clone(),
|
||||||
next_session_id: AtomicU64::new(1),
|
next_session_id: AtomicU64::new(1),
|
||||||
@@ -766,6 +778,34 @@ async fn drive_prompt(
|
|||||||
"prompt round: streaming"
|
"prompt round: streaming"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Project history into the model's context window when the
|
||||||
|
// endpoint advertises one. Compaction is a per-request
|
||||||
|
// *projection* — `messages` (and the persisted session
|
||||||
|
// history downstream) stay intact; only what we send
|
||||||
|
// upstream shrinks. Without this, a 32 K Qwen3 dies after
|
||||||
|
// the first few `read_file` results pile up in history.
|
||||||
|
let provider_max_tokens = inner.max_tokens.get(provider.name()).copied();
|
||||||
|
let messages_for_provider = match inner.context_window.get(provider.name()).copied() {
|
||||||
|
Some(ctx) => {
|
||||||
|
let budget = prompt_budget(ctx, provider_max_tokens);
|
||||||
|
let (compacted, stats) = compaction::compact_to_budget(&messages, budget);
|
||||||
|
if stats.elided_messages > 0 {
|
||||||
|
tracing::info!(
|
||||||
|
session_id = %session_id.0,
|
||||||
|
round = round + 1,
|
||||||
|
context_window = ctx,
|
||||||
|
budget,
|
||||||
|
original_tokens = stats.original_tokens,
|
||||||
|
final_tokens = stats.final_tokens,
|
||||||
|
elided = stats.elided_messages,
|
||||||
|
"context compaction applied"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
compacted
|
||||||
|
}
|
||||||
|
None => messages.clone(),
|
||||||
|
};
|
||||||
|
|
||||||
// Tool descriptions reach the model via the Qwen3 `# Tools`
|
// Tool descriptions reach the model via the Qwen3 `# Tools`
|
||||||
// block in the system prompt, not via the OpenAI `tools`
|
// block in the system prompt, not via the OpenAI `tools`
|
||||||
// request field — cortex/neuron pass that field through to
|
// request field — cortex/neuron pass that field through to
|
||||||
@@ -773,11 +813,11 @@ async fn drive_prompt(
|
|||||||
// tools once a strict-OpenAI backend lands. Leave empty.
|
// tools once a strict-OpenAI backend lands. Leave empty.
|
||||||
let completion_req = CompletionRequest {
|
let completion_req = CompletionRequest {
|
||||||
model: local_model.clone(),
|
model: local_model.clone(),
|
||||||
messages: messages.clone(),
|
messages: messages_for_provider,
|
||||||
tools: vec![],
|
tools: vec![],
|
||||||
temperature: None,
|
temperature: None,
|
||||||
top_p: None,
|
top_p: None,
|
||||||
max_tokens: inner.max_tokens.get(provider.name()).copied(),
|
max_tokens: provider_max_tokens,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut stream = match provider.complete(completion_req, cancel.clone()).await {
|
let mut stream = match provider.complete(completion_req, cancel.clone()).await {
|
||||||
@@ -1203,6 +1243,26 @@ fn synthesize_malformed_history(tool_call_id: &str, raw: &str) -> (Message, Mess
|
|||||||
(call, result)
|
(call, result)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Compute the prompt token budget for an endpoint given its
|
||||||
|
/// `context_window` and `max_tokens` settings. The model needs room
|
||||||
|
/// for both the prompt and its response inside the context window,
|
||||||
|
/// so the prompt budget is the remainder after subtracting the
|
||||||
|
/// response cap (defaulting to a conservative 2048 when the endpoint
|
||||||
|
/// didn't set one) and a small safety margin for tokenizer
|
||||||
|
/// disagreement.
|
||||||
|
///
|
||||||
|
/// The safety margin matters because our per-character estimate in
|
||||||
|
/// [`compaction`] can drift a few percent from any given upstream
|
||||||
|
/// tokenizer; we'd rather under-fill the context window than have a
|
||||||
|
/// well-compacted history still trip `prompt_too_long`.
|
||||||
|
fn prompt_budget(context_window: usize, max_tokens: Option<u64>) -> usize {
|
||||||
|
const SAFETY_MARGIN: usize = 512;
|
||||||
|
let max_tokens = max_tokens.unwrap_or(2048) as usize;
|
||||||
|
context_window
|
||||||
|
.saturating_sub(max_tokens)
|
||||||
|
.saturating_sub(SAFETY_MARGIN)
|
||||||
|
}
|
||||||
|
|
||||||
fn map_finish_reason(reason: Option<&str>) -> StopReason {
|
fn map_finish_reason(reason: Option<&str>) -> StopReason {
|
||||||
match reason {
|
match reason {
|
||||||
Some("length") => StopReason::MaxTokens,
|
Some("length") => StopReason::MaxTokens,
|
||||||
@@ -1349,6 +1409,28 @@ mod tests {
|
|||||||
|
|
||||||
// ── map_finish_reason ───────────────────────────────────────────
|
// ── map_finish_reason ───────────────────────────────────────────
|
||||||
|
|
||||||
|
// ── prompt_budget ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn prompt_budget_reserves_response_and_safety() {
|
||||||
|
// 32K window, 8K response cap → 32768 - 8192 - 512 = 24064.
|
||||||
|
assert_eq!(prompt_budget(32_768, Some(8_192)), 24_064);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn prompt_budget_uses_default_when_max_tokens_unset() {
|
||||||
|
// Default response cap = 2048; safety = 512.
|
||||||
|
assert_eq!(prompt_budget(32_768, None), 32_768 - 2_048 - 512);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn prompt_budget_saturates_when_window_too_small() {
|
||||||
|
// Pathological config: window smaller than response + safety.
|
||||||
|
// Don't underflow — return zero so compaction tries hardest
|
||||||
|
// and upstream surfaces the inevitable error.
|
||||||
|
assert_eq!(prompt_budget(1_000, Some(8_192)), 0);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn maps_known_finish_reasons() {
|
fn maps_known_finish_reasons() {
|
||||||
assert!(matches!(
|
assert!(matches!(
|
||||||
|
|||||||
396
crates/helexa-acp/src/compaction.rs
Normal file
396
crates/helexa-acp/src/compaction.rs
Normal file
@@ -0,0 +1,396 @@
|
|||||||
|
//! Rolling-conversation compaction for small-context local models.
|
||||||
|
//!
|
||||||
|
//! The tool-call loop in [`crate::agent`] grows the message vec it
|
||||||
|
//! sends upstream every round. On a frontier model that's fine; on a
|
||||||
|
//! 32 K Qwen3 the first few `read_file` results can push the prompt
|
||||||
|
//! past the model's context window, at which point cortex/neuron
|
||||||
|
//! refuses with `prompt_too_long` and the whole turn dies. Long-form
|
||||||
|
//! local agents are unusable without something here.
|
||||||
|
//!
|
||||||
|
//! Strategy (intentionally simple — no LLM-summarization round-trip,
|
||||||
|
//! no tokenizer dependency):
|
||||||
|
//!
|
||||||
|
//! 1. **Protect** the things the model cannot reason without:
|
||||||
|
//! - The system prompt (idx 0).
|
||||||
|
//! - Every `Role::User` turn (the user's intent — irreplaceable).
|
||||||
|
//! - The last [`KEEP_TAIL`] messages (most recent rounds stay
|
||||||
|
//! verbatim so the model can keep working on what it just
|
||||||
|
//! observed).
|
||||||
|
//! 2. **Elide** older `Role::Assistant` prose and older `Role::Tool`
|
||||||
|
//! result content. The structure stays — `tool_call_id`s, tool
|
||||||
|
//! names, and argument JSON survive intact — so OpenAI's strict
|
||||||
|
//! `tool_calls` ↔ `tool` pairing schema remains satisfied. Only
|
||||||
|
//! the *payload* shrinks to a one-line marker.
|
||||||
|
//! 3. Walk oldest→newest, recomputing the budget after each elision.
|
||||||
|
//! Stop as soon as we fit; we don't compact more than necessary.
|
||||||
|
//! 4. If we still exceed budget after eliding everything we're
|
||||||
|
//! allowed to, return what we have. The upstream will surface a
|
||||||
|
//! `prompt_too_long` error and the user can intervene; that's
|
||||||
|
//! better than silently dropping content the model needs.
|
||||||
|
//!
|
||||||
|
//! Token estimation uses a `chars / 3.5` heuristic — conservative
|
||||||
|
//! (over-estimates tokens slightly) so we compact a touch early
|
||||||
|
//! rather than a touch late.
|
||||||
|
|
||||||
|
use crate::provider::{Message, MessageContent, Role};
|
||||||
|
|
||||||
|
/// Most-recent N messages that are never elided. Roughly "the
|
||||||
|
/// current tool round in flight" — assistant turn that called the
|
||||||
|
/// tools + each tool result + a bit of slack.
|
||||||
|
const KEEP_TAIL: usize = 4;
|
||||||
|
|
||||||
|
/// Below this content size we don't bother eliding — the savings
|
||||||
|
/// don't outweigh the loss of detail. Roughly 60–80 tokens.
|
||||||
|
const ELIDE_MIN_CHARS: usize = 256;
|
||||||
|
|
||||||
|
/// Roughly tokens-per-character for English + code mixed in. The
|
||||||
|
/// actual per-tokenizer ratio varies (GPT-4o ≈ 4 chars/token on
|
||||||
|
/// English prose, ≈ 3 chars/token on code-heavy text). We pick a
|
||||||
|
/// value on the conservative end so the budget check fires *before*
|
||||||
|
/// the upstream tokenizer says no.
|
||||||
|
const CHARS_PER_TOKEN: f32 = 3.5;
|
||||||
|
|
||||||
|
/// Per-message envelope overhead (role + JSON framing). Comes out
|
||||||
|
/// to a few tokens; tiny but it adds up across long histories.
|
||||||
|
const ENVELOPE_TOKENS: usize = 8;
|
||||||
|
|
||||||
|
/// Stats reported back from [`compact_to_budget`] for the caller to
|
||||||
|
/// log. The numbers are estimates (see [`estimate_tokens`]), so
|
||||||
|
/// don't compare them to upstream-reported token counts as if they
|
||||||
|
/// were exact.
|
||||||
|
#[derive(Debug, Clone, Default, PartialEq, Eq)]
|
||||||
|
pub struct CompactionStats {
|
||||||
|
/// Estimated tokens in the input messages.
|
||||||
|
pub original_tokens: usize,
|
||||||
|
/// Estimated tokens after compaction. Equal to `original_tokens`
|
||||||
|
/// when no compaction was needed.
|
||||||
|
pub final_tokens: usize,
|
||||||
|
/// Number of messages whose content was elided. Zero is the
|
||||||
|
/// hot path (nothing to do).
|
||||||
|
pub elided_messages: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CompactionStats {
|
||||||
|
fn unchanged(tokens: usize) -> Self {
|
||||||
|
Self {
|
||||||
|
original_tokens: tokens,
|
||||||
|
final_tokens: tokens,
|
||||||
|
elided_messages: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Approximate token count for one message. Sums the textual
|
||||||
|
/// payload's chars, divides by [`CHARS_PER_TOKEN`], and adds an
|
||||||
|
/// envelope constant. Cheap (no allocation) so safe to call once per
|
||||||
|
/// message per round.
|
||||||
|
pub fn estimate_tokens(msg: &Message) -> usize {
|
||||||
|
let chars = match &msg.content {
|
||||||
|
MessageContent::Text { text } => text.len(),
|
||||||
|
MessageContent::ToolCalls { text, calls } => {
|
||||||
|
let txt = text.as_deref().map(|s| s.len()).unwrap_or(0);
|
||||||
|
let calls_size: usize = calls
|
||||||
|
.iter()
|
||||||
|
.map(|c| c.name.len() + c.arguments.len() + c.id.len())
|
||||||
|
.sum();
|
||||||
|
txt + calls_size
|
||||||
|
}
|
||||||
|
MessageContent::ToolResult {
|
||||||
|
tool_call_id,
|
||||||
|
content,
|
||||||
|
} => tool_call_id.len() + content.len(),
|
||||||
|
};
|
||||||
|
((chars as f32 / CHARS_PER_TOKEN) as usize) + ENVELOPE_TOKENS
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sum of [`estimate_tokens`] across all messages.
|
||||||
|
pub fn total_tokens(messages: &[Message]) -> usize {
|
||||||
|
messages.iter().map(estimate_tokens).sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Project `messages` into a vec whose estimated token count fits in
|
||||||
|
/// `budget` tokens. Returns the projection plus stats about what
|
||||||
|
/// was done. When the input already fits, the projection is a clone
|
||||||
|
/// of the input and stats report zero elisions.
|
||||||
|
///
|
||||||
|
/// See module docs for the strategy and protected set.
|
||||||
|
pub fn compact_to_budget(messages: &[Message], budget: usize) -> (Vec<Message>, CompactionStats) {
|
||||||
|
let original = total_tokens(messages);
|
||||||
|
if original <= budget {
|
||||||
|
return (messages.to_vec(), CompactionStats::unchanged(original));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut out = messages.to_vec();
|
||||||
|
let len = out.len();
|
||||||
|
let tail_start = len.saturating_sub(KEEP_TAIL);
|
||||||
|
let mut elided = 0usize;
|
||||||
|
|
||||||
|
// Two passes. First pass: ToolResult contents (largest savings
|
||||||
|
// per elision — read_file payloads land here). Second pass: long
|
||||||
|
// Assistant prose. We don't interleave because eliding a long
|
||||||
|
// assistant turn before a really old read_file would do less
|
||||||
|
// good per elision; oldest-first ordering is enforced *within*
|
||||||
|
// each pass instead.
|
||||||
|
for pass in 0..2 {
|
||||||
|
for i in 1..tail_start {
|
||||||
|
if matches!(out[i].role, Role::User) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let target_pass_2 = matches!(
|
||||||
|
&out[i].content,
|
||||||
|
MessageContent::Text { .. } | MessageContent::ToolCalls { .. }
|
||||||
|
);
|
||||||
|
let target_pass_1 = matches!(&out[i].content, MessageContent::ToolResult { .. });
|
||||||
|
let in_pass = (pass == 0 && target_pass_1) || (pass == 1 && target_pass_2);
|
||||||
|
if !in_pass {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if elide_in_place(&mut out[i]) {
|
||||||
|
elided += 1;
|
||||||
|
if total_tokens(&out) <= budget {
|
||||||
|
let final_tokens = total_tokens(&out);
|
||||||
|
return (
|
||||||
|
out,
|
||||||
|
CompactionStats {
|
||||||
|
original_tokens: original,
|
||||||
|
final_tokens,
|
||||||
|
elided_messages: elided,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let final_tokens = total_tokens(&out);
|
||||||
|
(
|
||||||
|
out,
|
||||||
|
CompactionStats {
|
||||||
|
original_tokens: original,
|
||||||
|
final_tokens,
|
||||||
|
elided_messages: elided,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Shrink one message's payload while keeping its structural role
|
||||||
|
/// (so tool_call_id pairing survives). Returns `true` when the
|
||||||
|
/// message changed.
|
||||||
|
///
|
||||||
|
/// - `ToolResult.content` → `(elided: N bytes of tool result)`
|
||||||
|
/// - `ToolCalls.text` → `(elided: N bytes of assistant prose)`
|
||||||
|
/// - `Text` (assistant) → `(elided: N bytes of assistant prose)`
|
||||||
|
///
|
||||||
|
/// Already-tiny payloads are skipped — eliding a 50-byte string
|
||||||
|
/// would *grow* it once the marker is in place.
|
||||||
|
fn elide_in_place(msg: &mut Message) -> bool {
|
||||||
|
match &mut msg.content {
|
||||||
|
MessageContent::ToolResult { content, .. } => {
|
||||||
|
if content.len() < ELIDE_MIN_CHARS {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
*content = format!("(elided: {} bytes of tool result)", content.len());
|
||||||
|
true
|
||||||
|
}
|
||||||
|
MessageContent::ToolCalls { text, .. } => match text {
|
||||||
|
Some(t) if t.len() >= ELIDE_MIN_CHARS => {
|
||||||
|
*text = Some(format!("(elided: {} bytes of assistant prose)", t.len()));
|
||||||
|
true
|
||||||
|
}
|
||||||
|
_ => false,
|
||||||
|
},
|
||||||
|
MessageContent::Text { text } => {
|
||||||
|
if text.len() < ELIDE_MIN_CHARS {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
*text = format!("(elided: {} bytes of assistant prose)", text.len());
|
||||||
|
true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use crate::provider::ToolCall;
|
||||||
|
|
||||||
|
fn sys(text: &str) -> Message {
|
||||||
|
Message {
|
||||||
|
role: Role::System,
|
||||||
|
content: MessageContent::Text { text: text.into() },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn user(text: &str) -> Message {
|
||||||
|
Message {
|
||||||
|
role: Role::User,
|
||||||
|
content: MessageContent::Text { text: text.into() },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn assistant_text(text: &str) -> Message {
|
||||||
|
Message {
|
||||||
|
role: Role::Assistant,
|
||||||
|
content: MessageContent::Text { text: text.into() },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn assistant_calls(text: Option<&str>, name: &str, args: &str, id: &str) -> Message {
|
||||||
|
Message {
|
||||||
|
role: Role::Assistant,
|
||||||
|
content: MessageContent::ToolCalls {
|
||||||
|
text: text.map(|s| s.to_string()),
|
||||||
|
calls: vec![ToolCall {
|
||||||
|
id: id.into(),
|
||||||
|
name: name.into(),
|
||||||
|
arguments: args.into(),
|
||||||
|
}],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn tool_result(id: &str, body: &str) -> Message {
|
||||||
|
Message {
|
||||||
|
role: Role::Tool,
|
||||||
|
content: MessageContent::ToolResult {
|
||||||
|
tool_call_id: id.into(),
|
||||||
|
content: body.into(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn under_budget_is_a_no_op_clone() {
|
||||||
|
let msgs = vec![sys("you are an agent"), user("hi"), assistant_text("hello")];
|
||||||
|
let (out, stats) = compact_to_budget(&msgs, 10_000);
|
||||||
|
assert_eq!(stats.elided_messages, 0);
|
||||||
|
assert_eq!(stats.original_tokens, stats.final_tokens);
|
||||||
|
assert_eq!(out.len(), msgs.len());
|
||||||
|
// Strings unchanged.
|
||||||
|
match &out[2].content {
|
||||||
|
MessageContent::Text { text } => assert_eq!(text, "hello"),
|
||||||
|
other => panic!("expected Text, got {other:?}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn elides_old_tool_result_before_old_assistant_prose() {
|
||||||
|
// History: sys, user, assistant_calls, big_tool_result,
|
||||||
|
// assistant_with_big_text, user, assistant_calls,
|
||||||
|
// small_tool_result.
|
||||||
|
// KEEP_TAIL=4 protects the last four; the big tool result
|
||||||
|
// sits in the prunable range and should go first because
|
||||||
|
// pass 0 (tool results) runs before pass 1 (prose).
|
||||||
|
let big_result = "X".repeat(4096);
|
||||||
|
let big_prose = "Y".repeat(2048);
|
||||||
|
let msgs = vec![
|
||||||
|
sys("preamble"),
|
||||||
|
user("first ask"),
|
||||||
|
assistant_calls(None, "read_file", r#"{"path":"/a"}"#, "c0"),
|
||||||
|
tool_result("c0", &big_result),
|
||||||
|
assistant_text(&big_prose),
|
||||||
|
user("follow up"),
|
||||||
|
assistant_calls(None, "read_file", r#"{"path":"/b"}"#, "c1"),
|
||||||
|
tool_result("c1", "short result body"),
|
||||||
|
];
|
||||||
|
let before = total_tokens(&msgs);
|
||||||
|
// Force compaction by setting budget well below current.
|
||||||
|
let budget = before / 2;
|
||||||
|
let (out, stats) = compact_to_budget(&msgs, budget);
|
||||||
|
|
||||||
|
assert!(
|
||||||
|
stats.elided_messages >= 1,
|
||||||
|
"expected at least one elision, got {stats:?}"
|
||||||
|
);
|
||||||
|
// The big tool result must be elided (oldest fat target).
|
||||||
|
match &out[3].content {
|
||||||
|
MessageContent::ToolResult { content, .. } => {
|
||||||
|
assert!(
|
||||||
|
content.starts_with("(elided:"),
|
||||||
|
"tool result not elided: {content:?}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
other => panic!("expected ToolResult, got {other:?}"),
|
||||||
|
}
|
||||||
|
// Last four messages must be untouched.
|
||||||
|
assert!(matches!(
|
||||||
|
&out[out.len() - 1].content,
|
||||||
|
MessageContent::ToolResult { content, .. } if content == "short result body"
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn never_elides_system_or_user_turns() {
|
||||||
|
let big_user = "U".repeat(8192);
|
||||||
|
let msgs = vec![sys("preamble"), user(&big_user), assistant_text("ok")];
|
||||||
|
let budget = 10; // way below — forces all possible elision
|
||||||
|
let (out, _stats) = compact_to_budget(&msgs, budget);
|
||||||
|
// System unchanged.
|
||||||
|
match &out[0].content {
|
||||||
|
MessageContent::Text { text } => assert_eq!(text, "preamble"),
|
||||||
|
other => panic!("expected Text, got {other:?}"),
|
||||||
|
}
|
||||||
|
// User unchanged even though it's huge.
|
||||||
|
match &out[1].content {
|
||||||
|
MessageContent::Text { text } => assert_eq!(text.len(), big_user.len()),
|
||||||
|
other => panic!("expected Text, got {other:?}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn preserves_tool_call_id_pairing_after_elision() {
|
||||||
|
// OpenAI strict mode rejects a tool-result whose tool_call_id
|
||||||
|
// doesn't match a preceding assistant tool_call. Elision
|
||||||
|
// must not break that linkage.
|
||||||
|
let big = "Z".repeat(4096);
|
||||||
|
let msgs = vec![
|
||||||
|
sys("preamble"),
|
||||||
|
user("first"),
|
||||||
|
assistant_calls(None, "read_file", r#"{"path":"/a"}"#, "call_42"),
|
||||||
|
tool_result("call_42", &big),
|
||||||
|
// Tail messages.
|
||||||
|
user("next"),
|
||||||
|
assistant_calls(None, "read_file", r#"{"path":"/b"}"#, "call_43"),
|
||||||
|
tool_result("call_43", "ok"),
|
||||||
|
assistant_text("done"),
|
||||||
|
];
|
||||||
|
let budget = total_tokens(&msgs) / 3;
|
||||||
|
let (out, _stats) = compact_to_budget(&msgs, budget);
|
||||||
|
// The assistant call and its result both carry call_42.
|
||||||
|
let call_id = match &out[2].content {
|
||||||
|
MessageContent::ToolCalls { calls, .. } => calls[0].id.clone(),
|
||||||
|
other => panic!("expected ToolCalls, got {other:?}"),
|
||||||
|
};
|
||||||
|
match &out[3].content {
|
||||||
|
MessageContent::ToolResult { tool_call_id, .. } => {
|
||||||
|
assert_eq!(tool_call_id, &call_id, "pairing broken");
|
||||||
|
}
|
||||||
|
other => panic!("expected ToolResult, got {other:?}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn estimate_tokens_grows_with_content() {
|
||||||
|
let small = sys("hi");
|
||||||
|
let large = sys(&"x".repeat(10_000));
|
||||||
|
assert!(estimate_tokens(&large) > estimate_tokens(&small) * 100);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn elide_in_place_skips_short_content() {
|
||||||
|
let mut m = tool_result("c0", "tiny");
|
||||||
|
assert!(!elide_in_place(&mut m));
|
||||||
|
match m.content {
|
||||||
|
MessageContent::ToolResult { content, .. } => assert_eq!(content, "tiny"),
|
||||||
|
other => panic!("expected ToolResult, got {other:?}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn returns_best_effort_when_budget_unmeetable() {
|
||||||
|
// Single huge user message that cannot be elided. Budget 10.
|
||||||
|
// We don't error — we return what we have and let upstream
|
||||||
|
// refuse the prompt with its own error.
|
||||||
|
let big_user = "U".repeat(100_000);
|
||||||
|
let msgs = vec![sys("preamble"), user(&big_user)];
|
||||||
|
let (out, stats) = compact_to_budget(&msgs, 10);
|
||||||
|
assert_eq!(out.len(), msgs.len());
|
||||||
|
assert!(stats.final_tokens > 10, "still over budget by design");
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -98,6 +98,14 @@ pub struct EndpointConfig {
|
|||||||
/// request field.
|
/// request field.
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub max_tokens: Option<u64>,
|
pub max_tokens: Option<u64>,
|
||||||
|
/// Model context window in tokens (prompt + response). When set,
|
||||||
|
/// the agent compacts conversation history before each completion
|
||||||
|
/// so the prompt fits within `context_window - max_tokens - safety`
|
||||||
|
/// tokens — long sessions on small-context local models (Qwen3 at
|
||||||
|
/// 32 K) survive past the first few tool-call rounds rather than
|
||||||
|
/// dying with `prompt_too_long`. `None` disables compaction.
|
||||||
|
#[serde(default)]
|
||||||
|
pub context_window: Option<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
|
||||||
@@ -193,6 +201,15 @@ impl Config {
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
.transpose()?;
|
.transpose()?;
|
||||||
|
let context_window = std::env::var("HELEXA_ACP_CONTEXT_WINDOW")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
.map(|s| {
|
||||||
|
s.parse::<usize>().with_context(|| {
|
||||||
|
format!("HELEXA_ACP_CONTEXT_WINDOW is not a positive integer ({s})")
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.transpose()?;
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
default_endpoint: Some(DEFAULT_ENDPOINT_NAME.into()),
|
default_endpoint: Some(DEFAULT_ENDPOINT_NAME.into()),
|
||||||
endpoints: vec![EndpointConfig {
|
endpoints: vec![EndpointConfig {
|
||||||
@@ -203,6 +220,7 @@ impl Config {
|
|||||||
api_key,
|
api_key,
|
||||||
api_key_env: None,
|
api_key_env: None,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
|
context_window,
|
||||||
}],
|
}],
|
||||||
system_prompt_path,
|
system_prompt_path,
|
||||||
})
|
})
|
||||||
@@ -316,6 +334,7 @@ mod tests {
|
|||||||
api_key: None,
|
api_key: None,
|
||||||
api_key_env: None,
|
api_key_env: None,
|
||||||
max_tokens: None,
|
max_tokens: None,
|
||||||
|
context_window: None,
|
||||||
};
|
};
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
ep.chat_completions_url().as_str(),
|
ep.chat_completions_url().as_str(),
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ use agent_client_protocol::{Result, Stdio};
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
mod agent;
|
mod agent;
|
||||||
|
mod compaction;
|
||||||
mod config;
|
mod config;
|
||||||
mod prompt;
|
mod prompt;
|
||||||
mod provider;
|
mod provider;
|
||||||
|
|||||||
@@ -149,6 +149,7 @@ mod tests {
|
|||||||
api_key: None,
|
api_key: None,
|
||||||
api_key_env: None,
|
api_key_env: None,
|
||||||
max_tokens: None,
|
max_tokens: None,
|
||||||
|
context_window: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user