feat(helexa-acp): context compaction for small-context local models

A new src/compaction.rs module projects rolling conversation history into a token budget before each completion. Older tool results and assistant prose get elided to one-line markers; system prompts, user turns, and the last KEEP_TAIL=4 messages stay verbatim. tool_call_id pairing is preserved so OpenAI strict-schema providers keep working. Driven by a new per-endpoint `context_window` config field (also HELEXA_ACP_CONTEXT_WINDOW for the env-only single-endpoint case). When set, prompt budget = context_window - max_tokens - 512_safety; when unset, behaviour is unchanged. Without this, a 32 K Qwen3 dies with `prompt_too_long` after the first few read_file results pile up in history — the symptom seen in plan-mode dogfooding on beat. 10 new unit tests cover the compaction strategy and the prompt budget arithmetic. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-29 08:22:01 +03:00
parent cbadfcf112
commit 537a0fe7f2
5 changed files with 501 additions and 2 deletions
--- a/crates/helexa-acp/src/agent.rs
+++ b/crates/helexa-acp/src/agent.rs
@@ -30,6 +30,7 @@ use futures::StreamExt;
 use std::collections::BTreeMap;
 use tokio_util::sync::CancellationToken;
 use crate::compaction;
 use crate::config::{Config, parse_model_selector};
 use crate::prompt::build_system_prompt;
 use crate::provider::{
@@ -67,6 +68,11 @@ struct AgentInner {
    /// name after resolution. `None` (or an absent entry) means the
    /// upstream picks its own default.
    max_tokens: std::collections::HashMap<String, u64>,
    /// Per-endpoint model context window in tokens. When set, the
    /// agent compacts history before each completion so the prompt
    /// fits inside `context_window - max_tokens - safety` tokens.
    /// Absent entry → no compaction (legacy behaviour).
    context_window: std::collections::HashMap<String, usize>,
    sessions: SessionStore,
    system_prompt_path: Option<PathBuf>,
    /// Monotonic counter for minting session ids. The wire format is
@@ -99,12 +105,18 @@ impl Agent {
            .iter()
            .filter_map(|ep| ep.max_tokens.map(|m| (ep.name.clone(), m)))
            .collect();
        let context_window = cfg
            .endpoints
            .iter()
            .filter_map(|ep| ep.context_window.map(|w| (ep.name.clone(), w)))
            .collect();
        Ok(Self {
            inner: Arc::new(AgentInner {
                providers,
                default_endpoint_name: default.name.clone(),
                default_model: default.default_model.clone(),
                max_tokens,
                context_window,
                sessions: session::new_store(),
                system_prompt_path: cfg.system_prompt_path.clone(),
                next_session_id: AtomicU64::new(1),
@@ -766,6 +778,34 @@ async fn drive_prompt(
            "prompt round: streaming"
        );
        // Project history into the model's context window when the
        // endpoint advertises one. Compaction is a per-request
        // *projection* — `messages` (and the persisted session
        // history downstream) stay intact; only what we send
        // upstream shrinks. Without this, a 32 K Qwen3 dies after
        // the first few `read_file` results pile up in history.
        let provider_max_tokens = inner.max_tokens.get(provider.name()).copied();
        let messages_for_provider = match inner.context_window.get(provider.name()).copied() {
            Some(ctx) => {
                let budget = prompt_budget(ctx, provider_max_tokens);
                let (compacted, stats) = compaction::compact_to_budget(&messages, budget);
                if stats.elided_messages > 0 {
                    tracing::info!(
                        session_id = %session_id.0,
                        round = round + 1,
                        context_window = ctx,
                        budget,
                        original_tokens = stats.original_tokens,
                        final_tokens = stats.final_tokens,
                        elided = stats.elided_messages,
                        "context compaction applied"
                    );
                }
                compacted
            }
            None => messages.clone(),
        };
        // Tool descriptions reach the model via the Qwen3 `# Tools`
        // block in the system prompt, not via the OpenAI `tools`
        // request field — cortex/neuron pass that field through to
@@ -773,11 +813,11 @@ async fn drive_prompt(
        // tools once a strict-OpenAI backend lands. Leave empty.
        let completion_req = CompletionRequest {
            model: local_model.clone(),
-            messages: messages.clone(),
+            messages: messages_for_provider,
            tools: vec![],
            temperature: None,
            top_p: None,
-            max_tokens: inner.max_tokens.get(provider.name()).copied(),
+            max_tokens: provider_max_tokens,
        };
        let mut stream = match provider.complete(completion_req, cancel.clone()).await {
@@ -1203,6 +1243,26 @@ fn synthesize_malformed_history(tool_call_id: &str, raw: &str) -> (Message, Mess
    (call, result)
 }
 /// Compute the prompt token budget for an endpoint given its
 /// `context_window` and `max_tokens` settings. The model needs room
 /// for both the prompt and its response inside the context window,
 /// so the prompt budget is the remainder after subtracting the
 /// response cap (defaulting to a conservative 2048 when the endpoint
 /// didn't set one) and a small safety margin for tokenizer
 /// disagreement.
 ///
 /// The safety margin matters because our per-character estimate in
 /// [`compaction`] can drift a few percent from any given upstream
 /// tokenizer; we'd rather under-fill the context window than have a
 /// well-compacted history still trip `prompt_too_long`.
 fn prompt_budget(context_window: usize, max_tokens: Option<u64>) -> usize {
    const SAFETY_MARGIN: usize = 512;
    let max_tokens = max_tokens.unwrap_or(2048) as usize;
    context_window
        .saturating_sub(max_tokens)
        .saturating_sub(SAFETY_MARGIN)
 }
 fn map_finish_reason(reason: Option<&str>) -> StopReason {
    match reason {
        Some("length") => StopReason::MaxTokens,
@@ -1349,6 +1409,28 @@ mod tests {
    // ── map_finish_reason ───────────────────────────────────────────
    // ── prompt_budget ───────────────────────────────────────────────
    #[test]
    fn prompt_budget_reserves_response_and_safety() {
        // 32K window, 8K response cap → 32768 - 8192 - 512 = 24064.
        assert_eq!(prompt_budget(32_768, Some(8_192)), 24_064);
    }
    #[test]
    fn prompt_budget_uses_default_when_max_tokens_unset() {
        // Default response cap = 2048; safety = 512.
        assert_eq!(prompt_budget(32_768, None), 32_768 - 2_048 - 512);
    }
    #[test]
    fn prompt_budget_saturates_when_window_too_small() {
        // Pathological config: window smaller than response + safety.
        // Don't underflow — return zero so compaction tries hardest
        // and upstream surfaces the inevitable error.
        assert_eq!(prompt_budget(1_000, Some(8_192)), 0);
    }
    #[test]
    fn maps_known_finish_reasons() {
        assert!(matches!(
--- a/crates/helexa-acp/src/compaction.rs
+++ b/crates/helexa-acp/src/compaction.rs
@@ -0,0 +1,396 @@
 //! Rolling-conversation compaction for small-context local models.
 //!
 //! The tool-call loop in [`crate::agent`] grows the message vec it
 //! sends upstream every round. On a frontier model that's fine; on a
 //! 32 K Qwen3 the first few `read_file` results can push the prompt
 //! past the model's context window, at which point cortex/neuron
 //! refuses with `prompt_too_long` and the whole turn dies. Long-form
 //! local agents are unusable without something here.
 //!
 //! Strategy (intentionally simple — no LLM-summarization round-trip,
 //! no tokenizer dependency):
 //!
 //! 1. **Protect** the things the model cannot reason without:
 //!    - The system prompt (idx 0).
 //!    - Every `Role::User` turn (the user's intent — irreplaceable).
 //!    - The last [`KEEP_TAIL`] messages (most recent rounds stay
 //!      verbatim so the model can keep working on what it just
 //!      observed).
 //! 2. **Elide** older `Role::Assistant` prose and older `Role::Tool`
 //!    result content. The structure stays — `tool_call_id`s, tool
 //!    names, and argument JSON survive intact — so OpenAI's strict
 //!    `tool_calls` ↔ `tool` pairing schema remains satisfied. Only
 //!    the *payload* shrinks to a one-line marker.
 //! 3. Walk oldest→newest, recomputing the budget after each elision.
 //!    Stop as soon as we fit; we don't compact more than necessary.
 //! 4. If we still exceed budget after eliding everything we're
 //!    allowed to, return what we have. The upstream will surface a
 //!    `prompt_too_long` error and the user can intervene; that's
 //!    better than silently dropping content the model needs.
 //!
 //! Token estimation uses a `chars / 3.5` heuristic — conservative
 //! (over-estimates tokens slightly) so we compact a touch early
 //! rather than a touch late.
 use crate::provider::{Message, MessageContent, Role};
 /// Most-recent N messages that are never elided. Roughly "the
 /// current tool round in flight" — assistant turn that called the
 /// tools + each tool result + a bit of slack.
 const KEEP_TAIL: usize = 4;
 /// Below this content size we don't bother eliding — the savings
 /// don't outweigh the loss of detail. Roughly 60–80 tokens.
 const ELIDE_MIN_CHARS: usize = 256;
 /// Roughly tokens-per-character for English + code mixed in. The
 /// actual per-tokenizer ratio varies (GPT-4o ≈ 4 chars/token on
 /// English prose, ≈ 3 chars/token on code-heavy text). We pick a
 /// value on the conservative end so the budget check fires *before*
 /// the upstream tokenizer says no.
 const CHARS_PER_TOKEN: f32 = 3.5;
 /// Per-message envelope overhead (role + JSON framing). Comes out
 /// to a few tokens; tiny but it adds up across long histories.
 const ENVELOPE_TOKENS: usize = 8;
 /// Stats reported back from [`compact_to_budget`] for the caller to
 /// log. The numbers are estimates (see [`estimate_tokens`]), so
 /// don't compare them to upstream-reported token counts as if they
 /// were exact.
 #[derive(Debug, Clone, Default, PartialEq, Eq)]
 pub struct CompactionStats {
    /// Estimated tokens in the input messages.
    pub original_tokens: usize,
    /// Estimated tokens after compaction. Equal to `original_tokens`
    /// when no compaction was needed.
    pub final_tokens: usize,
    /// Number of messages whose content was elided. Zero is the
    /// hot path (nothing to do).
    pub elided_messages: usize,
 }
 impl CompactionStats {
    fn unchanged(tokens: usize) -> Self {
        Self {
            original_tokens: tokens,
            final_tokens: tokens,
            elided_messages: 0,
        }
    }
 }
 /// Approximate token count for one message. Sums the textual
 /// payload's chars, divides by [`CHARS_PER_TOKEN`], and adds an
 /// envelope constant. Cheap (no allocation) so safe to call once per
 /// message per round.
 pub fn estimate_tokens(msg: &Message) -> usize {
    let chars = match &msg.content {
        MessageContent::Text { text } => text.len(),
        MessageContent::ToolCalls { text, calls } => {
            let txt = text.as_deref().map(|s| s.len()).unwrap_or(0);
            let calls_size: usize = calls
                .iter()
                .map(|c| c.name.len() + c.arguments.len() + c.id.len())
                .sum();
            txt + calls_size
        }
        MessageContent::ToolResult {
            tool_call_id,
            content,
        } => tool_call_id.len() + content.len(),
    };
    ((chars as f32 / CHARS_PER_TOKEN) as usize) + ENVELOPE_TOKENS
 }
 /// Sum of [`estimate_tokens`] across all messages.
 pub fn total_tokens(messages: &[Message]) -> usize {
    messages.iter().map(estimate_tokens).sum()
 }
 /// Project `messages` into a vec whose estimated token count fits in
 /// `budget` tokens. Returns the projection plus stats about what
 /// was done. When the input already fits, the projection is a clone
 /// of the input and stats report zero elisions.
 ///
 /// See module docs for the strategy and protected set.
 pub fn compact_to_budget(messages: &[Message], budget: usize) -> (Vec<Message>, CompactionStats) {
    let original = total_tokens(messages);
    if original <= budget {
        return (messages.to_vec(), CompactionStats::unchanged(original));
    }
    let mut out = messages.to_vec();
    let len = out.len();
    let tail_start = len.saturating_sub(KEEP_TAIL);
    let mut elided = 0usize;
    // Two passes. First pass: ToolResult contents (largest savings
    // per elision — read_file payloads land here). Second pass: long
    // Assistant prose. We don't interleave because eliding a long
    // assistant turn before a really old read_file would do less
    // good per elision; oldest-first ordering is enforced *within*
    // each pass instead.
    for pass in 0..2 {
        for i in 1..tail_start {
            if matches!(out[i].role, Role::User) {
                continue;
            }
            let target_pass_2 = matches!(
                &out[i].content,
                MessageContent::Text { .. } | MessageContent::ToolCalls { .. }
            );
            let target_pass_1 = matches!(&out[i].content, MessageContent::ToolResult { .. });
            let in_pass = (pass == 0 && target_pass_1) || (pass == 1 && target_pass_2);
            if !in_pass {
                continue;
            }
            if elide_in_place(&mut out[i]) {
                elided += 1;
                if total_tokens(&out) <= budget {
                    let final_tokens = total_tokens(&out);
                    return (
                        out,
                        CompactionStats {
                            original_tokens: original,
                            final_tokens,
                            elided_messages: elided,
                        },
                    );
                }
            }
        }
    }
    let final_tokens = total_tokens(&out);
    (
        out,
        CompactionStats {
            original_tokens: original,
            final_tokens,
            elided_messages: elided,
        },
    )
 }
 /// Shrink one message's payload while keeping its structural role
 /// (so tool_call_id pairing survives). Returns `true` when the
 /// message changed.
 ///
 /// - `ToolResult.content` → `(elided: N bytes of tool result)`
 /// - `ToolCalls.text`     → `(elided: N bytes of assistant prose)`
 /// - `Text` (assistant)   → `(elided: N bytes of assistant prose)`
 ///
 /// Already-tiny payloads are skipped — eliding a 50-byte string
 /// would *grow* it once the marker is in place.
 fn elide_in_place(msg: &mut Message) -> bool {
    match &mut msg.content {
        MessageContent::ToolResult { content, .. } => {
            if content.len() < ELIDE_MIN_CHARS {
                return false;
            }
            *content = format!("(elided: {} bytes of tool result)", content.len());
            true
        }
        MessageContent::ToolCalls { text, .. } => match text {
            Some(t) if t.len() >= ELIDE_MIN_CHARS => {
                *text = Some(format!("(elided: {} bytes of assistant prose)", t.len()));
                true
            }
            _ => false,
        },
        MessageContent::Text { text } => {
            if text.len() < ELIDE_MIN_CHARS {
                return false;
            }
            *text = format!("(elided: {} bytes of assistant prose)", text.len());
            true
        }
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    use crate::provider::ToolCall;
    fn sys(text: &str) -> Message {
        Message {
            role: Role::System,
            content: MessageContent::Text { text: text.into() },
        }
    }
    fn user(text: &str) -> Message {
        Message {
            role: Role::User,
            content: MessageContent::Text { text: text.into() },
        }
    }
    fn assistant_text(text: &str) -> Message {
        Message {
            role: Role::Assistant,
            content: MessageContent::Text { text: text.into() },
        }
    }
    fn assistant_calls(text: Option<&str>, name: &str, args: &str, id: &str) -> Message {
        Message {
            role: Role::Assistant,
            content: MessageContent::ToolCalls {
                text: text.map(|s| s.to_string()),
                calls: vec![ToolCall {
                    id: id.into(),
                    name: name.into(),
                    arguments: args.into(),
                }],
            },
        }
    }
    fn tool_result(id: &str, body: &str) -> Message {
        Message {
            role: Role::Tool,
            content: MessageContent::ToolResult {
                tool_call_id: id.into(),
                content: body.into(),
            },
        }
    }
    #[test]
    fn under_budget_is_a_no_op_clone() {
        let msgs = vec![sys("you are an agent"), user("hi"), assistant_text("hello")];
        let (out, stats) = compact_to_budget(&msgs, 10_000);
        assert_eq!(stats.elided_messages, 0);
        assert_eq!(stats.original_tokens, stats.final_tokens);
        assert_eq!(out.len(), msgs.len());
        // Strings unchanged.
        match &out[2].content {
            MessageContent::Text { text } => assert_eq!(text, "hello"),
            other => panic!("expected Text, got {other:?}"),
        }
    }
    #[test]
    fn elides_old_tool_result_before_old_assistant_prose() {
        // History: sys, user, assistant_calls, big_tool_result,
        //          assistant_with_big_text, user, assistant_calls,
        //          small_tool_result.
        // KEEP_TAIL=4 protects the last four; the big tool result
        // sits in the prunable range and should go first because
        // pass 0 (tool results) runs before pass 1 (prose).
        let big_result = "X".repeat(4096);
        let big_prose = "Y".repeat(2048);
        let msgs = vec![
            sys("preamble"),
            user("first ask"),
            assistant_calls(None, "read_file", r#"{"path":"/a"}"#, "c0"),
            tool_result("c0", &big_result),
            assistant_text(&big_prose),
            user("follow up"),
            assistant_calls(None, "read_file", r#"{"path":"/b"}"#, "c1"),
            tool_result("c1", "short result body"),
        ];
        let before = total_tokens(&msgs);
        // Force compaction by setting budget well below current.
        let budget = before / 2;
        let (out, stats) = compact_to_budget(&msgs, budget);
        assert!(
            stats.elided_messages >= 1,
            "expected at least one elision, got {stats:?}"
        );
        // The big tool result must be elided (oldest fat target).
        match &out[3].content {
            MessageContent::ToolResult { content, .. } => {
                assert!(
                    content.starts_with("(elided:"),
                    "tool result not elided: {content:?}"
                );
            }
            other => panic!("expected ToolResult, got {other:?}"),
        }
        // Last four messages must be untouched.
        assert!(matches!(
            &out[out.len() - 1].content,
            MessageContent::ToolResult { content, .. } if content == "short result body"
        ));
    }
    #[test]
    fn never_elides_system_or_user_turns() {
        let big_user = "U".repeat(8192);
        let msgs = vec![sys("preamble"), user(&big_user), assistant_text("ok")];
        let budget = 10; // way below — forces all possible elision
        let (out, _stats) = compact_to_budget(&msgs, budget);
        // System unchanged.
        match &out[0].content {
            MessageContent::Text { text } => assert_eq!(text, "preamble"),
            other => panic!("expected Text, got {other:?}"),
        }
        // User unchanged even though it's huge.
        match &out[1].content {
            MessageContent::Text { text } => assert_eq!(text.len(), big_user.len()),
            other => panic!("expected Text, got {other:?}"),
        }
    }
    #[test]
    fn preserves_tool_call_id_pairing_after_elision() {
        // OpenAI strict mode rejects a tool-result whose tool_call_id
        // doesn't match a preceding assistant tool_call. Elision
        // must not break that linkage.
        let big = "Z".repeat(4096);
        let msgs = vec![
            sys("preamble"),
            user("first"),
            assistant_calls(None, "read_file", r#"{"path":"/a"}"#, "call_42"),
            tool_result("call_42", &big),
            // Tail messages.
            user("next"),
            assistant_calls(None, "read_file", r#"{"path":"/b"}"#, "call_43"),
            tool_result("call_43", "ok"),
            assistant_text("done"),
        ];
        let budget = total_tokens(&msgs) / 3;
        let (out, _stats) = compact_to_budget(&msgs, budget);
        // The assistant call and its result both carry call_42.
        let call_id = match &out[2].content {
            MessageContent::ToolCalls { calls, .. } => calls[0].id.clone(),
            other => panic!("expected ToolCalls, got {other:?}"),
        };
        match &out[3].content {
            MessageContent::ToolResult { tool_call_id, .. } => {
                assert_eq!(tool_call_id, &call_id, "pairing broken");
            }
            other => panic!("expected ToolResult, got {other:?}"),
        }
    }
    #[test]
    fn estimate_tokens_grows_with_content() {
        let small = sys("hi");
        let large = sys(&"x".repeat(10_000));
        assert!(estimate_tokens(&large) > estimate_tokens(&small) * 100);
    }
    #[test]
    fn elide_in_place_skips_short_content() {
        let mut m = tool_result("c0", "tiny");
        assert!(!elide_in_place(&mut m));
        match m.content {
            MessageContent::ToolResult { content, .. } => assert_eq!(content, "tiny"),
            other => panic!("expected ToolResult, got {other:?}"),
        }
    }
    #[test]
    fn returns_best_effort_when_budget_unmeetable() {
        // Single huge user message that cannot be elided. Budget 10.
        // We don't error — we return what we have and let upstream
        // refuse the prompt with its own error.
        let big_user = "U".repeat(100_000);
        let msgs = vec![sys("preamble"), user(&big_user)];
        let (out, stats) = compact_to_budget(&msgs, 10);
        assert_eq!(out.len(), msgs.len());
        assert!(stats.final_tokens > 10, "still over budget by design");
    }
 }
--- a/crates/helexa-acp/src/config.rs
+++ b/crates/helexa-acp/src/config.rs
@@ -98,6 +98,14 @@ pub struct EndpointConfig {
    /// request field.
    #[serde(default)]
    pub max_tokens: Option<u64>,
    /// Model context window in tokens (prompt + response). When set,
    /// the agent compacts conversation history before each completion
    /// so the prompt fits within `context_window - max_tokens - safety`
    /// tokens — long sessions on small-context local models (Qwen3 at
    /// 32 K) survive past the first few tool-call rounds rather than
    /// dying with `prompt_too_long`. `None` disables compaction.
    #[serde(default)]
    pub context_window: Option<usize>,
 }
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
@@ -193,6 +201,15 @@ impl Config {
                })
            })
            .transpose()?;
        let context_window = std::env::var("HELEXA_ACP_CONTEXT_WINDOW")
            .ok()
            .filter(|s| !s.is_empty())
            .map(|s| {
                s.parse::<usize>().with_context(|| {
                    format!("HELEXA_ACP_CONTEXT_WINDOW is not a positive integer ({s})")
                })
            })
            .transpose()?;
        Ok(Self {
            default_endpoint: Some(DEFAULT_ENDPOINT_NAME.into()),
            endpoints: vec![EndpointConfig {
@@ -203,6 +220,7 @@ impl Config {
                api_key,
                api_key_env: None,
                max_tokens,
                context_window,
            }],
            system_prompt_path,
        })
@@ -316,6 +334,7 @@ mod tests {
            api_key: None,
            api_key_env: None,
            max_tokens: None,
            context_window: None,
        };
        assert_eq!(
            ep.chat_completions_url().as_str(),
--- a/crates/helexa-acp/src/main.rs
+++ b/crates/helexa-acp/src/main.rs
@@ -16,6 +16,7 @@ use agent_client_protocol::{Result, Stdio};
 use std::sync::Arc;
 mod agent;
 mod compaction;
 mod config;
 mod prompt;
 mod provider;
--- a/crates/helexa-acp/src/provider/openai_chat.rs
+++ b/crates/helexa-acp/src/provider/openai_chat.rs
@@ -149,6 +149,7 @@ mod tests {
            api_key: None,
            api_key_env: None,
            max_tokens: None,
            context_window: None,
        }
    }