cortex/crates/helexa-acp/src/compaction.rs

//! Rolling-conversation compaction for small-context local models.
//!
//! The tool-call loop in [`crate::agent`] grows the message vec it
//! sends upstream every round. On a frontier model that's fine; on a
//! 32 K Qwen3 the first few `read_file` results can push the prompt
//! past the model's context window, at which point cortex/neuron
//! refuses with `prompt_too_long` and the whole turn dies. Long-form
//! local agents are unusable without something here.
//!
//! Strategy (intentionally simple — no LLM-summarization round-trip,
//! no tokenizer dependency):
//!
//! 1. **Protect** the things the model cannot reason without:
//!    - The system prompt (idx 0).
//!    - Every `Role::User` turn (the user's intent — irreplaceable).
//!    - The last [`KEEP_TAIL`] messages (most recent rounds stay
//!      verbatim so the model can keep working on what it just
//!      observed).
//! 2. **Elide** older `Role::Assistant` prose and older `Role::Tool`
//!    result content. The structure stays — `tool_call_id`s, tool
//!    names, and argument JSON survive intact — so OpenAI's strict
//!    `tool_calls` ↔ `tool` pairing schema remains satisfied. Only
//!    the *payload* shrinks to a one-line marker.
//! 3. Walk oldest→newest in two passes — tool results first, then
//!    assistant prose — re-checking the budget after each elision.
//!    Stop as soon as we fit; we don't compact more than necessary.
//! 4. If we still exceed budget after eliding everything we're
//!    allowed to, return what we have. The upstream will surface a
//!    `prompt_too_long` error and the user can intervene; that's
//!    better than silently dropping content the model needs.
//!
//! Token estimation uses a `chars / 3.5` heuristic (measured as
//! UTF-8 byte length, so non-ASCII text counts for more) — meant to
//! be conservative (over-estimates tokens slightly) so we compact a
//! touch early rather than a touch late.

use crate::provider::{Message, MessageContent, MessagePart, Role};

/// Most-recent N messages that are never elided. Roughly "the
/// current tool round in flight" — assistant turn that called the
/// tools + each tool result + a bit of slack.
const KEEP_TAIL: usize = 4;

/// Below this content size we don't bother eliding — the savings
/// don't outweigh the loss of detail. Roughly 60–80 tokens. The
/// threshold is compared against `String::len()` in
/// `elide_in_place`, so despite the name the unit is UTF-8 bytes.
const ELIDE_MIN_CHARS: usize = 256;

/// Rough characters-per-token ratio for English + code mixed in
/// (the estimator *divides* a length by this value). The actual
/// per-tokenizer ratio varies (GPT-4o ≈ 4 chars/token on English
/// prose, ≈ 3 chars/token on code-heavy text). We pick a value on
/// the conservative end so the budget check fires *before* the
/// upstream tokenizer says no.
const CHARS_PER_TOKEN: f32 = 3.5;

/// Per-message envelope overhead (role + JSON framing). Comes out
/// to a few tokens; tiny but it adds up across long histories.
const ENVELOPE_TOKENS: usize = 8;

/// Rough per-image token cost used by the budget estimator. Real
/// vision tokenizers vary widely (256–1024 tokens for typical
/// resolutions on Qwen3-VL, OpenAI's `low`/`high` detail toggles
/// pick between ~85 and ~1000+). 512 is a defensible middle that
/// keeps compaction from treating images as free.
const IMAGE_TOKENS_APPROX: usize = 512;

/// Summary of a single [`compact_to_budget`] run, meant for the
/// caller's logs. Every figure comes from [`estimate_tokens`], so
/// treat them as approximations — they will not line up exactly
/// with token counts reported by the upstream.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CompactionStats {
    /// Estimated size of the input history, in tokens.
    pub original_tokens: usize,
    /// Estimated size of the returned history, in tokens. Matches
    /// `original_tokens` when nothing had to be compacted.
    pub final_tokens: usize,
    /// How many messages had their payload replaced by a marker.
    /// Zero on the hot path, where the input already fits.
    pub elided_messages: usize,
}

impl CompactionStats {
    /// Stats for the no-op case: input and output are both `tokens`
    /// and no message was elided.
    fn unchanged(tokens: usize) -> Self {
        Self {
            original_tokens: tokens,
            final_tokens: tokens,
            // The elision counter keeps its derived default of zero.
            ..Self::default()
        }
    }
}

/// Approximate token count for one message.
///
/// The textual payload is measured with `str::len` (UTF-8 bytes, so
/// non-ASCII text is counted more than once per character), divided
/// by [`CHARS_PER_TOKEN`] and truncated. Each image part is charged
/// a flat [`IMAGE_TOKENS_APPROX`] tokens, and every message pays
/// [`ENVELOPE_TOKENS`] on top. For tool calls the id, name and
/// argument string of every call are counted alongside any prose;
/// for tool results the `tool_call_id` is counted with the content.
///
/// Cheap (no allocation) so safe to call once per message per round.
pub fn estimate_tokens(msg: &Message) -> usize {
    // Text is accumulated in bytes and converted to tokens once at
    // the end; images are counted separately because their cost is
    // already expressed in tokens.
    let (chars, images) = match &msg.content {
        MessageContent::Text { text } => (text.len(), 0),
        MessageContent::MultiPart { parts } => {
            parts
                .iter()
                .fold((0usize, 0usize), |(chars, images), part| match part {
                    MessagePart::Text { text } => (chars + text.len(), images),
                    // Each image is one block in the context window;
                    // the real cost varies by model, so a fixed
                    // middle estimate keeps images from looking free.
                    MessagePart::Image(_) => (chars, images + 1),
                })
        }
        MessageContent::ToolCalls { text, calls } => {
            let prose = text.as_deref().map_or(0, str::len);
            let calls_size: usize = calls
                .iter()
                .map(|c| c.name.len() + c.arguments.len() + c.id.len())
                .sum();
            (prose + calls_size, 0)
        }
        MessageContent::ToolResult {
            tool_call_id,
            content,
        } => (tool_call_id.len() + content.len(), 0),
    };
    // Images are priced directly in tokens. Converting them to a
    // char-equivalent with `CHARS_PER_TOKEN as usize` would truncate
    // 3.5 to 3 (`as` binds tighter than `*`) and under-charge every
    // image by roughly 14%.
    ((chars as f32 / CHARS_PER_TOKEN) as usize) + images * IMAGE_TOKENS_APPROX + ENVELOPE_TOKENS
}

/// Sum of [`estimate_tokens`] across all messages.
pub fn total_tokens(messages: &[Message]) -> usize {
    messages.iter().map(estimate_tokens).sum()
}

/// Project `messages` into a vec whose estimated token count fits in
/// `budget` tokens. Returns the projection plus stats about what
/// was done. When the input already fits, the projection is a clone
/// of the input and stats report zero elisions.
///
/// Index 0, every `Role::User` message and the last [`KEEP_TAIL`]
/// messages are never modified. The message count and order are
/// always preserved; only payloads shrink. If the budget still is
/// not met once every eligible message has been elided, the
/// best-effort projection is returned and `final_tokens` stays above
/// `budget`.
///
/// See module docs for the strategy and protected set.
pub fn compact_to_budget(messages: &[Message], budget: usize) -> (Vec<Message>, CompactionStats) {
    let original = total_tokens(messages);
    if original <= budget {
        return (messages.to_vec(), CompactionStats::unchanged(original));
    }

    let mut out = messages.to_vec();
    // Everything at or beyond this index is the protected tail.
    // `saturating_sub` makes the prunable range empty for histories
    // shorter than `KEEP_TAIL`.
    let tail_start = out.len().saturating_sub(KEEP_TAIL);
    // Running estimate of `out`, adjusted per elision so the whole
    // history is not re-summed after every change.
    let mut running = original;
    let mut elided = 0usize;

    // Two passes. First pass: ToolResult contents (largest savings
    // per elision — read_file payloads land here). Second pass: long
    // Assistant prose. We don't interleave because eliding a long
    // assistant turn before a really old read_file would do less
    // good per elision; oldest-first ordering is enforced *within*
    // each pass instead.
    'passes: for pass in 0..2 {
        // Index 0 is skipped unconditionally.
        for i in 1..tail_start {
            // Only `Role::User` is skipped by role; every other
            // message in the prunable range is selected purely by
            // its content variant.
            if matches!(out[i].role, Role::User) {
                continue;
            }
            let wanted_this_pass = match &out[i].content {
                MessageContent::ToolResult { .. } => pass == 0,
                MessageContent::Text { .. } | MessageContent::ToolCalls { .. } => pass == 1,
                MessageContent::MultiPart { .. } => false,
            };
            if !wanted_this_pass {
                continue;
            }

            // Single-message estimate, taken via a one-element slice.
            let before = total_tokens(&out[i..=i]);
            if !elide_in_place(&mut out[i]) {
                continue;
            }
            elided += 1;
            // `running` includes `before`, so the subtraction cannot
            // underflow.
            running = running - before + total_tokens(&out[i..=i]);
            if running <= budget {
                // Stop as soon as we fit; no more than necessary.
                break 'passes;
            }
        }
    }

    (
        out,
        CompactionStats {
            original_tokens: original,
            final_tokens: running,
            elided_messages: elided,
        },
    )
}

/// Replace a message's bulky payload with a one-line marker while
/// leaving its structure alone (role, `tool_call_id`, and the calls
/// themselves are not touched). Returns `true` only if the message
/// was actually rewritten.
///
/// - `ToolResult.content` → `(elided: N bytes of tool result)`
/// - `ToolCalls.text`     → `(elided: N bytes of assistant prose)`
/// - `Text`               → `(elided: N bytes of assistant prose)`
///
/// Payloads shorter than [`ELIDE_MIN_CHARS`] bytes are left as they
/// are — swapping a tiny string for the marker could make it
/// longer. `ToolCalls` without prose and `MultiPart` messages are
/// never changed.
fn elide_in_place(msg: &mut Message) -> bool {
    // Pick out the string that would be replaced, together with
    // which marker wording applies to it.
    let (payload, is_tool_result) = match &mut msg.content {
        MessageContent::ToolResult { content, .. } => (content, true),
        MessageContent::Text { text } => (text, false),
        MessageContent::ToolCalls {
            text: Some(prose), ..
        } => (prose, false),
        // No prose to drop on a bare tool call. MultiPart is left
        // alone as well: the caller skips User turns by role, and
        // declining here keeps any other MultiPart message intact.
        MessageContent::ToolCalls { text: None, .. } | MessageContent::MultiPart { .. } => {
            return false;
        }
    };

    let original_len = payload.len();
    if original_len < ELIDE_MIN_CHARS {
        return false;
    }

    *payload = if is_tool_result {
        format!("(elided: {} bytes of tool result)", original_len)
    } else {
        format!("(elided: {} bytes of assistant prose)", original_len)
    };
    true
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::provider::ToolCall;

    /// System message with plain text content.
    fn sys(text: &str) -> Message {
        Message {
            role: Role::System,
            content: MessageContent::Text { text: text.into() },
        }
    }
    /// User message with plain text content.
    fn user(text: &str) -> Message {
        Message {
            role: Role::User,
            content: MessageContent::Text { text: text.into() },
        }
    }
    /// Assistant message with plain text content.
    fn assistant_text(text: &str) -> Message {
        Message {
            role: Role::Assistant,
            content: MessageContent::Text { text: text.into() },
        }
    }
    /// Assistant message carrying a single tool call plus optional
    /// prose.
    fn assistant_calls(text: Option<&str>, name: &str, args: &str, id: &str) -> Message {
        Message {
            role: Role::Assistant,
            content: MessageContent::ToolCalls {
                text: text.map(|s| s.to_string()),
                calls: vec![ToolCall {
                    id: id.into(),
                    name: name.into(),
                    arguments: args.into(),
                }],
            },
        }
    }
    /// Tool message answering the call identified by `id`.
    fn tool_result(id: &str, body: &str) -> Message {
        Message {
            role: Role::Tool,
            content: MessageContent::ToolResult {
                tool_call_id: id.into(),
                content: body.into(),
            },
        }
    }

    #[test]
    fn under_budget_is_a_no_op_clone() {
        let msgs = vec![sys("you are an agent"), user("hi"), assistant_text("hello")];
        let (out, stats) = compact_to_budget(&msgs, 10_000);
        assert_eq!(stats.elided_messages, 0);
        assert_eq!(stats.original_tokens, stats.final_tokens);
        assert_eq!(out.len(), msgs.len());
        // Strings unchanged.
        match &out[2].content {
            MessageContent::Text { text } => assert_eq!(text, "hello"),
            other => panic!("expected Text, got {other:?}"),
        }
    }

    #[test]
    fn elides_old_tool_result_before_old_assistant_prose() {
        // History (by index):
        //   0 sys, 1 user, 2 big assistant prose, 3 assistant_calls,
        //   4 big tool result,
        //   5 user, 6 assistant_calls, 7 small tool result,
        //   8 assistant text.
        // With nine messages and KEEP_TAIL=4 the prunable range is
        // 1..5, so both the prose (idx 2) and the tool result (idx 4)
        // are candidates. The prose is *older*, so a single
        // oldest-first sweep would hit it first; the two-pass design
        // must elide the tool result first, and that alone is enough
        // to meet the budget.
        let big_result = "X".repeat(4096);
        let big_prose = "Y".repeat(2048);
        let msgs = vec![
            sys("preamble"),
            user("first ask"),
            assistant_text(&big_prose),
            assistant_calls(None, "read_file", r#"{"path":"/a"}"#, "c0"),
            tool_result("c0", &big_result),
            user("follow up"),
            assistant_calls(None, "read_file", r#"{"path":"/b"}"#, "c1"),
            tool_result("c1", "short result body"),
            assistant_text("done"),
        ];
        let before = total_tokens(&msgs);
        // Force compaction by setting budget well below current, but
        // high enough that dropping the tool result alone suffices.
        let budget = before / 2;
        let (out, stats) = compact_to_budget(&msgs, budget);

        assert_eq!(
            stats.elided_messages, 1,
            "expected exactly one elision, got {stats:?}"
        );
        assert!(
            stats.final_tokens <= budget,
            "still over budget: {stats:?}"
        );
        // The big tool result must be elided (pass 0 target).
        match &out[4].content {
            MessageContent::ToolResult { content, .. } => {
                assert!(
                    content.starts_with("(elided:"),
                    "tool result not elided: {content:?}"
                );
            }
            other => panic!("expected ToolResult, got {other:?}"),
        }
        // The older prose must survive: the budget was met before
        // pass 1 ever ran.
        match &out[2].content {
            MessageContent::Text { text } => assert_eq!(text.len(), big_prose.len()),
            other => panic!("expected Text, got {other:?}"),
        }
        // Spot-check the protected tail.
        assert!(matches!(
            &out[7].content,
            MessageContent::ToolResult { content, .. } if content == "short result body"
        ));
        assert!(matches!(
            &out[8].content,
            MessageContent::Text { text } if text == "done"
        ));
    }

    #[test]
    fn never_elides_system_or_user_turns() {
        // Seven messages with KEEP_TAIL=4 put indices 1 and 2 in the
        // prunable range, so the huge user turn at idx 1 is actually
        // visited and must be skipped by the role check (a shorter
        // history would sit entirely inside the protected tail and
        // prove nothing).
        let big_sys = "S".repeat(8192);
        let big_user = "U".repeat(8192);
        let msgs = vec![
            sys(&big_sys),
            user(&big_user),
            assistant_text("ok"),
            user("again"),
            assistant_text("ok"),
            user("and again"),
            assistant_text("ok"),
        ];
        let budget = 10; // way below — forces all possible elision
        let (out, stats) = compact_to_budget(&msgs, budget);
        // System unchanged even though it's huge.
        match &out[0].content {
            MessageContent::Text { text } => assert_eq!(text, &big_sys),
            other => panic!("expected Text, got {other:?}"),
        }
        // User unchanged even though it's huge.
        match &out[1].content {
            MessageContent::Text { text } => assert_eq!(text, &big_user),
            other => panic!("expected Text, got {other:?}"),
        }
        // The only other prunable message is too short to elide.
        assert_eq!(stats.elided_messages, 0, "unexpected elision: {stats:?}");
    }

    #[test]
    fn preserves_tool_call_id_pairing_after_elision() {
        // OpenAI strict mode rejects a tool-result whose tool_call_id
        // doesn't match a preceding assistant tool_call. Elision
        // must not break that linkage.
        let big = "Z".repeat(4096);
        let msgs = vec![
            sys("preamble"),
            user("first"),
            assistant_calls(None, "read_file", r#"{"path":"/a"}"#, "call_42"),
            tool_result("call_42", &big),
            // Tail messages.
            user("next"),
            assistant_calls(None, "read_file", r#"{"path":"/b"}"#, "call_43"),
            tool_result("call_43", "ok"),
            assistant_text("done"),
        ];
        let budget = total_tokens(&msgs) / 3;
        let (out, _stats) = compact_to_budget(&msgs, budget);
        // The assistant call and its result both carry call_42.
        let call_id = match &out[2].content {
            MessageContent::ToolCalls { calls, .. } => calls[0].id.clone(),
            other => panic!("expected ToolCalls, got {other:?}"),
        };
        match &out[3].content {
            MessageContent::ToolResult {
                tool_call_id,
                content,
            } => {
                // Make sure elision really happened, otherwise the
                // pairing check below is vacuous.
                assert!(
                    content.starts_with("(elided:"),
                    "tool result not elided: {content:?}"
                );
                assert_eq!(tool_call_id, &call_id, "pairing broken");
            }
            other => panic!("expected ToolResult, got {other:?}"),
        }
    }

    #[test]
    fn estimate_tokens_grows_with_content() {
        let small = sys("hi");
        let large = sys(&"x".repeat(10_000));
        assert!(estimate_tokens(&large) > estimate_tokens(&small) * 100);
    }

    #[test]
    fn elide_in_place_skips_short_content() {
        let mut m = tool_result("c0", "tiny");
        assert!(!elide_in_place(&mut m));
        match m.content {
            MessageContent::ToolResult { content, .. } => assert_eq!(content, "tiny"),
            other => panic!("expected ToolResult, got {other:?}"),
        }
    }

    #[test]
    fn returns_best_effort_when_budget_unmeetable() {
        // Single huge user message that cannot be elided. Budget 10.
        // We don't error — we return what we have and let upstream
        // refuse the prompt with its own error.
        let big_user = "U".repeat(100_000);
        let msgs = vec![sys("preamble"), user(&big_user)];
        let (out, stats) = compact_to_budget(&msgs, 10);
        assert_eq!(out.len(), msgs.len());
        assert!(stats.final_tokens > 10, "still over budget by design");
    }
}