From df0abfe4d48ecbf1b6e3831cd928e7f9465c972a Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Fri, 29 May 2026 09:43:00 +0300 Subject: [PATCH] feat(helexa-acp): image input for vision-capable models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 5. Zed clipboard/DnD images get forwarded as OpenAI content-array messages on user turns. - New MessageContent::MultiPart variant + MessagePart (Text|Image) + ImageData struct (mime_type, base64 data, optional uri). - flatten_prompt now produces structured content: collapses to Text when every block is text (some upstreams treat array-form as vision-only and refuse on text-only models), otherwise produces MultiPart preserving block order. - OpenAI encoder emits `[{type:"text",text:…}, {type:"image_url", image_url:{url:"data:{mime};base64,{data}"}}]` for MultiPart user messages. Data URIs are used over remote `uri` because they round-trip through every upstream we care about. - prompt_capabilities.image = true at initialize so Zed actually sends image blocks. - compaction estimates ~512 tokens per image (the middle of the Qwen3-VL / OpenAI detail range) so the budget tracker doesn't pretend images are free. - session/load replays image-bearing user turns by surfacing the text parts verbatim and rendering each image as a "[image: {mime} ({n} bytes)]" placeholder chunk — Zed can show the prior text context even though re-uploading the bytes through ACP isn't meaningful for resume. - 4 new tests: flatten produces MultiPart in block order, image-only prompts still flatten to MultiPart, encoder emits the correct array shape, text-only encoding stays as the string form. Co-Authored-By: Claude Opus 4.7 --- crates/helexa-acp/src/agent.rs | 185 +++++++++++++++--- crates/helexa-acp/src/compaction.rs | 31 ++- crates/helexa-acp/src/provider/mod.rs | 30 +++ crates/helexa-acp/src/provider/openai_chat.rs | 113 ++++++++++- 4 files changed, 328 insertions(+), 31 deletions(-) diff --git a/crates/helexa-acp/src/agent.rs b/crates/helexa-acp/src/agent.rs index 63d3546..9b2e72a 100644 --- a/crates/helexa-acp/src/agent.rs +++ b/crates/helexa-acp/src/agent.rs @@ -12,7 +12,8 @@ //! | `session/set_model` | switch the session's active model (endpoint:model selector) | //! | (anything else) | "not implemented yet" error | //! -//! Stage 5 flips on image content. +//! Stage 5 flipped on image content. Stage 6 starts adding new wire +//! protocols (Anthropic /v1/messages first). use std::path::PathBuf; use std::sync::Arc; @@ -288,9 +289,9 @@ impl Agent { } fn initialize_response(req: &InitializeRequest) -> InitializeResponse { - // Stage 2: text-only prompts. Image / audio / embedded resources - // flip on in later stages. - let prompt_caps = PromptCapabilities::default(); + // Stage 5: image is on (Zed clipboard / drag-drop). Audio and + // embedded resources flip on in later stages. + let prompt_caps = PromptCapabilities::default().image(true); // Stage 3b: advertise both the top-level `load_session` flag and // the `session/list` sub-capability. Zed (and other ACP clients) // uses `session/list` to discover the session id that belongs to @@ -488,6 +489,31 @@ fn replay_history(cx: &ConnectionTo, session_id: &SessionId, history: &[ send(SessionUpdate::UserMessageChunk(text_chunk(text.clone()))); total_events += 1; } + (Role::User, MessageContent::MultiPart { parts }) => { + // We can re-emit text parts as UserMessageChunks. + // Images get a placeholder line so the user sees + // *that* an image was attached; re-replaying the + // image bytes themselves through ACP would require + // a path round-trip we don't currently keep. + for part in parts { + match part { + crate::provider::MessagePart::Text { text } => { + send(SessionUpdate::UserMessageChunk(text_chunk(text.clone()))); + total_events += 1; + } + crate::provider::MessagePart::Image(img) => { + let label = match &img.uri { + Some(u) => format!("[image: {u}]"), + None => { + format!("[image: {} ({} bytes)]", img.mime_type, img.data.len()) + } + }; + send(SessionUpdate::UserMessageChunk(text_chunk(label))); + total_events += 1; + } + } + } + } (Role::Assistant, MessageContent::Text { text }) => { send(SessionUpdate::AgentMessageChunk(text_chunk(text.clone()))); total_events += 1; @@ -586,10 +612,18 @@ fn handle_list_sessions(req: ListSessionsRequest) -> anyhow::Result Option { + use crate::provider::MessagePart; history .iter() .find_map(|msg| match (msg.role, &msg.content) { - (Role::User, MessageContent::Text { text }) => Some(text.as_str()), + (Role::User, MessageContent::Text { text }) => Some(text.clone()), + (Role::User, MessageContent::MultiPart { parts }) => parts.iter().find_map(|p| { + if let MessagePart::Text { text } = p { + Some(text.clone()) + } else { + None + } + }), _ => None, }) .map(|s| { @@ -823,10 +857,10 @@ async fn drive_prompt( state.cancel.cancel(); let cancel = CancellationToken::new(); state.cancel = cancel.clone(); - let user_text = flatten_prompt(&req.prompt); + let user_content = flatten_prompt(&req.prompt); state.history.push(Message { role: Role::User, - content: MessageContent::Text { text: user_text }, + content: user_content, }); ( state.history.clone(), @@ -1422,31 +1456,76 @@ fn map_finish_reason(reason: Option<&str>) -> StopReason { } /// Pure helper — turn a prompt's ContentBlocks into the user-message -/// text that goes into history. Lifted out so unit tests don't need a -/// running runtime. -fn flatten_prompt(blocks: &[ContentBlock]) -> String { - let mut out = String::new(); - for block in blocks { - if !out.is_empty() { - out.push_str("\n\n"); +/// content that goes into history. +/// +/// - All-text prompts collapse to [`MessageContent::Text`] (cheaper +/// to encode upstream — many OpenAI-compatible servers prefer the +/// string form when there's no reason to use the array form). +/// - Anything with at least one image becomes +/// [`MessageContent::MultiPart`], preserving block order so the +/// user's "this image, then this text" pacing reaches the model. +/// - `ResourceLink` is rendered as inline text so the model knows +/// it was referenced. Audio and embedded resources aren't +/// advertised as supported in [`PromptCapabilities`]; drop with a +/// warning if a non-conformant client sends one. +fn flatten_prompt(blocks: &[ContentBlock]) -> MessageContent { + use crate::provider::{ImageData, MessagePart}; + + let mut parts: Vec = Vec::new(); + let mut text_buf = String::new(); + let flush_text = |buf: &mut String, parts: &mut Vec| { + if !buf.is_empty() { + parts.push(MessagePart::Text { + text: std::mem::take(buf), + }); } + }; + + for block in blocks { match block { - ContentBlock::Text(t) => out.push_str(&t.text), - ContentBlock::ResourceLink(link) => { - // Stage 2 has no fs access; surface the link as a - // textual reference so the model at least knows it - // was asked about something. - out.push_str(&format!("[resource link: {}]", link.uri)); + ContentBlock::Text(t) => { + if !text_buf.is_empty() { + text_buf.push_str("\n\n"); + } + text_buf.push_str(&t.text); + } + ContentBlock::ResourceLink(link) => { + if !text_buf.is_empty() { + text_buf.push_str("\n\n"); + } + text_buf.push_str(&format!("[resource link: {}]", link.uri)); + } + ContentBlock::Image(img) => { + flush_text(&mut text_buf, &mut parts); + parts.push(MessagePart::Image(ImageData { + mime_type: img.mime_type.clone(), + data: img.data.clone(), + uri: img.uri.clone(), + })); } - // Image / Audio / Resource: not advertised in - // PromptCapabilities for Stage 2; a well-behaved client - // shouldn't send these. If one does, drop and warn. other => { - tracing::warn!(?other, "ignoring unsupported content block in Stage 2"); + tracing::warn!(?other, "ignoring unsupported content block"); } } } - out + flush_text(&mut text_buf, &mut parts); + + // Collapse to plain Text when there's no image part — the + // OpenAI string-form is friendlier to non-vision endpoints + // (some treat the array form as a vision-only path). + let has_image = parts.iter().any(|p| matches!(p, MessagePart::Image(_))); + if !has_image { + let text = parts + .into_iter() + .filter_map(|p| match p { + MessagePart::Text { text } => Some(text), + MessagePart::Image(_) => None, + }) + .collect::>() + .join("\n\n"); + return MessageContent::Text { text }; + } + MessageContent::MultiPart { parts } } /// Pure helper — pick which provider handles a session's `model_id`. @@ -1475,9 +1554,16 @@ mod tests { // ── flatten_prompt ────────────────────────────────────────────── + fn expect_text(content: &MessageContent) -> &str { + match content { + MessageContent::Text { text } => text.as_str(), + other => panic!("expected MessageContent::Text, got {other:?}"), + } + } + #[test] fn flatten_empty_prompt_is_empty() { - assert_eq!(flatten_prompt(&[]), ""); + assert_eq!(expect_text(&flatten_prompt(&[])), ""); } #[test] @@ -1486,7 +1572,7 @@ mod tests { ContentBlock::Text(TextContent::new("first")), ContentBlock::Text(TextContent::new("second")), ]; - assert_eq!(flatten_prompt(&blocks), "first\n\nsecond"); + assert_eq!(expect_text(&flatten_prompt(&blocks)), "first\n\nsecond"); } #[test] @@ -1495,7 +1581,50 @@ mod tests { "readme", "file:///tmp/x", ))]; - assert_eq!(flatten_prompt(&blocks), "[resource link: file:///tmp/x]"); + assert_eq!( + expect_text(&flatten_prompt(&blocks)), + "[resource link: file:///tmp/x]" + ); + } + + #[test] + fn flatten_text_and_image_produces_multipart_in_order() { + use crate::provider::MessagePart; + let blocks = vec![ + ContentBlock::Text(TextContent::new("describe:")), + ContentBlock::Image(agent_client_protocol::schema::ImageContent::new( + "iVBORw0KGgo=", + "image/png", + )), + ContentBlock::Text(TextContent::new("…in detail.")), + ]; + let content = flatten_prompt(&blocks); + match content { + MessageContent::MultiPart { parts } => { + assert_eq!(parts.len(), 3); + assert!(matches!(&parts[0], MessagePart::Text { text } if text == "describe:")); + assert!(matches!(&parts[1], MessagePart::Image(img) + if img.mime_type == "image/png" && img.data == "iVBORw0KGgo=")); + assert!(matches!(&parts[2], MessagePart::Text { text } if text == "…in detail.")); + } + other => panic!("expected MultiPart, got {other:?}"), + } + } + + #[test] + fn flatten_image_only_still_produces_multipart() { + use crate::provider::MessagePart; + let blocks = vec![ContentBlock::Image( + agent_client_protocol::schema::ImageContent::new("AAAA", "image/jpeg"), + )]; + match flatten_prompt(&blocks) { + MessageContent::MultiPart { parts } => { + assert_eq!(parts.len(), 1); + assert!(matches!(&parts[0], MessagePart::Image(img) + if img.mime_type == "image/jpeg")); + } + other => panic!("expected MultiPart, got {other:?}"), + } } // ── resolve_provider ──────────────────────────────────────────── diff --git a/crates/helexa-acp/src/compaction.rs b/crates/helexa-acp/src/compaction.rs index 2594e49..a8ccabe 100644 --- a/crates/helexa-acp/src/compaction.rs +++ b/crates/helexa-acp/src/compaction.rs @@ -32,7 +32,7 @@ //! (over-estimates tokens slightly) so we compact a touch early //! rather than a touch late. -use crate::provider::{Message, MessageContent, Role}; +use crate::provider::{Message, MessageContent, MessagePart, Role}; /// Most-recent N messages that are never elided. Roughly "the /// current tool round in flight" — assistant turn that called the @@ -54,6 +54,13 @@ const CHARS_PER_TOKEN: f32 = 3.5; /// to a few tokens; tiny but it adds up across long histories. const ENVELOPE_TOKENS: usize = 8; +/// Rough per-image token cost used by the budget estimator. Real +/// vision tokenizers vary widely (256–1024 tokens for typical +/// resolutions on Qwen3-VL, OpenAI's `low`/`high` detail toggles +/// pick between ~85 and ~1000+). 512 is a defensible middle that +/// keeps compaction from treating images as free. +const IMAGE_TOKENS_APPROX: usize = 512; + /// Stats reported back from [`compact_to_budget`] for the caller to /// log. The numbers are estimates (see [`estimate_tokens`]), so /// don't compare them to upstream-reported token counts as if they @@ -87,6 +94,19 @@ impl CompactionStats { pub fn estimate_tokens(msg: &Message) -> usize { let chars = match &msg.content { MessageContent::Text { text } => text.len(), + MessageContent::MultiPart { parts } => parts + .iter() + .map(|p| match p { + MessagePart::Text { text } => text.len(), + // Each image is one block in the context window; the + // upstream tokenizer handles the real cost (and it + // varies wildly by model — Qwen3-VL uses ~256-1024 + // tokens per image depending on size). Take a + // middle estimate so the budget tracker doesn't + // pretend images are free. + MessagePart::Image(_) => IMAGE_TOKENS_APPROX * CHARS_PER_TOKEN as usize, + }) + .sum(), MessageContent::ToolCalls { text, calls } => { let txt = text.as_deref().map(|s| s.len()).unwrap_or(0); let calls_size: usize = calls @@ -206,6 +226,15 @@ fn elide_in_place(msg: &mut Message) -> bool { *text = format!("(elided: {} bytes of assistant prose)", text.len()); true } + MessageContent::MultiPart { .. } => { + // MultiPart messages today only exist as User turns, + // and User turns are protected by the role check in + // `compact_to_budget` — so this branch is unreachable + // for current call sites. Returning false keeps the + // unreachable path benign if a future stage starts + // emitting MultiPart on other roles. + false + } } } diff --git a/crates/helexa-acp/src/provider/mod.rs b/crates/helexa-acp/src/provider/mod.rs index c2769e2..af175e1 100644 --- a/crates/helexa-acp/src/provider/mod.rs +++ b/crates/helexa-acp/src/provider/mod.rs @@ -101,6 +101,11 @@ pub enum MessageContent { /// enum, which is incompatible with newtype-of-primitive /// variants. Text { text: String }, + /// Mixed text + image user turn. Stage 5 introduces this when + /// Zed sends an `ImageContent` block alongside the user's prompt. + /// Providers that don't support vision should down-convert by + /// dropping image parts and concatenating text parts. + MultiPart { parts: Vec }, /// Assistant turn that called one or more tools. Stage 3 starts /// constructing this when the provider stream yields a /// `ToolCallStart` / `ToolCallArgsDelta` sequence. @@ -117,6 +122,31 @@ pub enum MessageContent { }, } +/// One part of a [`MessageContent::MultiPart`] message. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum MessagePart { + Text { text: String }, + Image(ImageData), +} + +/// Inline image attachment. `data` is base64-encoded raw image +/// bytes; the encoder constructs an `image_url` data URI from it +/// at request time. `uri` carries any pointer the client supplied +/// (e.g. `file:///tmp/x.png`) — we keep it on the message for +/// debugging / future providers but the OpenAI encoder ignores it +/// when `data` is present (data wins, since it round-trips through +/// every wire format). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ImageData { + pub mime_type: String, + /// Base64-encoded image bytes (no `data:` prefix, no padding + /// stripped — exactly what `ImageContent.data` carried). + pub data: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub uri: Option, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ToolCall { /// Provider-assigned id that ties the call to its result. The diff --git a/crates/helexa-acp/src/provider/openai_chat.rs b/crates/helexa-acp/src/provider/openai_chat.rs index 6497808..6d5bd8c 100644 --- a/crates/helexa-acp/src/provider/openai_chat.rs +++ b/crates/helexa-acp/src/provider/openai_chat.rs @@ -14,8 +14,8 @@ use serde_json::{Value, json}; use tokio_util::sync::CancellationToken; use super::{ - CompletionEvent, CompletionRequest, Message, MessageContent, ModelInfo, Provider, Role, - ToolSpec, UsageStats, + CompletionEvent, CompletionRequest, ImageData, Message, MessageContent, MessagePart, ModelInfo, + Provider, Role, ToolSpec, UsageStats, }; use crate::config::EndpointConfig; @@ -189,6 +189,71 @@ mod tests { assert_eq!(body["stream_options"]["include_usage"], true); } + #[test] + fn encodes_user_multipart_with_image_as_content_array() { + let req = CompletionRequest { + model: "vl".into(), + messages: vec![Message { + role: Role::User, + content: MessageContent::MultiPart { + parts: vec![ + MessagePart::Text { + text: "what's in this?".into(), + }, + MessagePart::Image(ImageData { + mime_type: "image/png".into(), + data: "iVBORw0KGgo=".into(), + uri: None, + }), + ], + }, + }], + tools: vec![], + temperature: None, + top_p: None, + max_tokens: None, + }; + let body = encode_request(&req); + let msg = &body["messages"][0]; + assert_eq!(msg["role"], "user"); + let content = msg["content"].as_array().expect("content array"); + assert_eq!(content.len(), 2); + assert_eq!(content[0]["type"], "text"); + assert_eq!(content[0]["text"], "what's in this?"); + assert_eq!(content[1]["type"], "image_url"); + assert_eq!( + content[1]["image_url"]["url"], + "data:image/png;base64,iVBORw0KGgo=" + ); + } + + #[test] + fn encodes_text_only_user_message_as_string() { + // Regression: even though we accept MultiPart now, the + // string form must stay the encoded shape for plain text + // messages — some upstreams treat array-form as a vision + // request and refuse on text-only models. + let req = CompletionRequest { + model: "m".into(), + messages: vec![Message { + role: Role::User, + content: MessageContent::Text { + text: "plain".into(), + }, + }], + tools: vec![], + temperature: None, + top_p: None, + max_tokens: None, + }; + let body = encode_request(&req); + assert_eq!(body["messages"][0]["content"], "plain"); + assert!( + body["messages"][0]["content"].as_array().is_none(), + "text-only must not become array form" + ); + } + #[test] fn encodes_tool_call_round_trip() { let req = CompletionRequest { @@ -510,6 +575,10 @@ fn encode_message(m: &Message) -> Value { json!({"role": "system", "content": text}) } (Role::User, MessageContent::Text { text }) => json!({"role": "user", "content": text}), + (Role::User, MessageContent::MultiPart { parts }) => json!({ + "role": "user", + "content": encode_user_parts(parts), + }), (Role::Assistant, MessageContent::Text { text }) => { json!({"role": "assistant", "content": text}) } @@ -549,6 +618,38 @@ fn encode_message(m: &Message) -> Value { } } +/// Encode a [`MessageContent::MultiPart`] user message as the OpenAI +/// chat content-array form: +/// +/// ```jsonc +/// [ +/// {"type": "text", "text": "describe this:"}, +/// {"type": "image_url", "image_url": {"url": "data:image/png;base64,…"}} +/// ] +/// ``` +/// +/// Images use a `data:` URI built from `mime_type` + base64 `data`. +/// `uri` is intentionally ignored here — the inline data form +/// round-trips through every upstream we care about (cortex's +/// model loaders read the bytes directly, OpenAI accepts both +/// data and remote URLs but data is portable). Sticking to one +/// shape keeps wire-level surprises down. +fn encode_user_parts(parts: &[MessagePart]) -> Value { + let items: Vec = parts + .iter() + .map(|p| match p { + MessagePart::Text { text } => json!({ "type": "text", "text": text }), + MessagePart::Image(ImageData { + mime_type, data, .. + }) => json!({ + "type": "image_url", + "image_url": { "url": format!("data:{mime_type};base64,{data}") }, + }), + }) + .collect(); + Value::Array(items) +} + fn role_str(r: Role) -> &'static str { match r { Role::System => "system", @@ -561,6 +662,14 @@ fn role_str(r: Role) -> &'static str { fn content_as_text(c: &MessageContent) -> String { match c { MessageContent::Text { text } => text.clone(), + MessageContent::MultiPart { parts } => parts + .iter() + .filter_map(|p| match p { + MessagePart::Text { text } => Some(text.as_str()), + MessagePart::Image(_) => None, + }) + .collect::>() + .join("\n\n"), MessageContent::ToolCalls { text, .. } => text.clone().unwrap_or_default(), MessageContent::ToolResult { content, .. } => content.clone(), }