From df0abfe4d48ecbf1b6e3831cd928e7f9465c972a Mon Sep 17 00:00:00 2001
From: rob thijssen <grenade@rob.tn>
Date: Fri, 29 May 2026 09:43:00 +0300
Subject: [PATCH] feat(helexa-acp): image input for vision-capable models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stage 5. Zed clipboard/DnD images get forwarded as OpenAI
content-array messages on user turns.

- New MessageContent::MultiPart variant + MessagePart (Text|Image)
  + ImageData struct (mime_type, base64 data, optional uri).
- flatten_prompt now produces structured content: collapses to
  Text when every block is text (some upstreams treat array-form
  as vision-only and refuse on text-only models), otherwise
  produces MultiPart preserving block order.
- OpenAI encoder emits `[{type:"text",text:…}, {type:"image_url",
  image_url:{url:"data:{mime};base64,{data}"}}]` for MultiPart user
  messages. Data URIs are used over remote `uri` because they
  round-trip through every upstream we care about.
- prompt_capabilities.image = true at initialize so Zed actually
  sends image blocks.
- compaction estimates ~512 tokens per image (the middle of the
  Qwen3-VL / OpenAI detail range) so the budget tracker doesn't
  pretend images are free.
- session/load replays image-bearing user turns by surfacing the
  text parts verbatim and rendering each image as a "[image: {mime}
  ({n} bytes)]" placeholder chunk — Zed can show the prior text
  context even though re-uploading the bytes through ACP isn't
  meaningful for resume.
- 4 new tests: flatten produces MultiPart in block order, image-only
  prompts still flatten to MultiPart, encoder emits the correct
  array shape, text-only encoding stays as the string form.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 crates/helexa-acp/src/agent.rs                | 185 +++++++++++++++---
 crates/helexa-acp/src/compaction.rs           |  31 ++-
 crates/helexa-acp/src/provider/mod.rs         |  30 +++
 crates/helexa-acp/src/provider/openai_chat.rs | 113 ++++++++++-
 4 files changed, 328 insertions(+), 31 deletions(-)
diff --git a/crates/helexa-acp/src/agent.rs b/crates/helexa-acp/src/agent.rs
index 63d3546..9b2e72a 100644
--- a/crates/helexa-acp/src/agent.rs
+++ b/crates/helexa-acp/src/agent.rs
@@ -12,7 +12,8 @@
 //! | `session/set_model`   | switch the session's active model (endpoint:model selector) |
 //! | (anything else)       | "not implemented yet" error                                 |
 //!
-//! Stage 5 flips on image content.
+//! Stage 5 flipped on image content. Stage 6 starts adding new wire
+//! protocols (Anthropic /v1/messages first).
 
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -288,9 +289,9 @@ impl Agent {
 }
 
 fn initialize_response(req: &InitializeRequest) -> InitializeResponse {
-    // Stage 2: text-only prompts. Image / audio / embedded resources
-    // flip on in later stages.
-    let prompt_caps = PromptCapabilities::default();
+    // Stage 5: image is on (Zed clipboard / drag-drop). Audio and
+    // embedded resources flip on in later stages.
+    let prompt_caps = PromptCapabilities::default().image(true);
     // Stage 3b: advertise both the top-level `load_session` flag and
     // the `session/list` sub-capability. Zed (and other ACP clients)
     // uses `session/list` to discover the session id that belongs to
@@ -488,6 +489,31 @@ fn replay_history(cx: &ConnectionTo<Client>, session_id: &SessionId, history: &[
                 send(SessionUpdate::UserMessageChunk(text_chunk(text.clone())));
                 total_events += 1;
             }
+            (Role::User, MessageContent::MultiPart { parts }) => {
+                // We can re-emit text parts as UserMessageChunks.
+                // Images get a placeholder line so the user sees
+                // *that* an image was attached; re-replaying the
+                // image bytes themselves through ACP would require
+                // a path round-trip we don't currently keep.
+                for part in parts {
+                    match part {
+                        crate::provider::MessagePart::Text { text } => {
+                            send(SessionUpdate::UserMessageChunk(text_chunk(text.clone())));
+                            total_events += 1;
+                        }
+                        crate::provider::MessagePart::Image(img) => {
+                            let label = match &img.uri {
+                                Some(u) => format!("[image: {u}]"),
+                                None => {
+                                    format!("[image: {} ({} bytes)]", img.mime_type, img.data.len())
+                                }
+                            };
+                            send(SessionUpdate::UserMessageChunk(text_chunk(label)));
+                            total_events += 1;
+                        }
+                    }
+                }
+            }
             (Role::Assistant, MessageContent::Text { text }) => {
                 send(SessionUpdate::AgentMessageChunk(text_chunk(text.clone())));
                 total_events += 1;
@@ -586,10 +612,18 @@ fn handle_list_sessions(req: ListSessionsRequest) -> anyhow::Result<ListSessions
 /// first user turn's text (truncated to ~60 chars). Empty string
 /// becomes `None` so Zed can fall back to its own placeholder.
 fn derive_session_title(history: &[Message]) -> Option<String> {
+    use crate::provider::MessagePart;
     history
         .iter()
         .find_map(|msg| match (msg.role, &msg.content) {
-            (Role::User, MessageContent::Text { text }) => Some(text.as_str()),
+            (Role::User, MessageContent::Text { text }) => Some(text.clone()),
+            (Role::User, MessageContent::MultiPart { parts }) => parts.iter().find_map(|p| {
+                if let MessagePart::Text { text } = p {
+                    Some(text.clone())
+                } else {
+                    None
+                }
+            }),
             _ => None,
         })
         .map(|s| {
@@ -823,10 +857,10 @@ async fn drive_prompt(
         state.cancel.cancel();
         let cancel = CancellationToken::new();
         state.cancel = cancel.clone();
-        let user_text = flatten_prompt(&req.prompt);
+        let user_content = flatten_prompt(&req.prompt);
         state.history.push(Message {
             role: Role::User,
-            content: MessageContent::Text { text: user_text },
+            content: user_content,
         });
         (
             state.history.clone(),
@@ -1422,31 +1456,76 @@ fn map_finish_reason(reason: Option<&str>) -> StopReason {
 }
 
 /// Pure helper — turn a prompt's ContentBlocks into the user-message
-/// text that goes into history. Lifted out so unit tests don't need a
-/// running runtime.
-fn flatten_prompt(blocks: &[ContentBlock]) -> String {
-    let mut out = String::new();
-    for block in blocks {
-        if !out.is_empty() {
-            out.push_str("\n\n");
+/// content that goes into history.
+///
+/// - All-text prompts collapse to [`MessageContent::Text`] (cheaper
+///   to encode upstream — many OpenAI-compatible servers prefer the
+///   string form when there's no reason to use the array form).
+/// - Anything with at least one image becomes
+///   [`MessageContent::MultiPart`], preserving block order so the
+///   user's "this image, then this text" pacing reaches the model.
+/// - `ResourceLink` is rendered as inline text so the model knows
+///   it was referenced. Audio and embedded resources aren't
+///   advertised as supported in [`PromptCapabilities`]; drop with a
+///   warning if a non-conformant client sends one.
+fn flatten_prompt(blocks: &[ContentBlock]) -> MessageContent {
+    use crate::provider::{ImageData, MessagePart};
+
+    let mut parts: Vec<MessagePart> = Vec::new();
+    let mut text_buf = String::new();
+    let flush_text = |buf: &mut String, parts: &mut Vec<MessagePart>| {
+        if !buf.is_empty() {
+            parts.push(MessagePart::Text {
+                text: std::mem::take(buf),
+            });
         }
+    };
+
+    for block in blocks {
         match block {
-            ContentBlock::Text(t) => out.push_str(&t.text),
-            ContentBlock::ResourceLink(link) => {
-                // Stage 2 has no fs access; surface the link as a
-                // textual reference so the model at least knows it
-                // was asked about something.
-                out.push_str(&format!("[resource link: {}]", link.uri));
+            ContentBlock::Text(t) => {
+                if !text_buf.is_empty() {
+                    text_buf.push_str("\n\n");
+                }
+                text_buf.push_str(&t.text);
+            }
+            ContentBlock::ResourceLink(link) => {
+                if !text_buf.is_empty() {
+                    text_buf.push_str("\n\n");
+                }
+                text_buf.push_str(&format!("[resource link: {}]", link.uri));
+            }
+            ContentBlock::Image(img) => {
+                flush_text(&mut text_buf, &mut parts);
+                parts.push(MessagePart::Image(ImageData {
+                    mime_type: img.mime_type.clone(),
+                    data: img.data.clone(),
+                    uri: img.uri.clone(),
+                }));
             }
-            // Image / Audio / Resource: not advertised in
-            // PromptCapabilities for Stage 2; a well-behaved client
-            // shouldn't send these. If one does, drop and warn.
             other => {
-                tracing::warn!(?other, "ignoring unsupported content block in Stage 2");
+                tracing::warn!(?other, "ignoring unsupported content block");
             }
         }
     }
-    out
+    flush_text(&mut text_buf, &mut parts);
+
+    // Collapse to plain Text when there's no image part — the
+    // OpenAI string-form is friendlier to non-vision endpoints
+    // (some treat the array form as a vision-only path).
+    let has_image = parts.iter().any(|p| matches!(p, MessagePart::Image(_)));
+    if !has_image {
+        let text = parts
+            .into_iter()
+            .filter_map(|p| match p {
+                MessagePart::Text { text } => Some(text),
+                MessagePart::Image(_) => None,
+            })
+            .collect::<Vec<_>>()
+            .join("\n\n");
+        return MessageContent::Text { text };
+    }
+    MessageContent::MultiPart { parts }
 }
 
 /// Pure helper — pick which provider handles a session's `model_id`.
@@ -1475,9 +1554,16 @@ mod tests {
 
     // ── flatten_prompt ──────────────────────────────────────────────
 
+    fn expect_text(content: &MessageContent) -> &str {
+        match content {
+            MessageContent::Text { text } => text.as_str(),
+            other => panic!("expected MessageContent::Text, got {other:?}"),
+        }
+    }
+
     #[test]
     fn flatten_empty_prompt_is_empty() {
-        assert_eq!(flatten_prompt(&[]), "");
+        assert_eq!(expect_text(&flatten_prompt(&[])), "");
     }
 
     #[test]
@@ -1486,7 +1572,7 @@ mod tests {
             ContentBlock::Text(TextContent::new("first")),
             ContentBlock::Text(TextContent::new("second")),
         ];
-        assert_eq!(flatten_prompt(&blocks), "first\n\nsecond");
+        assert_eq!(expect_text(&flatten_prompt(&blocks)), "first\n\nsecond");
     }
 
     #[test]
@@ -1495,7 +1581,50 @@ mod tests {
             "readme",
             "file:///tmp/x",
         ))];
-        assert_eq!(flatten_prompt(&blocks), "[resource link: file:///tmp/x]");
+        assert_eq!(
+            expect_text(&flatten_prompt(&blocks)),
+            "[resource link: file:///tmp/x]"
+        );
+    }
+
+    #[test]
+    fn flatten_text_and_image_produces_multipart_in_order() {
+        use crate::provider::MessagePart;
+        let blocks = vec![
+            ContentBlock::Text(TextContent::new("describe:")),
+            ContentBlock::Image(agent_client_protocol::schema::ImageContent::new(
+                "iVBORw0KGgo=",
+                "image/png",
+            )),
+            ContentBlock::Text(TextContent::new("…in detail.")),
+        ];
+        let content = flatten_prompt(&blocks);
+        match content {
+            MessageContent::MultiPart { parts } => {
+                assert_eq!(parts.len(), 3);
+                assert!(matches!(&parts[0], MessagePart::Text { text } if text == "describe:"));
+                assert!(matches!(&parts[1], MessagePart::Image(img)
+                    if img.mime_type == "image/png" && img.data == "iVBORw0KGgo="));
+                assert!(matches!(&parts[2], MessagePart::Text { text } if text == "…in detail."));
+            }
+            other => panic!("expected MultiPart, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn flatten_image_only_still_produces_multipart() {
+        use crate::provider::MessagePart;
+        let blocks = vec![ContentBlock::Image(
+            agent_client_protocol::schema::ImageContent::new("AAAA", "image/jpeg"),
+        )];
+        match flatten_prompt(&blocks) {
+            MessageContent::MultiPart { parts } => {
+                assert_eq!(parts.len(), 1);
+                assert!(matches!(&parts[0], MessagePart::Image(img)
+                    if img.mime_type == "image/jpeg"));
+            }
+            other => panic!("expected MultiPart, got {other:?}"),
+        }
     }
 
     // ── resolve_provider ────────────────────────────────────────────
diff --git a/crates/helexa-acp/src/compaction.rs b/crates/helexa-acp/src/compaction.rs
index 2594e49..a8ccabe 100644
--- a/crates/helexa-acp/src/compaction.rs
+++ b/crates/helexa-acp/src/compaction.rs
@@ -32,7 +32,7 @@
 //! (over-estimates tokens slightly) so we compact a touch early
 //! rather than a touch late.
 
-use crate::provider::{Message, MessageContent, Role};
+use crate::provider::{Message, MessageContent, MessagePart, Role};
 
 /// Most-recent N messages that are never elided. Roughly "the
 /// current tool round in flight" — assistant turn that called the
@@ -54,6 +54,13 @@ const CHARS_PER_TOKEN: f32 = 3.5;
 /// to a few tokens; tiny but it adds up across long histories.
 const ENVELOPE_TOKENS: usize = 8;
 
+/// Rough per-image token cost used by the budget estimator. Real
+/// vision tokenizers vary widely (256–1024 tokens for typical
+/// resolutions on Qwen3-VL, OpenAI's `low`/`high` detail toggles
+/// pick between ~85 and ~1000+). 512 is a defensible middle that
+/// keeps compaction from treating images as free.
+const IMAGE_TOKENS_APPROX: usize = 512;
+
 /// Stats reported back from [`compact_to_budget`] for the caller to
 /// log. The numbers are estimates (see [`estimate_tokens`]), so
 /// don't compare them to upstream-reported token counts as if they
@@ -87,6 +94,19 @@ impl CompactionStats {
 pub fn estimate_tokens(msg: &Message) -> usize {
     let chars = match &msg.content {
         MessageContent::Text { text } => text.len(),
+        MessageContent::MultiPart { parts } => parts
+            .iter()
+            .map(|p| match p {
+                MessagePart::Text { text } => text.len(),
+                // Each image is one block in the context window; the
+                // upstream tokenizer handles the real cost (and it
+                // varies wildly by model — Qwen3-VL uses ~256-1024
+                // tokens per image depending on size). Take a
+                // middle estimate so the budget tracker doesn't
+                // pretend images are free.
+                MessagePart::Image(_) => IMAGE_TOKENS_APPROX * CHARS_PER_TOKEN as usize,
+            })
+            .sum(),
         MessageContent::ToolCalls { text, calls } => {
             let txt = text.as_deref().map(|s| s.len()).unwrap_or(0);
             let calls_size: usize = calls
@@ -206,6 +226,15 @@ fn elide_in_place(msg: &mut Message) -> bool {
             *text = format!("(elided: {} bytes of assistant prose)", text.len());
             true
         }
+        MessageContent::MultiPart { .. } => {
+            // MultiPart messages today only exist as User turns,
+            // and User turns are protected by the role check in
+            // `compact_to_budget` — so this branch is unreachable
+            // for current call sites. Returning false keeps the
+            // unreachable path benign if a future stage starts
+            // emitting MultiPart on other roles.
+            false
+        }
     }
 }
 
diff --git a/crates/helexa-acp/src/provider/mod.rs b/crates/helexa-acp/src/provider/mod.rs
index c2769e2..af175e1 100644
--- a/crates/helexa-acp/src/provider/mod.rs
+++ b/crates/helexa-acp/src/provider/mod.rs
@@ -101,6 +101,11 @@ pub enum MessageContent {
     /// enum, which is incompatible with newtype-of-primitive
     /// variants.
     Text { text: String },
+    /// Mixed text + image user turn. Stage 5 introduces this when
+    /// Zed sends an `ImageContent` block alongside the user's prompt.
+    /// Providers that don't support vision should down-convert by
+    /// dropping image parts and concatenating text parts.
+    MultiPart { parts: Vec<MessagePart> },
     /// Assistant turn that called one or more tools. Stage 3 starts
     /// constructing this when the provider stream yields a
     /// `ToolCallStart` / `ToolCallArgsDelta` sequence.
@@ -117,6 +122,31 @@ pub enum MessageContent {
     },
 }
 
+/// One part of a [`MessageContent::MultiPart`] message.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum MessagePart {
+    Text { text: String },
+    Image(ImageData),
+}
+
+/// Inline image attachment. `data` is base64-encoded raw image
+/// bytes; the encoder constructs an `image_url` data URI from it
+/// at request time. `uri` carries any pointer the client supplied
+/// (e.g. `file:///tmp/x.png`) — we keep it on the message for
+/// debugging / future providers but the OpenAI encoder ignores it
+/// when `data` is present (data wins, since it round-trips through
+/// every wire format).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ImageData {
+    pub mime_type: String,
+    /// Base64-encoded image bytes (no `data:` prefix, no padding
+    /// stripped — exactly what `ImageContent.data` carried).
+    pub data: String,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub uri: Option<String>,
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ToolCall {
     /// Provider-assigned id that ties the call to its result. The
diff --git a/crates/helexa-acp/src/provider/openai_chat.rs b/crates/helexa-acp/src/provider/openai_chat.rs
index 6497808..6d5bd8c 100644
--- a/crates/helexa-acp/src/provider/openai_chat.rs
+++ b/crates/helexa-acp/src/provider/openai_chat.rs
@@ -14,8 +14,8 @@ use serde_json::{Value, json};
 use tokio_util::sync::CancellationToken;
 
 use super::{
-    CompletionEvent, CompletionRequest, Message, MessageContent, ModelInfo, Provider, Role,
-    ToolSpec, UsageStats,
+    CompletionEvent, CompletionRequest, ImageData, Message, MessageContent, MessagePart, ModelInfo,
+    Provider, Role, ToolSpec, UsageStats,
 };
 use crate::config::EndpointConfig;
 
@@ -189,6 +189,71 @@ mod tests {
         assert_eq!(body["stream_options"]["include_usage"], true);
     }
 
+    #[test]
+    fn encodes_user_multipart_with_image_as_content_array() {
+        let req = CompletionRequest {
+            model: "vl".into(),
+            messages: vec![Message {
+                role: Role::User,
+                content: MessageContent::MultiPart {
+                    parts: vec![
+                        MessagePart::Text {
+                            text: "what's in this?".into(),
+                        },
+                        MessagePart::Image(ImageData {
+                            mime_type: "image/png".into(),
+                            data: "iVBORw0KGgo=".into(),
+                            uri: None,
+                        }),
+                    ],
+                },
+            }],
+            tools: vec![],
+            temperature: None,
+            top_p: None,
+            max_tokens: None,
+        };
+        let body = encode_request(&req);
+        let msg = &body["messages"][0];
+        assert_eq!(msg["role"], "user");
+        let content = msg["content"].as_array().expect("content array");
+        assert_eq!(content.len(), 2);
+        assert_eq!(content[0]["type"], "text");
+        assert_eq!(content[0]["text"], "what's in this?");
+        assert_eq!(content[1]["type"], "image_url");
+        assert_eq!(
+            content[1]["image_url"]["url"],
+            "data:image/png;base64,iVBORw0KGgo="
+        );
+    }
+
+    #[test]
+    fn encodes_text_only_user_message_as_string() {
+        // Regression: even though we accept MultiPart now, the
+        // string form must stay the encoded shape for plain text
+        // messages — some upstreams treat array-form as a vision
+        // request and refuse on text-only models.
+        let req = CompletionRequest {
+            model: "m".into(),
+            messages: vec![Message {
+                role: Role::User,
+                content: MessageContent::Text {
+                    text: "plain".into(),
+                },
+            }],
+            tools: vec![],
+            temperature: None,
+            top_p: None,
+            max_tokens: None,
+        };
+        let body = encode_request(&req);
+        assert_eq!(body["messages"][0]["content"], "plain");
+        assert!(
+            body["messages"][0]["content"].as_array().is_none(),
+            "text-only must not become array form"
+        );
+    }
+
     #[test]
     fn encodes_tool_call_round_trip() {
         let req = CompletionRequest {
@@ -510,6 +575,10 @@ fn encode_message(m: &Message) -> Value {
             json!({"role": "system", "content": text})
         }
         (Role::User, MessageContent::Text { text }) => json!({"role": "user", "content": text}),
+        (Role::User, MessageContent::MultiPart { parts }) => json!({
+            "role": "user",
+            "content": encode_user_parts(parts),
+        }),
         (Role::Assistant, MessageContent::Text { text }) => {
             json!({"role": "assistant", "content": text})
         }
@@ -549,6 +618,38 @@ fn encode_message(m: &Message) -> Value {
     }
 }
 
+/// Encode a [`MessageContent::MultiPart`] user message as the OpenAI
+/// chat content-array form:
+///
+/// ```jsonc
+/// [
+///   {"type": "text", "text": "describe this:"},
+///   {"type": "image_url", "image_url": {"url": "data:image/png;base64,…"}}
+/// ]
+/// ```
+///
+/// Images use a `data:` URI built from `mime_type` + base64 `data`.
+/// `uri` is intentionally ignored here — the inline data form
+/// round-trips through every upstream we care about (cortex's
+/// model loaders read the bytes directly, OpenAI accepts both
+/// data and remote URLs but data is portable). Sticking to one
+/// shape keeps wire-level surprises down.
+fn encode_user_parts(parts: &[MessagePart]) -> Value {
+    let items: Vec<Value> = parts
+        .iter()
+        .map(|p| match p {
+            MessagePart::Text { text } => json!({ "type": "text", "text": text }),
+            MessagePart::Image(ImageData {
+                mime_type, data, ..
+            }) => json!({
+                "type": "image_url",
+                "image_url": { "url": format!("data:{mime_type};base64,{data}") },
+            }),
+        })
+        .collect();
+    Value::Array(items)
+}
+
 fn role_str(r: Role) -> &'static str {
     match r {
         Role::System => "system",
@@ -561,6 +662,14 @@ fn role_str(r: Role) -> &'static str {
 fn content_as_text(c: &MessageContent) -> String {
     match c {
         MessageContent::Text { text } => text.clone(),
+        MessageContent::MultiPart { parts } => parts
+            .iter()
+            .filter_map(|p| match p {
+                MessagePart::Text { text } => Some(text.as_str()),
+                MessagePart::Image(_) => None,
+            })
+            .collect::<Vec<_>>()
+            .join("\n\n"),
         MessageContent::ToolCalls { text, .. } => text.clone().unwrap_or_default(),
         MessageContent::ToolResult { content, .. } => content.clone(),
     }