From d62a91ef60d3a9e92c937e9f51beb7d31ef48b1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= <mateusz.sluszniak@swmansion.com>
Date: Wed, 22 Apr 2026 10:50:12 +0200
Subject: [PATCH] fix(llm): auto-shape multimodal mediaPath messages in chat
 template
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LLMController.generate() collected imagePaths from messages with a
mediaPath but did not transform their content into the array form
([{type:'image'}, {type:'text', text}]) that the chat template needs
to emit the image placeholder. Calling generate() directly with a
vision-capable model thus threw "More images paths provided than
'<image>' placeholders in prompt" from native. sendMessage() worked
because it built its own historyForTemplate that did the transformation.

Move the transformation into applyChatTemplate so both call sites get
correct behavior, and remove the now-redundant historyForTemplate block
from sendMessage. The public Message.content type is unchanged: external
callers always pass plain strings, and the controller handles the array
form internally.

Refs #1086 (items 1 and 2 — with item 1 fixed, item 2's type mismatch
no longer surfaces because external callers never need to construct
the array form themselves).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../src/controllers/LLMController.ts          | 37 ++++++++++++-------
 1 file changed, 23 insertions(+), 14 deletions(-)
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index 8829f3710..d9d5593ae 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -354,18 +354,6 @@ export class LLMController {
     const updatedHistory = [...this._messageHistory, newMessage];
     this.messageHistoryCallback(updatedHistory);
 
-    const historyForTemplate = updatedHistory.map((m) =>
-      m.mediaPath
-        ? {
-            ...m,
-            content: [
-              { type: 'image' },
-              { type: 'text', text: m.content },
-            ] as any,
-          }
-        : m
-    );
-
     const visualTokenCount = this.nativeModule.getVisualTokenCount();
     const countTokensCallback = (messages: Message[]) => {
       const rendered = this.applyChatTemplate(
@@ -383,7 +371,7 @@ export class LLMController {
     const messageHistoryWithPrompt =
       this.chatConfig.contextStrategy.buildContext(
         this.chatConfig.systemPrompt,
-        historyForTemplate,
+        updatedHistory,
         maxContextLength,
         countTokensCallback
       );
@@ -448,7 +436,7 @@ export class LLMController {
     );
 
     const result = template.render({
-      messages,
+      messages: messagesForChatTemplate(messages),
       tools,
       ...templateFlags,
       ...specialTokens,
@@ -468,3 +456,24 @@ export class LLMController {
 function normalizeImagePath(path: string): string {
   return path.startsWith('file://') ? path : `file://${path}`;
 }
+
+/**
+ * Rewrites image-bearing messages into the structured form multimodal chat
+ * templates expect. A message with a truthy `mediaPath` and a string
+ * `content` is shallow-copied with `content` replaced by
+ * `[{ type: 'image' }, { type: 'text', text }]`; the `image` part is the
+ * placeholder slot. All other messages (no `mediaPath`, or `content` that is
+ * not a string) are returned as-is, and the input array is not mutated.
+ * @param messages - Messages to prepare for the chat template engine.
+ * @returns Messages with image-bearing turns rewritten to structured content.
+ */
+function messagesForChatTemplate(messages: Message[]): any[] {
+  return messages.map((m) =>
+    m.mediaPath && typeof m.content === 'string'
+      ? {
+          ...m,
+          content: [{ type: 'image' }, { type: 'text', text: m.content }],
+        }
+      : m
+  );
+}