From d62a91ef60d3a9e92c937e9f51beb7d31ef48b1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 22 Apr 2026 10:50:12 +0200 Subject: [PATCH] fix(llm): auto-shape multimodal mediaPath messages in chat template MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LLMController.generate() collected imagePaths from messages with a mediaPath but did not transform their content into the array form ([{type:'image'}, {type:'text', text}]) that the chat template needs to emit the image placeholder. Calling generate() directly with a vision-capable model thus threw "More images paths provided than '<image>' placeholders in prompt" from native. sendMessage() worked because it built its own historyForTemplate that did the transformation. Move the transformation into applyChatTemplate so both call sites get correct behavior, and remove the now-redundant historyForTemplate block from sendMessage. Public Message.content type unchanged; external callers always pass plain strings, the controller handles the array form internally. Refs #1086 (items 1 and 2 — with item 1 fixed, item 2's type mismatch no longer surfaces because external callers never need to construct the array form themselves). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/controllers/LLMController.ts | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index 8829f3710..d9d5593ae 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -354,18 +354,6 @@ export class LLMController { const updatedHistory = [...this._messageHistory, newMessage]; this.messageHistoryCallback(updatedHistory); - const historyForTemplate = updatedHistory.map((m) => - m.mediaPath - ? 
{ - ...m, - content: [ - { type: 'image' }, - { type: 'text', text: m.content }, - ] as any, - } - : m - ); - const visualTokenCount = this.nativeModule.getVisualTokenCount(); const countTokensCallback = (messages: Message[]) => { const rendered = this.applyChatTemplate( @@ -383,7 +371,7 @@ export class LLMController { const messageHistoryWithPrompt = this.chatConfig.contextStrategy.buildContext( this.chatConfig.systemPrompt, - historyForTemplate, + updatedHistory, maxContextLength, countTokensCallback ); @@ -448,7 +436,7 @@ export class LLMController { ); const result = template.render({ - messages, + messages: messagesForChatTemplate(messages), tools, ...templateFlags, ...specialTokens, @@ -468,3 +456,24 @@ export class LLMController { function normalizeImagePath(path: string): string { return path.startsWith('file://') ? path : `file://${path}`; } + +/** + * Multimodal chat templates expect message content for image-bearing turns + * to be an array of content parts with an `image` part as a placeholder. + * Callers of `LLMController.generate` and `LLMController.sendMessage` pass + * messages with a plain string `content` plus an optional `mediaPath`; this + * helper rewrites them into the structured form that the template engine + * understands. + * @param messages - Messages to prepare for the chat template engine. + * @returns Messages with image-bearing turns rewritten to structured content. + */ +function messagesForChatTemplate(messages: Message[]): any[] { + return messages.map((m) => + m.mediaPath && typeof m.content === 'string' + ? { + ...m, + content: [{ type: 'image' }, { type: 'text', text: m.content }], + } + : m + ); +}