diff --git a/src/llm/BUILD b/src/llm/BUILD index 0195973540..10d8a33c70 100644 --- a/src/llm/BUILD +++ b/src/llm/BUILD @@ -60,12 +60,23 @@ ovms_cc_library( "//src:image_conversion", "//src/filesystem:libovmsfilesystem", "@stb//:image", + ":canonical_request", ":openai_request", ":output_parsers", "//third_party:genai",], visibility = ["//visibility:public"], ) +ovms_cc_library( + name = "canonical_request", + hdrs = ["preprocessing/canonical_request.hpp"], + deps = [ + "//third_party:genai", + ":openai_request", + ], + visibility = ["//visibility:public"], +) + ovms_cc_library( name = "openai_completions_api_handler", hdrs = ["apis/openai_completions.hpp", "apis/openai_json_response.hpp"], @@ -296,6 +307,7 @@ ovms_cc_library( "language_model/continuous_batching/llm_executor.hpp", "language_model/continuous_batching/servable_initializer.hpp", "visual_language_model/continuous_batching/servable.hpp", + "visual_language_model/image_prompt_utils.hpp", "language_model/legacy/servable.hpp", "language_model/legacy/servable_initializer.hpp", "language_model/legacy/legacy_executor.hpp", @@ -307,6 +319,7 @@ ovms_cc_library( "servable_initializer.cpp", "language_model/continuous_batching/servable.cpp", "language_model/continuous_batching/servable_initializer.cpp", + "visual_language_model/image_prompt_utils.cpp", "visual_language_model/continuous_batching/servable.cpp", "language_model/legacy/servable.cpp", "language_model/legacy/servable_initializer.cpp", diff --git a/src/llm/apis/openai_api_handler.cpp b/src/llm/apis/openai_api_handler.cpp index c52136d67c..e08ebdb32c 100644 --- a/src/llm/apis/openai_api_handler.cpp +++ b/src/llm/apis/openai_api_handler.cpp @@ -348,11 +348,9 @@ absl::Status OpenAIApiHandler::parseTools() { return absl::InvalidArgumentError("tool_choice is not a valid JSON object or string"); } } - bool jsonChanged = false; if (toolChoice == "none") { // remove tools from the request doc.RemoveMember("tools"); - jsonChanged = true; } auto it = doc.FindMember("tools"); if (it != doc.MemberEnd() && !it->value.IsNull()) { @@ -405,7 +403,6 @@ absl::Status OpenAIApiHandler::parseTools() { // If toolChoice is set to a specific function name, we keep only that tool if (toolChoice != "auto" && toolChoice != "required" && toolChoice != functionName) { it->value.Erase(&obj); - jsonChanged = true; continue; } @@ -430,16 +427,10 @@ absl::Status OpenAIApiHandler::parseTools() { } request.toolChoice = toolChoice; - if (jsonChanged) { - StringBuffer buffer; - Writer writer(buffer); - doc.Accept(writer); - request.processedJson = buffer.GetString(); - } return absl::OkStatus(); } -absl::StatusOr> OpenAIApiHandler::parseToolsToJsonContainer() { +absl::StatusOr> OpenAIApiHandler::parseToolsToJsonContainer() const { auto it = doc.FindMember("tools"); if (it == doc.MemberEnd() || it->value.IsNull()) { return std::nullopt; @@ -460,7 +451,7 @@ absl::StatusOr> OpenAIApiHandler::parseT } } -absl::StatusOr> OpenAIApiHandler::parseChatTemplateKwargsToJsonContainer() { +absl::StatusOr> OpenAIApiHandler::parseChatTemplateKwargsToJsonContainer() const { auto it = doc.FindMember("chat_template_kwargs"); if (it == doc.MemberEnd() || it->value.IsNull()) { return std::nullopt; @@ -492,15 +483,47 @@ const OpenAIRequest& OpenAIApiHandler::getRequest() const { return request; } +absl::StatusOr OpenAIApiHandler::buildCanonicalRequest(RendererType rendererType) const { + return buildCanonicalRequestImpl(rendererType); +} + +absl::StatusOr OpenAIApiHandler::getCanonicalRequest(RendererType rendererType) const { + auto& cache = (rendererType == RendererType::CPP_TOKENIZER) ? cachedCppCanonicalRequest : cachedPyCanonicalRequest; + if (!cache.has_value()) { + auto canonical = buildCanonicalRequest(rendererType); + if (!canonical.ok()) { + return canonical.status(); + } + cache = std::move(canonical.value()); + } + return &(*cache); +} + const std::string& OpenAIApiHandler::getProcessedJson() const { - return request.processedJson; + auto canonicalRequest = getCanonicalRequest(RendererType::PY_JINJA); + if (canonicalRequest.ok()) { + const auto* pyPath = std::get_if(canonicalRequest.value()); + if (pyPath != nullptr) { + return pyPath->processedJson.get(); + } + } + static const std::string EMPTY_JSON{}; + return EMPTY_JSON; } const ImageHistory& OpenAIApiHandler::getImageHistory() const { + auto canonicalRequest = getCanonicalRequest(RendererType::CPP_TOKENIZER); + if (!canonicalRequest.ok()) { + return request.imageHistory; + } return request.imageHistory; } ov::genai::ChatHistory& OpenAIApiHandler::getChatHistory() { + auto canonicalRequest = getCanonicalRequest(RendererType::CPP_TOKENIZER); + if (!canonicalRequest.ok()) { + return request.chatHistory; + } return request.chatHistory; } @@ -512,7 +535,6 @@ std::optional OpenAIApiHandler::getResponseFormat() const { return request.responseFormat; } -std::optional OpenAIApiHandler::getPrompt() const { return request.prompt; } std::optional OpenAIApiHandler::getNumReturnSequences() const { return request.numReturnSequences; } StreamOptions OpenAIApiHandler::getStreamOptions() const { return request.streamOptions; } diff --git a/src/llm/apis/openai_api_handler.hpp b/src/llm/apis/openai_api_handler.hpp index 30d29c0d21..77cdc81042 100644 --- a/src/llm/apis/openai_api_handler.hpp +++ b/src/llm/apis/openai_api_handler.hpp @@ -37,6 +37,7 @@ #pragma warning(pop) #include "../io_processing/output_parser.hpp" #include "openai_request.hpp" +#include "../preprocessing/canonical_request.hpp" // Forward declarations for types only used by reference in virtual method signatures namespace ov { @@ -119,6 +120,13 @@ class OpenAIApiHandler { // Shared VLM workaround: encode text to tokens using tokenizer, validates shape std::vector encodeTextToTokens(const std::string& text); + virtual absl::StatusOr buildCanonicalRequestImpl(RendererType rendererType) const = 0; + absl::StatusOr buildCanonicalRequest(RendererType rendererType) const; + + mutable std::optional cachedCppCanonicalRequest; + mutable std::optional cachedPyCanonicalRequest; + mutable std::optional synthesizedProcessedJson; + public: OpenAIApiHandler(Document& doc, Endpoint endpoint, std::chrono::time_point creationTime, ov::genai::Tokenizer tokenizer, const std::string& toolParserName = "", const std::string& reasoningParserName = "") : @@ -147,18 +155,18 @@ class OpenAIApiHandler { // Shared parsing (non-virtual) absl::Status parseTools(); - absl::StatusOr> parseToolsToJsonContainer(); - absl::StatusOr> parseChatTemplateKwargsToJsonContainer(); + absl::StatusOr> parseToolsToJsonContainer() const; + absl::StatusOr> parseChatTemplateKwargsToJsonContainer() const; const bool areToolsAvailable() const; // Accessors (non-virtual) const OpenAIRequest& getRequest() const; - std::optional getPrompt() const; std::optional getNumReturnSequences() const; StreamOptions getStreamOptions() const; const std::string& getProcessedJson() const; const ImageHistory& getImageHistory() const; ov::genai::ChatHistory& getChatHistory(); + absl::StatusOr getCanonicalRequest(RendererType rendererType) const; std::optional getMaxTokens() const; std::optional getResponseFormat() const; bool isStream() const; diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 89009c0d74..e65ffcad9e 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -75,11 +75,9 @@ absl::Status OpenAIChatCompletionsHandler::parseCompletionsPart() { if (it != doc.MemberEnd()) { if (!it->value.IsString()) { return absl::InvalidArgumentError("prompt is not a string"); - } else { - request.prompt = it->value.GetString(); } } - if (!request.prompt.has_value() || !request.prompt.value().size()) { + if (it == doc.MemberEnd() || it->value.GetStringLength() == 0) { return absl::Status(absl::StatusCode::kInvalidArgument, "prompt is missing"); } // logprobs: int; 1 value allowed @@ -265,16 +263,45 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional writer(buffer); - doc.Accept(writer); - request.processedJson = buffer.GetString(); - } SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Parsed messages successfully"); return absl::OkStatus(); } +absl::StatusOr OpenAIChatCompletionsHandler::buildCanonicalRequestImpl(RendererType rendererType) const { + if (rendererType == RendererType::CPP_TOKENIZER) { + auto tools = parseToolsToJsonContainer(); + if (!tools.ok()) { + return tools.status(); + } + auto kwargs = parseChatTemplateKwargsToJsonContainer(); + if (!kwargs.ok()) { + return kwargs.status(); + } + std::optional rawPrompt; + if (endpoint == Endpoint::COMPLETIONS) { + auto promptIt = doc.FindMember("prompt"); + if (promptIt != doc.MemberEnd() && promptIt->value.IsString()) { + rawPrompt = std::string(promptIt->value.GetString(), promptIt->value.GetStringLength()); + } + } + CppPath cppPath{ + std::cref(request.chatHistory), + std::cref(request.imageHistory), + std::move(tools.value()), + std::move(kwargs.value()), + std::move(rawPrompt), + true}; + return CanonicalRequest(std::move(cppPath)); + } + + StringBuffer buffer; + Writer writer(buffer); + doc.Accept(writer); + synthesizedProcessedJson = buffer.GetString(); + PyPath pyPath{std::cref(synthesizedProcessedJson.value())}; + return CanonicalRequest(std::move(pyPath)); +} + // --- Unary response serialization --- std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vector& generationOutputs) { diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index cbb8f2645f..6f93be80c0 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -36,6 +36,7 @@ class OpenAIChatCompletionsHandler : public OpenAIApiHandler { absl::Status parseRequest(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength, std::optional allowedLocalMediaPath = std::nullopt, std::optional> allowedMediaDomains = std::nullopt) override; absl::Status parseMessages(std::optional allowedLocalMediaPath = std::nullopt, std::optional> allowedMediaDomains = std::nullopt); + absl::StatusOr buildCanonicalRequestImpl(RendererType rendererType) const override; std::string serializeUnaryResponse(const std::vector& generationOutputs) override; std::string serializeUnaryResponse(ov::genai::EncodedResults& results) override; diff --git a/src/llm/apis/openai_request.hpp b/src/llm/apis/openai_request.hpp index 24327be44f..47ba0370d1 100644 --- a/src/llm/apis/openai_request.hpp +++ b/src/llm/apis/openai_request.hpp @@ -41,9 +41,7 @@ struct StreamOptions { // Class that maps OpenAI request content. struct OpenAIRequest { ov::genai::ChatHistory chatHistory; - std::string processedJson; ImageHistory imageHistory; - std::optional prompt{std::nullopt}; bool stream{false}; StreamOptions streamOptions; std::string model; diff --git a/src/llm/apis/openai_responses.cpp b/src/llm/apis/openai_responses.cpp index 1f984722dc..4c90835e24 100644 --- a/src/llm/apis/openai_responses.cpp +++ b/src/llm/apis/openai_responses.cpp @@ -648,14 +648,14 @@ absl::Status OpenAIResponsesHandler::parseInput(std::optional allow } if (inputIt->value.IsString()) { - request.prompt = inputIt->value.GetString(); - if (request.prompt.value().empty()) { + const std::string inputText(inputIt->value.GetString(), inputIt->value.GetStringLength()); + if (inputText.empty()) { return absl::InvalidArgumentError("input cannot be empty"); } request.chatHistory.push_back({}); request.chatHistory.last()["role"] = "user"; - request.chatHistory.last()["content"] = request.prompt.value(); + request.chatHistory.last()["content"] = inputText; } else if (inputIt->value.IsArray()) { if (inputIt->value.GetArray().Size() == 0) { return absl::InvalidArgumentError("input array must not be empty"); @@ -749,64 +749,6 @@ absl::Status OpenAIResponsesHandler::parseResponsesPart(std::optional return toolsStatus; } -#if (PYTHON_DISABLE == 0) - // Build processedJson with a "messages" array in chat/completions format so that - // the Python Jinja template path can consume Responses API input without a separate code path. - // Handles reasoning, function_call (merged into assistant tool_calls), and - // function_call_output (converted to role:tool messages). - // - // Built after parseTools() so any tool filtering (e.g. tool_choice removing - // unselected tools) is reflected here, and so parseTools()'s own write to - // request.processedJson (Responses-shaped doc with "input") does not - // clobber the chat/completions-shaped JSON the Python Jinja path expects. - { - Document processedDoc; - processedDoc.SetObject(); - auto& alloc = processedDoc.GetAllocator(); - - Value messagesArray(kArrayType); - - auto inputArrIt = doc.FindMember("input"); - if (inputArrIt != doc.MemberEnd() && inputArrIt->value.IsArray()) { - ProcessedJsonSink sink(messagesArray, alloc); - ResponsesInputBuilder builder(sink); - auto processedStatus = builder.build(inputArrIt->value); - if (!processedStatus.ok()) { - return processedStatus; - } - } else if (inputArrIt != doc.MemberEnd() && inputArrIt->value.IsString()) { - // String input: emit a single user message so the Python Jinja path - // sees the same content the C++ chatHistory path does. - Value msgObj(kObjectType); - msgObj.AddMember("role", Value("user", alloc), alloc); - msgObj.AddMember("content", Value(inputArrIt->value.GetString(), alloc), alloc); - messagesArray.PushBack(msgObj, alloc); - } - - processedDoc.AddMember("messages", messagesArray, alloc); - - // Tools were already normalised to chat/completions nested format by - // convertResponsesToolsInPlace earlier in parseResponsesPart — just copy verbatim. - auto processedToolsIt = doc.FindMember("tools"); - if (processedToolsIt != doc.MemberEnd() && !processedToolsIt->value.IsNull()) { - Value toolsCopy(processedToolsIt->value, alloc); - processedDoc.AddMember("tools", toolsCopy, alloc); - } - - // Copy chat_template_kwargs from original doc if present - auto kwargsIt = doc.FindMember("chat_template_kwargs"); - if (kwargsIt != doc.MemberEnd() && !kwargsIt->value.IsNull()) { - Value kwargsCopy(kwargsIt->value, alloc); - processedDoc.AddMember("chat_template_kwargs", kwargsCopy, alloc); - } - - StringBuffer buffer; - Writer writer(buffer); - processedDoc.Accept(writer); - request.processedJson = buffer.GetString(); - } -#endif - // max_output_tokens: uint; optional // OpenAI Responses API uses this field for output token limit. it = doc.FindMember("max_output_tokens"); @@ -829,6 +771,84 @@ absl::Status OpenAIResponsesHandler::parseResponsesPart(std::optional return parseResponseFormat(); } +absl::StatusOr OpenAIResponsesHandler::buildCanonicalRequestImpl(RendererType rendererType) const { + if (rendererType == RendererType::CPP_TOKENIZER) { + auto tools = parseToolsToJsonContainer(); + if (!tools.ok()) { + return tools.status(); + } + auto kwargs = parseChatTemplateKwargsToJsonContainer(); + if (!kwargs.ok()) { + return kwargs.status(); + } + CppPath cppPath{ + std::cref(request.chatHistory), + std::cref(request.imageHistory), + std::move(tools.value()), + std::move(kwargs.value()), + std::nullopt, + true}; + return CanonicalRequest(std::move(cppPath)); + } + +#if (PYTHON_DISABLE == 0) + Document processedDoc; + processedDoc.SetObject(); + auto& alloc = processedDoc.GetAllocator(); + + Value messagesArray(kArrayType); + + auto inputArrIt = doc.FindMember("input"); + if (inputArrIt != doc.MemberEnd() && inputArrIt->value.IsArray()) { + ProcessedJsonSink sink(messagesArray, alloc); + ResponsesInputBuilder builder(sink); + auto processedStatus = builder.build(inputArrIt->value); + if (!processedStatus.ok()) { + return processedStatus; + } + } else if (inputArrIt != doc.MemberEnd() && inputArrIt->value.IsString()) { + // String input: emit a single user message so the Python Jinja path + // sees the same content the C++ chatHistory path does. + Value msgObj(kObjectType); + msgObj.AddMember("role", Value("user", alloc), alloc); + msgObj.AddMember("content", Value(inputArrIt->value.GetString(), alloc), alloc); + messagesArray.PushBack(msgObj, alloc); + } + + processedDoc.AddMember("messages", messagesArray, alloc); + + // Tools were already normalised to chat/completions nested format by + // convertResponsesToolsInPlace in parseResponsesPart — just copy verbatim. + auto processedToolsIt = doc.FindMember("tools"); + if (processedToolsIt != doc.MemberEnd() && !processedToolsIt->value.IsNull()) { + Value toolsCopy(processedToolsIt->value, alloc); + processedDoc.AddMember("tools", toolsCopy, alloc); + } + + // Copy chat_template_kwargs from original doc if present. + auto kwargsIt = doc.FindMember("chat_template_kwargs"); + if (kwargsIt != doc.MemberEnd() && !kwargsIt->value.IsNull()) { + Value kwargsCopy(kwargsIt->value, alloc); + processedDoc.AddMember("chat_template_kwargs", kwargsCopy, alloc); + } + + StringBuffer buffer; + Writer writer(buffer); + processedDoc.Accept(writer); + synthesizedProcessedJson = buffer.GetString(); + PyPath pyPath{std::cref(synthesizedProcessedJson.value())}; + return CanonicalRequest(std::move(pyPath)); +#else + // When Python support is disabled, keep a best-effort canonical payload. + StringBuffer buffer; + Writer writer(buffer); + doc.Accept(writer); + synthesizedProcessedJson = buffer.GetString(); + PyPath pyPath{std::cref(synthesizedProcessedJson.value())}; + return CanonicalRequest(std::move(pyPath)); +#endif +} + // --- Serialization helpers --- void OpenAIResponsesHandler::serializeToolChoice(Writer& writer) const { diff --git a/src/llm/apis/openai_responses.hpp b/src/llm/apis/openai_responses.hpp index 6a10400952..dc30bc6111 100644 --- a/src/llm/apis/openai_responses.hpp +++ b/src/llm/apis/openai_responses.hpp @@ -94,6 +94,7 @@ class OpenAIResponsesHandler : public OpenAIApiHandler { absl::Status parseRequest(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength, std::optional allowedLocalMediaPath = std::nullopt, std::optional> allowedMediaDomains = std::nullopt) override; + absl::StatusOr buildCanonicalRequestImpl(RendererType rendererType) const override; std::string serializeUnaryResponse(const std::vector& generationOutputs) override; std::string serializeUnaryResponse(ov::genai::EncodedResults& results) override; diff --git a/src/llm/preprocessing/canonical_request.hpp b/src/llm/preprocessing/canonical_request.hpp new file mode 100644 index 0000000000..6ad57939ef --- /dev/null +++ b/src/llm/preprocessing/canonical_request.hpp @@ -0,0 +1,55 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace ovms { + +// Forward declarations +using ImageHistory = std::vector>; + +enum class RendererType { + CPP_TOKENIZER, + PY_JINJA, +}; + +// For C++ renderer path (tokenizer.apply_chat_template) +struct CppPath { + std::reference_wrapper chatHistory; + std::reference_wrapper imageHistory; + std::optional tools; + std::optional chatTemplateKwargs; + std::optional rawPrompt; + bool addGenerationPrompt = true; +}; + +// For Python Jinja renderer path +struct PyPath { + std::reference_wrapper processedJson; +}; + +// Single variant type: either C++ data or Python data, never both +using CanonicalRequest = std::variant; + +} // namespace ovms diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index b4d5ca8185..f230b3990c 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -37,10 +37,168 @@ #include "apis/openai_responses.hpp" #include "servable.hpp" #include "text_utils.hpp" +#include #include "../tokenize/tokenize_parser.hpp" namespace ovms { +namespace { + +#if (PYTHON_DISABLE != 0) +absl::Status applyTokenizerChatTemplate( + const CppPath& cppPath, + const std::shared_ptr& properties, + std::string& inputText) { + try { + inputText = properties->tokenizer.apply_chat_template(cppPath.chatHistory.get(), cppPath.addGenerationPrompt, {}, cppPath.tools, cppPath.chatTemplateKwargs); + } catch (const std::exception& e) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); + return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); + } + return absl::OkStatus(); +} +#endif + +#if (PYTHON_DISABLE == 0) +absl::Status applyPythonChatTemplate( + const PyPath& pyPath, + const std::shared_ptr& properties, + std::string& inputText) { + bool success = PyJinjaTemplateProcessor::applyChatTemplate(properties->templateProcessor, properties->modelsPath, pyPath.processedJson.get(), inputText); + if (!success) { + return absl::Status(absl::StatusCode::kInvalidArgument, inputText); + } + return absl::OkStatus(); +} +#endif +absl::Status detectImplicitReasoningStartIfNeeded( + const std::shared_ptr& executionContext, + const std::string& inputText) { + if (executionContext->apiHandler->getOutputParser() != nullptr) { + executionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(inputText); + } + return absl::OkStatus(); +} + +absl::Status buildChatCompletionsInputText( + const CanonicalRequest& canonicalRequest, + const std::shared_ptr& executionContext, + const std::shared_ptr& properties, + std::string& inputText) { +#if (PYTHON_DISABLE == 0) + const auto* pyPath = std::get_if(&canonicalRequest); + if (pyPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for Python renderer"); + } + auto status = applyPythonChatTemplate(*pyPath, properties, inputText); + if (!status.ok()) { + return status; + } +#else + const auto* cppPath = std::get_if(&canonicalRequest); + if (cppPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for C++ renderer"); + } + auto status = applyTokenizerChatTemplate(*cppPath, properties, inputText); + if (!status.ok()) { + return status; + } +#endif + if (inputText.size() == 0) { + return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); + } + return detectImplicitReasoningStartIfNeeded(executionContext, inputText); +} + +absl::Status buildResponsesInputText( + const CanonicalRequest& canonicalRequest, + const std::shared_ptr& executionContext, + const std::shared_ptr& properties, + std::string& inputText) { +#if (PYTHON_DISABLE == 0) + const auto* pyPath = std::get_if(&canonicalRequest); + if (pyPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for Python renderer"); + } + auto status = applyPythonChatTemplate(*pyPath, properties, inputText); + if (!status.ok()) { + return status; + } +#else + const auto* cppPath = std::get_if(&canonicalRequest); + if (cppPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for C++ renderer"); + } + if (cppPath->chatHistory.get().size() > 0) { + auto status = applyTokenizerChatTemplate(*cppPath, properties, inputText); + if (!status.ok()) { + return status; + } + } else { + auto prompt = cppPath->rawPrompt; + if (!prompt.has_value()) { + return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing"); + } + inputText = prompt.value(); + return absl::OkStatus(); + } +#endif + if (inputText.size() == 0) { + return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); + } + return detectImplicitReasoningStartIfNeeded(executionContext, inputText); +} + +absl::Status buildInputTextForEndpoint( + const CanonicalRequest& canonicalRequest, + const std::shared_ptr& executionContext, + const std::shared_ptr& properties, + std::string& inputText) { + switch (executionContext->endpoint) { + case Endpoint::CHAT_COMPLETIONS: + return buildChatCompletionsInputText(canonicalRequest, executionContext, properties, inputText); + case Endpoint::RESPONSES: + return buildResponsesInputText(canonicalRequest, executionContext, properties, inputText); + case Endpoint::COMPLETIONS: + if (const auto* cppPath = std::get_if(&canonicalRequest)) { + if (!cppPath->rawPrompt.has_value()) { + return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing"); + } + inputText = cppPath->rawPrompt.value(); + return absl::OkStatus(); + } + return absl::InternalError("Canonical request path mismatch for completions endpoint"); + case Endpoint::TOKENIZE: + return absl::InternalError("Tokenize endpoint should not reach prepareInputs stage"); + } + return absl::InternalError("Unsupported endpoint"); +} + +absl::Status encodeAndValidateInputIds( + const std::shared_ptr& executionContext, + const std::shared_ptr& properties, + const std::string& inputText) { + bool encodeAddSpecialTokens = (executionContext->endpoint == Endpoint::COMPLETIONS); + executionContext->inputIds = properties->tokenizer.encode(inputText, ov::genai::add_special_tokens(encodeAddSpecialTokens)).input_ids; + if (properties->maxModelLength.has_value()) { + if (executionContext->inputIds.get_size() > properties->maxModelLength.value()) { + std::stringstream ss; + ss << "Number of prompt tokens: " << executionContext->inputIds.get_size() << " exceeds model max length: " << properties->maxModelLength.value(); + SPDLOG_LOGGER_ERROR(llm_calculator_logger, ss.str()); + return absl::Status(absl::StatusCode::kInvalidArgument, ss.str()); + } + if (executionContext->apiHandler->getMaxTokens().has_value() && executionContext->inputIds.get_size() + executionContext->apiHandler->getMaxTokens().value() > properties->maxModelLength.value()) { + std::stringstream ss; + ss << "Number of prompt tokens: " << executionContext->inputIds.get_size() << " + max tokens value: " << executionContext->apiHandler->getMaxTokens().value() << " exceeds model max length: " << properties->maxModelLength.value(); + SPDLOG_LOGGER_ERROR(llm_calculator_logger, ss.str()); + return absl::Status(absl::StatusCode::kInvalidArgument, ss.str()); + } + } + return absl::OkStatus(); +} + +} // namespace + void GenAiServable::determineDecodingMethod() { getProperties()->decodingMethod = DecodingMethod::STANDARD; auto& pluginConfig = getProperties()->pluginConfig; @@ -170,123 +328,56 @@ absl::Status GenAiServable::parseRequest(std::shared_ptr& executionContext) { + auto properties = getProperties(); if (executionContext->apiHandler == nullptr) { return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized"); } + auto cppCanonicalRequest = executionContext->apiHandler->getCanonicalRequest(RendererType::CPP_TOKENIZER); + if (!cppCanonicalRequest.ok()) { + return cppCanonicalRequest.status(); + } + const auto* cppPath = std::get_if(cppCanonicalRequest.value()); + if (cppPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for C++ renderer"); + } + // Base servable cannot process images - if (executionContext->apiHandler->getImageHistory().size() > 0) { + if (cppPath->imageHistory.get().size() > 0) { return absl::InternalError("This servable supports only text input, but image_url has been provided"); } - std::string inputText; - switch (executionContext->endpoint) { - case Endpoint::CHAT_COMPLETIONS: { #if (PYTHON_DISABLE == 0) - bool success; - if (executionContext->apiHandler->getProcessedJson().size() > 0) { - success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); - } else { - success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->payload.body, inputText); - } - if (!success) { - return absl::Status(absl::StatusCode::kInvalidArgument, inputText); - } + RendererType rendererType = (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) + ? RendererType::PY_JINJA + : RendererType::CPP_TOKENIZER; #else - ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory(); - constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded - auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolsStatus.ok()) { - return toolsStatus.status(); - } - const auto& tools = toolsStatus.value(); - auto chatTemplateKwargsStatus = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsStatus.ok()) { - return chatTemplateKwargsStatus.status(); - } - const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value(); - try { - inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); - } catch (const std::exception& e) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); - return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); - } + RendererType rendererType = RendererType::CPP_TOKENIZER; #endif - if (inputText.size() == 0) { - return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); - } - if (executionContext->apiHandler->getOutputParser() != nullptr) { - executionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(inputText); - } - break; - } - case Endpoint::RESPONSES: { - if (executionContext->apiHandler->getChatHistory().size() > 0) { + const CanonicalRequest* canonicalRequest = cppCanonicalRequest.value(); #if (PYTHON_DISABLE == 0) - bool success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); - if (!success) { - return absl::Status(absl::StatusCode::kInvalidArgument, inputText); - } -#else - ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory(); - constexpr bool addGenerationPrompt = true; - auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolsStatus.ok()) { - return toolsStatus.status(); - } - const auto& tools = toolsStatus.value(); - auto chatTemplateKwargsStatus = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsStatus.ok()) { - return chatTemplateKwargsStatus.status(); - } - const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value(); - try { - inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); - } catch (const std::exception& e) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); - return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); - } -#endif - if (inputText.size() == 0) { - return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); - } - if (executionContext->apiHandler->getOutputParser() != nullptr) { - executionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(inputText); - } - } else { - auto prompt = executionContext->apiHandler->getPrompt(); - if (!prompt.has_value()) { - return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing"); - } - inputText = prompt.value(); + if (rendererType == RendererType::PY_JINJA) { + auto pyCanonicalRequest = executionContext->apiHandler->getCanonicalRequest(RendererType::PY_JINJA); + if (!pyCanonicalRequest.ok()) { + return pyCanonicalRequest.status(); } - break; - } - case Endpoint::COMPLETIONS: { - inputText = executionContext->apiHandler->getPrompt().value(); - break; + canonicalRequest = pyCanonicalRequest.value(); } - case Endpoint::TOKENIZE: - return absl::InternalError("Tokenize endpoint should not reach prepareInputs stage"); +#endif + + std::string inputText; + auto inputTextStatus = buildInputTextForEndpoint(*canonicalRequest, executionContext, properties, inputText); + if (!inputTextStatus.ok()) { + return inputTextStatus; } + if (Config::instance().getServerSettings().verboseResponse) { executionContext->apiHandler->enableVerboseResponse(inputText); } - bool encodeAddSpecialTokens = (executionContext->endpoint == Endpoint::COMPLETIONS); - executionContext->inputIds = getProperties()->tokenizer.encode(inputText, ov::genai::add_special_tokens(encodeAddSpecialTokens)).input_ids; - if (getProperties()->maxModelLength.has_value()) { - if (executionContext->inputIds.get_size() > getProperties()->maxModelLength.value()) { - std::stringstream ss; - ss << "Number of prompt tokens: " << executionContext->inputIds.get_size() << " exceeds model max length: " << getProperties()->maxModelLength.value(); - SPDLOG_LOGGER_ERROR(llm_calculator_logger, ss.str()); - return absl::Status(absl::StatusCode::kInvalidArgument, ss.str()); - } - if (executionContext->apiHandler->getMaxTokens().has_value() && executionContext->inputIds.get_size() + executionContext->apiHandler->getMaxTokens().value() > getProperties()->maxModelLength.value()) { - std::stringstream ss; - ss << "Number of prompt tokens: " << executionContext->inputIds.get_size() << " + max tokens value: " << executionContext->apiHandler->getMaxTokens().value() << " exceeds model max length: " << getProperties()->maxModelLength.value(); - SPDLOG_LOGGER_ERROR(llm_calculator_logger, ss.str()); - return absl::Status(absl::StatusCode::kInvalidArgument, ss.str()); - } + + auto encodeStatus = encodeAndValidateInputIds(executionContext, properties, inputText); + if (!encodeStatus.ok()) { + return encodeStatus; } executionContext->apiHandler->setPromptTokensUsage(executionContext->inputIds.get_size()); diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 7779d9c0be..1be2afae64 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -20,11 +20,13 @@ #include #include #include +#include #include #include "../../../config.hpp" #include "../../../logging.hpp" #include "../../text_utils.hpp" +#include "../image_prompt_utils.hpp" #include "../../../tokenize/tokenize_parser.hpp" namespace ovms { @@ -71,50 +73,36 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrendpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) { - ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory(); - - for (size_t i = 0; i < chatHistory.size(); i++) { - const auto& message = chatHistory[i]; - if (message["content"].as_string().value_or("").find(" tag"); - } + auto canonicalRequest = vlmExecutionContext->apiHandler->getCanonicalRequest(RendererType::CPP_TOKENIZER); + if (!canonicalRequest.ok()) { + return canonicalRequest.status(); } - - const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); - size_t imageIndex = 0; - std::unordered_map imageTags; - for (const auto& image : imageHistory) { - const auto& [chatTurnIndex, imageTensor] = image; - std::string imageTag = "\n"; - imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag; - vlmExecutionContext->inputImages.push_back(imageTensor); + const auto* cppPath = std::get_if(canonicalRequest.value()); + if (cppPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for C++ renderer"); } + ov::genai::ChatHistory chatHistory = cppPath->chatHistory.get(); - for (const auto& [chatTurnIndex, imageTagString] : imageTags) { - std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or(""); - chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent; + auto restrictedTagStatus = vlm::rejectRestrictedImageTags(chatHistory); + if (!restrictedTagStatus.ok()) { + return restrictedTagStatus; } - constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded - auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolsStatus.ok()) { - return toolsStatus.status(); + const ImageHistory& imageHistory = cppPath->imageHistory.get(); + auto imagePlacementStatus = vlm::injectImageTagsAndCollectTensors(chatHistory, imageHistory, vlmExecutionContext->inputImages); + if (!imagePlacementStatus.ok()) { + return imagePlacementStatus; } - const auto& tools = toolsStatus.value(); - auto chatTemplateKwargsStatus = vlmExecutionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsStatus.ok()) { - return chatTemplateKwargsStatus.status(); - } - const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value(); + if (llm_calculator_logger->should_log(spdlog::level::trace)) { SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatHistory messages: {}", chatHistory.get_messages().to_json_string()); SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatHistory.get_tools(): {}", chatHistory.get_tools().to_json_string()); SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatHistory.get_extra_context(): {}", chatHistory.get_extra_context().to_json_string()); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM tools: {}", tools.has_value() ? tools->to_json_string() : std::string("")); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatTemplateKwargs: {}", chatTemplateKwargs.has_value() ? chatTemplateKwargs->to_json_string() : std::string("")); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM addGenerationPrompt: {}", addGenerationPrompt); + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM tools: {}", cppPath->tools.has_value() ? cppPath->tools->to_json_string() : std::string("")); + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatTemplateKwargs: {}", cppPath->chatTemplateKwargs.has_value() ? cppPath->chatTemplateKwargs->to_json_string() : std::string("")); + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM addGenerationPrompt: {}", cppPath->addGenerationPrompt); } - vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, cppPath->addGenerationPrompt, {}, cppPath->tools, cppPath->chatTemplateKwargs); if (vlmExecutionContext->apiHandler->getOutputParser() != nullptr) { vlmExecutionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(vlmExecutionContext->inputText); } diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 9c8e02c5df..8a97494eef 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "../../../logging.hpp" @@ -38,6 +39,7 @@ #include "../../../http_payload.hpp" #include "../../../mediapipe_internal/mediapipe_utils.hpp" #include "../../text_utils.hpp" +#include "../image_prompt_utils.hpp" #include "../../../tokenize/tokenize_parser.hpp" #if (PYTHON_DISABLE == 0) #include "../../py_jinja_template_processor.hpp" @@ -280,41 +282,28 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrendpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) { - ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory(); - - for (size_t i = 0; i < chatHistory.size(); i++) { - const auto& message = chatHistory[i]; - if (message["content"].as_string().value_or("").find(" tag"); - } - } - - const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); - size_t imageIndex = 0; - std::unordered_map imageTags; - for (const auto& image : imageHistory) { - const auto& [chatTurnIndex, imageTensor] = image; - std::string imageTag = "\n"; - imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag; - vlmExecutionContext->inputImages.push_back(imageTensor); + auto canonicalRequest = vlmExecutionContext->apiHandler->getCanonicalRequest(RendererType::CPP_TOKENIZER); + if (!canonicalRequest.ok()) { + return canonicalRequest.status(); } - for (const auto& [chatTurnIndex, imageTagString] : imageTags) { - std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or(""); - chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent; + const auto* cppPath = std::get_if(canonicalRequest.value()); + if (cppPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for C++ renderer"); } + ov::genai::ChatHistory chatHistory = cppPath->chatHistory.get(); - constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded - auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolsStatus.ok()) { - return toolsStatus.status(); + auto restrictedTagStatus = vlm::rejectRestrictedImageTags(chatHistory); + if (!restrictedTagStatus.ok()) { + return restrictedTagStatus; } - const auto& tools = toolsStatus.value(); - auto chatTemplateKwargsStatus = vlmExecutionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsStatus.ok()) { - return chatTemplateKwargsStatus.status(); + + const ImageHistory& imageHistory = cppPath->imageHistory.get(); + auto imagePlacementStatus = vlm::injectImageTagsAndCollectTensors(chatHistory, imageHistory, vlmExecutionContext->inputImages); + if (!imagePlacementStatus.ok()) { + return imagePlacementStatus; } - const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value(); - vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); + + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, cppPath->addGenerationPrompt, {}, cppPath->tools, cppPath->chatTemplateKwargs); if (vlmExecutionContext->apiHandler->getOutputParser() != nullptr) { vlmExecutionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(vlmExecutionContext->inputText); } diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 4fd28e771d..f42ab1377a 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,7 @@ #include "../filesystem/filesystem.hpp" #include "../llm/apis/openai_completions.hpp" #include "../llm/apis/openai_responses.hpp" +#include "../llm/visual_language_model/image_prompt_utils.hpp" #include #include "../module_names.hpp" #include "../servablemanagermodule.hpp" @@ -761,19 +763,142 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTextInputCreatesUser EXPECT_EQ(chatHistory[0]["role"], "user"); EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); if (endpoint() == ovms::Endpoint::CHAT_COMPLETIONS) { - // Chat completions with simple text does not mutate the JSON, so processedJson is empty - EXPECT_TRUE(apiHandler->getProcessedJson().empty()); + // Canonical PyPath should always provide a JSON payload for template processing. + EXPECT_FALSE(apiHandler->getProcessedJson().empty()); } } +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestCppPathIsAvailable) { + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + + auto canonicalRequest = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + ASSERT_TRUE(canonicalRequest.ok()); + ASSERT_TRUE(std::holds_alternative(*canonicalRequest.value())); + + const auto& cppPath = std::get(*canonicalRequest.value()); + EXPECT_EQ(&cppPath.chatHistory.get(), &apiHandler->getChatHistory()); + EXPECT_EQ(&cppPath.imageHistory.get(), &apiHandler->getImageHistory()); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestPyPathIsAvailable) { + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + + auto canonicalRequest = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); + ASSERT_TRUE(canonicalRequest.ok()); + ASSERT_TRUE(std::holds_alternative(*canonicalRequest.value())); + + const auto& pyPath = std::get(*canonicalRequest.value()); + EXPECT_FALSE(pyPath.processedJson.get().empty()); + + if (endpoint() == ovms::Endpoint::RESPONSES) { + rapidjson::Document processedDoc; + processedDoc.Parse(pyPath.processedJson.get().c_str()); + ASSERT_FALSE(processedDoc.HasParseError()); + ASSERT_TRUE(processedDoc.HasMember("messages")); + ASSERT_TRUE(processedDoc["messages"].IsArray()); + ASSERT_GE(processedDoc["messages"].Size(), 1u); + } +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, LegacyGettersRemainCompatibleWithCanonicalCache) { + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + + auto& chatHistory = apiHandler->getChatHistory(); + const auto& imageHistory = apiHandler->getImageHistory(); + const auto& processedJson = apiHandler->getProcessedJson(); + + auto cppCanonical = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + auto pyCanonical = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); + ASSERT_TRUE(cppCanonical.ok()); + ASSERT_TRUE(pyCanonical.ok()); + + ASSERT_TRUE(std::holds_alternative(*cppCanonical.value())); + ASSERT_TRUE(std::holds_alternative(*pyCanonical.value())); + + const auto& cppPath = std::get(*cppCanonical.value()); + const auto& pyPath = std::get(*pyCanonical.value()); + EXPECT_EQ(&cppPath.chatHistory.get(), &chatHistory); + EXPECT_EQ(&cppPath.imageHistory.get(), &imageHistory); + if (!processedJson.empty()) { + EXPECT_EQ(&pyPath.processedJson.get(), &processedJson); + } else { + EXPECT_FALSE(pyPath.processedJson.get().empty()); + } +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestCacheReturnsStableAddressPerRenderer) { + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + + auto cppCanonicalFirst = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + auto cppCanonicalSecond = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + ASSERT_TRUE(cppCanonicalFirst.ok()); + ASSERT_TRUE(cppCanonicalSecond.ok()); + EXPECT_EQ(cppCanonicalFirst.value(), cppCanonicalSecond.value()); + + auto pyCanonicalFirst = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); + auto pyCanonicalSecond = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); + ASSERT_TRUE(pyCanonicalFirst.ok()); + ASSERT_TRUE(pyCanonicalSecond.ok()); + EXPECT_EQ(pyCanonicalFirst.value(), pyCanonicalSecond.value()); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestCppPathContainsTemplateInputs) { + if (endpoint() != ovms::Endpoint::CHAT_COMPLETIONS) { + GTEST_SKIP() << "Tools/chat_template_kwargs assertions apply to chat/completions flow"; + } + + std::string json = createTextRequest( + "What is OpenVINO?", + R"(, + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } + } + ], + "chat_template_kwargs": {"enable_thinking": true} + )"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + + auto cppCanonical = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + ASSERT_TRUE(cppCanonical.ok()); + ASSERT_TRUE(std::holds_alternative(*cppCanonical.value())); + + const auto& cppPath = std::get(*cppCanonical.value()); + EXPECT_TRUE(cppPath.addGenerationPrompt); + ASSERT_TRUE(cppPath.tools.has_value()); + ASSERT_TRUE(cppPath.chatTemplateKwargs.has_value()); + + const auto& kwargs = cppPath.chatTemplateKwargs.value(); + ASSERT_TRUE(kwargs["enable_thinking"].as_bool().has_value()); + EXPECT_TRUE(kwargs["enable_thinking"].as_bool().value()); +} + TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquivalentMessages) { std::string json = createTextRequest("What is OpenVINO?"); auto apiHandler = parseCurrentRequest(json); ASSERT_NE(apiHandler, nullptr); - // For Responses, processedJson is always built from chatHistory. - // For chat/completions with simple text, processedJson is empty (original body is used instead). - // In both cases, the chatHistory should be equivalent. + // Canonical PyPath provides processedJson lazily for both endpoints. + // In all cases, chatHistory should stay equivalent to parsed input. auto& chatHistory = apiHandler->getChatHistory(); ASSERT_EQ(chatHistory.size(), 1); EXPECT_EQ(chatHistory[0]["role"], "user"); @@ -781,7 +906,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquiva #if (PYTHON_DISABLE == 0) if (endpoint() == ovms::Endpoint::RESPONSES) { - // Responses path builds processedJson with messages array + // Responses canonical path builds processedJson with messages array. const std::string& processedJson = apiHandler->getProcessedJson(); ASSERT_FALSE(processedJson.empty()) << "Responses should build processedJson"; // Verify it contains a messages array with the correct content @@ -796,7 +921,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquiva } #else if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_TRUE(apiHandler->getProcessedJson().empty()) << "processedJson should be empty when Python is disabled"; + EXPECT_FALSE(apiHandler->getProcessedJson().empty()) << "Canonical PyPath should provide JSON regardless of Python build mode"; } #endif } @@ -841,7 +966,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonEquivalentMult } #else if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_TRUE(apiHandler->getProcessedJson().empty()) << "processedJson should be empty when Python is disabled"; + EXPECT_FALSE(apiHandler->getProcessedJson().empty()) << "Canonical PyPath should provide JSON regardless of Python build mode"; } #endif } @@ -867,7 +992,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonIncludesToolsW } #else if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_TRUE(apiHandler->getProcessedJson().empty()) << "processedJson should be empty when Python is disabled"; + EXPECT_FALSE(apiHandler->getProcessedJson().empty()) << "Canonical PyPath should provide JSON regardless of Python build mode"; } #endif } @@ -5446,6 +5571,62 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesImageHistoryIndexMatchesChatHistor EXPECT_LT(turnIndex, chatHistory.size()); } +TEST_F(HttpOpenAIHandlerParsingTest, VlmImagePromptUtilsRejectsRestrictedImageTag) { + std::string json = R"({ + "model": "llama", + "messages": [{"role": "user", "content": "prefix "}] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + auto apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + auto status = ovms::vlm::rejectRestrictedImageTags(apiHandler->getChatHistory()); + EXPECT_EQ(status, absl::InvalidArgumentError("Message contains restricted tag")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, VlmImagePromptUtilsInjectsImageTagsAndCollectsTensors) { + const std::string base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="; + std::string json = R"({ + "model": "llama", + "messages": [ + {"role":"user","content":[ + {"type":"text","text":"what is in these images?"}, + {"type":"image_url","image_url":{"url":")" + + base64Image + R"("}}, + {"type":"image_url","image_url":{"url":")" + + base64Image + R"("}} + ]} + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + auto apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + auto& chatHistory = apiHandler->getChatHistory(); + const auto& imageHistory = apiHandler->getImageHistory(); + ASSERT_EQ(chatHistory.size(), 1u); + ASSERT_EQ(imageHistory.size(), 2u); + + std::vector inputImages; + auto status = ovms::vlm::injectImageTagsAndCollectTensors(chatHistory, imageHistory, inputImages); + EXPECT_EQ(status, absl::OkStatus()); + + ASSERT_EQ(inputImages.size(), 2u); + std::string content = chatHistory[0]["content"].as_string().value_or(""); + EXPECT_THAT(content, ::testing::HasSubstr("\n\n")); + EXPECT_THAT(content, ::testing::HasSubstr("what is in these images?")); +} + // --- Tools normalisation edge cases --- TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFlatToolWithoutParametersIsNormalised) {