diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/CHANGELOG.md b/sdk/contentunderstanding/azure-ai-contentunderstanding/CHANGELOG.md index dd6a421cec16..afcf782c8bc3 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/CHANGELOG.md +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/CHANGELOG.md @@ -8,8 +8,12 @@ ### Bugs Fixed +- Filtered service-emitted `LLMStats:` telemetry entries from the rendered `rai_warnings` front matter in `LlmInputHelper.toLlmInput`. + ### Other Changes +- Updated `LlmInputHelper.toLlmInput` page markers from `` to `` and avoided duplicate marker injection when the service markdown already includes `InputPageNumber` markers. + ## 1.1.0-beta.1 (2026-05-01) ### Features Added diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/README.md b/sdk/contentunderstanding/azure-ai-contentunderstanding/README.md index 8e7161193383..e8d6c848ce9c 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/README.md +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/README.md @@ -165,7 +165,7 @@ If you encounter errors: com.azure azure-ai-contentunderstanding - 1.0.0 + 1.1.0-beta.2 ``` [//]: # ({x-version-update-end}) @@ -439,7 +439,7 @@ fields: figure illustrating monthly values, and describes the AI Document Intelligence service... --- - + # ==This is title== ## 1. Text [Latin](https://en.wikipedia.org/wiki/Latin) refers to an ancient Italic language... diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/src/main/java/com/azure/ai/contentunderstanding/LlmInputHelper.java b/sdk/contentunderstanding/azure-ai-contentunderstanding/src/main/java/com/azure/ai/contentunderstanding/LlmInputHelper.java index dba03f138d08..85bc967df7c0 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/src/main/java/com/azure/ai/contentunderstanding/LlmInputHelper.java +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/src/main/java/com/azure/ai/contentunderstanding/LlmInputHelper.java @@ -58,6 +58,20 @@ public final class LlmInputHelper { private static final Pattern PAGE_BREAK_PATTERN = Pattern.compile("\\n*\\n*"); + // Marker emitted by toLlmInput at each page boundary. Future Content Understanding + // service versions emit this same marker directly in the returned markdown (per + // ContentUnderstanding-Docs#249). When the helper sees any occurrence of this + // prefix in the input markdown it treats the service as having already paginated + // the content and skips its own injection to avoid duplicate markers. + private static final String INPUT_PAGE_MARKER_PREFIX = "}) inserted at page boundaries so downstream consumers - * can locate content by page number. + * ({@code }) inserted at page boundaries so downstream + * consumers can locate content by page number. If the service markdown already + * contains {@code \n\n"); + sb.append(INPUT_PAGE_MARKER_PREFIX).append(' ').append(marker[1]).append(" -->\n\n"); prev = adj; } if (prev < cleaned.length()) { @@ -565,7 +588,7 @@ private static String pageMarkersFromBreaks(String markdown, RenderableContent c for (int i = 0; i < chunks.length; i++) { String text = chunks[i].trim(); if (!text.isEmpty()) { - parts.add("\n\n" + text); + parts.add(INPUT_PAGE_MARKER_PREFIX + " " + (startPage + i) + " -->\n\n" + text); } } return String.join("\n\n", parts); @@ -646,12 +669,20 @@ private static List> formatWarnings(List warn if (w == null) { continue; } + String message = w.getMessage(); + // Skip internal service telemetry strings (e.g. "LLMStats: ...") that + // occasionally leak into the warnings collection. These are not + // Responsible-AI warnings and would otherwise be rendered into the + // LLM-facing rai_warnings: block. + if (message != null && isTelemetryMessage(message)) { + continue; + } Map entry = new LinkedHashMap<>(); if (w.getCode() != null && !w.getCode().isEmpty()) { entry.put("code", w.getCode()); } - if (w.getMessage() != null && !w.getMessage().isEmpty()) { - entry.put("message", w.getMessage()); + if (message != null && !message.isEmpty()) { + entry.put("message", message); } if (!entry.isEmpty()) { items.add(entry); @@ -660,6 +691,20 @@ private static List> formatWarnings(List warn return items; } + private static boolean isTelemetryMessage(String message) { + // Strip leading whitespace (case-sensitive prefix match). + int i = 0; + while (i < message.length() && (message.charAt(i) == ' ' || message.charAt(i) == '\t')) { + i++; + } + for (String prefix : TELEMETRY_MESSAGE_PREFIXES) { + if (message.regionMatches(false, i, prefix, 0, prefix.length())) { + return true; + } + } + return false; + } + // ----------------------------------------------------------------------- // Minimal YAML serializer (no external dependency) // ----------------------------------------------------------------------- diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/src/samples/java/com/azure/ai/contentunderstanding/samples/Sample_Advanced_ToLlmInput.java b/sdk/contentunderstanding/azure-ai-contentunderstanding/src/samples/java/com/azure/ai/contentunderstanding/samples/Sample_Advanced_ToLlmInput.java index e9b1863702cf..3553924b9929 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/src/samples/java/com/azure/ai/contentunderstanding/samples/Sample_Advanced_ToLlmInput.java +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/src/samples/java/com/azure/ai/contentunderstanding/samples/Sample_Advanced_ToLlmInput.java @@ -136,7 +136,7 @@ public static void main(String[] args) { // Analyze specific pages using ContentRange. // Page markers in the output will use the original document page numbers, // so even though we only requested pages 2-3 and 5, the markers will say - // , , (not 1, 2, 3). + // , , (not 1, 2, 3). System.out.println("Analyzing pages 2-3 and 5 of a multi-page PDF..."); System.out.println(" URL: " + multiPageUrl); System.out.println(" contentRange: '2-3,5'\n"); diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/src/test/java/com/azure/ai/contentunderstanding/tests/LlmInputHelperTest.java b/sdk/contentunderstanding/azure-ai-contentunderstanding/src/test/java/com/azure/ai/contentunderstanding/tests/LlmInputHelperTest.java index b5bc8676ca39..41077746d49a 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/src/test/java/com/azure/ai/contentunderstanding/tests/LlmInputHelperTest.java +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/src/test/java/com/azure/ai/contentunderstanding/tests/LlmInputHelperTest.java @@ -82,7 +82,7 @@ public void toLlmInputSingleDocumentDefaultOptions() { assertTrue(output.contains("Amount: 165")); assertTrue(output.contains("CurrencyCode: USD")); assertTrue(output.contains("Hello world")); - assertTrue(output.contains("")); + assertTrue(output.contains("")); } @Test @@ -183,10 +183,110 @@ public void toLlmInputWithWarnings() { assertTrue(output.contains("message: 'latency: 2s'")); } + @Test + public void llmStatsWarningFilteredFromRaiWarnings() { + String json = "{" + "\"analyzerId\":\"a\",\"apiVersion\":\"v\"," + + "\"createdAt\":\"2026-01-01T00:00:00Z\",\"stringEncoding\":\"utf16\"," + "\"warnings\":[" + + "{\"code\":\"Telemetry\",\"message\":\"LLMStats: completion calls: 2; embedding calls: 1\"}," + + "{\"code\":\"ContentWarning\",\"message\":\"Potentially sensitive content.\"}" + "]," + + "\"contents\":[{\"kind\":\"document\",\"mimeType\":\"application/pdf\"," + + "\"startPageNumber\":1,\"endPageNumber\":1,\"markdown\":\"text\"}]" + "}"; + AnalysisResult result = parseResult(json); + String output = LlmInputHelper.toLlmInput(result); + + assertTrue(output.contains("rai_warnings:")); + assertFalse(output.contains("LLMStats:")); + assertTrue(output.contains("Potentially sensitive content.")); + } + + @Test + public void llmStatsWarningOnlyOmitsRaiWarningsBlock() { + String json = "{" + "\"analyzerId\":\"a\",\"apiVersion\":\"v\"," + + "\"createdAt\":\"2026-01-01T00:00:00Z\",\"stringEncoding\":\"utf16\"," + + "\"warnings\":[{\"code\":\"Telemetry\",\"message\":\"LLMStats: completion latency: 7.71s\"}]," + + "\"contents\":[{\"kind\":\"document\",\"mimeType\":\"application/pdf\"," + + "\"startPageNumber\":1,\"endPageNumber\":1,\"markdown\":\"text\"}]" + "}"; + AnalysisResult result = parseResult(json); + String output = LlmInputHelper.toLlmInput(result); + + assertFalse(output.contains("rai_warnings:")); + assertFalse(output.contains("LLMStats:")); + } + + @Test + public void llmStatsFilterIsCaseSensitive() { + String json = "{" + "\"analyzerId\":\"a\",\"apiVersion\":\"v\"," + + "\"createdAt\":\"2026-01-01T00:00:00Z\",\"stringEncoding\":\"utf16\"," + + "\"warnings\":[{\"code\":\"ContentWarning\",\"message\":\"llmstats: keep as a real warning\"}]," + + "\"contents\":[{\"kind\":\"document\",\"mimeType\":\"application/pdf\"," + + "\"startPageNumber\":1,\"endPageNumber\":1,\"markdown\":\"text\"}]" + "}"; + AnalysisResult result = parseResult(json); + String output = LlmInputHelper.toLlmInput(result); + + assertTrue(output.contains("rai_warnings:")); + assertTrue(output.contains("llmstats: keep as a real warning")); + } + + @Test + public void llmStatsTextInMarkdownBodyIsPreserved() { + String json = "{" + "\"analyzerId\":\"a\",\"apiVersion\":\"v\"," + + "\"createdAt\":\"2026-01-01T00:00:00Z\",\"stringEncoding\":\"utf16\"," + + "\"warnings\":[{\"code\":\"Telemetry\",\"message\":\"LLMStats: remove this warning text\"}]," + + "\"contents\":[{\"kind\":\"document\",\"mimeType\":\"application/pdf\"," + + "\"startPageNumber\":1,\"endPageNumber\":1," + + "\"markdown\":\"A log excerpt:\\n- LLMStats: keep this body text\"}]" + "}"; + AnalysisResult result = parseResult(json); + String output = LlmInputHelper.toLlmInput(result); + + assertFalse(output.contains("rai_warnings:")); + assertTrue(output.contains("LLMStats: keep this body text")); + assertFalse(output.contains("LLMStats: remove this warning text")); + } + + @Test + public void llmStatsWarningFilteredWithLeadingWhitespace() { + String json = "{" + "\"analyzerId\":\"a\",\"apiVersion\":\"v\"," + + "\"createdAt\":\"2026-01-01T00:00:00Z\",\"stringEncoding\":\"utf16\"," + + "\"warnings\":[{\"code\":\"Telemetry\",\"message\":\" LLMStats: completion calls: 2\"}]," + + "\"contents\":[{\"kind\":\"document\",\"mimeType\":\"application/pdf\"," + + "\"startPageNumber\":1,\"endPageNumber\":1,\"markdown\":\"text\"}]" + "}"; + AnalysisResult result = parseResult(json); + String output = LlmInputHelper.toLlmInput(result); + + assertFalse(output.contains("rai_warnings:")); + assertFalse(output.contains("LLMStats:")); + } + // ----------------------------------------------------------------------- // Page markers // ----------------------------------------------------------------------- + @Test + public void pageMarkersNotDuplicatedWhenServiceProvidesMarkers() { + String json = "{" + "\"analyzerId\":\"a\",\"apiVersion\":\"v\"," + + "\"createdAt\":\"2026-01-01T00:00:00Z\",\"stringEncoding\":\"utf16\"," + "\"contents\":[{" + + " \"kind\":\"document\",\"mimeType\":\"application/pdf\"," + + " \"startPageNumber\":1,\"endPageNumber\":2," + + " \"markdown\":\"\\n\\nFirst page text.\\n\\n\\n\\nSecond page text.\"," + + " \"pages\":[" + " {\"pageNumber\":1,\"spans\":[{\"offset\":0,\"length\":47}]}," + + " {\"pageNumber\":2,\"spans\":[{\"offset\":49,\"length\":48}]}" + " ]" + "}]" + "}"; + AnalysisResult result = parseResult(json); + String output = LlmInputHelper.toLlmInput(result); + + assertEquals(1, countOccurrences(output, "")); + assertEquals(1, countOccurrences(output, "")); + } + + private static int countOccurrences(String text, String needle) { + int count = 0; + int idx = 0; + while ((idx = text.indexOf(needle, idx)) != -1) { + count++; + idx += needle.length(); + } + return count; + } + @Test public void toLlmInputMultiPageWithSpans() { String json = "{" + "\"analyzerId\":\"a\",\"apiVersion\":\"v\"," @@ -201,9 +301,9 @@ public void toLlmInputMultiPageWithSpans() { String output = LlmInputHelper.toLlmInput(result); assertTrue(output.contains("pages: 2-4")); - assertTrue(output.contains("")); - assertTrue(output.contains("")); - assertTrue(output.contains("")); + assertTrue(output.contains("")); + assertTrue(output.contains("")); + assertTrue(output.contains("")); } @Test @@ -216,9 +316,9 @@ public void toLlmInputMultiPageWithPageBreaks() { AnalysisResult result = parseResult(json); String output = LlmInputHelper.toLlmInput(result); - assertTrue(output.contains("")); - assertTrue(output.contains("")); - assertTrue(output.contains("")); + assertTrue(output.contains("")); + assertTrue(output.contains("")); + assertTrue(output.contains("")); assertFalse(output.contains("")); } diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/src/test/java/com/azure/ai/contentunderstanding/tests/samples/Sample_Advanced_ToLlmInputAsyncTest.java b/sdk/contentunderstanding/azure-ai-contentunderstanding/src/test/java/com/azure/ai/contentunderstanding/tests/samples/Sample_Advanced_ToLlmInputAsyncTest.java index deadd92bb25d..e07eb261fa5a 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/src/test/java/com/azure/ai/contentunderstanding/tests/samples/Sample_Advanced_ToLlmInputAsyncTest.java +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/src/test/java/com/azure/ai/contentunderstanding/tests/samples/Sample_Advanced_ToLlmInputAsyncTest.java @@ -131,15 +131,16 @@ public void testToLlmInputMultiPageContentRangeAsync() { "'pages' value should be '2-3, 5' (original page numbers preserved)"); // Page markers in the markdown body should use the original page numbers - assertFalse(text.contains(""), - "Page marker '' should not appear — we only requested pages 2-3, 5"); - assertTrue(text.contains(""), - "Page marker '' should appear in the markdown body"); - assertTrue(text.contains(""), - "Page marker '' should appear in the markdown body"); - assertTrue(text.contains(""), - "Page marker '' should appear in the markdown body"); - System.out.println("[PASS] Page markers verified: , , "); + assertFalse(text.contains(""), + "Page marker '' should not appear — we only requested pages 2-3, 5"); + assertTrue(text.contains(""), + "Page marker '' should appear in the markdown body"); + assertTrue(text.contains(""), + "Page marker '' should appear in the markdown body"); + assertTrue(text.contains(""), + "Page marker '' should appear in the markdown body"); + System.out.println( + "[PASS] Page markers verified: , , "); System.out .println("[PASS] toLlmInput output validated (" + text.length() + " chars, pages='2-3, 5' preserved)"); diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/src/test/java/com/azure/ai/contentunderstanding/tests/samples/Sample_Advanced_ToLlmInputTest.java b/sdk/contentunderstanding/azure-ai-contentunderstanding/src/test/java/com/azure/ai/contentunderstanding/tests/samples/Sample_Advanced_ToLlmInputTest.java index 974a51708725..e03bbced9438 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/src/test/java/com/azure/ai/contentunderstanding/tests/samples/Sample_Advanced_ToLlmInputTest.java +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/src/test/java/com/azure/ai/contentunderstanding/tests/samples/Sample_Advanced_ToLlmInputTest.java @@ -137,16 +137,18 @@ public void testToLlmInputMultiPageContentRange() { "'pages' value should be '2-3, 5' (original page numbers preserved)"); // Page markers in the markdown body should use the original page numbers - // (, , ), not renumbered (1, 2, 3). - assertFalse(text.contains(""), - "Page marker '' should not appear — we only requested pages 2-3, 5"); - assertTrue(text.contains(""), - "Page marker '' should appear in the markdown body"); - assertTrue(text.contains(""), - "Page marker '' should appear in the markdown body"); - assertTrue(text.contains(""), - "Page marker '' should appear in the markdown body"); - System.out.println("[PASS] Page markers verified: , , "); + // (, , ), + // not renumbered (1, 2, 3). + assertFalse(text.contains(""), + "Page marker '' should not appear — we only requested pages 2-3, 5"); + assertTrue(text.contains(""), + "Page marker '' should appear in the markdown body"); + assertTrue(text.contains(""), + "Page marker '' should appear in the markdown body"); + assertTrue(text.contains(""), + "Page marker '' should appear in the markdown body"); + System.out.println( + "[PASS] Page markers verified: , , "); System.out .println("[PASS] toLlmInput output validated (" + text.length() + " chars, pages='2-3, 5' preserved)");