diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/CHANGELOG.md b/sdk/contentunderstanding/azure-ai-contentunderstanding/CHANGELOG.md index 5d0eb9112b59..db0b86a859c8 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/CHANGELOG.md +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/CHANGELOG.md @@ -1,5 +1,13 @@ # Release History +## 1.2.0b2 (Unreleased) + +### Bugs Fixed +- Filtered service-emitted `LLMStats:` telemetry entries from the rendered `rai_warnings` front matter. + +### Other Changes +- Updated `to_llm_input` page markers from their previous format to `<!-- InputPageNumber N -->` and avoided duplicate marker injection when the service markdown already includes `InputPageNumber` markers. + ## 1.2.0b1 (2026-04-28) ### Features Added diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/README.md b/sdk/contentunderstanding/azure-ai-contentunderstanding/README.md index a50a10002bf4..59125b208808 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/README.md +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/README.md @@ -59,6 +59,7 @@ This table shows the relationship between SDK versions and supported API service | SDK version | Supported API service version | | ----------- | ----------------------------- | +| 1.2.0b2 | 2025-11-01 | | 1.2.0b1 | 2025-11-01 | | 1.1.0 | 2025-11-01 | | 1.0.1 | 2025-11-01 | diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/azure/ai/contentunderstanding/_helpers.py b/sdk/contentunderstanding/azure-ai-contentunderstanding/azure/ai/contentunderstanding/_helpers.py index 19ff1638b9e0..7156c1effae4 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/azure/ai/contentunderstanding/_helpers.py +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/azure/ai/contentunderstanding/_helpers.py @@ -15,7 +15,7 @@ import datetime import math import re -from typing import Any, Dict, List, Optional, TYPE_CHECKING +from typing import Any, Dict, List, Optional, Tuple, 
TYPE_CHECKING if TYPE_CHECKING: from .models import ( @@ -39,6 +39,37 @@ } ) +# Marker emitted by ``to_llm_input`` at each page boundary. Future Content +# Understanding service versions emit this same marker directly in the +# returned markdown (per ContentUnderstanding-Docs#249). When the helper sees +# any occurrence of this prefix in the input markdown it treats the service +# as having already paginated the content and skips its own injection to +# avoid duplicate markers. +_INPUT_PAGE_MARKER_PREFIX = "`` + markers at page boundaries when the service result does not already + include them. Internal telemetry messages such as ``LLMStats: ...`` + are filtered from the rendered ``rai_warnings`` front matter. + :param result: The ``AnalysisResult`` from a Content Understanding analyze operation. :type result: ~azure.ai.contentunderstanding.models.AnalysisResult :keyword include_fields: Whether to include structured fields in the @@ -379,7 +415,12 @@ def _render_content_block( def _add_page_markers(content: "DocumentContent", markdown: str) -> str: - """Add ```` markers to document markdown. + """Add ```` markers to document markdown. 
+ + If *markdown* already contains ``\n*") - shifts: List[tuple] = [] # (original_pos, delta) + shifts: List[Tuple[int, int]] = [] # (original_pos, delta) for m in break_pattern.finditer(markdown): replacement_len = 2 # "\n\n" delta = m.end() - m.start() - replacement_len @@ -438,7 +481,7 @@ def _adjusted_offset(orig: int) -> int: for offset, page_num in markers: adj = _adjusted_offset(offset) parts.append(cleaned[prev:adj]) - parts.append(f"\n\n") + parts.append(f"{_INPUT_PAGE_MARKER_PREFIX} {page_num} -->\n\n") prev = adj parts.append(cleaned[prev:]) @@ -464,7 +507,7 @@ def _page_markers_from_breaks(markdown: str, content: "DocumentContent") -> str: page_num = start_page + i text = chunk.strip() if text: - parts.append(f"\n\n{text}") + parts.append(f"{_INPUT_PAGE_MARKER_PREFIX} {page_num} -->\n\n{text}") return "\n\n".join(parts) @@ -559,11 +602,18 @@ def _format_warnings( """ items: List[Dict[str, str]] = [] for w in warnings: + message = getattr(w, "message", None) + # Skip internal service telemetry strings (e.g. ``LLMStats: ...``) + # that occasionally leak into the warnings collection. These are + # not Responsible-AI warnings and would otherwise be rendered into + # the LLM-facing ``rai_warnings:`` block. 
+ if message and message.lstrip().startswith(_TELEMETRY_MESSAGE_PREFIXES): + continue entry: Dict[str, str] = {} if getattr(w, "code", None): entry["code"] = w.code # type: ignore[assignment, union-attr] - if getattr(w, "message", None): - entry["message"] = w.message # type: ignore[assignment, union-attr] + if message: + entry["message"] = message if getattr(w, "target", None): entry["target"] = w.target # type: ignore[assignment, union-attr] if entry: diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/azure/ai/contentunderstanding/_version.py b/sdk/contentunderstanding/azure-ai-contentunderstanding/azure/ai/contentunderstanding/_version.py index 5bf479b145f7..a8cca866f40a 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/azure/ai/contentunderstanding/_version.py +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/azure/ai/contentunderstanding/_version.py @@ -6,4 +6,4 @@ # Changes may cause incorrect behavior and will be lost if the code is regenerated. # -------------------------------------------------------------------------- -VERSION = "1.2.0b1" +VERSION = "1.2.0b2" diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_to_llm_input.py b/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_to_llm_input.py index b4132da1f426..321c83212110 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_to_llm_input.py +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_to_llm_input.py @@ -147,16 +147,20 @@ def test_to_llm_input_multi_page_content_range(self, contentunderstanding_endpoi print(f"[PASS] to_llm_input output validated ({len(text)} chars, pages='2-3, 5' preserved)") # Page markers in the markdown body should use the original page numbers - # (, , ), not renumbered (1, 2, 3). 
- assert "" not in text, ( - "Page marker '' should not appear — we only requested pages 2-3, 5" + # (, , ), + # not renumbered (1, 2, 3). + assert "" not in text, ( + "Page marker '' should not appear — we only requested pages 2-3, 5" ) for expected_page in [2, 3, 5]: - assert f"" in text, ( - f"Page marker '' should appear in the markdown body. " + assert f"" in text, ( + f"Page marker '' should appear in the markdown body. " f"Output:\n{text[:800]}" ) - print("[PASS] Page markers verified: , , ") + print( + "[PASS] Page markers verified: , " + ", " + ) print("\n[SUCCESS] All test_to_llm_input_multi_page_content_range assertions passed") diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_to_llm_input_async.py b/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_to_llm_input_async.py index 36ec9791c4a5..4b566c8fa598 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_to_llm_input_async.py +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_to_llm_input_async.py @@ -148,16 +148,20 @@ async def test_to_llm_input_multi_page_content_range_async(self, contentundersta print(f"[PASS] to_llm_input output validated ({len(text)} chars, pages='2-3, 5' preserved)") # Page markers in the markdown body should use the original page numbers - # (, , ), not renumbered (1, 2, 3). - assert "" not in text, ( - "Page marker '' should not appear — we only requested pages 2-3, 5" + # (, , ), + # not renumbered (1, 2, 3). + assert "" not in text, ( + "Page marker '' should not appear — we only requested pages 2-3, 5" ) for expected_page in [2, 3, 5]: - assert f"" in text, ( - f"Page marker '' should appear in the markdown body. " + assert f"" in text, ( + f"Page marker '' should appear in the markdown body. 
" f"Output:\n{text[:800]}" ) - print("[PASS] Page markers verified: , , ") + print( + "[PASS] Page markers verified: , " + ", " + ) await client.close() print("\n[SUCCESS] All test_to_llm_input_multi_page_content_range_async assertions passed") diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/test_to_llm_input.py b/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/test_to_llm_input.py index 2cd18729b67d..8845fc2088bc 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/test_to_llm_input.py +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/test_to_llm_input.py @@ -287,8 +287,23 @@ def test_page_markers_from_spans(self): ], ) output = to_llm_input(_make_result([doc])) - assert "" in output - assert "" in output + assert "" in output + assert "" in output + + def test_page_markers_not_duplicated_when_service_provides_markers(self): + doc = DocumentContent( + kind="document", + markdown="\n\nFirst page text.\n\n\n\nSecond page text.", + start_page_number=1, + end_page_number=2, + pages=[ + DocumentPage(page_number=1, spans=[ContentSpan(offset=0, length=47)]), + DocumentPage(page_number=2, spans=[ContentSpan(offset=49, length=48)]), + ], + ) + output = to_llm_input(_make_result([doc])) + assert output.count("") == 1 + assert output.count("") == 1 def test_page_markers_from_pagebreak_fallback(self): doc = DocumentContent( @@ -298,8 +313,8 @@ def test_page_markers_from_pagebreak_fallback(self): end_page_number=2, ) output = to_llm_input(_make_result([doc])) - assert "" in output - assert "" in output + assert "" in output + assert "" in output assert "" not in output def test_page_markers_respect_start_page_number(self): @@ -311,8 +326,8 @@ def test_page_markers_respect_start_page_number(self): end_page_number=4, ) output = to_llm_input(_make_result([doc])) - assert "" in output - assert "" in output + assert "" in output + assert "" in output def test_pages_single_page_format(self): doc = 
_make_invoice_doc(start_page_number=1, end_page_number=1) @@ -867,6 +882,67 @@ def test_warnings_present_regardless_of_include_flags(self): output = to_llm_input(result, include_fields=False, include_markdown=False) assert "rai_warnings:" in output + def test_llm_stats_warning_filtered_from_rai_warnings(self): + from azure.core.exceptions import ODataV4Format + doc = _make_invoice_doc() + telemetry_warning = ODataV4Format( + {"code": "Telemetry", "message": "LLMStats: completion calls: 2; embedding calls: 1"} + ) + real_warning = ODataV4Format({"code": "ContentWarning", "message": "Potentially sensitive content."}) + result = AnalysisResult(contents=[doc], warnings=[telemetry_warning, real_warning]) + + output = to_llm_input(result) + + assert "rai_warnings:" in output + assert "LLMStats:" not in output + assert "Potentially sensitive content." in output + + def test_llm_stats_warning_only_omits_rai_warnings_block(self): + from azure.core.exceptions import ODataV4Format + doc = _make_invoice_doc() + warning = ODataV4Format({"code": "Telemetry", "message": "LLMStats: completion latency: 7.71s"}) + result = AnalysisResult(contents=[doc], warnings=[warning]) + + output = to_llm_input(result) + + assert "rai_warnings:" not in output + assert "LLMStats:" not in output + + def test_llm_stats_filter_is_case_sensitive(self): + from azure.core.exceptions import ODataV4Format + doc = _make_invoice_doc() + warning = ODataV4Format({"code": "ContentWarning", "message": "llmstats: keep as a real warning"}) + result = AnalysisResult(contents=[doc], warnings=[warning]) + + output = to_llm_input(result) + + assert "rai_warnings:" in output + assert "llmstats: keep as a real warning" in output + + def test_llm_stats_text_in_markdown_body_is_preserved(self): + from azure.core.exceptions import ODataV4Format + body_text = "A log excerpt:\n- LLMStats: keep this body text" + doc = _make_invoice_doc(markdown=body_text) + warning = ODataV4Format({"code": "Telemetry", "message": "LLMStats: 
remove this warning text"}) + result = AnalysisResult(contents=[doc], warnings=[warning]) + + output = to_llm_input(result) + + assert "rai_warnings:" not in output + assert "LLMStats: keep this body text" in output + assert "LLMStats: remove this warning text" not in output + + def test_llm_stats_warning_filtered_with_leading_whitespace(self): + from azure.core.exceptions import ODataV4Format + doc = _make_invoice_doc() + warning = ODataV4Format({"code": "Telemetry", "message": " LLMStats: completion calls: 2"}) + result = AnalysisResult(contents=[doc], warnings=[warning]) + + output = to_llm_input(result) + + assert "rai_warnings:" not in output + assert "LLMStats:" not in output + def test_empty_string_field_value_quoted(self): doc = DocumentContent( kind="document", @@ -1029,9 +1105,9 @@ def test_multipage_doc_strips_pagebreak_with_spans(self): ) output = to_llm_input(_make_result([doc])) assert "" not in output - assert "" in output - assert "" in output - assert "" in output + assert "" in output + assert "" in output + assert "" in output assert "Page 1 content." in output assert "Page 2 content." in output assert "Page 3 content." in output @@ -1048,7 +1124,7 @@ def test_image_with_empty_page_spans_falls_back(self): ) output = to_llm_input(_make_result([doc])) # Should fall back to PageBreak method, which adds page 1 marker - assert "" in output + assert "" in output assert "![image](pages/1)" in output def test_document_search_png_single_page_with_spans(self): @@ -1063,7 +1139,7 @@ def test_document_search_png_single_page_with_spans(self): pages=[DocumentPage(page_number=1, spans=[ContentSpan(offset=0, length=len(markdown))])], ) output = to_llm_input(_make_result([doc])) - assert "" in output + assert "" in output assert "IAN HANSSON" in output assert "Summary: A resume document." 
in output @@ -1082,8 +1158,8 @@ def test_prebuilt_read_no_fields(self): output = to_llm_input(_make_result([doc])) assert "contentType: document" in output assert "fields:" not in output - assert "" in output - assert "" in output + assert "" in output + assert "" in output def test_metadata_keys_with_yaml_special_chars(self): """Metadata keys with YAML-special characters must be quoted to produce valid YAML."""