Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Release History

## 1.2.0b2 (Unreleased)

### Bugs Fixed
- Filtered service-emitted `LLMStats:` telemetry entries from the rendered `rai_warnings` front matter.

### Other Changes
- Updated `to_llm_input` page markers from `<!-- page N -->` to `<!-- InputPageNumber: N -->` and avoided duplicate marker injection when the service markdown already includes `InputPageNumber` markers.

## 1.2.0b1 (2026-04-28)

### Features Added
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ This table shows the relationship between SDK versions and supported API service

| SDK version | Supported API service version |
| ----------- | ----------------------------- |
| 1.2.0b2 | 2025-11-01 |
| 1.2.0b1 | 2025-11-01 |
| 1.1.0 | 2025-11-01 |
| 1.0.1 | 2025-11-01 |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import datetime
import math
import re
from typing import Any, Dict, List, Optional, TYPE_CHECKING
from typing import Any, Dict, List, Optional, Tuple, TYPE_CHECKING

if TYPE_CHECKING:
from .models import (
Expand All @@ -39,6 +39,37 @@
}
)

# Prefix of the page marker that ``to_llm_input`` emits at each page boundary.
# Future Content Understanding service versions emit the same marker directly
# in the returned markdown (per ContentUnderstanding-Docs#249). When the helper
# finds any occurrence of this prefix in the input markdown, it treats the
# content as already paginated by the service and skips its own injection to
# avoid duplicate markers.
_INPUT_PAGE_MARKER_PREFIX = "<!-- InputPageNumber:"

# Message prefixes the Content Understanding service has been observed to
# emit into the ``warnings`` collection that are *not* real Responsible-AI
# warnings (they are internal telemetry counters). The helper drops any
# warning whose message starts with one of these prefixes before rendering
# the ``rai_warnings:`` block, so the noise never reaches the LLM. Tracked
# alongside a separate service bug to stop emitting them in the first place.
_TELEMETRY_MESSAGE_PREFIXES: Tuple[str, ...] = ("LLMStats:",)


def _has_input_page_marker(markdown: str) -> bool:
    """Report whether *markdown* already carries an ``InputPageNumber`` marker.

    The check is a plain, case-sensitive substring search for
    ``_INPUT_PAGE_MARKER_PREFIX``. One hit anywhere in the text is enough:
    when the service paginates content it places markers at every boundary,
    so any marker means the helper should not inject its own.

    :param str markdown: The markdown text to inspect.
    :returns: ``True`` when the ``<!-- InputPageNumber:`` prefix occurs at
        least once in *markdown*, ``False`` otherwise.
    :rtype: bool
    """
    marker_found = _INPUT_PAGE_MARKER_PREFIX in markdown
    return marker_found


# ---------------------------------------------------------------------------
# Public API
Expand All @@ -65,6 +96,11 @@ def to_llm_input(
helper automatically expands the parent into per-segment blocks
with category labels and markdown slices.

For document content, the helper emits ``<!-- InputPageNumber: N -->``
markers at page boundaries when the service result does not already
include them. Internal telemetry messages such as ``LLMStats: ...``
are filtered from the rendered ``rai_warnings`` front matter.

:param result: The ``AnalysisResult`` from a Content Understanding analyze operation.
:type result: ~azure.ai.contentunderstanding.models.AnalysisResult
:keyword include_fields: Whether to include structured fields in the
Expand Down Expand Up @@ -379,14 +415,21 @@ def _render_content_block(


def _add_page_markers(content: "DocumentContent", markdown: str) -> str:
"""Add ``<!-- page N -->`` markers to document markdown.
"""Add ``<!-- InputPageNumber: N -->`` markers to document markdown.

If *markdown* already contains ``<!-- InputPageNumber:`` markers (e.g.,
because the service paginated the content itself per
ContentUnderstanding-Docs#249), the helper passes the markdown through
unchanged to avoid duplicate markers.

:param content: The document content with page information.
:type content: ~azure.ai.contentunderstanding.models.DocumentContent
:param str markdown: The markdown text to annotate.
:returns: The markdown with page markers inserted.
:rtype: str
"""
if _has_input_page_marker(markdown):
return markdown
if content.pages:
result = _page_markers_from_spans(markdown, content.pages)
if result is not markdown: # spans were found and used
Expand All @@ -403,7 +446,7 @@ def _page_markers_from_spans(markdown: str, pages: "List[DocumentPage]") -> str:
:returns: The markdown with page markers inserted at span offsets.
:rtype: str
"""
markers: List[tuple] = []
markers: List[Tuple[int, int]] = []
for page in pages:
if page.spans:
markers.append((page.spans[0].offset, page.page_number))
Expand All @@ -419,7 +462,7 @@ def _page_markers_from_spans(markdown: str, pages: "List[DocumentPage]") -> str:
# Compute offset shifts from the cleaning
# Re-map original offsets to cleaned string positions
break_pattern = re.compile(r"\n*<!-- PageBreak -->\n*")
shifts: List[tuple] = [] # (original_pos, delta)
shifts: List[Tuple[int, int]] = [] # (original_pos, delta)
for m in break_pattern.finditer(markdown):
replacement_len = 2 # "\n\n"
delta = m.end() - m.start() - replacement_len
Expand All @@ -438,7 +481,7 @@ def _adjusted_offset(orig: int) -> int:
for offset, page_num in markers:
adj = _adjusted_offset(offset)
parts.append(cleaned[prev:adj])
parts.append(f"<!-- page {page_num} -->\n\n")
parts.append(f"{_INPUT_PAGE_MARKER_PREFIX} {page_num} -->\n\n")
prev = adj
parts.append(cleaned[prev:])

Expand All @@ -464,7 +507,7 @@ def _page_markers_from_breaks(markdown: str, content: "DocumentContent") -> str:
page_num = start_page + i
text = chunk.strip()
if text:
parts.append(f"<!-- page {page_num} -->\n\n{text}")
parts.append(f"{_INPUT_PAGE_MARKER_PREFIX} {page_num} -->\n\n{text}")
return "\n\n".join(parts)


Expand Down Expand Up @@ -559,11 +602,18 @@ def _format_warnings(
"""
items: List[Dict[str, str]] = []
for w in warnings:
message = getattr(w, "message", None)
# Skip internal service telemetry strings (e.g. ``LLMStats: ...``)
# that occasionally leak into the warnings collection. These are
# not Responsible-AI warnings and would otherwise be rendered into
# the LLM-facing ``rai_warnings:`` block.
if message and message.lstrip().startswith(_TELEMETRY_MESSAGE_PREFIXES):
continue
entry: Dict[str, str] = {}
if getattr(w, "code", None):
entry["code"] = w.code # type: ignore[assignment, union-attr]
if getattr(w, "message", None):
entry["message"] = w.message # type: ignore[assignment, union-attr]
if message:
entry["message"] = message
if getattr(w, "target", None):
entry["target"] = w.target # type: ignore[assignment, union-attr]
if entry:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
# Changes may cause incorrect behavior and will be lost if the code is regenerated.
# --------------------------------------------------------------------------

VERSION = "1.2.0b1"
VERSION = "1.2.0b2"
Original file line number Diff line number Diff line change
Expand Up @@ -147,16 +147,20 @@ def test_to_llm_input_multi_page_content_range(self, contentunderstanding_endpoi
print(f"[PASS] to_llm_input output validated ({len(text)} chars, pages='2-3, 5' preserved)")

# Page markers in the markdown body should use the original page numbers
# (<!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->), not renumbered (1, 2, 3).
assert "<!-- page 1 -->" not in text, (
"Page marker '<!-- page 1 -->' should not appear — we only requested pages 2-3, 5"
# (<!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 -->),
# not renumbered (1, 2, 3).
assert "<!-- InputPageNumber: 1 -->" not in text, (
"Page marker '<!-- InputPageNumber: 1 -->' should not appear — we only requested pages 2-3, 5"
)
for expected_page in [2, 3, 5]:
assert f"<!-- page {expected_page} -->" in text, (
f"Page marker '<!-- page {expected_page} -->' should appear in the markdown body. "
assert f"<!-- InputPageNumber: {expected_page} -->" in text, (
f"Page marker '<!-- InputPageNumber: {expected_page} -->' should appear in the markdown body. "
f"Output:\n{text[:800]}"
)
print("[PASS] Page markers verified: <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->")
print(
"[PASS] Page markers verified: <!-- InputPageNumber: 2 -->, "
"<!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 -->"
)

print("\n[SUCCESS] All test_to_llm_input_multi_page_content_range assertions passed")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,16 +148,20 @@ async def test_to_llm_input_multi_page_content_range_async(self, contentundersta
print(f"[PASS] to_llm_input output validated ({len(text)} chars, pages='2-3, 5' preserved)")

# Page markers in the markdown body should use the original page numbers
# (<!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->), not renumbered (1, 2, 3).
assert "<!-- page 1 -->" not in text, (
"Page marker '<!-- page 1 -->' should not appear — we only requested pages 2-3, 5"
# (<!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 -->),
# not renumbered (1, 2, 3).
assert "<!-- InputPageNumber: 1 -->" not in text, (
"Page marker '<!-- InputPageNumber: 1 -->' should not appear — we only requested pages 2-3, 5"
)
for expected_page in [2, 3, 5]:
assert f"<!-- page {expected_page} -->" in text, (
f"Page marker '<!-- page {expected_page} -->' should appear in the markdown body. "
assert f"<!-- InputPageNumber: {expected_page} -->" in text, (
f"Page marker '<!-- InputPageNumber: {expected_page} -->' should appear in the markdown body. "
f"Output:\n{text[:800]}"
)
print("[PASS] Page markers verified: <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->")
print(
"[PASS] Page markers verified: <!-- InputPageNumber: 2 -->, "
"<!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 -->"
)

await client.close()
print("\n[SUCCESS] All test_to_llm_input_multi_page_content_range_async assertions passed")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -287,8 +287,23 @@ def test_page_markers_from_spans(self):
],
)
output = to_llm_input(_make_result([doc]))
assert "<!-- page 1 -->" in output
assert "<!-- page 2 -->" in output
assert "<!-- InputPageNumber: 1 -->" in output
assert "<!-- InputPageNumber: 2 -->" in output

def test_page_markers_not_duplicated_when_service_provides_markers(self):
    """Check that markers already present in the markdown each appear exactly once in the output."""
    # NOTE(review): the span offsets/lengths below do not appear to line up
    # exactly with the markdown string (the second marker seems to start at
    # offset 47, not 49) -- confirm whether the fixture should be adjusted.
    service_markdown = "<!-- InputPageNumber: 1 -->\n\nFirst page text.\n\n<!-- InputPageNumber: 2 -->\n\nSecond page text."
    page_list = [
        DocumentPage(page_number=1, spans=[ContentSpan(offset=0, length=47)]),
        DocumentPage(page_number=2, spans=[ContentSpan(offset=49, length=48)]),
    ]
    paginated_doc = DocumentContent(
        kind="document",
        markdown=service_markdown,
        start_page_number=1,
        end_page_number=2,
        pages=page_list,
    )

    rendered = to_llm_input(_make_result([paginated_doc]))

    # Each service-provided marker must survive once and only once.
    assert rendered.count("<!-- InputPageNumber: 1 -->") == 1
    assert rendered.count("<!-- InputPageNumber: 2 -->") == 1

def test_page_markers_from_pagebreak_fallback(self):
doc = DocumentContent(
Expand All @@ -298,8 +313,8 @@ def test_page_markers_from_pagebreak_fallback(self):
end_page_number=2,
)
output = to_llm_input(_make_result([doc]))
assert "<!-- page 1 -->" in output
assert "<!-- page 2 -->" in output
assert "<!-- InputPageNumber: 1 -->" in output
assert "<!-- InputPageNumber: 2 -->" in output
assert "<!-- PageBreak -->" not in output

def test_page_markers_respect_start_page_number(self):
Expand All @@ -311,8 +326,8 @@ def test_page_markers_respect_start_page_number(self):
end_page_number=4,
)
output = to_llm_input(_make_result([doc]))
assert "<!-- page 3 -->" in output
assert "<!-- page 4 -->" in output
assert "<!-- InputPageNumber: 3 -->" in output
assert "<!-- InputPageNumber: 4 -->" in output

def test_pages_single_page_format(self):
doc = _make_invoice_doc(start_page_number=1, end_page_number=1)
Expand Down Expand Up @@ -867,6 +882,67 @@ def test_warnings_present_regardless_of_include_flags(self):
output = to_llm_input(result, include_fields=False, include_markdown=False)
assert "rai_warnings:" in output

def test_llm_stats_warning_filtered_from_rai_warnings(self):
    """Check that an ``LLMStats:`` warning is dropped while a neighbouring real warning is kept."""
    from azure.core.exceptions import ODataV4Format

    invoice = _make_invoice_doc()
    telemetry_payload = {"code": "Telemetry", "message": "LLMStats: completion calls: 2; embedding calls: 1"}
    genuine_payload = {"code": "ContentWarning", "message": "Potentially sensitive content."}
    mixed_warnings = [ODataV4Format(telemetry_payload), ODataV4Format(genuine_payload)]
    analysis = AnalysisResult(contents=[invoice], warnings=mixed_warnings)

    rendered = to_llm_input(analysis)

    # The block is still rendered because one non-telemetry warning remains.
    assert "rai_warnings:" in rendered
    assert "LLMStats:" not in rendered
    assert "Potentially sensitive content." in rendered

def test_llm_stats_warning_only_omits_rai_warnings_block(self):
    """Check that no ``rai_warnings:`` block is rendered when the only warning is telemetry."""
    from azure.core.exceptions import ODataV4Format

    invoice = _make_invoice_doc()
    telemetry_payload = {"code": "Telemetry", "message": "LLMStats: completion latency: 7.71s"}
    only_telemetry = [ODataV4Format(telemetry_payload)]
    analysis = AnalysisResult(contents=[invoice], warnings=only_telemetry)

    rendered = to_llm_input(analysis)

    assert "rai_warnings:" not in rendered
    assert "LLMStats:" not in rendered

def test_llm_stats_filter_is_case_sensitive(self):
    """Check that a lowercase ``llmstats:`` message is rendered as an ordinary warning."""
    from azure.core.exceptions import ODataV4Format

    invoice = _make_invoice_doc()
    lowercase_payload = {"code": "ContentWarning", "message": "llmstats: keep as a real warning"}
    kept_warnings = [ODataV4Format(lowercase_payload)]
    analysis = AnalysisResult(contents=[invoice], warnings=kept_warnings)

    rendered = to_llm_input(analysis)

    assert "rai_warnings:" in rendered
    assert "llmstats: keep as a real warning" in rendered

def test_llm_stats_text_in_markdown_body_is_preserved(self):
    """Check that ``LLMStats:`` text in the markdown body survives while the warning is removed."""
    from azure.core.exceptions import ODataV4Format

    body_text = "A log excerpt:\n- LLMStats: keep this body text"
    invoice = _make_invoice_doc(markdown=body_text)
    telemetry_payload = {"code": "Telemetry", "message": "LLMStats: remove this warning text"}
    only_telemetry = [ODataV4Format(telemetry_payload)]
    analysis = AnalysisResult(contents=[invoice], warnings=only_telemetry)

    rendered = to_llm_input(analysis)

    # Filtering applies to the warnings collection, not to document content.
    assert "rai_warnings:" not in rendered
    assert "LLMStats: keep this body text" in rendered
    assert "LLMStats: remove this warning text" not in rendered

def test_llm_stats_warning_filtered_with_leading_whitespace(self):
    """Check that a telemetry message preceded by whitespace is still filtered out."""
    from azure.core.exceptions import ODataV4Format

    invoice = _make_invoice_doc()
    padded_payload = {"code": "Telemetry", "message": " LLMStats: completion calls: 2"}
    only_telemetry = [ODataV4Format(padded_payload)]
    analysis = AnalysisResult(contents=[invoice], warnings=only_telemetry)

    rendered = to_llm_input(analysis)

    assert "rai_warnings:" not in rendered
    assert "LLMStats:" not in rendered

def test_empty_string_field_value_quoted(self):
doc = DocumentContent(
kind="document",
Expand Down Expand Up @@ -1029,9 +1105,9 @@ def test_multipage_doc_strips_pagebreak_with_spans(self):
)
output = to_llm_input(_make_result([doc]))
assert "<!-- PageBreak -->" not in output
assert "<!-- page 1 -->" in output
assert "<!-- page 2 -->" in output
assert "<!-- page 3 -->" in output
assert "<!-- InputPageNumber: 1 -->" in output
assert "<!-- InputPageNumber: 2 -->" in output
assert "<!-- InputPageNumber: 3 -->" in output
assert "Page 1 content." in output
assert "Page 2 content." in output
assert "Page 3 content." in output
Expand All @@ -1048,7 +1124,7 @@ def test_image_with_empty_page_spans_falls_back(self):
)
output = to_llm_input(_make_result([doc]))
# Should fall back to PageBreak method, which adds page 1 marker
assert "<!-- page 1 -->" in output
assert "<!-- InputPageNumber: 1 -->" in output
assert "![image](pages/1)" in output

def test_document_search_png_single_page_with_spans(self):
Expand All @@ -1063,7 +1139,7 @@ def test_document_search_png_single_page_with_spans(self):
pages=[DocumentPage(page_number=1, spans=[ContentSpan(offset=0, length=len(markdown))])],
)
output = to_llm_input(_make_result([doc]))
assert "<!-- page 1 -->" in output
assert "<!-- InputPageNumber: 1 -->" in output
assert "IAN HANSSON" in output
assert "Summary: A resume document." in output

Expand All @@ -1082,8 +1158,8 @@ def test_prebuilt_read_no_fields(self):
output = to_llm_input(_make_result([doc]))
assert "contentType: document" in output
assert "fields:" not in output
assert "<!-- page 1 -->" in output
assert "<!-- page 2 -->" in output
assert "<!-- InputPageNumber: 1 -->" in output
assert "<!-- InputPageNumber: 2 -->" in output

def test_metadata_keys_with_yaml_special_chars(self):
"""Metadata keys with YAML-special characters must be quoted to produce valid YAML."""
Expand Down
Loading