From 2a6c257c5992cd9d1555c1e93c8e293b965a759c Mon Sep 17 00:00:00 2001
From: chienyuanchang <ds.chienyuanchang@gmail.com>
Date: Fri, 8 May 2026 14:46:50 -0700
Subject: [PATCH 1/3] docintel: add model_id, query_fields and YAML front matter

---
 .../markitdown/src/markitdown/__main__.py     |  28 +-
 .../markitdown/src/markitdown/_markitdown.py  |   8 +
 .../converters/_doc_intel_converter.py        | 197 ++++++++++-
 .../tests/test_docintel_converter.py          | 326 ++++++++++++++++++
 4 files changed, 552 insertions(+), 7 deletions(-)
 create mode 100644 packages/markitdown/tests/test_docintel_converter.py

diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index ccb44b64b..b14040f83 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -119,6 +119,20 @@ def main():
         help="Comma-separated list of file types to route to Content Understanding (e.g., pdf,jpeg,mp4). If omitted, all supported types are routed.",
     )
 
+    parser.add_argument(
+        "--docintel-model-id",
+        type=str,
+        default=None,
+        help="Document Intelligence model ID (e.g., 'prebuilt-layout', 'prebuilt-invoice', or a custom model ID). Defaults to 'prebuilt-layout'.",
+    )
+
+    parser.add_argument(
+        "--docintel-query-fields",
+        type=str,
+        default=None,
+        help="Comma-separated list of field names to extract via the Document Intelligence queryFields add-on (OCR file types only).",
+    )
+
     parser.add_argument(
         "-p",
         "--use-plugins",
@@ -208,8 +222,20 @@ def main():
         elif args.filename is None:
             _exit_with_error("Filename is required when using Document Intelligence.")
 
+        docintel_kwargs: Dict[str, Any] = {
+            "docintel_endpoint": args.endpoint,
+        }
+        if args.docintel_model_id:
+            docintel_kwargs["docintel_model_id"] = args.docintel_model_id
+        if args.docintel_query_fields:
+            fields = [
+                f.strip() for f in args.docintel_query_fields.split(",") if f.strip()
+            ]
+            if fields:
+                docintel_kwargs["docintel_query_fields"] = fields
+
         markitdown = MarkItDown(
-            enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
+            enable_plugins=args.use_plugins, **docintel_kwargs
         )
     elif args.use_cu:
         if args.cu_endpoint is None:
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f6aa4df0e..7ffad0996 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -222,6 +222,14 @@ def enable_builtins(self, **kwargs) -> None:
                 if docintel_version is not None:
                     docintel_args["api_version"] = docintel_version
 
+                docintel_model_id = kwargs.get("docintel_model_id")
+                if docintel_model_id is not None:
+                    docintel_args["model_id"] = docintel_model_id
+
+                docintel_query_fields = kwargs.get("docintel_query_fields")
+                if docintel_query_fields is not None:
+                    docintel_args["query_fields"] = docintel_query_fields
+
                 self.register_converter(
                     DocumentIntelligenceConverter(**docintel_args),
                 )
diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
index fd843f231..8999eceda 100644
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -1,12 +1,16 @@
 import sys
 import re
 import os
-from typing import BinaryIO, Any, List
+from datetime import date, datetime, time
+from typing import BinaryIO, Any, List, Optional
 from enum import Enum
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException
+from .. import __version__ as _markitdown_version
+
+_USER_AGENT = f"markitdown-docintel/{_markitdown_version}"
 
 # Try loading optional (but in this case, required) dependencies
 # Save reporting of any exceptions for later
@@ -127,6 +131,158 @@ def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]
     return extensions
 
 
+def _field_value(field: Any) -> Any:
+    """
+    Extract a serializable Python value from a Document Intelligence DocumentField.
+
+    Typed values win over raw ``content``: dates/times become ISO-8601 text, enum
+    members become their ``.value``, arrays/objects recurse. ``None`` if nothing is set.
+    """
+    if field is None:
+        return None
+
+    # First non-None typed scalar wins; enums and date/time values are flattened.
+    for attr in (
+        "value_string",
+        "value_boolean",
+        "value_integer",
+        "value_number",
+        "value_date",
+        "value_time",
+        "value_phone_number",
+        "value_country_region",
+        "value_selection_mark",
+        "value_signature",
+    ):
+        v = getattr(field, attr, None)
+        if v is not None:
+            if isinstance(v, (date, datetime, time)):
+                return v.isoformat()
+            return v.value if isinstance(v, Enum) else v
+
+    # Currency -> "<amount> <code-or-symbol>", or the bare amount without one.
+    cur = getattr(field, "value_currency", None)
+    if cur is not None:
+        amount = getattr(cur, "amount", None)
+        code = getattr(cur, "currency_code", None) or getattr(cur, "currency_symbol", None)
+        if amount is not None and code:
+            return f"{amount} {code}"
+        if amount is not None:
+            return amount
+
+    # Address: prefer the field's raw content, else str() of the address object.
+    addr = getattr(field, "value_address", None)
+    if addr is not None:
+        return getattr(field, "content", None) or str(addr)
+
+    # Array of fields -> list of values (each item converted recursively).
+    arr = getattr(field, "value_array", None)
+    if arr is not None:
+        return [_field_value(item) for item in arr]
+
+    # Object of fields -> dict of values (each member converted recursively).
+    obj = getattr(field, "value_object", None)
+    if obj is not None:
+        return {k: _field_value(v) for k, v in obj.items()}
+
+    # Last resort: the raw extracted text.
+    return getattr(field, "content", None)
+
+
+def _yaml_scalar(value: Any) -> str:
+    """Render a scalar as YAML text, double-quoting strings a parser could misread."""
+    if value is None:
+        return "null"
+    if isinstance(value, bool):
+        return "true" if value else "false"
+    if isinstance(value, (int, float)):
+        return repr(value)
+    s = str(value)
+    # Quote whatever YAML could misread: blanks, indicators, number lookalikes.
+    if (
+        s == ""
+        or s != s.strip()
+        or s[0] in "-+.0123456789[]{},?"
+        or any(c in s for c in ":#&*!|>'\"%@`\n\r\t")
+        or s.lower() in ("null", "true", "false", "yes", "no", "~")
+    ):
+        # Escape backslashes and double quotes; collapse newlines.
+        escaped = s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t")
+        return f'"{escaped}"'
+    return s
+
+
+def _yaml_dump(value: Any, indent: int = 0) -> str:
+    """Emit block-style YAML for nested dicts/lists of scalars; each level indents two spaces."""
+    pad = "  " * indent
+    if isinstance(value, dict):
+        if not value:
+            return f"{pad}{{}}"
+        lines: List[str] = []
+        for k, v in value.items():
+            key = _yaml_scalar(k)
+            if isinstance(v, (dict, list)) and v:
+                lines.append(f"{pad}{key}:")
+                lines.append(_yaml_dump(v, indent + 1))
+            else:
+                lines.append(f"{pad}{key}: {_yaml_scalar(v) if not isinstance(v, (dict, list)) else ('{}' if isinstance(v, dict) else '[]')}")
+        return "\n".join(lines)
+    if isinstance(value, list):
+        if not value:
+            return f"{pad}[]"
+        lines = []
+        for item in value:
+            if isinstance(item, (dict, list)) and item:
+                lines.append(f"{pad}-")
+                lines.append(_yaml_dump(item, indent + 1))
+            else:
+                lines.append(f"{pad}- {_yaml_scalar(item) if not isinstance(item, (dict, list)) else ('{}' if isinstance(item, dict) else '[]')}")
+        return "\n".join(lines)
+    return f"{pad}{_yaml_scalar(value)}"
+
+
+def _fields_to_front_matter(documents: Any, model_id: Optional[str] = None) -> str:
+    """
+    Render the fields of ``AnalyzeResult.documents`` as a YAML front matter block.
+
+    Fields whose extracted value is ``None``, ``""``, ``[]`` or ``{}`` are left
+    out; a name repeated in a later document overwrites the earlier value. An
+    empty string is returned if ``documents`` is falsy or no field survives.
+
+    ``modelId`` is only written when ``model_id`` is truthy. The layout is meant
+    to match the Content Understanding converter's front matter:
+
+        ---
+        modelId: prebuilt-invoice
+        fields:
+          VendorName: Contoso Ltd.
+          InvoiceTotal: 1250.0
+        ---
+    """
+    # Documents are visited in order, so later values replace earlier ones.
+    collected: dict = {}
+    for document in documents or ():
+        doc_fields = getattr(document, "fields", None) or {}
+        for field_name, raw_field in doc_fields.items():
+            extracted = _field_value(raw_field)
+            if extracted is not None and extracted not in ("", [], {}):
+                collected[field_name] = extracted
+
+    # Nothing usable was extracted, so no front matter is produced at all.
+    if not collected:
+        return ""
+
+    # Insertion order is the emitted order: modelId (optional), then fields.
+    front: dict = (
+        {"modelId": model_id, "fields": collected}
+        if model_id
+        else {"fields": collected}
+    )
+
+    rendered = _yaml_dump(front)
+    return "---\n" + rendered + "\n---\n\n"
+
+
 class DocumentIntelligenceConverter(DocumentConverter):
     """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
 
@@ -134,8 +290,10 @@ def __init__(
         self,
         *,
         endpoint: str,
-        api_version: str = "2024-07-31-preview",
+        api_version: str = "2024-11-30",
         credential: AzureKeyCredential | TokenCredential | None = None,
+        model_id: str = "prebuilt-layout",
+        query_fields: Optional[List[str]] = None,
         file_types: List[DocumentIntelligenceFileType] = [
             DocumentIntelligenceFileType.DOCX,
             DocumentIntelligenceFileType.PPTX,
@@ -152,13 +310,19 @@ def __init__(
 
         Args:
             endpoint (str): The endpoint for the Document Intelligence service.
-            api_version (str): The API version to use. Defaults to "2024-07-31-preview".
+            api_version (str): The API version to use. Defaults to "2024-11-30" (GA).
             credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
+            model_id (str): The Document Intelligence model ID to use (e.g., "prebuilt-layout",
+                "prebuilt-invoice", "prebuilt-receipt", or a custom model ID). Defaults to "prebuilt-layout".
+            query_fields (List[str] | None): Optional list of field names to extract via the DI
+                ``queryFields`` add-on. Only applied to OCR-supported file types (PDF/images).
             file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
         """
 
         super().__init__()
         self._file_types = file_types
+        self._model_id = model_id
+        self._query_fields = list(query_fields) if query_fields else None
 
         # Raise an error if the dependencies are not available.
         # This is different than other converters since this one isn't even instantiated
@@ -184,6 +348,7 @@ def __init__(
             endpoint=self.endpoint,
             api_version=self.api_version,
             credential=credential,
+            user_agent=_USER_AGENT,
         )
 
     def accepts(
@@ -228,11 +393,14 @@ def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
             if mimetype.startswith(prefix):
                 return []
 
-        return [
+        features = [
             DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
             DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
             DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
         ]
+        if self._query_fields:
+            features.append(DocumentAnalysisFeature.QUERY_FIELDS)
+        return features
 
     def convert(
         self,
@@ -240,15 +408,32 @@ def convert(
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
+        # Build optional kwargs so that we only pass query_fields when the
+        # QUERY_FIELDS feature is actually enabled for this file type.
+        features = self._analysis_features(stream_info)
+        extra: dict = {}
+        if self._query_fields and DocumentAnalysisFeature.QUERY_FIELDS in features:
+            extra["query_fields"] = self._query_fields
+
         # Extract the text using Azure Document Intelligence
         poller = self.doc_intel_client.begin_analyze_document(
-            model_id="prebuilt-layout",
+            model_id=self._model_id,
             body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
-            features=self._analysis_features(stream_info),
+            features=features,
             output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
+            **extra,
         )
         result: AnalyzeResult = poller.result()
 
         # remove comments from the markdown content generated by Doc Intelligence and append to markdown string
         markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
+
+        # Prepend YAML front matter when DI returned structured fields (e.g., from
+        # prebuilt-invoice/-receipt, custom models, or queryFields).
+        front_matter = _fields_to_front_matter(
+            getattr(result, "documents", None), model_id=self._model_id
+        )
+        if front_matter:
+            markdown_text = front_matter + markdown_text
+
         return DocumentConverterResult(markdown=markdown_text)
diff --git a/packages/markitdown/tests/test_docintel_converter.py b/packages/markitdown/tests/test_docintel_converter.py
new file mode 100644
index 000000000..87744741a
--- /dev/null
+++ b/packages/markitdown/tests/test_docintel_converter.py
@@ -0,0 +1,326 @@
+"""Unit tests for the DocumentIntelligenceConverter improvements.
+
+These tests exercise the converter without making any network calls. They use
+``__new__`` to bypass ``__init__`` (which would construct a real
+``DocumentIntelligenceClient``) and instead inject a mock client.
+"""
+
+import io
+from datetime import date
+from types import SimpleNamespace
+from unittest import mock
+
+import pytest
+
+from markitdown._stream_info import StreamInfo
+from markitdown.converters import _doc_intel_converter as di_mod
+from markitdown.converters._doc_intel_converter import (
+    DocumentIntelligenceConverter,
+    DocumentIntelligenceFileType,
+    _USER_AGENT,
+    _field_value,
+    _fields_to_front_matter,
+    _yaml_dump,
+)
+
+
+# --------- helpers ---------------------------------------------------------
+
+
+def _bare_converter(
+    *,
+    file_types=None,
+    model_id="prebuilt-layout",
+    query_fields=None,
+    client=None,
+):
+    """Build a converter without calling __init__ (no real DI client)."""
+    conv = DocumentIntelligenceConverter.__new__(DocumentIntelligenceConverter)
+    conv._file_types = file_types or [
+        DocumentIntelligenceFileType.PDF,
+        DocumentIntelligenceFileType.DOCX,
+    ]
+    conv._model_id = model_id
+    conv._query_fields = list(query_fields) if query_fields else None
+    conv.endpoint = "https://example.cognitiveservices.azure.com/"
+    conv.api_version = "2024-11-30"
+    conv.doc_intel_client = client
+    return conv
+
+
+def _mock_field(**kwargs):
+    """A SimpleNamespace with all DocumentField value_* attrs defaulting to None."""
+    defaults = {
+        "value_string": None,
+        "value_boolean": None,
+        "value_integer": None,
+        "value_number": None,
+        "value_date": None,
+        "value_time": None,
+        "value_phone_number": None,
+        "value_country_region": None,
+        "value_selection_mark": None,
+        "value_signature": None,
+        "value_currency": None,
+        "value_address": None,
+        "value_array": None,
+        "value_object": None,
+        "content": None,
+    }
+    defaults.update(kwargs)
+    return SimpleNamespace(**defaults)
+
+
+# --------- Phase 1: API version + user agent -------------------------------
+
+
+def test_default_api_version_is_2024_11_30():
+    """The default api_version must be the GA value '2024-11-30'."""
+    import inspect
+
+    sig = inspect.signature(DocumentIntelligenceConverter.__init__)
+    assert sig.parameters["api_version"].default == "2024-11-30"
+
+
+def test_user_agent_string_format():
+    """User agent should start with 'markitdown-docintel/'."""
+    assert _USER_AGENT.startswith("markitdown-docintel/")
+    assert len(_USER_AGENT) > len("markitdown-docintel/")
+
+
+def test_client_constructed_with_user_agent_and_api_version():
+    """__init__ should pass user_agent and api_version to DocumentIntelligenceClient."""
+    fake_client = mock.MagicMock()
+    with mock.patch.object(di_mod, "DocumentIntelligenceClient", return_value=fake_client) as ctor:
+        DocumentIntelligenceConverter(
+            endpoint="https://example.cognitiveservices.azure.com/",
+            credential=mock.MagicMock(),
+        )
+    kwargs = ctor.call_args.kwargs
+    assert kwargs["api_version"] == "2024-11-30"
+    assert kwargs["user_agent"] == _USER_AGENT
+
+
+# --------- Phase 2: configurable model_id ----------------------------------
+
+
+def test_default_model_id():
+    """Default model_id preserves existing behavior."""
+    import inspect
+
+    sig = inspect.signature(DocumentIntelligenceConverter.__init__)
+    assert sig.parameters["model_id"].default == "prebuilt-layout"
+
+
+def test_convert_uses_default_model_id():
+    """Without overrides, convert() calls begin_analyze_document with prebuilt-layout."""
+    fake_poller = mock.MagicMock()
+    fake_poller.result.return_value = SimpleNamespace(content="# hi", documents=None)
+    client = mock.MagicMock()
+    client.begin_analyze_document.return_value = fake_poller
+
+    conv = _bare_converter(client=client)
+    conv.convert(io.BytesIO(b"data"), StreamInfo(extension=".pdf", mimetype="application/pdf"))
+
+    args, kwargs = client.begin_analyze_document.call_args
+    assert kwargs["model_id"] == "prebuilt-layout"
+
+
+def test_convert_uses_overridden_model_id():
+    fake_poller = mock.MagicMock()
+    fake_poller.result.return_value = SimpleNamespace(content="# hi", documents=None)
+    client = mock.MagicMock()
+    client.begin_analyze_document.return_value = fake_poller
+
+    conv = _bare_converter(model_id="prebuilt-invoice", client=client)
+    conv.convert(io.BytesIO(b"data"), StreamInfo(extension=".pdf", mimetype="application/pdf"))
+
+    assert client.begin_analyze_document.call_args.kwargs["model_id"] == "prebuilt-invoice"
+
+
+# --------- Phase 3: YAML front matter --------------------------------------
+
+
+def test_field_value_typed_scalars():
+    assert _field_value(_mock_field(value_string="Contoso")) == "Contoso"
+    assert _field_value(_mock_field(value_integer=42)) == 42
+    assert _field_value(_mock_field(value_number=12.5)) == 12.5
+    assert _field_value(_mock_field(value_boolean=True)) is True
+    assert _field_value(_mock_field(value_date=date(2026, 3, 15))) == "2026-03-15"
+
+
+def test_field_value_currency():
+    cur = SimpleNamespace(amount=1250.0, currency_code="USD", currency_symbol="$")
+    assert _field_value(_mock_field(value_currency=cur)) == "1250.0 USD"
+
+
+def test_field_value_falls_back_to_content():
+    assert _field_value(_mock_field(content="raw text")) == "raw text"
+
+
+def test_field_value_array_of_scalars():
+    items = [_mock_field(value_string="A"), _mock_field(value_string="B")]
+    assert _field_value(_mock_field(value_array=items)) == ["A", "B"]
+
+
+def test_fields_to_front_matter_empty_when_no_documents():
+    assert _fields_to_front_matter(None) == ""
+    assert _fields_to_front_matter([]) == ""
+
+
+def test_fields_to_front_matter_empty_when_no_fields():
+    doc = SimpleNamespace(fields={})
+    assert _fields_to_front_matter([doc]) == ""
+
+
+def test_fields_to_front_matter_basic():
+    doc = SimpleNamespace(
+        fields={
+            "VendorName": _mock_field(value_string="Contoso Ltd."),
+            "InvoiceTotal": _mock_field(value_number=1250.0),
+        }
+    )
+    fm = _fields_to_front_matter([doc], model_id="prebuilt-invoice")
+    assert fm.startswith("---\n")
+    assert fm.endswith("---\n\n")
+    assert "modelId: prebuilt-invoice" in fm
+    assert "fields:" in fm
+    assert "  VendorName: Contoso Ltd." in fm
+    assert "  InvoiceTotal: 1250.0" in fm
+
+
+def test_fields_to_front_matter_omits_model_id_when_not_provided():
+    doc = SimpleNamespace(fields={"X": _mock_field(value_string="y")})
+    fm = _fields_to_front_matter([doc])
+    assert "modelId:" not in fm
+    assert "fields:" in fm
+
+
+def test_fields_with_special_chars_are_quoted():
+    doc = SimpleNamespace(
+        fields={"Note": _mock_field(value_string="line1\nline2: with colon")}
+    )
+    fm = _fields_to_front_matter([doc])
+    # Value contains both \n and ':' so it must be quoted.
+    assert '  Note: "line1\\nline2: with colon"' in fm
+
+
+def test_yaml_dump_nested_dict():
+    out = _yaml_dump({"a": 1, "b": {"c": "x"}})
+    assert "a: 1" in out
+    assert "b:" in out
+    assert "  c: x" in out
+
+
+def test_convert_prepends_front_matter_when_fields_present():
+    doc = SimpleNamespace(fields={"VendorName": _mock_field(value_string="Contoso")})
+    fake_poller = mock.MagicMock()
+    fake_poller.result.return_value = SimpleNamespace(
+        content="# Invoice\n\nbody", documents=[doc]
+    )
+    client = mock.MagicMock()
+    client.begin_analyze_document.return_value = fake_poller
+
+    conv = _bare_converter(model_id="prebuilt-invoice", client=client)
+    result = conv.convert(
+        io.BytesIO(b"data"), StreamInfo(extension=".pdf", mimetype="application/pdf")
+    )
+
+    assert result.markdown.startswith("---\n")
+    assert "modelId: prebuilt-invoice" in result.markdown
+    assert "  VendorName: Contoso" in result.markdown
+    assert "# Invoice" in result.markdown
+
+
+def test_convert_no_front_matter_when_no_documents():
+    fake_poller = mock.MagicMock()
+    fake_poller.result.return_value = SimpleNamespace(content="# Layout", documents=None)
+    client = mock.MagicMock()
+    client.begin_analyze_document.return_value = fake_poller
+
+    conv = _bare_converter(client=client)
+    result = conv.convert(
+        io.BytesIO(b"data"), StreamInfo(extension=".pdf", mimetype="application/pdf")
+    )
+
+    assert not result.markdown.startswith("---")
+    assert result.markdown.startswith("# Layout")
+
+
+# --------- Phase 4: query fields -------------------------------------------
+
+
+def test_query_fields_adds_feature_for_ocr_types():
+    conv = _bare_converter(query_fields=["VendorName", "Total"])
+    features = conv._analysis_features(StreamInfo(extension=".pdf", mimetype="application/pdf"))
+    from azure.ai.documentintelligence.models import DocumentAnalysisFeature
+
+    assert DocumentAnalysisFeature.QUERY_FIELDS in features
+
+
+def test_query_fields_skipped_for_office_types():
+    conv = _bare_converter(query_fields=["VendorName"])
+    features = conv._analysis_features(
+        StreamInfo(
+            extension=".docx",
+            mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        )
+    )
+    # Office types skip OCR features entirely.
+    assert features == []
+
+
+def test_query_fields_passed_to_begin_analyze_document_for_pdf():
+    fake_poller = mock.MagicMock()
+    fake_poller.result.return_value = SimpleNamespace(content="x", documents=None)
+    client = mock.MagicMock()
+    client.begin_analyze_document.return_value = fake_poller
+
+    conv = _bare_converter(query_fields=["A", "B"], client=client)
+    conv.convert(io.BytesIO(b"data"), StreamInfo(extension=".pdf", mimetype="application/pdf"))
+
+    assert client.begin_analyze_document.call_args.kwargs.get("query_fields") == ["A", "B"]
+
+
+def test_query_fields_not_passed_for_office_types():
+    fake_poller = mock.MagicMock()
+    fake_poller.result.return_value = SimpleNamespace(content="x", documents=None)
+    client = mock.MagicMock()
+    client.begin_analyze_document.return_value = fake_poller
+
+    conv = _bare_converter(query_fields=["A"], client=client)
+    conv.convert(
+        io.BytesIO(b"data"),
+        StreamInfo(
+            extension=".docx",
+            mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        ),
+    )
+
+    assert "query_fields" not in client.begin_analyze_document.call_args.kwargs
+
+
+# --------- _markitdown.py wiring -------------------------------------------
+
+
+def test_markitdown_forwards_docintel_kwargs(monkeypatch):
+    """MarkItDown(...) should forward docintel_model_id / docintel_query_fields."""
+    from markitdown import _markitdown as md_mod
+
+    captured = {}
+
+    class _Fake:
+        def __init__(self, **kwargs):
+            captured.update(kwargs)
+
+    monkeypatch.setattr(md_mod, "DocumentIntelligenceConverter", _Fake)
+
+    md_mod.MarkItDown(
+        docintel_endpoint="https://example.cognitiveservices.azure.com/",
+        docintel_model_id="prebuilt-invoice",
+        docintel_query_fields=["A", "B"],
+    )
+
+    assert captured.get("endpoint") == "https://example.cognitiveservices.azure.com/"
+    assert captured.get("model_id") == "prebuilt-invoice"
+    assert captured.get("query_fields") == ["A", "B"]

From 97c19b6354b0bf8bd99c5d91851b2096502d490e Mon Sep 17 00:00:00 2001
From: chienyuanchang <ds.chienyuanchang@gmail.com>
Date: Tue, 26 May 2026 10:11:10 -0700
Subject: [PATCH 2/3] Apply black formatting to the Document Intelligence changes

---
 .../markitdown/src/markitdown/__main__.py     | 10 ++----
 .../converters/_doc_intel_converter.py        | 20 ++++++++---
 .../tests/test_docintel_converter.py          | 34 ++++++++++++++-----
 3 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index b14040f83..e58ab7936 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -16,8 +16,7 @@ def main():
         description="Convert various file formats to markdown.",
         prog="markitdown",
         formatter_class=argparse.RawDescriptionHelpFormatter,
-        usage=dedent(
-            """
+        usage=dedent("""
             SYNTAX:
 
                 markitdown <OPTIONAL: FILENAME>
@@ -42,8 +41,7 @@ def main():
                 OR
 
                 markitdown example.pdf > example.md
-            """
-        ).strip(),
+            """).strip(),
     )
 
     parser.add_argument(
@@ -234,9 +232,7 @@ def main():
             if fields:
                 docintel_kwargs["docintel_query_fields"] = fields
 
-        markitdown = MarkItDown(
-            enable_plugins=args.use_plugins, **docintel_kwargs
-        )
+        markitdown = MarkItDown(enable_plugins=args.use_plugins, **docintel_kwargs)
     elif args.use_cu:
         if args.cu_endpoint is None:
             _exit_with_error(
diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
index 8999eceda..98aafcf46 100644
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -164,7 +164,9 @@ def _field_value(field: Any) -> Any:
     cur = getattr(field, "value_currency", None)
     if cur is not None:
         amount = getattr(cur, "amount", None)
-        code = getattr(cur, "currency_code", None) or getattr(cur, "currency_symbol", None)
+        code = getattr(cur, "currency_code", None) or getattr(
+            cur, "currency_symbol", None
+        )
         if amount is not None and code:
             return f"{amount} {code}"
         if amount is not None:
@@ -207,7 +209,13 @@ def _yaml_scalar(value: Any) -> str:
         or s.lower() in ("null", "true", "false", "yes", "no", "~")
     ):
         # Escape backslashes and double quotes; collapse newlines.
-        escaped = s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t")
+        escaped = (
+            s.replace("\\", "\\\\")
+            .replace('"', '\\"')
+            .replace("\n", "\\n")
+            .replace("\r", "\\r")
+            .replace("\t", "\\t")
+        )
         return f'"{escaped}"'
     return s
 
@@ -225,7 +233,9 @@ def _yaml_dump(value: Any, indent: int = 0) -> str:
                 lines.append(f"{pad}{key}:")
                 lines.append(_yaml_dump(v, indent + 1))
             else:
-                lines.append(f"{pad}{key}: {_yaml_scalar(v) if not isinstance(v, (dict, list)) else ('{}' if isinstance(v, dict) else '[]')}")
+                lines.append(
+                    f"{pad}{key}: {_yaml_scalar(v) if not isinstance(v, (dict, list)) else ('{}' if isinstance(v, dict) else '[]')}"
+                )
         return "\n".join(lines)
     if isinstance(value, list):
         if not value:
@@ -236,7 +246,9 @@ def _yaml_dump(value: Any, indent: int = 0) -> str:
                 lines.append(f"{pad}-")
                 lines.append(_yaml_dump(item, indent + 1))
             else:
-                lines.append(f"{pad}- {_yaml_scalar(item) if not isinstance(item, (dict, list)) else ('{}' if isinstance(item, dict) else '[]')}")
+                lines.append(
+                    f"{pad}- {_yaml_scalar(item) if not isinstance(item, (dict, list)) else ('{}' if isinstance(item, dict) else '[]')}"
+                )
         return "\n".join(lines)
     return f"{pad}{_yaml_scalar(value)}"
 
diff --git a/packages/markitdown/tests/test_docintel_converter.py b/packages/markitdown/tests/test_docintel_converter.py
index 87744741a..5b872f0df 100644
--- a/packages/markitdown/tests/test_docintel_converter.py
+++ b/packages/markitdown/tests/test_docintel_converter.py
@@ -23,7 +23,6 @@
     _yaml_dump,
 )
 
-
 # --------- helpers ---------------------------------------------------------
 
 
@@ -91,7 +90,9 @@ def test_user_agent_string_format():
 def test_client_constructed_with_user_agent_and_api_version():
     """__init__ should pass user_agent and api_version to DocumentIntelligenceClient."""
     fake_client = mock.MagicMock()
-    with mock.patch.object(di_mod, "DocumentIntelligenceClient", return_value=fake_client) as ctor:
+    with mock.patch.object(
+        di_mod, "DocumentIntelligenceClient", return_value=fake_client
+    ) as ctor:
         DocumentIntelligenceConverter(
             endpoint="https://example.cognitiveservices.azure.com/",
             credential=mock.MagicMock(),
@@ -120,7 +121,9 @@ def test_convert_uses_default_model_id():
     client.begin_analyze_document.return_value = fake_poller
 
     conv = _bare_converter(client=client)
-    conv.convert(io.BytesIO(b"data"), StreamInfo(extension=".pdf", mimetype="application/pdf"))
+    conv.convert(
+        io.BytesIO(b"data"), StreamInfo(extension=".pdf", mimetype="application/pdf")
+    )
 
     args, kwargs = client.begin_analyze_document.call_args
     assert kwargs["model_id"] == "prebuilt-layout"
@@ -133,9 +136,13 @@ def test_convert_uses_overridden_model_id():
     client.begin_analyze_document.return_value = fake_poller
 
     conv = _bare_converter(model_id="prebuilt-invoice", client=client)
-    conv.convert(io.BytesIO(b"data"), StreamInfo(extension=".pdf", mimetype="application/pdf"))
+    conv.convert(
+        io.BytesIO(b"data"), StreamInfo(extension=".pdf", mimetype="application/pdf")
+    )
 
-    assert client.begin_analyze_document.call_args.kwargs["model_id"] == "prebuilt-invoice"
+    assert (
+        client.begin_analyze_document.call_args.kwargs["model_id"] == "prebuilt-invoice"
+    )
 
 
 # --------- Phase 3: YAML front matter --------------------------------------
@@ -234,7 +241,9 @@ def test_convert_prepends_front_matter_when_fields_present():
 
 def test_convert_no_front_matter_when_no_documents():
     fake_poller = mock.MagicMock()
-    fake_poller.result.return_value = SimpleNamespace(content="# Layout", documents=None)
+    fake_poller.result.return_value = SimpleNamespace(
+        content="# Layout", documents=None
+    )
     client = mock.MagicMock()
     client.begin_analyze_document.return_value = fake_poller
 
@@ -252,7 +261,9 @@ def test_convert_no_front_matter_when_no_documents():
 
 def test_query_fields_adds_feature_for_ocr_types():
     conv = _bare_converter(query_fields=["VendorName", "Total"])
-    features = conv._analysis_features(StreamInfo(extension=".pdf", mimetype="application/pdf"))
+    features = conv._analysis_features(
+        StreamInfo(extension=".pdf", mimetype="application/pdf")
+    )
     from azure.ai.documentintelligence.models import DocumentAnalysisFeature
 
     assert DocumentAnalysisFeature.QUERY_FIELDS in features
@@ -277,9 +288,14 @@ def test_query_fields_passed_to_begin_analyze_document_for_pdf():
     client.begin_analyze_document.return_value = fake_poller
 
     conv = _bare_converter(query_fields=["A", "B"], client=client)
-    conv.convert(io.BytesIO(b"data"), StreamInfo(extension=".pdf", mimetype="application/pdf"))
+    conv.convert(
+        io.BytesIO(b"data"), StreamInfo(extension=".pdf", mimetype="application/pdf")
+    )
 
-    assert client.begin_analyze_document.call_args.kwargs.get("query_fields") == ["A", "B"]
+    assert client.begin_analyze_document.call_args.kwargs.get("query_fields") == [
+        "A",
+        "B",
+    ]
 
 
 def test_query_fields_not_passed_for_office_types():

From 35c4848f1ab21425b9f66c9bbe5cf12da0cc9f15 Mon Sep 17 00:00:00 2001
From: chienyuanchang <ds.chienyuanchang@gmail.com>
Date: Tue, 26 May 2026 10:37:15 -0700
Subject: [PATCH 3/3] Restore original dedent() formatting in __main__.py

---
 packages/markitdown/src/markitdown/__main__.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index e58ab7936..802bf553d 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -16,7 +16,8 @@ def main():
         description="Convert various file formats to markdown.",
         prog="markitdown",
         formatter_class=argparse.RawDescriptionHelpFormatter,
-        usage=dedent("""
+        usage=dedent(
+            """
             SYNTAX:
 
                 markitdown <OPTIONAL: FILENAME>
@@ -41,7 +42,8 @@ def main():
                 OR
 
                 markitdown example.pdf > example.md
-            """).strip(),
+            """
+        ).strip(),
     )
 
     parser.add_argument(