From b5851a7394b289c7ad39ee1721321956902e2f75 Mon Sep 17 00:00:00 2001
From: "Aayush D.C Dangi" <89606048+dcaayushd@users.noreply.github.com>
Date: Fri, 29 May 2026 18:13:52 +0545
Subject: [PATCH] Fix OCR fallback for scanned PDFs returning only page headers

---
 .../markitdown_ocr/_pdf_converter_with_ocr.py | 21 +++++--
 .../tests/test_pdf_converter.py               | 63 +++++++++++++++++--
 2 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py b/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py
index c1dc0f613..98e68b32a 100644
--- a/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py
+++ b/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py
@@ -182,6 +182,7 @@ def convert(
         pdf_bytes = io.BytesIO(file_stream.read())
 
         markdown_content = []
+        has_extracted_content = False
 
         try:
             with pdfplumber.open(pdf_bytes) as pdf:
@@ -269,42 +270,55 @@ def convert(
                             for item in content_items:
                                 if item["type"] == "text":
                                     markdown_content.append(item["text"])
+                                    has_extracted_content = True
                                 else:  # image
                                     ocr_text = item["ocr_text"]
                                     img_marker = (
                                         f"\n\n*[Image OCR]\n{ocr_text}\n[End OCR]*\n"
                                     )
                                     markdown_content.append(img_marker)
+                                    has_extracted_content = True
                         else:
                             # No images detected - just extract regular text
                             text_content = page.extract_text() or ""
                             if text_content.strip():
                                 markdown_content.append(text_content.strip())
+                                has_extracted_content = True
                     else:
                         # No OCR, just extract text
                         text_content = page.extract_text() or ""
                         if text_content.strip():
                             markdown_content.append(text_content.strip())
+                            has_extracted_content = True
 
                 # Build final markdown
                 markdown = "\n\n".join(markdown_content).strip()
 
-                # Fallback to pdfminer if empty
-                if not markdown:
+                # Fallback to pdfminer if page headers are the only content.
+                if not has_extracted_content:
                     pdf_bytes.seek(0)
-                    markdown = pdfminer.high_level.extract_text(pdf_bytes)
+                    pdfminer_markdown = (
+                        pdfminer.high_level.extract_text(pdf_bytes) or ""
+                    )
+                    if pdfminer_markdown.strip():
+                        markdown = pdfminer_markdown
+                        has_extracted_content = True
 
         except Exception:
             # Fallback to pdfminer
             try:
                 pdf_bytes.seek(0)
-                markdown = pdfminer.high_level.extract_text(pdf_bytes)
+                markdown = pdfminer.high_level.extract_text(pdf_bytes) or ""
+                has_extracted_content = bool(markdown.strip())
             except Exception:
                 markdown = ""
+                # Pages read before the failure may have set the flag; the text
+                # is gone, so clear it and let the full-page OCR fallback run.
+                has_extracted_content = False
 
         # Final fallback: If still empty/whitespace and OCR is available,
         # treat as scanned PDF and OCR full pages
-        if ocr_service and (not markdown or not markdown.strip()):
+        if ocr_service and not has_extracted_content:
             pdf_bytes.seek(0)
             markdown = self._ocr_full_pages(pdf_bytes, ocr_service)
 
diff --git a/packages/markitdown-ocr/tests/test_pdf_converter.py b/packages/markitdown-ocr/tests/test_pdf_converter.py
index 5d4adcc5e..86c701fd1 100644
--- a/packages/markitdown-ocr/tests/test_pdf_converter.py
+++ b/packages/markitdown-ocr/tests/test_pdf_converter.py
@@ -144,17 +144,30 @@ def test_pdf_complex_layout(svc: MockOCRService) -> None:
 
 
 # ---------------------------------------------------------------------------
-# pdf_multipage.pdf — pdfplumber/pdfminer fail (EOF); PyMuPDF fallback used
+# pdf_multipage.pdf
 # ---------------------------------------------------------------------------
 
 
 def test_pdf_multipage(svc: MockOCRService) -> None:
-    # pdfplumber cannot open this file (Unexpected EOF), so _ocr_full_pages
-    # falls back to PyMuPDF for page rendering.  Each page becomes one OCR block.
     expected = (
-        f"## Page 1\n\n\n{_OCR_BLOCK}\n\n\n"
-        f"## Page 2\n\n\n{_OCR_BLOCK}\n\n\n"
-        f"## Page 3\n\n\n{_OCR_BLOCK}"
+        "## Page 1\n\n\n"
+        "Page 1 - Content before image\n\n"
+        "This is important text that appears BEFORE the image.\n\n\n\n"
+        f"{_OCR_BLOCK}\n\n\n"
+        "This text appears AFTER the image on page 1.\n\n"
+        "More content follows here.\n\n\n"
+        "## Page 2\n\n\n"
+        "Page 2 - Content with image at end\n\n"
+        "Main content of page 2 starts here.\n\n"
+        "This is paragraph 1.\n\n"
+        "This is paragraph 2.\n\n"
+        "Final paragraph before image.\n\n\n\n"
+        f"{_OCR_BLOCK}\n\n\n\n"
+        "## Page 3\n\n\n"
+        "Page 3 - Image at top\n\n\n\n"
+        f"{_OCR_BLOCK}\n\n\n"
+        "Content that follows the image.\n\n"
+        "This text is AFTER the image."
     )
     assert _convert("pdf_multipage.pdf", svc) == expected
 
@@ -218,6 +231,44 @@ def test_pdf_scanned_fallback_format(svc: MockOCRService) -> None:
     ), f"_ocr_full_pages must produce:\n{expected!r}\nActual:\n{md!r}"
 
 
+def test_pdf_headers_only_triggers_full_page_ocr(svc: MockOCRService) -> None:
+    """Page headings alone must not suppress full-page OCR fallback."""
+    path = TEST_DATA_DIR / "pdf_image_start.pdf"
+    if not path.exists():
+        pytest.skip(f"Test file not found: {path}")
+
+    converter = PdfConverterWithOCR()
+
+    with (
+        patch("pdfplumber.open") as mock_plumber,
+        patch("pdfminer.high_level.extract_text", return_value=""),
+        patch.object(
+            converter,
+            "_extract_page_images",
+            return_value=[],
+        ),
+        patch.object(
+            converter,
+            "_ocr_full_pages",
+            return_value="## Page 1\n\n\n" + _OCR_BLOCK,
+        ) as mock_full_page_ocr,
+    ):
+        mock_pdf = MagicMock()
+        mock_page = MagicMock()
+        mock_page.extract_text.return_value = ""
+        mock_pdf.pages = [mock_page]
+        mock_pdf.__enter__.return_value = mock_pdf
+        mock_plumber.return_value = mock_pdf
+
+        with open(path, "rb") as f:
+            md = converter.convert(
+                f, StreamInfo(extension=".pdf"), ocr_service=svc
+            ).text_content
+
+    assert md == "## Page 1\n\n\n" + _OCR_BLOCK
+    mock_full_page_ocr.assert_called_once()
+
+
 # ---------------------------------------------------------------------------
 # No OCR service — no OCR tags emitted
 # ---------------------------------------------------------------------------