From b5851a7394b289c7ad39ee1721321956902e2f75 Mon Sep 17 00:00:00 2001 From: "Aayush D.C Dangi" <89606048+dcaayushd@users.noreply.github.com> Date: Fri, 29 May 2026 18:13:52 +0545 Subject: [PATCH] Fix OCR fallback for scanned PDFs returning only page headers --- .../markitdown_ocr/_pdf_converter_with_ocr.py | 23 +++++-- .../tests/test_pdf_converter.py | 63 +++++++++++++++++-- 2 files changed, 75 insertions(+), 11 deletions(-) diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py b/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py index c1dc0f613..98e68b32a 100644 --- a/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py +++ b/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py @@ -182,6 +182,7 @@ def convert( pdf_bytes = io.BytesIO(file_stream.read()) markdown_content = [] + has_extracted_content = False try: with pdfplumber.open(pdf_bytes) as pdf: @@ -269,42 +270,54 @@ def convert( for item in content_items: if item["type"] == "text": markdown_content.append(item["text"]) + has_extracted_content = True else: # image ocr_text = item["ocr_text"] img_marker = ( f"\n\n*[Image OCR]\n{ocr_text}\n[End OCR]*\n" ) markdown_content.append(img_marker) + has_extracted_content = True else: # No images detected - just extract regular text text_content = page.extract_text() or "" if text_content.strip(): markdown_content.append(text_content.strip()) + has_extracted_content = True else: # No OCR, just extract text text_content = page.extract_text() or "" if text_content.strip(): markdown_content.append(text_content.strip()) + has_extracted_content = True # Build final markdown markdown = "\n\n".join(markdown_content).strip() - # Fallback to pdfminer if empty - if not markdown: + # Fallback to pdfminer if page headers are the only content. 
+ if not has_extracted_content: pdf_bytes.seek(0) - markdown = pdfminer.high_level.extract_text(pdf_bytes) + pdfminer_markdown = ( + pdfminer.high_level.extract_text(pdf_bytes) or "" + ) + if pdfminer_markdown.strip(): + markdown = pdfminer_markdown + has_extracted_content = True except Exception: # Fallback to pdfminer try: pdf_bytes.seek(0) - markdown = pdfminer.high_level.extract_text(pdf_bytes) + markdown = pdfminer.high_level.extract_text(pdf_bytes) or "" + has_extracted_content = bool(markdown.strip()) except Exception: markdown = "" + # Both extractors failed: reset so the OCR fallback still runs. + has_extracted_content = False # Final fallback: If still empty/whitespace and OCR is available, # treat as scanned PDF and OCR full pages - if ocr_service and (not markdown or not markdown.strip()): + if ocr_service and not has_extracted_content: pdf_bytes.seek(0) markdown = self._ocr_full_pages(pdf_bytes, ocr_service) diff --git a/packages/markitdown-ocr/tests/test_pdf_converter.py b/packages/markitdown-ocr/tests/test_pdf_converter.py index 5d4adcc5e..86c701fd1 100644 --- a/packages/markitdown-ocr/tests/test_pdf_converter.py +++ b/packages/markitdown-ocr/tests/test_pdf_converter.py @@ -144,17 +144,30 @@ def test_pdf_complex_layout(svc: MockOCRService) -> None: # --------------------------------------------------------------------------- -# pdf_multipage.pdf — pdfplumber/pdfminer fail (EOF); PyMuPDF fallback used +# pdf_multipage.pdf # --------------------------------------------------------------------------- def test_pdf_multipage(svc: MockOCRService) -> None: - # pdfplumber cannot open this file (Unexpected EOF), so _ocr_full_pages - # falls back to PyMuPDF for page rendering. Each page becomes one OCR block. 
expected = ( - f"## Page 1\n\n\n{_OCR_BLOCK}\n\n\n" - f"## Page 2\n\n\n{_OCR_BLOCK}\n\n\n" - f"## Page 3\n\n\n{_OCR_BLOCK}" + "## Page 1\n\n\n" + "Page 1 - Content before image\n\n" + "This is important text that appears BEFORE the image.\n\n\n\n" + f"{_OCR_BLOCK}\n\n\n" + "This text appears AFTER the image on page 1.\n\n" + "More content follows here.\n\n\n" + "## Page 2\n\n\n" + "Page 2 - Content with image at end\n\n" + "Main content of page 2 starts here.\n\n" + "This is paragraph 1.\n\n" + "This is paragraph 2.\n\n" + "Final paragraph before image.\n\n\n\n" + f"{_OCR_BLOCK}\n\n\n\n" + "## Page 3\n\n\n" + "Page 3 - Image at top\n\n\n\n" + f"{_OCR_BLOCK}\n\n\n" + "Content that follows the image.\n\n" + "This text is AFTER the image." ) assert _convert("pdf_multipage.pdf", svc) == expected @@ -218,6 +231,44 @@ def test_pdf_scanned_fallback_format(svc: MockOCRService) -> None: ), f"_ocr_full_pages must produce:\n{expected!r}\nActual:\n{md!r}" +def test_pdf_headers_only_triggers_full_page_ocr(svc: MockOCRService) -> None: + """Page headings alone must not suppress full-page OCR fallback.""" + path = TEST_DATA_DIR / "pdf_image_start.pdf" + if not path.exists(): + pytest.skip(f"Test file not found: {path}") + + converter = PdfConverterWithOCR() + + with ( + patch("pdfplumber.open") as mock_plumber, + patch("pdfminer.high_level.extract_text", return_value=""), + patch.object( + converter, + "_extract_page_images", + return_value=[], + ), + patch.object( + converter, + "_ocr_full_pages", + return_value="## Page 1\n\n\n" + _OCR_BLOCK, + ) as mock_full_page_ocr, + ): + mock_pdf = MagicMock() + mock_page = MagicMock() + mock_page.extract_text.return_value = "" + mock_pdf.pages = [mock_page] + mock_pdf.__enter__.return_value = mock_pdf + mock_plumber.return_value = mock_pdf + + with open(path, "rb") as f: + md = converter.convert( + f, StreamInfo(extension=".pdf"), ocr_service=svc + ).text_content + + assert md == "## Page 1\n\n\n" + _OCR_BLOCK + 
mock_full_page_ocr.assert_called_once() + + # --------------------------------------------------------------------------- # No OCR service — no OCR tags emitted # ---------------------------------------------------------------------------