Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ def convert(
pdf_bytes = io.BytesIO(file_stream.read())

markdown_content = []
has_extracted_content = False

try:
with pdfplumber.open(pdf_bytes) as pdf:
Expand Down Expand Up @@ -269,42 +270,52 @@ def convert(
for item in content_items:
if item["type"] == "text":
markdown_content.append(item["text"])
has_extracted_content = True
else: # image
ocr_text = item["ocr_text"]
img_marker = (
f"\n\n*[Image OCR]\n{ocr_text}\n[End OCR]*\n"
)
markdown_content.append(img_marker)
has_extracted_content = True
else:
# No images detected - just extract regular text
text_content = page.extract_text() or ""
if text_content.strip():
markdown_content.append(text_content.strip())
has_extracted_content = True
else:
# No OCR, just extract text
text_content = page.extract_text() or ""
if text_content.strip():
markdown_content.append(text_content.strip())
has_extracted_content = True

# Build final markdown
markdown = "\n\n".join(markdown_content).strip()

# Fallback to pdfminer if empty
if not markdown:
# Fallback to pdfminer if page headers are the only content.
if not has_extracted_content:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
pdfminer_markdown = (
pdfminer.high_level.extract_text(pdf_bytes) or ""
)
if pdfminer_markdown.strip():
markdown = pdfminer_markdown
has_extracted_content = True

except Exception:
# Fallback to pdfminer
try:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
markdown = pdfminer.high_level.extract_text(pdf_bytes) or ""
has_extracted_content = bool(markdown.strip())
except Exception:
markdown = ""

# Final fallback: If still empty/whitespace and OCR is available,
# treat as scanned PDF and OCR full pages
if ocr_service and (not markdown or not markdown.strip()):
if ocr_service and not has_extracted_content:
pdf_bytes.seek(0)
markdown = self._ocr_full_pages(pdf_bytes, ocr_service)

Expand Down
63 changes: 57 additions & 6 deletions packages/markitdown-ocr/tests/test_pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,17 +144,30 @@ def test_pdf_complex_layout(svc: MockOCRService) -> None:


# ---------------------------------------------------------------------------
# pdf_multipage.pdf
# ---------------------------------------------------------------------------


def test_pdf_multipage(svc: MockOCRService) -> None:
    """Check the exact markdown produced for the three-page fixture.

    The expected output has a ``## Page N`` heading per page, with the
    page's text and one OCR block per page interleaved in reading order:
    the OCR block sits between two text runs on page 1, after the text on
    page 2, and directly below the page title on page 3.
    """
    expected = (
        "## Page 1\n\n\n"
        "Page 1 - Content before image\n\n"
        "This is important text that appears BEFORE the image.\n\n\n\n"
        f"{_OCR_BLOCK}\n\n\n"
        "This text appears AFTER the image on page 1.\n\n"
        "More content follows here.\n\n\n"
        "## Page 2\n\n\n"
        "Page 2 - Content with image at end\n\n"
        "Main content of page 2 starts here.\n\n"
        "This is paragraph 1.\n\n"
        "This is paragraph 2.\n\n"
        "Final paragraph before image.\n\n\n\n"
        f"{_OCR_BLOCK}\n\n\n\n"
        "## Page 3\n\n\n"
        "Page 3 - Image at top\n\n\n\n"
        f"{_OCR_BLOCK}\n\n\n"
        "Content that follows the image.\n\n"
        "This text is AFTER the image."
    )
    assert _convert("pdf_multipage.pdf", svc) == expected

Expand Down Expand Up @@ -218,6 +231,44 @@ def test_pdf_scanned_fallback_format(svc: MockOCRService) -> None:
), f"_ocr_full_pages must produce:\n{expected!r}\nActual:\n{md!r}"


def test_pdf_headers_only_triggers_full_page_ocr(svc: MockOCRService) -> None:
    """Page headings alone must not suppress full-page OCR fallback.

    Every extraction path is stubbed to come back empty: pdfplumber yields
    a single page whose ``extract_text`` returns ``""``, no page images are
    reported, and pdfminer returns ``""``. The converter is then expected
    to call ``_ocr_full_pages`` exactly once and return its result.
    """
    pdf_path = TEST_DATA_DIR / "pdf_image_start.pdf"
    if not pdf_path.exists():
        pytest.skip(f"Test file not found: {pdf_path}")

    full_page_result = "## Page 1\n\n\n" + _OCR_BLOCK
    converter = PdfConverterWithOCR()

    # Fake pdfplumber document: one page with no extractable text. The
    # document is its own context-manager target so ``with ... as pdf`` works.
    blank_page = MagicMock()
    blank_page.extract_text.return_value = ""
    fake_pdf = MagicMock()
    fake_pdf.pages = [blank_page]
    fake_pdf.__enter__.return_value = fake_pdf

    with (
        patch("pdfplumber.open", return_value=fake_pdf),
        patch("pdfminer.high_level.extract_text", return_value=""),
        patch.object(converter, "_extract_page_images", return_value=[]),
        patch.object(
            converter, "_ocr_full_pages", return_value=full_page_result
        ) as full_page_ocr,
        open(pdf_path, "rb") as stream,
    ):
        result = converter.convert(
            stream, StreamInfo(extension=".pdf"), ocr_service=svc
        )
        md = result.text_content

    assert md == full_page_result
    full_page_ocr.assert_called_once()


# ---------------------------------------------------------------------------
# No OCR service — no OCR tags emitted
# ---------------------------------------------------------------------------
Expand Down