Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions docs/pymupdf4llm/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,29 @@ The PyMuPDF4LLM API
}
}

.. method:: markdown_to_pdf(md_path: str | pathlib.Path, \
user_css: str | None = None, \
page_rect: rect-like | None = None, \
margins: rect-like | None = None, \
archive: str | pathlib.Path | None = None, \
output_path: str | pathlib.Path | None = None) -> pymupdf.Document | None

Convert the markdown text content of the file specified by `md_path` into a PDF document.

The function is always available -- independently of whether you are using the PyMuPDF Layout module or not.

:arg str|Path md_path: the file path of the markdown file to be converted.

:arg str|None user_css: optional, a string of CSS code to be applied to the markdown content. This may be used to customize the appearance of the generated PDF document. If `None` (default), the built-in default CSS is used.

:arg rect-like|None page_rect: optional, the rectangle defining the page boundaries for the generated PDF document. If `None` (default), ISO A4 page dimensions are used. To use one of PyMuPDF's predefined page formats, use e.g. ``pymupdf.paper_rect("Letter")``.

:arg rect-like|None margins: optional, the margins (borders) for the generated pages. This must be a sequence of four floats ``[left, top, right, bottom]`` specifying the respective border width in points (1/72 inches). If `None` (default), the default ``[50, 50, 50, 50]`` margins are used.

:arg str|Archive|None archive: optional. This is required if the markdown source references images that are **not** stored in the same folder as the markdown file. In this case, `archive` must be a `pymupdf.Archive` object which provides access to the respective image files. If `None` (default), it is assumed that all referenced images are stored in the same folder as the markdown file. The parameter **may** also be required if a custom ``user_css`` references external resources like font files.

:arg str|Path|None output_path: optional, the file path where the generated PDF document will be saved. If specified, the generated PDF will be saved to that location. If `None` (default), the document is returned as a `pymupdf.Document` object.

.. note::

Please see `this site <https://github.com/pymupdf/pymupdf4llm/discussions/327>`_ for more background and the current status of further improvements regarding usage with :ref:`PyMuPDF Layout <pymupdf-layout>`.
Expand Down
40 changes: 23 additions & 17 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2897,7 +2897,7 @@ def __getitem__(self, i=0):
raise IndexError(f"page {i} not in document")
return self.load_page(i)

def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0, height=0, fontsize=11):
def __init__(self, filename=None, stream=None, filetype=None, archive=None, rect=None, width=0, height=0, fontsize=11):
"""Creates a document. Use 'open' as a synonym.

Notes:
Expand Down Expand Up @@ -2943,7 +2943,16 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0

self._name = filename
self.stream = stream

if isinstance(archive, pathlib.Path):
archive = Archive(archive.name)
elif isinstance(archive, str):
archive = Archive(archive)
if archive and not isinstance(archive, Archive):
raise TypeError(f"bad archive: {type(archive)=}.")
if archive:
archive_parm = archive.this # pass this to open
else:
archive_parm = None # means: no archive present
if stream is not None:
if filename is not None and filetype is None:
# 2025-05-06: Use <filename> as the filetype. This is
Expand All @@ -2958,6 +2967,8 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
stream = stream.getvalue()
else:
raise TypeError(f"bad stream: {type(stream)=}.")

# this prevents bad things if original goes out of existence:
self.stream = stream

assert isinstance(stream, (bytes, memoryview))
Expand All @@ -2967,9 +2978,9 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
# raise a specific exception.
raise EmptyFileError('Cannot open empty stream.')

stream2 = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream))
fz_stream = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream))
try:
doc = mupdf.fz_open_document_with_stream(filetype if filetype else '', stream2)
doc = mupdf.fz_open_document_with_stream_and_dir(filetype if filetype else '', fz_stream, archive_parm)
except Exception as e:
if g_exceptions_verbose > 1: exception_info()
raise FileDataError('Failed to open stream') from e
Expand All @@ -2996,20 +3007,15 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
raise EmptyFileError(f'Cannot open empty file: {filename=}.')

if filetype:
# Override the type implied by <filename>. MuPDF does not
# have a way to do this directly so we open via a stream.
try:
fz_stream = mupdf.fz_open_file(filename)
doc = mupdf.fz_open_document_with_stream(filetype, fz_stream)
except Exception as e:
if g_exceptions_verbose > 1: exception_info()
raise FileDataError(f'Failed to open file {filename!r} as type {filetype!r}.') from e
suffix = filetype
else:
try:
doc = mupdf.fz_open_document(filename)
except Exception as e:
if g_exceptions_verbose > 1: exception_info()
raise FileDataError(f'Failed to open file {filename!r}.') from e
suffix = Path(filename).suffix
try:
fz_stream = mupdf.fz_open_file(filename)
doc = mupdf.fz_open_document_with_stream_and_dir(suffix, fz_stream, archive_parm)
except Exception as e:
if g_exceptions_verbose > 1: exception_info()
raise FileDataError(f'Failed to open file {filename!r} as type {suffix}.') from e

else:
pdf = mupdf.PdfDocument()
Expand Down
Loading