diff --git a/docs/pymupdf4llm/api.rst b/docs/pymupdf4llm/api.rst index afa88bede..8f6789203 100644 --- a/docs/pymupdf4llm/api.rst +++ b/docs/pymupdf4llm/api.rst @@ -279,6 +279,29 @@ The PyMuPDF4LLM API } } +.. method:: markdown_to_pdf(md_path: str | pathlib.Path, \ + user_css: str | None = None, \ + page_rect: rect-like | None = None, \ + margins: rect-like | None = None, \ + archive: str | pathlib.Path | None = None, \ + output_path: str | pathlib.Path | None = None) -> pymupdf.Document | None + + Convert the markdown text content of the file specified by `md_path` into a PDF document. + + The function is always available -- independently of whether you are using the PyMuPDF Layout module or not. + + :arg str|Path md_path: the file path of the markdown file to be converted. + + :arg str|None user_css: optional, a string of CSS code to be applied to the markdown content. This may be used to customize the appearance of the generated PDF document. If `None` (default), the built-in default CSS is used. + + :arg rect-like|None page_rect: optional, the rectangle defining the page boundaries for the generated PDF document. If `None` (default), ISO A4 page dimensions are used. To use one of PyMuPDF's predefined page formats, use e.g. ``pymupdf.paper_rect("Letter")``. + + :arg rect-like|None margins: optional, the margins (borders) for the generated pages. This must be a sequence of four floats ``[left, top, right, bottom]`` specifying the respective border width in points (1/72 inches). If `None` (default), the default ``[50, 50, 50, 50]`` margins are used. + + :arg str|Archive|None archive: optional. This is required if the markdown source references images that are **not** stored in the same folder as the markdown file. In this case, `archive` must be a `pymupdf.Archive` object which provides access to the respective image files. If `None` (default), it is assumed that all referenced images are stored in the same folder as the markdown file. 
The parameter **may** also be required if a custom ``user_css`` references external resources like font files. + + :arg str|Path|None output_path: optional, the file path where the generated PDF document will be saved. If specified, the generated PDF will be saved to that location. If `None` (default), the document is returned as a `pymupdf.Document` object. + .. note:: Please see `this site `_ for more background and the current status of further improvements regarding usage with :ref:`PyMuPDF Layout `. diff --git a/src/__init__.py b/src/__init__.py index c98164dc7..5a2953c19 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -2897,7 +2897,7 @@ def __getitem__(self, i=0): raise IndexError(f"page {i} not in document") return self.load_page(i) - def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0, height=0, fontsize=11): + def __init__(self, filename=None, stream=None, filetype=None, archive=None, rect=None, width=0, height=0, fontsize=11): """Creates a document. Use 'open' as a synonym. Notes: @@ -2943,7 +2943,16 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 self._name = filename self.stream = stream - + if isinstance(archive, pathlib.Path): + archive = Archive(archive.name) + elif isinstance(archive, str): + archive = Archive(archive) + if archive and not isinstance(archive, Archive): + raise TypeError(f"bad archive: {type(archive)=}.") + if archive: + archive_parm = archive.this # pass this to open + else: + archive_parm = None # means: no archive present if stream is not None: if filename is not None and filetype is None: # 2025-05-06: Use as the filetype. 
This is @@ -2958,6 +2967,8 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 stream = stream.getvalue() else: raise TypeError(f"bad stream: {type(stream)=}.") + + # this prevents bad things if original goes out of existence: self.stream = stream assert isinstance(stream, (bytes, memoryview)) @@ -2967,9 +2978,9 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 # raise a specific exception. raise EmptyFileError('Cannot open empty stream.') - stream2 = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream)) + fz_stream = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream)) try: - doc = mupdf.fz_open_document_with_stream(filetype if filetype else '', stream2) + doc = mupdf.fz_open_document_with_stream_and_dir(filetype if filetype else '', fz_stream, archive_parm) except Exception as e: if g_exceptions_verbose > 1: exception_info() raise FileDataError('Failed to open stream') from e @@ -2996,20 +3007,15 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0 raise EmptyFileError(f'Cannot open empty file: {filename=}.') if filetype: - # Override the type implied by . MuPDF does not - # have a way to do this directly so we open via a stream. 
- try: - fz_stream = mupdf.fz_open_file(filename) - doc = mupdf.fz_open_document_with_stream(filetype, fz_stream) - except Exception as e: - if g_exceptions_verbose > 1: exception_info() - raise FileDataError(f'Failed to open file {filename!r} as type {filetype!r}.') from e + suffix = filetype else: - try: - doc = mupdf.fz_open_document(filename) - except Exception as e: - if g_exceptions_verbose > 1: exception_info() - raise FileDataError(f'Failed to open file {filename!r}.') from e + suffix = Path(filename).suffix + try: + fz_stream = mupdf.fz_open_file(filename) + doc = mupdf.fz_open_document_with_stream_and_dir(suffix, fz_stream, archive_parm) + except Exception as e: + if g_exceptions_verbose > 1: exception_info() + raise FileDataError(f'Failed to open file {filename!r} as type {suffix}.') from e else: pdf = mupdf.PdfDocument()