33from pathlib import Path
44from typing import BinaryIO
55
6- from mindee .dependencies .checkers import PYPDFIUM2_AVAILABLE
7- from mindee .dependencies .decorators import requires_pypdfium2
86from mindee .error .mindee_error import MindeeError
97from mindee .input .bytes_input import BytesInput
108
11- if PYPDFIUM2_AVAILABLE :
12- # pylint: disable=import-error
13- import pypdfium2 as pdfium
14- else :
15- pdfium = None # pylint: disable=invalid-name
16-
179
1810class ExtractedPDF :
1911 """An extracted sub-Pdf."""
2012
2113 buffer : BinaryIO
14+ """PDF content as a byte stream."""
2215 filename : str
23- _page_indexes : tuple [int , int ]
16+ """Name of the file when writing to disk."""
17+ _page_range : tuple [int , int ]
2418
def __init__(
    self, pdf_byte_stream: BinaryIO, filename: str, page_range: tuple[int, int]
):
    """
    Store the extracted PDF content together with its name and source page range.

    :param pdf_byte_stream: PDF content as a byte stream.
    :param filename: Name of the file when writing to disk.
    :param page_range: Indexes of the first and last pages of the original PDF
        that this PDF was extracted from.
    """
    self.buffer = pdf_byte_stream
    self.filename = filename
    # Kept private; exposed read-only through the ``page_range`` property.
    self._page_range = page_range
4225
43- def save_to_file (self , output_path : Path | str ):
26+ def write_to_file (self , output_path : Path | str ):
4427 """
4528 Writes the contents of the current PDF object to a file.
4629
@@ -66,6 +49,15 @@ def as_input_source(self) -> BytesInput:
6649 return BytesInput (self .buffer .read (), self .filename )
6750
@property
def page_range(self) -> tuple[int, int]:
    """
    Page range of the original PDF that this PDF was extracted from.

    The first number is the index of the first page.
    The second number is the index of the last page.
    """
    return self._page_range
59+
@property
def page_count(self) -> int:
    """The number of pages in this PDF file, computed from the stored page range."""
    first_index, last_index = self._page_range
    # Both bounds are indexes of pages contained in this PDF, so the range is
    # inclusive and needs the + 1.
    return last_index - first_index + 1
0 commit comments