99from mindee .error .mindee_error import MindeeError
1010from mindee .input .local_input_source import LocalInputSource
1111from mindee .pdf .extracted_pdf import ExtractedPDF
12+ from mindee .pdf .extracted_pdfs import ExtractedPDFs
1213
1314if PYPDFIUM2_AVAILABLE :
1415 # pylint: disable=import-error
@@ -28,10 +29,12 @@ class PDFExtractor:
2829
2930 _source_pdf : BinaryIO
3031 _filename : str
32+ _page_count : int
3133
3234 @requires_pillow
3335 def __init__ (self , local_input : LocalInputSource ):
3436 self ._filename = local_input .filename
37+ self ._page_count = local_input .page_count
3538 if local_input .is_pdf ():
3639 self ._source_pdf = local_input .file_object
3740 else :
@@ -40,66 +43,51 @@ def __init__(self, local_input: LocalInputSource):
4043 pdf_image .save (self ._source_pdf , format = "PDF" )
4144
@requires_pypdfium2
def extract_single_document(self, page_indexes: list[int]) -> ExtractedPDF:
    """
    Create a new PDF from a selection of pages of the source document.

    :param page_indexes: Zero-based indexes of the pages to copy from the original PDF.
    :return: The extracted PDF, wrapping the in-memory buffer of the new document,
        a generated filename and the page indexes that were used.
    :raises MindeeError: If no index is given, or if an index does not designate
        a page of the source document.
    """
    if not page_indexes:
        raise MindeeError("Empty indexes aren't allowed for extraction.")
    for page_index in page_indexes:
        # Indexes are zero-based (the filename adds 1 for display), so the last
        # valid index is page_count - 1; negative values are rejected as well.
        if not 0 <= page_index < self._page_count:
            raise MindeeError(f"Index {page_index} is out of range.")

    # Rewind first: the source stream may already have been read.
    self._source_pdf.seek(0)
    new_pdf = pdfium.PdfDocument.new()
    pdf = pdfium.PdfDocument(self._source_pdf)
    new_pdf.import_pages(pdf, page_indexes)
    bytes_io = io.BytesIO()
    new_pdf.save(bytes_io)

    # The filename is built from the first and last requested pages.
    return ExtractedPDF(
        pdf_byte_stream=bytes_io,
        filename=self._make_filename(page_indexes[0], page_indexes[-1]),
        page_indexes=page_indexes,
    )
6373
@requires_pypdfium2
def extract_multiple_documents(
    self, page_indexes: list[list[int]]
) -> ExtractedPDFs:
    """
    Extract several sub-documents from the main PDF, one per group of page indexes.

    :param page_indexes: 2D list of numbers; each inner list holds the page indexes
        of one sub-document.
    :return: The collection of extracted PDFs, in the same order as the given groups.
    :raises MindeeError: If no group of indexes is provided.
    """
    if len(page_indexes) == 0:
        raise MindeeError("No indexes provided.")
    # Each group is validated and cut by extract_single_document.
    documents: list[ExtractedPDF] = [
        self.extract_single_document(index_group) for index_group in page_indexes
    ]
    return ExtractedPDFs(documents)
91+ def _make_filename (self , first_page : int , last_page : int ) -> str :
92+ stem = Path (self ._filename ).stem
93+ return f"{ stem } _pages-{ (first_page + 1 ):03d} -{ (last_page + 1 ):03d} .pdf"
0 commit comments