Skip to content

Commit b275ae5

Browse files
authored
🐛 💥 rework extraction methods (#440)
1 parent fbad75c commit b275ae5

11 files changed

Lines changed: 82 additions & 90 deletions

File tree

mindee/image/extracted_image.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def __init__(
4848
self._element_id = 0 if element_id is None else element_id
4949

5050
@requires_pillow
51-
def save_to_file(self, output_path: Path | str):
51+
def write_to_file(self, output_path: Path | str):
5252
"""
5353
Saves the document to a file.
5454

mindee/image/extracted_images.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,4 @@ class ExtractedImages(list[ExtractedImage]):
99
def save_all_to_disk(self, output_path: Path | str) -> None:
1010
"""Save all extracted images to disk."""
1111
for image in self:
12-
image.save_to_file(output_path)
12+
image.write_to_file(output_path)

mindee/pdf/extracted_pdf.py

Lines changed: 14 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,44 +3,27 @@
33
from pathlib import Path
44
from typing import BinaryIO
55

6-
from mindee.dependencies.checkers import PYPDFIUM2_AVAILABLE
7-
from mindee.dependencies.decorators import requires_pypdfium2
86
from mindee.error.mindee_error import MindeeError
97
from mindee.input.bytes_input import BytesInput
108

11-
if PYPDFIUM2_AVAILABLE:
12-
# pylint: disable=import-error
13-
import pypdfium2 as pdfium
14-
else:
15-
pdfium = None # pylint: disable=invalid-name
16-
179

1810
class ExtractedPDF:
1911
"""An extracted sub-Pdf."""
2012

2113
buffer: BinaryIO
14+
"""PDF content as a byte stream."""
2215
filename: str
23-
_page_indexes: tuple[int, int]
16+
"""Name of the file when writing to disk."""
17+
_page_indexes: list[int]
2418

2519
def __init__(
26-
self, pdf_byte_stream: BinaryIO, filename: str, page_indexes: tuple[int, int]
20+
self, pdf_byte_stream: BinaryIO, filename: str, page_indexes: list[int]
2721
):
2822
self.buffer = pdf_byte_stream
2923
self.filename = filename
3024
self._page_indexes = page_indexes
3125

32-
@requires_pypdfium2
33-
def get_page_count(self) -> int:
34-
"""Get the number of pages in the PDF file."""
35-
try:
36-
pdf = pdfium.PdfDocument(self.buffer)
37-
return len(pdf)
38-
except Exception as e:
39-
raise MindeeError(
40-
"Could not retrieve page count from Extracted PDF object."
41-
) from e
42-
43-
def save_to_file(self, output_path: Path | str):
26+
def write_to_file(self, output_path: Path | str):
4427
"""
4528
Writes the contents of the current PDF object to a file.
4629
@@ -66,6 +49,13 @@ def as_input_source(self) -> BytesInput:
6649
return BytesInput(self.buffer.read(), self.filename)
6750

6851
@property
69-
def page_indexes(self) -> tuple[int, int]:
70-
"""This PDF was extracted from this page range of the original PDF."""
52+
def page_indexes(self) -> list[int]:
53+
"""
54+
0-based indexes of all pages taken from the original PDF.
55+
"""
7156
return self._page_indexes
57+
58+
@property
59+
def page_count(self) -> int:
60+
"""The number of pages in this PDF file."""
61+
return len(self._page_indexes)

mindee/pdf/extracted_pdfs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ def save_all_to_disk(self, output_path: Path | str) -> None:
1010
"""Save all extracted PDFs to disk."""
1111

1212
for image in self:
13-
image.save_to_file(output_path)
13+
image.write_to_file(output_path)

mindee/pdf/pdf_extractor.py

Lines changed: 27 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from mindee.error.mindee_error import MindeeError
1010
from mindee.input.local_input_source import LocalInputSource
1111
from mindee.pdf.extracted_pdf import ExtractedPDF
12+
from mindee.pdf.extracted_pdfs import ExtractedPDFs
1213

1314
if PYPDFIUM2_AVAILABLE:
1415
# pylint: disable=import-error
@@ -28,10 +29,12 @@ class PDFExtractor:
2829

2930
_source_pdf: BinaryIO
3031
_filename: str
32+
_page_count: int
3133

3234
@requires_pillow
3335
def __init__(self, local_input: LocalInputSource):
3436
self._filename = local_input.filename
37+
self._page_count = local_input.page_count
3538
if local_input.is_pdf():
3639
self._source_pdf = local_input.file_object
3740
else:
@@ -40,66 +43,51 @@ def __init__(self, local_input: LocalInputSource):
4043
pdf_image.save(self._source_pdf, format="PDF")
4144

4245
@requires_pypdfium2
43-
def get_page_count(self) -> int:
44-
"""Get the number of pages in the PDF file."""
45-
pdf = pdfium.PdfDocument(self._source_pdf)
46-
return len(pdf)
47-
48-
@requires_pypdfium2
49-
def cut_pages(self, page_indexes: list) -> BinaryIO:
46+
def extract_single_document(self, page_indexes: list[int]) -> ExtractedPDF:
5047
"""
5148
Create a new PDF from pages and save it into a buffer.
5249
5350
:param page_indexes: List of 0-based page indexes to use for merging in the original PDF.
5451
:return: The extracted PDF, wrapping the buffer containing the new PDF.
5552
"""
53+
if not page_indexes or len(page_indexes) == 0:
54+
raise MindeeError("Empty indexes aren't allowed for extraction.")
55+
for page_index in page_indexes:
56+
if page_index > self._page_count:
57+
raise MindeeError(f"Index {page_index} is out of range.")
58+
5659
self._source_pdf.seek(0)
5760
new_pdf = pdfium.PdfDocument.new()
5861
pdf = pdfium.PdfDocument(self._source_pdf)
5962
new_pdf.import_pages(pdf, page_indexes)
6063
bytes_io = io.BytesIO()
6164
new_pdf.save(bytes_io)
62-
return bytes_io
65+
66+
first_page = page_indexes[0]
67+
last_page = page_indexes[len(page_indexes) - 1]
68+
return ExtractedPDF(
69+
pdf_byte_stream=bytes_io,
70+
filename=self._make_filename(first_page, last_page),
71+
page_indexes=page_indexes,
72+
)
6373

6474
@requires_pypdfium2
65-
def extract_sub_documents(
75+
def extract_multiple_documents(
6676
self, page_indexes: list[list[int]]
67-
) -> list[ExtractedPDF]:
77+
) -> ExtractedPDFs:
6878
"""
6979
Extract the sub-documents from the main pdf, based on the given list of page indexes.
7080
7181
:param page_indexes: 2D list of numbers, representing page indexes.
7282
:return: A collection of the created PDFs.
7383
"""
84+
if len(page_indexes) < 1:
85+
raise MindeeError("No indexes provided.")
7486
extracted_pdfs: list[ExtractedPDF] = []
75-
extension = Path(self._filename).suffix
76-
stem = Path(self._filename).stem
7787
for page_index_elem in page_indexes:
78-
if not page_index_elem or len(page_index_elem) == 0:
79-
raise MindeeError("Empty indexes aren't allowed for extraction.")
80-
for page_index in page_index_elem:
81-
if page_index > self.get_page_count():
82-
raise MindeeError(f"Index {page_index} is out of range.")
83-
first_page = page_index_elem[0]
84-
last_page = page_index_elem[len(page_index_elem) - 1]
85-
extracted_pdf = ExtractedPDF(
86-
self.cut_pages(page_index_elem),
87-
f"{stem}_pages-{(first_page + 1):03d}-{(last_page + 1):03d}{extension}",
88-
(first_page, last_page),
89-
)
90-
extracted_pdfs.append(extracted_pdf)
91-
return extracted_pdfs
92-
93-
def extract_documents(
94-
self,
95-
page_indexes: list[list[int]],
96-
) -> list[ExtractedPDF]:
97-
"""
98-
Extracts complete PDFs from the document.
88+
extracted_pdfs.append(self.extract_single_document(page_index_elem))
89+
return ExtractedPDFs(extracted_pdfs)
9990

100-
:param page_indexes: List of sub-lists of pages to keep.
101-
:return: A list of extracted invoices.
102-
"""
103-
if len(page_indexes) < 1:
104-
raise MindeeError("No indexes provided.")
105-
return self.extract_sub_documents(page_indexes)
91+
def _make_filename(self, first_page: int, last_page: int) -> str:
92+
stem = Path(self._filename).stem
93+
return f"{stem}_pages-{(first_page + 1):03d}-{(last_page + 1):03d}.pdf"

mindee/v1/pdf/pdf_extractor.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ def extract_invoices(
2424
if len(page_indexes) < 1:
2525
raise MindeeError("No indexes provided.")
2626
if not isinstance(page_indexes[0], InvoiceSplitterV1InvoicePageGroup):
27-
return self.extract_sub_documents(page_indexes) # type: ignore
27+
return self.extract_multiple_documents(page_indexes) # type: ignore
2828

2929
if not strict:
3030
indexes_as_list = [page_index.page_indexes for page_index in page_indexes] # type: ignore
31-
return self.extract_sub_documents(indexes_as_list)
31+
return self.extract_multiple_documents(indexes_as_list)
3232
correct_page_indexes: list[list[int]] = []
3333
current_list: list[int] = []
3434
previous_confidence: float | None = None
@@ -49,4 +49,4 @@ def extract_invoices(
4949
correct_page_indexes.append(current_list)
5050
correct_page_indexes.append(page_list)
5151
previous_confidence = confidence
52-
return self.extract_sub_documents(correct_page_indexes)
52+
return self.extract_multiple_documents(correct_page_indexes)

mindee/v2/file_operations/split.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ def extract_single_split(
1515
:param split: List of pages to keep.
1616
:return: Extracted PDF
1717
"""
18-
return extract_multiple_splits(input_source, [split])[0]
18+
pdf_extractor = PDFExtractor(input_source)
19+
return pdf_extractor.extract_single_document(_range_to_indexes(split))
1920

2021

2122
def extract_multiple_splits(
@@ -32,7 +33,11 @@ def extract_multiple_splits(
3233
pdf_extractor = PDFExtractor(input_source)
3334
page_groups = []
3435
for split in splits:
35-
page_groups.append(list(range(split[0], split[1] + 1)))
36+
page_groups.append(_range_to_indexes(split))
3637
if len(splits) < 1:
3738
raise MindeeError("No indexes provided.")
38-
return ExtractedPDFs(pdf_extractor.extract_sub_documents(page_groups))
39+
return pdf_extractor.extract_multiple_documents(page_groups)
40+
41+
42+
def _range_to_indexes(split: list[int]) -> list[int]:
43+
return list(range(split[0], split[1] + 1))

tests/v1/extraction/test_invoice_splitter_auto_extraction.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,15 @@ def test_pdf_should_extract_invoices_strict():
4040
)
4141
inference = response.document.inference
4242
pdf_extractor = PDFExtractor(invoice_splitter_input)
43-
assert pdf_extractor.get_page_count() == 2
43+
assert invoice_splitter_input.page_count == 2
4444

4545
extracted_pdfs_not_strict = pdf_extractor.extract_invoices(
4646
inference.prediction.invoice_page_groups
4747
)
4848
extracted_pdfs_strict = pdf_extractor.extract_invoices(
4949
inference.prediction.invoice_page_groups
5050
)
51-
extracted_base_pdfs = pdf_extractor.extract_documents(
51+
extracted_base_pdfs = pdf_extractor.extract_multiple_documents(
5252
[int_list.page_indexes for int_list in inference.prediction.invoice_page_groups]
5353
)
5454
for i, extracted_pdf in enumerate(extracted_base_pdfs):

tests/v1/extraction/test_pdf_extractor.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from mindee.v1.product.invoice_splitter.invoice_splitter_v1_document import (
99
InvoiceSplitterV1Document,
1010
)
11-
from tests.utils import V1_PRODUCT_DATA_DIR
11+
from tests.utils import OUTPUT_DIR, V1_PRODUCT_DATA_DIR
1212

1313

1414
@pytest.fixture
@@ -39,7 +39,12 @@ def test_image_should_extract_pdf(invoice_default_sample_path):
3939
jpg_input = PathInput(invoice_default_sample_path)
4040
assert not jpg_input.is_pdf()
4141
extractor = PDFExtractor(jpg_input)
42-
assert extractor.get_page_count() == 1
42+
extracted_pdf = extractor.extract_single_document([0])
43+
assert extracted_pdf.page_count == 1
44+
assert extracted_pdf.page_indexes == [0]
45+
assert extracted_pdf.filename == "default_sample_pages-001-001.pdf"
46+
extracted_pdf.write_to_file(OUTPUT_DIR)
47+
assert (OUTPUT_DIR / extracted_pdf.filename).exists()
4348

4449

4550
@pytest.mark.pillow
@@ -48,20 +53,20 @@ def test_pdf_should_extract_invoices_no_strict(
4853
invoice_splitter_5p_path, loaded_prediction
4954
):
5055
pdf_input = PathInput(invoice_splitter_5p_path)
56+
assert pdf_input.page_count == 5
5157
extractor = PDFExtractor(pdf_input)
52-
assert extractor.get_page_count() == 5
5358
extracted_pdfs_no_strict = extractor.extract_invoices(
5459
loaded_prediction.invoice_page_groups
5560
)
5661

5762
assert len(extracted_pdfs_no_strict) == 3
58-
assert extracted_pdfs_no_strict[0].get_page_count() == 1
63+
assert extracted_pdfs_no_strict[0].page_count == 1
5964
assert extracted_pdfs_no_strict[0].filename == "invoice_5p_pages-001-001.pdf"
6065

61-
assert extracted_pdfs_no_strict[1].get_page_count() == 3
66+
assert extracted_pdfs_no_strict[1].page_count == 3
6267
assert extracted_pdfs_no_strict[1].filename == "invoice_5p_pages-002-004.pdf"
6368

64-
assert extracted_pdfs_no_strict[2].get_page_count() == 1
69+
assert extracted_pdfs_no_strict[2].page_count == 1
6570
assert extracted_pdfs_no_strict[2].filename == "invoice_5p_pages-005-005.pdf"
6671

6772

@@ -71,15 +76,16 @@ def test_pdf_should_extract_invoices_strict(
7176
invoice_splitter_5p_path, loaded_prediction
7277
):
7378
pdf_input = PathInput(invoice_splitter_5p_path)
79+
assert pdf_input.page_count == 5
80+
7481
extractor = PDFExtractor(pdf_input)
75-
assert extractor.get_page_count() == 5
7682
extracted_pdfs_strict = extractor.extract_invoices(
7783
loaded_prediction.invoice_page_groups, True
7884
)
7985

8086
assert len(extracted_pdfs_strict) == 2
81-
assert extracted_pdfs_strict[0].get_page_count() == 1
87+
assert extracted_pdfs_strict[0].page_count == 1
8288
assert extracted_pdfs_strict[0].filename == "invoice_5p_pages-001-001.pdf"
8389

84-
assert extracted_pdfs_strict[1].get_page_count() == 4
90+
assert extracted_pdfs_strict[1].page_count == 4
8591
assert extracted_pdfs_strict[1].filename == "invoice_5p_pages-002-005.pdf"

tests/v2/file_operations/test_split_operation.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ def test_default_split():
3232
extracted_splits = response.inference.result.extract_from_input_source(input_sample)
3333
assert len(extracted_splits) == 2
3434

35-
assert extracted_splits[0].get_page_count() == 1
35+
assert extracted_splits[0].page_count == 1
3636
assert extracted_splits[0].filename == "default_sample_pages-001-001.pdf"
37-
assert extracted_splits[1].get_page_count() == 1
37+
assert extracted_splits[1].page_count == 1
3838
assert extracted_splits[1].filename == "default_sample_pages-002-002.pdf"
3939

4040

@@ -46,11 +46,14 @@ def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path):
4646
extracted_splits = response.inference.result.extract_from_input_source(input_sample)
4747
assert len(extracted_splits) == 3
4848

49-
assert extracted_splits[0].get_page_count() == 1
49+
assert extracted_splits[0].page_count == 1
50+
assert extracted_splits[0].page_indexes == [0]
5051
assert extracted_splits[0].filename == "invoice_5p_pages-001-001.pdf"
51-
assert extracted_splits[1].get_page_count() == 3
52+
assert extracted_splits[1].page_count == 3
53+
assert extracted_splits[1].page_indexes == [1, 2, 3]
5254
assert extracted_splits[1].filename == "invoice_5p_pages-002-004.pdf"
53-
assert extracted_splits[2].get_page_count() == 1
55+
assert extracted_splits[2].page_count == 1
56+
assert extracted_splits[2].page_indexes == [4]
5457
assert extracted_splits[2].filename == "invoice_5p_pages-005-005.pdf"
5558

5659

@@ -62,4 +65,4 @@ def test_multi_page_receipt_single_split(splits_5p, splits_multi_page_json_path)
6265
split = response.inference.result.splits[1]
6366
extracted_split = split.extract_from_input_source(input_sample)
6467

65-
assert extracted_split.get_page_count() == 3
68+
assert extracted_split.page_count == 3

0 commit comments

Comments
 (0)