Skip to content

Commit f73130f

Browse files
committed
🐛 fix method naming
1 parent fbad75c commit f73130f

8 files changed

Lines changed: 41 additions & 51 deletions

File tree

mindee/image/extracted_image.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ def __init__(
4848
self._element_id = 0 if element_id is None else element_id
4949

5050
@requires_pillow
51-
def save_to_file(self, output_path: Path | str):
51+
def write_to_file(self, output_path: Path | str):
5252
"""
5353
Saves the document to a file.
5454

mindee/image/extracted_images.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,4 +9,4 @@ class ExtractedImages(list[ExtractedImage]):
99
def save_all_to_disk(self, output_path: Path | str) -> None:
1010
"""Save all extracted images to disk."""
1111
for image in self:
12-
image.save_to_file(output_path)
12+
image.write_to_file(output_path)

mindee/pdf/extracted_pdf.py

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,44 +3,27 @@
33
from pathlib import Path
44
from typing import BinaryIO
55

6-
from mindee.dependencies.checkers import PYPDFIUM2_AVAILABLE
7-
from mindee.dependencies.decorators import requires_pypdfium2
86
from mindee.error.mindee_error import MindeeError
97
from mindee.input.bytes_input import BytesInput
108

11-
if PYPDFIUM2_AVAILABLE:
12-
# pylint: disable=import-error
13-
import pypdfium2 as pdfium
14-
else:
15-
pdfium = None # pylint: disable=invalid-name
16-
179

1810
class ExtractedPDF:
1911
"""An extracted sub-Pdf."""
2012

2113
buffer: BinaryIO
14+
"""PDF content as a byte stream."""
2215
filename: str
23-
_page_indexes: tuple[int, int]
16+
"""Name of the file when writing to disk."""
17+
_page_range: tuple[int, int]
2418

2519
def __init__(
26-
self, pdf_byte_stream: BinaryIO, filename: str, page_indexes: tuple[int, int]
20+
self, pdf_byte_stream: BinaryIO, filename: str, page_range: tuple[int, int]
2721
):
2822
self.buffer = pdf_byte_stream
2923
self.filename = filename
30-
self._page_indexes = page_indexes
31-
32-
@requires_pypdfium2
33-
def get_page_count(self) -> int:
34-
"""Get the number of pages in the PDF file."""
35-
try:
36-
pdf = pdfium.PdfDocument(self.buffer)
37-
return len(pdf)
38-
except Exception as e:
39-
raise MindeeError(
40-
"Could not retrieve page count from Extracted PDF object."
41-
) from e
24+
self._page_range = page_range
4225

43-
def save_to_file(self, output_path: Path | str):
26+
def write_to_file(self, output_path: Path | str):
4427
"""
4528
Writes the contents of the current PDF object to a file.
4629
@@ -66,6 +49,15 @@ def as_input_source(self) -> BytesInput:
6649
return BytesInput(self.buffer.read(), self.filename)
6750

6851
@property
69-
def page_indexes(self) -> tuple[int, int]:
70-
"""This PDF was extracted from this page range of the original PDF."""
71-
return self._page_indexes
52+
def page_range(self) -> tuple[int, int]:
53+
"""
54+
This PDF was extracted from this page range of the original PDF.
55+
The first number is the index of the first page.
56+
The second number is the index of the last page.
57+
"""
58+
return self._page_range
59+
60+
@property
61+
def page_count(self) -> int:
62+
"""The number of pages in this PDF file."""
63+
return self._page_range[1] - self._page_range[0] + 1

mindee/pdf/extracted_pdfs.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,4 +10,4 @@ def save_all_to_disk(self, output_path: Path | str) -> None:
1010
"""Save all extracted images to disk."""
1111

1212
for image in self:
13-
image.save_to_file(output_path)
13+
image.write_to_file(output_path)

mindee/pdf/pdf_extractor.py

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -28,23 +28,19 @@ class PDFExtractor:
2828

2929
_source_pdf: BinaryIO
3030
_filename: str
31+
_page_count: int
3132

3233
@requires_pillow
3334
def __init__(self, local_input: LocalInputSource):
3435
self._filename = local_input.filename
36+
self._page_count = local_input.page_count
3537
if local_input.is_pdf():
3638
self._source_pdf = local_input.file_object
3739
else:
3840
pdf_image = Image.open(local_input.file_object)
3941
self._source_pdf = io.BytesIO()
4042
pdf_image.save(self._source_pdf, format="PDF")
4143

42-
@requires_pypdfium2
43-
def get_page_count(self) -> int:
44-
"""Get the number of pages in the PDF file."""
45-
pdf = pdfium.PdfDocument(self._source_pdf)
46-
return len(pdf)
47-
4844
@requires_pypdfium2
4945
def cut_pages(self, page_indexes: list) -> BinaryIO:
5046
"""
@@ -78,7 +74,7 @@ def extract_sub_documents(
7874
if not page_index_elem or len(page_index_elem) == 0:
7975
raise MindeeError("Empty indexes aren't allowed for extraction.")
8076
for page_index in page_index_elem:
81-
if page_index > self.get_page_count():
77+
if page_index > self._page_count:
8278
raise MindeeError(f"Index {page_index} is out of range.")
8379
first_page = page_index_elem[0]
8480
last_page = page_index_elem[len(page_index_elem) - 1]

tests/v1/extraction/test_invoice_splitter_auto_extraction.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ def test_pdf_should_extract_invoices_strict():
4040
)
4141
inference = response.document.inference
4242
pdf_extractor = PDFExtractor(invoice_splitter_input)
43-
assert pdf_extractor.get_page_count() == 2
43+
assert pdf_extractor.page_count == 2
4444

4545
extracted_pdfs_not_strict = pdf_extractor.extract_invoices(
4646
inference.prediction.invoice_page_groups

tests/v1/extraction/test_pdf_extractor.py

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,8 @@ def test_image_should_extract_pdf(invoice_default_sample_path):
3939
jpg_input = PathInput(invoice_default_sample_path)
4040
assert not jpg_input.is_pdf()
4141
extractor = PDFExtractor(jpg_input)
42-
assert extractor.get_page_count() == 1
42+
extracted_pdfs = extractor.extract_documents([[0]])
43+
assert len(extracted_pdfs) == 1
4344

4445

4546
@pytest.mark.pillow
@@ -48,20 +49,20 @@ def test_pdf_should_extract_invoices_no_strict(
4849
invoice_splitter_5p_path, loaded_prediction
4950
):
5051
pdf_input = PathInput(invoice_splitter_5p_path)
52+
assert pdf_input.page_count == 5
5153
extractor = PDFExtractor(pdf_input)
52-
assert extractor.get_page_count() == 5
5354
extracted_pdfs_no_strict = extractor.extract_invoices(
5455
loaded_prediction.invoice_page_groups
5556
)
5657

5758
assert len(extracted_pdfs_no_strict) == 3
58-
assert extracted_pdfs_no_strict[0].get_page_count() == 1
59+
assert extracted_pdfs_no_strict[0].page_count == 1
5960
assert extracted_pdfs_no_strict[0].filename == "invoice_5p_pages-001-001.pdf"
6061

61-
assert extracted_pdfs_no_strict[1].get_page_count() == 3
62+
assert extracted_pdfs_no_strict[1].page_count == 3
6263
assert extracted_pdfs_no_strict[1].filename == "invoice_5p_pages-002-004.pdf"
6364

64-
assert extracted_pdfs_no_strict[2].get_page_count() == 1
65+
assert extracted_pdfs_no_strict[2].page_count == 1
6566
assert extracted_pdfs_no_strict[2].filename == "invoice_5p_pages-005-005.pdf"
6667

6768

@@ -71,15 +72,16 @@ def test_pdf_should_extract_invoices_strict(
7172
invoice_splitter_5p_path, loaded_prediction
7273
):
7374
pdf_input = PathInput(invoice_splitter_5p_path)
75+
assert pdf_input.page_count == 5
76+
7477
extractor = PDFExtractor(pdf_input)
75-
assert extractor.get_page_count() == 5
7678
extracted_pdfs_strict = extractor.extract_invoices(
7779
loaded_prediction.invoice_page_groups, True
7880
)
7981

8082
assert len(extracted_pdfs_strict) == 2
81-
assert extracted_pdfs_strict[0].get_page_count() == 1
83+
assert extracted_pdfs_strict[0].page_count == 1
8284
assert extracted_pdfs_strict[0].filename == "invoice_5p_pages-001-001.pdf"
8385

84-
assert extracted_pdfs_strict[1].get_page_count() == 4
86+
assert extracted_pdfs_strict[1].page_count == 4
8587
assert extracted_pdfs_strict[1].filename == "invoice_5p_pages-002-005.pdf"

tests/v2/file_operations/test_split_operation.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -32,9 +32,9 @@ def test_default_split():
3232
extracted_splits = response.inference.result.extract_from_input_source(input_sample)
3333
assert len(extracted_splits) == 2
3434

35-
assert extracted_splits[0].get_page_count() == 1
35+
assert extracted_splits[0].page_count == 1
3636
assert extracted_splits[0].filename == "default_sample_pages-001-001.pdf"
37-
assert extracted_splits[1].get_page_count() == 1
37+
assert extracted_splits[1].page_count == 1
3838
assert extracted_splits[1].filename == "default_sample_pages-002-002.pdf"
3939

4040

@@ -46,11 +46,11 @@ def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path):
4646
extracted_splits = response.inference.result.extract_from_input_source(input_sample)
4747
assert len(extracted_splits) == 3
4848

49-
assert extracted_splits[0].get_page_count() == 1
49+
assert extracted_splits[0].page_count == 1
5050
assert extracted_splits[0].filename == "invoice_5p_pages-001-001.pdf"
51-
assert extracted_splits[1].get_page_count() == 3
51+
assert extracted_splits[1].page_count == 3
5252
assert extracted_splits[1].filename == "invoice_5p_pages-002-004.pdf"
53-
assert extracted_splits[2].get_page_count() == 1
53+
assert extracted_splits[2].page_count == 1
5454
assert extracted_splits[2].filename == "invoice_5p_pages-005-005.pdf"
5555

5656

@@ -62,4 +62,4 @@ def test_multi_page_receipt_single_split(splits_5p, splits_multi_page_json_path)
6262
split = response.inference.result.splits[1]
6363
extracted_split = split.extract_from_input_source(input_sample)
6464

65-
assert extracted_split.get_page_count() == 3
65+
assert extracted_split.page_count == 3

0 commit comments

Comments
 (0)