Skip to content

Commit 0bfbf06

Browse files
committed
rework again
1 parent 981f99e commit 0bfbf06

6 files changed

Lines changed: 43 additions & 47 deletions

File tree

mindee/pdf/extracted_pdf.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@ class ExtractedPDF:
1414
"""PDF content as a byte stream."""
1515
filename: str
1616
"""Name of the file when writing to disk."""
17-
_page_range: tuple[int, int]
17+
_page_indexes: list[int]
1818

1919
def __init__(
    self, pdf_byte_stream: BinaryIO, filename: str, page_indexes: list[int]
):
    """
    Store the content and metadata of an extracted PDF.

    :param pdf_byte_stream: PDF content as a byte stream.
    :param filename: Name of the file when writing to disk.
    :param page_indexes: 0-based indexes of all pages taken from the original PDF.
    """
    self._page_indexes = page_indexes
    self.filename = filename
    self.buffer = pdf_byte_stream
2525

2626
def write_to_file(self, output_path: Path | str):
2727
"""
@@ -49,15 +49,13 @@ def as_input_source(self) -> BytesInput:
4949
return BytesInput(self.buffer.read(), self.filename)
5050

5151
@property
def page_indexes(self) -> list[int]:
    """Return the 0-based indexes of all pages taken from the original PDF."""
    indexes = self._page_indexes
    return indexes
5957

6058
@property
def page_count(self) -> int:
    """Return the number of pages in this PDF file (one per stored page index)."""
    extracted_indexes = self._page_indexes
    return len(extracted_indexes)

mindee/pdf/pdf_extractor.py

Lines changed: 22 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -42,23 +42,36 @@ def __init__(self, local_input: LocalInputSource):
4242
pdf_image.save(self._source_pdf, format="PDF")
4343

4444
@requires_pypdfium2
def extract_single_document(self, page_indexes: list[int]) -> ExtractedPDF:
    """
    Create a new PDF from the given pages of the source PDF.

    :param page_indexes: 0-based indexes of the pages to take from the original PDF.
    :return: The extracted PDF, holding the new PDF's byte buffer, a generated
        filename and the page indexes that were used.
    :raises MindeeError: If ``page_indexes`` is empty or contains an index that
        is out of range.
    """
    if not page_indexes:
        raise MindeeError("Empty indexes aren't allowed for extraction.")
    for page_index in page_indexes:
        # Indexes are 0-based, so the last valid index is `_page_count - 1`
        # (assumes `_page_count` is the number of pages in the source PDF).
        # Negative indexes are rejected as well: they would otherwise reach
        # the PDF library and yield a misleading generated filename.
        if not 0 <= page_index < self._page_count:
            raise MindeeError(f"Index {page_index} is out of range.")

    # Rewind the source so it is read from the start on every extraction.
    self._source_pdf.seek(0)
    new_pdf = pdfium.PdfDocument.new()
    pdf = pdfium.PdfDocument(self._source_pdf)
    new_pdf.import_pages(pdf, page_indexes)
    bytes_io = io.BytesIO()
    new_pdf.save(bytes_io)

    # The filename is built from the first and last listed indexes.
    return ExtractedPDF(
        pdf_byte_stream=bytes_io,
        filename=self._make_filename(page_indexes[0], page_indexes[-1]),
        page_indexes=page_indexes,
    )
5972

6073
@requires_pypdfium2
61-
def extract_sub_documents(
74+
def extract_multiple_documents(
6275
self, page_indexes: list[list[int]]
6376
) -> list[ExtractedPDF]:
6477
"""
@@ -67,35 +80,13 @@ def extract_sub_documents(
6780
:param page_indexes: 2D list of numbers, representing page indexes.
6881
:return: A list of created PDFS.
6982
"""
83+
if len(page_indexes) < 1:
84+
raise MindeeError("No indexes provided.")
7085
extracted_pdfs: list[ExtractedPDF] = []
71-
extension = Path(self._filename).suffix
72-
stem = Path(self._filename).stem
7386
for page_index_elem in page_indexes:
74-
if not page_index_elem or len(page_index_elem) == 0:
75-
raise MindeeError("Empty indexes aren't allowed for extraction.")
76-
for page_index in page_index_elem:
77-
if page_index > self._page_count:
78-
raise MindeeError(f"Index {page_index} is out of range.")
79-
first_page = page_index_elem[0]
80-
last_page = page_index_elem[len(page_index_elem) - 1]
81-
extracted_pdf = ExtractedPDF(
82-
self.cut_pages(page_index_elem),
83-
f"{stem}_pages-{(first_page + 1):03d}-{(last_page + 1):03d}{extension}",
84-
(first_page, last_page),
85-
)
86-
extracted_pdfs.append(extracted_pdf)
87+
extracted_pdfs.append(self.extract_single_document(page_index_elem))
8788
return extracted_pdfs
8889

89-
def extract_documents(
90-
self,
91-
page_indexes: list[list[int]],
92-
) -> list[ExtractedPDF]:
93-
"""
94-
Extracts complete PDFs from the document.
95-
96-
:param page_indexes: List of sub-lists of pages to keep.
97-
:return: A list of extracted invoices.
98-
"""
99-
if len(page_indexes) < 1:
100-
raise MindeeError("No indexes provided.")
101-
return self.extract_sub_documents(page_indexes)
90+
def _make_filename(self, first_page: int, last_page: int) -> str:
    """
    Build the filename for a PDF extracted from the source file.

    :param first_page: 0-based index of the first extracted page.
    :param last_page: 0-based index of the last extracted page.
    :return: The source file's stem followed by the 1-based, zero-padded
        first and last page numbers, always with a ``.pdf`` extension.
    """
    base_name = Path(self._filename).stem
    # Page numbers in the filename are 1-based, hence the +1.
    start_number = first_page + 1
    end_number = last_page + 1
    return f"{base_name}_pages-{start_number:03d}-{end_number:03d}.pdf"

mindee/v1/pdf/pdf_extractor.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ def extract_invoices(
2424
if len(page_indexes) < 1:
2525
raise MindeeError("No indexes provided.")
2626
if not isinstance(page_indexes[0], InvoiceSplitterV1InvoicePageGroup):
27-
return self.extract_sub_documents(page_indexes) # type: ignore
27+
return self.extract_multiple_documents(page_indexes) # type: ignore
2828

2929
if not strict:
3030
indexes_as_list = [page_index.page_indexes for page_index in page_indexes] # type: ignore
31-
return self.extract_sub_documents(indexes_as_list)
31+
return self.extract_multiple_documents(indexes_as_list)
3232
correct_page_indexes: list[list[int]] = []
3333
current_list: list[int] = []
3434
previous_confidence: float | None = None
@@ -49,4 +49,4 @@ def extract_invoices(
4949
correct_page_indexes.append(current_list)
5050
correct_page_indexes.append(page_list)
5151
previous_confidence = confidence
52-
return self.extract_sub_documents(correct_page_indexes)
52+
return self.extract_multiple_documents(correct_page_indexes)

mindee/v2/file_operations/split.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,4 @@ def extract_multiple_splits(
3535
page_groups.append(list(range(split[0], split[1] + 1)))
3636
if len(splits) < 1:
3737
raise MindeeError("No indexes provided.")
38-
return ExtractedPDFs(pdf_extractor.extract_sub_documents(page_groups))
38+
return ExtractedPDFs(pdf_extractor.extract_multiple_documents(page_groups))

tests/v1/extraction/test_pdf_extractor.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from mindee.v1.product.invoice_splitter.invoice_splitter_v1_document import (
99
InvoiceSplitterV1Document,
1010
)
11-
from tests.utils import V1_PRODUCT_DATA_DIR
11+
from tests.utils import OUTPUT_DIR, V1_PRODUCT_DATA_DIR
1212

1313

1414
@pytest.fixture
@@ -39,8 +39,12 @@ def test_image_should_extract_pdf(invoice_default_sample_path):
3939
jpg_input = PathInput(invoice_default_sample_path)
4040
assert not jpg_input.is_pdf()
4141
extractor = PDFExtractor(jpg_input)
42-
extracted_pdfs = extractor.extract_documents([[0]])
43-
assert len(extracted_pdfs) == 1
42+
extracted_pdf = extractor.extract_single_document([0])
43+
assert extracted_pdf.page_count == 1
44+
assert extracted_pdf.page_indexes == [0]
45+
assert extracted_pdf.filename == "default_sample_pages-001-001.pdf"
46+
extracted_pdf.write_to_file(OUTPUT_DIR)
47+
assert (OUTPUT_DIR / extracted_pdf.filename).exists()
4448

4549

4650
@pytest.mark.pillow

tests/v2/file_operations/test_split_operation.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,13 @@ def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path):
4747
assert len(extracted_splits) == 3
4848

4949
assert extracted_splits[0].page_count == 1
50+
assert extracted_splits[0].page_indexes == [0]
5051
assert extracted_splits[0].filename == "invoice_5p_pages-001-001.pdf"
5152
assert extracted_splits[1].page_count == 3
53+
assert extracted_splits[1].page_indexes == [1, 2, 3]
5254
assert extracted_splits[1].filename == "invoice_5p_pages-002-004.pdf"
5355
assert extracted_splits[2].page_count == 1
56+
assert extracted_splits[2].page_indexes == [4]
5457
assert extracted_splits[2].filename == "invoice_5p_pages-005-005.pdf"
5558

5659

0 commit comments

Comments
 (0)