Skip to content

Commit 2b48c4e

Browse files
committed
🐛 harmonize extractors
1 parent 631ab3d commit 2b48c4e

13 files changed

Lines changed: 187 additions & 151 deletions

File tree

src/main/java/com/mindee/image/ExtractedImage.java

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ public class ExtractedImage {
1919
private final String filename;
2020
private final String saveFormat;
2121
private final int pageId;
22+
private final int elementId;
2223

2324
/**
2425
* Default constructor.
@@ -27,11 +28,18 @@ public class ExtractedImage {
2728
* @param filename Name of the extracted image.
2829
* @param saveFormat Format to save the image as, defaults to PNG.
2930
*/
30-
public ExtractedImage(BufferedImage image, String filename, String saveFormat, int pageId) {
31+
public ExtractedImage(
32+
BufferedImage image,
33+
String filename,
34+
String saveFormat,
35+
int pageId,
36+
int elementId
37+
) {
3138
this.image = image;
3239
this.filename = filename;
3340
this.saveFormat = saveFormat;
3441
this.pageId = pageId;
42+
this.elementId = elementId;
3543
}
3644

3745
/**
@@ -53,11 +61,10 @@ public void writeToFile(String outputPath) throws IOException {
5361
* @throws IOException Throws if the file can't be accessed.
5462
*/
5563
public void writeToFile(Path outputPath) throws IOException {
56-
if (Files.isDirectory(outputPath)) {
57-
outputPath = outputPath.resolve(this.filename);
64+
if (!Files.isDirectory(outputPath)) {
65+
throw new IllegalArgumentException("Provided path is not a directory.");
5866
}
59-
var outputfile = outputPath.toFile();
60-
ImageIO.write(this.image, this.saveFormat, outputfile);
67+
ImageIO.write(this.image, this.saveFormat, outputPath.resolve(this.filename).toFile());
6168
}
6269

6370
/**

src/main/java/com/mindee/image/ImageExtractor.java

Lines changed: 28 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,17 @@ public class ImageExtractor {
2222
public ImageExtractor(LocalInputSource source) throws IOException {
2323

2424
this.pageImages = new ArrayList<>();
25+
this.filename = source.getFilename();
2526

2627
if (source.isPDF()) {
2728
this.saveFormat = "jpg";
2829
var pdfPageImages = getPDFRasterizer().PDFToImages(source.getFile(), source.getFilename());
2930
for (PDFPageImage pdfPageImage : pdfPageImages) {
3031
this.pageImages.add(pdfPageImage.getImage());
3132
}
32-
this.filename = source.getFilename() + "." + this.saveFormat;
3333
} else {
34-
this.filename = source.getFilename();
3534
String[] splitName = InputSourceUtils.splitNameStrict(this.filename);
3635
this.saveFormat = splitName[1].toLowerCase();
37-
3836
var input = new ByteArrayInputStream(source.getFile());
3937
this.pageImages.add(ImageIO.read(input));
4038
}
@@ -64,53 +62,29 @@ public int getPageCount() {
6462
*
6563
* @param <FieldT> Type of field (needs to support positioning data).
6664
* @param fields List of Fields to extract.
67-
* @param pageIndex The page index to extract, begins at 0.
65+
* @param pageId The page index to extract, begins at 0.
6866
* @return The {@link ExtractedImage}s for the given fields; fields without position data are skipped.
6967
*/
7068
public <FieldT extends PositionDataField> ExtractedImages extractImagesFromPage(
7169
List<FieldT> fields,
72-
int pageIndex
70+
int pageId
7371
) {
74-
return extractImagesFromPage(fields, pageIndex, this.filename);
75-
}
76-
77-
/**
78-
* Extract multiple images on a given page from a list of fields having position data.
79-
*
80-
* @param <FieldT> Type of field (needs to support positioning data).
81-
* @param fields List of Fields to extract.
82-
* @param pageIndex The page index to extract, begins at 0.
83-
* @param outputName The base output filename, must have an image extension.
84-
* @return A list of {@link ExtractedImage}.
85-
*/
86-
public <FieldT extends PositionDataField> ExtractedImages extractImagesFromPage(
87-
List<FieldT> fields,
88-
int pageIndex,
89-
String outputName
90-
) {
91-
String filename;
92-
if (this.getPageCount() > 1) {
93-
String[] splitName = InputSourceUtils.splitNameStrict(outputName);
94-
filename = splitName[0] + "." + this.saveFormat;
95-
} else {
96-
filename = outputName;
97-
}
98-
return extractFromPage(fields, pageIndex, filename);
72+
return extractFromPage(fields, pageId, this.filename);
9973
}
10074

10175
private <FieldT extends PositionDataField> ExtractedImages extractFromPage(
10276
List<FieldT> fields,
103-
int pageIndex,
77+
int pageId,
10478
String outputName
10579
) {
106-
String[] splitName = InputSourceUtils.splitNameStrict(outputName);
107-
var filename = String
108-
.format("%s_page-%3s.%s", splitName[0], pageIndex + 1, splitName[1])
109-
.replace(" ", "0");
110-
11180
var extractedImages = new ExtractedImages();
112-
for (int i = 0; i < fields.size(); i++) {
113-
ExtractedImage extractedImage = extractImage(fields.get(i), pageIndex, i + 1, filename);
81+
for (int elementId = 0; elementId < fields.size(); elementId++) {
82+
ExtractedImage extractedImage = extractImage(
83+
fields.get(elementId),
84+
pageId,
85+
elementId,
86+
outputName
87+
);
11488
if (extractedImage != null) {
11589
extractedImages.add(extractedImage);
11690
}
@@ -123,33 +97,31 @@ private <FieldT extends PositionDataField> ExtractedImages extractFromPage(
12397
*
12498
* @param <FieldT> Type of field (needs to support positioning data).
12599
* @param field The field to extract.
126-
* @param index The index to use for naming the extracted image.
100+
* @param elementId The 0-based index of the element on the page; the generated filename uses elementId + 1.
127101
* @param filename Name of the source file; only its base name is kept, the extension comes from the extractor's save format.
128-
* @param pageIndex The page index to extract, begins at 0.
102+
* @param pageId The page index to extract, begins at 0.
129103
* @return The {@link ExtractedImage}, or <code>null</code> if the field does not have valid
130104
* position data.
131105
*/
132106
public <FieldT extends PositionDataField> ExtractedImage extractImage(
133107
FieldT field,
134-
int pageIndex,
135-
int index,
108+
int pageId,
109+
int elementId,
136110
String filename
137111
) {
138112
String[] splitName = InputSourceUtils.splitNameStrict(filename);
139-
String saveFormat = splitName[1].toLowerCase();
140113
var polygon = field.getPolygon();
141114
if (polygon == null) {
142115
return null;
143116
}
144-
String fieldFilename = splitName[0]
145-
+ String.format("_%3s", index).replace(" ", "0")
146-
+ "."
147-
+ saveFormat;
148117
return new ExtractedImage(
149-
extractImage(polygon.getAsBbox(), pageIndex),
150-
fieldFilename,
151-
saveFormat,
152-
pageIndex
118+
extractImage(polygon.getAsBbox(), pageId),
119+
String
120+
.format("%s_page-%3s-item-%3s.%s", splitName[0], pageId + 1, elementId + 1, this.saveFormat)
121+
.replace(" ", "0"),
122+
this.saveFormat,
123+
pageId,
124+
elementId
153125
);
154126
}
155127

@@ -158,17 +130,17 @@ public <FieldT extends PositionDataField> ExtractedImage extractImage(
158130
*
159131
* @param <FieldT> Type of field (needs to support positioning data).
160132
* @param field The field to extract.
161-
* @param index The index to use for naming the extracted image.
162-
* @param pageIndex The 0-based page index to extract.
133+
* @param elementId The 0-based index of the element on the page; the generated filename uses elementId + 1.
134+
* @param pageId The 0-based page index to extract.
163135
* @return The {@link ExtractedImage}, or <code>null</code> if the field does not have valid
164136
* position data.
165137
*/
166138
public <FieldT extends PositionDataField> ExtractedImage extractImage(
167139
FieldT field,
168-
int pageIndex,
169-
int index
140+
int pageId,
141+
int elementId
170142
) {
171-
return extractImage(field, pageIndex, index, this.filename);
143+
return extractImage(field, pageId, elementId, this.filename);
172144
}
173145

174146
private BufferedImage extractImage(Bbox bbox, int pageIndex) {

src/main/java/com/mindee/pdf/BasePDFExtractor.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,9 @@ public ExtractedPDFs extractSubDocuments(List<List<Integer>> pageIndexes) throws
8888
protected String makeFilename(List<Integer> pageNumbers) {
8989
String[] splitName = InputSourceUtils.splitNameStrict(filename);
9090
return splitName[0]
91-
+ String.format("_%3s", pageNumbers.get(0)).replace(" ", "0")
91+
+ String.format("_pages-%3s", pageNumbers.get(0) + 1).replace(" ", "0")
9292
+ "-"
93-
+ String.format("%3s", pageNumbers.get(pageNumbers.size() - 1)).replace(" ", "0")
93+
+ String.format("%3s", pageNumbers.get(pageNumbers.size() - 1) + 1).replace(" ", "0")
9494
+ "."
9595
+ splitName[1];
9696
}

src/main/java/com/mindee/pdf/ExtractedPDF.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,11 @@ public ExtractedPDF(byte[] fileBytes, String filename) {
3333
* @throws IOException Throws if the file can't be accessed.
3434
*/
3535
public void writeToFile(Path outputPath) throws IOException {
36-
if (Files.isDirectory(outputPath)) {
37-
outputPath = outputPath.resolve(this.filename);
36+
if (!Files.isDirectory(outputPath)) {
37+
throw new IllegalArgumentException("Provided path is not a directory.");
3838
}
39-
Files.write(outputPath, this.fileBytes);
39+
40+
Files.write(outputPath.resolve(this.filename), this.fileBytes);
4041
}
4142

4243
/**

src/main/java/com/mindee/v2/fileoperations/Crop.java

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
import com.mindee.v2.product.crop.CropItem;
88
import java.io.IOException;
99
import java.util.List;
10+
import java.util.Map;
11+
import java.util.stream.Collectors;
12+
import java.util.stream.IntStream;
1013

1114
public class Crop {
1215
private final ImageExtractor imageExtractor;
@@ -15,21 +18,41 @@ public Crop(LocalInputSource inputSource) throws IOException {
1518
this.imageExtractor = new ImageExtractor(inputSource);
1619
}
1720

18-
public ExtractedImage extractSingleCrop(CropItem cropItem) throws IOException {
21+
public ExtractedImage extractSingleCrop(CropItem cropItem) {
1922
return this.imageExtractor
2023
.extractImage(cropItem.getLocation(), cropItem.getLocation().getPage(), 0);
2124
}
2225

2326
public ExtractedImages extractMultipleCrops(List<CropItem> cropItems) {
24-
var extractedImages = new ExtractedImages();
25-
for (int i = 0; i < cropItems.size(); i++) {
26-
var cropItem = cropItems.get(i);
27-
extractedImages
28-
.add(
29-
this.imageExtractor
30-
.extractImage(cropItem.getLocation(), cropItem.getLocation().getPage(), i + 1)
31-
);
27+
if (cropItems == null || cropItems.isEmpty()) {
28+
return new ExtractedImages();
3229
}
30+
31+
// Group crops by page, preserving insertion order
32+
Map<Integer, List<CropItem>> cropsByPage = cropItems
33+
.stream()
34+
.collect(
35+
Collectors
36+
.groupingBy(
37+
item -> item.getLocation().getPage(),
38+
java.util.LinkedHashMap::new,
39+
Collectors.toList()
40+
)
41+
);
42+
43+
var extractedImages = new ExtractedImages();
44+
cropsByPage
45+
.forEach(
46+
(page, pageCrops) -> IntStream
47+
.range(0, pageCrops.size())
48+
.forEach(
49+
elementId -> extractedImages
50+
.add(
51+
this.imageExtractor
52+
.extractImage(pageCrops.get(elementId).getLocation(), page, elementId)
53+
)
54+
)
55+
);
3356
return extractedImages;
3457
}
3558
}

src/test/java/com/mindee/TestingUtilities.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,17 @@
99
import org.junit.jupiter.api.Assertions;
1010

1111
public class TestingUtilities {
12+
public static void deleteRecursively(Path path) throws IOException {
13+
if (Files.exists(path)) {
14+
try (var entries = Files.walk(path)) {
15+
entries
16+
.sorted(java.util.Comparator.reverseOrder())
17+
.map(Path::toFile)
18+
.forEach(java.io.File::delete);
19+
}
20+
}
21+
}
22+
1223
public static Path getResourcePath(String filePath) {
1324
return Paths.get("src/test/resources/" + filePath);
1425
}

src/test/java/com/mindee/image/ImageExtractorTest.java

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,9 @@ public void givenAnImage_shouldExtractPositionFields() throws IOException {
7474
LocalInputSource source = extractedImage.asInputSource();
7575
Assertions
7676
.assertEquals(
77-
String.format("default_sample_page-001_%3s.jpg", i + 1).replace(" ", "0"),
77+
String
78+
.format("default_sample_page-%3s-item-%3s.jpg", page.getPageId() + 1, i + 1)
79+
.replace(" ", "0"),
7880
source.getFilename()
7981
);
8082
}
@@ -93,28 +95,22 @@ public void givenAnImage_shouldExtractValueFields() throws IOException {
9395

9496
for (Page<BarcodeReaderV1Document> page : inference.getPages()) {
9597
List<ExtractedImage> codes1D = extractor
96-
.extractImagesFromPage(
97-
page.getPrediction().getCodes1D(),
98-
page.getPageId(),
99-
"barcodes_1D.png"
100-
);
98+
.extractImagesFromPage(page.getPrediction().getCodes1D(), page.getPageId());
10199
for (int i = 0; i < codes1D.size(); i++) {
102100
ExtractedImage extractedImage = codes1D.get(i);
103101
Assertions.assertNotNull(extractedImage.getImage());
104102
LocalInputSource source = extractedImage.asInputSource();
105103
Assertions
106104
.assertEquals(
107-
String.format("barcodes_1D_page-001_%3s.png", i + 1).replace(" ", "0"),
105+
String
106+
.format("default_sample_page-%3s-item-%3s.jpg", page.getPageId() + 1, i + 1)
107+
.replace(" ", "0"),
108108
source.getFilename()
109109
);
110110
extractedImage.writeToFile(getResourcePath("output/"));
111111
}
112112
List<ExtractedImage> codes2D = extractor
113-
.extractImagesFromPage(
114-
page.getPrediction().getCodes2D(),
115-
page.getPageId(),
116-
"barcodes_2D.png"
117-
);
113+
.extractImagesFromPage(page.getPrediction().getCodes2D(), page.getPageId());
118114
for (ExtractedImage extractedImage : codes2D) {
119115
Assertions.assertNotNull(extractedImage.getImage());
120116
extractedImage.writeToFile(getResourcePath("output/"));

src/test/java/com/mindee/v1/fileoperations/InvoiceSplitterExtractionIT.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,10 @@ public void givenAPDF_shouldExtractInvoices() throws IOException, InterruptedExc
7070
List<ExtractedPDF> extractedPDFsStrict = extractor
7171
.extractInvoices(inference.getPrediction().getInvoicePageGroups(), false);
7272
Assertions.assertEquals(2, extractedPDFsStrict.size());
73-
Assertions.assertEquals("default_sample_000-000.pdf", extractedPDFsStrict.get(0).getFilename());
74-
Assertions.assertEquals("default_sample_001-001.pdf", extractedPDFsStrict.get(1).getFilename());
73+
Assertions
74+
.assertEquals("default_sample_pages-001-001.pdf", extractedPDFsStrict.get(0).getFilename());
75+
Assertions
76+
.assertEquals("default_sample_pages-002-002.pdf", extractedPDFsStrict.get(1).getFilename());
7577

7678
PredictResponse<InvoiceV4> invoice0 = getInvoicePrediction(
7779
extractedPDFsStrict.get(0).asInputSource()

src/test/java/com/mindee/v1/fileoperations/InvoiceSplitterExtractionTest.java

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,12 @@ public void givenAPDF_shouldExtractInvoicesNoStrict() throws IOException {
3131
var extractedPDFSNoStrict = extractor
3232
.extractInvoices(inference.getPrediction().getInvoicePageGroups(), false);
3333
Assertions.assertEquals(3, extractedPDFSNoStrict.size());
34-
Assertions.assertEquals("invoice_5p_000-000.pdf", extractedPDFSNoStrict.get(0).getFilename());
35-
Assertions.assertEquals("invoice_5p_001-003.pdf", extractedPDFSNoStrict.get(1).getFilename());
36-
Assertions.assertEquals("invoice_5p_004-004.pdf", extractedPDFSNoStrict.get(2).getFilename());
34+
Assertions
35+
.assertEquals("invoice_5p_pages-001-001.pdf", extractedPDFSNoStrict.get(0).getFilename());
36+
Assertions
37+
.assertEquals("invoice_5p_pages-002-004.pdf", extractedPDFSNoStrict.get(1).getFilename());
38+
Assertions
39+
.assertEquals("invoice_5p_pages-005-005.pdf", extractedPDFSNoStrict.get(2).getFilename());
3740
}
3841

3942
@Test
@@ -48,7 +51,9 @@ public void givenAPDF_shouldExtractInvoicesStrict() throws IOException {
4851
var extractedPDFStrict = extractor
4952
.extractInvoices(inference.getPrediction().getInvoicePageGroups(), true);
5053
Assertions.assertEquals(2, extractedPDFStrict.size());
51-
Assertions.assertEquals("invoice_5p_000-000.pdf", extractedPDFStrict.get(0).getFilename());
52-
Assertions.assertEquals("invoice_5p_001-004.pdf", extractedPDFStrict.get(1).getFilename());
54+
Assertions
55+
.assertEquals("invoice_5p_pages-001-001.pdf", extractedPDFStrict.get(0).getFilename());
56+
Assertions
57+
.assertEquals("invoice_5p_pages-002-005.pdf", extractedPDFStrict.get(1).getFilename());
5358
}
5459
}

0 commit comments

Comments
 (0)