@@ -22,19 +22,17 @@ public class ImageExtractor {
2222 public ImageExtractor (LocalInputSource source ) throws IOException {
2323
2424 this .pageImages = new ArrayList <>();
25+ this .filename = source .getFilename ();
2526
2627 if (source .isPDF ()) {
2728 this .saveFormat = "jpg" ;
2829 var pdfPageImages = getPDFRasterizer ().PDFToImages (source .getFile (), source .getFilename ());
2930 for (PDFPageImage pdfPageImage : pdfPageImages ) {
3031 this .pageImages .add (pdfPageImage .getImage ());
3132 }
32- this .filename = source .getFilename () + "." + this .saveFormat ;
3333 } else {
34- this .filename = source .getFilename ();
3534 String [] splitName = InputSourceUtils .splitNameStrict (this .filename );
3635 this .saveFormat = splitName [1 ].toLowerCase ();
37-
3836 var input = new ByteArrayInputStream (source .getFile ());
3937 this .pageImages .add (ImageIO .read (input ));
4038 }
@@ -64,53 +62,29 @@ public int getPageCount() {
6462 *
6563 * @param <FieldT> Type of field (needs to support positioning data).
6664 * @param fields List of Fields to extract.
67- * @param pageIndex The page index to extract, begins at 0.
65+ * @param pageId The page index to extract, begins at 0.
6866 * @return A list of {@link ExtractedImage}.
6967 */
public <FieldT extends PositionDataField> ExtractedImages extractImagesFromPage(
    List<FieldT> fields,
    int pageId
) {
  // Output names are always derived from the source filename stored at construction time.
  final String baseName = this.filename;
  return extractFromPage(fields, pageId, baseName);
}
10074
10175 private <FieldT extends PositionDataField > ExtractedImages extractFromPage (
10276 List <FieldT > fields ,
103- int pageIndex ,
77+ int pageId ,
10478 String outputName
10579 ) {
106- String [] splitName = InputSourceUtils .splitNameStrict (outputName );
107- var filename = String
108- .format ("%s_page-%3s.%s" , splitName [0 ], pageIndex + 1 , splitName [1 ])
109- .replace (" " , "0" );
110-
11180 var extractedImages = new ExtractedImages ();
112- for (int i = 0 ; i < fields .size (); i ++) {
113- ExtractedImage extractedImage = extractImage (fields .get (i ), pageIndex , i + 1 , filename );
81+ for (int elementId = 0 ; elementId < fields .size (); elementId ++) {
82+ ExtractedImage extractedImage = extractImage (
83+ fields .get (elementId ),
84+ pageId ,
85+ elementId ,
86+ outputName
87+ );
11488 if (extractedImage != null ) {
11589 extractedImages .add (extractedImage );
11690 }
@@ -123,33 +97,31 @@ private <FieldT extends PositionDataField> ExtractedImages extractFromPage(
12397 *
12498 * @param <FieldT> Type of field (needs to support positioning data).
12599 * @param field The field to extract.
126- * @param index The index to use for naming the extracted image.
100+ * @param elementId The 0-based index of the element on the page; 1 is added to it when naming the extracted image.
127101 * @param filename Name of the file.
128- * @param pageIndex The page index to extract, begins at 0.
102+ * @param pageId The page index to extract, begins at 0.
129103 * @return The {@link ExtractedImage}, or <code>null</code> if the field does not have valid
130104 * position data.
131105 */
132106 public <FieldT extends PositionDataField > ExtractedImage extractImage (
133107 FieldT field ,
134- int pageIndex ,
135- int index ,
108+ int pageId ,
109+ int elementId ,
136110 String filename
137111 ) {
138112 String [] splitName = InputSourceUtils .splitNameStrict (filename );
139- String saveFormat = splitName [1 ].toLowerCase ();
140113 var polygon = field .getPolygon ();
141114 if (polygon == null ) {
142115 return null ;
143116 }
144- String fieldFilename = splitName [0 ]
145- + String .format ("_%3s" , index ).replace (" " , "0" )
146- + "."
147- + saveFormat ;
148117 return new ExtractedImage (
149- extractImage (polygon .getAsBbox (), pageIndex ),
150- fieldFilename ,
151- saveFormat ,
152- pageIndex
118+ extractImage (polygon .getAsBbox (), pageId ),
119+ String
120+ .format ("%s_page-%3s-item-%3s.%s" , splitName [0 ], pageId + 1 , elementId + 1 , this .saveFormat )
121+ .replace (" " , "0" ),
122+ this .saveFormat ,
123+ pageId ,
124+ elementId
153125 );
154126 }
155127
@@ -158,17 +130,17 @@ public <FieldT extends PositionDataField> ExtractedImage extractImage(
158130 *
159131 * @param <FieldT> Type of field (needs to support positioning data).
160132 * @param field The field to extract.
161- * @param index The index to use for naming the extracted image.
162- * @param pageIndex The 0-based page index to extract.
133+ * @param elementId The 0-based index of the element on the page; 1 is added to it when naming the extracted image.
134+ * @param pageId The 0-based page index to extract.
163135 * @return The {@link ExtractedImage}, or <code>null</code> if the field does not have valid
164136 * position data.
165137 */
166138 public <FieldT extends PositionDataField > ExtractedImage extractImage (
167139 FieldT field ,
168- int pageIndex ,
169- int index
140+ int pageId ,
141+ int elementId
170142 ) {
171- return extractImage (field , pageIndex , index , this .filename );
143+ return extractImage (field , pageId , elementId , this .filename );
172144 }
173145
174146 private BufferedImage extractImage (Bbox bbox , int pageIndex ) {
0 commit comments