From 9f95dd3d3b69ec56cdcb71380a8679f46c929adf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Mon, 22 Jun 2026 13:49:49 +0200 Subject: [PATCH] :bug: :boom: harmonize Crop and Split extraction --- .github/workflows/_publish-code.yml | 2 +- .github/workflows/_publish-docs.yml | 2 +- .github/workflows/_static-analysis.yml | 2 +- .github/workflows/_test-integrations.yml | 4 ++-- .github/workflows/_test-smoke.yml | 2 +- .github/workflows/_test-units.yml | 4 ++-- .../cropFiles.ts => image/extractedImages.ts} | 2 +- src/image/index.ts | 1 + src/pdf/extractedPdfs.ts | 7 +++++++ src/pdf/index.ts | 2 ++ src/v2/fileOperations/crop.ts | 16 +++++++++------- src/v2/fileOperations/split.ts | 12 ++++++------ src/v2/fileOperations/splitFiles.ts | 8 -------- src/v2/product/crop/cropItem.ts | 6 +++--- src/v2/product/crop/cropResponse.ts | 12 ------------ src/v2/product/crop/cropResult.ts | 12 ++++++++++++ src/v2/product/split/splitRange.ts | 7 ++++--- src/v2/product/split/splitResponse.ts | 15 --------------- src/v2/product/split/splitResult.ts | 15 +++++++++++++++ tests/data | 2 +- tests/v2/fileOperations/crop.spec.ts | 16 +++++++--------- tests/v2/fileOperations/split.spec.ts | 13 +++++-------- 22 files changed, 81 insertions(+), 81 deletions(-) rename src/{v2/fileOperations/cropFiles.ts => image/extractedImages.ts} (66%) create mode 100644 src/pdf/extractedPdfs.ts delete mode 100644 src/v2/fileOperations/splitFiles.ts diff --git a/.github/workflows/_publish-code.yml b/.github/workflows/_publish-code.yml index cb7b728e8..9ccb1a0d4 100644 --- a/.github/workflows/_publish-code.yml +++ b/.github/workflows/_publish-code.yml @@ -11,7 +11,7 @@ jobs: steps: - name: Check out Git repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Install Dependencies run: sudo apt-get install pcregrep diff --git a/.github/workflows/_publish-docs.yml b/.github/workflows/_publish-docs.yml index 6617f7c47..81d92d1a2 100644 --- a/.github/workflows/_publish-docs.yml +++ b/.github/workflows/_publish-docs.yml @@ -11,7 +11,7 @@ jobs: steps: - name: Check out Git repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Node.js uses: actions/setup-node@v6 diff --git a/.github/workflows/_static-analysis.yml b/.github/workflows/_static-analysis.yml index 9dce9e20b..3aea4f433 100644 --- a/.github/workflows/_static-analysis.yml +++ b/.github/workflows/_static-analysis.yml @@ -10,7 +10,7 @@ jobs: steps: - name: Check out Git repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Node.js uses: actions/setup-node@v6 diff --git a/.github/workflows/_test-integrations.yml b/.github/workflows/_test-integrations.yml index bceb234a5..855d5ce17 100644 --- a/.github/workflows/_test-integrations.yml +++ b/.github/workflows/_test-integrations.yml @@ -33,7 +33,7 @@ jobs: steps: - name: Check out Git repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: recursive @@ -84,7 +84,7 @@ jobs: steps: - name: Check out Git repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: recursive diff --git a/.github/workflows/_test-smoke.yml b/.github/workflows/_test-smoke.yml index b847bda59..b87164d41 100644 --- a/.github/workflows/_test-smoke.yml +++ b/.github/workflows/_test-smoke.yml @@ -27,7 +27,7 @@ jobs: steps: - name: Check out Git repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: recursive diff --git a/.github/workflows/_test-units.yml b/.github/workflows/_test-units.yml index aafbf97b2..f93c6a4f8 100644 --- a/.github/workflows/_test-units.yml +++ b/.github/workflows/_test-units.yml @@ -18,7 +18,7 @@ jobs: steps: - name: Check out Git repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: recursive @@ -56,7 +56,7 @@ jobs: steps: - name: Check out Git repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: recursive diff --git a/src/v2/fileOperations/cropFiles.ts b/src/image/extractedImages.ts similarity index 66% rename from src/v2/fileOperations/cropFiles.ts rename to src/image/extractedImages.ts index 61d3ff22b..2839523f1 100644 --- a/src/v2/fileOperations/cropFiles.ts +++ b/src/image/extractedImages.ts @@ -1,6 +1,6 @@ import { ExtractedImage } from "@/image/index.js"; -export class CropFiles extends Array { +export class ExtractedImages extends Array { constructor(...items: ExtractedImage[]) { super(...items); } diff --git a/src/image/index.ts b/src/image/index.ts index 5ace9a21c..2b1a8dde5 100644 --- a/src/image/index.ts +++ b/src/image/index.ts @@ -1,3 +1,4 @@ export { compressImage } from "./imageCompressor.js"; export { ExtractedImage } from "./extractedImage.js"; +export { ExtractedImages } from "./extractedImages.js"; export { extractFromPage } from "./imageExtractor.js"; diff --git a/src/pdf/extractedPdfs.ts b/src/pdf/extractedPdfs.ts new file mode 100644 index 000000000..19771b0f1 --- /dev/null +++ b/src/pdf/extractedPdfs.ts @@ -0,0 +1,7 @@ +import { ExtractedPdf } from "@/pdf/extractedPdf.js"; + +export class ExtractedPdfs extends Array { + constructor(...items: ExtractedPdf[]) { + super(...items); + } +} diff --git a/src/pdf/index.ts b/src/pdf/index.ts index 752c5ffd0..182ce9d95 100644 --- a/src/pdf/index.ts +++ b/src/pdf/index.ts @@ -2,3 +2,5 @@ export { extractPages, countPages } from "./pdfOperation.js"; export type { SplitPdf } from "./pdfOperation.js"; export { compressPdf } from "./pdfCompressor.js"; export { hasSourceText } from "./pdfUtils.js"; +export { ExtractedPdf } from "./extractedPdf.js"; +export { ExtractedPdfs } from "./extractedPdfs.js"; diff --git a/src/v2/fileOperations/crop.ts b/src/v2/fileOperations/crop.ts index 764a0947e..f178d431c 100644 --- a/src/v2/fileOperations/crop.ts +++ b/src/v2/fileOperations/crop.ts @@ -3,8 +3,7 @@ import { CropItem } from "@/v2/product/crop/index.js"; import { MindeeError } from "@/errors/index.js"; import { extractImagesFromPolygon } from "@/image/imageExtractor.js"; import { Polygon } from "@/geometry/index.js"; -import { CropFiles } from "@/v2/fileOperations/cropFiles.js"; -import { ExtractedImage } from "@/image/index.js"; +import { ExtractedImage, ExtractedImages } from "@/image/index.js"; import { logger } from "@/logger.js"; @@ -12,9 +11,12 @@ import { logger } from "@/logger.js"; * Extracts a single specified crop from a given input source. * @param inputSource Local input source. * @param crop Crop to extract. + * @param quality JPEG quality of extracted image. */ -export async function extractSingleCrop(inputSource: LocalInputSource, crop: CropItem): Promise { - return (await extractCrops(inputSource, [crop]))[0]; +export async function extractSingleCrop( + inputSource: LocalInputSource, crop: CropItem, quality?: number +): Promise { + return (await extractMultipleCrops(inputSource, [crop], quality))[0]; } @@ -25,11 +27,11 @@ export async function extractSingleCrop(inputSource: LocalInputSource, crop: Cro * @param quality JPEG quality of extracted images. * @return a list of extracted files, as a CropFiles object. */ -export async function extractCrops( +export async function extractMultipleCrops( inputSource: LocalInputSource, crops: CropItem[], quality?: number , -): Promise { +): Promise { if (crops.length === 0) { throw new MindeeError("No crop indexes provided."); } @@ -44,5 +46,5 @@ export async function extractCrops( polygonsByPage.get(pageId)!.push(crop.location.polygon); } const extractedCrops = await extractImagesFromPolygon(inputSource, polygonsByPage, quality); - return new CropFiles(...extractedCrops); + return new ExtractedImages(...extractedCrops); } diff --git a/src/v2/fileOperations/split.ts b/src/v2/fileOperations/split.ts index 2370befc8..da4a3fd99 100644 --- a/src/v2/fileOperations/split.ts +++ b/src/v2/fileOperations/split.ts @@ -1,9 +1,9 @@ import { LocalInputSource } from "@/input/index.js"; import { MindeeError } from "@/errors/index.js"; import { PdfExtractor } from "@/pdf/pdfExtractor.js"; -import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; import { logger } from "@/logger.js"; import { ExtractedPdf } from "@/pdf/extractedPdf.js"; +import { ExtractedPdfs } from "@/pdf/extractedPdfs.js"; /** * Extracts a single specified split from a @@ -11,7 +11,7 @@ import { ExtractedPdf } from "@/pdf/extractedPdf.js"; * @param split */ export async function extractSingleSplit(inputSource: LocalInputSource, split: number[]) { - return await extractSplits(inputSource, [split]); + return await extractMultipleSplits(inputSource, [split]); } /** @@ -21,7 +21,7 @@ export async function extractSingleSplit(inputSource: LocalInputSource, split: n * @return a list of extracted files. * @throws MindeeError if no indexes are provided. */ -export async function extractSplits(inputSource: LocalInputSource, splits: number[][]): Promise { +export async function extractMultipleSplits(inputSource: LocalInputSource, splits: number[][]): Promise { const pageGroups = splits.filter(e => e.length > 0); if (pageGroups.length === 0) { throw new MindeeError("No valid split indexes provided."); @@ -32,14 +32,14 @@ export async function extractSplits(inputSource: LocalInputSource, splits: numbe await pdfExtractor.init(); if (splits.length === 0) { - return new SplitFiles(); + return new ExtractedPdfs(); } const pageCount = await pdfExtractor.getPageCount(); if (splits.length === 1 && splits[0].at(-1) === pageCount-1) { - return new SplitFiles(new ExtractedPdf(inputSource.fileObject as Buffer, inputSource.filename, pageCount)); + return new ExtractedPdfs(new ExtractedPdf(inputSource.fileObject as Buffer, inputSource.filename, pageCount)); } const subDocuments = await pdfExtractor.extractSubDocuments(pageGroups); - return new SplitFiles(...subDocuments); + return new ExtractedPdfs(...subDocuments); } /** diff --git a/src/v2/fileOperations/splitFiles.ts b/src/v2/fileOperations/splitFiles.ts deleted file mode 100644 index 14dcb7553..000000000 --- a/src/v2/fileOperations/splitFiles.ts +++ /dev/null @@ -1,8 +0,0 @@ -import { ExtractedPdf } from "@/pdf/extractedPdf.js"; - -export class SplitFiles extends Array { - - constructor(...args: ExtractedPdf[]) { - super(...args); - } -} diff --git a/src/v2/product/crop/cropItem.ts b/src/v2/product/crop/cropItem.ts index e0b06d1c1..cce46560d 100644 --- a/src/v2/product/crop/cropItem.ts +++ b/src/v2/product/crop/cropItem.ts @@ -1,7 +1,7 @@ import { FieldLocation } from "@/v2/parsing/inference/field/index.js"; import { StringDict } from "@/parsing/index.js"; import { LocalInputSource } from "@/input/index.js"; -import { extractCrops } from "@/v2/fileOperations/crop.js"; +import { extractSingleCrop } from "@/v2/fileOperations/crop.js"; import { ExtractedImage } from "@/image/index.js"; import { ExtractionResponse } from "@/v2/product/index.js"; @@ -36,7 +36,7 @@ export class CropItem { * @param inputSource The input file to extract from. * @param quality Optional quality parameter for image extraction, default is undefined (full quality). */ - async extractFromFile(inputSource: LocalInputSource, quality: number = 1): Promise{ - return (await extractCrops(inputSource, [this], quality))[0]; + async extractFromInputSource(inputSource: LocalInputSource, quality: number = 1): Promise{ + return (await extractSingleCrop(inputSource, this, quality)); } } diff --git a/src/v2/product/crop/cropResponse.ts b/src/v2/product/crop/cropResponse.ts index 0f63e316a..e9f7d2a39 100644 --- a/src/v2/product/crop/cropResponse.ts +++ b/src/v2/product/crop/cropResponse.ts @@ -1,7 +1,4 @@ -import { LocalInputSource } from "@/input/index.js"; import { StringDict } from "@/parsing/stringDict.js"; -import { extractCrops } from "@/v2/fileOperations/crop.js"; -import { CropFiles } from "@/v2/fileOperations/cropFiles.js"; import { BaseResponse } from "@/v2/parsing/index.js"; import { CropInference } from "./cropInference.js"; @@ -18,13 +15,4 @@ export class CropResponse extends BaseResponse { super(serverResponse); this.inference = new CropInference(serverResponse["inference"]); } - - /** - * Extracts all crops from an input. - * @param inputSource The input file to extract from. - * @param quality Optional quality parameter for image extraction, default is undefined (full quality). - */ - async extractFromFile(inputSource: LocalInputSource, quality: number = 1): Promise { - return await extractCrops(inputSource, this.inference.result.crops, quality); - } } diff --git a/src/v2/product/crop/cropResult.ts b/src/v2/product/crop/cropResult.ts index 9a0c527ac..d6d1957d5 100644 --- a/src/v2/product/crop/cropResult.ts +++ b/src/v2/product/crop/cropResult.ts @@ -1,5 +1,8 @@ import { StringDict } from "@/parsing/stringDict.js"; import { CropItem } from "@/v2/product/crop/cropItem.js"; +import { LocalInputSource } from "@/input/index.js"; +import { extractMultipleCrops } from "@/v2/fileOperations/crop.js"; +import { ExtractedImages } from "@/image/extractedImages.js"; export class CropResult { /** @@ -15,4 +18,13 @@ export class CropResult { const crops = this.crops.map(item => item.toString()).join("\n"); return `Crops\n=====\n${crops}`; } + + /** + * Extracts a single crop from an input. + * @param inputSource The input file to extract from. + * @param quality Optional quality parameter for image extraction, default is undefined (full quality). + */ + async extractFromInputSource(inputSource: LocalInputSource, quality: number = 1): Promise{ + return (await extractMultipleCrops(inputSource, this.crops, quality)); + } } diff --git a/src/v2/product/split/splitRange.ts b/src/v2/product/split/splitRange.ts index 27018cdfd..30bb92090 100644 --- a/src/v2/product/split/splitRange.ts +++ b/src/v2/product/split/splitRange.ts @@ -1,7 +1,8 @@ import { StringDict } from "@/parsing/index.js"; import { LocalInputSource } from "@/input/index.js"; -import { expandRange, extractSplits } from "@/v2/fileOperations/split.js"; +import { expandRange, extractMultipleSplits } from "@/v2/fileOperations/split.js"; import { ExtractionResponse } from "@/v2/product/index.js"; +import { ExtractedPdf } from "@/pdf/index.js"; /** * Split inference result. @@ -38,8 +39,8 @@ export class SplitRange { * Extracts a single split from the input file. * @param inputSource The input file to extract from. */ - async extractFromFile(inputSource: LocalInputSource) { + async extractFromFile(inputSource: LocalInputSource): Promise { const pageRange = [expandRange(this.pageRange as [number, number])]; - return (await extractSplits(inputSource, pageRange))[0]; + return (await extractMultipleSplits(inputSource, pageRange))[0]; } } diff --git a/src/v2/product/split/splitResponse.ts b/src/v2/product/split/splitResponse.ts index 123ab2866..891cb30de 100644 --- a/src/v2/product/split/splitResponse.ts +++ b/src/v2/product/split/splitResponse.ts @@ -1,9 +1,6 @@ import { StringDict } from "@/parsing/stringDict.js"; import { SplitInference } from "./splitInference.js"; import { BaseResponse } from "@/v2/parsing/index.js"; -import { LocalInputSource } from "@/input/index.js"; -import { expandRange, extractSplits } from "@/v2/fileOperations/split.js"; -import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; export class SplitResponse extends BaseResponse { /** @@ -18,16 +15,4 @@ export class SplitResponse extends BaseResponse { super(serverResponse); this.inference = new SplitInference(serverResponse["inference"]); } - - /** - * Extracts all splits from an input PDF. - * @param inputSource The input file to extract from. - */ - async extractFromFile(inputSource: LocalInputSource): Promise{ - const splits: number[][] = []; - for (const split of this.inference.result.splits) { - splits.push(expandRange(split.pageRange as [number, number])); - } - return await extractSplits(inputSource, splits); - } } diff --git a/src/v2/product/split/splitResult.ts b/src/v2/product/split/splitResult.ts index af3b4ec76..08411d8af 100644 --- a/src/v2/product/split/splitResult.ts +++ b/src/v2/product/split/splitResult.ts @@ -1,5 +1,8 @@ import { SplitRange } from "./splitRange.js"; import { StringDict } from "@/parsing/index.js"; +import { LocalInputSource } from "@/input/index.js"; +import { ExtractedPdfs } from "@/pdf/index.js"; +import { extractMultipleSplits, expandRange } from "@/v2/fileOperations/split.js"; /** * Split result info. @@ -14,6 +17,18 @@ export class SplitResult { this.splits = rawResponse.splits.map((split: StringDict) => new SplitRange(split)); } + /** + * Extracts all splits from an input PDF. + * @param inputSource The input file to extract from. + */ + async extractFromInputSource(inputSource: LocalInputSource): Promise{ + const splits: number[][] = []; + for (const split of this.splits) { + splits.push(expandRange(split.pageRange as [number, number])); + } + return await extractMultipleSplits(inputSource, splits); + } + toString(): string { let splits = "\n"; if (this.splits.length > 0) { diff --git a/tests/data b/tests/data index 13093f3a4..2d7fcf8f5 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 13093f3a48de212ef26889df71199c1a2a9d1478 +Subproject commit 2d7fcf8f591f6d7f40e39862965325e6a8a21874 diff --git a/tests/v2/fileOperations/crop.spec.ts b/tests/v2/fileOperations/crop.spec.ts index eaa84fb4f..819a63bd1 100644 --- a/tests/v2/fileOperations/crop.spec.ts +++ b/tests/v2/fileOperations/crop.spec.ts @@ -1,7 +1,6 @@ import { loadOptionalDependency } from "@/dependency/index.js"; import { ExtractedImage } from "@/image/index.js"; import { PathInput } from "@/index.js"; -import { extractCrops } from "@/v2/fileOperations/crop.js"; import { LocalResponse } from "@/v2/parsing/index.js"; import { CropResponse } from "@/v2/product/crop/cropResponse.js"; @@ -58,7 +57,7 @@ describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { path.join(cropPath, "default_sample.json") ); - const extractedCrops = await response.extractFromFile(inputSample); + const extractedCrops = await response.inference.result.extractFromInputSource(inputSample); assert.strictEqual(extractedCrops.length, 2); @@ -66,7 +65,8 @@ describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { const dimensions = await getFileDimensions(extractedCrops[0].buffer, sharp); assert.strictEqual(Math.round(dimensions.width), 2201); assert.strictEqual(Math.round(dimensions.height), 4314); - const localExtract: ExtractedImage = await response.inference.result.crops[0].extractFromFile(inputSample); + const cropItem = response.inference.result.crops[0]; + const localExtract: ExtractedImage = await cropItem.extractFromInputSource(inputSample); assert.ok(localExtract.buffer.equals(extractedCrops[0].buffer)); }); @@ -79,7 +79,7 @@ describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { path.join(cropPath, "default_sample.json") ); - const extractedCrops = await response.extractFromFile(inputSample, 0.5); + const extractedCrops = await response.inference.result.extractFromInputSource(inputSample, 0.5); assert.strictEqual(extractedCrops.length, 2); @@ -87,7 +87,8 @@ describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { const dimensions = await getFileDimensions(extractedCrops[0].buffer, sharp); assert.strictEqual(Math.round(dimensions.width), Math.round(2201 * 0.5)); assert.strictEqual(Math.round(dimensions.height), Math.round(4314 * 0.5)); - const localExtract: ExtractedImage = await response.inference.result.crops[0].extractFromFile(inputSample, 0.5); + const cropItem = response.inference.result.crops[0]; + const localExtract: ExtractedImage = await cropItem.extractFromInputSource(inputSample, 0.5); assert.ok(localExtract.buffer.equals(extractedCrops[0].buffer)); }); @@ -100,10 +101,7 @@ describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { path.join(cropPath, "crop_multiple.json") ); - const extractedCrops = await extractCrops( - inputSample, - response.inference.result.crops - ); + const extractedCrops = await response.inference.result.extractFromInputSource(inputSample); assert.strictEqual(extractedCrops.length, 2); diff --git a/tests/v2/fileOperations/split.spec.ts b/tests/v2/fileOperations/split.spec.ts index fdedf699b..fd5baf221 100644 --- a/tests/v2/fileOperations/split.spec.ts +++ b/tests/v2/fileOperations/split.spec.ts @@ -1,8 +1,5 @@ import { PathInput } from "@/index.js"; -import { ExtractedPdf } from "@/pdf/extractedPdf.js"; -import { extractSplits } from "@/v2/fileOperations/split.js"; -import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; - +import { extractMultipleSplits } from "@/v2/fileOperations/split.js"; import { LocalResponse } from "@/v2/parsing/index.js"; import { SplitResponse } from "@/v2/product/split/splitResponse.js"; import assert from "node:assert/strict"; @@ -29,7 +26,7 @@ describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => path.join(splitPath, "split_single.json") ); - const extractedSplits = await response.extractFromFile(inputSample); + const extractedSplits = await response.inference.result.extractFromInputSource(inputSample); assert.strictEqual(extractedSplits.length, 1); @@ -49,7 +46,7 @@ describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => path.join(splitPath, "split_multiple.json") ); - const extractedSplits = await response.extractFromFile(inputSample); + const extractedSplits = await response.inference.result.extractFromInputSource(inputSample); assert.strictEqual(extractedSplits.length, 3); @@ -67,7 +64,7 @@ describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => const bufferInput2 = extractedSplits[2].asSource(); const count2 = await bufferInput2.getPageCount(); assert.strictEqual(count2, 1); - const localExtract: ExtractedPdf = await response.inference.result.splits[0].extractFromFile(inputSample); + const localExtract = await response.inference.result.splits[0].extractFromFile(inputSample); assert.ok(extractedSplits[0].buffer.equals(localExtract.buffer)); }); @@ -75,7 +72,7 @@ describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => const inputSample = new PathInput({ inputPath: path.join(splitPath, "invoice_5p.pdf") }); - const splitFiles: SplitFiles = await extractSplits(inputSample, [[0, 1, 2, 3, 4]]); + const splitFiles = await extractMultipleSplits(inputSample, [[0, 1, 2, 3, 4]]); assert(splitFiles.length === 1); assert(splitFiles[0].pageCount === 5); assert(splitFiles[0].buffer === inputSample.fileObject);