From e3b67db148a1611d635a7b45eea2a4ffa738526c Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Fri, 10 Apr 2026 19:37:10 +0200 Subject: [PATCH 1/9] :sparkles: add support for crop & split operations --- src/image/extractedImage.ts | 12 +- src/image/imageExtractor.ts | 70 +++++++-- src/pdf/extractedPdf.ts | 71 +++++++++ src/pdf/pdfCompressor.ts | 51 +------ src/pdf/pdfExtractor.ts | 142 ++++++++++++++++++ src/pdf/pdfOperation.ts | 37 ++++- src/pdf/pdfUtils.ts | 49 ++++++ .../multiReceiptsExtractor.ts | 36 +---- src/v2/fileOperations/crop.ts | 47 ++++++ src/v2/fileOperations/cropFiles.ts | 7 + src/v2/fileOperations/index.ts | 0 src/v2/fileOperations/split.ts | 45 ++++++ src/v2/fileOperations/splitFiles.ts | 8 + src/v2/product/split/splitRange.ts | 10 ++ src/v2/product/split/splitResponse.ts | 14 ++ tests/index.ts | 1 + tests/v1/extraction/multiReceipts.spec.ts | 1 - tests/v2/fileOperations/crop.integration.ts | 95 ++++++++++++ tests/v2/fileOperations/crop.spec.ts | 104 +++++++++++++ tests/v2/fileOperations/split.integration.ts | 0 tests/v2/fileOperations/split.spec.ts | 92 ++++++++++++ 21 files changed, 793 insertions(+), 99 deletions(-) create mode 100644 src/pdf/extractedPdf.ts create mode 100644 src/pdf/pdfExtractor.ts create mode 100644 src/v2/fileOperations/crop.ts create mode 100644 src/v2/fileOperations/cropFiles.ts create mode 100644 src/v2/fileOperations/index.ts create mode 100644 src/v2/fileOperations/split.ts create mode 100644 src/v2/fileOperations/splitFiles.ts create mode 100644 tests/v2/fileOperations/crop.integration.ts create mode 100644 tests/v2/fileOperations/crop.spec.ts create mode 100644 tests/v2/fileOperations/split.integration.ts create mode 100644 tests/v2/fileOperations/split.spec.ts diff --git a/src/image/extractedImage.ts b/src/image/extractedImage.ts index 583c0640b..f56c7c08d 100644 --- a/src/image/extractedImage.ts +++ b/src/image/extractedImage.ts @@ -15,11 +15,15 @@ import { loadOptionalDependency } from "@/dependency/index.js"; */ export class ExtractedImage { public buffer: Buffer; - protected internalFileName: string; + public filename: string; + public pageId?: number; + public elementId?: number; - protected constructor(buffer: Uint8Array, fileName: string) { + constructor(buffer: Uint8Array, fileName: string, pageId?: number, elementId?: number) { this.buffer = Buffer.from(buffer); - this.internalFileName = fileName; + this.filename = fileName; + this.pageId = pageId; + this.elementId = elementId; } /** @@ -104,7 +108,7 @@ export class ExtractedImage { asSource(): BufferInput { return new BufferInput({ buffer: this.buffer, - filename: this.internalFileName, + filename: this.filename, }); } } diff --git a/src/image/imageExtractor.ts b/src/image/imageExtractor.ts index f8ba56fd3..2efb5f06a 100644 --- a/src/image/imageExtractor.ts +++ b/src/image/imageExtractor.ts @@ -4,6 +4,11 @@ import type * as pdfLibTypes from "@cantoo/pdf-lib"; import { getMinMaxX, getMinMaxY, Polygon } from "@/geometry/index.js"; import { adjustForRotation } from "@/geometry/polygonUtils.js"; import { loadOptionalDependency } from "@/dependency/index.js"; +import { LocalInputSource } from "@/input/index.js"; +import { ExtractedImage } from "@/image/extractedImage.js"; +import { createPdfFromInputSource } from "@/pdf/pdfOperation.js"; +import { logger } from "@/logger.js"; +import { rasterizePage } from "@/pdf/pdfUtils.js"; let pdfLib: typeof pdfLibTypes | null = null; @@ -17,43 +22,79 @@ async function getPdfLib(): Promise { return pdfLib!; } + +/** + * Extracts elements from a PDF document based on a list of bounding boxes. + * @param inputSource The input source to extract from. + * @param polygonsPerPage List of polygons to extract from per page. + * @param upscale Whether to upscale the image. + */ +export async function extractImagesFromPolygon( + inputSource: LocalInputSource, + polygonsPerPage: Map, + upscale: boolean = false +) { + const allExtractedImages: ExtractedImage[] = []; + const pdfDoc = await createPdfFromInputSource(inputSource); + + for (const [pageId, polygons] of polygonsPerPage) { + logger.debug(`Extracting images from page ${pageId}`); + const pdfPage = pdfDoc.getPage(pageId); + const extractions = (await extractFromPage(pdfPage, polygons, true, upscale)); + const extractedImages = extractions.map( + (v, i) => new ExtractedImage(v, inputSource.filename + `_page${pageId}-${i}.jpg`, pageId, i) + ); + allExtractedImages.push(...extractedImages); + } + return allExtractedImages; +} + /** * Extracts elements from a page based off of a list of bounding boxes. * * @param pdfPage PDF Page to extract from. * @param polygons List of coordinates to pull the elements from. + * @param asImage Whether to return the extracted elements as images. + * @param upscale Whether to upscale the image. */ export async function extractFromPage( pdfPage: pdfLibTypes.PDFPage, - polygons: Polygon[] + polygons: Polygon[], + asImage: boolean = false, + upscale: boolean = true ) { const pdfLib = await getPdfLib(); const { width, height } = pdfPage.getSize(); - const extractedElements :Uint8Array[] = []; - // Manual upscale. - // Fixes issues with the OCR. - const qualityScale = 300/72; + const extractedElements: Uint8Array[] = []; + + const qualityScale = upscale ? 300 / 72 : 1; const orientation = pdfPage.getRotation().angle; + const sourceDoc = pdfPage.doc; + const pageIndex = sourceDoc.getPages().indexOf(pdfPage); + for (const origPolygon of polygons) { - const polygon = adjustForRotation(origPolygon, orientation); + logger.debug(`Extracting image with polygon: ${origPolygon.toString()}`); const tempPdf = await pdfLib.PDFDocument.create(); + const [copiedPage] = await tempPdf.copyPages(sourceDoc, [pageIndex]); + + const polygon = adjustForRotation(origPolygon, orientation); + const newWidth = width * (getMinMaxX(polygon).max - getMinMaxX(polygon).min); const newHeight = height * (getMinMaxY(polygon).max - getMinMaxY(polygon).min); - const cropped = await tempPdf.embedPage(pdfPage, { + + const cropped = await tempPdf.embedPage(copiedPage, { left: getMinMaxX(polygon).min * width, right: getMinMaxX(polygon).max * width, top: height - (getMinMaxY(polygon).min * height), bottom: height - (getMinMaxY(polygon).max * height), }); - // Determine the final page dimensions based on orientation let finalWidth: number; let finalHeight: number; if (orientation === 90 || orientation === 270) { - // For 90/270 rotations, swap width and height finalWidth = newHeight * qualityScale; finalHeight = newWidth * qualityScale; } else { @@ -62,15 +103,14 @@ export async function extractFromPage( } const samplePage = tempPdf.addPage([finalWidth, finalHeight]); - samplePage.drawRectangle({ x: 0, y: 0, width: finalWidth, height: finalHeight, + color: pdfLib.rgb(1, 1, 1), }); - // Draw the cropped page with rotation applied if (orientation === 0) { samplePage.drawPage(cropped, { width: newWidth * qualityScale, @@ -102,7 +142,13 @@ export async function extractFromPage( }); } - extractedElements.push(await tempPdf.save()); + const pdfBuffer = Buffer.from(await tempPdf.save()); + if (asImage) { + extractedElements.push(await rasterizePage(pdfBuffer, 0, 100)); + } else { + extractedElements.push(pdfBuffer); + } } + return extractedElements; } diff --git a/src/pdf/extractedPdf.ts b/src/pdf/extractedPdf.ts new file mode 100644 index 000000000..7f578ae30 --- /dev/null +++ b/src/pdf/extractedPdf.ts @@ -0,0 +1,71 @@ +import path from "node:path"; +import { BufferInput, MIMETYPES } from "@/input/index.js"; +import { MindeeError } from "@/errors/index.js"; +import { Buffer } from "node:buffer"; +import { writeFile } from "fs/promises"; +import { logger } from "@/logger.js"; +import { writeFileSync } from "node:fs"; + +export class ExtractedPdf { + public readonly buffer: Buffer; + private readonly filename: string; + public readonly pageCount: number; + + constructor(pdfData: Buffer, filename: string, pageCount: number) { + this.buffer = pdfData; + this.filename = filename; + this.pageCount = pageCount; + } + + /** + * Saves the document to a file. + * + * @param outputPath Path to save the file to. + */ + async saveToFileAsync(outputPath: string) { + const fileExt = path.extname(outputPath).toLowerCase(); + if (fileExt !== ".pdf" && !MIMETYPES.has(fileExt)) { + outputPath += ".pdf"; + } + + try { + await writeFile(path.resolve(outputPath), this.buffer); + logger.info(`File saved successfully to ${path.resolve(outputPath)}.`); + } catch (e) { + if (e instanceof TypeError) { + throw new MindeeError("Invalid path/filename provided."); + } else { + throw e; + } + } + } + + /** + * Saves the document to a file synchronously. + * @param outputPath + */ + saveToFile(outputPath: string){ + try { + writeFileSync(path.resolve(outputPath), this.buffer); + logger.info(`File saved successfully to ${path.resolve(outputPath)}.`); + } catch (e) { + if (e instanceof TypeError) { + throw new MindeeError("Invalid path/filename provided."); + } else { + throw e; + } + } + } + + /** + * Return the file as a Mindee-compatible BufferInput source. + * + * @returns A BufferInput source. + */ + asSource(): BufferInput { + return new BufferInput({ + buffer: this.buffer, + filename: this.filename, + }); + } +} diff --git a/src/pdf/pdfCompressor.ts b/src/pdf/pdfCompressor.ts index aa4f3183f..9442f8c43 100644 --- a/src/pdf/pdfCompressor.ts +++ b/src/pdf/pdfCompressor.ts @@ -1,15 +1,10 @@ import { logger } from "@/logger.js"; -import tmp from "tmp"; -import * as fs from "node:fs"; -// eslint-disable-next-line @typescript-eslint/ban-ts-comment -// @ts-ignore -import type * as popplerTypes from "node-poppler"; // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-ignore import type * as pdfLibTypes from "@cantoo/pdf-lib"; import { compressImage } from "@/image/index.js"; import { loadOptionalDependency } from "@/dependency/index.js"; -import { ExtractedPdfInfo, extractTextFromPdf, hasSourceText } from "./pdfUtils.js"; +import { ExtractedPdfInfo, extractTextFromPdf, hasSourceText, rasterizePage } from "./pdfUtils.js"; let pdfLib: typeof pdfLibTypes | null = null; @@ -159,7 +154,7 @@ async function compressPagesWithQuality( const page = pdfDoc.getPages()[i]; const rasterizedPage = await rasterizePage(pdfData, i + 1, imageQuality); const compressedImage = await compressImage( - Buffer.from(rasterizedPage, "binary"), imageQuality + rasterizedPage, imageQuality ); if (!disableSourceText) { await addTextToPdfPage(page, extractedText); @@ -260,48 +255,6 @@ async function getFontFromName(fontName: string): Promise { return font; } -/** - * Rasterizes a PDF page. - * - * @param pdfData Buffer representation of the entire PDF file. - * @param index Index of the page to rasterize. - * @param quality Quality to apply during rasterization. - */ -async function rasterizePage( - pdfData: Buffer, index: number, quality = 85 -): Promise { - const popplerImport = await loadOptionalDependency( - "node-poppler", "Image Processing" - ); - const poppler = (popplerImport as any).default || popplerImport; - const popplerInstance = new poppler.Poppler(); - const tmpPdf = tmp.fileSync(); - const tempPdfPath = tmpPdf.name; - const antialiasOption: "fast" | "best" | "default" | "good" | "gray" | "none" | "subpixel" = "best"; - try { - await fs.promises.writeFile(tempPdfPath, pdfData); - const options = { - antialias: antialiasOption, - firstPageToConvert: index, - lastPageToConvert: index, - jpegFile: true, - jpegOptions: `quality=${quality}`, - singleFile: true - }; - - const jpegBuffer = await popplerInstance.pdfToCairo(tempPdfPath, undefined, options); - - await fs.promises.unlink(tempPdfPath); - - return jpegBuffer; - } catch (error) { - logger.error("Error rasterizing PDF:", error); - throw error; - } finally { - tmpPdf.removeCallback(); - } -} - /** * Performs linear interpolation between two numbers. * @param start The starting value. diff --git a/src/pdf/pdfExtractor.ts b/src/pdf/pdfExtractor.ts new file mode 100644 index 000000000..44353a6ed --- /dev/null +++ b/src/pdf/pdfExtractor.ts @@ -0,0 +1,142 @@ +// eslint-disable-next-line @typescript-eslint/ban-ts-comment +// @ts-ignore +import type * as pdfLibTypes from "@cantoo/pdf-lib"; +import { LocalInputSource, PageOptions, PageOptionsOperation, PathInput } from "@/input/index.js"; +import { logger } from "@/logger.js"; +import path from "path"; +import { loadOptionalDependency } from "@/dependency/index.js"; +import { MindeeInputSourceError, MindeePdfError } from "@/errors/index.js"; +import { ExtractedPdf } from "@/pdf/extractedPdf.js"; +import { createPdfFromInputSource, extractPages } from "@/pdf/pdfOperation.js"; + +let pdfLib: typeof pdfLibTypes | null = null; + +async function getPdfLib(): Promise { + if (!pdfLib) { + const pdfLibImport = await loadOptionalDependency("@cantoo/pdf-lib", "Text Embedding"); + pdfLib = (pdfLibImport as any).default || pdfLibImport; + } + return pdfLib!; +} + +export class PdfExtractor { + /** + * Buffer containing the PDF data. + * @private + */ + private sourcePdf: Buffer | null = null; + /** + * Filename of the PDF. + * @private + */ + private filename: string | null = null; + /** + * Input document. + * @private + */ + private readonly inputDocument: string | LocalInputSource; + /** + * Whether the extractor has been initialized. + * @private + */ + private initialized: boolean = false; + /** + * PDF library instance. + * @private + */ + private pdfLib: typeof pdfLibTypes | null = null; + + /** + * List of extracted PDFs. + * @private + */ + private extractedPdfs: ExtractedPdf[] | null = null; + + constructor(inputDocument: string | LocalInputSource) { + this.inputDocument = inputDocument; + } + + async init() { + this.pdfLib = await getPdfLib(); + if (typeof this.inputDocument === "string") { + logger.debug(`Loading from path: ${this.inputDocument}`); + try { + const tempPathInput = new PathInput({ inputPath: this.inputDocument }); + await tempPathInput.init(); + if (tempPathInput.isPdf()) { + this.sourcePdf = tempPathInput.fileObject; + } else { + const pdfObject = await createPdfFromInputSource(tempPathInput); + this.sourcePdf = Buffer.from(await pdfObject.save()); + } + } catch { + throw new MindeeInputSourceError("Couldn't generate PDF from input."); + } + this.filename = path.basename(this.inputDocument); + } else { + logger.debug(`Loading document: ${this.inputDocument.filename}`); + await this.inputDocument.init(); + if (this.inputDocument.isPdf()) { + this.sourcePdf = this.inputDocument.fileObject as Buffer; + } else { + const pdfObject = await createPdfFromInputSource(this.inputDocument); + const arrayBuffer = await pdfObject.save(); + this.sourcePdf = Buffer.from(arrayBuffer); + } + this.filename = this.inputDocument.filename; + } + this.initialized = true; + if (!this.sourcePdf) { + throw new MindeePdfError("Could not load PDF source."); + } + } + + /** + * Gets the number of pages in the PDF. + * @returns The number of pages in the PDF. + */ + async getPageCount() { + if (!this.initialized) { + await this.init(); + } + const currentPdf = await this.pdfLib!.PDFDocument.load(this.sourcePdf!, { + ignoreEncryption: true, + password: "" + }); + return currentPdf.getPageCount(); + } + + /** + * Extracts pages from the PDF. + * @param pageIndexes + */ + async extractSubDocuments(pageIndexes: number[][]): Promise { + if (this.extractedPdfs && this.extractedPdfs.length > 0) { + return this.extractedPdfs; + } + if (!this.initialized) { + await this.init(); + } + this.extractedPdfs = []; + for (const pageRange of pageIndexes) { + logger.debug(`Extracting pages ${pageRange.join(", ")}`); + if (pageRange.length === 0) { + throw new MindeeInputSourceError("Empty indexes not allowed for extraction."); + } + const pageOptions: PageOptions = { + pageIndexes: pageRange, + operation: PageOptionsOperation.KeepOnly, + onMinPages: 1, + }; + const splitName = path.basename(this.filename!, path.extname(this.filename!)); + + const startPage = String(pageRange[0] + 1).padStart(3, "0"); + const endPage = String(pageRange[pageRange.length - 1] + 1).padStart(3, "0"); + + const fieldFilename = `${splitName}_page${startPage}-${endPage}.pdf`; + const page = await extractPages(this.sourcePdf!, pageOptions); + this.extractedPdfs.push(new ExtractedPdf(page.file, fieldFilename, pageRange.length)); + } + return this.extractedPdfs; + } +} diff --git a/src/pdf/pdfOperation.ts b/src/pdf/pdfOperation.ts index ebc22bb42..8af6b32a2 100644 --- a/src/pdf/pdfOperation.ts +++ b/src/pdf/pdfOperation.ts @@ -3,9 +3,10 @@ import type * as pdfLibTypes from "@cantoo/pdf-lib"; import { errorHandler } from "@/errors/handler.js"; import { PageOptions, PageOptionsOperation } from "@/input/pageOptions.js"; -import { MindeeError } from "@/errors/index.js"; +import { MindeeError, MindeeInputSourceError } from "@/errors/index.js"; import { logger } from "@/logger.js"; import { loadOptionalDependency } from "@/dependency/index.js"; +import { LocalInputSource } from "@/input/index.js"; let pdfLib: typeof pdfLibTypes | null = null; @@ -111,3 +112,37 @@ export async function countPages(file: Buffer): Promise { }); return currentPdf.getPageCount(); } + + +/** + * Creates a PDF from a local file. Converts images to PDFs if needed. + * @param inputSource The input source to create a PDF from. + */ +export async function createPdfFromInputSource(inputSource: LocalInputSource) { + const pdfLib = await getPdfLib(); + let pdfDoc: pdfLibTypes.PDFDocument; + if (!["image/jpeg", "image/jpg", "image/png", "application/pdf"].includes(inputSource.mimeType)) { + throw new MindeeInputSourceError( + 'Unsupported file type "' + + inputSource.mimeType + + '" Currently supported types are .png, .jpg and .pdf' + ); + } else if (inputSource.isPdf()) { + pdfDoc = await pdfLib.PDFDocument.load(inputSource.fileObject, { + ignoreEncryption: true, + password: "" + }); + } else { + pdfDoc = await pdfLib.PDFDocument.create(); + let image: pdfLibTypes.PDFImage; + if (inputSource.mimeType === "image/png") { + image = await pdfDoc.embedPng(inputSource.fileObject); + } else { + image = await pdfDoc.embedJpg(inputSource.fileObject); + } + const imageDims = image.scale(1); + const pageImage = pdfDoc.addPage([imageDims.width, imageDims.height]); + pageImage.drawImage(image); + } + return pdfDoc; +} diff --git a/src/pdf/pdfUtils.ts b/src/pdf/pdfUtils.ts index ea32f595c..748d9c9bc 100644 --- a/src/pdf/pdfUtils.ts +++ b/src/pdf/pdfUtils.ts @@ -1,8 +1,14 @@ // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-ignore import type * as pdfJsExtractTypes from "pdf.js-extract"; +// eslint-disable-next-line @typescript-eslint/ban-ts-comment +// @ts-ignore +import type * as popplerTypes from "node-poppler"; +import tmp from "tmp"; +import * as fs from "node:fs"; import { MindeePdfError } from "@/errors/index.js"; import { loadOptionalDependency } from "@/dependency/index.js"; +import { logger } from "@/logger.js"; export interface PageTextInfo { @@ -85,3 +91,46 @@ export async function hasSourceText(pdfData: Buffer): Promise { const text = await extractTextFromPdf(pdfData); return text.getConcatenatedText().trim().length > 0; } + +/** + * Rasterizes a PDF page. + * + * @param pdfData Buffer representation of the entire PDF file. + * @param index Index of the page to rasterize. + * @param quality Quality to apply during rasterization. + * @return Buffer containing the rasterized image data. + */ +export async function rasterizePage( + pdfData: Buffer, index: number, quality = 85 +): Promise { + const popplerImport = await loadOptionalDependency( + "node-poppler", "Image Processing" + ); + const poppler = (popplerImport as any).default || popplerImport; + const popplerInstance = new poppler.Poppler(); + const tmpPdf = tmp.fileSync(); + const tempPdfPath = tmpPdf.name; + const antialiasOption: "fast" | "best" | "default" | "good" | "gray" | "none" | "subpixel" = "best"; + try { + await fs.promises.writeFile(tempPdfPath, pdfData); + const options = { + antialias: antialiasOption, + firstPageToConvert: index, + lastPageToConvert: index, + jpegFile: true, + jpegOptions: `quality=${quality}`, + singleFile: true + }; + + const jpegBuffer = await popplerInstance.pdfToCairo(tempPdfPath, undefined, options); + + await fs.promises.unlink(tempPdfPath); + + return Buffer.from(jpegBuffer, "binary"); + } catch (error) { + logger.error("Error rasterizing PDF:", error); + throw error; + } finally { + tmpPdf.removeCallback(); + } +} diff --git a/src/v1/extraction/multiReceiptsExtractor/multiReceiptsExtractor.ts b/src/v1/extraction/multiReceiptsExtractor/multiReceiptsExtractor.ts index c49e37b58..802112f5f 100644 --- a/src/v1/extraction/multiReceiptsExtractor/multiReceiptsExtractor.ts +++ b/src/v1/extraction/multiReceiptsExtractor/multiReceiptsExtractor.ts @@ -1,7 +1,8 @@ // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-ignore import type * as pdfLibTypes from "@cantoo/pdf-lib"; -import { MindeeError, MindeeInputSourceError } from "@/errors/index.js"; +import { MindeeError } from "@/errors/index.js"; +import { createPdfFromInputSource } from "@/pdf/pdfOperation.js"; import { Polygon } from "@/geometry/index.js"; import { MultiReceiptsDetectorV1 } from "@/v1/product/index.js"; import { ExtractedMultiReceiptImage } from "@/v1/extraction/index.js"; @@ -41,35 +42,6 @@ async function extractReceiptsFromPage( return extractedReceipts; } -async function loadPdfDoc(inputFile: LocalInputSource) { - const pdfLib = await getPdfLib(); - let pdfDoc: pdfLibTypes.PDFDocument; - if (!["image/jpeg", "image/jpg", "image/png", "application/pdf"].includes(inputFile.mimeType)) { - throw new MindeeInputSourceError( - 'Unsupported file type "' + - inputFile.mimeType + - '" Currently supported types are .png, .jpg and .pdf' - ); - } else if (inputFile.isPdf()) { - pdfDoc = await pdfLib.PDFDocument.load(inputFile.fileObject, { - ignoreEncryption: true, - password: "" - }); - } else { - pdfDoc = await pdfLib.PDFDocument.create(); - let image: pdfLibTypes.PDFImage; - if (inputFile.mimeType === "image/png") { - image = await pdfDoc.embedPng(inputFile.fileObject); - } else { - image = await pdfDoc.embedJpg(inputFile.fileObject); - } - const imageDims = image.scale(1); - const pageImage = pdfDoc.addPage([imageDims.width, imageDims.height]); - pageImage.drawImage(image); - } - return pdfDoc; -} - /** * Extracts individual receipts from multi-receipts documents. * @@ -86,9 +58,9 @@ export async function extractReceipts( if (!inference.prediction.receipts) { throw new MindeeError("No possible receipts candidates found for MultiReceipts extraction."); } - const pdfDoc = await loadPdfDoc(inputFile); + const pdfDoc = await createPdfFromInputSource(inputFile); for (let pageId = 0; pageId < pdfDoc.getPageCount(); pageId++) { - const [page] = await pdfDoc.copyPages(pdfDoc, [pageId]); + const page = pdfDoc.getPage(pageId); page.setRotation(pdfLib.degrees(inference.pages[pageId].orientation?.value ?? 0)); const receiptPositions = inference.pages[pageId].prediction.receipts.map( (receipt: PositionField) => receipt.boundingBox diff --git a/src/v2/fileOperations/crop.ts b/src/v2/fileOperations/crop.ts new file mode 100644 index 000000000..56c793b40 --- /dev/null +++ b/src/v2/fileOperations/crop.ts @@ -0,0 +1,47 @@ +import { LocalInputSource } from "@/input/index.js"; +import { CropItem } from "@/v2/product/crop/index.js"; +import { MindeeError } from "@/errors/index.js"; +import { extractImagesFromPolygon } from "@/image/imageExtractor.js"; +import { Polygon } from "@/geometry/index.js"; +import { CropFiles } from "@/v2/fileOperations/cropFiles.js"; +import { ExtractedImage } from "@/image/index.js"; +import { logger } from "@/logger.js"; + + +/** + * Extracts a single specified crop from a given input source. + * @param inputSource Local input source. + * @param crop Crop to extract. + */ +export async function extractSingleCrop(inputSource: LocalInputSource, crop: CropItem): Promise { + return (await extractCrops(inputSource, [crop]))[0]; +} + + +/** + * Extracts a list of crops from a document. + * @param inputSource Local input source. + * @param crops List of crops to extract. + * @param upscale Whether to upscale the extracted images. + * @return a list of extracted files, as a CropFiles object. + */ +export async function extractCrops( + inputSource: LocalInputSource, + crops: CropItem[], + upscale: boolean = false +): Promise { + if (crops.length === 0) { + throw new MindeeError("No crop indexes provided."); + } + logger.debug("Extracting crops: " + crops.join(", ")); + const polygonsByPage = new Map(); + for (const crop of crops) { + const pageId: number = crop.location.page; + if (!polygonsByPage.has(pageId)) { + polygonsByPage.set(pageId, []); + } + polygonsByPage.get(pageId)!.push(crop.location.polygon); + } + const extractedCrops = await extractImagesFromPolygon(inputSource, polygonsByPage, upscale); + return new CropFiles(...extractedCrops); +} diff --git a/src/v2/fileOperations/cropFiles.ts b/src/v2/fileOperations/cropFiles.ts new file mode 100644 index 000000000..61d3ff22b --- /dev/null +++ b/src/v2/fileOperations/cropFiles.ts @@ -0,0 +1,7 @@ +import { ExtractedImage } from "@/image/index.js"; + +export class CropFiles extends Array { + constructor(...items: ExtractedImage[]) { + super(...items); + } +} diff --git a/src/v2/fileOperations/index.ts b/src/v2/fileOperations/index.ts new file mode 100644 index 000000000..e69de29bb diff --git a/src/v2/fileOperations/split.ts b/src/v2/fileOperations/split.ts new file mode 100644 index 000000000..d606f4304 --- /dev/null +++ b/src/v2/fileOperations/split.ts @@ -0,0 +1,45 @@ +import { LocalInputSource } from "@/input/index.js"; +import { MindeeError } from "@/errors/index.js"; +import { PdfExtractor } from "@/pdf/pdfExtractor.js"; +import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; +import { logger } from "@/logger.js"; + +/** + * Extracts a single specified split from a + * @param inputSource + * @param split + */ +export async function extractSingleSplit(inputSource: LocalInputSource, split: number[]) { + return await extractSplits(inputSource, [split]); +} + +/** + * Extracts splits as complete PDFs from the document. + * @param inputSource Local input source. + * @param splits List of sub-lists of pages to keep. + * @return a list of extracted files. + * @throws MindeeError if no indexes are provided. + */ +export async function extractSplits(inputSource: LocalInputSource, splits: number[][]) { + const pageGroups = splits.filter(e => e.length > 0); + if (pageGroups.length === 0) { + throw new MindeeError("No valid split indexes provided."); + } + logger.debug("Extracting splits: " + splits.join(", ")); + const pdfExtractor = new PdfExtractor(inputSource); + await pdfExtractor.init(); + + const subDocuments = await pdfExtractor.extractSubDocuments(pageGroups); + return new SplitFiles(...subDocuments); +} + +/** + * Expands a range of pages into a list of page indexes. + * @param range start and end of the page range + */ +export function expandRange(range: [number, number]): number[] { + if (range[0] > range[1]) { + throw new MindeeError("Invalid page range provided."); + } + return Array.from({ length: range[1] - range[0] + 1 }, (_, i) => range[0] + i); +} diff --git a/src/v2/fileOperations/splitFiles.ts b/src/v2/fileOperations/splitFiles.ts new file mode 100644 index 000000000..14dcb7553 --- /dev/null +++ b/src/v2/fileOperations/splitFiles.ts @@ -0,0 +1,8 @@ +import { ExtractedPdf } from "@/pdf/extractedPdf.js"; + +export class SplitFiles extends Array { + + constructor(...args: ExtractedPdf[]) { + super(...args); + } +} diff --git a/src/v2/product/split/splitRange.ts b/src/v2/product/split/splitRange.ts index 0974b4352..843e27a90 100644 --- a/src/v2/product/split/splitRange.ts +++ b/src/v2/product/split/splitRange.ts @@ -1,4 +1,6 @@ import { StringDict } from "@/parsing/index.js"; +import { LocalInputSource } from "@/input/index.js"; +import { extractSplits } from "@/v2/fileOperations/split.js"; /** * Split inference result. @@ -24,4 +26,12 @@ export class SplitRange { const pageRange = this.pageRange.join(","); return `* :Page Range: ${pageRange}\n :Document Type: ${this.documentType}`; } + + /** + * Extracts a single split from the input file. + * @param inputSource The input file to extract from. + */ + async extractFromFile(inputSource: LocalInputSource) { + return (await extractSplits(inputSource, [this.pageRange]))[0]; + } } diff --git a/src/v2/product/split/splitResponse.ts b/src/v2/product/split/splitResponse.ts index 891cb30de..7c1b6ba0d 100644 --- a/src/v2/product/split/splitResponse.ts +++ b/src/v2/product/split/splitResponse.ts @@ -1,6 +1,8 @@ import { StringDict } from "@/parsing/stringDict.js"; import { SplitInference } from "./splitInference.js"; import { BaseResponse } from "@/v2/parsing/index.js"; +import { LocalInputSource } from "@/input/index.js"; +import { expandRange, extractSplits } from "@/v2/fileOperations/split.js"; export class SplitResponse extends BaseResponse { /** @@ -15,4 +17,16 @@ export class SplitResponse extends BaseResponse { super(serverResponse); this.inference = new SplitInference(serverResponse["inference"]); } + + /** + * Extracts all splits from an input PDF. + * @param inputSource The input file to extract from. + */ + async extractFromFile(inputSource: LocalInputSource){ + const splits: number[][] = []; + for (const split of this.inference.result.splits) { + splits.push(expandRange(split.pageRange as [number, number])); + } + return await extractSplits(inputSource, splits); + } } diff --git a/tests/index.ts b/tests/index.ts index 2d161c7d7..69a5c918b 100644 --- a/tests/index.ts +++ b/tests/index.ts @@ -4,6 +4,7 @@ import path from "path"; const currentDirName = dirname(fileURLToPath(import.meta.url)); export const RESOURCE_PATH = path.join(currentDirName, "data"); +export const OUTPUT_PATH = path.join(RESOURCE_PATH, "output"); export const V1_RESOURCE_PATH = path.join(RESOURCE_PATH, "v1"); export const V1_PRODUCT_PATH = path.join(V1_RESOURCE_PATH, "products"); diff --git a/tests/v1/extraction/multiReceipts.spec.ts b/tests/v1/extraction/multiReceipts.spec.ts index f520a3462..49cc7f62d 100644 --- a/tests/v1/extraction/multiReceipts.spec.ts +++ b/tests/v1/extraction/multiReceipts.spec.ts @@ -15,7 +15,6 @@ const dataPath = { multiPageSample: path.join(V1_PRODUCT_PATH, "multi_receipts_detector/multipage_sample.pdf"), }; describe("MindeeV1 - Multi-Receipt Extraction #OptionalDepsRequired", () => { - describe("A single-page multi-receipts document", () => { it("should be split properly.", async () => { const jsonDataNA = await fs.readFile(path.resolve(dataPath.complete)); diff --git a/tests/v2/fileOperations/crop.integration.ts b/tests/v2/fileOperations/crop.integration.ts new file mode 100644 index 000000000..b9ce35927 --- /dev/null +++ b/tests/v2/fileOperations/crop.integration.ts @@ -0,0 +1,95 @@ +import { after, before, beforeEach, describe, it } from "node:test"; +import assert from "node:assert/strict"; +import path from "node:path"; +import * as fs from "node:fs"; + +import { Client, PathInput, BufferInput } from "@/index.js"; +import { Crop } from "@/v2/product/crop/index.js"; +import { Extraction, ExtractionResponse } from "@/v2/product/extraction/index.js"; +import { extractCrops } from "@/v2/fileOperations/crop.js"; +import { V2_PRODUCT_PATH, OUTPUT_PATH } from "../../index.js"; +import { SimpleField } from "@/v2/parsing/inference/field/index.js"; + + +function checkFindocReturn(findocResponse: ExtractionResponse) { + assert.ok(findocResponse.inference.model.id.length > 0); + const totalAmount = findocResponse.inference.result.fields.get("total_amount") as SimpleField; + assert.ok(totalAmount !== undefined); + assert.ok((totalAmount.value as number) > 0); +} + +describe("MindeeV2 - Integration - FileOperation - Crop", { timeout: 120000 }, () => { + let client: Client; + let cropModelId: string; + let findocModelId: string; + + const cropSample = path.join( + V2_PRODUCT_PATH, + "crop", + "default_sample.jpg" + ); + + before(() => { + if (!fs.existsSync(OUTPUT_PATH)) { + fs.mkdirSync(OUTPUT_PATH, { recursive: true }); + } + }); + + beforeEach(() => { + const apiKey = process.env["MINDEE_V2_API_KEY"] ?? ""; + cropModelId = process.env["MINDEE_V2_SE_TESTS_CROP_MODEL_ID"] ?? ""; + findocModelId = process.env["MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID"] ?? ""; + + client = new Client({ apiKey: apiKey, debug: true }); + }); + + after(() => { + const file1 = path.join(OUTPUT_PATH, "crop_001.jpg"); + const file2 = path.join(OUTPUT_PATH, "crop_002.jpg"); + if (fs.existsSync(file1)) fs.rmSync(file1); + if (fs.existsSync(file2)) fs.rmSync(file2); + }); + + it("extracts crops from image correctly", async () => { + const cropInput = new PathInput({ inputPath: cropSample }); + await cropInput.init(); + + const cropParams = { modelId: cropModelId }; + + const response = await client.enqueueAndGetResult( + Crop, cropInput, cropParams + ); + + assert.equal(response.inference.result.crops.length, 2); + + const extractedImages = await extractCrops(cropInput, response.inference.result.crops); + + assert.equal(extractedImages.length, 2); + assert.equal(extractedImages[0].filename, "default_sample.jpg_page0-0.jpg"); + assert.equal(extractedImages[1].filename, "default_sample.jpg_page0-1.jpg"); + + const extractionInput = new BufferInput({ + buffer: extractedImages[0].buffer, + filename: extractedImages[0].filename + }); + const findocParams = { modelId: findocModelId }; + + const invoice0 = await client.enqueueAndGetResult( + Extraction, extractionInput, findocParams + ); + + checkFindocReturn(invoice0); + + const file1Path = path.join(OUTPUT_PATH, "crop_001.jpg"); + const file2Path = path.join(OUTPUT_PATH, "crop_002.jpg"); + + fs.writeFileSync(file1Path, extractedImages[0].buffer); + fs.writeFileSync(file2Path, extractedImages[1].buffer); + + const stat1 = fs.statSync(file1Path); + assert.ok(stat1.size >= 3100000 && stat1.size <= 3200000); + + const stat2 = fs.statSync(file2Path); + assert.ok(stat2.size >= 3200000 && stat2.size <= 3300000); + }); +}); diff --git a/tests/v2/fileOperations/crop.spec.ts b/tests/v2/fileOperations/crop.spec.ts new file mode 100644 index 000000000..7ea50c2bc --- /dev/null +++ b/tests/v2/fileOperations/crop.spec.ts @@ -0,0 +1,104 @@ +import path from "path"; +import assert from "node:assert/strict"; +import { describe, it } from "node:test"; + +import { LocalResponse } from "@/v2/parsing/index.js"; +import { CropResponse } from "@/v2/product/crop/cropResponse.js"; +import { PathInput } from "@/index.js"; +import { extractCrops } from "@/v2/fileOperations/crop.js"; +import { V2_PRODUCT_PATH } from "../../index"; +import { loadOptionalDependency } from "../../../src/dependency"; +import type * as SharpTypes from "sharp"; +import type * as pdfLibTypes from "@cantoo/pdf-lib"; + +const cropPath = path.join(V2_PRODUCT_PATH, "crop"); +let pdfLib: typeof pdfLibTypes | null = null; + +async function getPdfLib(): Promise { + if (!pdfLib) { + const pdfLibImport = await loadOptionalDependency("@cantoo/pdf-lib", "Text Embedding"); + pdfLib = (pdfLibImport as any).default || pdfLibImport; + } + return pdfLib!; +} + +async function loadV2Crop(resourcePath: string): Promise { + const localResponse = new LocalResponse(resourcePath); + await localResponse.init(); + return localResponse.deserializeResponse(CropResponse); +} +/** + * Gets dimensions of a buffer, routing to pdf-lib for PDFs and sharp for images. + */ +async function getFileDimensions(buffer: Buffer, sharpInstance: any) { + const isPdf = buffer.subarray(0, 4).toString("ascii") === "%PDF"; + const pdfLib = await getPdfLib(); + if (isPdf) { + const pdfDoc = await pdfLib.PDFDocument.load(buffer); + const page = pdfDoc.getPage(0); + const { width, height } = page.getSize(); + + return { width, height }; + } + const metadata = await sharpInstance(buffer).metadata(); + return { width: metadata.width, height: metadata.height }; + +} + +describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { + const sharpLoaded = await loadOptionalDependency("sharp", "Image compression"); + const sharp = (sharpLoaded as any).default || sharpLoaded; + await it("should process single page crop split correctly", async () => { + const inputSample = new PathInput({ + inputPath: + path.join(cropPath, "default_sample.jpg") + }); + await inputSample.init(); + const response = await loadV2Crop( + path.join(cropPath, "crop_single.json") + ); + + const extractedCrops = await extractCrops( + inputSample, + response.inference.result.crops + ); + + assert.strictEqual(extractedCrops.length, 1); + + assert.strictEqual(extractedCrops[0].pageId, 0); + const dimensions = await getFileDimensions(extractedCrops[0].buffer, sharp); + assert.strictEqual(Math.round(dimensions.width), 5880); + assert.strictEqual(Math.round(dimensions.height), 3275); + }); + + await it("should process multi page receipt split correctly", async () => { + const inputSample = new PathInput({ + inputPath: + path.join(cropPath, "multipage_sample.pdf") + }); + await inputSample.init(); + const response = await loadV2Crop( + path.join(cropPath, "crop_multiple.json") + ); + + const extractedCrops = await extractCrops( + inputSample, + response.inference.result.crops + ); + + assert.strictEqual(extractedCrops.length, 2); + + assert.strictEqual(extractedCrops[0].pageId, 0); + assert.strictEqual(extractedCrops[0].elementId, 0); + + const dimensions1 = await getFileDimensions(extractedCrops[0].buffer, sharp); + assert.strictEqual(Math.round(dimensions1.width), 325); + assert.strictEqual(Math.round(dimensions1.height), 1579); + + assert.strictEqual(extractedCrops[1].pageId, 0); + assert.strictEqual(extractedCrops[1].elementId, 1); + const dimensions2 = await getFileDimensions(extractedCrops[1].buffer, sharp); + assert.strictEqual(Math.round(dimensions2.width), 391); + assert.strictEqual(Math.round(dimensions2.height), 1439); + }); +}); diff --git a/tests/v2/fileOperations/split.integration.ts b/tests/v2/fileOperations/split.integration.ts new file mode 100644 index 000000000..e69de29bb diff --git a/tests/v2/fileOperations/split.spec.ts b/tests/v2/fileOperations/split.spec.ts new file mode 100644 index 000000000..bc4112c61 --- /dev/null +++ b/tests/v2/fileOperations/split.spec.ts @@ -0,0 +1,92 @@ +import path from "path"; +import assert from "node:assert/strict"; +import { describe, it } from "node:test"; + +import { LocalResponse } from "@/v2/parsing/index.js"; +import { SplitResponse } from "@/v2/product/split/splitResponse.js"; +import { PathInput } from "@/index.js"; +import { V2_PRODUCT_PATH } from "../../index"; +import { loadOptionalDependency } from "../../../src/dependency"; +import type * as pdfLibTypes from "@cantoo/pdf-lib"; + +const splitPath = path.join(V2_PRODUCT_PATH, "split"); +const financialDocumentPath = path.join(V2_PRODUCT_PATH, "extraction", "financial_document"); + +let pdfLib: typeof pdfLibTypes | null = null; + +async function getPdfLib(): Promise { + if (!pdfLib) { + const pdfLibImport = await loadOptionalDependency("@cantoo/pdf-lib", "PDF Parsing"); + pdfLib = (pdfLibImport as any).default || pdfLibImport; + } + return pdfLib!; +} + +async function loadV2Split(resourcePath: string): Promise { + const localResponse = new LocalResponse(resourcePath); + await localResponse.init(); + return localResponse.deserializeResponse(SplitResponse); +} + +/** + * Gets the page count of a buffer, routing to pdf-lib for PDFs. + */ +async function getPageCount(buffer: Buffer): Promise { + const isPdf = buffer.subarray(0, 4).toString("ascii") === "%PDF"; + if (isPdf) { + const pdfLib = await getPdfLib(); + const pdfDoc = await pdfLib.PDFDocument.load(buffer); + return pdfDoc.getPageCount(); + } + return 1; +} + +describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => { + + await it("should process single page split correctly", async () => { + const inputSample = new PathInput({ + inputPath: path.join(financialDocumentPath, "default_sample.jpg") + }); + await inputSample.init(); + + const response = await loadV2Split( + path.join(splitPath, "split_single.json") + ); + + const extractedSplits = await response.extractFromFile(inputSample); + + assert.strictEqual(extractedSplits.length, 1); + + assert.strictEqual(extractedSplits[0].pageCount, 1); + + const count0 = await getPageCount(extractedSplits[0].buffer); + assert.strictEqual(count0, 1); + }); + + await it("should process multi page receipt split correctly", async () => { + const inputSample = new PathInput({ + inputPath: path.join(splitPath, "invoice_5p.pdf") + }); + await inputSample.init(); + + const response = await loadV2Split( + path.join(splitPath, "split_multiple.json") + ); + + const extractedSplits = await response.extractFromFile(inputSample); + + assert.strictEqual(extractedSplits.length, 3); + + assert.strictEqual(extractedSplits[0].pageCount, 1); + const count0 = await getPageCount(extractedSplits[0].buffer); + assert.strictEqual(count0, 1); + + assert.strictEqual(extractedSplits[1].pageCount, 3); + const count1 = await getPageCount(extractedSplits[1].buffer); + assert.strictEqual(count1, 3); + + assert.strictEqual(extractedSplits[2].pageCount, 1); + const count2 = await getPageCount(extractedSplits[2].buffer); + assert.strictEqual(count2, 1); + }); +}); From 9f1d7603f7eec791c2cdf075d0906b91f37fb990 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Mon, 13 Apr 2026 11:04:34 +0200 Subject: [PATCH 2/9] add integration test for split --- tests/v2/fileOperations/crop.integration.ts | 2 +- tests/v2/fileOperations/split.integration.ts | 96 ++++++++++++++++++++ 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/tests/v2/fileOperations/crop.integration.ts b/tests/v2/fileOperations/crop.integration.ts index b9ce35927..181eb1f46 100644 --- a/tests/v2/fileOperations/crop.integration.ts +++ b/tests/v2/fileOperations/crop.integration.ts @@ -18,7 +18,7 @@ function checkFindocReturn(findocResponse: ExtractionResponse) { assert.ok((totalAmount.value as number) > 0); } -describe("MindeeV2 - Integration - FileOperation - Crop", { timeout: 120000 }, () => { +describe("MindeeV2 - Integration - FileOperation - Crop #OptionalDepsRequired", { timeout: 120000 }, () => { let client: Client; let cropModelId: string; let findocModelId: string; diff --git a/tests/v2/fileOperations/split.integration.ts b/tests/v2/fileOperations/split.integration.ts index e69de29bb..b4ac33c57 100644 --- a/tests/v2/fileOperations/split.integration.ts +++ b/tests/v2/fileOperations/split.integration.ts @@ -0,0 +1,96 @@ +import { after, before, beforeEach, describe, it } from "node:test"; +import assert from "node:assert/strict"; +import path from "node:path"; +import * as fs from "node:fs"; + +import { Client, PathInput, BufferInput } from "@/index.js"; +import { Crop } from "@/v2/product/crop/index.js"; +import { Extraction, ExtractionResponse } from "@/v2/product/extraction/index.js"; +import { extractCrops } from "@/v2/fileOperations/crop.js"; +import { V2_PRODUCT_PATH } from "../../index.js"; +import { SimpleField } from "@/v2/parsing/inference/field/index.js"; + +const OUTPUT_DIR = path.join(__dirname, "output"); + +function checkFindocReturn(findocResponse: ExtractionResponse) { + assert.ok(findocResponse.inference.model.id.length > 0); + const totalAmount = findocResponse.inference.result.fields.get("total_amount") as SimpleField; + assert.ok(totalAmount !== undefined); + assert.ok((totalAmount.value as number) > 0); +} + +describe("MindeeV2 - Integration - FileOperation - Crop", { timeout: 120000 }, () => { + let client: Client; + let cropModelId: string; + let findocModelId: string; + + const cropSample = path.join( + V2_PRODUCT_PATH, + "crop", + "default_sample.jpg" + ); + + before(() => { + if (!fs.existsSync(OUTPUT_DIR)) { + fs.mkdirSync(OUTPUT_DIR, { recursive: true }); + } + }); + + beforeEach(() => { + const apiKey = process.env["MINDEE_V2_API_KEY"] ?? ""; + cropModelId = process.env["MINDEE_V2_SE_TESTS_CROP_MODEL_ID"] ?? ""; + findocModelId = process.env["MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID"] ?? ""; + + client = new Client({ apiKey: apiKey, debug: true }); + }); + + after(() => { + const file1 = path.join(OUTPUT_DIR, "crop_001.jpg"); + const file2 = path.join(OUTPUT_DIR, "crop_002.jpg"); + if (fs.existsSync(file1)) fs.rmSync(file1); + if (fs.existsSync(file2)) fs.rmSync(file2); + }); + + it("extracts crops from image correctly", async () => { + const cropInput = new PathInput({ inputPath: cropSample }); + await cropInput.init(); + + const cropParams = { modelId: cropModelId }; + + const response = await client.enqueueAndGetResult( + Crop, cropInput, cropParams + ); + + assert.equal(response.inference.result.crops.length, 2); + + const extractedImages = await extractCrops(cropInput, response.inference.result.crops); + + assert.equal(extractedImages.length, 2); + assert.equal(extractedImages[0].filename, "default_sample.jpg_page0-0.jpg"); + assert.equal(extractedImages[1].filename, "default_sample.jpg_page0-1.jpg"); + + const extractionInput = new BufferInput({ + buffer: extractedImages[0].buffer, + filename: extractedImages[0].filename + }); + const findocParams = { modelId: findocModelId }; + + const invoice0 = await client.enqueueAndGetResult( + Extraction, extractionInput, findocParams + ); + + checkFindocReturn(invoice0 as ExtractionResponse); + + const file1Path = path.join(OUTPUT_DIR, "crop_001.jpg"); + const file2Path = path.join(OUTPUT_DIR, "crop_002.jpg"); + + fs.writeFileSync(file1Path, extractedImages[0].buffer); + fs.writeFileSync(file2Path, extractedImages[1].buffer); + + const stat1 = fs.statSync(file1Path); + assert.ok(stat1.size >= 3170000 && stat1.size <= 3180000); + + const stat2 = fs.statSync(file2Path); + assert.ok(stat2.size >= 3210000 && stat2.size <= 3230000); + }); +}); From 9ff506876ef648692a29eeb8316f139d77f825c1 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:08:24 +0200 Subject: [PATCH 3/9] add support for quality in croppings + fix tests --- src/image/imageExtractor.ts | 31 ++++--- src/pdf/extractedPdf.ts | 2 +- src/pdf/pdfExtractor.ts | 2 +- .../multiReceiptsExtractor.ts | 3 +- src/v2/fileOperations/crop.ts | 6 +- src/v2/fileOperations/split.ts | 10 ++- src/v2/product/crop/cropItem.ts | 12 +++ src/v2/product/crop/cropResponse.ts | 14 ++- src/v2/product/split/splitRange.ts | 5 +- src/v2/product/split/splitResponse.ts | 3 +- tests/v2/fileOperations/crop.spec.ts | 50 ++++++++--- tests/v2/fileOperations/split.integration.ts | 85 ++++++++++++------- tests/v2/fileOperations/split.spec.ts | 20 ++++- 13 files changed, 173 insertions(+), 70 deletions(-) diff --git a/src/image/imageExtractor.ts b/src/image/imageExtractor.ts index 2efb5f06a..8dc8e63a5 100644 --- a/src/image/imageExtractor.ts +++ b/src/image/imageExtractor.ts @@ -1,21 +1,22 @@ // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-ignore -import type * as pdfLibTypes from "@cantoo/pdf-lib"; +import { loadOptionalDependency } from "@/dependency/index.js"; +import { MindeeImageError } from "@/errors/index.js"; import { getMinMaxX, getMinMaxY, Polygon } from "@/geometry/index.js"; import { adjustForRotation } from "@/geometry/polygonUtils.js"; -import { loadOptionalDependency } from "@/dependency/index.js"; -import { LocalInputSource } from "@/input/index.js"; import { ExtractedImage } from "@/image/extractedImage.js"; -import { createPdfFromInputSource } from "@/pdf/pdfOperation.js"; +import { LocalInputSource } from "@/input/index.js"; import { logger } from "@/logger.js"; +import { createPdfFromInputSource } from "@/pdf/pdfOperation.js"; import { rasterizePage } from "@/pdf/pdfUtils.js"; +import type * as pdfLibTypes from "@cantoo/pdf-lib"; let pdfLib: typeof pdfLibTypes | null = null; async function getPdfLib(): Promise { if (!pdfLib) { const pdfLibImport = await loadOptionalDependency( - "@cantoo/pdf-lib", "Text Embedding" + "@cantoo/pdf-lib", "Image Extraction" ); pdfLib = (pdfLibImport as any).default || pdfLibImport; } @@ -27,12 +28,12 @@ async function getPdfLib(): Promise { * Extracts elements from a PDF document based on a list of bounding boxes. * @param inputSource The input source to extract from. * @param polygonsPerPage List of polygons to extract from per page. - * @param upscale Whether to upscale the image. + * @param quality JPEG quality of extracted images. */ export async function extractImagesFromPolygon( inputSource: LocalInputSource, polygonsPerPage: Map, - upscale: boolean = false + quality?: number ) { const allExtractedImages: ExtractedImage[] = []; const pdfDoc = await createPdfFromInputSource(inputSource); @@ -40,7 +41,7 @@ export async function extractImagesFromPolygon( for (const [pageId, polygons] of polygonsPerPage) { logger.debug(`Extracting images from page ${pageId}`); const pdfPage = pdfDoc.getPage(pageId); - const extractions = (await extractFromPage(pdfPage, polygons, true, upscale)); + const extractions = (await extractFromPage(pdfPage, polygons, true, quality)); const extractedImages = extractions.map( (v, i) => new ExtractedImage(v, inputSource.filename + `_page${pageId}-${i}.jpg`, pageId, i) ); @@ -55,19 +56,25 @@ export async function extractImagesFromPolygon( * @param pdfPage PDF Page to extract from. * @param polygons List of coordinates to pull the elements from. * @param asImage Whether to return the extracted elements as images. - * @param upscale Whether to upscale the image. + * @param quality JPEG quality of extracted images, given as number between 0 and 1. */ export async function extractFromPage( pdfPage: pdfLibTypes.PDFPage, polygons: Polygon[], asImage: boolean = false, - upscale: boolean = true + quality?: number, ) { const pdfLib = await getPdfLib(); const { width, height } = pdfPage.getSize(); const extractedElements: Uint8Array[] = []; - - const qualityScale = upscale ? 300 / 72 : 1; + if (quality && (quality < 0)) { + throw new MindeeImageError("Quality must be a number between 0 and 1"); + } + if (quality && quality > 1) { + logger.warn("Quality is greater than 1, this operation will apply a manual upscale on the output." + + " Use only if you know what you are doing."); + } + const qualityScale = quality ?? 1; const orientation = pdfPage.getRotation().angle; const sourceDoc = pdfPage.doc; diff --git a/src/pdf/extractedPdf.ts b/src/pdf/extractedPdf.ts index 7f578ae30..0369e9cc0 100644 --- a/src/pdf/extractedPdf.ts +++ b/src/pdf/extractedPdf.ts @@ -11,7 +11,7 @@ export class ExtractedPdf { private readonly filename: string; public readonly pageCount: number; - constructor(pdfData: Buffer, filename: string, pageCount: number) { + constructor(pdfData: Buffer, filename: string, pageCount: number) { this.buffer = pdfData; this.filename = filename; this.pageCount = pageCount; diff --git a/src/pdf/pdfExtractor.ts b/src/pdf/pdfExtractor.ts index 44353a6ed..78a0ae99a 100644 --- a/src/pdf/pdfExtractor.ts +++ b/src/pdf/pdfExtractor.ts @@ -133,7 +133,7 @@ export class PdfExtractor { const startPage = String(pageRange[0] + 1).padStart(3, "0"); const endPage = String(pageRange[pageRange.length - 1] + 1).padStart(3, "0"); - const fieldFilename = `${splitName}_page${startPage}-${endPage}.pdf`; + const fieldFilename = `${splitName}_page_${startPage}-${endPage}.pdf`; const page = await extractPages(this.sourcePdf!, pageOptions); this.extractedPdfs.push(new ExtractedPdf(page.file, fieldFilename, pageRange.length)); } diff --git a/src/v1/extraction/multiReceiptsExtractor/multiReceiptsExtractor.ts b/src/v1/extraction/multiReceiptsExtractor/multiReceiptsExtractor.ts index 802112f5f..a751b4014 100644 --- a/src/v1/extraction/multiReceiptsExtractor/multiReceiptsExtractor.ts +++ b/src/v1/extraction/multiReceiptsExtractor/multiReceiptsExtractor.ts @@ -34,7 +34,8 @@ async function extractReceiptsFromPage( pdfPage: pdfLibTypes.PDFPage, boundingBoxes: Polygon[], pageId: number) { - const extractedReceiptsRaw = await extractFromPage(pdfPage, boundingBoxes); + const manualUpscaleFactor = 300/72; + const extractedReceiptsRaw = await extractFromPage(pdfPage, boundingBoxes, false, manualUpscaleFactor); const extractedReceipts = []; for (let i = 0; i < extractedReceiptsRaw.length; i++) { extractedReceipts.push(new ExtractedMultiReceiptImage(extractedReceiptsRaw[i], pageId, i)); diff --git a/src/v2/fileOperations/crop.ts b/src/v2/fileOperations/crop.ts index 56c793b40..591013aeb 100644 --- a/src/v2/fileOperations/crop.ts +++ b/src/v2/fileOperations/crop.ts @@ -22,13 +22,13 @@ export async function extractSingleCrop(inputSource: LocalInputSource, crop: Cro * Extracts a list of crops from a document. * @param inputSource Local input source. * @param crops List of crops to extract. - * @param upscale Whether to upscale the extracted images. + * @param quality JPEG quality of extracted images. * @return a list of extracted files, as a CropFiles object. */ export async function extractCrops( inputSource: LocalInputSource, crops: CropItem[], - upscale: boolean = false + quality?: number , ): Promise { if (crops.length === 0) { throw new MindeeError("No crop indexes provided."); @@ -42,6 +42,6 @@ export async function extractCrops( } polygonsByPage.get(pageId)!.push(crop.location.polygon); } - const extractedCrops = await extractImagesFromPolygon(inputSource, polygonsByPage, upscale); + const extractedCrops = await extractImagesFromPolygon(inputSource, polygonsByPage, quality); return new CropFiles(...extractedCrops); } diff --git a/src/v2/fileOperations/split.ts b/src/v2/fileOperations/split.ts index d606f4304..cedb065db 100644 --- a/src/v2/fileOperations/split.ts +++ b/src/v2/fileOperations/split.ts @@ -3,6 +3,7 @@ import { MindeeError } from "@/errors/index.js"; import { PdfExtractor } from "@/pdf/pdfExtractor.js"; import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; import { logger } from "@/logger.js"; +import { ExtractedPdf } from "@/pdf/extractedPdf.js"; /** * Extracts a single specified split from a @@ -20,7 +21,7 @@ export async function extractSingleSplit(inputSource: LocalInputSource, split: n * @return a list of extracted files. * @throws MindeeError if no indexes are provided. */ -export async function extractSplits(inputSource: LocalInputSource, splits: number[][]) { +export async function extractSplits(inputSource: LocalInputSource, splits: number[][]): Promise { const pageGroups = splits.filter(e => e.length > 0); if (pageGroups.length === 0) { throw new MindeeError("No valid split indexes provided."); @@ -29,6 +30,13 @@ export async function extractSplits(inputSource: LocalInputSource, splits: numbe const pdfExtractor = new PdfExtractor(inputSource); await pdfExtractor.init(); + if (splits.length === 0) { + return new SplitFiles(); + } + const pageCount = await pdfExtractor.getPageCount(); + if (splits.length === 1 && splits[0].at(-1) === pageCount-1) { + return new SplitFiles(new ExtractedPdf(inputSource.fileObject as Buffer, inputSource.filename, pageCount)); + } const subDocuments = await pdfExtractor.extractSubDocuments(pageGroups); return new SplitFiles(...subDocuments); } diff --git a/src/v2/product/crop/cropItem.ts b/src/v2/product/crop/cropItem.ts index c49c38723..6f5a56fd9 100644 --- a/src/v2/product/crop/cropItem.ts +++ b/src/v2/product/crop/cropItem.ts @@ -1,5 +1,8 @@ import { FieldLocation } from "@/v2/parsing/inference/field/index.js"; import { StringDict } from "@/parsing/index.js"; +import { LocalInputSource } from "@/input/index.js"; +import { extractCrops } from "@/v2/fileOperations/crop.js"; +import { ExtractedImage } from "@/image/index.js"; export class CropItem { objectType: string; @@ -13,4 +16,13 @@ export class CropItem { toString(): string { return `* :Location: ${this.location}\n :Object Type: ${this.objectType}`; } + + /** + * Extracts a single crop from an input. + * @param inputSource The input file to extract from. + * @param quality Optional quality parameter for image extraction, default is undefined (full quality). + */ + async extractFromFile(inputSource: LocalInputSource, quality: number = 1): Promise{ + return (await extractCrops(inputSource, [this], quality))[0]; + } } diff --git a/src/v2/product/crop/cropResponse.ts b/src/v2/product/crop/cropResponse.ts index c50b2518a..0f63e316a 100644 --- a/src/v2/product/crop/cropResponse.ts +++ b/src/v2/product/crop/cropResponse.ts @@ -1,6 +1,9 @@ +import { LocalInputSource } from "@/input/index.js"; import { StringDict } from "@/parsing/stringDict.js"; -import { CropInference } from "./cropInference.js"; +import { extractCrops } from "@/v2/fileOperations/crop.js"; +import { CropFiles } from "@/v2/fileOperations/cropFiles.js"; import { BaseResponse } from "@/v2/parsing/index.js"; +import { CropInference } from "./cropInference.js"; export class CropResponse extends BaseResponse { /** @@ -15,4 +18,13 @@ export class CropResponse extends BaseResponse { super(serverResponse); this.inference = new CropInference(serverResponse["inference"]); } + + /** + * Extracts all crops from an input. + * @param inputSource The input file to extract from. + * @param quality Optional quality parameter for image extraction, default is undefined (full quality). + */ + async extractFromFile(inputSource: LocalInputSource, quality: number = 1): Promise { + return await extractCrops(inputSource, this.inference.result.crops, quality); + } } diff --git a/src/v2/product/split/splitRange.ts b/src/v2/product/split/splitRange.ts index 843e27a90..059bfbc7f 100644 --- a/src/v2/product/split/splitRange.ts +++ b/src/v2/product/split/splitRange.ts @@ -1,6 +1,6 @@ import { StringDict } from "@/parsing/index.js"; import { LocalInputSource } from "@/input/index.js"; -import { extractSplits } from "@/v2/fileOperations/split.js"; +import { expandRange, extractSplits } from "@/v2/fileOperations/split.js"; /** * Split inference result. @@ -32,6 +32,7 @@ export class SplitRange { * @param inputSource The input file to extract from. */ async extractFromFile(inputSource: LocalInputSource) { - return (await extractSplits(inputSource, [this.pageRange]))[0]; + const pageRange = [expandRange(this.pageRange as [number, number])]; + return (await extractSplits(inputSource, pageRange))[0]; } } diff --git a/src/v2/product/split/splitResponse.ts b/src/v2/product/split/splitResponse.ts index 7c1b6ba0d..123ab2866 100644 --- a/src/v2/product/split/splitResponse.ts +++ b/src/v2/product/split/splitResponse.ts @@ -3,6 +3,7 @@ import { SplitInference } from "./splitInference.js"; import { BaseResponse } from "@/v2/parsing/index.js"; import { LocalInputSource } from "@/input/index.js"; import { expandRange, extractSplits } from "@/v2/fileOperations/split.js"; +import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; export class SplitResponse extends BaseResponse { /** @@ -22,7 +23,7 @@ export class SplitResponse extends BaseResponse { * Extracts all splits from an input PDF. * @param inputSource The input file to extract from. */ - async extractFromFile(inputSource: LocalInputSource){ + async extractFromFile(inputSource: LocalInputSource): Promise{ const splits: number[][] = []; for (const split of this.inference.result.splits) { splits.push(expandRange(split.pageRange as [number, number])); diff --git a/tests/v2/fileOperations/crop.spec.ts b/tests/v2/fileOperations/crop.spec.ts index 7ea50c2bc..7bed5f1ba 100644 --- a/tests/v2/fileOperations/crop.spec.ts +++ b/tests/v2/fileOperations/crop.spec.ts @@ -1,15 +1,16 @@ -import path from "path"; -import assert from "node:assert/strict"; -import { describe, it } from "node:test"; +import { loadOptionalDependency } from "@/dependency/index.js"; +import { ExtractedImage } from "@/image/index.js"; +import { PathInput } from "@/index.js"; +import { extractCrops } from "@/v2/fileOperations/crop.js"; import { LocalResponse } from "@/v2/parsing/index.js"; import { CropResponse } from "@/v2/product/crop/cropResponse.js"; -import { PathInput } from "@/index.js"; -import { extractCrops } from "@/v2/fileOperations/crop.js"; -import { V2_PRODUCT_PATH } from "../../index"; -import { loadOptionalDependency } from "../../../src/dependency"; -import type * as SharpTypes from "sharp"; import type * as pdfLibTypes from "@cantoo/pdf-lib"; +import assert from "node:assert/strict"; +import { describe, it } from "node:test"; +import path from "path"; +import type * as SharpTypes from "sharp"; +import { V2_PRODUCT_PATH } from "../../index.js"; const cropPath = path.join(V2_PRODUCT_PATH, "crop"); let pdfLib: typeof pdfLibTypes | null = null; @@ -48,7 +49,7 @@ async function getFileDimensions(buffer: Buffer, sharpInstance: any) { describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { const sharpLoaded = await loadOptionalDependency("sharp", "Image compression"); const sharp = (sharpLoaded as any).default || sharpLoaded; - await it("should process single page crop split correctly", async () => { + await it("should process single page crop correctly", async () => { const inputSample = new PathInput({ inputPath: path.join(cropPath, "default_sample.jpg") @@ -58,10 +59,7 @@ describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { path.join(cropPath, "crop_single.json") ); - const extractedCrops = await extractCrops( - inputSample, - response.inference.result.crops - ); + const extractedCrops = await response.extractFromFile(inputSample); assert.strictEqual(extractedCrops.length, 1); @@ -69,9 +67,33 @@ describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { const dimensions = await getFileDimensions(extractedCrops[0].buffer, sharp); assert.strictEqual(Math.round(dimensions.width), 5880); assert.strictEqual(Math.round(dimensions.height), 3275); + const localExtract: ExtractedImage = await response.inference.result.crops[0].extractFromFile(inputSample); + assert.ok(localExtract.buffer.equals(extractedCrops[0].buffer)); + }); + + await it("should extract and still work with lower quality", async () => { + const inputSample = new PathInput({ + inputPath: + path.join(cropPath, "default_sample.jpg") + }); + await inputSample.init(); + const response = await loadV2Crop( + path.join(cropPath, "crop_single.json") + ); + + const extractedCrops = await response.extractFromFile(inputSample, 0.4); + + assert.strictEqual(extractedCrops.length, 1); + + assert.strictEqual(extractedCrops[0].pageId, 0); + const dimensions = await getFileDimensions(extractedCrops[0].buffer, sharp); + assert.strictEqual(Math.round(dimensions.width), 5880 * 0.4); + assert.strictEqual(Math.round(dimensions.height), 3275 * 0.4); + const localExtract: ExtractedImage = await response.inference.result.crops[0].extractFromFile(inputSample, 0.4); + assert.ok(localExtract.buffer.equals(extractedCrops[0].buffer)); }); - await it("should process multi page receipt split correctly", async () => { + await it("should process multi page receipt crops correctly", async () => { const inputSample = new PathInput({ inputPath: path.join(cropPath, "multipage_sample.pdf") diff --git a/tests/v2/fileOperations/split.integration.ts b/tests/v2/fileOperations/split.integration.ts index b4ac33c57..9827e0bed 100644 --- a/tests/v2/fileOperations/split.integration.ts +++ b/tests/v2/fileOperations/split.integration.ts @@ -4,13 +4,33 @@ import path from "node:path"; import * as fs from "node:fs"; import { Client, PathInput, BufferInput } from "@/index.js"; -import { Crop } from "@/v2/product/crop/index.js"; +import { Split } from "@/v2/product/split/index.js"; import { Extraction, ExtractionResponse } from "@/v2/product/extraction/index.js"; -import { extractCrops } from "@/v2/fileOperations/crop.js"; import { V2_PRODUCT_PATH } from "../../index.js"; import { SimpleField } from "@/v2/parsing/inference/field/index.js"; +import { loadOptionalDependency } from "@/dependency/index.js"; +import type * as pdfLibTypes from "@cantoo/pdf-lib"; const OUTPUT_DIR = path.join(__dirname, "output"); +let pdfLib: typeof pdfLibTypes | null = null; + +async function getPdfLib(): Promise { + if (!pdfLib) { + const pdfLibImport = await loadOptionalDependency("@cantoo/pdf-lib", "PDF Parsing"); + pdfLib = (pdfLibImport as any).default || pdfLibImport; + } + return pdfLib!; +} + +async function getPageCount(buffer: Buffer): Promise { + const isPdf = buffer.subarray(0, 4).toString("ascii") === "%PDF"; + if (isPdf) { + const lib = await getPdfLib(); + const pdfDoc = await lib.PDFDocument.load(buffer); + return pdfDoc.getPageCount(); + } + return 1; +} function checkFindocReturn(findocResponse: ExtractionResponse) { assert.ok(findocResponse.inference.model.id.length > 0); @@ -19,15 +39,15 @@ function checkFindocReturn(findocResponse: ExtractionResponse) { assert.ok((totalAmount.value as number) > 0); } -describe("MindeeV2 - Integration - FileOperation - Crop", { timeout: 120000 }, () => { +describe("MindeeV2 - Integration - Product - Split", { timeout: 120000 }, () => { let client: Client; - let cropModelId: string; + let splitModelId: string; let findocModelId: string; - const cropSample = path.join( + const splitSample = path.join( V2_PRODUCT_PATH, - "crop", - "default_sample.jpg" + "split", + "default_sample.pdf" ); before(() => { @@ -38,41 +58,42 @@ describe("MindeeV2 - Integration - FileOperation - Crop", { timeout: 120000 }, ( beforeEach(() => { const apiKey = process.env["MINDEE_V2_API_KEY"] ?? ""; - cropModelId = process.env["MINDEE_V2_SE_TESTS_CROP_MODEL_ID"] ?? ""; + splitModelId = process.env["MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID"] ?? ""; findocModelId = process.env["MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID"] ?? ""; client = new Client({ apiKey: apiKey, debug: true }); }); after(() => { - const file1 = path.join(OUTPUT_DIR, "crop_001.jpg"); - const file2 = path.join(OUTPUT_DIR, "crop_002.jpg"); + const file1 = path.join(OUTPUT_DIR, "split_001.pdf"); + const file2 = path.join(OUTPUT_DIR, "split_002.pdf"); if (fs.existsSync(file1)) fs.rmSync(file1); if (fs.existsSync(file2)) fs.rmSync(file2); }); - it("extracts crops from image correctly", async () => { - const cropInput = new PathInput({ inputPath: cropSample }); - await cropInput.init(); + it("extracts splits from pdf correctly", async () => { + const splitInput = new PathInput({ inputPath: splitSample }); + await splitInput.init(); - const cropParams = { modelId: cropModelId }; + const splitParams = { modelId: splitModelId }; - const response = await client.enqueueAndGetResult( - Crop, cropInput, cropParams + const response: any = await client.enqueueAndGetResult( + Split, splitInput, splitParams ); - assert.equal(response.inference.result.crops.length, 2); + assert.equal(response.inference.file.pageCount, 2); - const extractedImages = await extractCrops(cropInput, response.inference.result.crops); + const extractedPdfs = await response.extractFromFile(splitInput); - assert.equal(extractedImages.length, 2); - assert.equal(extractedImages[0].filename, "default_sample.jpg_page0-0.jpg"); - assert.equal(extractedImages[1].filename, "default_sample.jpg_page0-1.jpg"); + assert.equal(extractedPdfs.length, 2); + assert.equal(extractedPdfs[0].filename, "default_sample_page_001-001.pdf"); + assert.equal(extractedPdfs[1].filename, "default_sample_page_002-002.pdf"); const extractionInput = new BufferInput({ - buffer: extractedImages[0].buffer, - filename: extractedImages[0].filename + buffer: extractedPdfs[0].buffer, + filename: extractedPdfs[0].filename }); + const findocParams = { modelId: findocModelId }; const invoice0 = await client.enqueueAndGetResult( @@ -81,16 +102,18 @@ describe("MindeeV2 - Integration - FileOperation - Crop", { timeout: 120000 }, ( checkFindocReturn(invoice0 as ExtractionResponse); - const file1Path = path.join(OUTPUT_DIR, "crop_001.jpg"); - const file2Path = path.join(OUTPUT_DIR, "crop_002.jpg"); + const file1Path = path.join(OUTPUT_DIR, "split_001.pdf"); + const file2Path = path.join(OUTPUT_DIR, "split_002.pdf"); - fs.writeFileSync(file1Path, extractedImages[0].buffer); - fs.writeFileSync(file2Path, extractedImages[1].buffer); + fs.writeFileSync(file1Path, extractedPdfs[0].buffer); + fs.writeFileSync(file2Path, extractedPdfs[1].buffer); - const stat1 = fs.statSync(file1Path); - assert.ok(stat1.size >= 3170000 && stat1.size <= 3180000); + const localBuffer1 = fs.readFileSync(file1Path); + const pageCount1 = await getPageCount(localBuffer1); + assert.equal(pageCount1, extractedPdfs[0].pageCount); - const stat2 = fs.statSync(file2Path); - assert.ok(stat2.size >= 3210000 && stat2.size <= 3230000); + const localBuffer2 = fs.readFileSync(file2Path); + const pageCount2 = await getPageCount(localBuffer2); + assert.equal(pageCount2, extractedPdfs[1].pageCount); }); }); diff --git a/tests/v2/fileOperations/split.spec.ts b/tests/v2/fileOperations/split.spec.ts index bc4112c61..67853bfad 100644 --- a/tests/v2/fileOperations/split.spec.ts +++ b/tests/v2/fileOperations/split.spec.ts @@ -5,9 +5,12 @@ import { describe, it } from "node:test"; import { LocalResponse } from "@/v2/parsing/index.js"; import { SplitResponse } from "@/v2/product/split/splitResponse.js"; import { PathInput } from "@/index.js"; -import { V2_PRODUCT_PATH } from "../../index"; -import { loadOptionalDependency } from "../../../src/dependency"; +import { V2_PRODUCT_PATH } from "../../index.js"; +import { loadOptionalDependency } from "@/dependency/index.js"; import type * as pdfLibTypes from "@cantoo/pdf-lib"; +import { extractSplits } from "@/v2/fileOperations/split.js"; +import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; +import { ExtractedPdf } from "@/pdf/extractedPdf.js"; const splitPath = path.join(V2_PRODUCT_PATH, "split"); const financialDocumentPath = path.join(V2_PRODUCT_PATH, "extraction", "financial_document"); @@ -88,5 +91,18 @@ describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => assert.strictEqual(extractedSplits[2].pageCount, 1); const count2 = await getPageCount(extractedSplits[2].buffer); assert.strictEqual(count2, 1); + const localExtract: ExtractedPdf = await response.inference.result.splits[0].extractFromFile(inputSample); + assert.ok(extractedSplits[0].buffer.equals(localExtract.buffer)); + }); + + await it("extracts a file as itself if the split count is its own length", async () => { + const inputSample = new PathInput({ + inputPath: path.join(splitPath, "invoice_5p.pdf") + }); + await inputSample.init(); + const splitFiles: SplitFiles = await extractSplits(inputSample, [[0, 1, 2, 3, 4]]); + assert(splitFiles.length === 1); + assert(splitFiles[0].pageCount === 5); + assert(splitFiles[0].buffer === inputSample.fileObject); }); }); From f7fcb4572983bf4b780c1ecea82804f425f22706 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:24:48 +0200 Subject: [PATCH 4/9] fix import thing --- src/image/imageExtractor.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/image/imageExtractor.ts b/src/image/imageExtractor.ts index 8dc8e63a5..27c0e750a 100644 --- a/src/image/imageExtractor.ts +++ b/src/image/imageExtractor.ts @@ -1,5 +1,3 @@ -// eslint-disable-next-line @typescript-eslint/ban-ts-comment -// @ts-ignore import { loadOptionalDependency } from "@/dependency/index.js"; import { MindeeImageError } from "@/errors/index.js"; import { getMinMaxX, getMinMaxY, Polygon } from "@/geometry/index.js"; @@ -9,6 +7,8 @@ import { LocalInputSource } from "@/input/index.js"; import { logger } from "@/logger.js"; import { createPdfFromInputSource } from "@/pdf/pdfOperation.js"; import { rasterizePage } from "@/pdf/pdfUtils.js"; +// eslint-disable-next-line @typescript-eslint/ban-ts-comment +// @ts-ignore import type * as pdfLibTypes from "@cantoo/pdf-lib"; let pdfLib: typeof pdfLibTypes | null = null; From eda4f2533b7d33d67d49ec7025d675e245c0adfe Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:29:20 +0200 Subject: [PATCH 5/9] fix test --- tests/v2/fileOperations/crop.integration.ts | 8 +------- tests/v2/fileOperations/split.integration.ts | 10 ++-------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/tests/v2/fileOperations/crop.integration.ts b/tests/v2/fileOperations/crop.integration.ts index 181eb1f46..3efecb67e 100644 --- a/tests/v2/fileOperations/crop.integration.ts +++ b/tests/v2/fileOperations/crop.integration.ts @@ -1,4 +1,4 @@ -import { after, before, beforeEach, describe, it } from "node:test"; +import { after, beforeEach, describe, it } from "node:test"; import assert from "node:assert/strict"; import path from "node:path"; import * as fs from "node:fs"; @@ -29,12 +29,6 @@ describe("MindeeV2 - Integration - FileOperation - Crop #OptionalDepsRequired", "default_sample.jpg" ); - before(() => { - if (!fs.existsSync(OUTPUT_PATH)) { - fs.mkdirSync(OUTPUT_PATH, { recursive: true }); - } - }); - beforeEach(() => { const apiKey = process.env["MINDEE_V2_API_KEY"] ?? ""; cropModelId = process.env["MINDEE_V2_SE_TESTS_CROP_MODEL_ID"] ?? ""; diff --git a/tests/v2/fileOperations/split.integration.ts b/tests/v2/fileOperations/split.integration.ts index 9827e0bed..d7de230b6 100644 --- a/tests/v2/fileOperations/split.integration.ts +++ b/tests/v2/fileOperations/split.integration.ts @@ -1,4 +1,4 @@ -import { after, before, beforeEach, describe, it } from "node:test"; +import { after, beforeEach, describe, it } from "node:test"; import assert from "node:assert/strict"; import path from "node:path"; import * as fs from "node:fs"; @@ -39,7 +39,7 @@ function checkFindocReturn(findocResponse: ExtractionResponse) { assert.ok((totalAmount.value as number) > 0); } -describe("MindeeV2 - Integration - Product - Split", { timeout: 120000 }, () => { +describe("MindeeV2 - Integration - Product - Split #OptionalDepsRequired", { timeout: 120000 }, () => { let client: Client; let splitModelId: string; let findocModelId: string; @@ -50,12 +50,6 @@ describe("MindeeV2 - Integration - Product - Split", { timeout: 120000 }, () => "default_sample.pdf" ); - before(() => { - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } - }); - beforeEach(() => { const apiKey = process.env["MINDEE_V2_API_KEY"] ?? ""; splitModelId = process.env["MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID"] ?? ""; From f52eafd88efb80fbea09fdcc1c87b77156fceddf Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Tue, 14 Apr 2026 10:10:17 +0200 Subject: [PATCH 6/9] fix inits & asSource() --- src/pdf/extractedPdf.ts | 2 +- tests/v2/fileOperations/crop.integration.ts | 1 - tests/v2/fileOperations/crop.spec.ts | 4 ---- tests/v2/fileOperations/split.integration.ts | 11 ++++------- tests/v2/fileOperations/split.spec.ts | 4 ---- 5 files changed, 5 insertions(+), 17 deletions(-) diff --git a/src/pdf/extractedPdf.ts b/src/pdf/extractedPdf.ts index 0369e9cc0..58efe467a 100644 --- a/src/pdf/extractedPdf.ts +++ b/src/pdf/extractedPdf.ts @@ -8,7 +8,7 @@ import { writeFileSync } from "node:fs"; export class ExtractedPdf { public readonly buffer: Buffer; - private readonly filename: string; + public readonly filename: string; public readonly pageCount: number; constructor(pdfData: Buffer, filename: string, pageCount: number) { diff --git a/tests/v2/fileOperations/crop.integration.ts b/tests/v2/fileOperations/crop.integration.ts index 3efecb67e..b00f88af6 100644 --- a/tests/v2/fileOperations/crop.integration.ts +++ b/tests/v2/fileOperations/crop.integration.ts @@ -46,7 +46,6 @@ describe("MindeeV2 - Integration - FileOperation - Crop #OptionalDepsRequired", it("extracts crops from image correctly", async () => { const cropInput = new PathInput({ inputPath: cropSample }); - await cropInput.init(); const cropParams = { modelId: cropModelId }; diff --git a/tests/v2/fileOperations/crop.spec.ts b/tests/v2/fileOperations/crop.spec.ts index 7bed5f1ba..18c7218b5 100644 --- a/tests/v2/fileOperations/crop.spec.ts +++ b/tests/v2/fileOperations/crop.spec.ts @@ -25,7 +25,6 @@ async function getPdfLib(): Promise { async function loadV2Crop(resourcePath: string): Promise { const localResponse = new LocalResponse(resourcePath); - await localResponse.init(); return localResponse.deserializeResponse(CropResponse); } /** @@ -54,7 +53,6 @@ describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { inputPath: path.join(cropPath, "default_sample.jpg") }); - await inputSample.init(); const response = await loadV2Crop( path.join(cropPath, "crop_single.json") ); @@ -76,7 +74,6 @@ describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { inputPath: path.join(cropPath, "default_sample.jpg") }); - await inputSample.init(); const response = await loadV2Crop( path.join(cropPath, "crop_single.json") ); @@ -98,7 +95,6 @@ describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { inputPath: path.join(cropPath, "multipage_sample.pdf") }); - await inputSample.init(); const response = await loadV2Crop( path.join(cropPath, "crop_multiple.json") ); diff --git a/tests/v2/fileOperations/split.integration.ts b/tests/v2/fileOperations/split.integration.ts index d7de230b6..546ffc60b 100644 --- a/tests/v2/fileOperations/split.integration.ts +++ b/tests/v2/fileOperations/split.integration.ts @@ -3,9 +3,10 @@ import assert from "node:assert/strict"; import path from "node:path"; import * as fs from "node:fs"; -import { Client, PathInput, BufferInput } from "@/index.js"; +import { Client, PathInput } from "@/index.js"; import { Split } from "@/v2/product/split/index.js"; import { Extraction, ExtractionResponse } from "@/v2/product/extraction/index.js"; +import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; import { V2_PRODUCT_PATH } from "../../index.js"; import { SimpleField } from "@/v2/parsing/inference/field/index.js"; import { loadOptionalDependency } from "@/dependency/index.js"; @@ -67,7 +68,6 @@ describe("MindeeV2 - Integration - Product - Split #OptionalDepsRequired", { tim it("extracts splits from pdf correctly", async () => { const splitInput = new PathInput({ inputPath: splitSample }); - await splitInput.init(); const splitParams = { modelId: splitModelId }; @@ -77,16 +77,13 @@ describe("MindeeV2 - Integration - Product - Split #OptionalDepsRequired", { tim assert.equal(response.inference.file.pageCount, 2); - const extractedPdfs = await response.extractFromFile(splitInput); + const extractedPdfs: SplitFiles = await response.extractFromFile(splitInput); assert.equal(extractedPdfs.length, 2); assert.equal(extractedPdfs[0].filename, "default_sample_page_001-001.pdf"); assert.equal(extractedPdfs[1].filename, "default_sample_page_002-002.pdf"); - const extractionInput = new BufferInput({ - buffer: extractedPdfs[0].buffer, - filename: extractedPdfs[0].filename - }); + const extractionInput = extractedPdfs[0].asSource(); const findocParams = { modelId: findocModelId }; diff --git a/tests/v2/fileOperations/split.spec.ts b/tests/v2/fileOperations/split.spec.ts index 67853bfad..40e6802ed 100644 --- a/tests/v2/fileOperations/split.spec.ts +++ b/tests/v2/fileOperations/split.spec.ts @@ -27,7 +27,6 @@ async function getPdfLib(): Promise { async function loadV2Split(resourcePath: string): Promise { const localResponse = new LocalResponse(resourcePath); - await localResponse.init(); return localResponse.deserializeResponse(SplitResponse); } @@ -50,7 +49,6 @@ describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => const inputSample = new PathInput({ inputPath: path.join(financialDocumentPath, "default_sample.jpg") }); - await inputSample.init(); const response = await loadV2Split( path.join(splitPath, "split_single.json") @@ -70,7 +68,6 @@ describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => const inputSample = new PathInput({ inputPath: path.join(splitPath, "invoice_5p.pdf") }); - await inputSample.init(); const response = await loadV2Split( path.join(splitPath, "split_multiple.json") @@ -99,7 +96,6 @@ describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => const inputSample = new PathInput({ inputPath: path.join(splitPath, "invoice_5p.pdf") }); - await inputSample.init(); const splitFiles: SplitFiles = await extractSplits(inputSample, [[0, 1, 2, 3, 4]]); assert(splitFiles.length === 1); assert(splitFiles[0].pageCount === 5); From 63300e576d1de31acf80ca00efdc2d2fbc8bba7e Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Tue, 14 Apr 2026 10:24:17 +0200 Subject: [PATCH 7/9] fix tests --- tests/v2/fileOperations/split.integration.ts | 35 +++---------- tests/v2/fileOperations/split.spec.ts | 52 ++++++-------------- 2 files changed, 24 insertions(+), 63 deletions(-) diff --git a/tests/v2/fileOperations/split.integration.ts b/tests/v2/fileOperations/split.integration.ts index 546ffc60b..372a4e9d4 100644 --- a/tests/v2/fileOperations/split.integration.ts +++ b/tests/v2/fileOperations/split.integration.ts @@ -7,31 +7,10 @@ import { Client, PathInput } from "@/index.js"; import { Split } from "@/v2/product/split/index.js"; import { Extraction, ExtractionResponse } from "@/v2/product/extraction/index.js"; import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; +import { BufferInput } from "../../../src/index.js"; import { V2_PRODUCT_PATH } from "../../index.js"; import { SimpleField } from "@/v2/parsing/inference/field/index.js"; -import { loadOptionalDependency } from "@/dependency/index.js"; -import type * as pdfLibTypes from "@cantoo/pdf-lib"; - const OUTPUT_DIR = path.join(__dirname, "output"); -let pdfLib: typeof pdfLibTypes | null = null; - -async function getPdfLib(): Promise { - if (!pdfLib) { - const pdfLibImport = await loadOptionalDependency("@cantoo/pdf-lib", "PDF Parsing"); - pdfLib = (pdfLibImport as any).default || pdfLibImport; - } - return pdfLib!; -} - -async function getPageCount(buffer: Buffer): Promise { - const isPdf = buffer.subarray(0, 4).toString("ascii") === "%PDF"; - if (isPdf) { - const lib = await getPdfLib(); - const pdfDoc = await lib.PDFDocument.load(buffer); - return pdfDoc.getPageCount(); - } - return 1; -} function checkFindocReturn(findocResponse: ExtractionResponse) { assert.ok(findocResponse.inference.model.id.length > 0); @@ -96,15 +75,17 @@ describe("MindeeV2 - Integration - Product - Split #OptionalDepsRequired", { tim const file1Path = path.join(OUTPUT_DIR, "split_001.pdf"); const file2Path = path.join(OUTPUT_DIR, "split_002.pdf"); - fs.writeFileSync(file1Path, extractedPdfs[0].buffer); - fs.writeFileSync(file2Path, extractedPdfs[1].buffer); + await extractedPdfs[0].saveToFileAsync(file1Path); + await extractedPdfs[1].saveToFileAsync(file2Path); const localBuffer1 = fs.readFileSync(file1Path); - const pageCount1 = await getPageCount(localBuffer1); + const inputSource1 = new BufferInput({ buffer: localBuffer1, filename: "tmp.pdf" }); + const pageCount1 = await inputSource1.getPageCount(); assert.equal(pageCount1, extractedPdfs[0].pageCount); - const localBuffer2 = fs.readFileSync(file2Path); - const pageCount2 = await getPageCount(localBuffer2); + const localBuffer2 = fs.readFileSync(file1Path); + const inputSource2 = new BufferInput({ buffer: localBuffer2, filename: "tmp.pdf" }); + const pageCount2 = await inputSource2.getPageCount(); assert.equal(pageCount2, extractedPdfs[1].pageCount); }); }); diff --git a/tests/v2/fileOperations/split.spec.ts b/tests/v2/fileOperations/split.spec.ts index 40e6802ed..e07cca8e8 100644 --- a/tests/v2/fileOperations/split.spec.ts +++ b/tests/v2/fileOperations/split.spec.ts @@ -1,48 +1,24 @@ -import path from "path"; -import assert from "node:assert/strict"; -import { describe, it } from "node:test"; +import { PathInput } from "@/index.js"; +import { ExtractedPdf } from "@/pdf/extractedPdf.js"; +import { extractSplits } from "@/v2/fileOperations/split.js"; +import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; import { LocalResponse } from "@/v2/parsing/index.js"; import { SplitResponse } from "@/v2/product/split/splitResponse.js"; -import { PathInput } from "@/index.js"; +import assert from "node:assert/strict"; +import { describe, it } from "node:test"; +import path from "path"; +import { BufferInput } from "../../../src/index.js"; import { V2_PRODUCT_PATH } from "../../index.js"; -import { loadOptionalDependency } from "@/dependency/index.js"; -import type * as pdfLibTypes from "@cantoo/pdf-lib"; -import { extractSplits } from "@/v2/fileOperations/split.js"; -import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; -import { ExtractedPdf } from "@/pdf/extractedPdf.js"; const splitPath = path.join(V2_PRODUCT_PATH, "split"); const financialDocumentPath = path.join(V2_PRODUCT_PATH, "extraction", "financial_document"); -let pdfLib: typeof pdfLibTypes | null = null; - -async function getPdfLib(): Promise { - if (!pdfLib) { - const pdfLibImport = await loadOptionalDependency("@cantoo/pdf-lib", "PDF Parsing"); - pdfLib = (pdfLibImport as any).default || pdfLibImport; - } - return pdfLib!; -} - async function loadV2Split(resourcePath: string): Promise { const localResponse = new LocalResponse(resourcePath); return localResponse.deserializeResponse(SplitResponse); } -/** - * Gets the page count of a buffer, routing to pdf-lib for PDFs. - */ -async function getPageCount(buffer: Buffer): Promise { - const isPdf = buffer.subarray(0, 4).toString("ascii") === "%PDF"; - if (isPdf) { - const pdfLib = await getPdfLib(); - const pdfDoc = await pdfLib.PDFDocument.load(buffer); - return pdfDoc.getPageCount(); - } - return 1; -} - describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => { await it("should process single page split correctly", async () => { @@ -60,7 +36,8 @@ describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => assert.strictEqual(extractedSplits[0].pageCount, 1); - const count0 = await getPageCount(extractedSplits[0].buffer); + const inputBuffer0 = new BufferInput({ buffer: extractedSplits[0].buffer, filename: "test.pdf" }); + const count0 = await inputBuffer0.getPageCount(); assert.strictEqual(count0, 1); }); @@ -78,15 +55,18 @@ describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => assert.strictEqual(extractedSplits.length, 3); assert.strictEqual(extractedSplits[0].pageCount, 1); - const count0 = await getPageCount(extractedSplits[0].buffer); + const bufferInput0 = extractedSplits[0].asSource(); + const count0 = await bufferInput0.getPageCount(); assert.strictEqual(count0, 1); + const bufferInput1 = extractedSplits[0].asSource(); + const count1 = await bufferInput1.getPageCount(); assert.strictEqual(extractedSplits[1].pageCount, 3); - const count1 = await getPageCount(extractedSplits[1].buffer); assert.strictEqual(count1, 3); assert.strictEqual(extractedSplits[2].pageCount, 1); - const count2 = await getPageCount(extractedSplits[2].buffer); + const bufferInput2 = extractedSplits[2].asSource(); + const count2 = await bufferInput2.getPageCount(); assert.strictEqual(count2, 1); const localExtract: ExtractedPdf = await response.inference.result.splits[0].extractFromFile(inputSample); assert.ok(extractedSplits[0].buffer.equals(localExtract.buffer)); From d4459b16ac6d8f582f7c9bf1edbb6c7e64096056 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Tue, 14 Apr 2026 10:28:11 +0200 Subject: [PATCH 8/9] fix tests again --- tests/v2/fileOperations/crop.integration.ts | 7 ++----- tests/v2/fileOperations/split.integration.ts | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/v2/fileOperations/crop.integration.ts b/tests/v2/fileOperations/crop.integration.ts index b00f88af6..fa5243980 100644 --- a/tests/v2/fileOperations/crop.integration.ts +++ b/tests/v2/fileOperations/crop.integration.ts @@ -3,7 +3,7 @@ import assert from "node:assert/strict"; import path from "node:path"; import * as fs from "node:fs"; -import { Client, PathInput, BufferInput } from "@/index.js"; +import { Client, PathInput } from "@/index.js"; import { Crop } from "@/v2/product/crop/index.js"; import { Extraction, ExtractionResponse } from "@/v2/product/extraction/index.js"; import { extractCrops } from "@/v2/fileOperations/crop.js"; @@ -61,10 +61,7 @@ describe("MindeeV2 - Integration - FileOperation - Crop #OptionalDepsRequired", assert.equal(extractedImages[0].filename, "default_sample.jpg_page0-0.jpg"); assert.equal(extractedImages[1].filename, "default_sample.jpg_page0-1.jpg"); - const extractionInput = new BufferInput({ - buffer: extractedImages[0].buffer, - filename: extractedImages[0].filename - }); + const extractionInput = extractedImages[0].asSource(); const findocParams = { modelId: findocModelId }; const invoice0 = await client.enqueueAndGetResult( diff --git a/tests/v2/fileOperations/split.integration.ts b/tests/v2/fileOperations/split.integration.ts index 372a4e9d4..2bc8293c6 100644 --- a/tests/v2/fileOperations/split.integration.ts +++ b/tests/v2/fileOperations/split.integration.ts @@ -7,7 +7,7 @@ import { Client, PathInput } from "@/index.js"; import { Split } from "@/v2/product/split/index.js"; import { Extraction, ExtractionResponse } from "@/v2/product/extraction/index.js"; import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; -import { BufferInput } from "../../../src/index.js"; +import { BufferInput } from "@/index.js"; import { V2_PRODUCT_PATH } from "../../index.js"; import { SimpleField } from "@/v2/parsing/inference/field/index.js"; const OUTPUT_DIR = path.join(__dirname, "output"); From 806e4cc95cf56a4326b951857bea0f0dd52c831a Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Tue, 14 Apr 2026 10:33:35 +0200 Subject: [PATCH 9/9] fix tests... again --- tests/v2/fileOperations/split.integration.ts | 8 +++----- tests/v2/fileOperations/split.spec.ts | 3 +-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/v2/fileOperations/split.integration.ts b/tests/v2/fileOperations/split.integration.ts index 2bc8293c6..10312d299 100644 --- a/tests/v2/fileOperations/split.integration.ts +++ b/tests/v2/fileOperations/split.integration.ts @@ -7,7 +7,6 @@ import { Client, PathInput } from "@/index.js"; import { Split } from "@/v2/product/split/index.js"; import { Extraction, ExtractionResponse } from "@/v2/product/extraction/index.js"; import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; -import { BufferInput } from "@/index.js"; import { V2_PRODUCT_PATH } from "../../index.js"; import { SimpleField } from "@/v2/parsing/inference/field/index.js"; const OUTPUT_DIR = path.join(__dirname, "output"); @@ -78,13 +77,12 @@ describe("MindeeV2 - Integration - Product - Split #OptionalDepsRequired", { tim await extractedPdfs[0].saveToFileAsync(file1Path); await extractedPdfs[1].saveToFileAsync(file2Path); - const localBuffer1 = fs.readFileSync(file1Path); - const inputSource1 = new BufferInput({ buffer: localBuffer1, filename: "tmp.pdf" }); + + const inputSource1 = new PathInput({ inputPath: file1Path }); const pageCount1 = await inputSource1.getPageCount(); assert.equal(pageCount1, extractedPdfs[0].pageCount); - const localBuffer2 = fs.readFileSync(file1Path); - const inputSource2 = new BufferInput({ buffer: localBuffer2, filename: "tmp.pdf" }); + const inputSource2 = new PathInput({ inputPath: file1Path }); const pageCount2 = await inputSource2.getPageCount(); assert.equal(pageCount2, extractedPdfs[1].pageCount); }); diff --git a/tests/v2/fileOperations/split.spec.ts b/tests/v2/fileOperations/split.spec.ts index e07cca8e8..a9170e8b6 100644 --- a/tests/v2/fileOperations/split.spec.ts +++ b/tests/v2/fileOperations/split.spec.ts @@ -8,7 +8,6 @@ import { SplitResponse } from "@/v2/product/split/splitResponse.js"; import assert from "node:assert/strict"; import { describe, it } from "node:test"; import path from "path"; -import { BufferInput } from "../../../src/index.js"; import { V2_PRODUCT_PATH } from "../../index.js"; const splitPath = path.join(V2_PRODUCT_PATH, "split"); @@ -36,7 +35,7 @@ describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => assert.strictEqual(extractedSplits[0].pageCount, 1); - const inputBuffer0 = new BufferInput({ buffer: extractedSplits[0].buffer, filename: "test.pdf" }); + const inputBuffer0 = extractedSplits[0].asSource(); const count0 = await inputBuffer0.getPageCount(); assert.strictEqual(count0, 1); });