diff --git a/src/image/extractedImage.ts b/src/image/extractedImage.ts index 583c0640..f56c7c08 100644 --- a/src/image/extractedImage.ts +++ b/src/image/extractedImage.ts @@ -15,11 +15,15 @@ import { loadOptionalDependency } from "@/dependency/index.js"; */ export class ExtractedImage { public buffer: Buffer; - protected internalFileName: string; + public filename: string; + public pageId?: number; + public elementId?: number; - protected constructor(buffer: Uint8Array, fileName: string) { + constructor(buffer: Uint8Array, fileName: string, pageId?: number, elementId?: number) { this.buffer = Buffer.from(buffer); - this.internalFileName = fileName; + this.filename = fileName; + this.pageId = pageId; + this.elementId = elementId; } /** @@ -104,7 +108,7 @@ export class ExtractedImage { asSource(): BufferInput { return new BufferInput({ buffer: this.buffer, - filename: this.internalFileName, + filename: this.filename, }); } } diff --git a/src/image/imageExtractor.ts b/src/image/imageExtractor.ts index f8ba56fd..27c0e750 100644 --- a/src/image/imageExtractor.ts +++ b/src/image/imageExtractor.ts @@ -1,59 +1,107 @@ +import { loadOptionalDependency } from "@/dependency/index.js"; +import { MindeeImageError } from "@/errors/index.js"; +import { getMinMaxX, getMinMaxY, Polygon } from "@/geometry/index.js"; +import { adjustForRotation } from "@/geometry/polygonUtils.js"; +import { ExtractedImage } from "@/image/extractedImage.js"; +import { LocalInputSource } from "@/input/index.js"; +import { logger } from "@/logger.js"; +import { createPdfFromInputSource } from "@/pdf/pdfOperation.js"; +import { rasterizePage } from "@/pdf/pdfUtils.js"; // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-ignore import type * as pdfLibTypes from "@cantoo/pdf-lib"; -import { getMinMaxX, getMinMaxY, Polygon } from "@/geometry/index.js"; -import { adjustForRotation } from "@/geometry/polygonUtils.js"; -import { loadOptionalDependency } from "@/dependency/index.js"; let pdfLib: 
typeof pdfLibTypes | null = null; async function getPdfLib(): Promise { if (!pdfLib) { const pdfLibImport = await loadOptionalDependency( - "@cantoo/pdf-lib", "Text Embedding" + "@cantoo/pdf-lib", "Image Extraction" ); pdfLib = (pdfLibImport as any).default || pdfLibImport; } return pdfLib!; } + +/** + * Extracts elements from a PDF document based on a list of bounding boxes. + * @param inputSource The input source to extract from. + * @param polygonsPerPage List of polygons to extract from per page. + * @param quality JPEG quality of extracted images. + */ +export async function extractImagesFromPolygon( + inputSource: LocalInputSource, + polygonsPerPage: Map, + quality?: number +) { + const allExtractedImages: ExtractedImage[] = []; + const pdfDoc = await createPdfFromInputSource(inputSource); + + for (const [pageId, polygons] of polygonsPerPage) { + logger.debug(`Extracting images from page ${pageId}`); + const pdfPage = pdfDoc.getPage(pageId); + const extractions = (await extractFromPage(pdfPage, polygons, true, quality)); + const extractedImages = extractions.map( + (v, i) => new ExtractedImage(v, inputSource.filename + `_page${pageId}-${i}.jpg`, pageId, i) + ); + allExtractedImages.push(...extractedImages); + } + return allExtractedImages; +} + /** * Extracts elements from a page based off of a list of bounding boxes. * * @param pdfPage PDF Page to extract from. * @param polygons List of coordinates to pull the elements from. + * @param asImage Whether to return the extracted elements as images. + * @param quality JPEG quality of extracted images, given as number between 0 and 1. */ export async function extractFromPage( pdfPage: pdfLibTypes.PDFPage, - polygons: Polygon[] + polygons: Polygon[], + asImage: boolean = false, + quality?: number, ) { const pdfLib = await getPdfLib(); const { width, height } = pdfPage.getSize(); - const extractedElements :Uint8Array[] = []; - // Manual upscale. - // Fixes issues with the OCR. 
- const qualityScale = 300/72; + const extractedElements: Uint8Array[] = []; + if (quality && (quality < 0)) { + throw new MindeeImageError("Quality must be a number between 0 and 1"); + } + if (quality && quality > 1) { + logger.warn("Quality is greater than 1, this operation will apply a manual upscale on the output." + + " Use only if you know what you are doing."); + } + const qualityScale = quality ?? 1; const orientation = pdfPage.getRotation().angle; + const sourceDoc = pdfPage.doc; + const pageIndex = sourceDoc.getPages().indexOf(pdfPage); + for (const origPolygon of polygons) { - const polygon = adjustForRotation(origPolygon, orientation); + logger.debug(`Extracting image with polygon: ${origPolygon.toString()}`); const tempPdf = await pdfLib.PDFDocument.create(); + const [copiedPage] = await tempPdf.copyPages(sourceDoc, [pageIndex]); + + const polygon = adjustForRotation(origPolygon, orientation); + const newWidth = width * (getMinMaxX(polygon).max - getMinMaxX(polygon).min); const newHeight = height * (getMinMaxY(polygon).max - getMinMaxY(polygon).min); - const cropped = await tempPdf.embedPage(pdfPage, { + + const cropped = await tempPdf.embedPage(copiedPage, { left: getMinMaxX(polygon).min * width, right: getMinMaxX(polygon).max * width, top: height - (getMinMaxY(polygon).min * height), bottom: height - (getMinMaxY(polygon).max * height), }); - // Determine the final page dimensions based on orientation let finalWidth: number; let finalHeight: number; if (orientation === 90 || orientation === 270) { - // For 90/270 rotations, swap width and height finalWidth = newHeight * qualityScale; finalHeight = newWidth * qualityScale; } else { @@ -62,15 +110,14 @@ export async function extractFromPage( } const samplePage = tempPdf.addPage([finalWidth, finalHeight]); - samplePage.drawRectangle({ x: 0, y: 0, width: finalWidth, height: finalHeight, + color: pdfLib.rgb(1, 1, 1), }); - // Draw the cropped page with rotation applied if (orientation === 0) { 
samplePage.drawPage(cropped, { width: newWidth * qualityScale, @@ -102,7 +149,13 @@ export async function extractFromPage( }); } - extractedElements.push(await tempPdf.save()); + const pdfBuffer = Buffer.from(await tempPdf.save()); + if (asImage) { + extractedElements.push(await rasterizePage(pdfBuffer, 0, 100)); + } else { + extractedElements.push(pdfBuffer); + } } + return extractedElements; } diff --git a/src/pdf/extractedPdf.ts b/src/pdf/extractedPdf.ts new file mode 100644 index 00000000..58efe467 --- /dev/null +++ b/src/pdf/extractedPdf.ts @@ -0,0 +1,71 @@ +import path from "node:path"; +import { BufferInput, MIMETYPES } from "@/input/index.js"; +import { MindeeError } from "@/errors/index.js"; +import { Buffer } from "node:buffer"; +import { writeFile } from "fs/promises"; +import { logger } from "@/logger.js"; +import { writeFileSync } from "node:fs"; + +export class ExtractedPdf { + public readonly buffer: Buffer; + public readonly filename: string; + public readonly pageCount: number; + + constructor(pdfData: Buffer, filename: string, pageCount: number) { + this.buffer = pdfData; + this.filename = filename; + this.pageCount = pageCount; + } + + /** + * Saves the document to a file. + * + * @param outputPath Path to save the file to. + */ + async saveToFileAsync(outputPath: string) { + const fileExt = path.extname(outputPath).toLowerCase(); + if (fileExt !== ".pdf" && !MIMETYPES.has(fileExt)) { + outputPath += ".pdf"; + } + + try { + await writeFile(path.resolve(outputPath), this.buffer); + logger.info(`File saved successfully to ${path.resolve(outputPath)}.`); + } catch (e) { + if (e instanceof TypeError) { + throw new MindeeError("Invalid path/filename provided."); + } else { + throw e; + } + } + } + + /** + * Saves the document to a file synchronously. 
+ * @param outputPath + */ + saveToFile(outputPath: string){ + try { + writeFileSync(path.resolve(outputPath), this.buffer); + logger.info(`File saved successfully to ${path.resolve(outputPath)}.`); + } catch (e) { + if (e instanceof TypeError) { + throw new MindeeError("Invalid path/filename provided."); + } else { + throw e; + } + } + } + + /** + * Return the file as a Mindee-compatible BufferInput source. + * + * @returns A BufferInput source. + */ + asSource(): BufferInput { + return new BufferInput({ + buffer: this.buffer, + filename: this.filename, + }); + } +} diff --git a/src/pdf/pdfCompressor.ts b/src/pdf/pdfCompressor.ts index aa4f3183..9442f8c4 100644 --- a/src/pdf/pdfCompressor.ts +++ b/src/pdf/pdfCompressor.ts @@ -1,15 +1,10 @@ import { logger } from "@/logger.js"; -import tmp from "tmp"; -import * as fs from "node:fs"; -// eslint-disable-next-line @typescript-eslint/ban-ts-comment -// @ts-ignore -import type * as popplerTypes from "node-poppler"; // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-ignore import type * as pdfLibTypes from "@cantoo/pdf-lib"; import { compressImage } from "@/image/index.js"; import { loadOptionalDependency } from "@/dependency/index.js"; -import { ExtractedPdfInfo, extractTextFromPdf, hasSourceText } from "./pdfUtils.js"; +import { ExtractedPdfInfo, extractTextFromPdf, hasSourceText, rasterizePage } from "./pdfUtils.js"; let pdfLib: typeof pdfLibTypes | null = null; @@ -159,7 +154,7 @@ async function compressPagesWithQuality( const page = pdfDoc.getPages()[i]; const rasterizedPage = await rasterizePage(pdfData, i + 1, imageQuality); const compressedImage = await compressImage( - Buffer.from(rasterizedPage, "binary"), imageQuality + rasterizedPage, imageQuality ); if (!disableSourceText) { await addTextToPdfPage(page, extractedText); @@ -260,48 +255,6 @@ async function getFontFromName(fontName: string): Promise { return font; } -/** - * Rasterizes a PDF page. 
- * - * @param pdfData Buffer representation of the entire PDF file. - * @param index Index of the page to rasterize. - * @param quality Quality to apply during rasterization. - */ -async function rasterizePage( - pdfData: Buffer, index: number, quality = 85 -): Promise { - const popplerImport = await loadOptionalDependency( - "node-poppler", "Image Processing" - ); - const poppler = (popplerImport as any).default || popplerImport; - const popplerInstance = new poppler.Poppler(); - const tmpPdf = tmp.fileSync(); - const tempPdfPath = tmpPdf.name; - const antialiasOption: "fast" | "best" | "default" | "good" | "gray" | "none" | "subpixel" = "best"; - try { - await fs.promises.writeFile(tempPdfPath, pdfData); - const options = { - antialias: antialiasOption, - firstPageToConvert: index, - lastPageToConvert: index, - jpegFile: true, - jpegOptions: `quality=${quality}`, - singleFile: true - }; - - const jpegBuffer = await popplerInstance.pdfToCairo(tempPdfPath, undefined, options); - - await fs.promises.unlink(tempPdfPath); - - return jpegBuffer; - } catch (error) { - logger.error("Error rasterizing PDF:", error); - throw error; - } finally { - tmpPdf.removeCallback(); - } -} - /** * Performs linear interpolation between two numbers. * @param start The starting value. 
diff --git a/src/pdf/pdfExtractor.ts b/src/pdf/pdfExtractor.ts new file mode 100644 index 00000000..78a0ae99 --- /dev/null +++ b/src/pdf/pdfExtractor.ts @@ -0,0 +1,142 @@ +// eslint-disable-next-line @typescript-eslint/ban-ts-comment +// @ts-ignore +import type * as pdfLibTypes from "@cantoo/pdf-lib"; +import { LocalInputSource, PageOptions, PageOptionsOperation, PathInput } from "@/input/index.js"; +import { logger } from "@/logger.js"; +import path from "path"; +import { loadOptionalDependency } from "@/dependency/index.js"; +import { MindeeInputSourceError, MindeePdfError } from "@/errors/index.js"; +import { ExtractedPdf } from "@/pdf/extractedPdf.js"; +import { createPdfFromInputSource, extractPages } from "@/pdf/pdfOperation.js"; + +let pdfLib: typeof pdfLibTypes | null = null; + +async function getPdfLib(): Promise { + if (!pdfLib) { + const pdfLibImport = await loadOptionalDependency("@cantoo/pdf-lib", "Text Embedding"); + pdfLib = (pdfLibImport as any).default || pdfLibImport; + } + return pdfLib!; +} + +export class PdfExtractor { + /** + * Buffer containing the PDF data. + * @private + */ + private sourcePdf: Buffer | null = null; + /** + * Filename of the PDF. + * @private + */ + private filename: string | null = null; + /** + * Input document. + * @private + */ + private readonly inputDocument: string | LocalInputSource; + /** + * Whether the extractor has been initialized. + * @private + */ + private initialized: boolean = false; + /** + * PDF library instance. + * @private + */ + private pdfLib: typeof pdfLibTypes | null = null; + + /** + * List of extracted PDFs. 
+ * @private + */ + private extractedPdfs: ExtractedPdf[] | null = null; + + constructor(inputDocument: string | LocalInputSource) { + this.inputDocument = inputDocument; + } + + async init() { + this.pdfLib = await getPdfLib(); + if (typeof this.inputDocument === "string") { + logger.debug(`Loading from path: ${this.inputDocument}`); + try { + const tempPathInput = new PathInput({ inputPath: this.inputDocument }); + await tempPathInput.init(); + if (tempPathInput.isPdf()) { + this.sourcePdf = tempPathInput.fileObject; + } else { + const pdfObject = await createPdfFromInputSource(tempPathInput); + this.sourcePdf = Buffer.from(await pdfObject.save()); + } + } catch { + throw new MindeeInputSourceError("Couldn't generate PDF from input."); + } + this.filename = path.basename(this.inputDocument); + } else { + logger.debug(`Loading document: ${this.inputDocument.filename}`); + await this.inputDocument.init(); + if (this.inputDocument.isPdf()) { + this.sourcePdf = this.inputDocument.fileObject as Buffer; + } else { + const pdfObject = await createPdfFromInputSource(this.inputDocument); + const arrayBuffer = await pdfObject.save(); + this.sourcePdf = Buffer.from(arrayBuffer); + } + this.filename = this.inputDocument.filename; + } + this.initialized = true; + if (!this.sourcePdf) { + throw new MindeePdfError("Could not load PDF source."); + } + } + + /** + * Gets the number of pages in the PDF. + * @returns The number of pages in the PDF. + */ + async getPageCount() { + if (!this.initialized) { + await this.init(); + } + const currentPdf = await this.pdfLib!.PDFDocument.load(this.sourcePdf!, { + ignoreEncryption: true, + password: "" + }); + return currentPdf.getPageCount(); + } + + /** + * Extracts pages from the PDF. 
+ * @param pageIndexes + */ + async extractSubDocuments(pageIndexes: number[][]): Promise { + if (this.extractedPdfs && this.extractedPdfs.length > 0) { + return this.extractedPdfs; + } + if (!this.initialized) { + await this.init(); + } + this.extractedPdfs = []; + for (const pageRange of pageIndexes) { + logger.debug(`Extracting pages ${pageRange.join(", ")}`); + if (pageRange.length === 0) { + throw new MindeeInputSourceError("Empty indexes not allowed for extraction."); + } + const pageOptions: PageOptions = { + pageIndexes: pageRange, + operation: PageOptionsOperation.KeepOnly, + onMinPages: 1, + }; + const splitName = path.basename(this.filename!, path.extname(this.filename!)); + + const startPage = String(pageRange[0] + 1).padStart(3, "0"); + const endPage = String(pageRange[pageRange.length - 1] + 1).padStart(3, "0"); + + const fieldFilename = `${splitName}_page_${startPage}-${endPage}.pdf`; + const page = await extractPages(this.sourcePdf!, pageOptions); + this.extractedPdfs.push(new ExtractedPdf(page.file, fieldFilename, pageRange.length)); + } + return this.extractedPdfs; + } +} diff --git a/src/pdf/pdfOperation.ts b/src/pdf/pdfOperation.ts index ebc22bb4..8af6b32a 100644 --- a/src/pdf/pdfOperation.ts +++ b/src/pdf/pdfOperation.ts @@ -3,9 +3,10 @@ import type * as pdfLibTypes from "@cantoo/pdf-lib"; import { errorHandler } from "@/errors/handler.js"; import { PageOptions, PageOptionsOperation } from "@/input/pageOptions.js"; -import { MindeeError } from "@/errors/index.js"; +import { MindeeError, MindeeInputSourceError } from "@/errors/index.js"; import { logger } from "@/logger.js"; import { loadOptionalDependency } from "@/dependency/index.js"; +import { LocalInputSource } from "@/input/index.js"; let pdfLib: typeof pdfLibTypes | null = null; @@ -111,3 +112,37 @@ export async function countPages(file: Buffer): Promise { }); return currentPdf.getPageCount(); } + + +/** + * Creates a PDF from a local file. Converts images to PDFs if needed. 
+ * @param inputSource The input source to create a PDF from. + */ +export async function createPdfFromInputSource(inputSource: LocalInputSource) { + const pdfLib = await getPdfLib(); + let pdfDoc: pdfLibTypes.PDFDocument; + if (!["image/jpeg", "image/jpg", "image/png", "application/pdf"].includes(inputSource.mimeType)) { + throw new MindeeInputSourceError( + 'Unsupported file type "' + + inputSource.mimeType + + '" Currently supported types are .png, .jpg and .pdf' + ); + } else if (inputSource.isPdf()) { + pdfDoc = await pdfLib.PDFDocument.load(inputSource.fileObject, { + ignoreEncryption: true, + password: "" + }); + } else { + pdfDoc = await pdfLib.PDFDocument.create(); + let image: pdfLibTypes.PDFImage; + if (inputSource.mimeType === "image/png") { + image = await pdfDoc.embedPng(inputSource.fileObject); + } else { + image = await pdfDoc.embedJpg(inputSource.fileObject); + } + const imageDims = image.scale(1); + const pageImage = pdfDoc.addPage([imageDims.width, imageDims.height]); + pageImage.drawImage(image); + } + return pdfDoc; +} diff --git a/src/pdf/pdfUtils.ts b/src/pdf/pdfUtils.ts index ea32f595..748d9c9b 100644 --- a/src/pdf/pdfUtils.ts +++ b/src/pdf/pdfUtils.ts @@ -1,8 +1,14 @@ // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-ignore import type * as pdfJsExtractTypes from "pdf.js-extract"; +// eslint-disable-next-line @typescript-eslint/ban-ts-comment +// @ts-ignore +import type * as popplerTypes from "node-poppler"; +import tmp from "tmp"; +import * as fs from "node:fs"; import { MindeePdfError } from "@/errors/index.js"; import { loadOptionalDependency } from "@/dependency/index.js"; +import { logger } from "@/logger.js"; export interface PageTextInfo { @@ -85,3 +91,46 @@ export async function hasSourceText(pdfData: Buffer): Promise { const text = await extractTextFromPdf(pdfData); return text.getConcatenatedText().trim().length > 0; } + +/** + * Rasterizes a PDF page. 
+ * + * @param pdfData Buffer representation of the entire PDF file. + * @param index Index of the page to rasterize. + * @param quality Quality to apply during rasterization. + * @return Buffer containing the rasterized image data. + */ +export async function rasterizePage( + pdfData: Buffer, index: number, quality = 85 +): Promise { + const popplerImport = await loadOptionalDependency( + "node-poppler", "Image Processing" + ); + const poppler = (popplerImport as any).default || popplerImport; + const popplerInstance = new poppler.Poppler(); + const tmpPdf = tmp.fileSync(); + const tempPdfPath = tmpPdf.name; + const antialiasOption: "fast" | "best" | "default" | "good" | "gray" | "none" | "subpixel" = "best"; + try { + await fs.promises.writeFile(tempPdfPath, pdfData); + const options = { + antialias: antialiasOption, + firstPageToConvert: index, + lastPageToConvert: index, + jpegFile: true, + jpegOptions: `quality=${quality}`, + singleFile: true + }; + + const jpegBuffer = await popplerInstance.pdfToCairo(tempPdfPath, undefined, options); + + await fs.promises.unlink(tempPdfPath); + + return Buffer.from(jpegBuffer, "binary"); + } catch (error) { + logger.error("Error rasterizing PDF:", error); + throw error; + } finally { + tmpPdf.removeCallback(); + } +} diff --git a/src/v1/extraction/multiReceiptsExtractor/multiReceiptsExtractor.ts b/src/v1/extraction/multiReceiptsExtractor/multiReceiptsExtractor.ts index c49e37b5..a751b401 100644 --- a/src/v1/extraction/multiReceiptsExtractor/multiReceiptsExtractor.ts +++ b/src/v1/extraction/multiReceiptsExtractor/multiReceiptsExtractor.ts @@ -1,7 +1,8 @@ // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-ignore import type * as pdfLibTypes from "@cantoo/pdf-lib"; -import { MindeeError, MindeeInputSourceError } from "@/errors/index.js"; +import { MindeeError } from "@/errors/index.js"; +import { createPdfFromInputSource } from "@/pdf/pdfOperation.js"; import { Polygon } from "@/geometry/index.js"; import { 
MultiReceiptsDetectorV1 } from "@/v1/product/index.js"; import { ExtractedMultiReceiptImage } from "@/v1/extraction/index.js"; @@ -33,7 +34,8 @@ async function extractReceiptsFromPage( pdfPage: pdfLibTypes.PDFPage, boundingBoxes: Polygon[], pageId: number) { - const extractedReceiptsRaw = await extractFromPage(pdfPage, boundingBoxes); + const manualUpscaleFactor = 300/72; + const extractedReceiptsRaw = await extractFromPage(pdfPage, boundingBoxes, false, manualUpscaleFactor); const extractedReceipts = []; for (let i = 0; i < extractedReceiptsRaw.length; i++) { extractedReceipts.push(new ExtractedMultiReceiptImage(extractedReceiptsRaw[i], pageId, i)); @@ -41,35 +43,6 @@ async function extractReceiptsFromPage( return extractedReceipts; } -async function loadPdfDoc(inputFile: LocalInputSource) { - const pdfLib = await getPdfLib(); - let pdfDoc: pdfLibTypes.PDFDocument; - if (!["image/jpeg", "image/jpg", "image/png", "application/pdf"].includes(inputFile.mimeType)) { - throw new MindeeInputSourceError( - 'Unsupported file type "' + - inputFile.mimeType + - '" Currently supported types are .png, .jpg and .pdf' - ); - } else if (inputFile.isPdf()) { - pdfDoc = await pdfLib.PDFDocument.load(inputFile.fileObject, { - ignoreEncryption: true, - password: "" - }); - } else { - pdfDoc = await pdfLib.PDFDocument.create(); - let image: pdfLibTypes.PDFImage; - if (inputFile.mimeType === "image/png") { - image = await pdfDoc.embedPng(inputFile.fileObject); - } else { - image = await pdfDoc.embedJpg(inputFile.fileObject); - } - const imageDims = image.scale(1); - const pageImage = pdfDoc.addPage([imageDims.width, imageDims.height]); - pageImage.drawImage(image); - } - return pdfDoc; -} - /** * Extracts individual receipts from multi-receipts documents. 
* @@ -86,9 +59,9 @@ export async function extractReceipts( if (!inference.prediction.receipts) { throw new MindeeError("No possible receipts candidates found for MultiReceipts extraction."); } - const pdfDoc = await loadPdfDoc(inputFile); + const pdfDoc = await createPdfFromInputSource(inputFile); for (let pageId = 0; pageId < pdfDoc.getPageCount(); pageId++) { - const [page] = await pdfDoc.copyPages(pdfDoc, [pageId]); + const page = pdfDoc.getPage(pageId); page.setRotation(pdfLib.degrees(inference.pages[pageId].orientation?.value ?? 0)); const receiptPositions = inference.pages[pageId].prediction.receipts.map( (receipt: PositionField) => receipt.boundingBox diff --git a/src/v2/fileOperations/crop.ts b/src/v2/fileOperations/crop.ts new file mode 100644 index 00000000..591013ae --- /dev/null +++ b/src/v2/fileOperations/crop.ts @@ -0,0 +1,47 @@ +import { LocalInputSource } from "@/input/index.js"; +import { CropItem } from "@/v2/product/crop/index.js"; +import { MindeeError } from "@/errors/index.js"; +import { extractImagesFromPolygon } from "@/image/imageExtractor.js"; +import { Polygon } from "@/geometry/index.js"; +import { CropFiles } from "@/v2/fileOperations/cropFiles.js"; +import { ExtractedImage } from "@/image/index.js"; +import { logger } from "@/logger.js"; + + +/** + * Extracts a single specified crop from a given input source. + * @param inputSource Local input source. + * @param crop Crop to extract. + */ +export async function extractSingleCrop(inputSource: LocalInputSource, crop: CropItem): Promise { + return (await extractCrops(inputSource, [crop]))[0]; +} + + +/** + * Extracts a list of crops from a document. + * @param inputSource Local input source. + * @param crops List of crops to extract. + * @param quality JPEG quality of extracted images. + * @return a list of extracted files, as a CropFiles object. 
+ */ +export async function extractCrops( + inputSource: LocalInputSource, + crops: CropItem[], + quality?: number , +): Promise { + if (crops.length === 0) { + throw new MindeeError("No crop indexes provided."); + } + logger.debug("Extracting crops: " + crops.join(", ")); + const polygonsByPage = new Map(); + for (const crop of crops) { + const pageId: number = crop.location.page; + if (!polygonsByPage.has(pageId)) { + polygonsByPage.set(pageId, []); + } + polygonsByPage.get(pageId)!.push(crop.location.polygon); + } + const extractedCrops = await extractImagesFromPolygon(inputSource, polygonsByPage, quality); + return new CropFiles(...extractedCrops); +} diff --git a/src/v2/fileOperations/cropFiles.ts b/src/v2/fileOperations/cropFiles.ts new file mode 100644 index 00000000..61d3ff22 --- /dev/null +++ b/src/v2/fileOperations/cropFiles.ts @@ -0,0 +1,7 @@ +import { ExtractedImage } from "@/image/index.js"; + +export class CropFiles extends Array { + constructor(...items: ExtractedImage[]) { + super(...items); + } +} diff --git a/src/v2/fileOperations/index.ts b/src/v2/fileOperations/index.ts new file mode 100644 index 00000000..e69de29b diff --git a/src/v2/fileOperations/split.ts b/src/v2/fileOperations/split.ts new file mode 100644 index 00000000..cedb065d --- /dev/null +++ b/src/v2/fileOperations/split.ts @@ -0,0 +1,53 @@ +import { LocalInputSource } from "@/input/index.js"; +import { MindeeError } from "@/errors/index.js"; +import { PdfExtractor } from "@/pdf/pdfExtractor.js"; +import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; +import { logger } from "@/logger.js"; +import { ExtractedPdf } from "@/pdf/extractedPdf.js"; + +/** + * Extracts a single specified split from a + * @param inputSource + * @param split + */ +export async function extractSingleSplit(inputSource: LocalInputSource, split: number[]) { + return await extractSplits(inputSource, [split]); +} + +/** + * Extracts splits as complete PDFs from the document. 
+ * @param inputSource Local input source. + * @param splits List of sub-lists of pages to keep. + * @return a list of extracted files. + * @throws MindeeError if no indexes are provided. + */ +export async function extractSplits(inputSource: LocalInputSource, splits: number[][]): Promise { + const pageGroups = splits.filter(e => e.length > 0); + if (pageGroups.length === 0) { + throw new MindeeError("No valid split indexes provided."); + } + logger.debug("Extracting splits: " + splits.join(", ")); + const pdfExtractor = new PdfExtractor(inputSource); + await pdfExtractor.init(); + + if (splits.length === 0) { + return new SplitFiles(); + } + const pageCount = await pdfExtractor.getPageCount(); + if (splits.length === 1 && splits[0].at(-1) === pageCount-1) { + return new SplitFiles(new ExtractedPdf(inputSource.fileObject as Buffer, inputSource.filename, pageCount)); + } + const subDocuments = await pdfExtractor.extractSubDocuments(pageGroups); + return new SplitFiles(...subDocuments); +} + +/** + * Expands a range of pages into a list of page indexes. 
+ * @param range start and end of the page range + */ +export function expandRange(range: [number, number]): number[] { + if (range[0] > range[1]) { + throw new MindeeError("Invalid page range provided."); + } + return Array.from({ length: range[1] - range[0] + 1 }, (_, i) => range[0] + i); +} diff --git a/src/v2/fileOperations/splitFiles.ts b/src/v2/fileOperations/splitFiles.ts new file mode 100644 index 00000000..14dcb755 --- /dev/null +++ b/src/v2/fileOperations/splitFiles.ts @@ -0,0 +1,8 @@ +import { ExtractedPdf } from "@/pdf/extractedPdf.js"; + +export class SplitFiles extends Array { + + constructor(...args: ExtractedPdf[]) { + super(...args); + } +} diff --git a/src/v2/product/crop/cropItem.ts b/src/v2/product/crop/cropItem.ts index c49c3872..6f5a56fd 100644 --- a/src/v2/product/crop/cropItem.ts +++ b/src/v2/product/crop/cropItem.ts @@ -1,5 +1,8 @@ import { FieldLocation } from "@/v2/parsing/inference/field/index.js"; import { StringDict } from "@/parsing/index.js"; +import { LocalInputSource } from "@/input/index.js"; +import { extractCrops } from "@/v2/fileOperations/crop.js"; +import { ExtractedImage } from "@/image/index.js"; export class CropItem { objectType: string; @@ -13,4 +16,13 @@ export class CropItem { toString(): string { return `* :Location: ${this.location}\n :Object Type: ${this.objectType}`; } + + /** + * Extracts a single crop from an input. + * @param inputSource The input file to extract from. + * @param quality Optional quality parameter for image extraction, default is undefined (full quality). 
+ */ + async extractFromFile(inputSource: LocalInputSource, quality: number = 1): Promise{ + return (await extractCrops(inputSource, [this], quality))[0]; + } } diff --git a/src/v2/product/crop/cropResponse.ts b/src/v2/product/crop/cropResponse.ts index c50b2518..0f63e316 100644 --- a/src/v2/product/crop/cropResponse.ts +++ b/src/v2/product/crop/cropResponse.ts @@ -1,6 +1,9 @@ +import { LocalInputSource } from "@/input/index.js"; import { StringDict } from "@/parsing/stringDict.js"; -import { CropInference } from "./cropInference.js"; +import { extractCrops } from "@/v2/fileOperations/crop.js"; +import { CropFiles } from "@/v2/fileOperations/cropFiles.js"; import { BaseResponse } from "@/v2/parsing/index.js"; +import { CropInference } from "./cropInference.js"; export class CropResponse extends BaseResponse { /** @@ -15,4 +18,13 @@ export class CropResponse extends BaseResponse { super(serverResponse); this.inference = new CropInference(serverResponse["inference"]); } + + /** + * Extracts all crops from an input. + * @param inputSource The input file to extract from. + * @param quality Optional quality parameter for image extraction, default is undefined (full quality). + */ + async extractFromFile(inputSource: LocalInputSource, quality: number = 1): Promise { + return await extractCrops(inputSource, this.inference.result.crops, quality); + } } diff --git a/src/v2/product/split/splitRange.ts b/src/v2/product/split/splitRange.ts index 0974b435..059bfbc7 100644 --- a/src/v2/product/split/splitRange.ts +++ b/src/v2/product/split/splitRange.ts @@ -1,4 +1,6 @@ import { StringDict } from "@/parsing/index.js"; +import { LocalInputSource } from "@/input/index.js"; +import { expandRange, extractSplits } from "@/v2/fileOperations/split.js"; /** * Split inference result. 
@@ -24,4 +26,13 @@ export class SplitRange { const pageRange = this.pageRange.join(","); return `* :Page Range: ${pageRange}\n :Document Type: ${this.documentType}`; } + + /** + * Extracts a single split from the input file. + * @param inputSource The input file to extract from. + */ + async extractFromFile(inputSource: LocalInputSource) { + const pageRange = [expandRange(this.pageRange as [number, number])]; + return (await extractSplits(inputSource, pageRange))[0]; + } } diff --git a/src/v2/product/split/splitResponse.ts b/src/v2/product/split/splitResponse.ts index 891cb30d..123ab286 100644 --- a/src/v2/product/split/splitResponse.ts +++ b/src/v2/product/split/splitResponse.ts @@ -1,6 +1,9 @@ import { StringDict } from "@/parsing/stringDict.js"; import { SplitInference } from "./splitInference.js"; import { BaseResponse } from "@/v2/parsing/index.js"; +import { LocalInputSource } from "@/input/index.js"; +import { expandRange, extractSplits } from "@/v2/fileOperations/split.js"; +import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; export class SplitResponse extends BaseResponse { /** @@ -15,4 +18,16 @@ export class SplitResponse extends BaseResponse { super(serverResponse); this.inference = new SplitInference(serverResponse["inference"]); } + + /** + * Extracts all splits from an input PDF. + * @param inputSource The input file to extract from. 
+ */ + async extractFromFile(inputSource: LocalInputSource): Promise<SplitFiles> { + const splits: number[][] = []; + for (const split of this.inference.result.splits) { + splits.push(expandRange(split.pageRange as [number, number])); + } + return await extractSplits(inputSource, splits); + } } diff --git a/tests/index.ts b/tests/index.ts index 2d161c7d..69a5c918 100644 --- a/tests/index.ts +++ b/tests/index.ts @@ -4,6 +4,7 @@ import path from "path"; const currentDirName = dirname(fileURLToPath(import.meta.url)); export const RESOURCE_PATH = path.join(currentDirName, "data"); +export const OUTPUT_PATH = path.join(RESOURCE_PATH, "output"); export const V1_RESOURCE_PATH = path.join(RESOURCE_PATH, "v1"); export const V1_PRODUCT_PATH = path.join(V1_RESOURCE_PATH, "products"); diff --git a/tests/v1/extraction/multiReceipts.spec.ts b/tests/v1/extraction/multiReceipts.spec.ts index f520a346..49cc7f62 100644 --- a/tests/v1/extraction/multiReceipts.spec.ts +++ b/tests/v1/extraction/multiReceipts.spec.ts @@ -15,7 +15,6 @@ const dataPath = { multiPageSample: path.join(V1_PRODUCT_PATH, "multi_receipts_detector/multipage_sample.pdf"), }; describe("MindeeV1 - Multi-Receipt Extraction #OptionalDepsRequired", () => { - describe("A single-page multi-receipts document", () => { it("should be split properly.", async () => { const jsonDataNA = await fs.readFile(path.resolve(dataPath.complete)); diff --git a/tests/v2/fileOperations/crop.integration.ts b/tests/v2/fileOperations/crop.integration.ts new file mode 100644 index 00000000..fa524398 --- /dev/null +++ b/tests/v2/fileOperations/crop.integration.ts @@ -0,0 +1,85 @@ +import { after, beforeEach, describe, it } from "node:test"; +import assert from "node:assert/strict"; +import path from "node:path"; +import * as fs from "node:fs"; + +import { Client, PathInput } from "@/index.js"; +import { Crop } from "@/v2/product/crop/index.js"; +import { Extraction, ExtractionResponse } from "@/v2/product/extraction/index.js"; +import { extractCrops } from
"@/v2/fileOperations/crop.js"; +import { V2_PRODUCT_PATH, OUTPUT_PATH } from "../../index.js"; +import { SimpleField } from "@/v2/parsing/inference/field/index.js"; + + +function checkFindocReturn(findocResponse: ExtractionResponse) { + assert.ok(findocResponse.inference.model.id.length > 0); + const totalAmount = findocResponse.inference.result.fields.get("total_amount") as SimpleField; + assert.ok(totalAmount !== undefined); + assert.ok((totalAmount.value as number) > 0); +} + +describe("MindeeV2 - Integration - FileOperation - Crop #OptionalDepsRequired", { timeout: 120000 }, () => { + let client: Client; + let cropModelId: string; + let findocModelId: string; + + const cropSample = path.join( + V2_PRODUCT_PATH, + "crop", + "default_sample.jpg" + ); + + beforeEach(() => { + const apiKey = process.env["MINDEE_V2_API_KEY"] ?? ""; + cropModelId = process.env["MINDEE_V2_SE_TESTS_CROP_MODEL_ID"] ?? ""; + findocModelId = process.env["MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID"] ?? ""; + + client = new Client({ apiKey: apiKey, debug: true }); + }); + + after(() => { + const file1 = path.join(OUTPUT_PATH, "crop_001.jpg"); + const file2 = path.join(OUTPUT_PATH, "crop_002.jpg"); + if (fs.existsSync(file1)) fs.rmSync(file1); + if (fs.existsSync(file2)) fs.rmSync(file2); + }); + + it("extracts crops from image correctly", async () => { + const cropInput = new PathInput({ inputPath: cropSample }); + + const cropParams = { modelId: cropModelId }; + + const response = await client.enqueueAndGetResult( + Crop, cropInput, cropParams + ); + + assert.equal(response.inference.result.crops.length, 2); + + const extractedImages = await extractCrops(cropInput, response.inference.result.crops); + + assert.equal(extractedImages.length, 2); + assert.equal(extractedImages[0].filename, "default_sample.jpg_page0-0.jpg"); + assert.equal(extractedImages[1].filename, "default_sample.jpg_page0-1.jpg"); + + const extractionInput = extractedImages[0].asSource(); + const findocParams = { modelId: 
findocModelId }; + + const invoice0 = await client.enqueueAndGetResult( + Extraction, extractionInput, findocParams + ); + + checkFindocReturn(invoice0); + + const file1Path = path.join(OUTPUT_PATH, "crop_001.jpg"); + const file2Path = path.join(OUTPUT_PATH, "crop_002.jpg"); + + fs.writeFileSync(file1Path, extractedImages[0].buffer); + fs.writeFileSync(file2Path, extractedImages[1].buffer); + + const stat1 = fs.statSync(file1Path); + assert.ok(stat1.size >= 3100000 && stat1.size <= 3200000); + + const stat2 = fs.statSync(file2Path); + assert.ok(stat2.size >= 3200000 && stat2.size <= 3300000); + }); +}); diff --git a/tests/v2/fileOperations/crop.spec.ts b/tests/v2/fileOperations/crop.spec.ts new file mode 100644 index 00000000..18c7218b --- /dev/null +++ b/tests/v2/fileOperations/crop.spec.ts @@ -0,0 +1,122 @@ +import { loadOptionalDependency } from "@/dependency/index.js"; +import { ExtractedImage } from "@/image/index.js"; +import { PathInput } from "@/index.js"; +import { extractCrops } from "@/v2/fileOperations/crop.js"; + +import { LocalResponse } from "@/v2/parsing/index.js"; +import { CropResponse } from "@/v2/product/crop/cropResponse.js"; +import type * as pdfLibTypes from "@cantoo/pdf-lib"; +import assert from "node:assert/strict"; +import { describe, it } from "node:test"; +import path from "path"; +import type * as SharpTypes from "sharp"; +import { V2_PRODUCT_PATH } from "../../index.js"; + +const cropPath = path.join(V2_PRODUCT_PATH, "crop"); +let pdfLib: typeof pdfLibTypes | null = null; + +async function getPdfLib(): Promise<typeof pdfLibTypes> { + if (!pdfLib) { + const pdfLibImport = await loadOptionalDependency("@cantoo/pdf-lib", "Text Embedding"); + pdfLib = (pdfLibImport as any).default || pdfLibImport; + } + return pdfLib!; +} + +async function loadV2Crop(resourcePath: string): Promise<CropResponse> { + const localResponse = new LocalResponse(resourcePath); + return localResponse.deserializeResponse(CropResponse); +} +/** + * Gets dimensions of a buffer, routing to pdf-lib for
PDFs and sharp for images. + */ +async function getFileDimensions(buffer: Buffer, sharpInstance: any) { + const isPdf = buffer.subarray(0, 4).toString("ascii") === "%PDF"; + const pdfLib = await getPdfLib(); + if (isPdf) { + const pdfDoc = await pdfLib.PDFDocument.load(buffer); + const page = pdfDoc.getPage(0); + const { width, height } = page.getSize(); + + return { width, height }; + } + const metadata = await sharpInstance(buffer).metadata(); + return { width: metadata.width, height: metadata.height }; + +} + +describe("MindeeV2 - FileOperation - Crop #OptionalDepsRequired", async () => { + const sharpLoaded = await loadOptionalDependency("sharp", "Image compression"); + const sharp = (sharpLoaded as any).default || sharpLoaded; + await it("should process single page crop correctly", async () => { + const inputSample = new PathInput({ + inputPath: + path.join(cropPath, "default_sample.jpg") + }); + const response = await loadV2Crop( + path.join(cropPath, "crop_single.json") + ); + + const extractedCrops = await response.extractFromFile(inputSample); + + assert.strictEqual(extractedCrops.length, 1); + + assert.strictEqual(extractedCrops[0].pageId, 0); + const dimensions = await getFileDimensions(extractedCrops[0].buffer, sharp); + assert.strictEqual(Math.round(dimensions.width), 5880); + assert.strictEqual(Math.round(dimensions.height), 3275); + const localExtract: ExtractedImage = await response.inference.result.crops[0].extractFromFile(inputSample); + assert.ok(localExtract.buffer.equals(extractedCrops[0].buffer)); + }); + + await it("should extract and still work with lower quality", async () => { + const inputSample = new PathInput({ + inputPath: + path.join(cropPath, "default_sample.jpg") + }); + const response = await loadV2Crop( + path.join(cropPath, "crop_single.json") + ); + + const extractedCrops = await response.extractFromFile(inputSample, 0.4); + + assert.strictEqual(extractedCrops.length, 1); + + assert.strictEqual(extractedCrops[0].pageId, 0); + 
const dimensions = await getFileDimensions(extractedCrops[0].buffer, sharp); + assert.strictEqual(Math.round(dimensions.width), 5880 * 0.4); + assert.strictEqual(Math.round(dimensions.height), 3275 * 0.4); + const localExtract: ExtractedImage = await response.inference.result.crops[0].extractFromFile(inputSample, 0.4); + assert.ok(localExtract.buffer.equals(extractedCrops[0].buffer)); + }); + + await it("should process multi page receipt crops correctly", async () => { + const inputSample = new PathInput({ + inputPath: + path.join(cropPath, "multipage_sample.pdf") + }); + const response = await loadV2Crop( + path.join(cropPath, "crop_multiple.json") + ); + + const extractedCrops = await extractCrops( + inputSample, + response.inference.result.crops + ); + + assert.strictEqual(extractedCrops.length, 2); + + assert.strictEqual(extractedCrops[0].pageId, 0); + assert.strictEqual(extractedCrops[0].elementId, 0); + + const dimensions1 = await getFileDimensions(extractedCrops[0].buffer, sharp); + assert.strictEqual(Math.round(dimensions1.width), 325); + assert.strictEqual(Math.round(dimensions1.height), 1579); + + assert.strictEqual(extractedCrops[1].pageId, 0); + assert.strictEqual(extractedCrops[1].elementId, 1); + const dimensions2 = await getFileDimensions(extractedCrops[1].buffer, sharp); + assert.strictEqual(Math.round(dimensions2.width), 391); + assert.strictEqual(Math.round(dimensions2.height), 1439); + }); +}); diff --git a/tests/v2/fileOperations/split.integration.ts b/tests/v2/fileOperations/split.integration.ts new file mode 100644 index 00000000..10312d29 --- /dev/null +++ b/tests/v2/fileOperations/split.integration.ts @@ -0,0 +1,89 @@ +import { after, beforeEach, describe, it } from "node:test"; +import assert from "node:assert/strict"; +import path from "node:path"; +import * as fs from "node:fs"; + +import { Client, PathInput } from "@/index.js"; +import { Split } from "@/v2/product/split/index.js"; +import { Extraction, ExtractionResponse } from 
"@/v2/product/extraction/index.js"; +import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; +import { V2_PRODUCT_PATH } from "../../index.js"; +import { SimpleField } from "@/v2/parsing/inference/field/index.js"; +const OUTPUT_DIR = path.join(__dirname, "output"); + +function checkFindocReturn(findocResponse: ExtractionResponse) { + assert.ok(findocResponse.inference.model.id.length > 0); + const totalAmount = findocResponse.inference.result.fields.get("total_amount") as SimpleField; + assert.ok(totalAmount !== undefined); + assert.ok((totalAmount.value as number) > 0); +} + +describe("MindeeV2 - Integration - Product - Split #OptionalDepsRequired", { timeout: 120000 }, () => { + let client: Client; + let splitModelId: string; + let findocModelId: string; + + const splitSample = path.join( + V2_PRODUCT_PATH, + "split", + "default_sample.pdf" + ); + + beforeEach(() => { + const apiKey = process.env["MINDEE_V2_API_KEY"] ?? ""; + splitModelId = process.env["MINDEE_V2_SE_TESTS_SPLIT_MODEL_ID"] ?? ""; + findocModelId = process.env["MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID"] ?? 
""; + + client = new Client({ apiKey: apiKey, debug: true }); + }); + + after(() => { + const file1 = path.join(OUTPUT_DIR, "split_001.pdf"); + const file2 = path.join(OUTPUT_DIR, "split_002.pdf"); + if (fs.existsSync(file1)) fs.rmSync(file1); + if (fs.existsSync(file2)) fs.rmSync(file2); + }); + + it("extracts splits from pdf correctly", async () => { + const splitInput = new PathInput({ inputPath: splitSample }); + + const splitParams = { modelId: splitModelId }; + + const response: any = await client.enqueueAndGetResult( + Split, splitInput, splitParams + ); + + assert.equal(response.inference.file.pageCount, 2); + + const extractedPdfs: SplitFiles = await response.extractFromFile(splitInput); + + assert.equal(extractedPdfs.length, 2); + assert.equal(extractedPdfs[0].filename, "default_sample_page_001-001.pdf"); + assert.equal(extractedPdfs[1].filename, "default_sample_page_002-002.pdf"); + + const extractionInput = extractedPdfs[0].asSource(); + + const findocParams = { modelId: findocModelId }; + + const invoice0 = await client.enqueueAndGetResult( + Extraction, extractionInput, findocParams + ); + + checkFindocReturn(invoice0 as ExtractionResponse); + + const file1Path = path.join(OUTPUT_DIR, "split_001.pdf"); + const file2Path = path.join(OUTPUT_DIR, "split_002.pdf"); + + await extractedPdfs[0].saveToFileAsync(file1Path); + await extractedPdfs[1].saveToFileAsync(file2Path); + + + const inputSource1 = new PathInput({ inputPath: file1Path }); + const pageCount1 = await inputSource1.getPageCount(); + assert.equal(pageCount1, extractedPdfs[0].pageCount); + + const inputSource2 = new PathInput({ inputPath: file1Path }); + const pageCount2 = await inputSource2.getPageCount(); + assert.equal(pageCount2, extractedPdfs[1].pageCount); + }); +}); diff --git a/tests/v2/fileOperations/split.spec.ts b/tests/v2/fileOperations/split.spec.ts new file mode 100644 index 00000000..a9170e8b --- /dev/null +++ b/tests/v2/fileOperations/split.spec.ts @@ -0,0 +1,83 @@ +import { 
PathInput } from "@/index.js"; +import { ExtractedPdf } from "@/pdf/extractedPdf.js"; +import { extractSplits } from "@/v2/fileOperations/split.js"; +import { SplitFiles } from "@/v2/fileOperations/splitFiles.js"; + +import { LocalResponse } from "@/v2/parsing/index.js"; +import { SplitResponse } from "@/v2/product/split/splitResponse.js"; +import assert from "node:assert/strict"; +import { describe, it } from "node:test"; +import path from "path"; +import { V2_PRODUCT_PATH } from "../../index.js"; + +const splitPath = path.join(V2_PRODUCT_PATH, "split"); +const financialDocumentPath = path.join(V2_PRODUCT_PATH, "extraction", "financial_document"); + +async function loadV2Split(resourcePath: string): Promise<SplitResponse> { + const localResponse = new LocalResponse(resourcePath); + return localResponse.deserializeResponse(SplitResponse); +} + +describe("MindeeV2 - Product - SplitResponse #OptionalDepsRequired", async () => { + + await it("should process single page split correctly", async () => { + const inputSample = new PathInput({ + inputPath: path.join(financialDocumentPath, "default_sample.jpg") + }); + + const response = await loadV2Split( + path.join(splitPath, "split_single.json") + ); + + const extractedSplits = await response.extractFromFile(inputSample); + + assert.strictEqual(extractedSplits.length, 1); + + assert.strictEqual(extractedSplits[0].pageCount, 1); + + const inputBuffer0 = extractedSplits[0].asSource(); + const count0 = await inputBuffer0.getPageCount(); + assert.strictEqual(count0, 1); + }); + + await it("should process multi page receipt split correctly", async () => { + const inputSample = new PathInput({ + inputPath: path.join(splitPath, "invoice_5p.pdf") + }); + + const response = await loadV2Split( + path.join(splitPath, "split_multiple.json") + ); + + const extractedSplits = await response.extractFromFile(inputSample); + + assert.strictEqual(extractedSplits.length, 3); + + assert.strictEqual(extractedSplits[0].pageCount, 1); + const bufferInput0 =
extractedSplits[0].asSource(); + const count0 = await bufferInput0.getPageCount(); + assert.strictEqual(count0, 1); + + const bufferInput1 = extractedSplits[1].asSource(); + const count1 = await bufferInput1.getPageCount(); + assert.strictEqual(extractedSplits[1].pageCount, 3); + assert.strictEqual(count1, 3); + + assert.strictEqual(extractedSplits[2].pageCount, 1); + const bufferInput2 = extractedSplits[2].asSource(); + const count2 = await bufferInput2.getPageCount(); + assert.strictEqual(count2, 1); + const localExtract: ExtractedPdf = await response.inference.result.splits[0].extractFromFile(inputSample); + assert.ok(extractedSplits[0].buffer.equals(localExtract.buffer)); + }); + + await it("extracts a file as itself if the split count is its own length", async () => { + const inputSample = new PathInput({ + inputPath: path.join(splitPath, "invoice_5p.pdf") + }); + const splitFiles: SplitFiles = await extractSplits(inputSample, [[0, 1, 2, 3, 4]]); + assert(splitFiles.length === 1); + assert(splitFiles[0].pageCount === 5); + assert(splitFiles[0].buffer === inputSample.fileObject); + }); +});