Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions src/image/extractedImage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@ import { loadOptionalDependency } from "@/dependency/index.js";
*/
export class ExtractedImage {
public buffer: Buffer;
protected internalFileName: string;
public filename: string;
public pageId?: number;
public elementId?: number;

protected constructor(buffer: Uint8Array, fileName: string) {
constructor(buffer: Uint8Array, fileName: string, pageId?: number, elementId?: number) {
this.buffer = Buffer.from(buffer);
this.internalFileName = fileName;
this.filename = fileName;
this.pageId = pageId;
this.elementId = elementId;
}

/**
Expand Down Expand Up @@ -104,7 +108,7 @@ export class ExtractedImage {
asSource(): BufferInput {
return new BufferInput({
buffer: this.buffer,
filename: this.internalFileName,
filename: this.filename,
});
}
}
85 changes: 69 additions & 16 deletions src/image/imageExtractor.ts
Original file line number Diff line number Diff line change
@@ -1,59 +1,107 @@
import { loadOptionalDependency } from "@/dependency/index.js";
import { MindeeImageError } from "@/errors/index.js";
import { getMinMaxX, getMinMaxY, Polygon } from "@/geometry/index.js";
import { adjustForRotation } from "@/geometry/polygonUtils.js";
import { ExtractedImage } from "@/image/extractedImage.js";
import { LocalInputSource } from "@/input/index.js";
import { logger } from "@/logger.js";
import { createPdfFromInputSource } from "@/pdf/pdfOperation.js";
import { rasterizePage } from "@/pdf/pdfUtils.js";
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
import type * as pdfLibTypes from "@cantoo/pdf-lib";
import { getMinMaxX, getMinMaxY, Polygon } from "@/geometry/index.js";
import { adjustForRotation } from "@/geometry/polygonUtils.js";
import { loadOptionalDependency } from "@/dependency/index.js";

let pdfLib: typeof pdfLibTypes | null = null;

/**
 * Lazily loads the optional `@cantoo/pdf-lib` dependency.
 *
 * The resolved module is stored in the module-level `pdfLib` variable, so the
 * optional dependency is only loaded on the first call.
 *
 * @returns The pdf-lib module.
 */
async function getPdfLib(): Promise<typeof pdfLibTypes> {
  if (pdfLib) {
    return pdfLib;
  }
  const pdfLibImport = await loadOptionalDependency<typeof pdfLibTypes>(
    "@cantoo/pdf-lib", "Image Extraction"
  );
  // Use the `default` export when the import result exposes one, otherwise the import result itself.
  const resolved: typeof pdfLibTypes = (pdfLibImport as any).default || pdfLibImport;
  pdfLib = resolved;
  return resolved;
}


/**
 * Crops every requested polygon out of a document and returns the crops as images.
 *
 * Pages are processed one after the other, in the iteration order of `polygonsPerPage`.
 * Each crop is named `<input filename>_page<pageId>-<index>.jpg`, where `index` is the
 * position of the polygon within its page's list.
 *
 * @param inputSource The input source to extract from.
 * @param polygonsPerPage Polygons to extract, keyed by the page ID handed to `getPage()`.
 * @param quality Optional value forwarded unchanged to `extractFromPage()`.
 * @returns All extracted images, grouped by page in iteration order.
 */
export async function extractImagesFromPolygon(
  inputSource: LocalInputSource,
  polygonsPerPage: Map<number, Polygon[]>,
  quality?: number
) {
  const sourcePdf = await createPdfFromInputSource(inputSource);
  const collected: ExtractedImage[] = [];

  for (const [pageId, polygons] of polygonsPerPage.entries()) {
    logger.debug(`Extracting images from page ${pageId}`);
    const crops = await extractFromPage(sourcePdf.getPage(pageId), polygons, true, quality);
    crops.forEach((crop, index) => {
      const cropName = inputSource.filename + `_page${pageId}-${index}.jpg`;
      collected.push(new ExtractedImage(crop, cropName, pageId, index));
    });
  }

  return collected;
}

/**
* Extracts elements from a page based on a list of bounding boxes.
*
* @param pdfPage PDF Page to extract from.
* @param polygons List of coordinates to pull the elements from.
* @param asImage Whether to return the extracted elements as images.
* @param quality Scale factor applied to the output dimensions, normally between 0 and 1 (defaults to 1). Negative values throw; values above 1 upscale the output and log a warning.
*/
export async function extractFromPage(
pdfPage: pdfLibTypes.PDFPage,
polygons: Polygon[]
polygons: Polygon[],
asImage: boolean = false,
quality?: number,
) {
const pdfLib = await getPdfLib();
const { width, height } = pdfPage.getSize();
const extractedElements :Uint8Array[] = [];
// Manual upscale.
// Fixes issues with the OCR.
const qualityScale = 300/72;
const extractedElements: Uint8Array[] = [];
if (quality && (quality < 0)) {
throw new MindeeImageError("Quality must be a number between 0 and 1");
}
if (quality && quality > 1) {
logger.warn("Quality is greater than 1, this operation will apply a manual upscale on the output." +
" Use only if you know what you are doing.");
}
const qualityScale = quality ?? 1;
const orientation = pdfPage.getRotation().angle;

const sourceDoc = pdfPage.doc;
const pageIndex = sourceDoc.getPages().indexOf(pdfPage);

for (const origPolygon of polygons) {
const polygon = adjustForRotation(origPolygon, orientation);
logger.debug(`Extracting image with polygon: ${origPolygon.toString()}`);

const tempPdf = await pdfLib.PDFDocument.create();

const [copiedPage] = await tempPdf.copyPages(sourceDoc, [pageIndex]);

const polygon = adjustForRotation(origPolygon, orientation);

const newWidth = width * (getMinMaxX(polygon).max - getMinMaxX(polygon).min);
const newHeight = height * (getMinMaxY(polygon).max - getMinMaxY(polygon).min);
const cropped = await tempPdf.embedPage(pdfPage, {

const cropped = await tempPdf.embedPage(copiedPage, {
left: getMinMaxX(polygon).min * width,
right: getMinMaxX(polygon).max * width,
top: height - (getMinMaxY(polygon).min * height),
bottom: height - (getMinMaxY(polygon).max * height),
});

// Determine the final page dimensions based on orientation
let finalWidth: number;
let finalHeight: number;
if (orientation === 90 || orientation === 270) {
// For 90/270 rotations, swap width and height
finalWidth = newHeight * qualityScale;
finalHeight = newWidth * qualityScale;
} else {
Expand All @@ -62,15 +110,14 @@ export async function extractFromPage(
}

const samplePage = tempPdf.addPage([finalWidth, finalHeight]);

samplePage.drawRectangle({
x: 0,
y: 0,
width: finalWidth,
height: finalHeight,
color: pdfLib.rgb(1, 1, 1),
});

// Draw the cropped page with rotation applied
if (orientation === 0) {
samplePage.drawPage(cropped, {
width: newWidth * qualityScale,
Expand Down Expand Up @@ -102,7 +149,13 @@ export async function extractFromPage(
});
}

extractedElements.push(await tempPdf.save());
const pdfBuffer = Buffer.from(await tempPdf.save());
if (asImage) {
extractedElements.push(await rasterizePage(pdfBuffer, 0, 100));
} else {
extractedElements.push(pdfBuffer);
}
}

return extractedElements;
}
71 changes: 71 additions & 0 deletions src/pdf/extractedPdf.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import path from "node:path";
import { BufferInput, MIMETYPES } from "@/input/index.js";
import { MindeeError } from "@/errors/index.js";
import { Buffer } from "node:buffer";
import { writeFile } from "fs/promises";
import { logger } from "@/logger.js";
import { writeFileSync } from "node:fs";

/**
 * A PDF document held in memory, together with its file name and page count.
 */
export class ExtractedPdf {
  /** Raw bytes of the PDF document. */
  public readonly buffer: Buffer;
  /** File name attached to the document; reused by `asSource()`. */
  public readonly filename: string;
  /** Page count as supplied to the constructor; it is not recomputed from the buffer. */
  public readonly pageCount: number;

  /**
   * @param pdfData Raw bytes of the PDF document.
   * @param filename File name to associate with the document.
   * @param pageCount Number of pages in the document.
   */
  constructor(pdfData: Buffer<ArrayBufferLike>, filename: string, pageCount: number) {
    this.pageCount = pageCount;
    this.filename = filename;
    this.buffer = pdfData;
  }

  /**
   * Writes the document to disk asynchronously.
   *
   * When the lowercased extension of `outputPath` is neither ".pdf" nor an entry
   * of `MIMETYPES`, ".pdf" is appended to the path before writing.
   *
   * @param outputPath Path to save the file to.
   * @throws MindeeError when the write fails with a `TypeError`; any other error is rethrown as-is.
   */
  async saveToFileAsync(outputPath: string) {
    const extension = path.extname(outputPath).toLowerCase();
    const hasKnownExtension = extension === ".pdf" || MIMETYPES.has(extension);
    const targetPath = hasKnownExtension ? outputPath : outputPath + ".pdf";

    try {
      await writeFile(path.resolve(targetPath), this.buffer);
      logger.info(`File saved successfully to ${path.resolve(targetPath)}.`);
    } catch (error) {
      if (!(error instanceof TypeError)) {
        throw error;
      }
      throw new MindeeError("Invalid path/filename provided.");
    }
  }

  /**
   * Writes the document to disk synchronously.
   *
   * Unlike `saveToFileAsync()`, the path is used exactly as given: no extension is appended.
   *
   * @param outputPath Path to save the file to.
   * @throws MindeeError when the write fails with a `TypeError`; any other error is rethrown as-is.
   */
  saveToFile(outputPath: string){
    try {
      writeFileSync(path.resolve(outputPath), this.buffer);
      logger.info(`File saved successfully to ${path.resolve(outputPath)}.`);
    } catch (error) {
      if (!(error instanceof TypeError)) {
        throw error;
      }
      throw new MindeeError("Invalid path/filename provided.");
    }
  }

  /**
   * Wraps the document in a Mindee-compatible BufferInput source.
   *
   * @returns A BufferInput built from this document's buffer and file name.
   */
  asSource(): BufferInput {
    const { buffer, filename } = this;
    return new BufferInput({ buffer, filename });
  }
}
51 changes: 2 additions & 49 deletions src/pdf/pdfCompressor.ts
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
import { logger } from "@/logger.js";
import tmp from "tmp";
import * as fs from "node:fs";
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
import type * as popplerTypes from "node-poppler";
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
import type * as pdfLibTypes from "@cantoo/pdf-lib";
import { compressImage } from "@/image/index.js";
import { loadOptionalDependency } from "@/dependency/index.js";
import { ExtractedPdfInfo, extractTextFromPdf, hasSourceText } from "./pdfUtils.js";
import { ExtractedPdfInfo, extractTextFromPdf, hasSourceText, rasterizePage } from "./pdfUtils.js";

let pdfLib: typeof pdfLibTypes | null = null;

Expand Down Expand Up @@ -159,7 +154,7 @@ async function compressPagesWithQuality(
const page = pdfDoc.getPages()[i];
const rasterizedPage = await rasterizePage(pdfData, i + 1, imageQuality);
const compressedImage = await compressImage(
Buffer.from(rasterizedPage, "binary"), imageQuality
rasterizedPage, imageQuality
);
if (!disableSourceText) {
await addTextToPdfPage(page, extractedText);
Expand Down Expand Up @@ -260,48 +255,6 @@ async function getFontFromName(fontName: string): Promise<pdfLibTypes.PDFFont> {
return font;
}

/**
 * Renders a single PDF page to JPEG through the optional `node-poppler` dependency.
 *
 * The PDF bytes are written to a temporary file, which is handed to `pdfToCairo`
 * with both the first and last page to convert set to `index`.
 *
 * @param pdfData Buffer representation of the entire PDF file.
 * @param index Page to rasterize, passed to poppler as-is (presumably 1-based; confirm against callers).
 * @param quality JPEG quality passed to poppler as `quality=<value>`.
 * @returns Whatever `pdfToCairo` resolves to for the converted page.
 * @throws Rethrows any error raised while writing or converting, after logging it.
 */
async function rasterizePage(
  pdfData: Buffer, index: number, quality = 85
): Promise<string> {
  const loadedModule = await loadOptionalDependency<typeof popplerTypes>(
    "node-poppler", "Image Processing"
  );
  const popplerModule = (loadedModule as any).default || loadedModule;
  const converter = new popplerModule.Poppler();
  const scratchFile = tmp.fileSync();
  const scratchPath = scratchFile.name;

  try {
    await fs.promises.writeFile(scratchPath, pdfData);

    // No output file is given, so the converted page comes back as the resolved value.
    const renderedPage = await converter.pdfToCairo(scratchPath, undefined, {
      antialias: "best",
      firstPageToConvert: index,
      lastPageToConvert: index,
      jpegFile: true,
      jpegOptions: `quality=${quality}`,
      singleFile: true
    });

    await fs.promises.unlink(scratchPath);

    return renderedPage;
  } catch (error) {
    logger.error("Error rasterizing PDF:", error);
    throw error;
  } finally {
    // Always release the temporary file handle, on success and on failure.
    scratchFile.removeCallback();
  }
}

/**
* Performs linear interpolation between two numbers.
* @param start The starting value.
Expand Down
Loading
Loading