Merge pull request #74 from shebinleo/extract-images-from-pdf

shebinleo · web-flow · commit ae61af54500e · 2025-07-13T20:02:05.000+08:00
added feature to extract all images from the pdf #44
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@
 [![License](https://img.shields.io/npm/l/pdf2html.svg)](https://www.npmjs.org/package/pdf2html)
 [![Node.js Version](https://img.shields.io/node/v/pdf2html.svg)](https://nodejs.org)
 
-> Convert PDF files to HTML, extract text, generate thumbnails, and extract metadata using Apache Tika and PDFBox
+> Convert PDF files to HTML, extract text, generate thumbnails, extract images, and extract metadata using Apache Tika and PDFBox
 
 ## 🚀 Features
 
@@ -15,6 +15,7 @@
 - **Page-by-page processing** - Process PDFs page by page
 - **Metadata extraction** - Extract author, title, creation date, and more
 - **Thumbnail generation** - Generate preview images from PDF pages
+- **Image extraction** - Extract all embedded images from PDFs
 - **Buffer support** - Process PDFs from memory buffers or file paths
 - **TypeScript support** - Full type definitions included
 - **Async/Promise based** - Modern async API
@@ -142,6 +143,30 @@ const thumbnailPath = await pdf2html.thumbnail(pdfBuffer, {
 });
 ```
 
+### Extract Images
+
+```javascript
+// From file path
+const imagePaths = await pdf2html.extractImages('path/to/document.pdf');
+console.log('Extracted images:', imagePaths);
+// Output: ['/absolute/path/to/files/image/document-1.jpg', '/absolute/path/to/files/image/document-2.png', ...]
+
+// From buffer
+const pdfBuffer = fs.readFileSync('path/to/document.pdf');
+const imagePaths = await pdf2html.extractImages(pdfBuffer);
+
+// With custom output directory
+const imagePaths = await pdf2html.extractImages(pdfBuffer, {
+    outputDirectory: './extracted-images', // Custom output directory
+});
+
+// With custom buffer size for large PDFs
+const imagePaths = await pdf2html.extractImages('large-document.pdf', {
+    outputDirectory: './output',
+    maxBuffer: 1024 * 1024 * 10, // 10MB buffer
+});
+```
+
 ## 💻 TypeScript Support
 
 This package includes TypeScript type definitions out of the box. No need to install `@types/pdf2html`.
@@ -151,7 +176,7 @@ This package includes TypeScript type definitions out of the box. No need to ins
 ```typescript
 import * as pdf2html from 'pdf2html';
 // or
-import { html, text, pages, meta, thumbnail, PDFMetadata, PDFProcessingError } from 'pdf2html';
+import { html, text, pages, meta, thumbnail, extractImages, PDFMetadata, PDFProcessingError } from 'pdf2html';
 
 async function convertPDF() {
     try {
diff --git a/index.d.ts b/index.d.ts
@@ -129,6 +129,15 @@ declare module 'pdf2html' {
      */
     export function thumbnail(input: PDFInput, options?: ThumbnailOptions): Promise<string>;
 
+    /**
+     * Extract all embedded images from a PDF and write them to disk
+     * @param input - Path to PDF file or PDF buffer
+     * @param options - Image extraction options (the implementation reads `outputDirectory` and `maxBuffer`)
+     * @returns Promise resolving to an array of paths to the extracted images
+     * @throws Error if the file is not found or processing fails
+     */
+    export function extractImages(input: PDFInput, options?: ProcessingOptions): Promise<string[]>;
+
     /**
      * PDF processing error class
      */
diff --git a/index.js b/index.js
@@ -14,6 +14,7 @@ module.exports = {
     text: PDFProcessor.toText.bind(PDFProcessor),
     meta: PDFProcessor.extractMetadata.bind(PDFProcessor),
     thumbnail: PDFProcessor.generateThumbnail.bind(PDFProcessor),
+    extractImages: PDFProcessor.extractImages.bind(PDFProcessor),
 
     // Export classes for advanced usage
     PDFProcessor,
diff --git a/lib/PDFBoxWrapper.js b/lib/PDFBoxWrapper.js
@@ -56,6 +56,33 @@ class PDFBoxWrapper {
             await fse.remove(sourcePath).catch((err) => debug(`Failed to remove PDFBox image: ${err.message}`));
         }
     }
+
+    static async extractAllImages(filepath, options = {}) {
+        const outputDirectory = options.outputDirectory || constants.DIRECTORY.IMAGE;
+        await fse.ensureDir(outputDirectory);
+
+        const pdfFileName = path.basename(filepath, path.extname(filepath));
+        const prefix = path.join(outputDirectory, pdfFileName);
+
+        const args = [
+            '-jar',
+            path.join(constants.DIRECTORY.VENDOR, constants.VENDOR_PDF_BOX_JAR),
+            'ExtractImages',
+            '-prefix',
+            prefix,
+            filepath,
+        ];
+
+        await CommandExecutor.execute('java', args, {
+            maxBuffer: options.maxBuffer || DEFAULT_OPTIONS.command.maxBuffer,
+        });
+
+        const extractedImages = await fse.readdir(outputDirectory);
+
+        return extractedImages
+            .filter((file) => file.startsWith(pdfFileName) && (file.endsWith('.jpg') || file.endsWith('.png') || file.endsWith('.gif') || file.endsWith('.bmp') || file.endsWith('.jpeg')))
+            .map((file) => path.join(outputDirectory, file));
+    }
 }
 
 module.exports = PDFBoxWrapper;
diff --git a/lib/PDFProcessor.js b/lib/PDFProcessor.js
@@ -102,6 +102,20 @@ class PDFProcessor {
         });
     }
 
+    /**
+     * Extract images from PDF
+     * @param {string|Buffer} input - Path to PDF file or PDF buffer
+     * @param {Object} options - Processing options, including output directory
+     * @returns {Promise<Array<string>>} Array of paths to extracted images
+     */
+    static async extractImages(input, options = {}) {
+        return FileManager.processInput(input, async (filePath) => {
+            await this.validateFile(filePath);
+            await FileManager.ensureDirectories();
+            return PDFBoxWrapper.extractAllImages(filePath, options);
+        });
+    }
+
     /**
      * Validate file existence
      * @private
diff --git a/test/image_extraction.test.js b/test/image_extraction.test.js
@@ -0,0 +1,78 @@
+const path = require('path');
+const fs = require('fs');
+const chai = require('chai');
+const fse = require('fs-extra');
+
+const { expect } = chai;
+const should = chai.should();
+
+const pdf2html = require('../index');
+
+const pdfImageFilepath = path.join(__dirname, './sample-images.pdf');
+const pdfImageBuffer = fs.readFileSync(pdfImageFilepath);
+
+describe('PDF to Images with images', () => {
+    const outputDir = path.join(__dirname, '../files/temp_extracted_images');
+
+    beforeEach(async () => {
+        await fse.remove(outputDir);
+        await fse.ensureDir(outputDir);
+    });
+
+    afterEach(async () => {
+        await fse.remove(outputDir);
+    });
+
+    describe('File path input', () => {
+        it('should extract images to the specified directory', async () => {
+            const extractedImagePaths = await pdf2html.extractImages(pdfImageFilepath, { outputDirectory: outputDir });
+            should.exist(extractedImagePaths);
+            expect(extractedImagePaths).to.be.an('array');
+            expect(extractedImagePaths).to.have.lengthOf(3);
+        });
+    });
+
+    describe('Buffer input', () => {
+        it('should extract images from buffer to the specified directory', async () => {
+            const extractedImagePaths = await pdf2html.extractImages(pdfImageBuffer, { outputDirectory: outputDir });
+            should.exist(extractedImagePaths);
+            expect(extractedImagePaths).to.be.an('array');
+            expect(extractedImagePaths).to.have.lengthOf(3);
+        });
+    });
+
+    describe('Default options', () => {
+        it('should extract images with default options when options not provided', async () => {
+            const extractedImagePaths = await pdf2html.extractImages(pdfImageFilepath);
+            should.exist(extractedImagePaths);
+            expect(extractedImagePaths).to.be.an('array');
+            expect(extractedImagePaths).to.have.lengthOf(3);
+            // Check that images are saved to default directory
+            extractedImagePaths.forEach(imagePath => {
+                expect(imagePath).to.include('/files/image/');
+            });
+        });
+    });
+
+    describe('Error handling', () => {
+        it('should handle non-existent PDF file', async () => {
+            try {
+                await pdf2html.extractImages('/path/to/non-existent.pdf');
+                expect.fail('Should have thrown an error');
+            } catch (error) {
+                should.exist(error);
+                expect(error.message).to.include('not found');
+            }
+        });
+
+        it('should handle invalid PDF buffer', async () => {
+            const invalidBuffer = Buffer.from('This is not a PDF');
+            try {
+                await pdf2html.extractImages(invalidBuffer, { outputDirectory: outputDir });
+                expect.fail('Should have thrown an error');
+            } catch (error) {
+                should.exist(error);
+            }
+        });
+    });
+});
diff --git a/test/sample-images.pdf b/test/sample-images.pdf
diff --git a/test/sample.pdf b/test/sample.pdf
diff --git a/test/test.js b/test/test.js