Skip to content

Commit ae61af5

Browse files
authored
Merge pull request #74 from shebinleo/extract-images-from-pdf
added feature to extract all images from the pdf #44
2 parents 36a97b4 + 9207bfd commit ae61af5

File tree

9 files changed

+298
-10
lines changed

9 files changed

+298
-10
lines changed

README.md

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
[![License](https://img.shields.io/npm/l/pdf2html.svg)](https://www.npmjs.org/package/pdf2html)
77
[![Node.js Version](https://img.shields.io/node/v/pdf2html.svg)](https://nodejs.org)
88

9-
> Convert PDF files to HTML, extract text, generate thumbnails, and extract metadata using Apache Tika and PDFBox
9+
> Convert PDF files to HTML, extract text, generate thumbnails, extract images, and extract metadata using Apache Tika and PDFBox
1010
1111
## 🚀 Features
1212

@@ -15,6 +15,7 @@
1515
- **Page-by-page processing** - Process PDFs page by page
1616
- **Metadata extraction** - Extract author, title, creation date, and more
1717
- **Thumbnail generation** - Generate preview images from PDF pages
18+
- **Image extraction** - Extract all embedded images from PDFs
1819
- **Buffer support** - Process PDFs from memory buffers or file paths
1920
- **TypeScript support** - Full type definitions included
2021
- **Async/Promise based** - Modern async API
@@ -142,6 +143,30 @@ const thumbnailPath = await pdf2html.thumbnail(pdfBuffer, {
142143
});
143144
```
144145

146+
### Extract Images
147+
148+
```javascript
149+
// From file path
150+
const imagePaths = await pdf2html.extractImages('path/to/document.pdf');
151+
console.log('Extracted images:', imagePaths);
152+
// Output: ['/absolute/path/to/files/image/document1.jpg', '/absolute/path/to/files/image/document2.png', ...]
153+
154+
// From buffer
155+
const pdfBuffer = fs.readFileSync('path/to/document.pdf');
156+
const imagePaths = await pdf2html.extractImages(pdfBuffer);
157+
158+
// With custom output directory
159+
const imagePaths = await pdf2html.extractImages(pdfBuffer, {
160+
outputDirectory: './extracted-images', // Custom output directory
161+
});
162+
163+
// With custom buffer size for large PDFs
164+
const imagePaths = await pdf2html.extractImages('large-document.pdf', {
165+
outputDirectory: './output',
166+
maxBuffer: 1024 * 1024 * 10, // 10MB buffer
167+
});
168+
```
169+
145170
## 💻 TypeScript Support
146171

147172
This package includes TypeScript type definitions out of the box. No need to install `@types/pdf2html`.
@@ -151,7 +176,7 @@ This package includes TypeScript type definitions out of the box. No need to ins
151176
```typescript
152177
import * as pdf2html from 'pdf2html';
153178
// or
154-
import { html, text, pages, meta, thumbnail, PDFMetadata, PDFProcessingError } from 'pdf2html';
179+
import { html, text, pages, meta, thumbnail, extractImages, PDFMetadata, PDFProcessingError } from 'pdf2html';
155180

156181
async function convertPDF() {
157182
try {

index.d.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,15 @@ declare module 'pdf2html' {
129129
*/
130130
export function thumbnail(input: PDFInput, options?: ThumbnailOptions): Promise<string>;
131131

132+
/**
133+
* Extract images from PDF
134+
* @param input - Path to PDF file or PDF buffer
135+
* @param options - Image extraction options
136+
* @returns Promise resolving to an array of paths to extracted images
137+
* @throws Error if the file is not found or processing fails
138+
*/
139+
export function extractImages(input: PDFInput, options?: ProcessingOptions): Promise<string[]>;
140+
132141
/**
133142
* PDF processing error class
134143
*/

index.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ module.exports = {
1414
text: PDFProcessor.toText.bind(PDFProcessor),
1515
meta: PDFProcessor.extractMetadata.bind(PDFProcessor),
1616
thumbnail: PDFProcessor.generateThumbnail.bind(PDFProcessor),
17+
extractImages: PDFProcessor.extractImages.bind(PDFProcessor),
1718

1819
// Export classes for advanced usage
1920
PDFProcessor,

lib/PDFBoxWrapper.js

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,33 @@ class PDFBoxWrapper {
5656
await fse.remove(sourcePath).catch((err) => debug(`Failed to remove PDFBox image: ${err.message}`));
5757
}
5858
}
59+
60+
static async extractAllImages(filepath, options = {}) {
61+
const outputDirectory = options.outputDirectory || constants.DIRECTORY.IMAGE;
62+
await fse.ensureDir(outputDirectory);
63+
64+
const pdfFileName = path.basename(filepath, path.extname(filepath));
65+
const prefix = path.join(outputDirectory, pdfFileName);
66+
67+
const args = [
68+
'-jar',
69+
path.join(constants.DIRECTORY.VENDOR, constants.VENDOR_PDF_BOX_JAR),
70+
'ExtractImages',
71+
'-prefix',
72+
prefix,
73+
filepath,
74+
];
75+
76+
await CommandExecutor.execute('java', args, {
77+
maxBuffer: options.maxBuffer || DEFAULT_OPTIONS.command.maxBuffer,
78+
});
79+
80+
const extractedImages = await fse.readdir(outputDirectory);
81+
82+
return extractedImages
83+
.filter((file) => file.startsWith(pdfFileName) && (file.endsWith('.jpg') || file.endsWith('.png') || file.endsWith('.gif') || file.endsWith('.bmp') || file.endsWith('.jpeg')))
84+
.map((file) => path.join(outputDirectory, file));
85+
}
5986
}
6087

6188
module.exports = PDFBoxWrapper;

lib/PDFProcessor.js

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,20 @@ class PDFProcessor {
102102
});
103103
}
104104

105+
/**
106+
* Extract images from PDF
107+
* @param {string|Buffer} input - Path to PDF file or PDF buffer
108+
* @param {Object} options - Processing options, including output directory
109+
* @returns {Promise<Array<string>>} Array of paths to extracted images
110+
*/
111+
static async extractImages(input, options = {}) {
112+
return FileManager.processInput(input, async (filePath) => {
113+
await this.validateFile(filePath);
114+
await FileManager.ensureDirectories();
115+
return PDFBoxWrapper.extractAllImages(filePath, options);
116+
});
117+
}
118+
105119
/**
106120
* Validate file existence
107121
* @private

test/image_extraction.test.js

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
const path = require('path');
const fs = require('fs');
const chai = require('chai');
const fse = require('fs-extra');

const { expect } = chai;
const should = chai.should();

const pdf2html = require('../index');

const pdfImageFilepath = path.join(__dirname, './sample-images.pdf');
const pdfImageBuffer = fs.readFileSync(pdfImageFilepath);

// Number of images these tests expect to be extracted from sample-images.pdf.
const EXPECTED_IMAGE_COUNT = 3;

/**
 * Run an async action and return the error it rejects with, or `undefined` when
 * it resolves. Keeping the "nothing was thrown" assertion outside of try/catch
 * prevents the catch block from swallowing that assertion's own failure.
 */
async function captureRejection(action) {
    try {
        await action();
    } catch (error) {
        return error;
    }
    return undefined;
}

/**
 * Assert that every returned path is a file that exists directly inside `directory`.
 */
function expectImagesIn(imagePaths, directory) {
    imagePaths.forEach((imagePath) => {
        expect(path.dirname(imagePath)).to.equal(directory);
        expect(fs.existsSync(imagePath)).to.equal(true);
    });
}

describe('PDF to Images with images', () => {
    const outputDir = path.join(__dirname, '../files/temp_extracted_images');

    beforeEach(async () => {
        await fse.remove(outputDir);
        await fse.ensureDir(outputDir);
    });

    afterEach(async () => {
        await fse.remove(outputDir);
    });

    describe('File path input', () => {
        it('should extract images to the specified directory', async () => {
            const extractedImagePaths = await pdf2html.extractImages(pdfImageFilepath, { outputDirectory: outputDir });
            should.exist(extractedImagePaths);
            expect(extractedImagePaths).to.be.an('array');
            expect(extractedImagePaths).to.have.lengthOf(EXPECTED_IMAGE_COUNT);
            expectImagesIn(extractedImagePaths, outputDir);
        });
    });

    describe('Buffer input', () => {
        it('should extract images from buffer to the specified directory', async () => {
            const extractedImagePaths = await pdf2html.extractImages(pdfImageBuffer, { outputDirectory: outputDir });
            should.exist(extractedImagePaths);
            expect(extractedImagePaths).to.be.an('array');
            expect(extractedImagePaths).to.have.lengthOf(EXPECTED_IMAGE_COUNT);
            expectImagesIn(extractedImagePaths, outputDir);
        });
    });

    describe('Default options', () => {
        it('should extract images with default options when options not provided', async () => {
            let extractedImagePaths = [];
            try {
                extractedImagePaths = await pdf2html.extractImages(pdfImageFilepath);
                should.exist(extractedImagePaths);
                expect(extractedImagePaths).to.be.an('array');
                expect(extractedImagePaths).to.have.lengthOf(EXPECTED_IMAGE_COUNT);
                // Check that images are saved to default directory.
                // Built with path.sep so the check does not depend on POSIX separators.
                const defaultDirectorySegment = path.join(path.sep, 'files', 'image', path.sep);
                extractedImagePaths.forEach((imagePath) => {
                    expect(imagePath).to.include(defaultDirectorySegment);
                });
            } finally {
                // The default directory is not covered by the afterEach hook, so
                // remove what this test wrote to keep later runs unaffected.
                await Promise.all(extractedImagePaths.map((imagePath) => fse.remove(imagePath)));
            }
        });
    });

    describe('Error handling', () => {
        it('should handle non-existent PDF file', async () => {
            const error = await captureRejection(() => pdf2html.extractImages('/path/to/non-existent.pdf'));
            should.exist(error, 'Should have thrown an error');
            expect(error.message).to.include('not found');
        });

        it('should handle invalid PDF buffer', async () => {
            const invalidBuffer = Buffer.from('This is not a PDF');
            const error = await captureRejection(() => pdf2html.extractImages(invalidBuffer, { outputDirectory: outputDir }));
            should.exist(error, 'Should have thrown an error');
        });
    });
});

test/sample-images.pdf

79.2 KB
Binary file not shown.
File renamed without changes.

0 commit comments

Comments
 (0)