Skip to content

Commit 36a97b4

Browse files
authored
Merge pull request #73 from shebinleo/xss-protection-and-temp-dir-usage-dependency-fixes
Fixes potential XSS, outdated dependencies, and path traversal
2 parents a217320 + a7a8335 commit 36a97b4

15 files changed

+1355
-709
lines changed

.eslintignore

Lines changed: 0 additions & 5 deletions
This file was deleted.

.eslintrc.json

Lines changed: 0 additions & 23 deletions
This file was deleted.

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,3 +57,5 @@ temp/
5757

5858
# Optional REPL history
5959
.node_repl_history
60+
61+
.gemini

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,7 @@
1+
## 4.3.1 (2025-07-05)
2+
3+
**Security Fixes**
4+
5+
- **Path Traversal:** Patched a path traversal vulnerability by ensuring all file processing occurs within a temporary directory. This prevents attackers from accessing or manipulating files outside of the intended directory.
6+
- **Cross-Site Scripting (XSS):** Implemented HTML sanitization using `dompurify` to prevent potential XSS attacks from malicious PDF files. This ensures that any HTML generated by the package is safe to render in a browser.
7+
- **Dependency Vulnerabilities:** Updated the `brace-expansion` dependency to resolve a low-severity vulnerability.

README.md

Lines changed: 3 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -224,8 +224,6 @@ class PDFProcessingError extends Error {
224224

225225
Full IntelliSense support in VS Code and other TypeScript-aware editors:
226226

227-
![TypeScript IntelliSense](https://via.placeholder.com/600x200?text=IntelliSense+Demo)
228-
229227
- Auto-completion for all methods and options
230228
- Inline documentation on hover
231229
- Type checking at compile time
@@ -373,10 +371,10 @@ If automatic download fails (e.g., due to network restrictions), you can manuall
373371
cd node_modules/pdf2html/vendor
374372
375373
# Download Apache PDFBox
376-
wget https://archive.apache.org/dist/pdfbox/2.0.33/pdfbox-app-2.0.33.jar
374+
wget https://archive.apache.org/dist/pdfbox/2.0.34/pdfbox-app-2.0.34.jar
377375
378376
# Download Apache Tika
379-
wget https://archive.apache.org/dist/tika/3.1.0/tika-app-3.1.0.jar
377+
wget https://archive.apache.org/dist/tika/3.2.0/tika-app-3.2.0.jar
380378
```
381379

382380
3. Verify the files are in place:
@@ -390,19 +388,16 @@ If automatic download fails (e.g., due to network restrictions), you can manuall
390388
### Common Issues
391389

392390
1. **"Java is not installed"**
393-
394391
- Install Java JRE 8 or higher
395392
- Ensure `java` is in your system PATH
396393
- Verify with: `java -version`
397394

398395
2. **"File not found" errors**
399-
400396
- Check that the PDF path is correct
401397
- Use absolute paths for better reliability
402398
- Ensure the file has read permissions
403399

404400
3. **"Buffer size exceeded"**
405-
406401
- Increase maxBuffer option
407402
- Process large PDFs page by page
408403
- Consider splitting very large PDFs
@@ -441,7 +436,7 @@ This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENS
441436

442437
## 📊 Dependencies
443438

444-
- **Production**: Apache Tika 3.1.0, Apache PDFBox 2.0.33
439+
- **Production**: Apache Tika 3.2.0, Apache PDFBox 2.0.34
445440
- **Development**: See package.json for development dependencies
446441

447442
---

constants.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
const path = require('path');
22

33
module.exports = {
4-
VENDOR_PDF_BOX_JAR: 'pdfbox-app-2.0.33.jar',
5-
VENDOR_TIKA_JAR: 'tika-app-3.1.0.jar',
4+
VENDOR_PDF_BOX_JAR: 'pdfbox-app-2.0.34.jar',
5+
VENDOR_TIKA_JAR: 'tika-app-3.2.0.jar',
66

77
DIRECTORY: {
88
PDF: path.join(__dirname, './files/pdf/'),

eslint.config.js

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
const globals = require('globals');
2+
const js = require('@eslint/js');
3+
4+
const prettierPlugin = require('eslint-plugin-prettier');
5+
const prettierConfig = require('eslint-config-prettier');
6+
7+
module.exports = [
8+
js.configs.recommended,
9+
10+
prettierConfig,
11+
{
12+
files: ['**/*.js'],
13+
languageOptions: {
14+
ecmaVersion: 'latest',
15+
sourceType: 'commonjs',
16+
globals: {
17+
...globals.browser,
18+
...globals.node,
19+
...globals.mocha,
20+
},
21+
},
22+
plugins: {
23+
prettier: prettierPlugin,
24+
},
25+
rules: {
26+
'prettier/prettier': 'error',
27+
'no-console': 'off',
28+
},
29+
},
30+
];

lib/HTMLParser.js

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
// lib/HTMLParser.js
22
const cheerio = require('cheerio');
3+
const { JSDOM } = require('jsdom');
4+
const DOMPurify = require('dompurify');
5+
6+
const window = new JSDOM('').window;
7+
const purify = DOMPurify(window);
38

49
/**
510
* HTML content parser
@@ -11,7 +16,7 @@ class HTMLParser {
1116

1217
$('.page').each((index, element) => {
1318
const $page = $(element);
14-
const content = options.text ? $page.text().trim() : $page.html();
19+
const content = options.text ? $page.text().trim() : purify.sanitize($page.html());
1520
pages.push(content);
1621
});
1722

lib/PDFBoxWrapper.js

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ const debug = require('debug')('pdf2html');
33
const path = require('path');
44
const fse = require('fs-extra');
55
const defaults = require('lodash.defaults');
6-
const URI = require('urijs');
6+
77
const CommandExecutor = require('./CommandExecutor');
88
const ImageProcessor = require('./ImageProcessor');
99
const FileManager = require('./FileManager');
@@ -16,27 +16,7 @@ const { DEFAULT_OPTIONS } = require('./config');
1616
class PDFBoxWrapper {
1717
static async generateImage(filepath, options) {
1818
const opts = defaults(options, DEFAULT_OPTIONS.thumbnail);
19-
const uri = new URI(filepath);
20-
21-
// Check if the filepath is already in the temp directory
22-
const isInTempDir = filepath.includes(constants.DIRECTORY.PDF);
23-
24-
if (isInTempDir) {
25-
// File is already in the temp directory, process it directly
26-
// Generate image using PDFBox
27-
await this.executePDFBox(filepath, opts);
28-
29-
// Determine file paths
30-
const pdfBoxImagePath = this.getPDFBoxImagePath(filepath, opts);
31-
const finalImagePath = path.join(constants.DIRECTORY.IMAGE, uri.filename().replace(uri.suffix(), opts.imageType));
32-
33-
// Process the generated image
34-
await this.processGeneratedImage(pdfBoxImagePath, finalImagePath, opts);
35-
36-
return finalImagePath;
37-
}
3819

39-
// Use the original withTempFile logic for non-temp files
4020
return FileManager.withTempFile(filepath, constants.DIRECTORY.PDF, async (tempFilePath, tempUri) => {
4121
// Generate image using PDFBox
4222
await this.executePDFBox(tempFilePath, opts);

lib/TikaWrapper.js

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,19 @@ const path = require('path');
44
const CommandExecutor = require('./CommandExecutor');
55
const { DEFAULT_OPTIONS } = require('./config');
66
const constants = require('../constants');
7+
const FileManager = require('./FileManager');
78

89
/**
910
* Apache Tika wrapper for content extraction
1011
*/
1112
class TikaWrapper {
1213
static async extract(filepath, format, options = {}) {
13-
const args = ['-jar', path.join(constants.DIRECTORY.VENDOR, constants.VENDOR_TIKA_JAR), `--${format}`, filepath];
14+
return FileManager.withTempFile(filepath, constants.DIRECTORY.PDF, async (tempFilePath) => {
15+
const args = ['-jar', path.join(constants.DIRECTORY.VENDOR, constants.VENDOR_TIKA_JAR), `--${format}`, tempFilePath];
1416

15-
const maxBuffer = options.maxBuffer || DEFAULT_OPTIONS.command.maxBuffer;
16-
return CommandExecutor.execute('java', args, { maxBuffer });
17+
const maxBuffer = options.maxBuffer || DEFAULT_OPTIONS.command.maxBuffer;
18+
return CommandExecutor.execute('java', args, { maxBuffer });
19+
});
1720
}
1821

1922
static async extractHTML(filepath, options) {

0 commit comments

Comments
 (0)