Skip to content

Commit d076750

Browse files
authored
fix(pdfs): use unpdf instead of pdf-parse (#2004)
1 parent 8bd75de commit d076750

File tree

5 files changed

+18
-35
lines changed

5 files changed

+18
-35
lines changed

apps/sim/lib/file-parsers/pdf-parser.ts

Lines changed: 10 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -28,29 +28,23 @@ export class PdfParser implements FileParser {
2828
try {
2929
logger.info('Starting to parse buffer, size:', dataBuffer.length)
3030

31-
const { PDFParse } = await import('pdf-parse')
31+
const { extractText, getDocumentProxy } = await import('unpdf')
3232

33-
const parser = new PDFParse({ data: dataBuffer })
34-
const textResult = await parser.getText()
35-
const infoResult = await parser.getInfo()
36-
await parser.destroy()
33+
const uint8Array = new Uint8Array(dataBuffer)
3734

38-
logger.info(
39-
'PDF parsed successfully, pages:',
40-
textResult.total,
41-
'text length:',
42-
textResult.text.length
43-
)
35+
const pdf = await getDocumentProxy(uint8Array)
4436

45-
const cleanContent = textResult.text.replace(/\u0000/g, '')
37+
const { totalPages, text } = await extractText(pdf, { mergePages: true })
38+
39+
logger.info('PDF parsed successfully, pages:', totalPages, 'text length:', text.length)
40+
41+
const cleanContent = text.replace(/\u0000/g, '')
4642

4743
return {
4844
content: cleanContent,
4945
metadata: {
50-
pageCount: textResult.total,
51-
info: infoResult.info,
52-
version: infoResult.metadata?.get('pdf:PDFVersion'),
53-
source: 'pdf-parse',
46+
pageCount: totalPages,
47+
source: 'unpdf',
5448
},
5549
}
5650
} catch (error) {

apps/sim/next.config.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,7 @@ const nextConfig: NextConfig = {
7575
turbopack: {
7676
resolveExtensions: ['.tsx', '.ts', '.jsx', '.js', '.mjs', '.json'],
7777
},
78-
serverExternalPackages: ['pdf-parse'],
78+
serverExternalPackages: ['unpdf'],
7979
experimental: {
8080
optimizeCss: true,
8181
turbopackSourceMaps: false,

apps/sim/package.json

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,6 @@
6161
"@radix-ui/react-tooltip": "1.2.8",
6262
"@react-email/components": "^0.0.34",
6363
"@trigger.dev/sdk": "4.0.4",
64-
"@types/pdf-parse": "1.1.5",
6564
"@types/three": "0.177.0",
6665
"better-auth": "1.3.12",
6766
"browser-image-compression": "^2.0.2",
@@ -76,6 +75,7 @@
7675
"entities": "6.0.1",
7776
"framer-motion": "^12.5.0",
7877
"fuse.js": "7.1.0",
78+
"gray-matter": "^4.0.3",
7979
"groq-sdk": "^0.15.0",
8080
"html-to-text": "^9.0.5",
8181
"input-otp": "^1.4.2",
@@ -96,8 +96,6 @@
9696
"officeparser": "^5.2.0",
9797
"openai": "^4.91.1",
9898
"papaparse": "5.5.3",
99-
"pdf-parse": "2.4.5",
100-
"gray-matter": "^4.0.3",
10199
"posthog-js": "1.268.9",
102100
"posthog-node": "5.9.2",
103101
"prismjs": "^1.30.0",
@@ -109,16 +107,17 @@
109107
"react-markdown": "^10.1.0",
110108
"react-simple-code-editor": "^0.14.1",
111109
"reactflow": "^11.11.4",
112-
"remark-gfm": "4.0.1",
113110
"rehype-autolink-headings": "^7.1.0",
114111
"rehype-slug": "^6.0.0",
112+
"remark-gfm": "4.0.1",
115113
"resend": "^4.1.2",
116114
"sharp": "0.34.3",
117115
"socket.io": "^4.8.1",
118116
"stripe": "18.5.0",
119117
"tailwind-merge": "^2.6.0",
120118
"tailwindcss-animate": "^1.0.7",
121119
"three": "0.177.0",
120+
"unpdf": "1.4.0",
122121
"uuid": "^11.1.0",
123122
"xlsx": "0.18.5",
124123
"zod": "^3.24.2"

apps/sim/trigger.config.ts

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,8 @@ export default defineConfig({
1616
dirs: ['./background'],
1717
build: {
1818
extensions: [
19-
// pdf-parse has native bindings, keep as external package
2019
additionalPackages({
21-
packages: ['pdf-parse'],
20+
packages: ['unpdf'],
2221
}),
2322
],
2423
},

bun.lock

Lines changed: 3 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)