@@ -12,46 +12,61 @@ const execPromise = promisify(exec);
1212 */
1313export const pdfToMarkdownService = {
1414 /**
15- * Convert a PDF file to Markdown content using pandoc
15+ * Convert a PDF file to Markdown content using a two-step process:
16+ * 1. First convert PDF to text using pdftotext
17+ * 2. Then convert text to Markdown using pandoc
18+ *
19+ * The PDF is extracted to plain text first because pandoc can write PDF but
20+ * cannot read it: PDF is not one of pandoc's supported input formats.
21+ *
1622 * @param {Buffer } pdfBuffer - The PDF file buffer to convert
1723 * @returns {Promise<string> } - A string containing the Markdown content
1824 */
1925 async convertToMarkdown ( pdfBuffer ) {
2026 try {
21- // Create temporary files for input and output
27+ // Create temporary files for input, intermediate text, and final output
2228 const tempDir = os . tmpdir ( ) ;
2329 const inputId = uuidv4 ( ) ;
30+ const textId = uuidv4 ( ) ;
2431 const outputId = uuidv4 ( ) ;
2532 const inputPath = path . join ( tempDir , `${ inputId } .pdf` ) ;
33+ const textPath = path . join ( tempDir , `${ textId } .txt` ) ;
2634 const outputPath = path . join ( tempDir , `${ outputId } .md` ) ;
2735
2836 // Write PDF buffer to temporary file
2937 await fs . promises . writeFile ( inputPath , pdfBuffer ) ;
3038
31- // Use pandoc to convert PDF to Markdown
32- const command = `pandoc -f pdf -t markdown "${ inputPath } " -o "${ outputPath } "` ;
33- console . log ( `Executing pandoc command: ${ command } ` ) ;
39+ // Step 1: Use pdftotext to convert PDF to text
40+ const pdftotextCommand = `pdftotext "${ inputPath } " "${ textPath } "` ;
41+ console . log ( `Executing pdftotext command: ${ pdftotextCommand } ` ) ;
3442
35- await execPromise ( command ) ;
43+ await execPromise ( pdftotextCommand ) ;
44+
45+ // Step 2: Use pandoc to convert text to Markdown
46+ const pandocCommand = `pandoc -f plain -t markdown "${ textPath } " -o "${ outputPath } "` ;
47+ console . log ( `Executing pandoc command: ${ pandocCommand } ` ) ;
48+
49+ await execPromise ( pandocCommand ) ;
3650
3751 // Read the generated Markdown file
3852 const markdownContent = await fs . promises . readFile ( outputPath , 'utf8' ) ;
3953
4054 // Clean up temporary files
4155 try {
4256 await fs . promises . unlink ( inputPath ) ;
57+ await fs . promises . unlink ( textPath ) ;
4358 await fs . promises . unlink ( outputPath ) ;
4459 } catch ( cleanupError ) {
4560 console . warn ( 'Error cleaning up temporary files:' , cleanupError ) ;
4661 }
4762
4863 return markdownContent ;
4964 } catch ( error ) {
50- console . error ( 'Error converting PDF to Markdown with pandoc :' , error ) ;
65+ console . error ( 'Error in PDF to Markdown conversion process :' , error ) ;
5166
52- // If pandoc fails, provide a detailed error message
67+ // Provide detailed error output
5368 if ( error . stderr ) {
54- console . error ( 'Pandoc error output:' , error . stderr ) ;
69+ console . error ( 'Command error output:' , error . stderr ) ;
5570 }
5671
5772 throw new Error ( `Failed to convert PDF to Markdown: ${ error . message } ` ) ;
0 commit comments