Skip to content

Commit 36eda5d

Browse files
committed
feat: implement two-step PDF to Markdown conversion using pdftotext and pandoc
1 parent caee62f commit 36eda5d

File tree

2 files changed

+26
-11
lines changed

2 files changed

+26
-11
lines changed

bin/install-service.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -85,8 +85,8 @@ fi
8585
echo -e "${YELLOW}Installing system dependencies...${NC}"
8686
if command -v apt-get &> /dev/null; then
8787
apt-get update
88-
apt-get install -y librsvg2-bin
89-
echo -e "${GREEN}System dependencies installed.${NC}"
88+
apt-get install -y librsvg2-bin poppler-utils
89+
echo -e "${GREEN}System dependencies installed (librsvg2-bin, poppler-utils for pdftotext).${NC}"
9090
else
9191
echo -e "${YELLOW}apt-get not found. Please install librsvg2-bin manually.${NC}"
9292
fi

src/services/pdf-to-markdown-service.js

Lines changed: 24 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -12,46 +12,61 @@ const execPromise = promisify(exec);
1212
*/
1313
export const pdfToMarkdownService = {
1414
/**
15-
* Convert a PDF file to Markdown content using pandoc
15+
* Convert a PDF file to Markdown content using a two-step process:
16+
* 1. First convert PDF to text using pdftotext
17+
* 2. Then convert text to Markdown using pandoc
18+
*
19+
* This approach provides better results than direct PDF to Markdown conversion
20+
* since pandoc doesn't handle PDF input very well.
21+
*
1622
* @param {Buffer} pdfBuffer - The PDF file buffer to convert
1723
* @returns {Promise<string>} - A string containing the Markdown content
1824
*/
1925
async convertToMarkdown(pdfBuffer) {
2026
try {
21-
// Create temporary files for input and output
27+
// Create temporary files for input, intermediate text, and final output
2228
const tempDir = os.tmpdir();
2329
const inputId = uuidv4();
30+
const textId = uuidv4();
2431
const outputId = uuidv4();
2532
const inputPath = path.join(tempDir, `${inputId}.pdf`);
33+
const textPath = path.join(tempDir, `${textId}.txt`);
2634
const outputPath = path.join(tempDir, `${outputId}.md`);
2735

2836
// Write PDF buffer to temporary file
2937
await fs.promises.writeFile(inputPath, pdfBuffer);
3038

31-
// Use pandoc to convert PDF to Markdown
32-
const command = `pandoc -f pdf -t markdown "${inputPath}" -o "${outputPath}"`;
33-
console.log(`Executing pandoc command: ${command}`);
39+
// Step 1: Use pdftotext to convert PDF to text
40+
const pdftotextCommand = `pdftotext "${inputPath}" "${textPath}"`;
41+
console.log(`Executing pdftotext command: ${pdftotextCommand}`);
3442

35-
await execPromise(command);
43+
await execPromise(pdftotextCommand);
44+
45+
// Step 2: Use pandoc to convert text to Markdown
46+
const pandocCommand = `pandoc -f plain -t markdown "${textPath}" -o "${outputPath}"`;
47+
console.log(`Executing pandoc command: ${pandocCommand}`);
48+
49+
await execPromise(pandocCommand);
3650

3751
// Read the generated Markdown file
3852
const markdownContent = await fs.promises.readFile(outputPath, 'utf8');
3953

4054
// Clean up temporary files
4155
try {
4256
await fs.promises.unlink(inputPath);
57+
await fs.promises.unlink(textPath);
4358
await fs.promises.unlink(outputPath);
4459
} catch (cleanupError) {
4560
console.warn('Error cleaning up temporary files:', cleanupError);
4661
}
4762

4863
return markdownContent;
4964
} catch (error) {
50-
console.error('Error converting PDF to Markdown with pandoc:', error);
65+
console.error('Error in PDF to Markdown conversion process:', error);
5166

52-
// If pandoc fails, provide a detailed error message
67+
// Provide detailed error output
5368
if (error.stderr) {
54-
console.error('Pandoc error output:', error.stderr);
69+
console.error('Command error output:', error.stderr);
5570
}
5671

5772
throw new Error(`Failed to convert PDF to Markdown: ${error.message}`);

0 commit comments

Comments
 (0)