Skip to content

Commit 03feb43

Browse files
committed
feat: improve XLSX to Markdown conversion by processing each sheet via CSV intermediary
1 parent 6c884f0 commit 03feb43

File tree

1 file changed

+48
-28
lines changed

1 file changed

+48
-28
lines changed

src/services/xlsx-to-markdown-service.js

Lines changed: 48 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -4,65 +4,85 @@ import path from 'path';
44
import { promisify } from 'util';
55
import { v4 as uuidv4 } from 'uuid';
66
import os from 'os';
7+
import * as XLSX from 'xlsx';
78

89
const execPromise = promisify(exec);
910

1011
/**
1112
* Service for converting XLSX files to Markdown content
12-
* Note: Pandoc's support for XLSX is limited - it may not handle complex spreadsheets well
1313
*/
1414
export const xlsxToMarkdownService = {
1515
/**
16-
* Convert an XLSX file to Markdown content using pandoc
16+
* Convert an XLSX file to Markdown content by first converting to CSV
17+
* and then using pandoc to convert CSV to Markdown
1718
* @param {Buffer} xlsxBuffer - The XLSX file buffer to convert
1819
* @returns {Promise<string>} - A string containing the Markdown content
1920
*/
2021
async convertToMarkdown(xlsxBuffer) {
2122
try {
22-
// Create temporary files for input and output
23+
// Create temporary directory for our files
2324
const tempDir = os.tmpdir();
2425
const inputId = uuidv4();
25-
const outputId = uuidv4();
2626
const inputPath = path.join(tempDir, `${inputId}.xlsx`);
27-
const outputPath = path.join(tempDir, `${outputId}.md`);
2827

2928
// Write XLSX buffer to temporary file
3029
await fs.promises.writeFile(inputPath, xlsxBuffer);
3130

32-
// Try to use pandoc to convert XLSX to Markdown
33-
// Note: This may fail as pandoc has limited XLSX support
34-
// We'll attempt it but provide a fallback error message
35-
try {
36-
const command = `pandoc -f xlsx -t markdown "${inputPath}" -o "${outputPath}"`;
31+
// Read the workbook using xlsx package
32+
const workbook = XLSX.readFile(inputPath);
33+
34+
// Array to store markdown content from each sheet
35+
const markdownParts = [];
36+
37+
// Process each sheet in the workbook
38+
for (const sheetName of workbook.SheetNames) {
39+
console.log(`Processing sheet: ${sheetName}`);
40+
41+
// Get the worksheet
42+
const worksheet = workbook.Sheets[sheetName];
43+
44+
// Convert worksheet to CSV
45+
const csvContent = XLSX.utils.sheet_to_csv(worksheet);
46+
47+
// Create temporary files for CSV and Markdown
48+
const csvId = uuidv4();
49+
const mdId = uuidv4();
50+
const csvPath = path.join(tempDir, `${csvId}.csv`);
51+
const mdPath = path.join(tempDir, `${mdId}.md`);
52+
53+
// Write CSV content to file
54+
await fs.promises.writeFile(csvPath, csvContent);
55+
56+
// Use pandoc to convert CSV to Markdown
57+
const command = `pandoc -f csv -t markdown "${csvPath}" -o "${mdPath}"`;
3758
console.log(`Executing pandoc command: ${command}`);
3859

3960
await execPromise(command);
4061

4162
// Read the generated Markdown file
42-
const markdownContent = await fs.promises.readFile(outputPath, 'utf8');
63+
const sheetMarkdown = await fs.promises.readFile(mdPath, 'utf8');
4364

44-
// Clean up temporary files
45-
try {
46-
await fs.promises.unlink(inputPath);
47-
await fs.promises.unlink(outputPath);
48-
} catch (cleanupError) {
49-
console.warn('Error cleaning up temporary files:', cleanupError);
50-
}
51-
52-
return markdownContent;
53-
} catch (pandocError) {
54-
// If pandoc doesn't support XLSX directly, provide a helpful error
55-
console.warn('Pandoc XLSX conversion failed, this format may not be fully supported:', pandocError.message);
65+
// Add sheet name as header and append to markdown parts
66+
markdownParts.push(`# ${sheetName}\n\n${sheetMarkdown}\n\n`);
5667

57-
// Clean up input file
68+
// Clean up temporary CSV and MD files
5869
try {
59-
await fs.promises.unlink(inputPath);
70+
await fs.promises.unlink(csvPath);
71+
await fs.promises.unlink(mdPath);
6072
} catch (cleanupError) {
61-
console.warn('Error cleaning up input file:', cleanupError);
73+
console.warn('Error cleaning up temporary files:', cleanupError);
6274
}
63-
64-
throw new Error('XLSX to Markdown conversion is not fully supported by pandoc. Consider converting the Excel file to CSV or another supported format first.');
6575
}
76+
77+
// Clean up the input XLSX file
78+
try {
79+
await fs.promises.unlink(inputPath);
80+
} catch (cleanupError) {
81+
console.warn('Error cleaning up input file:', cleanupError);
82+
}
83+
84+
// Combine all markdown parts
85+
return markdownParts.join('---\n\n');
6686
} catch (error) {
6787
console.error('Error converting XLSX to Markdown:', error);
6888

0 commit comments

Comments
 (0)