Replace client-side libraries with pandoc for document format conversions

ralyodio · ralyodio · commit cb8d8d7084d1 · 2025-05-06T06:10:25.000-07:00
diff --git a/src/services/excel-service.js b/src/services/excel-service.js
@@ -1,45 +1,62 @@
-import * as XLSX from 'xlsx';
-import { JSDOM } from 'jsdom';
+import { exec } from 'child_process';
+import fs from 'fs';
+import path from 'path';
+import { promisify } from 'util';
+import { v4 as uuidv4 } from 'uuid';
+import os from 'os';
+
+const execPromise = promisify(exec);
 
 /**
  * Service for generating Excel spreadsheets from HTML content
  */
 export const excelService = {
   /**
-   * Generate an Excel spreadsheet from HTML content containing tables
+   * Generate an Excel spreadsheet from HTML content using pandoc
    * @param {string} html - The HTML content containing tables
    * @param {string} sheetName - The name for the worksheet (default: 'Sheet1')
-   * @returns {Buffer} - A buffer containing the Excel file data
-   * @throws {Error} - If no tables are found in the HTML content
+   * @returns {Promise<Buffer>} - A buffer containing the Excel file data
+   * @throws {Error} - If pandoc conversion fails
    */
-  generateExcel(html, sheetName = 'Sheet1') {
-    // Create a DOM from the HTML
-    const dom = new JSDOM(html);
-    const document = dom.window.document;
-    
-    // Find all tables in the HTML
-    const tables = document.querySelectorAll('table');
-    
-    if (tables.length === 0) {
-      throw new Error('No tables found in the HTML content');
+  async generateExcel(html, sheetName = 'Sheet1') {
+    // NOTE(review): sheetName is accepted for API compatibility but is not
+    // passed to pandoc, so it has no effect on the generated workbook.
+
+    // Random UUID file names keep concurrent conversions from colliding.
+    const tempDir = os.tmpdir();
+    const inputPath = path.join(tempDir, `${uuidv4()}.html`);
+    const outputPath = path.join(tempDir, `${uuidv4()}.xlsx`);
+
+    try {
+      // Write HTML to temporary file
+      await fs.promises.writeFile(inputPath, html, 'utf8');
+
+      // NOTE(review): pandoc's documented output formats do not list xlsx;
+      // confirm the installed pandoc accepts `-t xlsx` before relying on this.
+      const command = `pandoc -f html -t xlsx "${inputPath}" -o "${outputPath}"`;
+      console.log(`Executing pandoc command: ${command}`);
+
+      await execPromise(command);
+
+      // Read the generated XLSX file
+      return await fs.promises.readFile(outputPath);
+    } catch (error) {
+      console.error('Error generating Excel document with pandoc:', error);
+
+      // If pandoc fails, provide a detailed error message
+      if (error.stderr) {
+        console.error('Pandoc error output:', error.stderr);
+      }
+
+      throw new Error(`Failed to generate Excel document: ${error.message}`, { cause: error });
+    } finally {
+      // Always remove both temp files, even when the conversion failed;
+      // allSettled lets each removal be attempted independently.
+      const removals = [inputPath, outputPath].map((file) => fs.promises.rm(file, { force: true }));
+      const failed = (await Promise.allSettled(removals)).filter((r) => r.status === 'rejected');
+      for (const { reason } of failed) {
+        console.warn('Error cleaning up temporary files:', reason);
+      }
     }
-    
-    // Create a new workbook
-    const workbook = XLSX.utils.book_new();
-    
-    // Process each table and add it as a sheet
-    tables.forEach((table, index) => {
-      // Convert table to worksheet
-      const worksheet = XLSX.utils.table_to_sheet(table);
-      
-      // Add the worksheet to the workbook
-      const currentSheetName = tables.length === 1 ? sheetName : `${sheetName}${index + 1}`;
-      XLSX.utils.book_append_sheet(workbook, worksheet, currentSheetName);
-    });
-    
-    // Write the workbook to a buffer
-    const excelBuffer = XLSX.write(workbook, { type: 'buffer', bookType: 'xlsx' });
-    
-    return excelBuffer;
   }
 };
diff --git a/src/services/markdown-service.js b/src/services/markdown-service.js
@@ -1,5 +1,12 @@
 import { marked } from 'marked';
-import TurndownService from 'turndown';
+import { exec } from 'child_process';
+import fs from 'fs';
+import path from 'path';
+import { promisify } from 'util';
+import { v4 as uuidv4 } from 'uuid';
+import os from 'os';
+
+const execPromise = promisify(exec);
 
 /**
  * Service for converting between Markdown and HTML
@@ -27,23 +34,50 @@ export const markdownService = {
   },
 
   /**
-   * Convert HTML to Markdown
+   * Convert HTML to Markdown using pandoc
    * @param {string} html - The HTML content to convert
-   * @param {Object} options - Options for the turndown library
-   * @returns {string} - The Markdown content
+   * @param {Object} options - Options for pandoc conversion
+   * @returns {Promise<string>} - The Markdown content
    */
-  htmlToMarkdown(html, options = {}) {
-    // Create a new TurndownService instance
-    const turndownService = new TurndownService(options);
-    
-    // Configure turndown options
-    turndownService.use([
-      // Add any plugins or rules here
-    ]);
-    
-    // Convert HTML to Markdown
-    const markdown = turndownService.turndown(html);
-    
-    return markdown;
+  async htmlToMarkdown(html, options = {}) {
+    // NOTE(review): `options` is accepted for API compatibility but is not
+    // forwarded to pandoc, so it currently has no effect.
+
+    // Random UUID file names keep concurrent conversions from colliding.
+    const tempDir = os.tmpdir();
+    const inputPath = path.join(tempDir, `${uuidv4()}.html`);
+    const outputPath = path.join(tempDir, `${uuidv4()}.md`);
+
+    try {
+      // Write HTML to temporary file
+      await fs.promises.writeFile(inputPath, html, 'utf8');
+
+      // NOTE(review): pandoc documents markdown_github as deprecated in favor
+      // of gfm; kept as-is here because switching changes the output.
+      const command = `pandoc -f html -t markdown_github "${inputPath}" -o "${outputPath}"`;
+      console.log(`Executing pandoc command: ${command}`);
+
+      await execPromise(command);
+
+      // Read the generated Markdown file
+      return await fs.promises.readFile(outputPath, 'utf8');
+    } catch (error) {
+      console.error('Error converting HTML to Markdown with pandoc:', error);
+
+      // If pandoc fails, provide a detailed error message
+      if (error.stderr) {
+        console.error('Pandoc error output:', error.stderr);
+      }
+
+      throw new Error(`Failed to convert HTML to Markdown: ${error.message}`, { cause: error });
+    } finally {
+      // Always remove both temp files, even when the conversion failed;
+      // allSettled lets each removal be attempted independently.
+      const removals = [inputPath, outputPath].map((file) => fs.promises.rm(file, { force: true }));
+      const failed = (await Promise.allSettled(removals)).filter((r) => r.status === 'rejected');
+      for (const { reason } of failed) {
+        console.warn('Error cleaning up temporary files:', reason);
+      }
+    }
   }
 };
diff --git a/src/services/ppt-service.js b/src/services/ppt-service.js
@@ -1,112 +1,72 @@
-import PptxGenJS from 'pptxgenjs';
-import { JSDOM } from 'jsdom';
+import { exec, execFile } from 'child_process';
+import fs from 'fs';
+import path from 'path';
+import { promisify } from 'util';
+import { v4 as uuidv4 } from 'uuid';
+import os from 'os';
+
+const execPromise = promisify(exec);
 
 /**
  * Service for generating PowerPoint presentations from HTML content
  */
 export const pptService = {
   /**
-   * Generate a PowerPoint presentation from HTML content
+   * Generate a PowerPoint presentation from HTML content using pandoc
    * @param {string} html - The HTML content to convert to PowerPoint
    * @param {string} title - The title for the presentation
    * @returns {Promise<Buffer>} - A buffer containing the PowerPoint file data
    */
   async generatePpt(html, title = 'Presentation') {
-    // Create a DOM from the HTML
-    const dom = new JSDOM(html);
-    const document = dom.window.document;
-    
-    // Create a new PowerPoint presentation
-    const pptx = new PptxGenJS();
-    
-    // Set presentation properties
-    pptx.layout = 'LAYOUT_16x9';
-    pptx.author = 'Document Generation API';
-    pptx.title = title;
-    
-    // Extract headings and content from HTML
-    const headings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
-    
-    if (headings.length === 0) {
-      // If no headings found, create a single slide with the HTML content
-      const slide = pptx.addSlide();
-      slide.addText(document.body.textContent || 'Empty content', { 
-        x: 0.5, 
-        y: 0.5, 
-        w: '90%', 
-        h: 1, 
-        fontSize: 18,
-        color: '363636'
-      });
-    } else {
-      // Create slides based on headings
-      headings.forEach((heading, index) => {
-        const slide = pptx.addSlide();
-        
-        // Add heading as title
-        slide.addText(heading.textContent || `Slide ${index + 1}`, { 
-          x: 0.5, 
-          y: 0.5, 
-          w: '90%', 
-          fontSize: 24,
-          bold: true,
-          color: '0000FF'
-        });
-        
-        // Get content until next heading or end of document
-        let content = '';
-        let nextElement = heading.nextElementSibling;
-        
-        while (nextElement && !['H1', 'H2', 'H3', 'H4', 'H5', 'H6'].includes(nextElement.tagName)) {
-          if (nextElement.textContent) {
-            content += nextElement.textContent + '\n';
-          }
-          nextElement = nextElement.nextElementSibling;
-        }
-        
-        // Add content if available
-        if (content.trim()) {
-          slide.addText(content, { 
-            x: 0.5, 
-            y: 1.5, 
-            w: '90%', 
-            fontSize: 14,
-            color: '363636'
-          });
-        }
-        
-        // Look for images in the section
-        const sectionStart = heading;
-        const sectionEnd = nextElement;
-        let currentElement = sectionStart;
-        
-        while (currentElement && currentElement !== sectionEnd) {
-          if (currentElement.tagName === 'IMG' && currentElement.src) {
-            try {
-              // For base64 images
-              if (currentElement.src.startsWith('data:image')) {
-                slide.addImage({ 
-                  data: currentElement.src, 
-                  x: 1, 
-                  y: 3, 
-                  w: 4, 
-                  h: 3 
-                });
-              }
-              // Note: For external images, we would need to fetch them first
-              // This is simplified for the example
-            } catch (err) {
-              console.error('Error adding image to slide:', err);
-            }
-          }
-          currentElement = currentElement.nextElementSibling;
-        }
-      });
+    // Random UUID file names keep concurrent conversions from colliding.
+    const tempDir = os.tmpdir();
+    const inputPath = path.join(tempDir, `${uuidv4()}.html`);
+    const outputPath = path.join(tempDir, `${uuidv4()}.pptx`);
+
+    try {
+      // Write HTML to temporary file
+      await fs.promises.writeFile(inputPath, html, 'utf8');
+
+      // Pass arguments as an array via execFile (no shell), so a title that
+      // contains quotes, `$(...)` or backticks cannot inject shell commands.
+      const execFilePromise = promisify(execFile);
+      const args = ['-f', 'html', '-t', 'pptx', inputPath, '-o', outputPath];
+
+      // Add title if provided
+      if (title) {
+        args.push('--metadata', `title=${title}`);
+      }
+
+      console.log(`Executing pandoc command: pandoc ${args.join(' ')}`);
+
+      // Execute pandoc command
+      const { stderr } = await execFilePromise('pandoc', args);
+
+      // A non-zero exit already rejects above; stderr output that does not
+      // mention WARNING is additionally treated as a failure.
+      if (stderr && !stderr.includes('WARNING')) {
+        throw new Error(`Pandoc error: ${stderr}`);
+      }
+
+      // Read the generated PPTX file
+      return await fs.promises.readFile(outputPath);
+    } catch (error) {
+      console.error('Error generating PowerPoint with pandoc:', error);
+
+      // If pandoc fails, provide a detailed error message
+      if (error.stderr) {
+        console.error('Pandoc error output:', error.stderr);
+      }
+
+      throw new Error(`Failed to generate PowerPoint: ${error.message}`, { cause: error });
+    } finally {
+      // Always remove both temp files, even when the conversion failed;
+      // allSettled lets each removal be attempted independently.
+      const removals = [inputPath, outputPath].map((file) => fs.promises.rm(file, { force: true }));
+      const failed = (await Promise.allSettled(removals)).filter((r) => r.status === 'rejected');
+      for (const { reason } of failed) {
+        console.warn('Error cleaning up temporary files:', reason);
+      }
     }
-    
-    // Generate the PowerPoint file
-    const pptBuffer = await pptx.writeFile({ outputType: 'nodebuffer' });
-    
-    return pptBuffer;
   }
 };