|
1 | 1 | import { HtmlToMarkdownConverter } from './types'; |
2 | 2 |
|
| 3 | +import * as cheerio from 'cheerio'; |
3 | 4 | import { NodeHtmlMarkdown, NodeHtmlMarkdownOptions } from 'node-html-markdown'; |
4 | 5 |
|
5 | 6 | /** |
@@ -44,24 +45,34 @@ export class NodeHtmlMarkdownConverter implements HtmlToMarkdownConverter { |
44 | 45 | } |
45 | 46 |
|
/**
 * Extract title and main content from HTML using the cheerio DOM parser.
 *
 * Title: the text of the first `<h1>`; if that is empty, the text of the
 * first `<title>` element. Whitespace runs inside the title are collapsed to
 * a single space so that markup wrapped across several source lines still
 * produces a single-line title.
 *
 * Content: the inner HTML of the first
 * `div.md-content[data-md-component="content"]` element when one exists;
 * otherwise the inner HTML of `<body>`; otherwise the unmodified input.
 *
 * @param html The HTML content to process
 * @returns Object containing title and main content
 */
extractContent(html: string): { title: string, content: string } {
  const $ = cheerio.load(html);

  // `.first()` matters: `.text()` on a multi-element selection concatenates
  // the text of every match, and `title` also matches <title> children of
  // inline SVGs, not just the document title.
  const textOf = (selector: string): string =>
    $(selector).first().text().replace(/\s+/g, ' ').trim();

  // Prefer the visible page heading; fall back to the document title.
  const title = textOf('h1') || textOf('title');

  // Target the md-content container as the main content region.
  const mdContent = $('div.md-content[data-md-component="content"]').first();

  const content = mdContent.length > 0
    ? (mdContent.html() ?? '')
    // `||` (not `??`) is deliberate: an empty <body> should also fall back
    // to the raw input rather than yield an empty string.
    : ($('body').html() || html);

  return { title, content };
}
67 | 78 | } |
0 commit comments