Skip to content

Commit 685dc59

Browse files
authored
fix: markdown converter, replace regex with cheerio for extracting main content div (#37)
1 parent 47bc5fb commit 685dc59

File tree

4 files changed: 148 additions and 11 deletions.

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
"@types/cacache": "^17.0.2",
5454
"@types/node": "^22.14.1",
5555
"cacache": "^19.0.1",
56+
"cheerio": "^1.0.0",
5657
"lunr": "^2.3.9",
5758
"lunr-languages": "^1.14.0",
5859
"make-fetch-happen": "^14.0.3",

pnpm-lock.yaml

Lines changed: 99 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/services/markdown/nodeHtmlMarkdownConverter.spec.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,32 @@ describe('[Markdown-Converter] When using NodeHtmlMarkdownConverter', () => {
3131
const result = converter.extractContent(html);
3232
expect(result.content).toContain('Test content');
3333
});
34+
35+
it('should extract complete content from md-content div with nested elements', () => {
36+
const html = `
37+
<html>
38+
<body>
39+
<div class="md-content" data-md-component="content">
40+
<h2>Section 1</h2>
41+
<p>First paragraph</p>
42+
<div class="nested">
43+
<h3>Subsection</h3>
44+
<p>Nested content</p>
45+
</div>
46+
<h2>Section 2</h2>
47+
<p>Final paragraph</p>
48+
</div>
49+
</body>
50+
</html>
51+
`;
52+
const result = converter.extractContent(html);
53+
expect(result.content).toContain('Section 1');
54+
expect(result.content).toContain('First paragraph');
55+
expect(result.content).toContain('Subsection');
56+
expect(result.content).toContain('Nested content');
57+
expect(result.content).toContain('Section 2');
58+
expect(result.content).toContain('Final paragraph');
59+
});
3460
});
3561

3662
describe('[Markdown-Conversion] When converting HTML to markdown', () => {

src/services/markdown/nodeHtmlMarkdownConverter.ts

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { HtmlToMarkdownConverter } from './types';
22

3+
import * as cheerio from 'cheerio';
34
import { NodeHtmlMarkdown, NodeHtmlMarkdownOptions } from 'node-html-markdown';
45

56
/**
@@ -44,24 +45,34 @@ export class NodeHtmlMarkdownConverter implements HtmlToMarkdownConverter {
4445
}
4546

4647
/**
47-
* Extract title and main content from HTML using regex
48+
* Extract title and main content from HTML using cheerio DOM parser
4849
* @param html The HTML content to process
4950
* @returns Object containing title and main content
5051
*/
5152
extractContent(html: string): { title: string, content: string } {
52-
// Simple regex approach for title extraction
53-
const titleMatch = html.match(/<h1[^>]*>(.*?)<\/h1>/i) ||
54-
html.match(/<title[^>]*>(.*?)<\/title>/i);
55-
const title = titleMatch ? titleMatch[1].trim() : '';
53+
// Load HTML into cheerio
54+
const $ = cheerio.load(html);
55+
56+
// Extract title - first try h1, then fall back to title tag
57+
let title = $('h1').first().text().trim();
58+
if (!title) {
59+
title = $('title').text().trim();
60+
}
5661

5762
// Extract main content - target the md-content container
58-
const contentMatch = html.match(/<div class="md-content"[^>]*data-md-component="content"[^>]*>([\s\S]*?)<\/div>/i);
63+
let contentHtml = '';
64+
65+
// First try to find the main content container
66+
const mdContent = $('div.md-content[data-md-component="content"]');
5967

60-
// If we found the main content container, use it; otherwise use the body
61-
const content = contentMatch ?
62-
contentMatch[0] :
63-
html.match(/<body[^>]*>([\s\S]*?)<\/body>/i)?.[0] || html;
68+
if (mdContent.length > 0) {
69+
// Get the HTML content of the md-content div
70+
contentHtml = mdContent.html() || '';
71+
} else {
72+
// Fall back to body content if md-content not found
73+
contentHtml = $('body').html() || html;
74+
}
6475

65-
return { title, content };
76+
return { title, content: contentHtml };
6677
}
6778
}

0 commit comments

Comments (0)