Skip to content

Commit 1756c9d

Browse files
committed
change markdown parsing dependency
Replace dom-to-semantic-markdown (and jsdom) with node-html-markdown, the converter used by https://github.com/cerbos/antora-llm-generator
1 parent: 3c3d75b · commit: 1756c9d

File tree

3 files changed

+43
-636
lines changed

3 files changed

+43
-636
lines changed

lib/markdown-for-llm.js

Lines changed: 17 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -1,67 +1,16 @@
11
'use strict'
22

3-
const {convertHtmlToMarkdown} = require('dom-to-semantic-markdown')
4-
const {JSDOM} = require('jsdom')
5-
const File = require('vinyl')
6-
7-
function overrideElementProcessing (element) {
3+
const { NodeHtmlMarkdown } = require('node-html-markdown')
4+
const nhm = new NodeHtmlMarkdown()
85

9-
if (element.tagName?.toLowerCase() === 'a') {
10-
if (element.className === 'anchor') {
11-
return [{type: 'custom', blank: true}]
12-
}
13-
let href = element.getAttribute('href')
14-
const hasProtocol = /^[a-z]+:\/\//i
15-
if (href && !href.match(hasProtocol)) {
16-
href = href.replace(/\.html/, '.md')
17-
const content = toMarkdown(element.innerHTML || href)
18-
return [{type: 'link', href, content}]
19-
}
20-
}
21-
22-
if (element.classList?.contains("admonitionblock")) {
23-
element.classList.remove('admonitionblock')
24-
const admonition = element.className.toUpperCase()
25-
const content = toMarkdown(
26-
element.querySelector("td.content").innerHTML)
27-
28-
return [{
29-
type: 'custom',
30-
admonition,
31-
content
32-
}]
33-
}
34-
}
35-
36-
function renderCustomNode (node) {
37-
if (node.blank) {
38-
return ''
39-
}
40-
if (node.admonition) {
41-
const body = node.content.split('\n').map(line => `> ${line}`).join('\n')
42-
return `\n> [!${node.admonition}]\n${body}\n\n`
43-
}
44-
}
45-
46-
function toMarkdown (html) {
47-
const dom = new JSDOM(html)
48-
const markdown = convertHtmlToMarkdown(
49-
html,
50-
{
51-
overrideDOMParser: new dom.window.DOMParser(),
52-
overrideElementProcessing,
53-
renderCustomNode
54-
}
55-
)
56-
dom.window.close()
57-
return markdown
58-
}
6+
const File = require('vinyl')
597

608
function markdownify(page) {
619
const html = page.contents.toString()
6210

6311
const link = `[View original HTML](${page.pub.url})\n\n`
64-
const markdown = toMarkdown(html)
12+
13+
const markdown = link + nhm.translate(html)
6514

6615
page.out.path = page.out.path.replace(/\.html$/, '.md')
6716
page.pub.url = page.pub.url.replace(/\.html$/, '.md')
@@ -106,8 +55,18 @@ module.exports.register = function ({ playbook, config }) {
10655
&& page.pub
10756
&& page.out)
10857

109-
for (const page of pages) {
110-
markdownify(page)
58+
const CHUNK_SIZE = 100
59+
60+
for (let i = 0; i < pages.length; i += CHUNK_SIZE) {
61+
const chunk = pages.slice(i, i + CHUNK_SIZE)
62+
63+
for (const page of chunk) {
64+
markdownify(page)
65+
}
66+
67+
if (i + CHUNK_SIZE < pages.length) {
68+
await new Promise(resolve => setImmediate(resolve))
69+
}
11170
}
11271
})
11372
this.once('beforePublish', async ({ siteCatalog }) => {

0 commit comments

Comments
 (0)