Skip to content

Commit 0333253

Browse files
authored
perf(readability): use happy-dom window parser (microlinkhq#822)
* perf(readability): use happy-dom window parser
* test: ensure lang is detected
* fix(readability): close detached window via happyDOM api
1 parent a1b10fc commit 0333253

File tree

2 files changed

+25
-16
lines changed

2 files changed

+25
-16
lines changed

packages/metascraper-readability/src/index.js

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
const { memoizeOne, composeRule } = require('@metascraper/helpers')
44
const { Readability } = require('@mozilla/readability')
55
const asyncMemoizeOne = require('async-memoize-one')
6-
const { Browser } = require('happy-dom')
6+
const { Window } = require('happy-dom')
77

88
const parseReader = reader => {
99
let parsed = {}
@@ -13,23 +13,20 @@ const parseReader = reader => {
1313
return parsed
1414
}
1515

16-
const getDocument = ({ url, html }) => {
17-
const browser = new Browser({
18-
settings: {
19-
disableComputedStyleRendering: true,
20-
disableCSSFileLoading: true,
21-
disableIframePageLoading: true,
22-
disableJavaScriptEvaluation: true,
23-
disableJavaScriptFileLoading: true
24-
}
25-
})
16+
const DOCUMENT_SETTINGS = {
17+
disableComputedStyleRendering: true,
18+
disableCSSFileLoading: true,
19+
disableIframePageLoading: true,
20+
disableJavaScriptEvaluation: true,
21+
disableJavaScriptFileLoading: true
22+
}
2623

27-
const page = browser.newPage()
28-
page.url = url
29-
page.content = html
24+
const getDocument = ({ url, html }) => {
25+
const window = new Window({ url, settings: DOCUMENT_SETTINGS })
26+
window.document.documentElement.innerHTML = html
3027
return {
31-
document: page.mainFrame.document,
32-
teardown: () => browser.close()
28+
document: window.document,
29+
teardown: () => window.happyDOM.close()
3330
}
3431
}
3532

packages/metascraper-readability/test/index.js

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,15 @@ test('serializes html once per invocation', async t => {
8080
await metascraper({ htmlDom: $, url })
8181
t.is(htmlCalls, 1)
8282
})
83+
84+
test('extracts lang from <html lang> attribute', async t => {
85+
const url = 'https://example.com'
86+
const html = `<!DOCTYPE html>
87+
<html lang="en">
88+
<head><title>Test</title></head>
89+
<body><p>Content</p></body>
90+
</html>`
91+
92+
const metadata = await metascraper({ html, url })
93+
t.is(metadata.lang, 'en')
94+
})

0 commit comments

Comments
 (0)