Skip to content

Commit 6dcbf19

Browse files
committed
Optimize traversing the DOM when analyzing text content
Previously, this was an often-repeated construct in readability implementation: charCount(ps.getInnerText(node, true)) == 0 What this would do is: - Call `dom.TextContent(node)` to append the contents of all individual text nodes together; - Pass the result through `strings.TrimSpace`; - Pass the result through the NormalizeSpaces regex which squashes consecutive runs of whitespace; - Count the Unicode runes of the result; - Finally, if the count is zero, the element would be considered "empty". The above is an example of an incredibly costly operation that could be done much more efficiently, for example: walk the DOM subtree until the first non-space character is found, then bail out and conclude that the element has content. This barely needs any memory allocations, and is the approach taken in this PR to address a variety of counting or detecting tasks that share a similar purpose. Benchmark before vs. after for processing a large HTML document reveals significant saving in memory allocations: variant | times | ns/op | Bytes/op | allocs/op --------|-------|------------|------------|---------- before | 30 | 38,986,203 | 59,623,683 | 199,876 after | 36 | 31,910,769 | 11,449,004 | 119,810
1 parent 9f5bf5c commit 6dcbf19

File tree

2 files changed

+143
-66
lines changed

2 files changed

+143
-66
lines changed

parser.go

Lines changed: 113 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"strconv"
1313
"strings"
1414
"time"
15+
"unicode"
1516

1617
"github.com/go-shiori/dom"
1718
"github.com/go-shiori/go-readability/internal/re2go"
@@ -229,6 +230,63 @@ func (ps *Parser) getAllNodesWithTag(node *html.Node, tagNames ...string) []*htm
229230
return result
230231
}
231232

233+
// hasTextContent reports whether a node or any of its descendants have text content other than spaces.
234+
func hasTextContent(node *html.Node) bool {
235+
if node.Type == html.TextNode {
236+
return hasContent(node.Data)
237+
}
238+
for child := range node.ChildNodes() {
239+
if hasTextContent(child) {
240+
return true
241+
}
242+
}
243+
return false
244+
}
245+
246+
// countCharsAndCommas returns counts for both characters and commas in a node's
247+
// text. Leading and trailing whitespace is not counted, nor are consecutive
248+
// runs of whitespace.
249+
func countCharsAndCommas(node *html.Node) (int, int) {
250+
numChars := 0
251+
numCommas := 0
252+
lastCharWasSpace := false
253+
seenNonSpace := false
254+
255+
// Walk the node and its descendants to count all non-space characters and
256+
// different comma variants separately.
257+
var walk func(*html.Node)
258+
walk = func(n *html.Node) {
259+
if n.Type == html.TextNode {
260+
for _, r := range n.Data {
261+
if unicode.IsSpace(r) {
262+
lastCharWasSpace = true
263+
continue
264+
}
265+
if lastCharWasSpace && seenNonSpace {
266+
numChars += 2
267+
} else {
268+
numChars += 1
269+
}
270+
lastCharWasSpace = false
271+
seenNonSpace = true
272+
switch r {
273+
// Commas as used in Latin, Sindhi, Chinese and various other scripts.
274+
// see: https://en.wikipedia.org/wiki/Comma#Comma_variants
275+
case '\u002C', '\u060C', '\uFE50', '\uFE10', '\uFE11', '\u2E41', '\u2E34', '\u2E32', '\uFF0C':
276+
numCommas++
277+
}
278+
}
279+
return
280+
}
281+
for child := range n.ChildNodes() {
282+
walk(child)
283+
}
284+
}
285+
286+
walk(node)
287+
return numChars, numCommas
288+
}
289+
232290
// cleanClasses removes the class="" attribute from every element in the
233291
// given subtree, except those that match CLASSES_TO_PRESERVE and the
234292
// classesToPreserve array from the options object.
@@ -327,7 +385,7 @@ func (ps *Parser) simplifyNestedElements(articleContent *html.Node) {
327385

328386
if node.Parent != nil && (nodeTagName == "div" || nodeTagName == "section") &&
329387
!strings.HasPrefix(nodeID, "readability") {
330-
if ps.isElementWithoutContent(node) {
388+
if isElementWithoutContent(node) {
331389
node = ps.removeAndGetNext(node)
332390
continue
333391
}
@@ -405,8 +463,7 @@ func (ps *Parser) getArticleTitle() string {
405463
}
406464
}
407465

408-
curTitle = strings.TrimSpace(curTitle)
409-
curTitle = re2go.NormalizeSpaces(curTitle)
466+
curTitle = normalizeWhitespace(curTitle)
410467
// If we now have 4 words or fewer as our title, and either no
411468
// 'hierarchical' separators (\, /, > or ») were found in the original
412469
// title or we decreased the number of words by more than 1 word, use
@@ -589,7 +646,7 @@ func (ps *Parser) prepArticle(articleContent *html.Node) {
589646
iframeCount := len(dom.GetElementsByTagName(p, "iframe"))
590647
totalCount := imgCount + embedCount + objectCount + iframeCount
591648

592-
return totalCount == 0 && ps.getInnerText(p, false) == ""
649+
return totalCount == 0 && !hasTextContent(p)
593650
})
594651

595652
ps.forEachNode(dom.GetElementsByTagName(articleContent, "br"), func(br *html.Node, _ int) {
@@ -714,28 +771,32 @@ func (ps *Parser) checkByline(node *html.Node, matchString string) bool {
714771

715772
rel := dom.GetAttribute(node, "rel")
716773
itemprop := dom.GetAttribute(node, "itemprop")
717-
nodeText := dom.TextContent(node)
718-
if (rel == "author" || strings.Contains(itemprop, "author") || re2go.IsByline(matchString)) &&
719-
ps.isValidByline(nodeText) {
720-
nodeText = strings.TrimSpace(nodeText)
721-
nodeText = strings.Join(strings.Fields(nodeText), " ")
722-
ps.articleByline = nodeText
723-
return true
774+
if rel != "author" && !strings.Contains(itemprop, "author") && !re2go.IsByline(matchString) {
775+
return false
724776
}
725777

778+
nodeText := ps.getInnerText(node, false)
779+
// For now, it's intentional that counting characters happens before
780+
// whitespace normalization. Doing it the other way around breaks several
781+
// tests and the bylines end up different.
782+
if nChar := charCount(nodeText); nChar > 0 && nChar < 100 {
783+
ps.articleByline = normalizeWhitespace(nodeText)
784+
return true
785+
}
726786
return false
727787
}
728788

729789
func (ps *Parser) getTextDensity(node *html.Node, tags ...string) float64 {
730-
textLength := charCount(ps.getInnerText(node, true))
790+
textLength, _ := countCharsAndCommas(node)
731791
if textLength == 0 {
732792
return 0
733793
}
734794

735795
var childrenLength int
736796
children := ps.getAllNodesWithTag(node, tags...)
737797
ps.forEachNode(children, func(child *html.Node, _ int) {
738-
childrenLength += charCount(ps.getInnerText(child, true))
798+
childLength, _ := countCharsAndCommas(child)
799+
childrenLength += childLength
739800
})
740801

741802
return float64(childrenLength) / float64(textLength)
@@ -816,7 +877,7 @@ func (ps *Parser) grabArticle() *html.Node {
816877

817878
if shouldRemoveTitleHeader && ps.headerDuplicatesTitle(node) {
818879
ps.logf("removing header: %q duplicate of %q\n",
819-
trim(dom.TextContent(node)), trim(ps.articleTitle))
880+
ps.getInnerText(node, true), normalizeWhitespace(ps.articleTitle))
820881
shouldRemoveTitleHeader = false
821882
node = ps.removeAndGetNext(node)
822883
continue
@@ -848,7 +909,7 @@ func (ps *Parser) grabArticle() *html.Node {
848909
switch nodeTagName {
849910
case "div", "section", "header",
850911
"h1", "h2", "h3", "h4", "h5", "h6":
851-
if ps.isElementWithoutContent(node) {
912+
if isElementWithoutContent(node) {
852913
node = ps.removeAndGetNext(node)
853914
continue
854915
}
@@ -911,9 +972,9 @@ func (ps *Parser) grabArticle() *html.Node {
911972
return
912973
}
913974

975+
numChars, numCommas := countCharsAndCommas(elementToScore)
914976
// If this paragraph is less than 25 characters, don't even count it.
915-
innerText := ps.getInnerText(elementToScore, true)
916-
if charCount(innerText) < 25 {
977+
if numChars < 25 {
917978
return
918979
}
919980

@@ -927,10 +988,10 @@ func (ps *Parser) grabArticle() *html.Node {
927988
contentScore := 1
928989

929990
// Add points for any commas within this paragraph.
930-
contentScore += re2go.CountCommas(innerText)
991+
contentScore += numCommas
931992

932993
// For every 100 characters in this paragraph, add another point. Up to 3 points.
933-
contentScore += int(math.Min(math.Floor(float64(charCount(innerText))/100.0), 3.0))
994+
contentScore += int(math.Min(math.Floor(float64(numChars)/100.0), 3.0))
934995

935996
// Initialize and score ancestors.
936997
ps.forEachNode(ancestors, func(ancestor *html.Node, level int) {
@@ -1199,7 +1260,7 @@ func (ps *Parser) grabArticle() *html.Node {
11991260
// gives us a higher likelihood of finding the content, and
12001261
// the sieve approach gives us a higher likelihood of
12011262
// finding the -right- content.
1202-
textLength := charCount(ps.getInnerText(articleContent, true))
1263+
textLength, _ := countCharsAndCommas(articleContent)
12031264
if textLength < ps.CharThresholds {
12041265
parseSuccessful = false
12051266

@@ -1249,15 +1310,6 @@ func (ps *Parser) grabArticle() *html.Node {
12491310
}
12501311
}
12511312

1252-
// isValidByline checks whether the input string could be a byline.
1253-
// This verifies that the input is a string, and that the length
1254-
// is less than 100 chars.
1255-
func (ps *Parser) isValidByline(byline string) bool {
1256-
byline = strings.TrimSpace(byline)
1257-
nChar := charCount(byline)
1258-
return nChar > 0 && nChar < 100
1259-
}
1260-
12611313
// getJSONLD try to extract metadata from JSON-LD object.
12621314
// For now, only Schema.org objects of type Article or its subtypes are supported.
12631315
func (ps *Parser) getJSONLD() (map[string]string, error) {
@@ -1515,8 +1567,7 @@ func (ps *Parser) isSingleImage(node *html.Node) bool {
15151567
}
15161568

15171569
children := dom.Children(node)
1518-
textContent := dom.TextContent(node)
1519-
if len(children) != 1 || strings.TrimSpace(textContent) != "" {
1570+
if len(children) != 1 || hasTextContent(node) {
15201571
return false
15211572
}
15221573

@@ -1623,16 +1674,22 @@ func (ps *Parser) hasSingleTagInsideElement(element *html.Node, tag string) bool
16231674
})
16241675
}
16251676

1626-
// isElementWithoutContent determines if node is empty
1627-
// or only fille with <br> and <hr>.
1628-
func (ps *Parser) isElementWithoutContent(node *html.Node) bool {
1629-
brs := dom.GetElementsByTagName(node, "br")
1630-
hrs := dom.GetElementsByTagName(node, "hr")
1631-
childs := dom.Children(node)
1632-
1633-
return node.Type == html.ElementNode &&
1634-
strings.TrimSpace(dom.TextContent(node)) == "" &&
1635-
(len(childs) == 0 || len(childs) == len(brs)+len(hrs))
1677+
func isElementWithoutContent(node *html.Node) bool {
1678+
if node.Type != html.ElementNode {
1679+
return false
1680+
}
1681+
// Traverse the node's descendants to find any text content that is
1682+
// non-whitespace or any elements other than <br> and <hr>.
1683+
for child := range node.ChildNodes() {
1684+
if child.Type == html.TextNode {
1685+
if hasContent(child.Data) {
1686+
return false
1687+
}
1688+
} else if child.Type == html.ElementNode && child.Data != "br" && child.Data != "hr" {
1689+
return false
1690+
}
1691+
}
1692+
return true
16361693
}
16371694

16381695
// hasChildBlockElement determines whether element has any children
@@ -1654,26 +1711,18 @@ func (ps *Parser) isPhrasingContent(node *html.Node) bool {
16541711

16551712
// isWhitespace determines if a node only used as whitespace.
16561713
func (ps *Parser) isWhitespace(node *html.Node) bool {
1657-
return (node.Type == html.TextNode && strings.TrimSpace(dom.TextContent(node)) == "") ||
1714+
return (node.Type == html.TextNode && !hasTextContent(node)) ||
16581715
(node.Type == html.ElementNode && dom.TagName(node) == "br")
16591716
}
16601717

1661-
// getInnerText gets the inner text of a node.
1662-
// This also strips * out any excess whitespace to be found.
1663-
// In Readability.js, normalizeSpaces default to true.
1718+
// getInnerText gets the inner text of a node. This also strips out any excess
1719+
// whitespace to be found. In Readability.js, normalizeSpaces defaults to true.
16641720
func (ps *Parser) getInnerText(node *html.Node, normalizeSpaces bool) string {
1665-
textContent := strings.TrimSpace(dom.TextContent(node))
1721+
textContent := dom.TextContent(node)
16661722
if normalizeSpaces {
1667-
textContent = re2go.NormalizeSpaces(textContent)
1723+
return normalizeWhitespace(textContent)
16681724
}
1669-
return textContent
1670-
}
1671-
1672-
// getCharCount returns the number of times a string s
1673-
// appears in the node.
1674-
func (ps *Parser) getCharCount(node *html.Node, s string) int {
1675-
innerText := ps.getInnerText(node, true)
1676-
return strings.Count(innerText, s)
1725+
return strings.TrimSpace(textContent)
16771726
}
16781727

16791728
// cleanStyles removes the style attribute on every node and under.
@@ -1702,7 +1751,7 @@ func (ps *Parser) cleanStyles(node *html.Node) {
17021751
// content. This is the amount of text that is inside a link divided
17031752
// by the total text in the node.
17041753
func (ps *Parser) getLinkDensity(element *html.Node) float64 {
1705-
textLength := charCount(ps.getInnerText(element, true))
1754+
textLength, _ := countCharsAndCommas(element)
17061755
if textLength == 0 {
17071756
return 0
17081757
}
@@ -1717,7 +1766,7 @@ func (ps *Parser) getLinkDensity(element *html.Node) float64 {
17171766
coefficient = 0.3
17181767
}
17191768

1720-
nodeLength := charCount(ps.getInnerText(linkNode, true))
1769+
nodeLength, _ := countCharsAndCommas(linkNode)
17211770
linkLength += float64(nodeLength) * coefficient
17221771
})
17231772

@@ -2019,10 +2068,11 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
20192068
var listLength int
20202069
listNodes := ps.getAllNodesWithTag(node, "ul", "ol")
20212070
ps.forEachNode(listNodes, func(list *html.Node, _ int) {
2022-
listLength += charCount(ps.getInnerText(list, true))
2071+
n, _ := countCharsAndCommas(list)
2072+
listLength += n
20232073
})
20242074

2025-
nodeLength := charCount(ps.getInnerText(node, true))
2075+
nodeLength, _ := countCharsAndCommas(node)
20262076
isList = float64(listLength)/float64(nodeLength) > 0.9
20272077
}
20282078

@@ -2041,14 +2091,16 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
20412091
return true
20422092
}
20432093

2044-
if ps.getCharCount(node, ",") < 10 {
2094+
// FIXME: countCharsAndCommas(node) was already called for non-lists above
2095+
if contentLength, commaCount := countCharsAndCommas(node); commaCount < 10 {
20452096
// If there are not very many commas, and the number of
20462097
// non-paragraph elements is more than paragraphs or other
20472098
// ominous signs, remove the element.
20482099
p := float64(len(dom.GetElementsByTagName(node, "p")))
20492100
img := float64(len(dom.GetElementsByTagName(node, "img")))
20502101
li := float64(len(dom.GetElementsByTagName(node, "li")) - 100)
20512102
input := float64(len(dom.GetElementsByTagName(node, "input")))
2103+
// FIXME: this also calls countCharsAndCommas(node)
20522104
headingDensity := ps.getTextDensity(node, "h1", "h2", "h3", "h4", "h5", "h6")
20532105

20542106
embedCount := 0
@@ -2071,8 +2123,8 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
20712123
embedCount++
20722124
}
20732125

2126+
// FIXME: this also calls countCharsAndCommas(node)
20742127
linkDensity := ps.getLinkDensity(node)
2075-
contentLength := charCount(ps.getInnerText(node, true))
20762128
haveToRemove := (img > 1 && p/img < 0.5 && !ps.hasAncestorTag(node, "figure", 3, nil)) ||
20772129
(!isList && li > p) ||
20782130
(input > math.Floor(p/3)) ||

utils.go

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@ package readability
33
import (
44
nurl "net/url"
55
"strings"
6+
"unicode"
67
"unicode/utf8"
8+
9+
"github.com/go-shiori/go-readability/internal/re2go"
710
)
811

912
// indexOf returns the position of the first occurrence of a
@@ -28,6 +31,33 @@ func charCount(str string) int {
2831
return utf8.RuneCountInString(str)
2932
}
3033

34+
// normalizeWhitespace trims leading and trailing whitespace and collapses all
35+
// consecutive chains of whitespace as a single space.
36+
func normalizeWhitespace(str string) string {
37+
return re2go.NormalizeSpaces(strings.TrimSpace(str))
38+
}
39+
40+
// asciiSpace marks the byte values that are ASCII whitespace.
var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}

// hasContent reports whether a string contains a non-space character.
func hasContent(str string) bool {
	for i := 0; i < len(str); i++ {
		b := str[i]
		if b >= utf8.RuneSelf {
			// Hit a non-ASCII byte: fall back to the slower Unicode-aware
			// scan over the remaining bytes.
			return strings.ContainsFunc(str[i:], func(r rune) bool {
				return !unicode.IsSpace(r)
			})
		}
		if asciiSpace[b] == 0 {
			return true
		}
	}
	return false
}
60+
3161
// isValidURL checks if URL is valid.
3262
func isValidURL(s string) bool {
3363
_, err := nurl.ParseRequestURI(s)
@@ -93,8 +123,3 @@ func strFilter(strs []string, filter func(string) bool) []string {
93123
}
94124
return result
95125
}
96-
97-
func trim(s string) string {
98-
s = strings.Join(strings.Fields(s), " ")
99-
return strings.TrimSpace(s)
100-
}

0 commit comments

Comments
 (0)