Skip to content

Commit 7912b9b

Browse files
jvoisin authored and fguillot committed
perf(readability): avoid materializing text to count commas
There is no need to materialize the whole text content of the selection only to count its number of commas. As we already have a getLengthOfTextContent function that is pretty similar, this commit refactors it to make it more generic, in the form of a map/fold(+).
1 parent 2d24f5d commit 7912b9b

File tree

1 file changed

+32
-28
lines changed

1 file changed

+32
-28
lines changed

internal/reader/readability/readability.go

Lines changed: 32 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -103,23 +103,35 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
103103
}
104104

105105
func getSelectionLength(s *goquery.Selection) int {
106-
var getLengthOfTextContent func(*html.Node) int
107-
getLengthOfTextContent = func(n *html.Node) int {
106+
return sumMapOnSelection(s, func(s string) int { return len(s) })
107+
}
108+
109+
func getSelectionCommaCount(s *goquery.Selection) int {
110+
return sumMapOnSelection(s, func(s string) int { return strings.Count(s, ",") })
111+
}
112+
113+
// sumMapOnSelection applies `f` to the data of every text node in the selection, and returns the sum of the results.
114+
// This construct is used instead of goquery.Selection's .Text() method,
115+
// to avoid materializing the text to simply map/sum on it, saving a significant
116+
// amount of memory on large selections, and reducing the pressure on the garbage collector.
117+
func sumMapOnSelection(s *goquery.Selection, f func(str string) int) int {
118+
var recursiveFunction func(*html.Node) int
119+
recursiveFunction = func(n *html.Node) int {
108120
total := 0
109121
if n.Type == html.TextNode {
110-
total += len(n.Data)
122+
total += f(n.Data)
111123
}
112124
if n.FirstChild != nil {
113125
for c := n.FirstChild; c != nil; c = c.NextSibling {
114-
total += getLengthOfTextContent(c)
126+
total += recursiveFunction(c)
115127
}
116128
}
117129
return total
118130
}
119131

120132
sum := 0
121133
for _, n := range s.Nodes {
122-
sum += getLengthOfTextContent(n)
134+
sum += recursiveFunction(n)
123135
}
124136
return sum
125137
}
@@ -246,38 +258,30 @@ func getCandidates(document *goquery.Document) candidateList {
246258
return
247259
}
248260

249-
parent := s.Parent()
250-
parentNode := parent.Get(0)
261+
// Add a point for the paragraph itself as a base.
262+
contentScore := 1
251263

252-
grandParent := parent.Parent()
253-
var grandParentNode *html.Node
254-
if grandParent.Length() > 0 {
255-
grandParentNode = grandParent.Get(0)
256-
}
264+
// Add points for any commas within this paragraph.
265+
contentScore += getSelectionCommaCount(s) + 1
266+
267+
// For every 100 characters in this paragraph, add another point. Up to 3 points.
268+
contentScore += min(textLen/100, 3)
257269

270+
parent := s.Parent()
271+
parentNode := parent.Get(0)
258272
if _, found := candidates[parentNode]; !found {
259273
candidates[parentNode] = scoreNode(parent)
260274
}
275+
candidates[parentNode].score += float32(contentScore)
261276

262-
if grandParentNode != nil {
277+
// The score of the current node also contributes to its grandparent's score, but scaled to 50%.
278+
grandParent := parent.Parent()
279+
if grandParent.Length() > 0 {
280+
grandParentNode := grandParent.Get(0)
263281
if _, found := candidates[grandParentNode]; !found {
264282
candidates[grandParentNode] = scoreNode(grandParent)
265283
}
266-
}
267-
268-
// Add a point for the paragraph itself as a base.
269-
contentScore := float32(1.0)
270-
271-
// Add points for any commas within this paragraph.
272-
text := s.Text()
273-
contentScore += float32(strings.Count(text, ",") + 1)
274-
275-
// For every 100 characters in this paragraph, add another point. Up to 3 points.
276-
contentScore += float32(min(textLen/100.0, 3))
277-
278-
candidates[parentNode].score += contentScore
279-
if grandParentNode != nil {
280-
candidates[grandParentNode].score += contentScore / 2.0
284+
candidates[grandParentNode].score += float32(contentScore) / 2.0
281285
}
282286
})
283287

0 commit comments

Comments
 (0)