@@ -103,23 +103,35 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
103103}
104104
105105func getSelectionLength (s * goquery.Selection ) int {
106- var getLengthOfTextContent func (* html.Node ) int
107- getLengthOfTextContent = func (n * html.Node ) int {
106+ return sumMapOnSelection (s , func (s string ) int { return len (s ) })
107+ }
108+
109+ func getSelectionCommaCount (s * goquery.Selection ) int {
110+ return sumMapOnSelection (s , func (s string ) int { return strings .Count (s , "," ) })
111+ }
112+
113+ // sumMapOnSelection applies `f` to every text node in the selection and returns the sum of the results.
114+ // This construct is used instead of goquery.Selection's .Text() method
115+ // to avoid materializing the text just to map/sum over it, saving a significant
116+ // amount of memory on large selections and reducing pressure on the garbage collector.
117+ func sumMapOnSelection (s * goquery.Selection , f func (str string ) int ) int {
118+ var recursiveFunction func (* html.Node ) int
119+ recursiveFunction = func (n * html.Node ) int {
108120 total := 0
109121 if n .Type == html .TextNode {
110- total += len (n .Data )
122+ total += f (n .Data )
111123 }
112124 if n .FirstChild != nil {
113125 for c := n .FirstChild ; c != nil ; c = c .NextSibling {
114- total += getLengthOfTextContent (c )
126+ total += recursiveFunction (c )
115127 }
116128 }
117129 return total
118130 }
119131
120132 sum := 0
121133 for _ , n := range s .Nodes {
122- sum += getLengthOfTextContent (n )
134+ sum += recursiveFunction (n )
123135 }
124136 return sum
125137}
@@ -246,38 +258,30 @@ func getCandidates(document *goquery.Document) candidateList {
246258 return
247259 }
248260
249- parent := s . Parent ()
250- parentNode := parent . Get ( 0 )
261+ // Add a point for the paragraph itself as a base.
262+ contentScore := 1
251263
252- grandParent := parent . Parent ()
253- var grandParentNode * html. Node
254- if grandParent . Length () > 0 {
255- grandParentNode = grandParent . Get ( 0 )
256- }
264+ // Add one point per comma in this paragraph, plus one (i.e. the number of comma-separated segments).
265+ contentScore += getSelectionCommaCount ( s ) + 1
266+
267+ // Add one point for every 100 characters in this paragraph, up to a maximum of 3 points.
268+ contentScore += min ( textLen / 100 , 3 )
257269
270+ parent := s .Parent ()
271+ parentNode := parent .Get (0 )
258272 if _ , found := candidates [parentNode ]; ! found {
259273 candidates [parentNode ] = scoreNode (parent )
260274 }
275+ candidates [parentNode ].score += float32 (contentScore )
261276
262- if grandParentNode != nil {
277+ // The current node's score also contributes to its grandparent's score, scaled down to 50%.
278+ grandParent := parent .Parent ()
279+ if grandParent .Length () > 0 {
280+ grandParentNode := grandParent .Get (0 )
263281 if _ , found := candidates [grandParentNode ]; ! found {
264282 candidates [grandParentNode ] = scoreNode (grandParent )
265283 }
266- }
267-
268- // Add a point for the paragraph itself as a base.
269- contentScore := float32 (1.0 )
270-
271- // Add points for any commas within this paragraph.
272- text := s .Text ()
273- contentScore += float32 (strings .Count (text , "," ) + 1 )
274-
275- // For every 100 characters in this paragraph, add another point. Up to 3 points.
276- contentScore += float32 (min (textLen / 100.0 , 3 ))
277-
278- candidates [parentNode ].score += contentScore
279- if grandParentNode != nil {
280- candidates [grandParentNode ].score += contentScore / 2.0
284+ candidates [grandParentNode ].score += float32 (contentScore ) / 2.0
281285 }
282286 })
283287
0 commit comments