@@ -12,6 +12,7 @@ import (
1212 "strconv"
1313 "strings"
1414 "time"
15+ "unicode"
1516
1617 "github.com/go-shiori/dom"
1718 "github.com/go-shiori/go-readability/internal/re2go"
@@ -229,6 +230,63 @@ func (ps *Parser) getAllNodesWithTag(node *html.Node, tagNames ...string) []*htm
229230 return result
230231}
231232
233+ // hasTextContent reports whether a node or any of its descendants have text content other than spaces.
234+ func hasTextContent (node * html.Node ) bool {
235+ if node .Type == html .TextNode {
236+ return hasContent (node .Data )
237+ }
238+ for child := range node .ChildNodes () {
239+ if hasTextContent (child ) {
240+ return true
241+ }
242+ }
243+ return false
244+ }
245+
246+ // countCharsAndCommas returns counts for both characters and commas in a node's
247+ // text. Leading and trailing whitespace is not counted, nor are consecutive
248+ // runs of whitespace.
249+ func countCharsAndCommas (node * html.Node ) (int , int ) {
250+ numChars := 0
251+ numCommas := 0
252+ lastCharWasSpace := false
253+ seenNonSpace := false
254+
255+ // Walk the node and its descendants to count all non-space characters and
256+ // different comma variants separately.
257+ var walk func (* html.Node )
258+ walk = func (n * html.Node ) {
259+ if n .Type == html .TextNode {
260+ for _ , r := range n .Data {
261+ if unicode .IsSpace (r ) {
262+ lastCharWasSpace = true
263+ continue
264+ }
265+ if lastCharWasSpace && seenNonSpace {
266+ numChars += 2
267+ } else {
268+ numChars += 1
269+ }
270+ lastCharWasSpace = false
271+ seenNonSpace = true
272+ switch r {
273+ // Commas as used in Latin, Sindhi, Chinese and various other scripts.
274+ // see: https://en.wikipedia.org/wiki/Comma#Comma_variants
275+ case '\u002C' , '\u060C' , '\uFE50' , '\uFE10' , '\uFE11' , '\u2E41' , '\u2E34' , '\u2E32' , '\uFF0C' :
276+ numCommas ++
277+ }
278+ }
279+ return
280+ }
281+ for child := range n .ChildNodes () {
282+ walk (child )
283+ }
284+ }
285+
286+ walk (node )
287+ return numChars , numCommas
288+ }
289+
232290// cleanClasses removes the class="" attribute from every element in the
233291// given subtree, except those that match CLASSES_TO_PRESERVE and the
234292// classesToPreserve array from the options object.
@@ -327,7 +385,7 @@ func (ps *Parser) simplifyNestedElements(articleContent *html.Node) {
327385
328386 if node .Parent != nil && (nodeTagName == "div" || nodeTagName == "section" ) &&
329387 ! strings .HasPrefix (nodeID , "readability" ) {
330- if ps . isElementWithoutContent (node ) {
388+ if isElementWithoutContent (node ) {
331389 node = ps .removeAndGetNext (node )
332390 continue
333391 }
@@ -405,8 +463,7 @@ func (ps *Parser) getArticleTitle() string {
405463 }
406464 }
407465
408- curTitle = strings .TrimSpace (curTitle )
409- curTitle = re2go .NormalizeSpaces (curTitle )
466+ curTitle = normalizeWhitespace (curTitle )
410467 // If we now have 4 words or fewer as our title, and either no
411468 // 'hierarchical' separators (\, /, > or ») were found in the original
412469 // title or we decreased the number of words by more than 1 word, use
@@ -589,7 +646,7 @@ func (ps *Parser) prepArticle(articleContent *html.Node) {
589646 iframeCount := len (dom .GetElementsByTagName (p , "iframe" ))
590647 totalCount := imgCount + embedCount + objectCount + iframeCount
591648
592- return totalCount == 0 && ps . getInnerText ( p , false ) == ""
649+ return totalCount == 0 && ! hasTextContent ( p )
593650 })
594651
595652 ps .forEachNode (dom .GetElementsByTagName (articleContent , "br" ), func (br * html.Node , _ int ) {
@@ -714,28 +771,32 @@ func (ps *Parser) checkByline(node *html.Node, matchString string) bool {
714771
715772 rel := dom .GetAttribute (node , "rel" )
716773 itemprop := dom .GetAttribute (node , "itemprop" )
717- nodeText := dom .TextContent (node )
718- if (rel == "author" || strings .Contains (itemprop , "author" ) || re2go .IsByline (matchString )) &&
719- ps .isValidByline (nodeText ) {
720- nodeText = strings .TrimSpace (nodeText )
721- nodeText = strings .Join (strings .Fields (nodeText ), " " )
722- ps .articleByline = nodeText
723- return true
774+ if rel != "author" && ! strings .Contains (itemprop , "author" ) && ! re2go .IsByline (matchString ) {
775+ return false
724776 }
725777
778+ nodeText := ps .getInnerText (node , false )
779+ // For now, it's intentional that counting characters happens before
780+ // whitespace normalization. Doing it the other way around breaks several
781+ // tests and the bylines end up different.
782+ if nChar := charCount (nodeText ); nChar > 0 && nChar < 100 {
783+ ps .articleByline = normalizeWhitespace (nodeText )
784+ return true
785+ }
726786 return false
727787}
728788
729789func (ps * Parser ) getTextDensity (node * html.Node , tags ... string ) float64 {
730- textLength := charCount ( ps . getInnerText ( node , true ) )
790+ textLength , _ := countCharsAndCommas ( node )
731791 if textLength == 0 {
732792 return 0
733793 }
734794
735795 var childrenLength int
736796 children := ps .getAllNodesWithTag (node , tags ... )
737797 ps .forEachNode (children , func (child * html.Node , _ int ) {
738- childrenLength += charCount (ps .getInnerText (child , true ))
798+ childLength , _ := countCharsAndCommas (child )
799+ childrenLength += childLength
739800 })
740801
741802 return float64 (childrenLength ) / float64 (textLength )
@@ -816,7 +877,7 @@ func (ps *Parser) grabArticle() *html.Node {
816877
817878 if shouldRemoveTitleHeader && ps .headerDuplicatesTitle (node ) {
818879 ps .logf ("removing header: %q duplicate of %q\n " ,
819- trim ( dom . TextContent (node )), trim (ps .articleTitle ))
880+ ps . getInnerText (node , true ), normalizeWhitespace (ps .articleTitle ))
820881 shouldRemoveTitleHeader = false
821882 node = ps .removeAndGetNext (node )
822883 continue
@@ -848,7 +909,7 @@ func (ps *Parser) grabArticle() *html.Node {
848909 switch nodeTagName {
849910 case "div" , "section" , "header" ,
850911 "h1" , "h2" , "h3" , "h4" , "h5" , "h6" :
851- if ps . isElementWithoutContent (node ) {
912+ if isElementWithoutContent (node ) {
852913 node = ps .removeAndGetNext (node )
853914 continue
854915 }
@@ -911,9 +972,9 @@ func (ps *Parser) grabArticle() *html.Node {
911972 return
912973 }
913974
975+ numChars , numCommas := countCharsAndCommas (elementToScore )
914976 // If this paragraph is less than 25 characters, don't even count it.
915- innerText := ps .getInnerText (elementToScore , true )
916- if charCount (innerText ) < 25 {
977+ if numChars < 25 {
917978 return
918979 }
919980
@@ -927,10 +988,10 @@ func (ps *Parser) grabArticle() *html.Node {
927988 contentScore := 1
928989
929990 // Add points for any commas within this paragraph.
930- contentScore += re2go . CountCommas ( innerText )
991+ contentScore += numCommas
931992
932993 // For every 100 characters in this paragraph, add another point. Up to 3 points.
933- contentScore += int (math .Min (math .Floor (float64 (charCount ( innerText ) )/ 100.0 ), 3.0 ))
994+ contentScore += int (math .Min (math .Floor (float64 (numChars )/ 100.0 ), 3.0 ))
934995
935996 // Initialize and score ancestors.
936997 ps .forEachNode (ancestors , func (ancestor * html.Node , level int ) {
@@ -1199,7 +1260,7 @@ func (ps *Parser) grabArticle() *html.Node {
11991260 // gives us a higher likelihood of finding the content, and
12001261 // the sieve approach gives us a higher likelihood of
12011262 // finding the -right- content.
1202- textLength := charCount ( ps . getInnerText ( articleContent , true ) )
1263+ textLength , _ := countCharsAndCommas ( articleContent )
12031264 if textLength < ps .CharThresholds {
12041265 parseSuccessful = false
12051266
@@ -1249,15 +1310,6 @@ func (ps *Parser) grabArticle() *html.Node {
12491310 }
12501311}
12511312
1252- // isValidByline checks whether the input string could be a byline.
1253- // This verifies that the input is a string, and that the length
1254- // is less than 100 chars.
1255- func (ps * Parser ) isValidByline (byline string ) bool {
1256- byline = strings .TrimSpace (byline )
1257- nChar := charCount (byline )
1258- return nChar > 0 && nChar < 100
1259- }
1260-
12611313// getJSONLD try to extract metadata from JSON-LD object.
12621314// For now, only Schema.org objects of type Article or its subtypes are supported.
12631315func (ps * Parser ) getJSONLD () (map [string ]string , error ) {
@@ -1515,8 +1567,7 @@ func (ps *Parser) isSingleImage(node *html.Node) bool {
15151567 }
15161568
15171569 children := dom .Children (node )
1518- textContent := dom .TextContent (node )
1519- if len (children ) != 1 || strings .TrimSpace (textContent ) != "" {
1570+ if len (children ) != 1 || hasTextContent (node ) {
15201571 return false
15211572 }
15221573
@@ -1623,16 +1674,22 @@ func (ps *Parser) hasSingleTagInsideElement(element *html.Node, tag string) bool
16231674 })
16241675}
16251676
1626- // isElementWithoutContent determines if node is empty
1627- // or only fille with <br> and <hr>.
1628- func (ps * Parser ) isElementWithoutContent (node * html.Node ) bool {
1629- brs := dom .GetElementsByTagName (node , "br" )
1630- hrs := dom .GetElementsByTagName (node , "hr" )
1631- childs := dom .Children (node )
1632-
1633- return node .Type == html .ElementNode &&
1634- strings .TrimSpace (dom .TextContent (node )) == "" &&
1635- (len (childs ) == 0 || len (childs ) == len (brs )+ len (hrs ))
1677+ func isElementWithoutContent (node * html.Node ) bool {
1678+ if node .Type != html .ElementNode {
1679+ return false
1680+ }
1681+ // Traverse the node's descendants to find any text content that is
1682+ // non-whitespace or any elements other than <br> and <hr>.
1683+ for child := range node .ChildNodes () {
1684+ if child .Type == html .TextNode {
1685+ if hasContent (child .Data ) {
1686+ return false
1687+ }
1688+ } else if child .Type == html .ElementNode && child .Data != "br" && child .Data != "hr" {
1689+ return false
1690+ }
1691+ }
1692+ return true
16361693}
16371694
16381695// hasChildBlockElement determines whether element has any children
@@ -1654,26 +1711,18 @@ func (ps *Parser) isPhrasingContent(node *html.Node) bool {
16541711
16551712// isWhitespace determines if a node only used as whitespace.
16561713func (ps * Parser ) isWhitespace (node * html.Node ) bool {
1657- return (node .Type == html .TextNode && strings . TrimSpace ( dom . TextContent ( node )) == "" ) ||
1714+ return (node .Type == html .TextNode && ! hasTextContent ( node )) ||
16581715 (node .Type == html .ElementNode && dom .TagName (node ) == "br" )
16591716}
16601717
1661- // getInnerText gets the inner text of a node.
1662- // This also strips * out any excess whitespace to be found.
1663- // In Readability.js, normalizeSpaces default to true.
1718+ // getInnerText gets the inner text of a node. This also strips out any excess
1719+ // whitespace to be found. In Readability.js, normalizeSpaces defaults to true.
16641720func (ps * Parser ) getInnerText (node * html.Node , normalizeSpaces bool ) string {
1665- textContent := strings . TrimSpace ( dom .TextContent (node ) )
1721+ textContent := dom .TextContent (node )
16661722 if normalizeSpaces {
1667- textContent = re2go . NormalizeSpaces (textContent )
1723+ return normalizeWhitespace (textContent )
16681724 }
1669- return textContent
1670- }
1671-
1672- // getCharCount returns the number of times a string s
1673- // appears in the node.
1674- func (ps * Parser ) getCharCount (node * html.Node , s string ) int {
1675- innerText := ps .getInnerText (node , true )
1676- return strings .Count (innerText , s )
1725+ return strings .TrimSpace (textContent )
16771726}
16781727
16791728// cleanStyles removes the style attribute on every node and under.
@@ -1702,7 +1751,7 @@ func (ps *Parser) cleanStyles(node *html.Node) {
17021751// content. This is the amount of text that is inside a link divided
17031752// by the total text in the node.
17041753func (ps * Parser ) getLinkDensity (element * html.Node ) float64 {
1705- textLength := charCount ( ps . getInnerText ( element , true ) )
1754+ textLength , _ := countCharsAndCommas ( element )
17061755 if textLength == 0 {
17071756 return 0
17081757 }
@@ -1717,7 +1766,7 @@ func (ps *Parser) getLinkDensity(element *html.Node) float64 {
17171766 coefficient = 0.3
17181767 }
17191768
1720- nodeLength := charCount ( ps . getInnerText ( linkNode , true ) )
1769+ nodeLength , _ := countCharsAndCommas ( linkNode )
17211770 linkLength += float64 (nodeLength ) * coefficient
17221771 })
17231772
@@ -2019,10 +2068,11 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
20192068 var listLength int
20202069 listNodes := ps .getAllNodesWithTag (node , "ul" , "ol" )
20212070 ps .forEachNode (listNodes , func (list * html.Node , _ int ) {
2022- listLength += charCount (ps .getInnerText (list , true ))
2071+ n , _ := countCharsAndCommas (list )
2072+ listLength += n
20232073 })
20242074
2025- nodeLength := charCount ( ps . getInnerText ( node , true ) )
2075+ nodeLength , _ := countCharsAndCommas ( node )
20262076 isList = float64 (listLength )/ float64 (nodeLength ) > 0.9
20272077 }
20282078
@@ -2041,14 +2091,16 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
20412091 return true
20422092 }
20432093
2044- if ps .getCharCount (node , "," ) < 10 {
2094+ // FIXME: countCharsAndCommas(node) was already called for non-lists above
2095+ if contentLength , commaCount := countCharsAndCommas (node ); commaCount < 10 {
20452096 // If there are not very many commas, and the number of
20462097 // non-paragraph elements is more than paragraphs or other
20472098 // ominous signs, remove the element.
20482099 p := float64 (len (dom .GetElementsByTagName (node , "p" )))
20492100 img := float64 (len (dom .GetElementsByTagName (node , "img" )))
20502101 li := float64 (len (dom .GetElementsByTagName (node , "li" )) - 100 )
20512102 input := float64 (len (dom .GetElementsByTagName (node , "input" )))
2103+ // FIXME: this also calls countCharsAndCommas(node)
20522104 headingDensity := ps .getTextDensity (node , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" )
20532105
20542106 embedCount := 0
@@ -2071,8 +2123,8 @@ func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
20712123 embedCount ++
20722124 }
20732125
2126+ // FIXME: this also calls countCharsAndCommas(node)
20742127 linkDensity := ps .getLinkDensity (node )
2075- contentLength := charCount (ps .getInnerText (node , true ))
20762128 haveToRemove := (img > 1 && p / img < 0.5 && ! ps .hasAncestorTag (node , "figure" , 3 , nil )) ||
20772129 (! isList && li > p ) ||
20782130 (input > math .Floor (p / 3 )) ||
0 commit comments