diff --git a/internal/re2go/grab-article.go b/internal/re2go/grab-article.go index 52924f9..8b1db0c 100644 --- a/internal/re2go/grab-article.go +++ b/internal/re2go/grab-article.go @@ -1,4 +1,4 @@ -// Code generated by re2go 4.0.2, DO NOT EDIT. +// Code generated by re2go 4.2, DO NOT EDIT. package re2go // Original pattern: (?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote @@ -2024,138 +2024,3 @@ func MaybeItsACandidate(input string) bool { } } - -// Commas as used in Latin, Sindhi, Chinese and various other scripts. -// see: https://en.wikipedia.org/wiki/Comma#Comma_variants -// Original pattern: \u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C -func CountCommas(input string) int { - var count int - var cursor, marker int - input += string(rune(0)) // add terminating null - limit := len(input) - 1 // limit points at the terminating null - _ = marker - - for { - { - var yych byte - yych = input[cursor] - switch yych { - case ',': - goto yy177 - case 0xD8: - goto yy178 - case 0xE2: - goto yy179 - case 0xEF: - goto yy180 - default: - if limit <= cursor { - goto yy187 - } - goto yy175 - } - yy175: - cursor++ - yy176: - { - continue - } - yy177: - cursor++ - { - count++ - continue - } - yy178: - cursor++ - yych = input[cursor] - switch yych { - case 0x8C: - goto yy177 - default: - goto yy176 - } - yy179: - cursor++ - marker = cursor - yych = input[cursor] - switch yych { - case 0xB8: - goto yy181 - case 0xB9: - goto yy183 - default: - goto yy176 - } - yy180: - cursor++ - marker = cursor - yych = input[cursor] - switch yych { - case 0xB8: - goto yy184 - case 0xB9: - goto yy185 - case 0xBC: - goto yy186 - default: - goto yy176 - } - yy181: - cursor++ - yych = input[cursor] - switch yych { - case 0xB2: - fallthrough - case 0xB4: - goto yy177 - default: - goto yy182 - } - yy182: - cursor = marker - goto yy176 - yy183: - cursor++ - yych = input[cursor] - switch yych { - case 0x81: - goto yy177 - default: - goto yy182 - } - yy184: - cursor++ - yych = input[cursor] - switch yych { - case 0x90, 0x91: - goto yy177 - default: - goto yy182 - } - yy185: - cursor++ - yych = input[cursor] - switch yych { - case 0x90: - goto yy177 - default: - goto yy182 - } - yy186: - cursor++ - yych = input[cursor] - switch yych { - case 0x8C: - goto yy177 - default: - goto yy182 - } - yy187: - { - return count - } - } - - } -} diff --git a/internal/re2go/grab-article.re b/internal/re2go/grab-article.re index c9f861f..81e0cd8 100644 --- a/internal/re2go/grab-article.re +++ b/internal/re2go/grab-article.re @@ -37,25 +37,3 @@ func MaybeItsACandidate(input string) bool { */ } } - -// Commas as used in Latin, Sindhi, Chinese and various other scripts. -// see: https://en.wikipedia.org/wiki/Comma#Comma_variants -// Original pattern: \u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C -func CountCommas(input string) int { - var count int - var cursor, marker int - input += string(rune(0)) // add terminating null - limit := len(input) - 1 // limit points at the terminating null - _ = marker - - for { /*!use:re2c:base_template - re2c:case-insensitive = 1; - - commas = [\u002C\u060C\uFE50\uFE10\uFE11\u2E41\u2E34\u2E32\uFF0C]; - - {commas} { count++; continue } - * { continue } - $ { return count } - */ - } -} \ No newline at end of file diff --git a/internal/re2go/re2go_test.go b/internal/re2go/re2go_test.go index 1ebe32e..99abb5d 100644 --- a/internal/re2go/re2go_test.go +++ b/internal/re2go/re2go_test.go @@ -152,11 +152,6 @@ func Test_MaybeItsACandidate(t *testing.T) { assert.False(t, MaybeItsACandidate(`
Paragraph text
`)) } -func Test_CountCommas(t *testing.T) { - assert.Equal(t, 3, CountCommas("my,name,is,john")) - assert.Equal(t, 9, CountCommas("now,its،a mixed﹐commas︐from︑various⹁place⸴and⸲country,")) -} - func Test_NormalizeSpaces(t *testing.T) { assert.Equal(t, "some sentence", NormalizeSpaces("some sentence")) assert.Equal(t, "with tabs", NormalizeSpaces("with \t \ttabs")) diff --git a/parser.go b/parser.go index 6e9ad45..8c704bb 100644 --- a/parser.go +++ b/parser.go @@ -23,9 +23,7 @@ import ( var ( rxVideos = regexp.MustCompile(`(?i)//(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)`) rxTokenize = regexp.MustCompile(`(?i)\W+`) - rxWhitespace = regexp.MustCompile(`(?i)^\s*$`) rxHasContent = regexp.MustCompile(`(?i)\S$`) - rxHashURL = regexp.MustCompile(`(?i)^#.+`) rxPropertyPattern = regexp.MustCompile(`(?i)\s*(dc|dcterm|og|article|twitter)\s*:\s*(author|creator|description|title|site_name|published_time|modified_time|image\S*)\s*`) rxNamePattern = regexp.MustCompile(`(?i)^\s*(?:(dc|dcterm|article|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name|published_time|modified_time|image)\s*$`) rxTitleSeparator = regexp.MustCompile(`(?i) [\|\-\\/>»] `) @@ -405,8 +403,7 @@ func (ps *Parser) getArticleTitle() string { } } - curTitle = strings.TrimSpace(curTitle) - curTitle = re2go.NormalizeSpaces(curTitle) + curTitle = normalizeWhitespace(curTitle) // If we now have 4 words or fewer as our title, and either no // 'hierarchical' separators (\, /, > or ») were found in the original // title or we decreased the number of words by more than 1 word, use @@ -448,7 +445,7 @@ func (ps *Parser) prepDocument() { // same node is returned. func (ps *Parser) nextNode(node *html.Node) *html.Node { next := node - for next != nil && next.Type != html.ElementNode && rxWhitespace.MatchString(dom.TextContent(next)) { + for next != nil && next.Type != html.ElementNode && !hasTextContent(next) { next = next.NextSibling } return next @@ -589,7 +586,7 @@ func (ps *Parser) prepArticle(articleContent *html.Node) { iframeCount := len(dom.GetElementsByTagName(p, "iframe")) totalCount := imgCount + embedCount + objectCount + iframeCount - return totalCount == 0 && ps.getInnerText(p, false) == "" + return totalCount == 0 && !hasTextContent(p) }) ps.forEachNode(dom.GetElementsByTagName(articleContent, "br"), func(br *html.Node, _ int) { @@ -714,31 +711,19 @@ func (ps *Parser) checkByline(node *html.Node, matchString string) bool { rel := dom.GetAttribute(node, "rel") itemprop := dom.GetAttribute(node, "itemprop") - nodeText := dom.TextContent(node) - if (rel == "author" || strings.Contains(itemprop, "author") || re2go.IsByline(matchString)) && - ps.isValidByline(nodeText) { - nodeText = strings.TrimSpace(nodeText) - nodeText = strings.Join(strings.Fields(nodeText), " ") - ps.articleByline = nodeText - return true + if rel != "author" && !strings.Contains(itemprop, "author") && !re2go.IsByline(matchString) { + return false } - return false -} - -func (ps *Parser) getTextDensity(node *html.Node, tags ...string) float64 { - textLength := charCount(ps.getInnerText(node, true)) - if textLength == 0 { - return 0 + nodeText := ps.getInnerText(node, false) + // For now, it's intentional that counting characters happens before + // whitespace normalization. Doing it the other way around breaks several + // tests and the bylines end up different. + if nChar := charCount(nodeText); nChar > 0 && nChar < 100 { + ps.articleByline = normalizeWhitespace(nodeText) + return true } - - var childrenLength int - children := ps.getAllNodesWithTag(node, tags...) - ps.forEachNode(children, func(child *html.Node, _ int) { - childrenLength += charCount(ps.getInnerText(child, true)) - }) - - return float64(childrenLength) / float64(textLength) + return false } // getNodeAncestors gets the node's direct parent and grandparents. @@ -816,7 +801,7 @@ func (ps *Parser) grabArticle() *html.Node { if shouldRemoveTitleHeader && ps.headerDuplicatesTitle(node) { ps.logf("removing header: %q duplicate of %q\n", - trim(dom.TextContent(node)), trim(ps.articleTitle)) + ps.getInnerText(node, true), normalizeWhitespace(ps.articleTitle)) shouldRemoveTitleHeader = false node = ps.removeAndGetNext(node) continue @@ -911,9 +896,9 @@ func (ps *Parser) grabArticle() *html.Node { return } + numChars, numCommas := countCharsAndCommas(elementToScore) // If this paragraph is less than 25 characters, don't even count it. - innerText := ps.getInnerText(elementToScore, true) - if charCount(innerText) < 25 { + if numChars < 25 { return } @@ -927,10 +912,10 @@ func (ps *Parser) grabArticle() *html.Node { contentScore := 1 // Add points for any commas within this paragraph. - contentScore += re2go.CountCommas(innerText) + contentScore += numCommas // For every 100 characters in this paragraph, add another point. Up to 3 points. - contentScore += int(math.Min(math.Floor(float64(charCount(innerText))/100.0), 3.0)) + contentScore += int(math.Min(math.Floor(float64(numChars)/100.0), 3.0)) // Initialize and score ancestors. ps.forEachNode(ancestors, func(ancestor *html.Node, level int) { @@ -1130,6 +1115,7 @@ func (ps *Parser) grabArticle() *html.Node { appendNode = true } else if dom.TagName(sibling) == "p" { linkDensity := ps.getLinkDensity(sibling) + // FIXME: avoid gathering nodeContent just to detect whether there was a sentence period nodeContent := ps.getInnerText(sibling, true) nodeLength := charCount(nodeContent) @@ -1199,7 +1185,7 @@ func (ps *Parser) grabArticle() *html.Node { // gives us a higher likelihood of finding the content, and // the sieve approach gives us a higher likelihood of // finding the -right- content. - textLength := charCount(ps.getInnerText(articleContent, true)) + textLength, _ := countCharsAndCommas(articleContent) if textLength < ps.CharThresholds { parseSuccessful = false @@ -1249,15 +1235,6 @@ func (ps *Parser) grabArticle() *html.Node { } } -// isValidByline checks whether the input string could be a byline. -// This verifies that the input is a string, and that the length -// is less than 100 chars. -func (ps *Parser) isValidByline(byline string) bool { - byline = strings.TrimSpace(byline) - nChar := charCount(byline) - return nChar > 0 && nChar < 100 -} - // getJSONLD try to extract metadata from JSON-LD object. // For now, only Schema.org objects of type Article or its subtypes are supported. func (ps *Parser) getJSONLD() (map[string]string, error) { @@ -1515,8 +1492,7 @@ func (ps *Parser) isSingleImage(node *html.Node) bool { } children := dom.Children(node) - textContent := dom.TextContent(node) - if len(children) != 1 || strings.TrimSpace(textContent) != "" { + if len(children) != 1 || hasTextContent(node) { return false } @@ -1624,15 +1600,23 @@ func (ps *Parser) hasSingleTagInsideElement(element *html.Node, tag string) bool } // isElementWithoutContent determines if node is empty -// or only fille with