Skip to content

Commit a62b97b

Browse files
jvoisin authored and fguillot committed
refactor(readability): get rid of getClassWeight
Its naming was confusing, and its code simple enough that it could be inlined.
1 parent 1de9cf4 commit a62b97b

File tree

2 files changed

+8
-60
lines changed

2 files changed

+8
-60
lines changed

internal/reader/readability/readability.go

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,13 @@ func scoreNode(s *goquery.Selection) *candidate {
318318
c.score -= 5
319319
}
320320

321-
c.score += getClassWeight(s)
321+
if class, ok := s.Attr("class"); ok {
322+
c.score += getWeight(class)
323+
}
324+
if id, ok := s.Attr("id"); ok {
325+
c.score += getWeight(id)
326+
}
327+
322328
return c
323329
}
324330

@@ -335,22 +341,7 @@ func getLinkDensity(s *goquery.Selection) float32 {
335341
return float32(linkLength) / float32(sum)
336342
}
337343

338-
// Get an elements class/id weight. Uses regular expressions to tell if this
339-
// element looks good or bad.
340-
func getClassWeight(s *goquery.Selection) float32 {
341-
weight := 0
342-
343-
if class, ok := s.Attr("class"); ok {
344-
weight += getWeight(class)
345-
}
346-
if id, ok := s.Attr("id"); ok {
347-
weight += getWeight(id)
348-
}
349-
350-
return float32(weight)
351-
}
352-
353-
func getWeight(s string) int {
344+
func getWeight(s string) float32 {
354345
s = strings.ToLower(s)
355346
for _, keyword := range negativeKeywords {
356347
if strings.Contains(s, keyword) {

internal/reader/readability/readability_test.go

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -350,49 +350,6 @@ func TestGetClassWeight(t *testing.T) {
350350
if selection.Length() == 0 {
351351
t.Fatal("No div element found in HTML")
352352
}
353-
354-
result := getClassWeight(selection)
355-
if result != tc.expected {
356-
t.Errorf("Expected weight %f, got %f", tc.expected, result)
357-
}
358-
})
359-
}
360-
}
361-
362-
func TestGetClassWeightRegexPatterns(t *testing.T) {
363-
// Test specific regex patterns used in getClassWeight
364-
positiveWords := []string{"article", "body", "content", "entry", "hentry", "h-entry", "main", "page", "pagination", "post", "text", "blog", "story"}
365-
negativeWords := []string{"hid", "banner", "combx", "comment", "com-", "contact", "foot", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags", "tool", "widget", "byline", "author", "dateline", "writtenby"}
366-
367-
for _, word := range positiveWords {
368-
t.Run("positive_"+word, func(t *testing.T) {
369-
html := `<div class="` + word + `">content</div>`
370-
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
371-
if err != nil {
372-
t.Fatalf("Failed to parse HTML: %v", err)
373-
}
374-
375-
selection := doc.Find("div").First()
376-
result := getClassWeight(selection)
377-
if result != 25 {
378-
t.Errorf("Expected positive weight 25 for word '%s', got %f", word, result)
379-
}
380-
})
381-
}
382-
383-
for _, word := range negativeWords {
384-
t.Run("negative_"+word, func(t *testing.T) {
385-
html := `<div class="` + word + `">content</div>`
386-
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
387-
if err != nil {
388-
t.Fatalf("Failed to parse HTML: %v", err)
389-
}
390-
391-
selection := doc.Find("div").First()
392-
result := getClassWeight(selection)
393-
if result != -25 {
394-
t.Errorf("Expected negative weight -25 for word '%s', got %f", word, result)
395-
}
396353
})
397354
}
398355
}

0 commit comments

Comments
 (0)