Skip to content

Commit eb64822

Browse files
committed
feat(articles): added content scoring and readability assessment features
* Implemented new Scorer type for heuristic scoring of HTML content based on readability principles. * Developed confidence calculation method to assess the reliability of content scores.
1 parent b5d8851 commit eb64822

File tree

10 files changed

+2858
-136
lines changed

10 files changed

+2858
-136
lines changed

internal/articles/heuristics.go

Lines changed: 458 additions & 0 deletions
Large diffs are not rendered by default.

internal/articles/heuristics_test.go

Lines changed: 443 additions & 0 deletions
Large diffs are not rendered by default.

internal/articles/metadata.go

Lines changed: 305 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,305 @@
1+
package articles
2+
3+
import (
4+
"encoding/json"
5+
"strings"
6+
7+
"github.com/antchfx/htmlquery"
8+
"golang.org/x/net/html"
9+
)
10+
11+
// MetadataExtractor pulls article metadata out of a parsed HTML document.
//
// Each field is resolved through its own fallback chain that may consult
// OpenGraph meta tags, Schema.org JSON-LD, other meta tags and semantic
// HTML elements. The type carries no state.
type MetadataExtractor struct{}

// NewMetadataExtractor returns a ready-to-use, non-nil *MetadataExtractor.
func NewMetadataExtractor() *MetadataExtractor {
	return new(MetadataExtractor)
}
20+
21+
// ExtractMetadata extracts all available metadata from an HTML document.
22+
// Returns an ExtractionResult with populated metadata fields.
23+
func (m *MetadataExtractor) ExtractMetadata(doc *html.Node) *ExtractionResult {
24+
if doc == nil {
25+
return &ExtractionResult{}
26+
}
27+
28+
result := &ExtractionResult{}
29+
30+
result.Title = m.ExtractTitle(doc)
31+
result.Author = m.ExtractAuthor(doc)
32+
result.PublishedDate = m.ExtractPublishedDate(doc)
33+
result.SiteName = m.ExtractSiteName(doc)
34+
result.Language = m.ExtractLanguage(doc)
35+
36+
return result
37+
}
38+
39+
// ExtractTitle extracts the article title using multiple strategies.
40+
// Tries in order: OpenGraph, Schema.org, meta tags, h1, title tag.
41+
func (m *MetadataExtractor) ExtractTitle(doc *html.Node) string {
42+
if doc == nil {
43+
return ""
44+
}
45+
46+
if title := m.getMetaContent(doc, "property", "og:title"); title != "" {
47+
return title
48+
}
49+
50+
if title := m.getSchemaOrgField(doc, "headline"); title != "" {
51+
return title
52+
}
53+
54+
if title := m.getSchemaOrgField(doc, "name"); title != "" {
55+
return title
56+
}
57+
58+
if title := m.getMetaContent(doc, "name", "twitter:title"); title != "" {
59+
return title
60+
}
61+
62+
if title := m.getMetaContent(doc, "property", "article:title"); title != "" {
63+
return title
64+
}
65+
66+
if h1 := htmlquery.FindOne(doc, "//h1"); h1 != nil {
67+
if title := htmlquery.InnerText(h1); title != "" {
68+
return strings.TrimSpace(title)
69+
}
70+
}
71+
72+
if titleNode := htmlquery.FindOne(doc, "//title"); titleNode != nil {
73+
if title := htmlquery.InnerText(titleNode); title != "" {
74+
return strings.TrimSpace(title)
75+
}
76+
}
77+
78+
return ""
79+
}
80+
81+
// ExtractAuthor extracts the article author using multiple strategies.
82+
// Tries in order: OpenGraph, Schema.org, meta tags, rel=author, byline elements.
83+
func (m *MetadataExtractor) ExtractAuthor(doc *html.Node) string {
84+
if doc == nil {
85+
return ""
86+
}
87+
88+
if author := m.getMetaContent(doc, "property", "og:author"); author != "" {
89+
return author
90+
}
91+
92+
if author := m.getSchemaOrgField(doc, "author"); author != "" {
93+
return author
94+
}
95+
96+
if author := m.getMetaContent(doc, "property", "article:author"); author != "" {
97+
return author
98+
}
99+
100+
if author := m.getMetaContent(doc, "name", "twitter:creator"); author != "" {
101+
return author
102+
}
103+
104+
if author := m.getMetaContent(doc, "name", "author"); author != "" {
105+
return author
106+
}
107+
108+
if authorLink := htmlquery.FindOne(doc, "//a[@rel='author']"); authorLink != nil {
109+
if author := htmlquery.InnerText(authorLink); author != "" {
110+
return strings.TrimSpace(author)
111+
}
112+
}
113+
114+
bylineSelectors := []string{
115+
"//span[contains(@class, 'author')]",
116+
"//div[contains(@class, 'author')]",
117+
"//p[contains(@class, 'byline')]",
118+
"//span[contains(@class, 'byline')]",
119+
}
120+
121+
for _, selector := range bylineSelectors {
122+
if node := htmlquery.FindOne(doc, selector); node != nil {
123+
if author := htmlquery.InnerText(node); author != "" {
124+
return strings.TrimSpace(author)
125+
}
126+
}
127+
}
128+
129+
return ""
130+
}
131+
132+
// ExtractPublishedDate extracts the publication date using multiple strategies.
133+
// Tries in order: OpenGraph, Schema.org, article:published_time, time elements.
134+
func (m *MetadataExtractor) ExtractPublishedDate(doc *html.Node) string {
135+
if doc == nil {
136+
return ""
137+
}
138+
139+
if date := m.getMetaContent(doc, "property", "og:published_time"); date != "" {
140+
return date
141+
}
142+
143+
if date := m.getSchemaOrgField(doc, "datePublished"); date != "" {
144+
return date
145+
}
146+
147+
if date := m.getSchemaOrgField(doc, "publishDate"); date != "" {
148+
return date
149+
}
150+
151+
if date := m.getMetaContent(doc, "property", "article:published_time"); date != "" {
152+
return date
153+
}
154+
155+
if date := m.getMetaContent(doc, "name", "publication_date"); date != "" {
156+
return date
157+
}
158+
159+
if date := m.getMetaContent(doc, "name", "date"); date != "" {
160+
return date
161+
}
162+
163+
if timeNode := htmlquery.FindOne(doc, "//time[@datetime]"); timeNode != nil {
164+
for _, attr := range timeNode.Attr {
165+
if attr.Key == "datetime" {
166+
return attr.Val
167+
}
168+
}
169+
}
170+
171+
return ""
172+
}
173+
174+
// ExtractSiteName extracts the site name using multiple strategies.
175+
// Tries in order: OpenGraph, Schema.org, meta tags.
176+
func (m *MetadataExtractor) ExtractSiteName(doc *html.Node) string {
177+
if doc == nil {
178+
return ""
179+
}
180+
181+
if siteName := m.getMetaContent(doc, "property", "og:site_name"); siteName != "" {
182+
return siteName
183+
}
184+
185+
if publisher := m.getSchemaOrgField(doc, "publisher"); publisher != "" {
186+
return publisher
187+
}
188+
189+
if siteName := m.getMetaContent(doc, "name", "application-name"); siteName != "" {
190+
return siteName
191+
}
192+
193+
return ""
194+
}
195+
196+
// ExtractLanguage extracts the document language.
197+
// Tries in order: html lang attribute, OpenGraph, meta tags.
198+
func (m *MetadataExtractor) ExtractLanguage(doc *html.Node) string {
199+
if doc == nil {
200+
return ""
201+
}
202+
203+
if htmlNode := htmlquery.FindOne(doc, "//html"); htmlNode != nil {
204+
for _, attr := range htmlNode.Attr {
205+
if attr.Key == "lang" {
206+
return attr.Val
207+
}
208+
}
209+
}
210+
211+
if locale := m.getMetaContent(doc, "property", "og:locale"); locale != "" {
212+
return locale
213+
}
214+
215+
if lang := m.getMetaContent(doc, "http-equiv", "content-language"); lang != "" {
216+
return lang
217+
}
218+
219+
return ""
220+
}
221+
222+
// getMetaContent retrieves the content attribute from a meta tag.
223+
// Searches for meta tags with the specified attribute name and value.
224+
func (m *MetadataExtractor) getMetaContent(doc *html.Node, attrName, attrValue string) string {
225+
if doc == nil {
226+
return ""
227+
}
228+
229+
xpath := "//meta[@" + attrName + "='" + attrValue + "']"
230+
metaNode := htmlquery.FindOne(doc, xpath)
231+
232+
if metaNode == nil {
233+
return ""
234+
}
235+
236+
for _, attr := range metaNode.Attr {
237+
if attr.Key == "content" {
238+
return strings.TrimSpace(attr.Val)
239+
}
240+
}
241+
242+
return ""
243+
}
244+
245+
// getSchemaOrgField extracts a field from Schema.org JSON-LD structured data.
246+
func (m *MetadataExtractor) getSchemaOrgField(doc *html.Node, fieldName string) string {
247+
if doc == nil {
248+
return ""
249+
}
250+
251+
scripts := htmlquery.Find(doc, "//script[@type='application/ld+json']")
252+
253+
for _, script := range scripts {
254+
if script.FirstChild == nil || script.FirstChild.Type != html.TextNode {
255+
continue
256+
}
257+
258+
var data map[string]any
259+
if err := json.Unmarshal([]byte(script.FirstChild.Data), &data); err != nil {
260+
continue
261+
}
262+
263+
context, hasContext := data["@context"]
264+
typeVal, hasType := data["@type"]
265+
266+
if !hasContext || !hasType {
267+
continue
268+
}
269+
270+
contextStr, ok := context.(string)
271+
if !ok || !strings.Contains(contextStr, "schema.org") {
272+
continue
273+
}
274+
275+
typeStr, ok := typeVal.(string)
276+
if !ok || (!strings.Contains(typeStr, "Article") && !strings.Contains(typeStr, "NewsArticle")) {
277+
continue
278+
}
279+
280+
if value, exists := data[fieldName]; exists {
281+
return m.extractStringValue(value)
282+
}
283+
}
284+
285+
return ""
286+
}
287+
288+
// extractStringValue extracts a string from various JSON value types.
289+
func (m *MetadataExtractor) extractStringValue(value any) string {
290+
switch v := value.(type) {
291+
case string:
292+
return v
293+
case map[string]any:
294+
if name, exists := v["name"]; exists {
295+
if nameStr, ok := name.(string); ok {
296+
return nameStr
297+
}
298+
}
299+
case []any:
300+
if len(v) > 0 {
301+
return m.extractStringValue(v[0])
302+
}
303+
}
304+
return ""
305+
}

0 commit comments

Comments
 (0)