stormlightlabs
diff --git a/‎internal/articles/heuristics.go‎
Lines changed: 458 additions & 0 deletions b/‎internal/articles/heuristics.go‎
Lines changed: 458 additions & 0 deletions
diff --git a/‎internal/articles/heuristics_test.go‎
Lines changed: 443 additions & 0 deletions b/‎internal/articles/heuristics_test.go‎
Lines changed: 443 additions & 0 deletions
diff --git a/‎internal/articles/metadata.go‎
Lines changed: 305 additions & 0 deletions b/‎internal/articles/metadata.go‎
Lines changed: 305 additions & 0 deletions
@@ -0,0 +1,305 @@
+package articles
+
+import (
+	"encoding/json"
+	"strings"
+
+	"github.com/antchfx/htmlquery"
+	"golang.org/x/net/html"
+)
+
+// MetadataExtractor implements multi-strategy metadata extraction from HTML documents.
+// It attempts to extract article metadata using OpenGraph, Schema.org, meta tags,
+// and semantic HTML5 elements, with fallback chains for each field.
+//
+// The type has no fields, so it carries no state between calls.
+type MetadataExtractor struct{}
+
+// NewMetadataExtractor returns a pointer to a freshly allocated MetadataExtractor.
+func NewMetadataExtractor() *MetadataExtractor {
+	return new(MetadataExtractor)
+}
+
+// ExtractMetadata runs every field extractor against doc and collects the
+// outcome in a single ExtractionResult. A nil doc yields an empty result.
+func (m *MetadataExtractor) ExtractMetadata(doc *html.Node) *ExtractionResult {
+	if doc == nil {
+		return &ExtractionResult{}
+	}
+
+	// Field expressions are evaluated top to bottom, matching the order in
+	// which the individual extractors are meant to run.
+	return &ExtractionResult{
+		Title:         m.ExtractTitle(doc),
+		Author:        m.ExtractAuthor(doc),
+		PublishedDate: m.ExtractPublishedDate(doc),
+		SiteName:      m.ExtractSiteName(doc),
+		Language:      m.ExtractLanguage(doc),
+	}
+}
+
+// ExtractTitle extracts the article title using multiple strategies.
+//
+// Sources are tried in order and the first non-empty value wins:
+// og:title, Schema.org "headline", Schema.org "name", twitter:title,
+// article:title, the first <h1>, and finally the <title> element.
+// Returns "" when doc is nil or no source yields a value.
+func (m *MetadataExtractor) ExtractTitle(doc *html.Node) string {
+	if doc == nil {
+		return ""
+	}
+
+	if title := m.getMetaContent(doc, "property", "og:title"); title != "" {
+		return title
+	}
+
+	if title := m.getSchemaOrgField(doc, "headline"); title != "" {
+		return title
+	}
+
+	if title := m.getSchemaOrgField(doc, "name"); title != "" {
+		return title
+	}
+
+	if title := m.getMetaContent(doc, "name", "twitter:title"); title != "" {
+		return title
+	}
+
+	if title := m.getMetaContent(doc, "property", "article:title"); title != "" {
+		return title
+	}
+
+	// Trim before testing for emptiness: a whitespace-only <h1> must not
+	// short-circuit the chain and hide a usable <title>.
+	if h1 := htmlquery.FindOne(doc, "//h1"); h1 != nil {
+		if title := strings.TrimSpace(htmlquery.InnerText(h1)); title != "" {
+			return title
+		}
+	}
+
+	if titleNode := htmlquery.FindOne(doc, "//title"); titleNode != nil {
+		if title := strings.TrimSpace(htmlquery.InnerText(titleNode)); title != "" {
+			return title
+		}
+	}
+
+	return ""
+}
+
+// ExtractAuthor extracts the article author using multiple strategies.
+//
+// Sources are tried in order and the first non-empty value wins:
+// og:author, Schema.org "author", article:author, twitter:creator, the
+// "author" meta tag, an <a rel="author"> link, and finally elements whose
+// class attribute mentions "author" or "byline".
+// Returns "" when doc is nil or no source yields a value.
+func (m *MetadataExtractor) ExtractAuthor(doc *html.Node) string {
+	if doc == nil {
+		return ""
+	}
+
+	if author := m.getMetaContent(doc, "property", "og:author"); author != "" {
+		return author
+	}
+
+	if author := m.getSchemaOrgField(doc, "author"); author != "" {
+		return author
+	}
+
+	if author := m.getMetaContent(doc, "property", "article:author"); author != "" {
+		return author
+	}
+
+	if author := m.getMetaContent(doc, "name", "twitter:creator"); author != "" {
+		return author
+	}
+
+	if author := m.getMetaContent(doc, "name", "author"); author != "" {
+		return author
+	}
+
+	// Trim before testing for emptiness so that a whitespace-only element
+	// falls through to the next strategy instead of ending the search.
+	if authorLink := htmlquery.FindOne(doc, "//a[@rel='author']"); authorLink != nil {
+		if author := strings.TrimSpace(htmlquery.InnerText(authorLink)); author != "" {
+			return author
+		}
+	}
+
+	bylineSelectors := []string{
+		"//span[contains(@class, 'author')]",
+		"//div[contains(@class, 'author')]",
+		"//p[contains(@class, 'byline')]",
+		"//span[contains(@class, 'byline')]",
+	}
+
+	for _, selector := range bylineSelectors {
+		if node := htmlquery.FindOne(doc, selector); node != nil {
+			if author := strings.TrimSpace(htmlquery.InnerText(node)); author != "" {
+				return author
+			}
+		}
+	}
+
+	return ""
+}
+
+// ExtractPublishedDate returns the publication date string of the document.
+//
+// Lookups run lazily, in order, until one produces a non-empty value:
+// og:published_time, Schema.org "datePublished", Schema.org "publishDate",
+// article:published_time, the "publication_date" and "date" meta tags, and
+// finally the datetime attribute of the first <time datetime> element.
+// The value is returned as found; no date parsing is performed here.
+func (m *MetadataExtractor) ExtractPublishedDate(doc *html.Node) string {
+	if doc == nil {
+		return ""
+	}
+
+	// Closures keep each lookup deferred until the previous one came back empty.
+	lookups := []func() string{
+		func() string { return m.getMetaContent(doc, "property", "og:published_time") },
+		func() string { return m.getSchemaOrgField(doc, "datePublished") },
+		func() string { return m.getSchemaOrgField(doc, "publishDate") },
+		func() string { return m.getMetaContent(doc, "property", "article:published_time") },
+		func() string { return m.getMetaContent(doc, "name", "publication_date") },
+		func() string { return m.getMetaContent(doc, "name", "date") },
+	}
+
+	for _, lookup := range lookups {
+		if found := lookup(); found != "" {
+			return found
+		}
+	}
+
+	timeNode := htmlquery.FindOne(doc, "//time[@datetime]")
+	if timeNode == nil {
+		return ""
+	}
+
+	for i := range timeNode.Attr {
+		if timeNode.Attr[i].Key == "datetime" {
+			return timeNode.Attr[i].Val
+		}
+	}
+
+	return ""
+}
+
+// ExtractSiteName returns the name of the publishing site.
+//
+// The first non-empty value among og:site_name, the Schema.org "publisher"
+// field, and the "application-name" meta tag is returned; "" otherwise.
+func (m *MetadataExtractor) ExtractSiteName(doc *html.Node) string {
+	if doc == nil {
+		return ""
+	}
+
+	// Each fallback is consulted only while the result is still empty.
+	name := m.getMetaContent(doc, "property", "og:site_name")
+	if name == "" {
+		name = m.getSchemaOrgField(doc, "publisher")
+	}
+	if name == "" {
+		name = m.getMetaContent(doc, "name", "application-name")
+	}
+
+	return name
+}
+
+// ExtractLanguage extracts the document language.
+//
+// Sources are tried in order and the first non-empty value wins: the lang
+// attribute of the <html> element, og:locale, and the content-language
+// http-equiv meta tag. Returns "" when doc is nil or nothing is found.
+func (m *MetadataExtractor) ExtractLanguage(doc *html.Node) string {
+	if doc == nil {
+		return ""
+	}
+
+	if htmlNode := htmlquery.FindOne(doc, "//html"); htmlNode != nil {
+		for _, attr := range htmlNode.Attr {
+			if attr.Key != "lang" {
+				continue
+			}
+			// An empty or blank lang attribute carries no information, so
+			// fall through to the meta-tag strategies instead of returning it.
+			if lang := strings.TrimSpace(attr.Val); lang != "" {
+				return lang
+			}
+			break
+		}
+	}
+
+	if locale := m.getMetaContent(doc, "property", "og:locale"); locale != "" {
+		return locale
+	}
+
+	if lang := m.getMetaContent(doc, "http-equiv", "content-language"); lang != "" {
+		return lang
+	}
+
+	return ""
+}
+
+// getMetaContent retrieves the content attribute from a meta tag.
+//
+// It scans every <meta> element in document order and returns the trimmed
+// content of the first one whose attrName attribute equals attrValue
+// (compared case-insensitively, so "Content-Language" matches
+// "content-language") and whose content is non-empty. Returns "" when doc
+// is nil or no such tag exists.
+func (m *MetadataExtractor) getMetaContent(doc *html.Node, attrName, attrValue string) string {
+	if doc == nil {
+		return ""
+	}
+
+	// A fixed expression is used instead of interpolating attrName/attrValue
+	// into the XPath: htmlquery panics on an invalid expression, which a value
+	// containing a quote would produce.
+	for _, metaNode := range htmlquery.Find(doc, "//meta") {
+		matched := false
+		content := ""
+
+		for _, attr := range metaNode.Attr {
+			if attr.Key == attrName && strings.EqualFold(attr.Val, attrValue) {
+				matched = true
+			}
+			if attr.Key == "content" {
+				content = strings.TrimSpace(attr.Val)
+			}
+		}
+
+		// Keep looking when the matching tag has no usable content; a later
+		// duplicate may still provide one.
+		if matched && content != "" {
+			return content
+		}
+	}
+
+	return ""
+}
+
+// getSchemaOrgField extracts a field from Schema.org JSON-LD structured data.
+//
+// Every <script type="application/ld+json"> element is decoded in document
+// order; scripts that are empty or contain invalid JSON are skipped. The
+// decoded value may be a single node, a top-level array of nodes, or a node
+// with an @graph container. Returns the first non-empty, trimmed value of
+// fieldName found on an Article-typed Schema.org node, or "" if none exists.
+func (m *MetadataExtractor) getSchemaOrgField(doc *html.Node, fieldName string) string {
+	if doc == nil {
+		return ""
+	}
+
+	for _, script := range htmlquery.Find(doc, "//script[@type='application/ld+json']") {
+		if script.FirstChild == nil || script.FirstChild.Type != html.TextNode {
+			continue
+		}
+
+		// Decode into any rather than a map so that a top-level JSON array
+		// is accepted instead of failing to unmarshal.
+		var payload any
+		if err := json.Unmarshal([]byte(script.FirstChild.Data), &payload); err != nil {
+			continue
+		}
+
+		// An empty result means "not found here"; keep scanning later scripts.
+		if value := m.findSchemaOrgField(payload, fieldName, false); value != "" {
+			return value
+		}
+	}
+
+	return ""
+}
+
+// findSchemaOrgField searches a decoded JSON-LD value for fieldName.
+//
+// inSchemaOrg reports whether an enclosing node already declared a
+// Schema.org @context; @graph members rely on it because they normally
+// carry no @context of their own. A node's own @context, when present,
+// must be a string containing "schema.org" for the node to be considered.
+func (m *MetadataExtractor) findSchemaOrgField(node any, fieldName string, inSchemaOrg bool) string {
+	switch v := node.(type) {
+	case []any:
+		for _, item := range v {
+			if found := m.findSchemaOrgField(item, fieldName, inSchemaOrg); found != "" {
+				return found
+			}
+		}
+	case map[string]any:
+		if context, hasContext := v["@context"]; hasContext {
+			contextStr, ok := context.(string)
+			inSchemaOrg = ok && strings.Contains(contextStr, "schema.org")
+		}
+		if !inSchemaOrg {
+			return ""
+		}
+
+		if m.isArticleType(v["@type"]) {
+			// A missing key yields nil, which extractStringValue maps to "".
+			if found := strings.TrimSpace(m.extractStringValue(v[fieldName])); found != "" {
+				return found
+			}
+		}
+
+		if graph, hasGraph := v["@graph"]; hasGraph {
+			return m.findSchemaOrgField(graph, fieldName, inSchemaOrg)
+		}
+	}
+
+	return ""
+}
+
+// isArticleType reports whether a JSON-LD @type value, given either as a
+// string or as an array of strings, names a type containing "Article"
+// (which also covers variants such as "NewsArticle").
+func (m *MetadataExtractor) isArticleType(typeVal any) bool {
+	switch t := typeVal.(type) {
+	case string:
+		return strings.Contains(t, "Article")
+	case []any:
+		for _, item := range t {
+			if name, ok := item.(string); ok && strings.Contains(name, "Article") {
+				return true
+			}
+		}
+	}
+
+	return false
+}
+
+// extractStringValue extracts a string from various JSON value types.
+func (m *MetadataExtractor) extractStringValue(value any) string {
+	switch v := value.(type) {
+	case string:
+		return v
+	case map[string]any:
+		if name, exists := v["name"]; exists {
+			if nameStr, ok := name.(string); ok {
+				return nameStr
+			}
+		}
+	case []any:
+		if len(v) > 0 {
+			return m.extractStringValue(v[0])
+		}
+	}
+	return ""
+}