Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 64 additions & 7 deletions internal/reader/subscription/finder.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,11 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string,
return Subscriptions{NewSubscription(responseHandler.EffectiveURL(), responseHandler.EffectiveURL(), feedFormat)}, nil
}

// Step 2) Check if the website URL is a YouTube channel.
// Step 2) Find the canonical URL of the website.
slog.Debug("Try to find the canonical URL of the website", slog.String("website_url", websiteURL))
websiteURL = f.findCanonicalURL(websiteURL, responseHandler.ContentType(), bytes.NewReader(responseBody))

// Step 3) Check if the website URL is a YouTube channel.
slog.Debug("Try to detect feeds for a YouTube page", slog.String("website_url", websiteURL))
if subscriptions, localizedError := f.findSubscriptionsFromYouTube(websiteURL); localizedError != nil {
return nil, localizedError
Expand All @@ -78,7 +82,7 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string,
return subscriptions, nil
}

// Step 3) Parse web page to find feeds from HTML meta tags.
// Step 4) Parse web page to find feeds from HTML meta tags.
slog.Debug("Try to detect feeds from HTML meta tags",
slog.String("website_url", websiteURL),
slog.String("content_type", responseHandler.ContentType()),
Expand All @@ -90,7 +94,7 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string,
return subscriptions, nil
}

// Step 4) Check if the website URL can use RSS-Bridge.
// Step 5) Check if the website URL can use RSS-Bridge.
if rssBridgeURL != "" {
slog.Debug("Try to detect feeds with RSS-Bridge", slog.String("website_url", websiteURL))
if subscriptions, localizedError := f.findSubscriptionsFromRSSBridge(websiteURL, rssBridgeURL, rssBridgeToken); localizedError != nil {
Expand All @@ -101,7 +105,7 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string,
}
}

// Step 5) Check if the website has a known feed URL.
// Step 6) Check if the website has a known feed URL.
slog.Debug("Try to detect feeds from well-known URLs", slog.String("website_url", websiteURL))
if subscriptions, localizedError := f.findSubscriptionsFromWellKnownURLs(websiteURL); localizedError != nil {
return nil, localizedError
Expand Down Expand Up @@ -274,6 +278,15 @@ func (f *subscriptionFinder) findSubscriptionsFromRSSBridge(websiteURL, rssBridg
}

func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
playlistPrefixes := []struct {
prefix string
title string
}{
{"UULF", "Videos"},
{"UUSH", "Short videos"},
{"UULV", "Live streams"},
}

decodedURL, err := url.Parse(websiteURL)
if err != nil {
return nil, locale.NewLocalizedErrorWrapper(err, "error.invalid_site_url", err)
Expand All @@ -283,9 +296,19 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su
slog.Debug("YouTube feed discovery skipped: not a YouTube domain", slog.String("website_url", websiteURL))
return nil, nil
}
if _, channelID, found := strings.Cut(decodedURL.Path, "channel/"); found {
feedURL := "https://www.youtube.com/feeds/videos.xml?channel_id=" + channelID
return Subscriptions{NewSubscription(decodedURL.String(), feedURL, parser.FormatAtom)}, nil

if _, baseID, found := strings.Cut(decodedURL.Path, "channel/UC"); found {
var subscriptions Subscriptions

channelFeedURL := "https://www.youtube.com/feeds/videos.xml?channel_id=UC" + baseID
subscriptions = append(subscriptions, NewSubscription("Channel", channelFeedURL, parser.FormatAtom))

for _, playlist := range playlistPrefixes {
playlistFeedURL := "https://www.youtube.com/feeds/videos.xml?playlist_id=" + playlist.prefix + baseID
subscriptions = append(subscriptions, NewSubscription(playlist.title, playlistFeedURL, parser.FormatAtom))
}

return subscriptions, nil
}

if strings.HasPrefix(decodedURL.Path, "/watch") || strings.HasPrefix(decodedURL.Path, "/playlist") {
Expand All @@ -297,3 +320,37 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su

return nil, nil
}

// findCanonicalURL reads an HTML document from body and looks for the first
// <link rel="canonical"> element (the rel value is matched case-insensitively).
//
// The href of that element is resolved with urllib.AbsoluteURL against the
// document's <base href> when that value is an absolute URL, and against
// effectiveURL otherwise.
//
// effectiveURL is returned unchanged whenever the document cannot be read or
// parsed, no canonical link with a non-blank href exists, or the href cannot
// be resolved.
func (f *subscriptionFinder) findCanonicalURL(effectiveURL, contentType string, body io.Reader) string {
	charsetReader, err := encoding.NewCharsetReader(body, contentType)
	if err != nil {
		return effectiveURL
	}

	document, err := goquery.NewDocumentFromReader(charsetReader)
	if err != nil {
		return effectiveURL
	}

	// A <base href> only overrides the effective URL when it is absolute.
	resolveAgainst := effectiveURL
	rawBase, hasBase := document.FindMatcher(goquery.Single("head base")).Attr("href")
	if hasBase {
		if trimmedBase := strings.TrimSpace(rawBase); urllib.IsAbsoluteURL(trimmedBase) {
			resolveAgainst = trimmedBase
		}
	}

	rawCanonical, hasCanonical := document.Find("link[rel='canonical' i]").First().Attr("href")
	if !hasCanonical {
		return effectiveURL
	}

	trimmedCanonical := strings.TrimSpace(rawCanonical)
	if trimmedCanonical == "" {
		return effectiveURL
	}

	resolved, err := urllib.AbsoluteURL(resolveAgainst, trimmedCanonical)
	if err != nil {
		return effectiveURL
	}

	return resolved
}
74 changes: 57 additions & 17 deletions internal/reader/subscription/finder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,65 +11,70 @@ import (
func TestFindYoutubeFeed(t *testing.T) {
type testResult struct {
websiteURL string
feedURL string
feedURLs []string
discoveryError bool
}

scenarios := []testResult{
// Video URL
{
websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
feedURL: "",
feedURLs: []string{},
},
// Video URL with position argument
{
websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=1",
feedURL: "",
feedURLs: []string{},
},
// Video URL with position argument placed before the video ID
{
websiteURL: "https://www.youtube.com/watch?t=1&v=dQw4w9WgXcQ",
feedURL: "",
feedURLs: []string{},
},
// Channel URL
{
websiteURL: "https://www.youtube.com/channel/UC-Qj80avWItNRjkZ41rzHyw",
feedURL: "https://www.youtube.com/feeds/videos.xml?channel_id=UC-Qj80avWItNRjkZ41rzHyw",
feedURLs: []string{
"https://www.youtube.com/feeds/videos.xml?channel_id=UC-Qj80avWItNRjkZ41rzHyw",
"https://www.youtube.com/feeds/videos.xml?playlist_id=UULF-Qj80avWItNRjkZ41rzHyw",
"https://www.youtube.com/feeds/videos.xml?playlist_id=UUSH-Qj80avWItNRjkZ41rzHyw",
"https://www.youtube.com/feeds/videos.xml?playlist_id=UULV-Qj80avWItNRjkZ41rzHyw",
},
},
// Channel URL with name
{
websiteURL: "https://www.youtube.com/@ABCDEFG",
feedURL: "",
feedURLs: []string{},
},
// Playlist URL
{
websiteURL: "https://www.youtube.com/playlist?list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR",
feedURL: "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR",
feedURLs: []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR"},
},
// Playlist URL with video ID
{
websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM",
feedURL: "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM",
feedURLs: []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM"},
},
// Playlist URL with video ID and index argument
{
websiteURL: "https://www.youtube.com/watch?v=6IutBmRJNLk&list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR&index=4",
feedURL: "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR",
feedURLs: []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR"},
},
// Empty playlist ID parameter
{
websiteURL: "https://www.youtube.com/playlist?list=",
feedURL: "",
feedURLs: []string{},
},
// Non-Youtube URL
{
websiteURL: "https://www.example.com/channel/UC-Qj80avWItNRjkZ41rzHyw",
feedURL: "",
feedURLs: []string{},
},
// Invalid URL
{
websiteURL: "https://example|org/",
feedURL: "",
feedURLs: []string{},
discoveryError: true,
},
}
Expand All @@ -82,7 +87,7 @@ func TestFindYoutubeFeed(t *testing.T) {
}
}

if scenario.feedURL == "" {
if len(scenario.feedURLs) == 0 {
if len(subscriptions) > 0 {
t.Fatalf(`Parsing an invalid URL should not return any subscription: %q -> %v`, scenario.websiteURL, subscriptions)
}
Expand All @@ -91,12 +96,14 @@ func TestFindYoutubeFeed(t *testing.T) {
t.Fatalf(`Parsing a correctly formatted YouTube playlist or channel page should not return any error: %v`, localizedError)
}

if len(subscriptions) != 1 {
t.Fatalf(`Incorrect number of subscriptions returned`)
if len(subscriptions) != len(scenario.feedURLs) {
t.Fatalf(`Incorrect number of subscriptions returned, expected %d, got %d`, len(scenario.feedURLs), len(subscriptions))
}

if subscriptions[0].URL != scenario.feedURL {
t.Errorf(`Unexpected feed, got %s, instead of %s`, subscriptions[0].URL, scenario.feedURL)
for i := range scenario.feedURLs {
if subscriptions[i].URL != scenario.feedURLs[i] {
t.Errorf(`Unexpected feed, got %s, instead of %s`, subscriptions[i].URL, scenario.feedURLs[i])
}
}
}
}
Expand Down Expand Up @@ -397,3 +404,36 @@ func TestParseWebPageWithNoHref(t *testing.T) {
t.Fatal(`Incorrect number of subscriptions returned`)
}
}

// TestFindCanonicalURL checks that the href of a <link rel="canonical">
// element is returned instead of the effective URL passed in.
func TestFindCanonicalURL(t *testing.T) {
	const (
		pageURL = "https://example.org/page"
		want    = "https://example.org/canonical-page"
	)

	htmlPage := `
<!doctype html>
<html>
<head>
<link rel="canonical" href="https://example.org/canonical-page">
</head>
<body>
</body>
</html>`

	finder := NewSubscriptionFinder(nil)
	got := finder.findCanonicalURL(pageURL, "text/html", strings.NewReader(htmlPage))
	if got != want {
		t.Errorf(`Unexpected canonical URL, got %q, expected %q`, got, want)
	}
}

// TestFindCanonicalURLNotFound checks that the effective URL is returned
// unchanged when the document has no <link rel="canonical"> element.
func TestFindCanonicalURLNotFound(t *testing.T) {
	const pageURL = "https://example.org/page"

	htmlPage := `
<!doctype html>
<html>
<head>
</head>
<body>
</body>
</html>`

	finder := NewSubscriptionFinder(nil)
	got := finder.findCanonicalURL(pageURL, "text/html", strings.NewReader(htmlPage))
	if got != pageURL {
		t.Errorf(`Expected effective URL when canonical not found, got %q`, got)
	}
}