diff --git a/internal/reader/subscription/finder.go b/internal/reader/subscription/finder.go index 03f0d2a0743..1422bed66f6 100644 --- a/internal/reader/subscription/finder.go +++ b/internal/reader/subscription/finder.go @@ -69,7 +69,11 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string, return Subscriptions{NewSubscription(responseHandler.EffectiveURL(), responseHandler.EffectiveURL(), feedFormat)}, nil } - // Step 2) Check if the website URL is a YouTube channel. + // Step 2) Find the canonical URL of the website. + slog.Debug("Try to find the canonical URL of the website", slog.String("website_url", websiteURL)) + websiteURL = f.findCanonicalURL(websiteURL, responseHandler.ContentType(), bytes.NewReader(responseBody)) + + // Step 3) Check if the website URL is a YouTube channel. slog.Debug("Try to detect feeds for a YouTube page", slog.String("website_url", websiteURL)) if subscriptions, localizedError := f.findSubscriptionsFromYouTube(websiteURL); localizedError != nil { return nil, localizedError @@ -78,7 +82,7 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string, return subscriptions, nil } - // Step 3) Parse web page to find feeds from HTML meta tags. + // Step 4) Parse web page to find feeds from HTML meta tags. slog.Debug("Try to detect feeds from HTML meta tags", slog.String("website_url", websiteURL), slog.String("content_type", responseHandler.ContentType()), @@ -90,7 +94,7 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string, return subscriptions, nil } - // Step 4) Check if the website URL can use RSS-Bridge. + // Step 5) Check if the website URL can use RSS-Bridge. if rssBridgeURL != "" { slog.Debug("Try to detect feeds with RSS-Bridge", slog.String("website_url", websiteURL)) if subscriptions, localizedError := f.findSubscriptionsFromRSSBridge(websiteURL, rssBridgeURL, rssBridgeToken); localizedError != nil { @@ -101,7 +105,7 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string, } } - // Step 5) Check if the website has a known feed URL. + // Step 6) Check if the website has a known feed URL. slog.Debug("Try to detect feeds from well-known URLs", slog.String("website_url", websiteURL)) if subscriptions, localizedError := f.findSubscriptionsFromWellKnownURLs(websiteURL); localizedError != nil { return nil, localizedError @@ -274,6 +278,15 @@ func (f *subscriptionFinder) findSubscriptionsFromRSSBridge(websiteURL, rssBridg } func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) { + playlistPrefixes := []struct { + prefix string + title string + }{ + {"UULF", "Videos"}, + {"UUSH", "Short videos"}, + {"UULV", "Live streams"}, + } + decodedURL, err := url.Parse(websiteURL) if err != nil { return nil, locale.NewLocalizedErrorWrapper(err, "error.invalid_site_url", err) @@ -283,9 +296,19 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su slog.Debug("YouTube feed discovery skipped: not a YouTube domain", slog.String("website_url", websiteURL)) return nil, nil } - if _, channelID, found := strings.Cut(decodedURL.Path, "channel/"); found { - feedURL := "https://www.youtube.com/feeds/videos.xml?channel_id=" + channelID - return Subscriptions{NewSubscription(decodedURL.String(), feedURL, parser.FormatAtom)}, nil + + if _, baseID, found := strings.Cut(decodedURL.Path, "channel/UC"); found { + var subscriptions Subscriptions + + channelFeedURL := "https://www.youtube.com/feeds/videos.xml?channel_id=UC" + baseID + subscriptions = append(subscriptions, NewSubscription("Channel", channelFeedURL, parser.FormatAtom)) + + for _, playlist := range playlistPrefixes { + playlistFeedURL := "https://www.youtube.com/feeds/videos.xml?playlist_id=" + playlist.prefix + baseID + subscriptions = append(subscriptions, NewSubscription(playlist.title, playlistFeedURL, parser.FormatAtom)) + } + + return subscriptions, nil } if strings.HasPrefix(decodedURL.Path, "/watch") || strings.HasPrefix(decodedURL.Path, "/playlist") { @@ -297,3 +320,37 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su return nil, nil } + +// findCanonicalURL extracts the canonical URL from the HTML tag. +// Returns the canonical URL if found, otherwise returns the effective URL. +func (f *subscriptionFinder) findCanonicalURL(effectiveURL, contentType string, body io.Reader) string { + htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType) + if err != nil { + return effectiveURL + } + + doc, err := goquery.NewDocumentFromReader(htmlDocumentReader) + if err != nil { + return effectiveURL + } + + baseURL := effectiveURL + if hrefValue, exists := doc.FindMatcher(goquery.Single("head base")).Attr("href"); exists { + hrefValue = strings.TrimSpace(hrefValue) + if urllib.IsAbsoluteURL(hrefValue) { + baseURL = hrefValue + } + } + + canonicalHref, exists := doc.Find("link[rel='canonical' i]").First().Attr("href") + if !exists || strings.TrimSpace(canonicalHref) == "" { + return effectiveURL + } + + canonicalURL, err := urllib.AbsoluteURL(baseURL, strings.TrimSpace(canonicalHref)) + if err != nil { + return effectiveURL + } + + return canonicalURL +} diff --git a/internal/reader/subscription/finder_test.go b/internal/reader/subscription/finder_test.go index 324df1526e5..ec3f5887871 100644 --- a/internal/reader/subscription/finder_test.go +++ b/internal/reader/subscription/finder_test.go @@ -11,7 +11,7 @@ import ( func TestFindYoutubeFeed(t *testing.T) { type testResult struct { websiteURL string - feedURL string + feedURLs []string discoveryError bool } @@ -19,57 +19,62 @@ func TestFindYoutubeFeed(t *testing.T) { // Video URL { websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ", - feedURL: "", + feedURLs: []string{}, }, // Video URL with position argument { websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=1", - feedURL: "", + feedURLs: []string{}, }, // Video URL with position argument { websiteURL: "https://www.youtube.com/watch?t=1&v=dQw4w9WgXcQ", - feedURL: "", + feedURLs: []string{}, }, // Channel URL { websiteURL: "https://www.youtube.com/channel/UC-Qj80avWItNRjkZ41rzHyw", - feedURL: "https://www.youtube.com/feeds/videos.xml?channel_id=UC-Qj80avWItNRjkZ41rzHyw", + feedURLs: []string{ + "https://www.youtube.com/feeds/videos.xml?channel_id=UC-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UULF-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UUSH-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UULV-Qj80avWItNRjkZ41rzHyw", + }, }, // Channel URL with name { websiteURL: "https://www.youtube.com/@ABCDEFG", - feedURL: "", + feedURLs: []string{}, }, // Playlist URL { websiteURL: "https://www.youtube.com/playlist?list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR", - feedURL: "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR", + feedURLs: []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR"}, }, // Playlist URL with video ID { websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM", - feedURL: "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM", + feedURLs: []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM"}, }, // Playlist URL with video ID and index argument { websiteURL: "https://www.youtube.com/watch?v=6IutBmRJNLk&list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR&index=4", - feedURL: "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR", + feedURLs: []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR"}, }, // Empty playlist ID parameter { websiteURL: "https://www.youtube.com/playlist?list=", - feedURL: "", + feedURLs: []string{}, }, // Non-Youtube URL { websiteURL: "https://www.example.com/channel/UC-Qj80avWItNRjkZ41rzHyw", - feedURL: "", + feedURLs: []string{}, }, // Invalid URL { websiteURL: "https://example|org/", - feedURL: "", + feedURLs: []string{}, discoveryError: true, }, } @@ -82,7 +87,7 @@ func TestFindYoutubeFeed(t *testing.T) { } } - if scenario.feedURL == "" { + if len(scenario.feedURLs) == 0 { if len(subscriptions) > 0 { t.Fatalf(`Parsing an invalid URL should not return any subscription: %q -> %v`, scenario.websiteURL, subscriptions) } @@ -91,12 +96,14 @@ func TestFindYoutubeFeed(t *testing.T) { t.Fatalf(`Parsing a correctly formatted YouTube playlist or channel page should not return any error: %v`, localizedError) } - if len(subscriptions) != 1 { - t.Fatalf(`Incorrect number of subscriptions returned`) + if len(subscriptions) != len(scenario.feedURLs) { + t.Fatalf(`Incorrect number of subscriptions returned, expected %d, got %d`, len(scenario.feedURLs), len(subscriptions)) } - if subscriptions[0].URL != scenario.feedURL { - t.Errorf(`Unexpected feed, got %s, instead of %s`, subscriptions[0].URL, scenario.feedURL) + for i := range scenario.feedURLs { + if subscriptions[i].URL != scenario.feedURLs[i] { + t.Errorf(`Unexpected feed, got %s, instead of %s`, subscriptions[i].URL, scenario.feedURLs[i]) + } } } } @@ -397,3 +404,36 @@ func TestParseWebPageWithNoHref(t *testing.T) { t.Fatal(`Incorrect number of subscriptions returned`) } } + +func TestFindCanonicalURL(t *testing.T) { + htmlPage := ` + + + + + + + + ` + + canonicalURL := NewSubscriptionFinder(nil).findCanonicalURL("https://example.org/page", "text/html", strings.NewReader(htmlPage)) + if canonicalURL != "https://example.org/canonical-page" { + t.Errorf(`Unexpected canonical URL, got %q, expected %q`, canonicalURL, "https://example.org/canonical-page") + } +} + +func TestFindCanonicalURLNotFound(t *testing.T) { + htmlPage := ` + + + + + + + ` + + canonicalURL := NewSubscriptionFinder(nil).findCanonicalURL("https://example.org/page", "text/html", strings.NewReader(htmlPage)) + if canonicalURL != "https://example.org/page" { + t.Errorf(`Expected effective URL when canonical not found, got %q`, canonicalURL) + } +}