From 54042fc728547a6831678ec76323071b2aa31ad9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Jab=C5=82o=C5=84ski?= Date: Mon, 17 Nov 2025 00:22:32 +0100 Subject: [PATCH 1/3] feat(finder): enhance youtube channel parsing with default playlists --- internal/reader/subscription/finder.go | 71 +++++++++++++++++- internal/reader/subscription/finder_test.go | 81 ++++++++++++++++----- 2 files changed, 131 insertions(+), 21 deletions(-) diff --git a/internal/reader/subscription/finder.go b/internal/reader/subscription/finder.go index 03f0d2a0743..d5941923be7 100644 --- a/internal/reader/subscription/finder.go +++ b/internal/reader/subscription/finder.go @@ -71,7 +71,8 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string, // Step 2) Check if the website URL is a YouTube channel. slog.Debug("Try to detect feeds for a YouTube page", slog.String("website_url", websiteURL)) - if subscriptions, localizedError := f.findSubscriptionsFromYouTube(websiteURL); localizedError != nil { + youtubeURL := f.findCanonicalURL(websiteURL, responseHandler.ContentType(), bytes.NewReader(responseBody)) + if subscriptions, localizedError := f.findSubscriptionsFromYouTube(youtubeURL); localizedError != nil { return nil, localizedError } else if len(subscriptions) > 0 { slog.Debug("Subscriptions found from YouTube page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions)) @@ -274,6 +275,24 @@ func (f *subscriptionFinder) findSubscriptionsFromRSSBridge(websiteURL, rssBridg } func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) { + playlistPrefixes := []struct { + prefix string + title string + }{ + {"UULF", "Videos"}, + {"UUSH", "Short videos"}, + {"UULV", "Live streams"}, + + {"UULP", "Popular videos"}, + {"UUPS", "Popular short videos"}, + {"UUPV", "Popular live streams"}, + + {"UUMO", "Members-only contents (videos, short videos and live streams)"}, + {"UUMF", 
"Members-only videos"}, + {"UUMS", "Members-only short videos"}, + {"UUMV", "Members-only live streams"}, + } + decodedURL, err := url.Parse(websiteURL) if err != nil { return nil, locale.NewLocalizedErrorWrapper(err, "error.invalid_site_url", err) @@ -283,9 +302,19 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su slog.Debug("YouTube feed discovery skipped: not a YouTube domain", slog.String("website_url", websiteURL)) return nil, nil } - if _, channelID, found := strings.Cut(decodedURL.Path, "channel/"); found { - feedURL := "https://www.youtube.com/feeds/videos.xml?channel_id=" + channelID - return Subscriptions{NewSubscription(decodedURL.String(), feedURL, parser.FormatAtom)}, nil + + if _, baseID, found := strings.Cut(decodedURL.Path, "channel/UC"); found { + var subscriptions Subscriptions + + channelFeedURL := "https://www.youtube.com/feeds/videos.xml?channel_id=UC" + baseID + subscriptions = append(subscriptions, NewSubscription("Channel", channelFeedURL, parser.FormatAtom)) + + for _, playlist := range playlistPrefixes { + playlistFeedURL := "https://www.youtube.com/feeds/videos.xml?playlist_id=" + playlist.prefix + baseID + subscriptions = append(subscriptions, NewSubscription(playlist.title, playlistFeedURL, parser.FormatAtom)) + } + + return subscriptions, nil } if strings.HasPrefix(decodedURL.Path, "/watch") || strings.HasPrefix(decodedURL.Path, "/playlist") { @@ -297,3 +326,37 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su return nil, nil } + +// findCanonicalURL extracts the canonical URL from the HTML tag. +// Returns the canonical URL if found, otherwise returns the effective URL. 
+func (f *subscriptionFinder) findCanonicalURL(effectiveURL, contentType string, body io.Reader) string { + htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType) + if err != nil { + return effectiveURL + } + + doc, err := goquery.NewDocumentFromReader(htmlDocumentReader) + if err != nil { + return effectiveURL + } + + baseURL := effectiveURL + if hrefValue, exists := doc.FindMatcher(goquery.Single("head base")).Attr("href"); exists { + hrefValue = strings.TrimSpace(hrefValue) + if urllib.IsAbsoluteURL(hrefValue) { + baseURL = hrefValue + } + } + + canonicalHref, exists := doc.Find("link[rel='canonical' i]").First().Attr("href") + if !exists || strings.TrimSpace(canonicalHref) == "" { + return effectiveURL + } + + canonicalURL, err := urllib.AbsoluteURL(baseURL, strings.TrimSpace(canonicalHref)) + if err != nil { + return effectiveURL + } + + return canonicalURL +} diff --git a/internal/reader/subscription/finder_test.go b/internal/reader/subscription/finder_test.go index 324df1526e5..d4da3569e98 100644 --- a/internal/reader/subscription/finder_test.go +++ b/internal/reader/subscription/finder_test.go @@ -11,7 +11,7 @@ import ( func TestFindYoutubeFeed(t *testing.T) { type testResult struct { websiteURL string - feedURL string + feedURLs []string discoveryError bool } @@ -19,57 +19,69 @@ func TestFindYoutubeFeed(t *testing.T) { // Video URL { websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ", - feedURL: "", + feedURLs: []string{}, }, // Video URL with position argument { websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=1", - feedURL: "", + feedURLs: []string{}, }, // Video URL with position argument { websiteURL: "https://www.youtube.com/watch?t=1&v=dQw4w9WgXcQ", - feedURL: "", + feedURLs: []string{}, }, // Channel URL { websiteURL: "https://www.youtube.com/channel/UC-Qj80avWItNRjkZ41rzHyw", - feedURL: "https://www.youtube.com/feeds/videos.xml?channel_id=UC-Qj80avWItNRjkZ41rzHyw", + feedURLs: []string{ + 
"https://www.youtube.com/feeds/videos.xml?channel_id=UC-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UULF-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UUSH-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UULV-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UULP-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UUPS-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UUPV-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UUMO-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UUMF-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UUMS-Qj80avWItNRjkZ41rzHyw", + "https://www.youtube.com/feeds/videos.xml?playlist_id=UUMV-Qj80avWItNRjkZ41rzHyw", + }, }, // Channel URL with name { websiteURL: "https://www.youtube.com/@ABCDEFG", - feedURL: "", + feedURLs: []string{}, }, // Playlist URL { websiteURL: "https://www.youtube.com/playlist?list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR", - feedURL: "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR", + feedURLs: []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR"}, }, // Playlist URL with video ID { websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM", - feedURL: "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM", + feedURLs: []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM"}, }, // Playlist URL with video ID and index argument { websiteURL: "https://www.youtube.com/watch?v=6IutBmRJNLk&list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR&index=4", - feedURL: "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR", + feedURLs: 
[]string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR"}, }, // Empty playlist ID parameter { websiteURL: "https://www.youtube.com/playlist?list=", - feedURL: "", + feedURLs: []string{}, }, // Non-Youtube URL { websiteURL: "https://www.example.com/channel/UC-Qj80avWItNRjkZ41rzHyw", - feedURL: "", + feedURLs: []string{}, }, // Invalid URL { websiteURL: "https://example|org/", - feedURL: "", + feedURLs: []string{}, discoveryError: true, }, } @@ -82,7 +94,7 @@ func TestFindYoutubeFeed(t *testing.T) { } } - if scenario.feedURL == "" { + if len(scenario.feedURLs) == 0 { if len(subscriptions) > 0 { t.Fatalf(`Parsing an invalid URL should not return any subscription: %q -> %v`, scenario.websiteURL, subscriptions) } @@ -91,12 +103,14 @@ func TestFindYoutubeFeed(t *testing.T) { t.Fatalf(`Parsing a correctly formatted YouTube playlist or channel page should not return any error: %v`, localizedError) } - if len(subscriptions) != 1 { - t.Fatalf(`Incorrect number of subscriptions returned`) + if len(subscriptions) != len(scenario.feedURLs) { + t.Fatalf(`Incorrect number of subscriptions returned, expected %d, got %d`, len(scenario.feedURLs), len(subscriptions)) } - if subscriptions[0].URL != scenario.feedURL { - t.Errorf(`Unexpected feed, got %s, instead of %s`, subscriptions[0].URL, scenario.feedURL) + for i := range scenario.feedURLs { + if subscriptions[i].URL != scenario.feedURLs[i] { + t.Errorf(`Unexpected feed, got %s, instead of %s`, subscriptions[i].URL, scenario.feedURLs[i]) + } } } } @@ -397,3 +411,36 @@ func TestParseWebPageWithNoHref(t *testing.T) { t.Fatal(`Incorrect number of subscriptions returned`) } } + +func TestFindCanonicalURL(t *testing.T) { + htmlPage := ` + + + + + + + + ` + + canonicalURL := NewSubscriptionFinder(nil).findCanonicalURL("https://example.org/page", "text/html", strings.NewReader(htmlPage)) + if canonicalURL != "https://example.org/canonical-page" { + t.Errorf(`Unexpected canonical URL, got %q, 
expected %q`, canonicalURL, "https://example.org/canonical-page") + } +} + +func TestFindCanonicalURLNotFound(t *testing.T) { + htmlPage := ` + + + + + + + ` + + canonicalURL := NewSubscriptionFinder(nil).findCanonicalURL("https://example.org/page", "text/html", strings.NewReader(htmlPage)) + if canonicalURL != "https://example.org/page" { + t.Errorf(`Expected effective URL when canonical not found, got %q`, canonicalURL) + } +} From d5344cc3ea028c437ac3aafeed663d0b718386f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Jab=C5=82o=C5=84ski?= Date: Sat, 29 Nov 2025 12:31:04 +0100 Subject: [PATCH 2/3] feat(finder): make canonical url detection a proper step --- internal/reader/subscription/finder.go | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/internal/reader/subscription/finder.go b/internal/reader/subscription/finder.go index d5941923be7..267964ae800 100644 --- a/internal/reader/subscription/finder.go +++ b/internal/reader/subscription/finder.go @@ -68,18 +68,21 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string, f.feedDownloaded = true return Subscriptions{NewSubscription(responseHandler.EffectiveURL(), responseHandler.EffectiveURL(), feedFormat)}, nil } + + // Step 2) Find the canonical URL of the website. + slog.Debug("Try to find the canonical URL of the website", slog.String("website_url", websiteURL)) + websiteURL = f.findCanonicalURL(websiteURL, responseHandler.ContentType(), bytes.NewReader(responseBody)) - // Step 2) Check if the website URL is a YouTube channel. + // Step 3) Check if the website URL is a YouTube channel. 
slog.Debug("Try to detect feeds for a YouTube page", slog.String("website_url", websiteURL)) - youtubeURL := f.findCanonicalURL(websiteURL, responseHandler.ContentType(), bytes.NewReader(responseBody)) - if subscriptions, localizedError := f.findSubscriptionsFromYouTube(youtubeURL); localizedError != nil { + if subscriptions, localizedError := f.findSubscriptionsFromYouTube(websiteURL); localizedError != nil { return nil, localizedError } else if len(subscriptions) > 0 { slog.Debug("Subscriptions found from YouTube page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions)) return subscriptions, nil } - // Step 3) Parse web page to find feeds from HTML meta tags. + // Step 4) Parse web page to find feeds from HTML meta tags. slog.Debug("Try to detect feeds from HTML meta tags", slog.String("website_url", websiteURL), slog.String("content_type", responseHandler.ContentType()), @@ -91,7 +94,7 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string, return subscriptions, nil } - // Step 4) Check if the website URL can use RSS-Bridge. + // Step 5) Check if the website URL can use RSS-Bridge. if rssBridgeURL != "" { slog.Debug("Try to detect feeds with RSS-Bridge", slog.String("website_url", websiteURL)) if subscriptions, localizedError := f.findSubscriptionsFromRSSBridge(websiteURL, rssBridgeURL, rssBridgeToken); localizedError != nil { @@ -102,7 +105,7 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string, } } - // Step 5) Check if the website has a known feed URL. + // Step 6) Check if the website has a known feed URL. 
slog.Debug("Try to detect feeds from well-known URLs", slog.String("website_url", websiteURL)) if subscriptions, localizedError := f.findSubscriptionsFromWellKnownURLs(websiteURL); localizedError != nil { return nil, localizedError From 8bd3ec5f9991ff461c48712bb188bdd6f5c2cc00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Jab=C5=82o=C5=84ski?= Date: Sat, 29 Nov 2025 12:33:12 +0100 Subject: [PATCH 3/3] feat(finder): generate feeds for stable youtube playlists --- internal/reader/subscription/finder.go | 17 ++++------------- internal/reader/subscription/finder_test.go | 7 ------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/internal/reader/subscription/finder.go b/internal/reader/subscription/finder.go index 267964ae800..1422bed66f6 100644 --- a/internal/reader/subscription/finder.go +++ b/internal/reader/subscription/finder.go @@ -68,7 +68,7 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string, f.feedDownloaded = true return Subscriptions{NewSubscription(responseHandler.EffectiveURL(), responseHandler.EffectiveURL(), feedFormat)}, nil } - + // Step 2) Find the canonical URL of the website. 
slog.Debug("Try to find the canonical URL of the website", slog.String("website_url", websiteURL)) websiteURL = f.findCanonicalURL(websiteURL, responseHandler.ContentType(), bytes.NewReader(responseBody)) @@ -285,15 +285,6 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su {"UULF", "Videos"}, {"UUSH", "Short videos"}, {"UULV", "Live streams"}, - - {"UULP", "Popular videos"}, - {"UUPS", "Popular short videos"}, - {"UUPV", "Popular live streams"}, - - {"UUMO", "Members-only contents (videos, short videos and live streams)"}, - {"UUMF", "Members-only videos"}, - {"UUMS", "Members-only short videos"}, - {"UUMV", "Members-only live streams"}, } decodedURL, err := url.Parse(websiteURL) @@ -308,15 +299,15 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su if _, baseID, found := strings.Cut(decodedURL.Path, "channel/UC"); found { var subscriptions Subscriptions - + channelFeedURL := "https://www.youtube.com/feeds/videos.xml?channel_id=UC" + baseID subscriptions = append(subscriptions, NewSubscription("Channel", channelFeedURL, parser.FormatAtom)) - + for _, playlist := range playlistPrefixes { playlistFeedURL := "https://www.youtube.com/feeds/videos.xml?playlist_id=" + playlist.prefix + baseID subscriptions = append(subscriptions, NewSubscription(playlist.title, playlistFeedURL, parser.FormatAtom)) } - + return subscriptions, nil } diff --git a/internal/reader/subscription/finder_test.go b/internal/reader/subscription/finder_test.go index d4da3569e98..ec3f5887871 100644 --- a/internal/reader/subscription/finder_test.go +++ b/internal/reader/subscription/finder_test.go @@ -39,13 +39,6 @@ func TestFindYoutubeFeed(t *testing.T) { "https://www.youtube.com/feeds/videos.xml?playlist_id=UULF-Qj80avWItNRjkZ41rzHyw", "https://www.youtube.com/feeds/videos.xml?playlist_id=UUSH-Qj80avWItNRjkZ41rzHyw", "https://www.youtube.com/feeds/videos.xml?playlist_id=UULV-Qj80avWItNRjkZ41rzHyw", - 
"https://www.youtube.com/feeds/videos.xml?playlist_id=UULP-Qj80avWItNRjkZ41rzHyw", - "https://www.youtube.com/feeds/videos.xml?playlist_id=UUPS-Qj80avWItNRjkZ41rzHyw", - "https://www.youtube.com/feeds/videos.xml?playlist_id=UUPV-Qj80avWItNRjkZ41rzHyw", - "https://www.youtube.com/feeds/videos.xml?playlist_id=UUMO-Qj80avWItNRjkZ41rzHyw", - "https://www.youtube.com/feeds/videos.xml?playlist_id=UUMF-Qj80avWItNRjkZ41rzHyw", - "https://www.youtube.com/feeds/videos.xml?playlist_id=UUMS-Qj80avWItNRjkZ41rzHyw", - "https://www.youtube.com/feeds/videos.xml?playlist_id=UUMV-Qj80avWItNRjkZ41rzHyw", }, }, // Channel URL with name