Skip to content

Commit 3ea4aee

Browse files
mtsz-plfguillot
authored and committed
feat(finder): enhance youtube channel parsing with default playlists
1 parent f447307 commit 3ea4aee

File tree

2 files changed

+131
-21
lines changed

2 files changed

+131
-21
lines changed

internal/reader/subscription/finder.go

Lines changed: 67 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string,
7171

7272
// Step 2) Check if the website URL is a YouTube channel.
7373
slog.Debug("Try to detect feeds for a YouTube page", slog.String("website_url", websiteURL))
74-
if subscriptions, localizedError := f.findSubscriptionsFromYouTube(websiteURL); localizedError != nil {
74+
youtubeURL := f.findCanonicalURL(websiteURL, responseHandler.ContentType(), bytes.NewReader(responseBody))
75+
if subscriptions, localizedError := f.findSubscriptionsFromYouTube(youtubeURL); localizedError != nil {
7576
return nil, localizedError
7677
} else if len(subscriptions) > 0 {
7778
slog.Debug("Subscriptions found from YouTube page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
@@ -274,6 +275,24 @@ func (f *subscriptionFinder) findSubscriptionsFromRSSBridge(websiteURL, rssBridg
274275
}
275276

276277
func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
278+
playlistPrefixes := []struct {
279+
prefix string
280+
title string
281+
}{
282+
{"UULF", "Videos"},
283+
{"UUSH", "Short videos"},
284+
{"UULV", "Live streams"},
285+
286+
{"UULP", "Popular videos"},
287+
{"UUPS", "Popular short videos"},
288+
{"UUPV", "Popular live streams"},
289+
290+
{"UUMO", "Members-only contents (videos, short videos and live streams)"},
291+
{"UUMF", "Members-only videos"},
292+
{"UUMS", "Members-only short videos"},
293+
{"UUMV", "Members-only live streams"},
294+
}
295+
277296
decodedURL, err := url.Parse(websiteURL)
278297
if err != nil {
279298
return nil, locale.NewLocalizedErrorWrapper(err, "error.invalid_site_url", err)
@@ -283,9 +302,19 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su
283302
slog.Debug("YouTube feed discovery skipped: not a YouTube domain", slog.String("website_url", websiteURL))
284303
return nil, nil
285304
}
286-
if _, channelID, found := strings.Cut(decodedURL.Path, "channel/"); found {
287-
feedURL := "https://www.youtube.com/feeds/videos.xml?channel_id=" + channelID
288-
return Subscriptions{NewSubscription(decodedURL.String(), feedURL, parser.FormatAtom)}, nil
305+
306+
if _, baseID, found := strings.Cut(decodedURL.Path, "channel/UC"); found {
307+
var subscriptions Subscriptions
308+
309+
channelFeedURL := "https://www.youtube.com/feeds/videos.xml?channel_id=UC" + baseID
310+
subscriptions = append(subscriptions, NewSubscription("Channel", channelFeedURL, parser.FormatAtom))
311+
312+
for _, playlist := range playlistPrefixes {
313+
playlistFeedURL := "https://www.youtube.com/feeds/videos.xml?playlist_id=" + playlist.prefix + baseID
314+
subscriptions = append(subscriptions, NewSubscription(playlist.title, playlistFeedURL, parser.FormatAtom))
315+
}
316+
317+
return subscriptions, nil
289318
}
290319

291320
if strings.HasPrefix(decodedURL.Path, "/watch") || strings.HasPrefix(decodedURL.Path, "/playlist") {
@@ -297,3 +326,37 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su
297326

298327
return nil, nil
299328
}
329+
330+
// findCanonicalURL extracts the canonical URL from the HTML <link rel="canonical"> tag.
331+
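// Returns the canonical URL if found (a relative href is resolved against an absolute <base href> when present, otherwise against the effective URL); returns the effective URL when the body cannot be decoded or parsed, the canonical href is missing or blank, or the href cannot be resolved.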
332+
func (f *subscriptionFinder) findCanonicalURL(effectiveURL, contentType string, body io.Reader) string {
333+
htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType)
334+
if err != nil {
335+
return effectiveURL
336+
}
337+
338+
doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
339+
if err != nil {
340+
return effectiveURL
341+
}
342+
343+
baseURL := effectiveURL
344+
if hrefValue, exists := doc.FindMatcher(goquery.Single("head base")).Attr("href"); exists {
345+
hrefValue = strings.TrimSpace(hrefValue)
346+
if urllib.IsAbsoluteURL(hrefValue) {
347+
baseURL = hrefValue
348+
}
349+
}
350+
351+
canonicalHref, exists := doc.Find("link[rel='canonical' i]").First().Attr("href")
352+
if !exists || strings.TrimSpace(canonicalHref) == "" {
353+
return effectiveURL
354+
}
355+
356+
canonicalURL, err := urllib.AbsoluteURL(baseURL, strings.TrimSpace(canonicalHref))
357+
if err != nil {
358+
return effectiveURL
359+
}
360+
361+
return canonicalURL
362+
}

internal/reader/subscription/finder_test.go

Lines changed: 64 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,65 +11,77 @@ import (
1111
func TestFindYoutubeFeed(t *testing.T) {
1212
type testResult struct {
1313
websiteURL string
14-
feedURL string
14+
feedURLs []string
1515
discoveryError bool
1616
}
1717

1818
scenarios := []testResult{
1919
// Video URL
2020
{
2121
websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
22-
feedURL: "",
22+
feedURLs: []string{},
2323
},
2424
// Video URL with position argument
2525
{
2626
websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=1",
27-
feedURL: "",
27+
feedURLs: []string{},
2828
},
2929
// Video URL with position argument
3030
{
3131
websiteURL: "https://www.youtube.com/watch?t=1&v=dQw4w9WgXcQ",
32-
feedURL: "",
32+
feedURLs: []string{},
3333
},
3434
// Channel URL
3535
{
3636
websiteURL: "https://www.youtube.com/channel/UC-Qj80avWItNRjkZ41rzHyw",
37-
feedURL: "https://www.youtube.com/feeds/videos.xml?channel_id=UC-Qj80avWItNRjkZ41rzHyw",
37+
feedURLs: []string{
38+
"https://www.youtube.com/feeds/videos.xml?channel_id=UC-Qj80avWItNRjkZ41rzHyw",
39+
"https://www.youtube.com/feeds/videos.xml?playlist_id=UULF-Qj80avWItNRjkZ41rzHyw",
40+
"https://www.youtube.com/feeds/videos.xml?playlist_id=UUSH-Qj80avWItNRjkZ41rzHyw",
41+
"https://www.youtube.com/feeds/videos.xml?playlist_id=UULV-Qj80avWItNRjkZ41rzHyw",
42+
"https://www.youtube.com/feeds/videos.xml?playlist_id=UULP-Qj80avWItNRjkZ41rzHyw",
43+
"https://www.youtube.com/feeds/videos.xml?playlist_id=UUPS-Qj80avWItNRjkZ41rzHyw",
44+
"https://www.youtube.com/feeds/videos.xml?playlist_id=UUPV-Qj80avWItNRjkZ41rzHyw",
45+
"https://www.youtube.com/feeds/videos.xml?playlist_id=UUMO-Qj80avWItNRjkZ41rzHyw",
46+
"https://www.youtube.com/feeds/videos.xml?playlist_id=UUMF-Qj80avWItNRjkZ41rzHyw",
47+
"https://www.youtube.com/feeds/videos.xml?playlist_id=UUMS-Qj80avWItNRjkZ41rzHyw",
48+
"https://www.youtube.com/feeds/videos.xml?playlist_id=UUMV-Qj80avWItNRjkZ41rzHyw",
49+
},
3850
},
3951
// Channel URL with name
4052
{
4153
websiteURL: "https://www.youtube.com/@ABCDEFG",
42-
feedURL: "",
54+
feedURLs: []string{},
4355
},
4456
// Playlist URL
4557
{
4658
websiteURL: "https://www.youtube.com/playlist?list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR",
47-
feedURL: "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR",
59+
feedURLs: []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR"},
4860
},
4961
// Playlist URL with video ID
5062
{
5163
websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM",
52-
feedURL: "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM",
64+
feedURLs: []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM"},
5365
},
5466
// Playlist URL with video ID and index argument
5567
{
5668
websiteURL: "https://www.youtube.com/watch?v=6IutBmRJNLk&list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR&index=4",
57-
feedURL: "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR",
69+
feedURLs: []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR"},
5870
},
5971
// Empty playlist ID parameter
6072
{
6173
websiteURL: "https://www.youtube.com/playlist?list=",
62-
feedURL: "",
74+
feedURLs: []string{},
6375
},
6476
// Non-Youtube URL
6577
{
6678
websiteURL: "https://www.example.com/channel/UC-Qj80avWItNRjkZ41rzHyw",
67-
feedURL: "",
79+
feedURLs: []string{},
6880
},
6981
// Invalid URL
7082
{
7183
websiteURL: "https://example|org/",
72-
feedURL: "",
84+
feedURLs: []string{},
7385
discoveryError: true,
7486
},
7587
}
@@ -82,7 +94,7 @@ func TestFindYoutubeFeed(t *testing.T) {
8294
}
8395
}
8496

85-
if scenario.feedURL == "" {
97+
if len(scenario.feedURLs) == 0 {
8698
if len(subscriptions) > 0 {
8799
t.Fatalf(`Parsing an invalid URL should not return any subscription: %q -> %v`, scenario.websiteURL, subscriptions)
88100
}
@@ -91,12 +103,14 @@ func TestFindYoutubeFeed(t *testing.T) {
91103
t.Fatalf(`Parsing a correctly formatted YouTube playlist or channel page should not return any error: %v`, localizedError)
92104
}
93105

94-
if len(subscriptions) != 1 {
95-
t.Fatalf(`Incorrect number of subscriptions returned`)
106+
if len(subscriptions) != len(scenario.feedURLs) {
107+
t.Fatalf(`Incorrect number of subscriptions returned, expected %d, got %d`, len(scenario.feedURLs), len(subscriptions))
96108
}
97109

98-
if subscriptions[0].URL != scenario.feedURL {
99-
t.Errorf(`Unexpected feed, got %s, instead of %s`, subscriptions[0].URL, scenario.feedURL)
110+
for i := range scenario.feedURLs {
111+
if subscriptions[i].URL != scenario.feedURLs[i] {
112+
t.Errorf(`Unexpected feed, got %s, instead of %s`, subscriptions[i].URL, scenario.feedURLs[i])
113+
}
100114
}
101115
}
102116
}
@@ -397,3 +411,36 @@ func TestParseWebPageWithNoHref(t *testing.T) {
397411
t.Fatal(`Incorrect number of subscriptions returned`)
398412
}
399413
}
414+
415+
func TestFindCanonicalURL(t *testing.T) {
416+
htmlPage := `
417+
<!doctype html>
418+
<html>
419+
<head>
420+
<link rel="canonical" href="https://example.org/canonical-page">
421+
</head>
422+
<body>
423+
</body>
424+
</html>`
425+
426+
canonicalURL := NewSubscriptionFinder(nil).findCanonicalURL("https://example.org/page", "text/html", strings.NewReader(htmlPage))
427+
if canonicalURL != "https://example.org/canonical-page" {
428+
t.Errorf(`Unexpected canonical URL, got %q, expected %q`, canonicalURL, "https://example.org/canonical-page")
429+
}
430+
}
431+
432+
func TestFindCanonicalURLNotFound(t *testing.T) {
433+
htmlPage := `
434+
<!doctype html>
435+
<html>
436+
<head>
437+
</head>
438+
<body>
439+
</body>
440+
</html>`
441+
442+
canonicalURL := NewSubscriptionFinder(nil).findCanonicalURL("https://example.org/page", "text/html", strings.NewReader(htmlPage))
443+
if canonicalURL != "https://example.org/page" {
444+
t.Errorf(`Expected effective URL when canonical not found, got %q`, canonicalURL)
445+
}
446+
}

0 commit comments

Comments
 (0)