 
 """Fetches, embeds, and displays lyrics."""
 
+from __future__ import annotations
+
 import difflib
 import errno
 import itertools
 import re
 import struct
 import unicodedata
-import urllib
 import warnings
+from functools import partial
+from typing import ClassVar
+from urllib.parse import quote, urlencode

 import requests
 from unidecode import unidecode

 import beets
 from beets import plugins, ui
-from beets.autotag.hooks import string_dist

 DIV_RE = re.compile(r"<(/?)div>?", re.I)
 COMMENT_RE = re.compile(r"<!--.*-->", re.S)
 TAG_RE = re.compile(r"<[^>]*>")
 BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
-URL_CHARACTERS = {
-    "\u2018": "'",
-    "\u2019": "'",
-    "\u201c": '"',
-    "\u201d": '"',
-    "\u2010": "-",
-    "\u2011": "-",
-    "\u2012": "-",
-    "\u2013": "-",
-    "\u2014": "-",
-    "\u2015": "-",
-    "\u2016": "-",
-    "\u2026": "...",
-}
 USER_AGENT = f"beets/{beets.__version__}"

 # The content for the base index.rst generated in ReST mode.
@@ -233,21 +222,6 @@ def __init__(self, config, log):
         self._log = log
         self.config = config

-    @staticmethod
-    def _encode(s):
-        """Encode the string for inclusion in a URL"""
-        if isinstance(s, str):
-            for char, repl in URL_CHARACTERS.items():
-                s = s.replace(char, repl)
-            s = s.encode("utf-8", "ignore")
-        return urllib.parse.quote(s)
-
-    def build_url(self, artist, title):
-        return self.URL_PATTERN % (
-            self._encode(artist.title()),
-            self._encode(title.title()),
-        )
-
     def fetch_url(self, url):
         """Retrieve the content at a given URL, or return None if the source
         is unreachable.
@@ -308,7 +282,24 @@ def fetch(self, artist, title, album=None, length=None):
         return data.get("plainLyrics")


-class MusiXmatch(Backend):
+class DirectBackend(Backend):
+    """A backend for fetching lyrics directly."""
+
+    URL_TEMPLATE: ClassVar[str]  #: May include formatting placeholders
+
+    @classmethod
+    def encode(cls, text: str) -> str:
+        """Encode the string for inclusion in a URL."""
+        raise NotImplementedError
+
+    @classmethod
+    def build_url(cls, *args: str) -> str:
+        return cls.URL_TEMPLATE.format(*map(cls.encode, args))
+
+
+class MusiXmatch(DirectBackend):
+    URL_TEMPLATE = "https://www.musixmatch.com/lyrics/{}/{}"
+
     REPLACEMENTS = {
         r"\s+": "-",
         "<": "Less_Than",
@@ -318,14 +309,12 @@ class MusiXmatch(Backend):
         r"[\]\}]": ")",
     }

-    URL_PATTERN = "https://www.musixmatch.com/lyrics/%s/%s"
-
     @classmethod
-    def _encode(cls, s):
+    def encode(cls, text: str) -> str:
         for old, new in cls.REPLACEMENTS.items():
-            s = re.sub(old, new, s)
+            text = re.sub(old, new, text)

-        return super()._encode(s)
+        return quote(unidecode(text))

     def fetch(self, artist, title, album=None, length=None):
         url = self.build_url(artist, title)
@@ -494,90 +483,34 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
         return lyrics_div.get_text()


-class Tekstowo(Backend):
-    # Fetch lyrics from Tekstowo.pl.
-    REQUIRES_BS = True
-
-    BASE_URL = "http://www.tekstowo.pl"
-    URL_PATTERN = BASE_URL + "/wyszukaj.html?search-title=%s&search-artist=%s"
-
-    def fetch(self, artist, title, album=None, length=None):
-        url = self.build_url(title, artist)
-        search_results = self.fetch_url(url)
-        if not search_results:
-            return None
+class Tekstowo(DirectBackend):
+    """Fetch lyrics from Tekstowo.pl."""

-        song_page_url = self.parse_search_results(search_results)
-        if not song_page_url:
-            return None
-
-        song_page_html = self.fetch_url(song_page_url)
-        if not song_page_html:
-            return None
-
-        return self.extract_lyrics(song_page_html, artist, title)
-
-    def parse_search_results(self, html):
-        html = _scrape_strip_cruft(html)
-        html = _scrape_merge_paragraphs(html)
-
-        soup = try_parse_html(html)
-        if not soup:
-            return None
-
-        content_div = soup.find("div", class_="content")
-        if not content_div:
-            return None
+    REQUIRES_BS = True
+    URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html"

-        card_div = content_div.find("div", class_="card")
-        if not card_div:
-            return None
+    non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_")

-        song_rows = card_div.find_all("div", class_="box-przeboje")
-        if not song_rows:
-            return None
-
-        song_row = song_rows[0]
-        if not song_row:
-            return None
+    @classmethod
+    def encode(cls, text: str) -> str:
+        return cls.non_alpha_to_underscore(unidecode(text.lower()))

-        link = song_row.find("a")
-        if not link:
-            return None
+    def fetch(self, artist, title, album=None, length=None):
+        if html := self.fetch_url(self.build_url(artist, title)):
+            return self.extract_lyrics(html)

-        return self.BASE_URL + link.get("href")
+        return None

-    def extract_lyrics(self, html, artist, title):
+    def extract_lyrics(self, html: str) -> str | None:
         html = _scrape_strip_cruft(html)
         html = _scrape_merge_paragraphs(html)

         soup = try_parse_html(html)
-        if not soup:
-            return None
-
-        info_div = soup.find("div", class_="col-auto")
-        if not info_div:
-            return None
-
-        info_elements = info_div.find_all("a")
-        if not info_elements:
-            return None

-        html_title = info_elements[-1].get_text()
-        html_artist = info_elements[-2].get_text()
+        if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
+            return lyrics_div.get_text()

-        title_dist = string_dist(html_title, title)
-        artist_dist = string_dist(html_artist, artist)
-
-        thresh = self.config["dist_thresh"].get(float)
-        if title_dist > thresh or artist_dist > thresh:
-            return None
-
-        lyrics_div = soup.select("div.song-text > div.inner-text")
-        if not lyrics_div:
-            return None
-
-        return lyrics_div[0].get_text()
+        return None


 def remove_credits(text):
@@ -739,7 +672,7 @@ def fetch(self, artist, title, album=None, length=None):
         url = "https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s" % (
             self.api_key,
             self.engine_id,
-            urllib.parse.quote(query.encode("utf-8")),
+            quote(query.encode("utf-8")),
         )

         data = self.fetch_url(url)
@@ -886,7 +819,7 @@ def get_bing_access_token(self):
         oauth_token = json.loads(
             requests.post(
                 oauth_url,
-                data=urllib.parse.urlencode(params),
+                data=urlencode(params),
                 timeout=10,
             ).content
         )
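
With this refactor, URL construction for the template-based backends is a pure classmethod: each DirectBackend subclass supplies a URL_TEMPLATE plus an encode() rule, and build_url() formats the encoded arguments into the template. A minimal sketch of how the two rewritten backends compose their URLs (assumes beets is installed so that beetsplug.lyrics is importable; the artist/title values are illustrative only):

from beetsplug.lyrics import MusiXmatch, Tekstowo

# MusiXmatch.encode applies REPLACEMENTS (e.g. whitespace -> "-") and then
# percent-encodes the unidecoded text before formatting it into URL_TEMPLATE.
print(MusiXmatch.build_url("Beyoncé", "Crazy in Love"))
# https://www.musixmatch.com/lyrics/Beyonce/Crazy-in-Love

# Tekstowo.encode lowercases, unidecodes, and maps non-word characters to "_",
# matching the piosenka,<artist>,<title>.html URL scheme.
print(Tekstowo.build_url("Beyoncé", "Crazy in Love"))
# https://www.tekstowo.pl/piosenka,beyonce,crazy_in_love.html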