Skip to content

Commit 65d78cb

Browse files
authored
Lyrics: Fetch lyrics directly from Tekstowo (#5457)
Fixes #5456.

This pull request updates the `tekstowo` backend in the `lyrics` plugin to fetch lyrics directly from the Tekstowo.pl website. Recent updates to their website made the previous search-based approach unworkable.

## Changes

1. **Refactor Backend Classes:**
   - Introduced a new `DirectBackend` class for backends that fetch lyrics directly.
   - Updated `MusiXmatch` and `Tekstowo` classes to inherit from `DirectBackend`.
2. **Encoding and URL Building:**
   - Added `encode` and `build_url` methods to `DirectBackend` for URL encoding and construction.
   - Replaced our custom encoding functionality with `unidecode`.
3. **Tekstowo Backend:**
   - Added encoding logic, which converts artist and title to the format used by Tekstowo.
   - Removed the search functionality (`parse_search_results`), its tests and the related search HTML files.
   - Removed the `artist` and `title` checks from `extract_lyrics`.
2 parents 03f1205 + d3955ba commit 65d78cb

File tree

5 files changed

+49
-1291
lines changed

5 files changed

+49
-1291
lines changed

beetsplug/lyrics.py

Lines changed: 44 additions & 111 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,8 @@
1414

1515
"""Fetches, embeds, and displays lyrics."""
1616

17+
from __future__ import annotations
18+
1719
import difflib
1820
import errno
1921
import itertools
@@ -22,8 +24,10 @@
2224
import re
2325
import struct
2426
import unicodedata
25-
import urllib
2627
import warnings
28+
from functools import partial
29+
from typing import ClassVar
30+
from urllib.parse import quote, urlencode
2731

2832
import requests
2933
from unidecode import unidecode
@@ -46,26 +50,11 @@
4650

4751
import beets
4852
from beets import plugins, ui
49-
from beets.autotag.hooks import string_dist
5053

5154
DIV_RE = re.compile(r"<(/?)div>?", re.I)
5255
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
5356
TAG_RE = re.compile(r"<[^>]*>")
5457
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
55-
URL_CHARACTERS = {
56-
"\u2018": "'",
57-
"\u2019": "'",
58-
"\u201c": '"',
59-
"\u201d": '"',
60-
"\u2010": "-",
61-
"\u2011": "-",
62-
"\u2012": "-",
63-
"\u2013": "-",
64-
"\u2014": "-",
65-
"\u2015": "-",
66-
"\u2016": "-",
67-
"\u2026": "...",
68-
}
6958
USER_AGENT = f"beets/{beets.__version__}"
7059

7160
# The content for the base index.rst generated in ReST mode.
@@ -233,21 +222,6 @@ def __init__(self, config, log):
233222
self._log = log
234223
self.config = config
235224

236-
@staticmethod
237-
def _encode(s):
238-
"""Encode the string for inclusion in a URL"""
239-
if isinstance(s, str):
240-
for char, repl in URL_CHARACTERS.items():
241-
s = s.replace(char, repl)
242-
s = s.encode("utf-8", "ignore")
243-
return urllib.parse.quote(s)
244-
245-
def build_url(self, artist, title):
246-
return self.URL_PATTERN % (
247-
self._encode(artist.title()),
248-
self._encode(title.title()),
249-
)
250-
251225
def fetch_url(self, url):
252226
"""Retrieve the content at a given URL, or return None if the source
253227
is unreachable.
@@ -308,7 +282,24 @@ def fetch(self, artist, title, album=None, length=None):
308282
return data.get("plainLyrics")
309283

310284

311-
class MusiXmatch(Backend):
285+
class DirectBackend(Backend):
286+
"""A backend for fetching lyrics directly."""
287+
288+
URL_TEMPLATE: ClassVar[str] #: May include formatting placeholders
289+
290+
@classmethod
291+
def encode(cls, text: str) -> str:
292+
"""Encode the string for inclusion in a URL."""
293+
raise NotImplementedError
294+
295+
@classmethod
296+
def build_url(cls, *args: str) -> str:
297+
return cls.URL_TEMPLATE.format(*map(cls.encode, args))
298+
299+
300+
class MusiXmatch(DirectBackend):
301+
URL_TEMPLATE = "https://www.musixmatch.com/lyrics/{}/{}"
302+
312303
REPLACEMENTS = {
313304
r"\s+": "-",
314305
"<": "Less_Than",
@@ -318,14 +309,12 @@ class MusiXmatch(Backend):
318309
r"[\]\}]": ")",
319310
}
320311

321-
URL_PATTERN = "https://www.musixmatch.com/lyrics/%s/%s"
322-
323312
@classmethod
324-
def _encode(cls, s):
313+
def encode(cls, text: str) -> str:
325314
for old, new in cls.REPLACEMENTS.items():
326-
s = re.sub(old, new, s)
315+
text = re.sub(old, new, text)
327316

328-
return super()._encode(s)
317+
return quote(unidecode(text))
329318

330319
def fetch(self, artist, title, album=None, length=None):
331320
url = self.build_url(artist, title)
@@ -494,90 +483,34 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
494483
return lyrics_div.get_text()
495484

496485

497-
class Tekstowo(Backend):
498-
# Fetch lyrics from Tekstowo.pl.
499-
REQUIRES_BS = True
500-
501-
BASE_URL = "http://www.tekstowo.pl"
502-
URL_PATTERN = BASE_URL + "/wyszukaj.html?search-title=%s&search-artist=%s"
503-
504-
def fetch(self, artist, title, album=None, length=None):
505-
url = self.build_url(title, artist)
506-
search_results = self.fetch_url(url)
507-
if not search_results:
508-
return None
486+
class Tekstowo(DirectBackend):
487+
"""Fetch lyrics from Tekstowo.pl."""
509488

510-
song_page_url = self.parse_search_results(search_results)
511-
if not song_page_url:
512-
return None
513-
514-
song_page_html = self.fetch_url(song_page_url)
515-
if not song_page_html:
516-
return None
517-
518-
return self.extract_lyrics(song_page_html, artist, title)
519-
520-
def parse_search_results(self, html):
521-
html = _scrape_strip_cruft(html)
522-
html = _scrape_merge_paragraphs(html)
523-
524-
soup = try_parse_html(html)
525-
if not soup:
526-
return None
527-
528-
content_div = soup.find("div", class_="content")
529-
if not content_div:
530-
return None
489+
REQUIRES_BS = True
490+
URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html"
531491

532-
card_div = content_div.find("div", class_="card")
533-
if not card_div:
534-
return None
492+
non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_")
535493

536-
song_rows = card_div.find_all("div", class_="box-przeboje")
537-
if not song_rows:
538-
return None
539-
540-
song_row = song_rows[0]
541-
if not song_row:
542-
return None
494+
@classmethod
495+
def encode(cls, text: str) -> str:
496+
return cls.non_alpha_to_underscore(unidecode(text.lower()))
543497

544-
link = song_row.find("a")
545-
if not link:
546-
return None
498+
def fetch(self, artist, title, album=None, length=None):
499+
if html := self.fetch_url(self.build_url(artist, title)):
500+
return self.extract_lyrics(html)
547501

548-
return self.BASE_URL + link.get("href")
502+
return None
549503

550-
def extract_lyrics(self, html, artist, title):
504+
def extract_lyrics(self, html: str) -> str | None:
551505
html = _scrape_strip_cruft(html)
552506
html = _scrape_merge_paragraphs(html)
553507

554508
soup = try_parse_html(html)
555-
if not soup:
556-
return None
557-
558-
info_div = soup.find("div", class_="col-auto")
559-
if not info_div:
560-
return None
561-
562-
info_elements = info_div.find_all("a")
563-
if not info_elements:
564-
return None
565509

566-
html_title = info_elements[-1].get_text()
567-
html_artist = info_elements[-2].get_text()
510+
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
511+
return lyrics_div.get_text()
568512

569-
title_dist = string_dist(html_title, title)
570-
artist_dist = string_dist(html_artist, artist)
571-
572-
thresh = self.config["dist_thresh"].get(float)
573-
if title_dist > thresh or artist_dist > thresh:
574-
return None
575-
576-
lyrics_div = soup.select("div.song-text > div.inner-text")
577-
if not lyrics_div:
578-
return None
579-
580-
return lyrics_div[0].get_text()
513+
return None
581514

582515

583516
def remove_credits(text):
@@ -739,7 +672,7 @@ def fetch(self, artist, title, album=None, length=None):
739672
url = "https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s" % (
740673
self.api_key,
741674
self.engine_id,
742-
urllib.parse.quote(query.encode("utf-8")),
675+
quote(query.encode("utf-8")),
743676
)
744677

745678
data = self.fetch_url(url)
@@ -886,7 +819,7 @@ def get_bing_access_token(self):
886819
oauth_token = json.loads(
887820
requests.post(
888821
oauth_url,
889-
data=urllib.parse.urlencode(params),
822+
data=urlencode(params),
890823
timeout=10,
891824
).content
892825
)

docs/changelog.rst

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,9 @@ Bug fixes:
4444
* :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description.
4545
* Remove single quotes from all SQL queries
4646
:bug:`4709`
47+
* :doc:`plugins/lyrics`: Update ``tekstowo`` backend to fetch lyrics directly
48+
since recent updates to their website made it unsearchable.
49+
:bug:`5456`
4750

4851
For packagers:
4952

test/plugins/test_lyrics.py

Lines changed: 2 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -564,10 +564,7 @@ def test_good_lyrics(self):
564564
"""Ensure we are able to scrape a page with lyrics"""
565565
url = "https://www.tekstowo.pl/piosenka,24kgoldn,city_of_angels_1.html"
566566
mock = MockFetchUrl()
567-
assert (
568-
tekstowo.extract_lyrics(mock(url), "24kGoldn", "City of Angels")
569-
is not None
570-
)
567+
assert tekstowo.extract_lyrics(mock(url))
571568

572569
def test_no_lyrics(self):
573570
"""Ensure we don't crash when the scraping the html for a Tekstowo page
@@ -578,61 +575,7 @@ def test_no_lyrics(self):
578575
"beethoven_piano_sonata_17_tempest_the_3rd_movement.html"
579576
)
580577
mock = MockFetchUrl()
581-
assert (
582-
tekstowo.extract_lyrics(
583-
mock(url),
584-
"Beethoven",
585-
"Beethoven Piano Sonata 17" "Tempest The 3rd Movement",
586-
)
587-
is None
588-
)
589-
590-
def test_song_no_match(self):
591-
"""Ensure we return None when a song does not match the search query"""
592-
# https://github.com/beetbox/beets/issues/4406
593-
# expected return value None
594-
url = (
595-
"https://www.tekstowo.pl/piosenka,bailey_bigger"
596-
",black_eyed_susan.html"
597-
)
598-
mock = MockFetchUrl()
599-
assert (
600-
tekstowo.extract_lyrics(
601-
mock(url), "Kelly Bailey", "Black Mesa Inbound"
602-
)
603-
is None
604-
)
605-
606-
607-
class TekstowoParseSearchResultsTest(TekstowoBaseTest):
608-
"""tests Tekstowo.parse_search_results()"""
609-
610-
def setUp(self):
611-
"""Set up configuration"""
612-
TekstowoBaseTest.setUp(self)
613-
self.plugin = lyrics.LyricsPlugin()
614-
615-
def test_multiple_results(self):
616-
"""Ensure we are able to scrape a page with multiple search results"""
617-
url = (
618-
"https://www.tekstowo.pl/szukaj,wykonawca,juice+wrld"
619-
",tytul,lucid+dreams.html"
620-
)
621-
mock = MockFetchUrl()
622-
assert (
623-
tekstowo.parse_search_results(mock(url))
624-
== "http://www.tekstowo.pl/piosenka,juice_wrld,"
625-
"lucid_dreams__remix__ft__lil_uzi_vert.html"
626-
)
627-
628-
def test_no_results(self):
629-
"""Ensure we are able to scrape a page with no search results"""
630-
url = (
631-
"https://www.tekstowo.pl/szukaj,wykonawca,"
632-
"agfdgja,tytul,agfdgafg.html"
633-
)
634-
mock = MockFetchUrl()
635-
assert tekstowo.parse_search_results(mock(url)) is None
578+
assert not tekstowo.extract_lyrics(mock(url))
636579

637580

638581
class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions):

0 commit comments

Comments
 (0)