Skip to content

Commit 4d84876

Browse files
committed
Merge pull request #1192 from Kraymer/lyrics-fix-incomplete
lyrics : remove empty divs before scraping
2 parents fd94094 + e63a8c1 commit 4d84876

File tree

5 files changed

+30
-13
lines changed

5 files changed

+30
-13
lines changed

beetsplug/lyrics.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def extract_text_between(html, start_marker, end_marker):
9090
html, _ = html.split(end_marker, 1)
9191
except ValueError:
9292
return u''
93-
return _scrape_strip_cruft(html, True)
93+
return html
9494

9595

9696
def extract_text_in(html, starttag):
@@ -124,8 +124,7 @@ def extract_text_in(html, starttag):
124124
else:
125125
print('no closing tag found!')
126126
return
127-
lyrics = ''.join(parts)
128-
return _scrape_strip_cruft(lyrics, True)
127+
return u''.join(parts)
129128

130129

131130
def search_pairs(item):
@@ -221,7 +220,7 @@ def fetch_lyricswiki(artist, title):
221220
if not html:
222221
return
223222

224-
lyrics = extract_text_in(html, "<div class='lyricbox'>")
223+
lyrics = extract_text_in(html, u"<div class='lyricbox'>")
225224
if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
226225
return lyrics
227226

@@ -360,13 +359,14 @@ def _scrape_strip_cruft(html, plain_text_out=False):
360359
html = COMMENT_RE.sub('', html)
361360
html = TAG_RE.sub('', html)
362361

363-
# Strip lines
364362
html = '\n'.join([x.strip() for x in html.strip().split('\n')])
363+
html = re.sub(r'\n{3,}', r'\n\n', html)
365364
return html
366365

367366

368367
def _scrape_merge_paragraphs(html):
369-
return re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)
368+
html = re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)
369+
return re.sub(r'<div .*>\s*</div>', '\n', html)
370370

371371

372372
def scrape_lyrics_from_html(html):
@@ -541,4 +541,4 @@ def get_lyrics(self, artist, title):
541541
if lyrics:
542542
log.debug(u'got lyrics from backend: {0}'
543543
.format(backend.__name__))
544-
return lyrics.strip()
544+
return _scrape_strip_cruft(lyrics, True)

docs/changelog.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ New:
3030

3131
Fixed:
3232

33+
* :doc:`/plugins/lyrics`: Avoid fetching truncated lyrics from the Google
34+
backend by merging text blocks separated by empty ``<div>`` before scraping.
3335
* Fix a new crash with the latest version of Mutagen (1.26).
3436
* We now print a better error message when the database file is corrupted.
3537
* :doc:`/plugins/discogs`: Only prompt for authentication when running the

test/rsrc/lyrics/examplecom/beetssong.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,9 @@ e9.size = "120x600, 160x600";
222222
<h2>John Doe <br> beets song lyrics</h2>
223223
<img src="images/phone-left.gif" alt="Ringtones left icon" width="16" height="17"> <a href="http://www.ringtonematcher.com/go/?sid=LBSMros&amp;artist=The+John Doe&amp;song=Beets+Song" target="_blank"><b><font size="+1" color="red" face="arial">Send "beets song" Ringtone to your Cell</font></b></a> <img src="images/phone-right.gif" alt="Ringtones right icon" width="16" height="17"><br><br><center>Beets is the media library management system for obsessive-compulsive music geeks.<br>
224224
The purpose of beets is to get your music collection right once and for all. It catalogs your collection, automatically improving its metadata as it goes. It then provides a bouquet of tools for manipulating and accessing your music.<br>
225-
Here's an example of beets' brainy tag corrector doing its thing:</center>
225+
<div class='flow breaker'> </div>
226+
Here's an example of beets' brainy tag corrector doing its thing:
227+
Because beets is designed as a library, it can do almost anything you can imagine for your music collection. Via plugins, beets becomes a panacea</center>
226228
<img src="images/phone-left.gif" alt="Ringtones left icon" width="16" height="17"> <a href="http://www.ringtonematcher.com/go/?sid=LBSMros&amp;artist=The+John Doe&amp;song=Beets+Song" target="_blank"><b><font size="+1" color="red" face="arial">Send "beets song" Ringtone to your Cell</font></b></a> <img src="images/phone-right.gif" alt="Ringtones right icon" width="16" height="17"><br><br>
227229
<center>
228230
<font color="black" size="2" face="arial">Share <strong>beets song lyrics</strong></font><br><p style="height: 1px; margin: 3pt; padding: 0pt;"></p>

test/rsrc/lyricstext.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
Beets_song:
2+
- geeks
3+
- bouquet
4+
- panacea
5+
16
Amsterdam:
27
- oriflammes
38
- fortune

test/test_lyrics.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,15 @@ def setUp(self):
300300
lyrics.LyricsPlugin()
301301
lyrics.fetch_url = MockFetchUrl()
302302

303+
def test_mocked_source_ok(self):
304+
"""Test that lyrics of the mocked page are correctly scraped"""
305+
url = self.source['url'] + self.source['path']
306+
if os.path.isfile(url_to_filename(url)):
307+
res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url))
308+
self.assertTrue(lyrics.is_lyrics(res), url)
309+
self.assertTrue(is_lyrics_content_ok(self.source['title'], res),
310+
url)
311+
303312
def test_google_sources_ok(self):
304313
"""Test if lyrics present on websites registered in beets google custom
305314
search engine are correctly scraped."""
@@ -319,12 +328,11 @@ def test_default_ok(self):
319328
for (fun, s) in zip([lyrics.fetch_lyricswiki,
320329
lyrics.fetch_lyricscom,
321330
lyrics.fetch_musixmatch], DEFAULT_SOURCES):
322-
if os.path.isfile(url_to_filename(
323-
s['url'] + s['path'])):
331+
url = s['url'] + s['path']
332+
if os.path.isfile(url_to_filename(url)):
324333
res = fun(s['artist'], s['title'])
325-
self.assertTrue(lyrics.is_lyrics(res))
326-
self.assertTrue(is_lyrics_content_ok(
327-
s['title'], res))
334+
self.assertTrue(lyrics.is_lyrics(res), url)
335+
self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
328336

329337
def test_is_page_candidate_exact_match(self):
330338
"""Test matching html page title with song infos -- when song infos are

0 commit comments

Comments
 (0)