# First attempt to get lyrics from Genius
import os
import re
import time
import hashlib
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
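
# Third-party dependencies (assumed pip package names): selenium, beautifulsoup4, webdriver-manager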

# === CONFIG ===
ALBUM_URLS = [
    # Put Genius album URLs here, one string per album
]
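# ALBUM_URLS entries are expected to be full Genius album pages, e.g. (hypothetical):
#   "https://genius.com/albums/Some-artist/Some-album"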

LINK_PATTERN = re.compile(r"https://genius\.com/[^\"?#]+-lyrics")

# === SETUP ===
options = Options()
options.add_argument("--headless")
service = Service(GeckoDriverManager().install())
driver = webdriver.Firefox(service=service, options=options)

# === FUNCTIONS ===
def get_next_folder_name(base_name="lyrics"):
    """Return the first folder name of the form lyrics1, lyrics2, ... that does not exist yet."""
    counter = 1
    while os.path.exists(f"{base_name}{counter}"):
        counter += 1
    return f"{base_name}{counter}"

def short_hash(s):
    """Short, stable fingerprint used to keep output filenames unique per album slug."""
    return hashlib.md5(s.encode("utf-8")).hexdigest()[:4]

def extract_album_year(soup):
    """Scan the page for the first 19xx/20xx year; fall back to a placeholder."""
    for span in soup.find_all("span"):
        text = span.get_text(strip=True)
        match = re.search(r"(19|20)\d{2}", text)
        if match:
            return match.group(0)
    return "____"

def get_song_links(album_url):
    print(f"Loading album page: {album_url}")
    driver.get(album_url)
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    year = extract_album_year(soup)
    links = []
    # Only find links inside the tracklist (chart_row-content)
    track_links = soup.select("div.chart_row-content a[href]")
    for a in track_links:
        href = a["href"]
        if LINK_PATTERN.match(href):
            links.append(href.strip())
    # Deduplicate and preserve order
    seen = set()
    ordered_links = []
    for i, href in enumerate(links):
        if href not in seen:
            seen.add(href)
            ordered_links.append((i + 1, href, year))
    return ordered_links

def extract_lyrics(url):
    print(f"Visiting: {url}")
    driver.get(url)
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    lyrics_blocks = soup.find_all("div", {"data-lyrics-container": "true"})
    if not lyrics_blocks:
        return None
    # Collect lines while filtering out non-lyrics content
    lines = []
    for block in lyrics_blocks:
        for line in block.stripped_strings:
            # Skip common non-lyrics patterns
            if "Contributors" in line or "Read More" in line or "Lyrics" in line:
                continue
            lines.append(line)
    # DEBUG (optional)
    print("RAW LINES:", lines)
    # Filter out any text before first [Verse], [Hook], etc.
    for idx, line in enumerate(lines):
        if line.startswith('['):  # First verse/hook found
            lines = lines[idx:]
            break
    lyrics_text = "\n".join(lines)
    return lyrics_text.strip()

def save_lyrics(title, lyrics, album_name, year, output_file):
    safe_title = re.sub(r'[\\/*?:"<>|]', "_", title)
    heading = f"{{{safe_title} - {album_name} - {year}}}"
    with open(output_file, "a", encoding="utf-8") as f:
        f.write(f"{heading}\n\n{lyrics}\n\n{'=' * 50}\n\n")

def generate_unique_filename(base_name):
    if not os.path.exists(base_name):
        return base_name
    name, ext = os.path.splitext(base_name)
    counter = 1
    while os.path.exists(f"{name}_{counter}{ext}"):
        counter += 1
    return f"{name}_{counter}{ext}"

def main():
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, get_next_folder_name())
    os.makedirs(output_dir, exist_ok=True)
    for url in ALBUM_URLS:
        album_slug = url.rstrip("/").split("/")[-1]
        album_name = album_slug.replace("-", " ").lower()
        # Save each album's lyrics inside this new folder
        base_output_file = os.path.join(output_dir, f"{album_slug[:10].lower()}_{short_hash(album_slug)}.txt")
        output_file = generate_unique_filename(base_output_file)
        print(f"\n== Scraping album: {album_name} ==")
        links = get_song_links(url)
        print(f"Found {len(links)} songs.")
        if not links:
            print(f"⚠️ No songs found for: {album_name}")
            continue
        for i, (track_num, link, year) in enumerate(links, 1):
            lyrics = extract_lyrics(link)
            if lyrics:
                title = link.split("/")[-1].replace("-lyrics", "").replace("-", " ").title()
                save_lyrics(title, lyrics, album_name, year, output_file)
                print(f"[{i}/{len(links)}] ✓ Added: {title}")
            else:
                print(f"[{i}/{len(links)}] ✗ No lyrics found.")
    # Close the browser once all albums have been processed
    driver.quit()

# === RUN ===
if __name__ == "__main__":
    main()
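
# Usage: add Genius album URLs to ALBUM_URLS, then run this script directly
# (e.g. `python genius_multi_album_extractor.py`). Each run creates a fresh
# lyricsN folder next to the script, with one text file of concatenated lyrics per album.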