Skip to content

Commit 514a5b1

Browse files
authored
Add files via upload
1 parent 453fdfd commit 514a5b1

File tree

9 files changed

+16099
-0
lines changed

9 files changed

+16099
-0
lines changed

Recommendation Models/Podcast Recommendation System/README.md

Lines changed: 318 additions & 0 deletions
Large diffs are not rendered by default.

Recommendation Models/Podcast Recommendation System/notebooks/get_podcast_ep_data.ipynb

Lines changed: 11889 additions & 0 deletions
Large diffs are not rendered by default.

Recommendation Models/Podcast Recommendation System/notebooks/nlp_podcast_analysis.ipynb

Lines changed: 2270 additions & 0 deletions
Large diffs are not rendered by default.

Recommendation Models/Podcast Recommendation System/notebooks/podcast_recommender.ipynb

Lines changed: 819 additions & 0 deletions
Large diffs are not rendered by default.

Recommendation Models/Podcast Recommendation System/notebooks/w2v_visualization.ipynb

Lines changed: 561 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import json
2+
import numpy as np
3+
import pandas as pd
4+
import re
5+
6+
7+
def main():
    """Filter the scraped podcast metadata down to English-titled podcasts.

    Reads ``json_files/podcasts_info.json``, keeps the eight metadata
    columns, drops every row with a missing value (the literal string
    ``'NA'`` counts as missing), keeps only rows whose title consists of
    ASCII letters and digits once spaces and punctuation are removed, and
    pickles the re-indexed result to ``pickle_files/english_podcasts.pkl``.
    """
    with open('json_files/podcasts_info.json', encoding='utf-8') as json_file:
        podcasts = json.load(json_file)

    columns = ['title', 'producer', 'genre', 'description', 'num_episodes',
               'rating', 'num_reviews', 'link']
    podcasts_df = pd.DataFrame(podcasts)[columns]
    podcasts_df = podcasts_df.replace('NA', np.nan).dropna()

    def has_english_title(title):
        # Spaces and punctuation are ignored; any remaining character outside
        # [A-Za-z0-9] (accented letters, CJK, and also "_", which \w keeps)
        # marks the title as non-English.
        squeezed = re.sub(r'[^\w\s]', '', title.replace(" ", ""))
        return bool(re.match("^[A-Za-z0-9]*$", squeezed))

    # An explicit boolean array (rather than a temporary column compared with
    # "== True") is always treated as a row mask, even when the frame is empty.
    is_english = np.array(
        [has_english_title(title) for title in podcasts_df['title']],
        dtype=bool,
    )
    podcasts_df = podcasts_df.loc[is_english].reset_index(drop=True)

    podcasts_df.to_pickle('pickle_files/english_podcasts.pkl')


if __name__ == "__main__":
    main()
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import pandas as pd
2+
import numpy as np
3+
import requests
4+
import json
5+
import re
6+
import unidecode
7+
import time
8+
from tqdm import tqdm
9+
from bs4 import BeautifulSoup
10+
from selenium import webdriver
11+
12+
13+
def clean_title(t):
    """Return a normalised version of an episode title.

    The title is transliterated to ASCII with ``unidecode``, newlines become
    spaces, every character that is neither a word character nor whitespace
    is removed, runs of digits are removed, and the result is lower-cased
    and stripped of surrounding whitespace.
    """
    ascii_text = unidecode.unidecode(t).replace('\n', ' ')
    without_punctuation = re.sub(r'[^\w\s]', '', ascii_text)
    without_digits = re.sub(r'\d+', '', without_punctuation)
    return without_digits.lower().strip()
21+
22+
23+
def clean_description(d):
    """Return a normalised version of an episode description.

    The text is transliterated to ASCII with ``unidecode``, newlines become
    spaces, punctuation and digits are removed and the text is lower-cased.
    A trailing sponsor blurb is then cut off: everything from the first
    "brought to you by" / "sponsored by" onwards is dropped, provided the
    phrase occurs somewhere preceded by a space (so a description that
    merely starts with the phrase is left alone, as before).

    Lower-casing happens *before* the sponsor check so that capitalised
    variants such as "Brought to you by" or "Sponsored by" are recognised;
    previously the lowercase patterns ran against the original casing and
    missed them.
    """
    d = unidecode.unidecode(d)
    d = d.replace('\n', ' ')
    d = re.sub(r'[^\w\s]', '', d)
    d = re.sub(r'\d+', '', d)
    d = d.lower()

    for marker in ('brought to you by', 'sponsored by'):
        # Same guard as the old "(.*) <marker>.*" search: the phrase must
        # appear after a space somewhere in the text.
        if ' ' + marker in d:
            # Keep only the text before the first occurrence of the phrase.
            d = d.partition(marker)[0]

    return d.strip()
35+
36+
37+
def get_recent_podcast_episodes(link):
    """Scrape the episode titles and descriptions listed on a podcast page.

    The page is rendered with a PhantomJS web driver, the first ``<script>``
    tag is located, and the JSON found between ``"workExample":`` and
    ``,"aggregateRating"`` in it is parsed as the list of episodes.

    Args:
        link: URL of the podcast page.

    Returns:
        A two-element list ``[episode_titles, episode_desc]``. Each element
        is one space-joined string of the cleaned titles / descriptions.
        If the episode data cannot be extracted or parsed, both elements
        are ``np.nan`` and a "Failed on" message is printed.
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get(link)
        html = driver.page_source.encode('utf-8')
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # PhantomJS process behind.
        driver.quit()

    soup = BeautifulSoup(html, 'lxml')
    text = str(soup.find('script'))

    try:
        text = text.split('"workExample":')[1].split(',"aggregateRating"')[0]
        episode_data = json.loads(text)

        titles = [clean_title(episode['name']) for episode in episode_data]
        descriptions = [clean_description(episode['description'])
                        for episode in episode_data]

        episode_titles = " ".join(titles).strip()
        episode_desc = " ".join(descriptions).strip()

    except Exception:
        # Both results are reset together so a failure never returns a
        # half-built titles string next to a NaN description.
        episode_titles = np.nan
        episode_desc = np.nan
        print("Failed on: " + str(link))

    return [episode_titles, episode_desc]
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
from bs4 import BeautifulSoup
2+
import requests
3+
import json
4+
5+
6+
def convert_si_to_number(x):
    """Convert an abbreviated count such as ``'1.2K'`` to an integer.

    Args:
        x: A string holding a plain integer (``'987'``) or a number with a
            ``K`` (thousand), ``M`` (million) or ``B`` (billion) suffix.

    Returns:
        The count as an ``int``. A bare suffix with no number (``'K'``,
        ``'M'``, ``'B'``) yields 0.

    Raises:
        ValueError: If the numeric part cannot be parsed.
    """
    multipliers = (('K', 1000), ('M', 1000000), ('B', 1000000000))

    for suffix, factor in multipliers:
        if suffix in x:
            digits = x.replace(suffix, '').strip()
            if not digits:
                return 0
            # Round instead of truncating: float('32.3') * 1000 evaluates to
            # 32299.999999999996, which int() alone would turn into 32299.
            return int(round(float(digits) * factor))

    return int(x)  # Less than 1000
19+
20+
21+
def build_podcast_object(link):
    """Fetch a podcast page and extract its metadata into a dict.

    Args:
        link: URL of the podcast page to download (10 second timeout).

    Returns:
        A dict with the keys ``title``, ``producer``, ``genre``,
        ``description``, ``num_episodes``, ``rating``, ``num_reviews`` and
        ``link``. Any field that cannot be found or parsed is the string
        ``'NA'``; if the request itself (or anything unexpected) fails,
        every field except ``link`` is ``'NA'``. The function does not
        raise.
    """
    missing = 'NA'

    def _safe(extract, default=missing):
        # A tag that is absent surfaces as AttributeError (attribute access
        # on None); text that is present but not numeric surfaces as
        # ValueError. Either way only this one field is given up, instead
        # of a ValueError discarding every field already parsed.
        try:
            return extract()
        except (AttributeError, ValueError):
            return default

    try:
        response = requests.get(link, timeout=10)
        content = BeautifulSoup(response.content, "lxml")

        title = _safe(lambda: content.find('h1').find('span').text.strip())

        # The producer is read from the link inside the heading, falling
        # back to the identity span when there is no such link.
        producer = _safe(lambda: content.find('h1').find('a').text.strip(),
                         default=None)
        if producer is None:
            producer = _safe(
                lambda: content.find('h1')
                .find('span', class_='product-header__identity podcast-header__identity')
                .text.strip())

        genre = _safe(
            lambda: content.find('li', class_="product-header__list__item").text.strip())

        description = _safe(
            lambda: content.find('div', class_="product-hero-desc product-hero-desc--side-bar")
            .text.strip())

        num_episodes = _safe(
            lambda: int(content.find('div', class_="product-artwork__caption small-hide medium-show")
                        .text.strip().strip('episodes')))

        rating = _safe(
            lambda: float(content.find('span', class_="we-customer-ratings__averages__display")
                          .text.strip()))

        num_reviews = _safe(
            lambda: convert_si_to_number(
                content.find('div', class_="we-customer-ratings__count small-hide medium-show")
                .text.strip().strip('Ratings')))
    except Exception:
        # Network errors and anything else unexpected: report the podcast
        # with no usable metadata rather than propagating the error.
        title = missing
        producer = missing
        genre = missing
        description = missing
        num_episodes = missing
        rating = missing
        num_reviews = missing

    podcast_object = {
        'title': title,
        'producer': producer,
        'genre': genre,
        'description': description,
        'num_episodes': num_episodes,
        'rating': rating,
        'num_reviews': num_reviews,
        'link': link
    }
    return podcast_object
80+
81+
82+
# Build one metadata object per link listed in podcast_links.json and write
# the collected objects to podcasts_info.json.
podcastObjects = list()
with open('podcast_links.json') as json_file:
    podcasts = json.load(json_file)

# enumerate keeps the progress counter in step with the links without
# incrementing it by hand in both the success and the failure branch.
for counter, podcast_link in enumerate(podcasts, start=1):
    try:
        podcast = build_podcast_object(podcast_link)
    except Exception:
        # str() so that a non-string entry in the links file cannot raise
        # a TypeError from inside the error handler itself.
        print('Failed on ' + str(podcast_link))
        continue
    podcastObjects.append(podcast)
    print('Completed Podcast: ' + str(counter))

with open('podcasts_info.json', 'w') as outfile:
    json.dump(podcastObjects, outfile)
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from bs4 import BeautifulSoup
2+
import requests
3+
import json
4+
5+
# Apple Podcasts genre pages to harvest podcast links from, keyed by a short
# category name.
CATEGORIES = {
    'arts': "https://podcasts.apple.com/us/genre/podcasts-arts/id1301",
    'business': "https://podcasts.apple.com/us/genre/podcasts-business/id1321",
    'comedy': "https://podcasts.apple.com/us/genre/podcasts-comedy/id1303",
    'education': "https://podcasts.apple.com/us/genre/podcasts-education/id1304",
    'fiction': "https://podcasts.apple.com/us/genre/podcasts-fiction/id1483",
    'government': "https://podcasts.apple.com/us/genre/podcasts-government/id1511",
    'health': "https://podcasts.apple.com/us/genre/podcasts-health-fitness/id1512",
    'history': "https://podcasts.apple.com/us/genre/podcasts-history/id1487",
    'kids_and_family': "https://podcasts.apple.com/us/genre/podcasts-kids-family/id1305",
    'leisure': "https://podcasts.apple.com/us/genre/podcasts-leisure/id1502",
    'music': "https://podcasts.apple.com/us/genre/podcasts-music/id1310",
    'news': "https://podcasts.apple.com/us/genre/podcasts-news/id1489",
    'religion_and_spirituality': "https://podcasts.apple.com/us/genre/podcasts-religion-spirituality/id1314",
    'science': "https://podcasts.apple.com/us/genre/podcasts-science/id1533",
    'society_and_culture': "https://podcasts.apple.com/us/genre/podcasts-society-culture/id1324",
    'sports': "https://podcasts.apple.com/us/genre/podcasts-sports/id1545",
    'tv_and_film': "https://podcasts.apple.com/us/genre/podcasts-tv-film/id1309",
    'technology': "https://podcasts.apple.com/us/genre/podcasts-technology/id1318",
    'true_crime': "https://podcasts.apple.com/us/genre/podcasts-true-crime/id1488"
}

all_podcast_links = list()

for category_url in CATEGORIES.values():
    response = requests.get(category_url, timeout=5)
    content = BeautifulSoup(response.content, "lxml")
    podcast_links = content.find('div', class_='grid3-column')

    if podcast_links is None:
        # find() returns None when the page has no such grid (error page,
        # changed layout); skip this category instead of crashing the run
        # and losing the links gathered so far.
        print('No podcast grid found on ' + category_url)
        continue

    # href=True skips anchors without an href, which would otherwise put
    # None entries into the output file.
    for link in podcast_links.find_all('a', href=True):
        all_podcast_links.append(link.get('href'))

# De-duplicate; sorting makes the written file reproducible between runs.
all_podcast_links = sorted(set(all_podcast_links))

with open('podcast_links.json', 'w') as outfile:
    json.dump(all_podcast_links, outfile)

0 commit comments

Comments
 (0)