Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
275 changes: 275 additions & 0 deletions Recommendation Models/Podcast Recommendation System/README.md

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import json
import numpy as np
import pandas as pd
import re


def main():
    """Reduce the scraped podcast catalogue to complete, English-titled rows.

    Reads ``json_files/podcasts_info.json``, keeps the eight metadata
    columns, drops every row that contains the ``'NA'`` placeholder (or any
    other missing value), keeps only rows whose title -- ignoring spaces and
    punctuation -- consists solely of ASCII letters and digits, and pickles
    the result to ``pickle_files/english_podcasts.pkl`` with a fresh
    0..n-1 index.
    """
    with open('json_files/podcasts_info.json') as json_file:
        podcasts = json.load(json_file)

    columns = ['title', 'producer', 'genre', 'description', 'num_episodes',
               'rating', 'num_reviews', 'link']
    podcasts_df = pd.DataFrame(podcasts)[columns]

    # The scraper writes the string 'NA' for anything it could not find;
    # turn those into real NaN so dropna() removes the incomplete rows.
    podcasts_df = podcasts_df.replace('NA', np.nan).dropna()

    ascii_alnum = re.compile("^[A-Za-z0-9]*$")

    def looks_english(title):
        # Spaces and punctuation are ignored; any remaining character that
        # is not an ASCII letter or digit marks the title as non-English.
        squeezed = re.sub(r'[^\w\s]', '', title.replace(" ", ""))
        return bool(ascii_alnum.match(squeezed))

    # An explicit boolean array (rather than a plain list) keeps the
    # selection a row filter even when the frame is empty.
    english_mask = np.array(
        [looks_english(title) for title in podcasts_df['title']], dtype=bool)
    podcasts_df = podcasts_df.loc[english_mask].reset_index(drop=True)

    podcasts_df.to_pickle('pickle_files/english_podcasts.pkl')


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import pandas as pd
import numpy as np
import requests
import json
import re
import unidecode
import time
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver


def clean_title(t):
    """Normalise an episode title for text processing.

    The title is passed through ``unidecode``, newlines become spaces,
    every character that is neither a word character nor whitespace is
    removed, digit runs are removed, and the result is lower-cased and
    stripped of surrounding whitespace.
    """
    flattened = unidecode.unidecode(t).replace('\n', ' ')
    no_punctuation = re.sub(r'[^\w\s]', '', flattened)
    no_digits = re.sub(r'\d+', '', no_punctuation)
    return no_digits.lower().strip()


def clean_description(d):
    """Normalise an episode description and cut off trailing sponsor text.

    The description is passed through ``unidecode``, newlines become
    spaces, punctuation and digit runs are removed and the text is
    lower-cased. Everything from "brought to you by" / "sponsored by"
    onwards is then discarded, and surrounding whitespace is stripped.

    Lower-casing happens *before* the sponsor phrases are looked up, so
    capitalised variants such as "Sponsored by ..." are removed as well.
    """
    d = unidecode.unidecode(d)
    d = d.replace('\n', ' ')
    d = re.sub(r'[^\w\s]', '', d)
    d = re.sub(r'\d+', '', d)
    d = d.lower()

    for phrase in ('brought to you by', 'sponsored by'):
        # Only cut when the phrase occurs after a space somewhere in the
        # text (i.e. not solely at the very start), so a description that
        # merely opens with the phrase is not wiped out entirely.
        if ' ' + phrase in d:
            d = re.sub(phrase + r'.*', '', d)

    return d.strip()


def get_recent_podcast_episodes(link):
    """Collect cleaned titles and descriptions of a podcast's listed episodes.

    The page at ``link`` is rendered with a PhantomJS webdriver, the first
    ``<script>`` element is read, and the JSON array found between
    ``"workExample":`` and ``,"aggregateRating"`` is parsed as the episode
    list.

    Returns:
        ``[episode_titles, episode_desc]`` -- two space-joined strings built
        from ``clean_title`` / ``clean_description`` of every episode. When
        the episode data cannot be extracted or parsed, both entries are
        ``np.nan`` and a "Failed on" message is printed.
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get(link)
        html = driver.page_source.encode('utf-8')
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # PhantomJS process behind.
        driver.quit()

    soup = BeautifulSoup(html, 'lxml')
    text = str(soup.find('script'))

    titles = []
    descriptions = []
    try:
        text = text.split('"workExample":')[1].split(',"aggregateRating"')[0]
        episode_data = json.loads(text)

        for episode in episode_data:
            titles.append(clean_title(episode['name']))
            descriptions.append(clean_description(episode['description']))
    except Exception:
        # Best-effort scraping: report the page and signal "no data" for
        # BOTH fields so callers never see a half-built titles string.
        print("Failed on: " + str(link))
        return [np.nan, np.nan]

    episode_titles = " ".join(titles).strip()
    episode_desc = " ".join(descriptions).strip()
    return [episode_titles, episode_desc]
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from bs4 import BeautifulSoup
import requests
import json


def convert_si_to_number(x):
    """Convert an abbreviated count such as ``'1.2K'`` or ``'3M'`` to an int.

    Args:
        x: Text holding a number, optionally followed by ``K`` (thousand),
            ``M`` (million) or ``B`` (billion). Suffix case, surrounding
            whitespace and thousands separators (``','``) are ignored.

    Returns:
        The count as an ``int``. A bare suffix with no number (``'K'``,
        ``'M'``, ``'B'``) yields ``0``.

    Raises:
        ValueError: If the numeric part cannot be parsed.
    """
    multipliers = (('K', 1000), ('M', 1000000), ('B', 1000000000))
    text = x.strip().replace(',', '').upper()

    for suffix, factor in multipliers:
        if suffix in text:
            mantissa = text.replace(suffix, '').strip()
            if not mantissa:
                return 0
            # round() instead of truncation: 1.005 * 1000 is
            # 1004.9999999999999 in binary floating point.
            return int(round(float(mantissa) * factor))

    return int(text)  # Less than 1000


def _first_parsed(*getters):
    """Return the first getter result that can be extracted, else ``'NA'``."""
    for getter in getters:
        try:
            return getter()
        except (AttributeError, TypeError, ValueError):
            # AttributeError: element not found (``None.text``);
            # TypeError / ValueError: text present but not parsable.
            continue
    return 'NA'


def build_podcast_object(link):
    """Scrape one podcast page into a flat metadata dictionary.

    Args:
        link: URL of the podcast page to fetch.

    Returns:
        A dict with the keys ``title``, ``producer``, ``genre``,
        ``description``, ``num_episodes``, ``rating``, ``num_reviews`` and
        ``link``. Any field that cannot be found or parsed is the string
        ``'NA'``; if the page cannot be fetched at all, every field except
        ``link`` is ``'NA'``.
    """
    podcast_object = {
        'title': 'NA',
        'producer': 'NA',
        'genre': 'NA',
        'description': 'NA',
        'num_episodes': 'NA',
        'rating': 'NA',
        'num_reviews': 'NA',
        'link': link
    }

    try:
        response = requests.get(link, timeout=10)
        content = BeautifulSoup(response.content, "lxml")
    except Exception:
        # Best-effort scraper: an unreachable page yields an all-'NA' record.
        return podcast_object

    # Each field is extracted independently so that one unparsable value
    # (e.g. a non-numeric episode count) no longer discards the others.
    podcast_object['title'] = _first_parsed(
        lambda: content.find('h1').find('span').text.strip())
    podcast_object['producer'] = _first_parsed(
        lambda: content.find('h1').find('a').text.strip(),
        lambda: content.find('h1').find(
            'span', class_='product-header__identity podcast-header__identity'
        ).text.strip())
    podcast_object['genre'] = _first_parsed(
        lambda: content.find('li', class_="product-header__list__item").text.strip())
    podcast_object['description'] = _first_parsed(
        lambda: content.find(
            'div', class_="product-hero-desc product-hero-desc--side-bar"
        ).text.strip())
    # NOTE: str.strip('episodes') removes any of those *characters* from both
    # ends, which is enough to peel the trailing word off "<n> episodes".
    podcast_object['num_episodes'] = _first_parsed(
        lambda: int(content.find(
            'div', class_="product-artwork__caption small-hide medium-show"
        ).text.strip().strip('episodes')))
    podcast_object['rating'] = _first_parsed(
        lambda: float(content.find(
            'span', class_="we-customer-ratings__averages__display"
        ).text.strip()))
    podcast_object['num_reviews'] = _first_parsed(
        lambda: convert_si_to_number(content.find(
            'div', class_="we-customer-ratings__count small-hide medium-show"
        ).text.strip().strip('Ratings')))

    return podcast_object


# Driver: scrape every link listed in podcast_links.json and store the
# resulting metadata records in podcasts_info.json.
counter = 1
podcastObjects = []

with open('podcast_links.json') as links_file:
    podcasts = json.load(links_file)

for podcast_link in podcasts:
    try:
        podcast = build_podcast_object(podcast_link)
        podcastObjects.append(podcast)
        print('Completed Podcast: ' + str(counter))
    except Exception:
        print('Failed on ' + podcast_link)
    # The running number advances whether or not the link succeeded.
    counter += 1

with open('podcasts_info.json', 'w') as info_file:
    json.dump(podcastObjects, info_file)
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from bs4 import BeautifulSoup
import requests
import json

# Apple Podcasts genre landing pages, keyed by a short category name.
# Every URL has the form .../genre/podcasts-<slug>/id<genre id>.
_GENRE_URL_TEMPLATE = "https://podcasts.apple.com/us/genre/podcasts-{slug}/id{genre_id}"

_GENRES = (
    # (category key, URL slug, genre id)
    ('arts', 'arts', 1301),
    ('business', 'business', 1321),
    ('comedy', 'comedy', 1303),
    ('education', 'education', 1304),
    ('fiction', 'fiction', 1483),
    ('government', 'government', 1511),
    ('health', 'health-fitness', 1512),
    ('history', 'history', 1487),
    ('kids_and_family', 'kids-family', 1305),
    ('leisure', 'leisure', 1502),
    ('music', 'music', 1310),
    ('news', 'news', 1489),
    ('religion_and_spirituality', 'religion-spirituality', 1314),
    ('science', 'science', 1533),
    ('society_and_culture', 'society-culture', 1324),
    ('sports', 'sports', 1545),
    ('tv_and_film', 'tv-film', 1309),
    ('technology', 'technology', 1318),
    ('true_crime', 'true-crime', 1488),
)

CATEGORIES = {
    key: _GENRE_URL_TEMPLATE.format(slug=slug, genre_id=genre_id)
    for key, slug, genre_id in _GENRES
}

# Collect the podcast links shown on every genre page in CATEGORIES and
# write the de-duplicated result to podcast_links.json.
all_podcast_links = set()

for category_url in CATEGORIES.values():
    response = requests.get(category_url, timeout=5)
    content = BeautifulSoup(response.content, "lxml")
    podcast_links = content.find('div', class_='grid3-column')

    if podcast_links is None:
        # Page layout changed or the request was served an error page;
        # skip this genre instead of crashing on ``None.find_all``.
        print('No podcast grid found on ' + category_url)
        continue

    # href=True skips anchors without a target, which would otherwise end
    # up as ``null`` entries in the output file.
    for link in podcast_links.find_all('a', href=True):
        all_podcast_links.add(link['href'])

# Sorted so that repeated runs produce the same file for the same pages.
all_podcast_links = sorted(all_podcast_links)

with open('podcast_links.json', 'w') as outfile:
    json.dump(all_podcast_links, outfile)
Loading