recodehive · sanjay-kv · Oct 23, 2024 · Oct 22, 2024 · Oct 22, 2024 · Oct 22, 2024
diff --git a/Recommendation Models/Podcast Recommendation System/README.md b/Recommendation Models/Podcast Recommendation System/README.md
diff --git a/Recommendation Models/Podcast Recommendation System/notebooks/get_podcast_ep_data.ipynb b/Recommendation Models/Podcast Recommendation System/notebooks/get_podcast_ep_data.ipynb
diff --git a/Recommendation Models/Podcast Recommendation System/notebooks/nlp_podcast_analysis.ipynb b/Recommendation Models/Podcast Recommendation System/notebooks/nlp_podcast_analysis.ipynb
diff --git a/Recommendation Models/Podcast Recommendation System/notebooks/podcast_recommender.ipynb b/Recommendation Models/Podcast Recommendation System/notebooks/podcast_recommender.ipynb
diff --git a/Recommendation Models/Podcast Recommendation System/notebooks/w2v_visualization.ipynb b/Recommendation Models/Podcast Recommendation System/notebooks/w2v_visualization.ipynb
diff --git a/Recommendation Models/Podcast Recommendation System/scripts/clean_podcast_df.py b/Recommendation Models/Podcast Recommendation System/scripts/clean_podcast_df.py
@@ -0,0 +1,34 @@
+import json
+import numpy as np
+import pandas as pd
+import re
+
+
+def main():
+    """Build the English-only podcast table from the scraped metadata.
+
+    Reads ``json_files/podcasts_info.json``, keeps the eight metadata
+    columns, drops every row that contains an ``'NA'`` placeholder, keeps
+    only rows whose title passes the ASCII-title heuristic below, and
+    pickles the re-indexed result to ``pickle_files/english_podcasts.pkl``.
+    """
+    with open('json_files/podcasts_info.json') as json_file:
+        podcasts = json.load(json_file)
+
+    podcasts_df = pd.DataFrame(podcasts)
+
+    podcasts_df = podcasts_df[['title', 'producer', 'genre', 'description', 'num_episodes',
+                               'rating', 'num_reviews', 'link']]
+    # 'NA' marks a missing value in the input; convert it to NaN so that
+    # dropna() removes any row with an incomplete field.
+    podcasts_df = podcasts_df.replace('NA', np.nan)
+    podcasts_df = podcasts_df.dropna()
+
+    # Heuristic language filter: once spaces and punctuation are removed, an
+    # "English" title consists solely of ASCII letters and digits.
+    is_english = pd.Series(
+        [
+            bool(re.match("^[A-Za-z0-9]*$", re.sub(r'[^\w\s]', '', title.replace(" ", ""))))
+            for title in podcasts_df['title']
+        ],
+        index=podcasts_df.index,
+        dtype=bool,
+    )
+    # Filter with the mask directly (no temporary column) and re-index once.
+    podcasts_df = podcasts_df[is_english].reset_index(drop=True)
+
+    podcasts_df.to_pickle('pickle_files/english_podcasts.pkl')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/Recommendation Models/Podcast Recommendation System/scripts/get_podcast_ep_data.py b/Recommendation Models/Podcast Recommendation System/scripts/get_podcast_ep_data.py
@@ -0,0 +1,70 @@
+import pandas as pd
+import numpy as np
+import requests
+import json
+import re
+import unidecode
+import time
+from tqdm import tqdm
+from bs4 import BeautifulSoup
+from selenium import webdriver
+
+
+def clean_title(t):
+    """Return an episode title reduced to lowercase ASCII words.
+
+    The title is transliterated with ``unidecode``, newlines become spaces,
+    punctuation and digit runs are deleted, and the result is lowercased
+    and stripped of surrounding whitespace.
+    """
+    normalized = unidecode.unidecode(t).replace('\n', ' ')
+    # Same two deletions as before, in the same order: punctuation, then digits.
+    for pattern in (r'[^\w\s]', r'\d+'):
+        normalized = re.sub(pattern, '', normalized)
+    return normalized.lower().strip()
+
+
+def clean_description(d):
+    """Normalize an episode description and drop trailing sponsor copy.
+
+    The text is transliterated to ASCII with ``unidecode``, newlines become
+    spaces, punctuation and digit runs are deleted, and the text is
+    lowercased. If " brought to you by" or " sponsored by" occurs in the
+    text, everything from the first occurrence of that phrase to the end is
+    discarded.
+
+    Args:
+        d: Raw description string.
+
+    Returns:
+        The cleaned, lowercased description with surrounding whitespace
+        removed.
+    """
+    d = unidecode.unidecode(d)
+    d = d.replace('\n', ' ')
+    d = re.sub(r'[^\w\s]', '', d)
+    d = re.sub(r'\d+', '', d)
+    # Lowercase BEFORE looking for sponsor phrases; otherwise capitalised
+    # forms such as "Sponsored by ..." or "Brought To You By ..." survive.
+    d = d.lower()
+    for phrase in ('brought to you by', 'sponsored by'):
+        # Only cut when the phrase is preceded by a space somewhere in the
+        # text; the cut itself starts at the first occurrence of the phrase.
+        if ' ' + phrase in d:
+            d = d[:d.index(phrase)]
+    return d.strip()
+
+
+def get_recent_podcast_episodes(link):
+    """Fetch a podcast page and return its cleaned episode titles and descriptions.
+
+    The page is rendered with a Selenium driver, the first ``<script>`` tag
+    is taken, and the JSON between ``"workExample":`` and
+    ``,"aggregateRating"`` is parsed as a list of episodes. Each episode's
+    ``name`` and ``description`` are cleaned and concatenated with single
+    spaces.
+
+    Args:
+        link: URL of the podcast page.
+
+    Returns:
+        A two-element list ``[episode_titles, episode_desc]``. Both entries
+        are strings on success and both are ``np.nan`` when the episode data
+        cannot be extracted or parsed.
+    """
+    # NOTE(review): PhantomJS support was removed in Selenium 4; this call
+    # only works with an older Selenium release - confirm the pinned version.
+    driver = webdriver.PhantomJS()
+    try:
+        driver.get(link)
+        html = driver.page_source.encode('utf-8')
+    finally:
+        # Always shut the browser process down, even if the page load fails.
+        driver.quit()
+
+    soup = BeautifulSoup(html, 'lxml')
+    text = str(soup.find('script'))
+
+    try:
+        text = text.split('"workExample":')[1].split(',"aggregateRating"')[0]
+        episode_data = json.loads(text)
+
+        titles = []
+        descriptions = []
+        for episode in episode_data:
+            titles.append(clean_title(episode['name']))
+            descriptions.append(clean_description(episode['description']))
+
+        episode_titles = " ".join(titles).strip()
+        episode_desc = " ".join(descriptions).strip()
+
+    except Exception:
+        # Best-effort scrape: report the failure and return NaN for BOTH
+        # fields (the original assigned a misspelled name for the titles).
+        episode_titles = np.nan
+        episode_desc = np.nan
+        print("Failed on: " + str(link))
+
+    return [episode_titles, episode_desc]
diff --git a/Recommendation Models/Podcast Recommendation System/scripts/get_podcast_info.py b/Recommendation Models/Podcast Recommendation System/scripts/get_podcast_info.py
@@ -0,0 +1,98 @@
+from bs4 import BeautifulSoup
+import requests
+import json
+
+
+def convert_si_to_number(x):
+    """Convert a count string such as ``'1.2K'``, ``'3M'`` or ``'45'`` to an int.
+
+    Args:
+        x: Count text. May carry a ``K`` (thousand), ``M`` (million) or
+            ``B`` (billion) suffix, thousands separators, and surrounding
+            whitespace.
+
+    Returns:
+        The count as an ``int``. A bare suffix with no number (e.g. ``'K'``)
+        yields ``0``.
+
+    Raises:
+        ValueError: If the numeric part cannot be parsed.
+    """
+    x = x.replace(',', '').strip()
+    # Checked in the same order as before: K, then M, then B.
+    for suffix, factor in (('K', 1000), ('M', 1000000), ('B', 1000000000)):
+        if suffix in x:
+            mantissa = x.replace(suffix, '').strip()
+            if not mantissa:
+                return 0
+            # round(), not int(): float products can land just below the
+            # intended integer and int() would truncate them.
+            return round(float(mantissa) * factor)
+    return int(x)  # Less than 1000
+
+
+def _extract_field(getter, default='NA'):
+    """Return ``getter()``, or ``default`` if the element is missing or unparsable.
+
+    ``AttributeError`` covers a ``find()`` that returned ``None``;
+    ``ValueError`` covers text that ``int()`` / ``float()`` cannot convert.
+    """
+    try:
+        return getter()
+    except (AttributeError, ValueError):
+        return default
+
+
+def build_podcast_object(link):
+    """Scrape one podcast page into a metadata dict.
+
+    Args:
+        link: URL of the podcast page.
+
+    Returns:
+        A dict with the keys ``title``, ``producer``, ``genre``,
+        ``description``, ``num_episodes``, ``rating``, ``num_reviews`` and
+        ``link``. A field that cannot be found or parsed is the string
+        ``'NA'``; if the request or page parsing fails altogether, every
+        field except ``link`` is ``'NA'``.
+    """
+    podcast_object = dict.fromkeys(
+        ('title', 'producer', 'genre', 'description', 'num_episodes', 'rating', 'num_reviews'),
+        'NA',
+    )
+    podcast_object['link'] = link
+
+    try:
+        response = requests.get(link, timeout=10)
+        content = BeautifulSoup(response.content, "lxml")
+
+        # The producer is either a link inside the <h1> or, failing that,
+        # a span with the identity classes.
+        producer = _extract_field(lambda: content.find('h1').find('a').text.strip(), default=None)
+        if producer is None:
+            producer = _extract_field(
+                lambda: content.find('h1')
+                .find('span', class_='product-header__identity podcast-header__identity')
+                .text.strip())
+
+        scraped = {
+            'title': _extract_field(lambda: content.find('h1').find('span').text.strip()),
+            'producer': producer,
+            'genre': _extract_field(
+                lambda: content.find('li', class_="product-header__list__item").text.strip()),
+            'description': _extract_field(
+                lambda: content.find('div', class_="product-hero-desc product-hero-desc--side-bar")
+                .text.strip()),
+            # NOTE: str.strip('episodes') removes any of those characters
+            # from both ends, which leaves the number for "<n> episodes".
+            'num_episodes': _extract_field(
+                lambda: int(content.find('div', class_="product-artwork__caption small-hide medium-show")
+                            .text.strip().strip('episodes'))),
+            'rating': _extract_field(
+                lambda: float(content.find('span', class_="we-customer-ratings__averages__display")
+                              .text.strip())),
+            'num_reviews': _extract_field(
+                lambda: convert_si_to_number(
+                    content.find('div', class_="we-customer-ratings__count small-hide medium-show")
+                    .text.strip().strip('Ratings'))),
+        }
+    except Exception:
+        # Best-effort scrape: any request/parsing failure yields an all-'NA'
+        # record rather than aborting the whole run.
+        return podcast_object
+
+    podcast_object.update(scraped)
+    return podcast_object
+
+
+# Scrape every podcast page listed in podcast_links.json and write the
+# collected metadata dicts to podcasts_info.json.
+with open('podcast_links.json') as json_file:
+    # Load the links up front so the file is not held open while scraping.
+    podcasts = json.load(json_file)
+
+podcastObjects = list()
+for counter, podcast_link in enumerate(podcasts, start=1):
+    try:
+        podcastObjects.append(build_podcast_object(podcast_link))
+        print('Completed Podcast: ' + str(counter))
+    except Exception:
+        # str() keeps the report itself from failing on a non-string link.
+        print('Failed on ' + str(podcast_link))
+
+with open('podcasts_info.json', 'w') as outfile:
+    json.dump(podcastObjects, outfile)
diff --git a/Recommendation Models/Podcast Recommendation System/scripts/podcast_link_scraper.py b/Recommendation Models/Podcast Recommendation System/scripts/podcast_link_scraper.py
@@ -0,0 +1,40 @@
+from bs4 import BeautifulSoup
+import requests
+import json
+
+# Apple Podcasts (US storefront) genre listing pages, keyed by a short
+# category name. Each URL is fetched and scanned for podcast links.
+CATEGORIES = {
+    'arts': "https://podcasts.apple.com/us/genre/podcasts-arts/id1301",
+    'business': "https://podcasts.apple.com/us/genre/podcasts-business/id1321",
+    'comedy': "https://podcasts.apple.com/us/genre/podcasts-comedy/id1303",
+    'education': "https://podcasts.apple.com/us/genre/podcasts-education/id1304",
+    'fiction': "https://podcasts.apple.com/us/genre/podcasts-fiction/id1483",
+    'government': "https://podcasts.apple.com/us/genre/podcasts-government/id1511",
+    'health': "https://podcasts.apple.com/us/genre/podcasts-health-fitness/id1512",
+    'history': "https://podcasts.apple.com/us/genre/podcasts-history/id1487",
+    'kids_and_family': "https://podcasts.apple.com/us/genre/podcasts-kids-family/id1305",
+    'leisure': "https://podcasts.apple.com/us/genre/podcasts-leisure/id1502",
+    'music': "https://podcasts.apple.com/us/genre/podcasts-music/id1310",
+    'news': "https://podcasts.apple.com/us/genre/podcasts-news/id1489",
+    'religion_and_spirituality': "https://podcasts.apple.com/us/genre/podcasts-religion-spirituality/id1314",
+    'science': "https://podcasts.apple.com/us/genre/podcasts-science/id1533",
+    'society_and_culture': "https://podcasts.apple.com/us/genre/podcasts-society-culture/id1324",
+    'sports': "https://podcasts.apple.com/us/genre/podcasts-sports/id1545",
+    'tv_and_film': "https://podcasts.apple.com/us/genre/podcasts-tv-film/id1309",
+    'technology': "https://podcasts.apple.com/us/genre/podcasts-technology/id1318",
+    'true_crime': "https://podcasts.apple.com/us/genre/podcasts-true-crime/id1488"
+}
+
+# Collect the podcast links from every category page, de-duplicate them and
+# write them to podcast_links.json.
+all_podcast_links = list()
+
+for category_url in CATEGORIES.values():
+    try:
+        response = requests.get(category_url, timeout=5)
+    except requests.RequestException:
+        # One unreachable category should not discard the links already
+        # gathered from the others.
+        print('Failed on ' + category_url)
+        continue
+
+    content = BeautifulSoup(response.content, "lxml")
+    podcast_links = content.find('div', class_='grid3-column')
+    if podcast_links is None:
+        # find() returns None when the expected container is absent.
+        print('No podcast grid found on ' + category_url)
+        continue
+
+    # href=True skips anchors without an href, which would otherwise be
+    # stored as None.
+    for link in podcast_links.find_all('a', href=True):
+        all_podcast_links.append(link['href'])
+
+# Sorted so that repeated runs over the same pages produce the same file.
+all_podcast_links = sorted(set(all_podcast_links))
+
+with open('podcast_links.json', 'w') as outfile:
+    json.dump(all_podcast_links, outfile)