# main.py
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt


def scrape_imdb_top_movies(num_movies):
    """Scrape IMDb's Top 250 chart for movie titles and plot summaries."""
    # A browser-like User-Agent keeps IMDb from rejecting the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    response = requests.get('https://www.imdb.com/chart/top/?ref_=login', headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    movie_links = soup.find_all('a', {"class": "ipc-title-link-wrapper"})

    # Chart entries read like "1. The Shawshank Redemption"; the leading rank
    # digit filters out unrelated links that share the same class.
    hrefs = []
    movie_titles = []
    for movie in movie_links:
        text = movie.text
        if text[0].isdigit():
            movie_titles.append(text)
            hrefs.append(movie.get("href"))

    # Visit each movie page and pull its plot summary.
    count = min(num_movies, len(hrefs))  # guard against a short chart page
    summaries = []
    for index in range(count):
        url = "https://www.imdb.com" + hrefs[index]
        print(f"Fetching summary for: {movie_titles[index]}")
        r = requests.get(url, headers=headers)
        url_soup = BeautifulSoup(r.content, 'html.parser')
        plot_span = url_soup.find('span', {'data-testid': 'plot-l'})
        summary = plot_span.text if plot_span else "No summary available"
        summaries.append(summary)

    return movie_titles[:count], summaries
num_movies = 250
movie_titles, summaries = scrape_imdb_top_movies(num_movies)


# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(summaries)


# Dimensionality Reduction with PCA (2 components, for plotting and clustering)
pca = PCA(n_components=2)
tfidf_pca = pca.fit_transform(tfidf_matrix.toarray())


# Finding the Optimal Number of Clusters

# Elbow Method
sum_of_squared_distances = []
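# NOTE: the source collapses the unchanged loop body here; what follows is a
# minimal sketch, assuming the conventional inertia sweep over k on the
# PCA-reduced matrix, consistent with the plot labels below.
K = range(2, min(10, num_movies))
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(tfidf_pca)
    sum_of_squared_distances.append(kmeans.inertia_)
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')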
plt.title('Elbow Method for Optimal k')
plt.show()


# Silhouette Score
silhouette_avg = []
for k in range(2, min(10, num_movies)):  # silhouette needs 2 <= k < number of samples
    kmeans = KMeans(n_clusters=k)
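    # NOTE: the source collapses the rest of this loop and its plotting code;
    # a minimal sketch, assuming labels are scored on the same PCA-reduced matrix.
    kmeans.fit(tfidf_pca)
    silhouette_avg.append(silhouette_score(tfidf_pca, kmeans.labels_))
plt.plot(range(2, min(10, num_movies)), silhouette_avg, 'bx-')
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.title('Silhouette Score for Optimal k')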
plt.show()


# Choose the optimal number of clusters
optimal_k = 5  # Example value; replace with the best k read from the plots above

# K-means Clustering with Optimal k
kmeans = KMeans(n_clusters=optimal_k)
kmeans.fit(tfidf_pca)
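
# Illustrative follow-up (a sketch, not part of the original script): group
# the scraped titles by their assigned cluster to eyeball the result.
clusters = {}
for title, label in zip(movie_titles, kmeans.labels_):
    clusters.setdefault(label, []).append(title)
for label, titles in sorted(clusters.items()):
    print(f"Cluster {label}: {titles[:5]}")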