|
| 1 | +import requests |
| 2 | +from bs4 import BeautifulSoup |
| 3 | +from sklearn.feature_extraction.text import TfidfVectorizer |
| 4 | +from sklearn.cluster import KMeans |
| 5 | +from sklearn.decomposition import PCA |
| 6 | +from sklearn.metrics import silhouette_score |
| 7 | +import matplotlib.pyplot as plt |
| 8 | + |
| 9 | + |
def scrape_imdb_top_movies(num_movies):
    """Scrape titles and plot summaries from the IMDb top-movies chart.

    Downloads the chart page, collects the title links found on it, then
    visits up to ``num_movies`` of the linked pages to read each plot summary.

    Args:
        num_movies: Maximum number of movies to fetch. When the chart page
            exposes fewer usable links than requested, only those are fetched.

    Returns:
        A ``(titles, summaries)`` tuple of two lists of equal length. A movie
        whose page has no plot element gets the placeholder
        "No summary available".

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the script forever.
    timeout_seconds = 30

    response = requests.get(
        'https://www.imdb.com/chart/top/?ref_=login',
        headers=headers,
        timeout=timeout_seconds,
    )
    soup = BeautifulSoup(response.content, 'html.parser')
    movie_links = soup.find_all('a', {"class": "ipc-title-link-wrapper"})

    hrefs = []
    movie_titles = []
    for movie in movie_links:
        text = movie.text
        href = movie.get("href")
        # Keep only links whose text starts with a digit (presumably the chart
        # rank -- confirm against the page markup). Slicing instead of indexing
        # keeps an empty link text from raising IndexError, and links without
        # an href are skipped because no URL can be built from them.
        if text[:1].isdigit() and href:
            movie_titles.append(text)
            hrefs.append(href)

    # Never index past what the page actually provided; a negative request is
    # treated as "fetch nothing".
    limit = max(0, min(num_movies, len(hrefs)))
    if limit < num_movies:
        print(f"Only {limit} of the requested {num_movies} movies were found on the chart page")

    summaries = []
    for title, href in zip(movie_titles[:limit], hrefs[:limit]):
        print(f"Fetching summary for: {title}")
        page = requests.get("https://www.imdb.com" + href, headers=headers, timeout=timeout_seconds)
        page_soup = BeautifulSoup(page.content, 'html.parser')
        # Look the plot element up once and reuse the result.
        plot = page_soup.find('span', {'data-testid': 'plot-l'})
        summaries.append(plot.text if plot is not None else "No summary available")

    return movie_titles[:limit], summaries
| 37 | + |
| 38 | + |
# Number of movies requested from the scraper.
num_movies = 250
movie_titles, summaries = scrape_imdb_top_movies(num_movies)

# Size every later step from what was actually scraped, not from what was
# requested, so the cluster counts below never exceed the sample count.
n_samples = len(summaries)

# One fixed seed for every randomized estimator, so the diagnostic curves and
# the final clustering are reproducible and refer to the same kind of fit.
RANDOM_STATE = 42

# Turn each summary into a TF-IDF vector, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(summaries)

# Project the dense TF-IDF matrix onto its first two principal components.
pca = PCA(n_components=2, random_state=RANDOM_STATE)
tfidf_pca = pca.fit_transform(tfidf_matrix.toarray())

# Candidate cluster counts: 2..9, capped at n_samples - 1 because the
# silhouette score needs fewer clusters than samples.
K = range(2, min(10, n_samples))

# Fit each candidate k once and record both diagnostics from that same model.
sum_of_squared_distances = []
silhouette_avg = []
for k in K:
    # n_init is pinned because its default differs between scikit-learn versions.
    km = KMeans(n_clusters=k, n_init=10, random_state=RANDOM_STATE).fit(tfidf_pca)
    sum_of_squared_distances.append(km.inertia_)
    silhouette_avg.append(silhouette_score(tfidf_pca, km.labels_))

# Elbow Method
plt.figure(figsize=(10, 7))
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method for Optimal k')
plt.show()

# Silhouette Method
plt.figure(figsize=(10, 7))
plt.plot(K, silhouette_avg, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal k')
plt.show()

# Choose the optimal number of clusters (hand-picked; clamped so it can never
# exceed the number of scraped samples).
optimal_k = min(5, n_samples)

# K-means Clustering with Optimal k
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=RANDOM_STATE)
kmeans.fit(tfidf_pca)
labels = kmeans.labels_

# Visualization: one scatter series per cluster in the 2-D PCA space.
plt.figure(figsize=(10, 7))
for i in range(optimal_k):
    points = tfidf_pca[labels == i]
    plt.scatter(points[:, 0], points[:, 1], label=f'Cluster {i}')

plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('K-means Clustering of IMDb Movie Summaries (after PCA)')
plt.legend()
plt.show()
0 commit comments