|
| 1 | +# main.py |
| 2 | + |
| 3 | +import requests |
| 4 | +from sklearn.feature_extraction.text import TfidfVectorizer |
| 5 | +from sklearn.cluster import KMeans |
| 6 | +from sklearn.decomposition import PCA |
| 7 | +from sklearn.metrics import silhouette_score |
| 8 | +import matplotlib.pyplot as plt |
| 9 | +from scraper import scrape_imdb_top_movies # Import the scraping function |
| 10 | + |
# Scrape IMDb for summaries and titles of 250 top-rated movies
num_movies = 250
movie_titles, summaries = scrape_imdb_top_movies(num_movies)

# Fail fast with a clear message if scraping came back empty: an empty
# corpus would otherwise surface later as an opaque TfidfVectorizer error.
if not summaries:
    raise RuntimeError('scrape_imdb_top_movies returned no summaries; '
                       'check the scraper and the network connection')
| 14 | + |
# TF-IDF Vectorization: each movie summary becomes one sparse row of
# term weights, with English stop words discarded.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(summaries)

# Dimensionality Reduction with PCA: project the high-dimensional TF-IDF
# rows onto two principal components so clustering can be visualized in
# the plane. PCA requires a dense array, hence the toarray() conversion.
pca = PCA(n_components=2)
dense_tfidf = tfidf_matrix.toarray()
tfidf_pca = pca.fit_transform(dense_tfidf)
| 22 | + |
# Finding the Optimal Number of Clusters

# Elbow Method: fit k-means for a range of k and record the inertia
# (sum of squared distances of samples to their nearest centroid).
# The "elbow" in the resulting curve suggests a good k.
sum_of_squared_distances = []
# k must be strictly less than the number of samples; cap the search at 9.
K = range(2, min(10, num_movies))
for k in K:
    # random_state pins centroid seeding so the plot is reproducible;
    # n_init=10 (10 restarts, best run kept) is made explicit to avoid
    # the sklearn >= 1.2 default-change FutureWarning.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km = km.fit(tfidf_pca)
    sum_of_squared_distances.append(km.inertia_)

plt.figure(figsize=(10, 7))
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method for Optimal k')
plt.show()
| 39 | + |
# Silhouette Score: mean silhouette coefficient over all samples for each
# candidate k (range [-1, 1]; higher means tighter, better-separated clusters).
silhouette_avg = []
# Computed once and reused for both the loop and the x-axis below;
# k must stay below the number of samples, capped at 9.
k_candidates = range(2, min(10, num_movies))
for k in k_candidates:
    # random_state fixed for reproducible scores; n_init=10 made explicit
    # to avoid the sklearn >= 1.2 default-change FutureWarning.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(tfidf_pca)
    labels = kmeans.labels_
    silhouette_avg.append(silhouette_score(tfidf_pca, labels))

plt.figure(figsize=(10, 7))
plt.plot(k_candidates, silhouette_avg, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal k')
plt.show()
| 54 | + |
# Choose the optimal number of clusters
# TODO: set this from the elbow/silhouette plots rather than hard-coding it.
optimal_k = 5  # Example value; replace with the best k determined from the plots

# K-means Clustering with Optimal k
# random_state pins centroid seeding so the final labels (and the plot)
# are reproducible; n_init=10 is the classic default made explicit to
# avoid the sklearn >= 1.2 default-change FutureWarning.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
kmeans.fit(tfidf_pca)
labels = kmeans.labels_

# Visualization: scatter the 2-D PCA points, one colour per cluster.
plt.figure(figsize=(10, 7))
for i in range(optimal_k):
    points = tfidf_pca[labels == i]  # boolean mask selects cluster i's rows
    plt.scatter(points[:, 0], points[:, 1], label=f'Cluster {i}')

plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('K-means Clustering of IMDb Movie Summaries (after PCA)')
plt.legend()
plt.show()
0 commit comments