from fastapi import FastAPI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import string
import numpy as np

# Download the NLTK data used by the preprocessing step
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

app = FastAPI()

# Preprocessing functions
def preprocess_text(text):
    # Tokenize into sentences
    sentences = sent_tokenize(text)

    # Remove punctuation and convert to lowercase
    translator = str.maketrans("", "", string.punctuation)
    sentences = [sentence.translate(translator).lower() for sentence in sentences]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    sentences = [[lemmatizer.lemmatize(word) for word in sentence.split()] for sentence in sentences]

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    sentences = [[word for word in sentence if word not in stop_words] for sentence in sentences]

    # Convert sentences back to text
    preprocessed_text = [" ".join(sentence) for sentence in sentences]
    return preprocessed_text

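# Example (illustrative sketch, not part of the commit): for an input such as
# "Cats are running. Dogs bark loudly!", preprocess_text would return roughly
# ["cat running", "dog bark loudly"]: punctuation stripped, words lowercased
# and lemmatized, stopwords removed.
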
# API route for extracting topic-wise chunks
@app.post("/extract_chunks")
def extract_chunks(text: str):
    # Preprocess the input text
    preprocessed_text = preprocess_text(text)

    # Vectorize the preprocessed text
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(preprocessed_text)

    # Determine the optimal number of clusters using the Elbow Method
    distortions = []
    max_k = min(9, tfidf_matrix.shape[0])  # at most 9 clusters, never more than the number of sentences
    K = range(1, max_k + 1)
    for k in K:
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(tfidf_matrix)
        distortions.append(kmeans.inertia_)

    # Find the "elbow": the k where the decrease in distortion slows down the most
    # (largest second difference); fall back to a single cluster when there are
    # too few points to estimate an elbow
    if len(distortions) > 2:
        elbow_index = int(np.argmax(np.diff(distortions, 2))) + 1
        num_clusters = K[elbow_index]
    else:
        num_clusters = 1

    # Perform clustering with the determined number of clusters
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(tfidf_matrix)

    # Find the sentence closest to each cluster centroid
    closest_indices, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, tfidf_matrix)

    # Retrieve topic-wise chunks (each chunk is the preprocessed form of the sentence)
    chunks = []
    for cluster_index, closest_index in enumerate(closest_indices):
        chunk = preprocessed_text[closest_index]
        chunks.append({"topic": f"Topic {cluster_index+1}", "chunk": chunk})

    return chunks
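
A minimal sketch for exercising the endpoint locally, assuming the code above is saved as main.py and FastAPI's TestClient (backed by httpx or requests) is installed; since text is declared as a plain string parameter, FastAPI reads it from the query string of the POST request. The sample sentences and file names below are illustrative only.

# Illustrative quick check (hypothetical file, e.g. quick_test.py)
from fastapi.testclient import TestClient

from main import app  # assumes the module above was saved as main.py

client = TestClient(app)

sample = (
    "Solar panels convert sunlight into electricity. "
    "Wind turbines generate power from moving air. "
    "A balanced diet keeps the body healthy. "
    "Regular exercise improves cardiovascular fitness."
)

response = client.post("/extract_chunks", params={"text": sample})
print(response.status_code)
print(response.json())  # e.g. [{"topic": "Topic 1", "chunk": "..."}, ...]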