
Commit b8a4435

Create NotesChunker.py
1 parent 2a450ed commit b8a4435

File tree

1 file changed: +70 -0 lines changed


Backend/NotesChunker.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
from fastapi import FastAPI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import string
import numpy as np

# The tokenizer, lemmatizer, and stopword list each need an NLTK data
# package; fetch them once at import time (a no-op if already installed).
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("stopwords", quiet=True)

app = FastAPI()

# Preprocessing function
def preprocess_text(text):
    # Tokenize into sentences
    sentences = sent_tokenize(text)

    # Remove punctuation and convert to lowercase
    translator = str.maketrans("", "", string.punctuation)
    sentences = [sentence.translate(translator).lower() for sentence in sentences]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    sentences = [[lemmatizer.lemmatize(word) for word in sentence.split()] for sentence in sentences]

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    sentences = [[word for word in sentence if word not in stop_words] for sentence in sentences]

    # Convert sentences back to text
    preprocessed_text = [" ".join(sentence) for sentence in sentences]
    return preprocessed_text

# API route for extracting topic-wise chunks
@app.post("/extract_chunks")
def extract_chunks(text: str):
    # Note: a bare `str` parameter on a POST route is read from the query
    # string by FastAPI, not from the request body.

    # Preprocess the input text
    preprocessed_text = preprocess_text(text)

    # Vectorize the preprocessed text
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(preprocessed_text)

    # Determine the number of clusters using the Elbow Method. Cap the
    # candidate range at the sentence count, since KMeans cannot form more
    # clusters than there are samples.
    distortions = []
    K = range(1, min(10, tfidf_matrix.shape[0] + 1))
    for k in K:
        kmeans = KMeans(n_clusters=k, n_init=10)
        kmeans.fit(tfidf_matrix)
        distortions.append(kmeans.inertia_)

    # Find the "elbow": the k immediately after the steepest drop in inertia.
    # Inertia decreases as k grows, so np.diff is negative and argmin marks
    # the largest single drop.
    if len(distortions) > 1:
        elbow_index = np.argmin(np.diff(distortions)) + 1
        num_clusters = K[elbow_index]
    else:
        num_clusters = 1

    # Perform clustering with the determined number of clusters
    kmeans = KMeans(n_clusters=num_clusters, n_init=10)
    kmeans.fit(tfidf_matrix)

    # Find the sentence closest to each cluster centroid;
    # pairwise_distances_argmin_min returns (indices, distances).
    closest_indices, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, tfidf_matrix)

    # Retrieve topic-wise chunks
    chunks = []
    for cluster_index, closest_index in enumerate(closest_indices):
        chunk = preprocessed_text[closest_index]
        chunks.append({"topic": f"Topic {cluster_index + 1}", "chunk": chunk})

    return chunks
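
To see what the argmin-of-diff line selects, here is a toy check with made-up inertia values (illustrative only, not part of the commit):

import numpy as np

# Hypothetical inertia values for k = 1..5: a steep drop from k=1 to k=2,
# then a long flat tail.
distortions = [100.0, 40.0, 35.0, 33.0, 32.0]
K = range(1, 6)

drops = np.diff(distortions)        # [-60., -5., -2., -1.]
elbow_index = np.argmin(drops) + 1  # steepest drop is at index 0, so 1
print(K[elbow_index])               # -> 2, i.e. two clusters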
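
A minimal sketch for exercising the route with FastAPI's test client, assuming the module is importable as `NotesChunker` (the import path and sample text are assumptions, not part of the commit):

from fastapi.testclient import TestClient

from NotesChunker import app  # hypothetical import path

client = TestClient(app)

# Sample notes mixing two topics (made-up input).
notes = (
    "Photosynthesis converts light energy into chemical energy. "
    "Chlorophyll absorbs light inside the chloroplasts. "
    "The French Revolution began in 1789. "
    "It ended the absolute monarchy in France."
)

# The route reads `text` from the query string, not the request body.
response = client.post("/extract_chunks", params={"text": notes})
print(response.json())
# Expected shape: [{"topic": "Topic 1", "chunk": "..."}, ...]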
