Commit 33b7343

EDIT
committed
1 parent b8a4435 · commit 33b7343

File tree

5 files changed: +23, -43 lines changed


Backend/NotesChunker.py

Lines changed: 21 additions & 41 deletions
@@ -1,53 +1,36 @@
-from fastapi import FastAPI
-from sklearn.feature_extraction.text import TfidfVectorizer
+from fastapi import APIRouter
+import tensorflow_hub as hub
+import tensorflow_text
 from sklearn.cluster import KMeans
-from sklearn.metrics import pairwise_distances_argmin_min
-import nltk
-from nltk.stem import WordNetLemmatizer
-from nltk.corpus import stopwords
-from nltk.tokenize import sent_tokenize
-import string
 import numpy as np
 
-app = FastAPI()
+
+
+app = APIRouter()
+
+# Load the USE model
+embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
 
 # Preprocessing functions
 def preprocess_text(text):
-    # Tokenize into sentences
-    sentences = sent_tokenize(text)
-
-    # Remove punctuation and convert to lowercase
-    translator = str.maketrans("", "", string.punctuation)
-    sentences = [sentence.translate(translator).lower() for sentence in sentences]
-
-    # Lemmatize words
-    lemmatizer = WordNetLemmatizer()
-    sentences = [[lemmatizer.lemmatize(word) for word in sentence.split()] for sentence in sentences]
-
-    # Remove stopwords
-    stop_words = set(stopwords.words("english"))
-    sentences = [[word for word in sentence if word not in stop_words] for sentence in sentences]
-
-    # Convert sentences back to text
-    preprocessed_text = [" ".join(sentence) for sentence in sentences]
-    return preprocessed_text
+    sentences = text.split('\n')  # Split text into sentences
+    return sentences
 
 # API route for extracting topic-wise chunks
 @app.post("/extract_chunks")
 def extract_chunks(text: str):
     # Preprocess the input text
-    preprocessed_text = preprocess_text(text)
+    sentences = preprocess_text(text)
 
-    # Vectorize the preprocessed text
-    vectorizer = TfidfVectorizer()
-    tfidf_matrix = vectorizer.fit_transform(preprocessed_text)
+    # Generate sentence embeddings
+    sentence_embeddings = embed(sentences)
 
     # Determine the optimal number of clusters using the Elbow Method
     distortions = []
     K = range(1, 10)  # Set the range of possible clusters
     for k in K:
         kmeans = KMeans(n_clusters=k)
-        kmeans.fit(tfidf_matrix)
+        kmeans.fit(sentence_embeddings)
         distortions.append(kmeans.inertia_)
 
     # Find the "elbow" point in the distortion plot
@@ -56,15 +39,12 @@ def extract_chunks(text: str):
 
     # Perform clustering with the determined number of clusters
     kmeans = KMeans(n_clusters=num_clusters)
-    kmeans.fit(tfidf_matrix)
-
-    # Find the closest sentence to each cluster centroid
-    closest_indices = pairwise_distances_argmin_min(kmeans.cluster_centers_, tfidf_matrix)
+    kmeans.fit(sentence_embeddings)
 
-    # Retrieve topic-wise chunks
+    # Retrieve topic-wise chunks with subsections
     chunks = []
-    for cluster_index, closest_index in enumerate(closest_indices[0]):
-        chunk = preprocessed_text[closest_index]
-        chunks.append({"topic": f"Topic {cluster_index+1}", "chunk": chunk})
+    for cluster_index in range(num_clusters):
+        chunk_sentences = [sentences[i] for i in range(len(sentences)) if kmeans.labels_[i] == cluster_index]
+        chunks.append({"topic": f"Topic {cluster_index+1}", "subsections": chunk_sentences})
 
-    return chunks
+    return chunks
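
Note that the second hunk skips the unchanged lines where the elbow point is actually turned into num_clusters. For orientation only, here is a minimal sketch of one common way to pick that point from the distortions list computed above; the helper name pick_elbow and the second-difference heuristic are illustrative assumptions, not necessarily what the unchanged code does.

import numpy as np

# Hypothetical helper: pick the k where the distortion curve bends the most,
# measured as the largest second-order difference. Assumes `distortions` holds
# one inertia value per k in K = range(1, 10), as in the loop above.
def pick_elbow(distortions, K):
    ks = list(K)
    if len(distortions) < 3:
        return ks[0]
    second_diff = np.diff(distortions, n=2)     # curvature proxy for each interior k
    return ks[int(np.argmax(second_diff)) + 1]  # +1 maps back to the interior k

# num_clusters = pick_elbow(distortions, K)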
1.31 KB · Binary file not shown.
141 Bytes · Binary file not shown.

__pycache__/app.cpython-310.pyc
64 Bytes · Binary file not shown.

app.py

Lines changed: 2 additions & 2 deletions
@@ -7,7 +7,7 @@
 
 #from Backend.Notes_Analyser import router as api4_router
 #from Backend.Narrator import router as api5_router
-#from Backend.pyqsorter import router as sorter
+from Backend.NotesChunker import app as chunker
 from Backend.NotesToText import router as notestotxt
 
 # import other API routers as needed
@@ -32,7 +32,7 @@
 
 #app.include_router(sorter)
 #app.include_router(api4_router)
-#app.include_router(api6_router)
+app.include_router(chunker)
 app.include_router(notestotxt)
 
 # include other API routers as needed
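
With the chunker router registered on the main application, the new endpoint can be exercised end to end. A minimal usage sketch with FastAPI's TestClient follows, assuming app.py exposes its FastAPI instance as app; the sample text and printed output are illustrative.

from fastapi.testclient import TestClient
from app import app  # assumption: app.py defines the FastAPI() instance as `app`

client = TestClient(app)

# `text` is declared as a plain `str` parameter, so FastAPI reads it from the
# query string rather than a JSON body; pass it via `params`.
response = client.post(
    "/extract_chunks",
    params={"text": "First note line\nSecond note line\nAnother topic entirely"},
)
print(response.json())  # e.g. [{"topic": "Topic 1", "subsections": [...]}, ...]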

0 commit comments