- from fastapi import FastAPI
- from sklearn.feature_extraction.text import TfidfVectorizer
+ from fastapi import APIRouter
+ import tensorflow_hub as hub
+ import tensorflow_text  # registers the custom ops the multilingual USE model needs
from sklearn.cluster import KMeans
- from sklearn.metrics import pairwise_distances_argmin_min
- import nltk
- from nltk.stem import WordNetLemmatizer
- from nltk.corpus import stopwords
- from nltk.tokenize import sent_tokenize
- import string
import numpy as np

- app = FastAPI()
+
+
+ app = APIRouter()
+
+ # Load the multilingual Universal Sentence Encoder (USE) model
+ embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

# Preprocessing functions
def preprocess_text(text):
-     # Tokenize into sentences
-     sentences = sent_tokenize(text)
-
-     # Remove punctuation and convert to lowercase
-     translator = str.maketrans("", "", string.punctuation)
-     sentences = [sentence.translate(translator).lower() for sentence in sentences]
-
-     # Lemmatize words
-     lemmatizer = WordNetLemmatizer()
-     sentences = [[lemmatizer.lemmatize(word) for word in sentence.split()] for sentence in sentences]
-
-     # Remove stopwords
-     stop_words = set(stopwords.words("english"))
-     sentences = [[word for word in sentence if word not in stop_words] for sentence in sentences]
-
-     # Convert sentences back to text
-     preprocessed_text = [" ".join(sentence) for sentence in sentences]
-     return preprocessed_text
+     sentences = text.split('\n')  # Split the input text into sentences, one per line
+     return sentences

# API route for extracting topic-wise chunks
@app.post("/extract_chunks")
def extract_chunks(text: str):
    # Preprocess the input text
-     preprocessed_text = preprocess_text(text)
+     sentences = preprocess_text(text)

-     # Vectorize the preprocessed text
-     vectorizer = TfidfVectorizer()
-     tfidf_matrix = vectorizer.fit_transform(preprocessed_text)
+     # Generate sentence embeddings
+     sentence_embeddings = embed(sentences)

    # Determine the optimal number of clusters using the Elbow Method
    distortions = []
    K = range(1, 10)  # Set the range of possible clusters
    for k in K:
        kmeans = KMeans(n_clusters=k)
-         kmeans.fit(tfidf_matrix)
+         kmeans.fit(sentence_embeddings)
        distortions.append(kmeans.inertia_)

    # Find the "elbow" point in the distortion plot
@@ -56,15 +39,12 @@ def extract_chunks(text: str):

    # Perform clustering with the determined number of clusters
    kmeans = KMeans(n_clusters=num_clusters)
-     kmeans.fit(tfidf_matrix)
-
-     # Find the closest sentence to each cluster centroid
-     closest_indices = pairwise_distances_argmin_min(kmeans.cluster_centers_, tfidf_matrix)
+     kmeans.fit(sentence_embeddings)

-     # Retrieve topic-wise chunks
+     # Retrieve topic-wise chunks with subsections
    chunks = []
-     for cluster_index, closest_index in enumerate(closest_indices[0]):
-         chunk = preprocessed_text[closest_index]
-         chunks.append({"topic": f"Topic {cluster_index + 1}", "chunk": chunk})
+     for cluster_index in range(num_clusters):
+         chunk_sentences = [sentences[i] for i in range(len(sentences)) if kmeans.labels_[i] == cluster_index]
+         chunks.append({"topic": f"Topic {cluster_index + 1}", "subsections": chunk_sentences})

-     return chunks
+     return chunks
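Note: the hunk header above skips the lines where num_clusters is actually derived from distortions (original lines 54-55), so that step is not visible in this diff. Purely as an illustration, and not necessarily what this commit does, a simple knee heuristic picks the k whose distortion lies farthest below the straight line joining the first and last points of the curve:

    # Hypothetical elbow pick -- assumes the `distortions` list and `K` range built above
    ks = np.array(list(K))
    ds = np.array(distortions)
    chord = ds[0] + (ds[-1] - ds[0]) * (ks - ks[0]) / (ks[-1] - ks[0])
    num_clusters = int(ks[np.argmax(chord - ds)])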
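Because the module now exposes an APIRouter rather than its own FastAPI instance, the router has to be included in an application before /extract_chunks is reachable. A minimal sketch, assuming the code above lives in a module named chunking.py (the module and variable names here are assumptions, not part of the commit):

    # main.py -- hypothetical wiring for the router defined in this diff
    from fastapi import FastAPI
    from chunking import app as chunking_router

    api = FastAPI()
    api.include_router(chunking_router)

Since text is declared as a plain str, FastAPI treats it as a query parameter, so a request looks like POST /extract_chunks?text=... with newline-separated sentences in the value.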