- from fastapi import FastAPI
- from sklearn.feature_extraction.text import TfidfVectorizer
+ from fastapi import APIRouter
+ import tensorflow_hub as hub
+ import tensorflow_text  # registers the custom ops the multilingual USE model needs
from sklearn.cluster import KMeans
- from sklearn.metrics import pairwise_distances_argmin_min
- import nltk
- from nltk.stem import WordNetLemmatizer
- from nltk.corpus import stopwords
- from nltk.tokenize import sent_tokenize
- import string
import numpy as np

- app = FastAPI()
+
+
+ app = APIRouter()
+
+ # Load the multilingual Universal Sentence Encoder (USE) model
+ embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

# Preprocessing functions
def preprocess_text(text):
-     # Tokenize into sentences
-     sentences = sent_tokenize(text)
-
-     # Remove punctuation and convert to lowercase
-     translator = str.maketrans("", "", string.punctuation)
-     sentences = [sentence.translate(translator).lower() for sentence in sentences]
-
-     # Lemmatize words
-     lemmatizer = WordNetLemmatizer()
-     sentences = [[lemmatizer.lemmatize(word) for word in sentence.split()] for sentence in sentences]
-
-     # Remove stopwords
-     stop_words = set(stopwords.words("english"))
-     sentences = [[word for word in sentence if word not in stop_words] for sentence in sentences]
-
-     # Convert sentences back to text
-     preprocessed_text = [" ".join(sentence) for sentence in sentences]
-     return preprocessed_text
+     sentences = text.split('\n')  # Split the input text into sentences, one per line
+     return sentences

# API route for extracting topic-wise chunks
@app.post("/extract_chunks")
def extract_chunks(text: str):
    # Preprocess the input text
-     preprocessed_text = preprocess_text(text)
+     sentences = preprocess_text(text)

-     # Vectorize the preprocessed text
-     vectorizer = TfidfVectorizer()
-     tfidf_matrix = vectorizer.fit_transform(preprocessed_text)
+     # Generate sentence embeddings
+     sentence_embeddings = embed(sentences)

    # Determine the optimal number of clusters using the Elbow Method
    distortions = []
    K = range(1, 10)  # Set the range of possible clusters
    for k in K:
        kmeans = KMeans(n_clusters=k)
-         kmeans.fit(tfidf_matrix)
+         kmeans.fit(sentence_embeddings)
        distortions.append(kmeans.inertia_)

    # Find the "elbow" point in the distortion plot
@@ -56,15 +39,12 @@ def extract_chunks(text: str):

    # Perform clustering with the determined number of clusters
    kmeans = KMeans(n_clusters=num_clusters)
-     kmeans.fit(tfidf_matrix)
-
-     # Find the closest sentence to each cluster centroid
-     closest_indices = pairwise_distances_argmin_min(kmeans.cluster_centers_, tfidf_matrix)
+     kmeans.fit(sentence_embeddings)

-     # Retrieve topic-wise chunks
+     # Retrieve topic-wise chunks with subsections
    chunks = []
-     for cluster_index, closest_index in enumerate(closest_indices[0]):
-         chunk = preprocessed_text[closest_index]
-         chunks.append({"topic": f"Topic {cluster_index + 1}", "chunk": chunk})
+     for cluster_index in range(num_clusters):
+         chunk_sentences = [sentences[i] for i in range(len(sentences)) if kmeans.labels_[i] == cluster_index]
+         chunks.append({"topic": f"Topic {cluster_index + 1}", "subsections": chunk_sentences})

-     return chunks
+     return chunks
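Note: the hunk header above skips the lines where num_clusters is actually derived from distortions (original lines 54-55), so that step is not visible in this diff. Purely as an illustration, and not necessarily what this commit does, a simple knee heuristic picks the k whose distortion lies farthest below the straight line joining the first and last points of the curve:

    # Hypothetical elbow pick -- assumes the `distortions` list and `K` range built above
    ks = np.array(list(K))
    ds = np.array(distortions)
    chord = ds[0] + (ds[-1] - ds[0]) * (ks - ks[0]) / (ks[-1] - ks[0])
    num_clusters = int(ks[np.argmax(chord - ds)])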
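Because the module now exposes an APIRouter rather than its own FastAPI instance, the router has to be included in an application before /extract_chunks is reachable. A minimal sketch, assuming the code above lives in a module named chunking.py (the module and variable names here are assumptions, not part of the commit):

    # main.py -- hypothetical wiring for the router defined in this diff
    from fastapi import FastAPI
    from chunking import app as chunking_router

    api = FastAPI()
    api.include_router(chunking_router)

Since text is declared as a plain str, FastAPI treats it as a query parameter, so a request looks like POST /extract_chunks?text=... with newline-separated sentences in the value.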