@@ -1,14 +1,15 @@
 import os
-from sentence_transformers import SentenceTransformer
-from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
-from nltk.tokenize import sent_tokenize, word_tokenize
 import nltk
+from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import stopwords
 from collections import Counter
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
 
 MODEL_NAME = 'all-MiniLM-L6-v2'
 MODEL_FOLDER = 'model'
+NLTK_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'nltk_data')
 
 def load_or_download_model():
     model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
@@ -24,13 +25,17 @@ def load_or_download_model():
     return model
 
 def download_nltk_resources():
-    resources = ['punkt', 'stopwords']
-    for resource in resources:
+    nltk.data.path.append(NLTK_DATA_FOLDER)
+    os.makedirs(NLTK_DATA_FOLDER, exist_ok=True)
+
+    resources = [('punkt', 'tokenizers'), ('stopwords', 'corpora')]
+    for resource, folder in resources:
         try:
-            nltk.data.find(f'tokenizers/{resource}')
+            nltk.data.find(f'{folder}/{resource}')
+            print(f"{resource} is already available.")
         except LookupError:
             print(f"Downloading {resource}...")
-            nltk.download(resource, quiet=True)
+            nltk.download(resource, download_dir=NLTK_DATA_FOLDER, quiet=True)
 
 def extract_keywords(text, model, top_n=10):
     # Tokenize the text
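For reference, a minimal usage sketch of the helpers this commit touches; it is not part of the diff, and the sample text and `top_n` value are illustrative only. It assumes the snippet runs in (or imports from) the same module.

```python
# Fetch punkt/stopwords into model/nltk_data, load the cached
# sentence-transformer, then pull keywords from a sample text.
download_nltk_resources()
model = load_or_download_model()

sample = "Sentence embeddings map text to dense vectors for semantic search."
print(extract_keywords(sample, model, top_n=5))
```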