-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathnltk_utils.py
More file actions
24 lines (20 loc) · 835 Bytes
/
nltk_utils.py
File metadata and controls
24 lines (20 loc) · 835 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Import Libraries
import nltk
from nltk.tokenize import word_tokenize
# Fetch the Punkt tokenizer models required by word_tokenize (no-op if cached)
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import SnowballStemmer
# Module-level English stemmer shared by tokenize() below
stemmer= SnowballStemmer(language= 'english')
from nltk.corpus import stopwords
# Fetch the stopword corpus used to build english_stopwords below (no-op if cached)
nltk.download('stopwords')
# Tokenize text i.e make all text be in a list format e.g "I am sick" = ['i', 'am', 'sick']
def tokenize(text):
    """Split *text* into word tokens and reduce each token to its stem."""
    tokens = word_tokenize(text)
    return list(map(stemmer.stem, tokens))
# Create stopwords to reduce noise in data
# NOTE(review): this list is raw (unstemmed) English words, while tokenize()
# stems every token — confirm downstream consumers account for the mismatch.
english_stopwords= stopwords.words('english')
# Create a vectorizer to learn all words in order to convert them into numbers
def vectorizer():
    """Build an (unfitted) TfidfVectorizer wired to the stemming tokenizer.

    Returns:
        TfidfVectorizer: configured with ``tokenize`` and a stemmed
        stop-word list; call ``fit``/``fit_transform`` on it yourself.

    The stop words are stemmed first: ``tokenize`` stems every token, so a
    raw stopword such as "because" would never match its stemmed token form
    ("becaus") and would not actually be filtered out (sklearn warns about
    exactly this tokenizer/stop_words inconsistency).
    """
    # Stem + de-duplicate; sort for a deterministic vocabulary-independent list.
    stemmed_stopwords = sorted({stemmer.stem(word) for word in english_stopwords})
    tfidf = TfidfVectorizer(tokenizer=tokenize,
                            stop_words=stemmed_stopwords,
                            )
    return tfidf