forked from lukas/ml-class
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: lemma-tokenizer.py
More file actions
37 lines (23 loc) · 881 Bytes
/
lemma-tokenizer.py
File metadata and controls
37 lines (23 loc) · 881 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Single shared lemmatizer instance, reused by the tokenizer below.
wnl = WordNetLemmatizer()


def lemma_tokenizer(doc):
    """Split *doc* into word tokens and reduce each token to its WordNet lemma.

    Fixes the original definition, which accepted no arguments yet used
    ``doc``, and referenced ``self.wnl`` outside of any class.
    """
    return [wnl.lemmatize(t) for t in word_tokenize(doc)]


# Vectorizer wired to the lemmatizing tokenizer. The original passed an
# undefined ``LemmaTokenizer()`` class and used CountVectorizer before
# importing it; the import above makes this block self-contained.
vect = CountVectorizer(tokenizer=lemma_tokenizer)
import pandas as pd
import numpy as np

# Load the labelled tweets. Some rows have no tweet text; compute the
# not-null mask once and apply it to both text and labels so the two
# series stay row-aligned.
df = pd.read_csv('tweets.csv')
target = df['is_there_an_emotion_directed_at_a_brand_or_product']
text = df['tweet_text']

has_text = pd.notnull(text)
fixed_text = text[has_text]
fixed_target = target[has_text]
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term count matrix using the lemmatizing tokenizer.
# The original passed an undefined ``tokenize_1`` to CountVectorizer and
# called ``transform`` on the tokenizer function itself instead of the
# cleaned corpus, which would raise at runtime.
count_vect = CountVectorizer(tokenizer=lemma_tokenizer)
count_vect.fit(fixed_text)
counts = count_vect.transform(fixed_text)
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Score a multinomial Naive Bayes classifier on the count features with
# 10-fold cross-validation, then report the per-fold scores and their mean.
nb = MultinomialNB()
scores = cross_val_score(nb, counts, fixed_target, cv=10)
print(scores)
print(scores.mean())