@@ -39,55 +39,115 @@ def get_number_of_classes(y):
39
39
return y .shape [1 ]
40
40
41
41
class NBSVM:
    """NB-SVM style text classifier: Logistic Regression over Naive-Bayes
    log-count-ratio weighted bag-of-words features (Wang & Manning, 2012).

    All hyper-parameters come from the ``experiment`` object passed to the
    constructor. Attributes read from it: ``C``, ``penalty``, ``dual``,
    ``solver``, ``max_iter``, ``vectorizer`` ("tfidf" or "count"),
    ``ngram_range``, ``min_df``, ``max_df``, ``fixed_tokenizer``,
    ``multinomial_type`` ("manual" or "multinomial") and ``class_weight``.
    """

    def __init__(self, experiment):
        # Experiment bundles every tunable knob; nothing else is configured here.
        self.experiment = experiment

    # Pattern that isolates punctuation (incl. common unicode quotes/symbols)
    # so each punctuation character becomes its own token.
    re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
    # Same pattern minus '<', '>' and '/', so markup-like fragments such as
    # "<b>" or "a/b" are kept intact instead of being split apart.
    re_tok_fixed = re.compile(
        f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])'.replace('<', '').replace('>', '').replace('/', ''))

    def tokenize(self, s):
        """Split *s* on whitespace after padding every punctuation char with spaces."""
        return self.re_tok.sub(r' \1 ', s).split()

    def tokenize_fixed(self, s):
        """Like :meth:`tokenize`, but leaves '<', '>' and '/' attached to words."""
        return self.re_tok_fixed.sub(r' \1 ', s).split()

    def pr(self, y_i, y):
        """Laplace-smoothed feature-count vector for the rows where ``y == y_i``.

        Returns (count(feature | class y_i) + 1) / (count(class y_i) + 1).
        """
        p = self.trn_term_doc[y == y_i].sum(0)
        return (p + 1) / ((y == y_i).sum() + 1)

    def get_mdl(self, y):
        """Fit one binary LR on NB-weighted features.

        Returns ``(model, r)`` where ``r`` is the log-count ratio used to
        re-weight the term-document matrix at both train and predict time.
        """
        y = y.values
        r = np.log(self.pr(1, y) / self.pr(0, y))
        m = LogisticRegression(C=self.experiment.C, penalty=self.experiment.penalty,
                               dual=self.experiment.dual, solver=self.experiment.solver,
                               max_iter=self.experiment.max_iter)
        x_nb = self.trn_term_doc.multiply(r)
        return m.fit(x_nb, y), r

    def bow(self, X_train):
        """Fit the configured vectorizer on *X_train*; return the train term-doc matrix."""
        self.n = X_train.shape[0]
        # Honour the fixed_tokenizer flag for BOTH vectorizer types. The
        # "count" branch previously hard-coded self.tokenize, silently
        # ignoring experiment.fixed_tokenizer.
        tok = self.tokenize_fixed if self.experiment.fixed_tokenizer else self.tokenize
        if self.experiment.vectorizer == "tfidf":
            self.vec = TfidfVectorizer(ngram_range=self.experiment.ngram_range,
                                       tokenizer=tok,
                                       min_df=self.experiment.min_df, max_df=self.experiment.max_df,
                                       strip_accents='unicode', use_idf=1,
                                       smooth_idf=1, sublinear_tf=1)
        elif self.experiment.vectorizer == "count":
            self.vec = CountVectorizer(ngram_range=self.experiment.ngram_range,
                                       tokenizer=tok,
                                       min_df=self.experiment.min_df, max_df=self.experiment.max_df,
                                       strip_accents='unicode')
        else:
            raise Exception(f"Unknown vectorizer type: {self.experiment.vectorizer}")
        return self.vec.fit_transform(X_train)

    def train_models(self, y_train):
        """Fit either one NB-LR per class ("manual") or a single multinomial LR."""
        self.models = []
        if self.experiment.multinomial_type == "manual":
            # One-vs-rest: each class gets its own (model, r) pair.
            for i in range(0, self.c):
                m, r = self.get_mdl(get_class_column(y_train, i))
                self.models.append((m, r))
        elif self.experiment.multinomial_type == "multinomial":
            m = LogisticRegression(C=self.experiment.C, penalty=self.experiment.penalty,
                                   dual=self.experiment.dual, solver=self.experiment.solver,
                                   max_iter=self.experiment.max_iter,
                                   multi_class="multinomial", class_weight=self.experiment.class_weight)
            # NOTE(review): no NB re-weighting in this branch — the raw
            # term-doc matrix is used directly. Presumably intentional; confirm.
            x_nb = self.trn_term_doc
            self.models.append(m.fit(x_nb, y_train))
        else:
            raise Exception(f"Unsupported multinomial_type {self.experiment.multinomial_type}")

    def fit(self, X_train, y_train):
        """Vectorize *X_train*, infer the class count and train the model(s)."""
        self.trn_term_doc = self.bow(X_train)
        self.c = get_number_of_classes(y_train)
        self.train_models(y_train)

    def predict_proba(self, X_test):
        """Return a (len(X_test), n_classes) matrix of class probabilities."""
        test_term_doc = self.vec.transform(X_test)
        if self.experiment.multinomial_type == "manual":
            preds = np.zeros((len(X_test), self.c))
            for i in range(0, self.c):
                m, r = self.models[i]
                # Column i is p(class i) from the i-th one-vs-rest model,
                # scored on the NB-reweighted features.
                preds[:, i] = m.predict_proba(test_term_doc.multiply(r))[:, 1]
        elif self.experiment.multinomial_type == "multinomial":
            preds = self.models[0].predict_proba(test_term_doc)
        else:
            raise Exception(f"Unsupported multinomial_type {self.experiment.multinomial_type}")
        return preds

    def sort_features_by_importance(self, label):
        """Return (feature_names, scores) sorted by descending importance for *label*.

        *label* is expected to be an enum member whose ``.value`` is the class
        index.
        """
        label = label.value
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement API when available, fall back for older versions.
        if hasattr(self.vec, "get_feature_names_out"):
            names = np.array(self.vec.get_feature_names_out())
        else:
            names = np.array(self.vec.get_feature_names())
        if self.experiment.multinomial_type == "manual":
            m, r = self.models[label]
            f = m.coef_[0] * np.array(r[0])
        elif self.experiment.multinomial_type == "multinomial":
            f = self.models[0].coef_[label]
        else:
            raise Exception(f"Unsupported multinomial_type {self.experiment.multinomial_type}")
        if self.experiment.vectorizer == "tfidf":
            # Undo the per-feature idf scaling so scores reflect raw impact.
            f *= self.vec.idf_
        indices = f.argsort()[::-1]
        return names[indices], f[indices]

    def get_mismatched(self, df, true_label, predicted_label):
        """Rows of *df* with true class *true_label* that were predicted *predicted_label*.

        Returns them sorted most-confidently-wrong first, with a ``pr_diff``
        column holding p(true) - p(predicted). *df* must have "text" and
        "label" columns; the label arguments are enum members.
        """
        true_label = true_label.value
        predicted_label = predicted_label.value

        probs = self.predict_proba(df["text"])
        preds = np.argmax(probs, axis=1)
        true_y = df["label"]

        mismatched_indices = (true_y == true_label) & (preds == predicted_label)
        mismatched = df[mismatched_indices]
        # Probability margin between the correct class and the wrong
        # prediction; more negative = more confident mistake.
        diff = probs[mismatched_indices, true_label] - probs[mismatched_indices, predicted_label]
        indices = diff.argsort()
        # .copy() so the new column below doesn't trigger SettingWithCopyWarning.
        mismatched = mismatched.iloc[indices].copy()
        mismatched["pr_diff"] = diff[indices]
        return mismatched

    def validate(self, X_test, y_test):
        """Return argmax-prediction accuracy on (X_test, y_test)."""
        acc = (np.argmax(self.predict_proba(X_test), axis=1) == y_test).mean()
        return acc
@@ -98,10 +158,14 @@ def metrics(preds, true_y):
98
158
acc = (p == y ).mean ()
99
159
tp = ((y != 0 ) & (p == y )).sum ()
100
160
fp = ((p != 0 ) & (p != y )).sum ()
161
+ fn = ((y != 0 ) & (p == 0 )).sum ()
162
+
101
163
prec = tp / (fp + tp )
164
+ reca = tp / (fn + tp )
102
165
return {
103
166
"precision" : prec ,
104
167
"accuracy" : acc ,
168
+ "recall" : reca ,
105
169
"TP" : tp ,
106
170
"FP" : fp ,
107
171
}
@@ -130,6 +194,18 @@ def preds_for_cell_content_multi(test_df, probs, group_by=["cell_content"]):
130
194
'counts' : grouped_counts })
131
195
return results
132
196
197
def preds_for_cell_content_best(test_df, probs, group_by=["cell_content"]):
    """Aggregate per-row class probabilities by *group_by* into one prediction per group.

    Probabilities are summed within each group and the argmax taken, so the
    group's prediction reflects its total probability mass. Returns a frame
    with columns 'true' (the group's most frequent label), 'pred' and
    'counts' (number of rows in the group).
    """
    frame = test_df.copy()
    prob_cols = pd.DataFrame(probs, index=frame.index)
    frame = pd.concat([frame, prob_cols], axis=1)

    grouped = frame.groupby(group_by)
    summed_probs = grouped[prob_cols.columns].sum()
    majority_label = grouped["label"].agg(lambda labels: labels.value_counts().index[0])

    return pd.DataFrame({
        'true': majority_label,
        'pred': np.argmax(summed_probs.values, axis=1),
        'counts': grouped["label"].count(),
    })
208
+
133
209
def test_model (model , tdf ):
134
210
probs = model (tdf ["text" ])
135
211
preds = np .argmax (probs , axis = 1 )
0 commit comments