@@ -39,10 +39,11 @@ def get_number_of_classes(y):
39
39
return y .shape [1 ]
40
40
41
41
class NBSVM :
42
- def __init__ (self , solver = 'liblinear' , dual = True ):
42
+ def __init__ (self , solver = 'liblinear' , dual = True , C = 4 , ngram_range = ( 1 , 2 ) ):
43
43
self .solver = solver # 'lbfgs' - large, liblinear for small datasets
44
44
self .dual = dual
45
- pass
45
+ self .C = C
46
+ self .ngram_range = ngram_range
46
47
47
48
# Compiled pattern with a single capture group: a character class built from
# string.punctuation plus the extra quote/symbol characters listed literally.
re_tok = re .compile (f'([{ string .punctuation } “”¨«»®´·º½¾¿¡§£₤‘’])' )
48
49
@@ -56,13 +57,13 @@ def pr(self, y_i, y):
56
57
def get_mdl(self, y):
    """Fit a logistic-regression model for one label column.

    Args:
        y: Label column exposing ``.values`` (presumably a pandas Series
            of 0/1 labels -- confirm against the caller).

    Returns:
        A tuple ``(model, r)`` where ``model`` is the fitted
        ``LogisticRegression`` and ``r`` is
        ``log(self.pr(1, y) / self.pr(0, y))``, the factor by which the
        training term-document matrix was scaled before fitting.
    """
    y = y.values
    r = np.log(self.pr(1, y) / self.pr(0, y))
    # scikit-learn implements the dual formulation only for the liblinear
    # solver; passing dual=True with any other solver (e.g. 'lbfgs') raises.
    # Only enable it where it is supported so the default dual=True keeps
    # working when the caller switches solver.
    use_dual = bool(self.dual) and self.solver == 'liblinear'
    m = LogisticRegression(C=self.C, dual=use_dual, solver=self.solver,
                           max_iter=1000)
    x_nb = self.trn_term_doc.multiply(r)
    return m.fit(x_nb, y), r
62
63
63
64
def bow(self, X_train):
    """Fit the TF-IDF vectoriser on ``X_train`` and return the transformed matrix.

    Side effects: stores ``X_train.shape[0]`` in ``self.n`` and the fitted
    ``TfidfVectorizer`` in ``self.vec``.

    Args:
        X_train: Collection of training documents exposing ``.shape``;
            passed unchanged to ``TfidfVectorizer.fit_transform``.

    Returns:
        The result of ``self.vec.fit_transform(X_train)``.
    """
    self.n = X_train.shape[0]
    # The boolean options are passed as real bools: scikit-learn releases
    # with parameter validation deprecate and then reject integer stand-ins
    # such as 1.
    self.vec = TfidfVectorizer(
        ngram_range=self.ngram_range,
        tokenizer=self.tokenize,
        min_df=3,
        max_df=0.9,
        strip_accents='unicode',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
    )
    return self.vec.fit_transform(X_train)
0 commit comments