1313def main ():
1414 """
1515 Testing four text classifiers: LinearSVC, ComplementNB, MultinomialNB, and RandomForestClassifier.
16+ The following TfidfVectorizer parameters were tested:
17+ - stop_words="english"
18+ - min_df=5
1619 """
1720 df = pd .read_csv ("data.csv" )
1821 print (df ["rna_related" ].value_counts ())
1922 # rna_related
20- # 1 3347
21- # 0 3347
23+ # 1 3363
24+ # 0 3331
2225
2326 X = df ["abstract" ]
2427 y = df ["rna_related" ]
2528 X_train , X_test , y_train , y_test = train_test_split (X , y , test_size = 0.2 , random_state = 42 )
2629
27- pipeMNB = Pipeline (steps = [("tfidf" , TfidfVectorizer ()), ("clf" , MultinomialNB ())])
30+ pipeMNB = Pipeline (steps = [("tfidf" , TfidfVectorizer (stop_words = "english" , min_df = 5 )), ("clf" , MultinomialNB ())])
2831 pipeMNB .fit (X_train , y_train )
2932 predictMNB = pipeMNB .predict (X_test )
3033
@@ -44,20 +47,20 @@ def main():
4447 print (f"CNB: { accuracy_score (y_test , predictCNB ):.2f} " )
4548 print (f"SVC: { accuracy_score (y_test , predictSVC ):.2f} " )
4649 print (f"RF: { accuracy_score (y_test , predictRF ):.2f} " )
47- # MNB: 0.94
48- # CNB: 0.94
49- # SVC: 0.99
50+ # MNB: 0.93
51+ # CNB: 0.93
52+ # SVC: 0.98
5053 # RF: 0.96
5154
5255 print (classification_report (y_test , predictSVC ))
5356 # precision recall f1-score support
5457 #
55- # 0 0.99 0.98 0.98 669
56- # 1 0.98 0.99 0.99 670
58+ # 0 0.98 0.98 0.98 665
59+ # 1 0.98 0.98 0.98 674
5760 #
58- # accuracy 0.99 1339
59- # macro avg 0.99 0.99 0.99 1339
60- # weighted avg 0.99 0.99 0.99 1339
61+ # accuracy 0.98 1339
62+ # macro avg 0.98 0.98 0.98 1339
63+ # weighted avg 0.98 0.98 0.98 1339
6164
6265 joblib .dump (pipeSVC , "svc_pipeline.pkl" )
6366
0 commit comments