RNAcentral
diff --git a/training/svc_pipeline.pkl b/training/svc_pipeline.pkl
old mode 100644
new mode 100755
Binary files a/training/svc_pipeline.pkl (709 KB) and b/training/svc_pipeline.pkl (709 KB) differ
diff --git a/training/text_classification.py b/training/text_classification.py
Lines changed: 14 additions & 11 deletions
@@ -13,18 +13,21 @@
 def main():
     """
     Testing four text classifiers: LinearSVC, ComplementNB, MultinomialNB, and RandomForestClassifier.
+    The following TfidfVectorizer parameters were tested:
+    - stop_words="english"
+    - min_df=5
     """
     df = pd.read_csv("data.csv")
     print(df["rna_related"].value_counts())
     # rna_related
-    # 1    3347
-    # 0    3347
+    # 1    3363
+    # 0    3331
 
     X = df["abstract"]
     y = df["rna_related"]
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
-    pipeMNB = Pipeline(steps=[("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())])
+    pipeMNB = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words="english", min_df=5)), ("clf", MultinomialNB())])
     pipeMNB.fit(X_train, y_train)
     predictMNB = pipeMNB.predict(X_test)
 
@@ -44,20 +47,20 @@ def main():
     print(f"CNB: {accuracy_score(y_test, predictCNB):.2f}")
     print(f"SVC: {accuracy_score(y_test, predictSVC):.2f}")
     print(f"RF: {accuracy_score(y_test, predictRF):.2f}")
-    # MNB: 0.94
-    # CNB: 0.94
-    # SVC: 0.99
+    # MNB: 0.93
+    # CNB: 0.93
+    # SVC: 0.98
     # RF: 0.96
 
     print(classification_report(y_test, predictSVC))
     #               precision    recall  f1-score   support
     #
-    #            0       0.99      0.98      0.98       669
-    #            1       0.98      0.99      0.99       670
+    #            0       0.98      0.98      0.98       665
+    #            1       0.98      0.98      0.98       674
     #
-    #     accuracy                           0.99      1339
-    #    macro avg       0.99      0.99      0.99      1339
-    # weighted avg       0.99      0.99      0.99      1339
+    #     accuracy                           0.98      1339
+    #    macro avg       0.98      0.98      0.98      1339
+    # weighted avg       0.98      0.98      0.98      1339
 
     joblib.dump(pipeSVC, "svc_pipeline.pkl")