Skip to content

Commit 21307fa

Browse files
committed
Update model
1 parent e35bc49 commit 21307fa

File tree

2 files changed

+14
-11
lines changed

2 files changed

+14
-11
lines changed

training/svc_pipeline.pkl

100644 → 100755
709 KB
Binary file not shown.

training/text_classification.py

Lines changed: 14 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -13,18 +13,21 @@
1313
def main():
1414
"""
1515
Testing four text classifiers: LinearSVC, ComplementNB, MultinomialNB, and RandomForestClassifier.
16+
The following TfidfVectorizer parameters were tested:
17+
- stop_words="english"
18+
- min_df=5
1619
"""
1720
df = pd.read_csv("data.csv")
1821
print(df["rna_related"].value_counts())
1922
# rna_related
20-
# 1 3347
21-
# 0 3347
23+
# 1 3363
24+
# 0 3331
2225

2326
X = df["abstract"]
2427
y = df["rna_related"]
2528
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
2629

27-
pipeMNB = Pipeline(steps=[("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())])
30+
pipeMNB = Pipeline(steps=[("tfidf", TfidfVectorizer(stop_words="english", min_df=5)), ("clf", MultinomialNB())])
2831
pipeMNB.fit(X_train, y_train)
2932
predictMNB = pipeMNB.predict(X_test)
3033

@@ -44,20 +47,20 @@ def main():
4447
print(f"CNB: {accuracy_score(y_test, predictCNB):.2f}")
4548
print(f"SVC: {accuracy_score(y_test, predictSVC):.2f}")
4649
print(f"RF: {accuracy_score(y_test, predictRF):.2f}")
47-
# MNB: 0.94
48-
# CNB: 0.94
49-
# SVC: 0.99
50+
# MNB: 0.93
51+
# CNB: 0.93
52+
# SVC: 0.98
5053
# RF: 0.96
5154

5255
print(classification_report(y_test, predictSVC))
5356
# precision recall f1-score support
5457
#
55-
# 0 0.99 0.98 0.98 669
56-
# 1 0.98 0.99 0.99 670
58+
# 0 0.98 0.98 0.98 665
59+
# 1 0.98 0.98 0.98 674
5760
#
58-
# accuracy 0.99 1339
59-
# macro avg 0.99 0.99 0.99 1339
60-
# weighted avg 0.99 0.99 0.99 1339
61+
# accuracy 0.98 1339
62+
# macro avg 0.98 0.98 0.98 1339
63+
# weighted avg 0.98 0.98 0.98 1339
6164

6265
joblib.dump(pipeSVC, "svc_pipeline.pkl")
6366

0 commit comments

Comments
 (0)