Skip to content

Commit a4d8ad5

Browse files
committed
Run clean_text first
1 parent 6a274db commit a4d8ad5

File tree

1 file changed

+19
-1
lines changed

1 file changed

+19
-1
lines changed

training/app.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,31 @@
11
import joblib
22
import streamlit as st
3+
import re
4+
5+
6+
def clean_text(text: str) -> str:
    """
    Normalise an abstract before it is passed to the classifier.

    The text is lowercased, HTML tags are replaced by a space, content
    inside square brackets and URLs are removed, and runs of whitespace
    are collapsed to a single space.

    :param text: abstract text
    :return: cleaned text with leading/trailing whitespace stripped
    """
    # Applied in order: tags turn into spaces first so that the final
    # whitespace pass can collapse whatever gaps the removals leave behind.
    substitutions = (
        (r'<[^>]*>', " "),
        (r"\[.*?\]", ""),
        (r"https?://\S+|www\.\S+", ""),
        (r"\s+", " "),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
320

421

522
model = joblib.load("training/svc_pipeline.pkl")
623
st.title("Text Classification - RNA Related or Not")
724
user_input = st.text_area("Enter text for classification:", "")
825

926
if st.button("Classify"):
10-
if user_input.strip():
27+
if user_input:
28+
user_input = clean_text(user_input)
1129
prediction = model.predict([user_input])[0]
1230
if prediction:
1331
st.success(f"Prediction: RNA-related")

0 commit comments

Comments
 (0)