Skip to content

Commit 3341c72

Browse files
authored
Update sentiment_analyzer.py
1 parent a102da4 commit 3341c72

File tree

1 file changed

+53
-28
lines changed

1 file changed

+53
-28
lines changed

nlp-sentiment-analysis/sentiment_analyzer.py

Lines changed: 53 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,33 +2,43 @@
22
import random
33
import spacy
44
from spacy.util import minibatch, compounding
5+
import pandas as pd
56

67

78
TEST_REVIEW = """
89
Transcendently beautiful in moments outside the office, it seems almost
910
sitcom-like in those scenes. When Toni Colette walks out and ponders
1011
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
1112
whether it's slapstick, farce, magical realism, or drama, but the best of it
12-
doesn't matter. (The worst is sort of tedious - like Office Space with
13-
less humor.)
13+
doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)
1414
"""
1515

1616

17-
def train_model(training_data: list, test_data: list, iterations: int = 20):
17+
eval_list = []
18+
19+
20+
def train_model(
21+
training_data: list,
22+
test_data: list,
23+
iterations: int = 20
24+
) -> None:
1825
# Build pipeline
1926
nlp = spacy.load("en_core_web_sm")
2027
if "textcat" not in nlp.pipe_names:
21-
textcat = nlp.create_pipe("textcat", config={"architecture": "simple_cnn"})
28+
textcat = nlp.create_pipe(
29+
"textcat", config={"architecture": "simple_cnn"}
30+
)
2231
nlp.add_pipe(textcat, last=True)
2332
else:
2433
textcat = nlp.get_pipe("textcat")
2534

26-
# Add labels
2735
textcat.add_label("pos")
2836
textcat.add_label("neg")
2937

3038
# Train only textcat
31-
training_excluded_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
39+
training_excluded_pipes = [
40+
pipe for pipe in nlp.pipe_names if pipe != "textcat"
41+
]
3242
with nlp.disable_pipes(training_excluded_pipes):
3343
optimizer = nlp.begin_training()
3444
# Training loop
@@ -47,40 +57,46 @@ def train_model(training_data: list, test_data: list, iterations: int = 20):
4757
nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
4858
with textcat.model.use_params(optimizer.averages):
4959
evaluation_results = evaluate_model(
50-
tokenizer=nlp.tokenizer, textcat=textcat, test_data=test_data
60+
tokenizer=nlp.tokenizer,
61+
textcat=textcat,
62+
test_data=test_data
5163
)
5264
print(
53-
f"{loss['textcat']}\t{evaluation_results['precision']}\t{evaluation_results['recall']}\t{evaluation_results['f-score']}"
65+
f"{loss['textcat']}\t{evaluation_results['precision']}"
66+
f"\t{evaluation_results['recall']}"
67+
f"\t{evaluation_results['f-score']}"
5468
)
5569

5670
# Save model
5771
with nlp.use_params(optimizer.averages):
5872
nlp.to_disk("model_artifacts")
5973

6074

61-
def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
75+
def evaluate_model(
76+
tokenizer, textcat, test_data: list
77+
) -> dict:
6278
reviews, labels = zip(*test_data)
6379
reviews = (tokenizer(review) for review in reviews)
6480
true_positives = 0
6581
false_positives = 1e-8 # Can't be 0 because of presence in denominator
6682
true_negatives = 0
6783
false_negatives = 1e-8
6884
for i, review in enumerate(textcat.pipe(reviews)):
69-
true_label = labels[i]["cats"]
85+
true_label = labels[i]
7086
for predicted_label, score in review.cats.items():
71-
# Every `cats` dictionary includes both labels, you
72-
# can get all the info we need with just the pos label
87+
# Every cats dictionary includes both labels, you can get all
88+
# the info you need with just the pos label
7389
if (
7490
predicted_label == "neg"
7591
):
7692
continue
77-
if score >= 0.5 and true_label["pos"]:
93+
if score >= 0.5 and true_label == "pos":
7894
true_positives += 1
79-
elif score >= 0.5 and true_label["neg"]:
95+
elif score >= 0.5 and true_label == "neg":
8096
false_positives += 1
81-
elif score < 0.5 and true_label["neg"]:
97+
elif score < 0.5 and true_label == "neg":
8298
true_negatives += 1
83-
elif score < 0.5 and true_label["pos"]:
99+
elif score < 0.5 and true_label == "pos":
84100
false_negatives += 1
85101
precision = true_positives / (true_positives + false_positives)
86102
recall = true_positives / (true_positives + false_negatives)
@@ -95,21 +111,26 @@ def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
95111
def test_model(input_data: str = TEST_REVIEW):
    """Load the trained model from disk and print its sentiment prediction.

    Args:
        input_data: Review text to classify; defaults to the bundled
            ``TEST_REVIEW`` sample.
    """
    # Load saved trained model from the artifacts directory
    loaded_model = spacy.load("model_artifacts")
    # Run the pipeline to get per-label category scores
    parsed_text = loaded_model(input_data)
    # Whichever label scored higher wins; its score is the confidence
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction, score = "Positive", parsed_text.cats["pos"]
    else:
        prediction, score = "Negative", parsed_text.cats["neg"]
    print(
        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        f"\tScore: {score}"
    )
108127

109128

110129
def load_training_data(
111-
data_directory: str = "aclImdb/train", split: float = 0.8, limit: int = 0
112-
) -> list:
130+
data_directory: str = "aclImdb/train",
131+
split: float = 0.8,
132+
limit: int = 0
133+
) -> tuple:
113134
# Load from files
114135
reviews = []
115136
for label in ["pos", "neg"]:
@@ -121,20 +142,24 @@ def load_training_data(
121142
text = text.replace("<br />", "\n\n")
122143
if text.strip():
123144
spacy_label = {
124-
"cats": {"pos": "pos" == label, "neg": "neg" == label}
145+
"cats": {
146+
"pos": "pos" == label,
147+
"neg": "neg" == label}
125148
}
126149
reviews.append((text, spacy_label))
127-
# Shuffle
128150
random.shuffle(reviews)
151+
129152
if limit:
130153
reviews = reviews[:limit]
131154
split = int(len(reviews) * split)
132155
return reviews[:split], reviews[split:]
133156

134157

135158
if __name__ == "__main__":
    # Small limit keeps the demo run fast; raise it for a real training run.
    train, test = load_training_data(limit=25)
    print("Training model")
    train_model(train, test)
    # NOTE(review): eval_list looks empty here — nothing visible appends to
    # it, so this DataFrame/plot may be a no-op; confirm train_model (or a
    # helper) populates it.
    df = pd.DataFrame(eval_list)
    # Idiomatic bound-method call instead of pd.DataFrame.plot(df)
    df.plot()
    print("Testing model")
    test_model()

0 commit comments

Comments
 (0)