import random
import spacy
from spacy.util import minibatch, compounding
+import pandas as pd
+# pandas' DataFrame.plot needs matplotlib; plt.show() below displays the
+# metrics figure when this file runs as a script
+import matplotlib.pyplot as plt


TEST_REVIEW = """
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
-doesn't matter. (The worst is sort of tedious - like Office Space with
-less humor.)
+doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)
"""


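+# Sample IMDb review used as the default input for test_model() below.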
-def train_model(training_data: list, test_data: list, iterations: int = 20):
+eval_list = []
+
+
+def train_model(
+    training_data: list,
+    test_data: list,
+    iterations: int = 20
+) -> None:
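+    # Fine-tunes only the textcat pipe, evaluates after each training
+    # iteration, and saves the trained pipeline to model_artifacts/.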
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
-        textcat = nlp.create_pipe("textcat", config={"architecture": "simple_cnn"})
+        textcat = nlp.create_pipe(
+            "textcat", config={"architecture": "simple_cnn"}
+        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

-    # Add labels
    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
-    training_excluded_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
+    training_excluded_pipes = [
+        pipe for pipe in nlp.pipe_names if pipe != "textcat"
+    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
@@ -47,40 +57,46 @@ def train_model(training_data: list, test_data: list, iterations: int = 20):
                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
-                    tokenizer=nlp.tokenizer, textcat=textcat, test_data=test_data
+                    tokenizer=nlp.tokenizer,
+                    textcat=textcat,
+                    test_data=test_data
                )
+                # Record this iteration's metrics so the __main__ block can
+                # plot them later; without this append, eval_list stays empty
+                eval_list.append(evaluation_results)
                print(
-                    f"{loss['textcat']}\t{evaluation_results['precision']}\t{evaluation_results['recall']}\t{evaluation_results['f-score']}"
+                    f"{loss['textcat']}\t{evaluation_results['precision']}"
+                    f"\t{evaluation_results['recall']}"
+                    f"\t{evaluation_results['f-score']}"
                )

    # Save model: persist the pipeline, including the trained textcat
    # weights, to model_artifacts/, where test_model() reloads it from
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")


-def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
+def evaluate_model(
+    tokenizer, textcat, test_data: list
+) -> dict:
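+    # Scores each held-out review with the trained textcat and tallies
+    # a confusion matrix against the true labels.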
    reviews, labels = zip(*test_data)
    reviews = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because of presence in denominator
    true_negatives = 0
    false_negatives = 1e-8
    for i, review in enumerate(textcat.pipe(reviews)):
-        true_label = labels[i]["cats"]
+        # Collapse the {"cats": {"pos": ..., "neg": ...}} annotation to a
+        # plain "pos"/"neg" string so the comparisons below work
+        true_label = "pos" if labels[i]["cats"]["pos"] else "neg"
        for predicted_label, score in review.cats.items():
-            # Every `cats` dictionary includes both labels, you
-            # can get all the info we need with just the pos label
+            # Every cats dictionary includes both labels, you can get all
+            # the info you need with just the pos label
            if predicted_label == "neg":
                continue
-            if score >= 0.5 and true_label["pos"]:
+            if score >= 0.5 and true_label == "pos":
                true_positives += 1
-            elif score >= 0.5 and true_label["neg"]:
+            elif score >= 0.5 and true_label == "neg":
                false_positives += 1
-            elif score < 0.5 and true_label["neg"]:
+            elif score < 0.5 and true_label == "neg":
                true_negatives += 1
-            elif score < 0.5 and true_label["pos"]:
+            elif score < 0.5 and true_label == "pos":
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
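+    # precision = TP / (TP + FP), recall = TP / (TP + FN); the f-score in
+    # the returned dict is their harmonic mean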
@@ -95,21 +111,26 @@ def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
def test_model(input_data: str = TEST_REVIEW):
    # Load saved trained model
    loaded_model = spacy.load("model_artifacts")
+    # Generate prediction
    parsed_text = loaded_model(input_data)
-    prediction = (
-        "Positive" if parsed_text.cats["pos"] > parsed_text.cats["neg"] else "Negative"
-    )
-    score = (
-        parsed_text.cats["pos"] if prediction == "Positive" else parsed_text.cats["neg"]
-    )
+    # Determine prediction to return
+    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
+        prediction = "Positive"
+        score = parsed_text.cats["pos"]
+    else:
+        prediction = "Negative"
+        score = parsed_text.cats["neg"]
    print(
-        f"Review text: {input_data}\nPredicted sentiment: {prediction}\tScore: {score}"
+        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
+        f"\tScore: {score}"
    )

def load_training_data(
-    data_directory: str = "aclImdb/train", split: float = 0.8, limit: int = 0
-) -> list:
+    data_directory: str = "aclImdb/train",
+    split: float = 0.8,
+    limit: int = 0
+) -> tuple:
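+    # Reads the pos/ and neg/ review files under data_directory, labels
+    # them for textcat, shuffles, and splits into train and test lists.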
    # Load from files
    reviews = []
    for label in ["pos", "neg"]:
@@ -121,20 +142,24 @@ def load_training_data(
                    text = text.replace("<br />", "\n\n")
                    if text.strip():
                        spacy_label = {
-                            "cats": {"pos": "pos" == label, "neg": "neg" == label}
+                            "cats": {
+                                "pos": "pos" == label,
+                                "neg": "neg" == label,
+                            }
                        }
                        reviews.append((text, spacy_label))
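+                        # (text, {"cats": {"pos": bool, "neg": bool}}) is
+                        # the annotation format spaCy's textcat expects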
-    # Shuffle
    random.shuffle(reviews)
+
    if limit:
        reviews = reviews[:limit]
    split = int(len(reviews) * split)
    return reviews[:split], reviews[split:]


if __name__ == "__main__":
-    train, test = load_training_data(limit=2500)
+    train, test = load_training_data(limit=25)
    print("Training model")
    train_model(train, test)
+    # Plot the per-iteration evaluation metrics collected in eval_list
+    df = pd.DataFrame(eval_list)
+    df.plot()
+    plt.show()
    print("Testing model")
    test_model()