import random
import spacy
from spacy.util import minibatch, compounding
+import pandas as pd
+# pandas' DataFrame.plot needs matplotlib; plt.show() below displays the
+# metrics figure when this file runs as a script
+import matplotlib.pyplot as plt


TEST_REVIEW = """
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
-doesn't matter. (The worst is sort of tedious - like Office Space with
-less humor.)
+doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)
"""


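+# Sample IMDb review used as the default input for test_model() below.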
-def train_model(training_data: list, test_data: list, iterations: int = 20):
+eval_list = []
+
+
+def train_model(
+    training_data: list,
+    test_data: list,
+    iterations: int = 20
+) -> None:
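+    # Fine-tunes only the textcat pipe, evaluates after each training
+    # iteration, and saves the trained pipeline to model_artifacts/.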
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
-        textcat = nlp.create_pipe("textcat", config={"architecture": "simple_cnn"})
+        textcat = nlp.create_pipe(
+            "textcat", config={"architecture": "simple_cnn"}
+        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

-    # Add labels
    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
-    training_excluded_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
+    training_excluded_pipes = [
+        pipe for pipe in nlp.pipe_names if pipe != "textcat"
+    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
@@ -47,40 +57,46 @@ def train_model(training_data: list, test_data: list, iterations: int = 20):
                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
-                    tokenizer=nlp.tokenizer, textcat=textcat, test_data=test_data
+                    tokenizer=nlp.tokenizer,
+                    textcat=textcat,
+                    test_data=test_data
                )
+                # Record this iteration's metrics so the __main__ block can
+                # plot them later; without this append, eval_list stays empty
+                eval_list.append(evaluation_results)
                print(
-                    f"{loss['textcat']}\t{evaluation_results['precision']}\t{evaluation_results['recall']}\t{evaluation_results['f-score']}"
+                    f"{loss['textcat']}\t{evaluation_results['precision']}"
+                    f"\t{evaluation_results['recall']}"
+                    f"\t{evaluation_results['f-score']}"
                )

    # Save model: persist the pipeline, including the trained textcat
    # weights, to model_artifacts/, where test_model() reloads it from
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")


-def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
+def evaluate_model(
+    tokenizer, textcat, test_data: list
+) -> dict:
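+    # Scores each held-out review with the trained textcat and tallies
+    # a confusion matrix against the true labels.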
    reviews, labels = zip(*test_data)
    reviews = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because of presence in denominator
    true_negatives = 0
    false_negatives = 1e-8
    for i, review in enumerate(textcat.pipe(reviews)):
-        true_label = labels[i]["cats"]
+        # Collapse the {"cats": {"pos": ..., "neg": ...}} annotation to a
+        # plain "pos"/"neg" string so the comparisons below work
+        true_label = "pos" if labels[i]["cats"]["pos"] else "neg"
        for predicted_label, score in review.cats.items():
-            # Every `cats` dictionary includes both labels, you
-            # can get all the info we need with just the pos label
+            # Every cats dictionary includes both labels, you can get all
+            # the info you need with just the pos label
            if predicted_label == "neg":
                continue
-            if score >= 0.5 and true_label["pos"]:
+            if score >= 0.5 and true_label == "pos":
                true_positives += 1
-            elif score >= 0.5 and true_label["neg"]:
+            elif score >= 0.5 and true_label == "neg":
                false_positives += 1
-            elif score < 0.5 and true_label["neg"]:
+            elif score < 0.5 and true_label == "neg":
                true_negatives += 1
-            elif score < 0.5 and true_label["pos"]:
+            elif score < 0.5 and true_label == "pos":
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
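+    # precision = TP / (TP + FP), recall = TP / (TP + FN); the f-score in
+    # the returned dict is their harmonic mean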
@@ -95,21 +111,26 @@ def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
def test_model(input_data: str = TEST_REVIEW):
    # Load saved trained model
    loaded_model = spacy.load("model_artifacts")
+    # Generate prediction
    parsed_text = loaded_model(input_data)
-    prediction = (
-        "Positive" if parsed_text.cats["pos"] > parsed_text.cats["neg"] else "Negative"
-    )
-    score = (
-        parsed_text.cats["pos"] if prediction == "Positive" else parsed_text.cats["neg"]
-    )
+    # Determine prediction to return
+    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
+        prediction = "Positive"
+        score = parsed_text.cats["pos"]
+    else:
+        prediction = "Negative"
+        score = parsed_text.cats["neg"]
    print(
-        f"Review text: {input_data}\nPredicted sentiment: {prediction}\tScore: {score}"
+        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
+        f"\tScore: {score}"
    )

def load_training_data(
-    data_directory: str = "aclImdb/train", split: float = 0.8, limit: int = 0
-) -> list:
+    data_directory: str = "aclImdb/train",
+    split: float = 0.8,
+    limit: int = 0
+) -> tuple:
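+    # Reads the pos/ and neg/ review files under data_directory, labels
+    # them for textcat, shuffles, and splits into train and test lists.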
    # Load from files
    reviews = []
    for label in ["pos", "neg"]:
@@ -121,20 +142,24 @@ def load_training_data(
                    text = text.replace("<br />", "\n\n")
                    if text.strip():
                        spacy_label = {
-                            "cats": {"pos": "pos" == label, "neg": "neg" == label}
+                            "cats": {
+                                "pos": "pos" == label,
+                                "neg": "neg" == label,
+                            }
                        }
                        reviews.append((text, spacy_label))
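+                        # (text, {"cats": {"pos": bool, "neg": bool}}) is
+                        # the annotation format spaCy's textcat expects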
-    # Shuffle
    random.shuffle(reviews)
+
    if limit:
        reviews = reviews[:limit]
    split = int(len(reviews) * split)
    return reviews[:split], reviews[split:]


if __name__ == "__main__":
-    train, test = load_training_data(limit=2500)
+    train, test = load_training_data(limit=25)
    print("Training model")
    train_model(train, test)
+    # Plot the per-iteration evaluation metrics collected in eval_list
+    df = pd.DataFrame(eval_list)
+    df.plot()
+    plt.show()
    print("Testing model")
    test_model()