Create sentiment_analyzer.md

kylestratis · web-flow · commit d32811e471e4 · 2020-07-11T23:50:26.000-04:00
diff --git a/nlp-sentiment-analysis/sentiment_analyzer.md b/nlp-sentiment-analysis/sentiment_analyzer.md
@@ -0,0 +1,137 @@
+import os
+import random
+import spacy
+from spacy.util import minibatch, compounding
+
+
+TEST_REVIEW = """
+Transcendently beautiful in moments outside the office, it seems almost
+sitcom-like in those scenes. When Toni Colette walks out and ponders
+life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
+whether it's slapstick, farce, magical realism, or drama, but the best of it
+doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)
+"""
+
+
+def train_model(training_data: list, test_data: list, iterations: int = 20):
+    # Build pipeline
+    nlp = spacy.load("en_core_web_sm")
+    if "textcat" not in nlp.pipe_names:
+        textcat = nlp.create_pipe("textcat", config={"architecture": "simple_cnn"})
+        nlp.add_pipe(textcat, last=True)
+    else:
+        textcat = nlp.get_pipe("textcat")
+
+    # Add labels
+    textcat.add_label("pos")
+    textcat.add_label("neg")
+
+    # Train only textcat
+    training_excluded_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
+    with nlp.disable_pipes(training_excluded_pipes):
+        optimizer = nlp.begin_training()
+        # Training loop
+        print("Beginning training")
+        print("Loss\tPrecision\tRecall\tF-score")
+        batch_sizes = compounding(
+            4.0, 32.0, 1.001
+        )  # A generator that yields infinite series of input numbers
+        for i in range(iterations):
+            print(f"Training iteration {i}")
+            loss = {}
+            random.shuffle(training_data)
+            batches = minibatch(training_data, size=batch_sizes)
+            for batch in batches:
+                text, labels = zip(*batch)
+                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
+            with textcat.model.use_params(optimizer.averages):
+                evaluation_results = evaluate_model(
+                    tokenizer=nlp.tokenizer, textcat=textcat, test_data=test_data
+                )
+                print(
+                    f"{loss['textcat']}\t{evaluation_results['precision']}\t{evaluation_results['recall']}\t{evaluation_results['f-score']}"
+                )
+
+    # Save model
+    with nlp.use_params(optimizer.averages):
+        nlp.to_disk("model_artifacts")
+
+
+def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
+    reviews, labels = zip(*test_data)
+    reviews = (tokenizer(review) for review in reviews)
+    true_positives = 0
+    false_positives = 1e-8  # Can't be 0 because of presence in denominator
+    true_negatives = 0
+    false_negatives = 1e-8
+    for i, review in enumerate(textcat.pipe(reviews)):
+        true_label = labels[i]["cats"]
+        for predicted_label, score in review.cats.items():
+            if (
+                predicted_label == "neg"
+            ):  # Every `cats` dictionary  includes both labels, you can get all the info we need with just the pos label
+                continue
+            if score >= 0.5 and true_label["pos"]:
+                true_positives += 1
+            elif score >= 0.5 and true_label["neg"]:
+                false_positives += 1
+            elif score < 0.5 and true_label["neg"]:
+                true_negatives += 1
+            elif score < 0.5 and true_label["pos"]:
+                false_negatives += 1
+    precision = true_positives / (true_positives + false_positives)
+    recall = true_positives / (true_positives + false_negatives)
+
+    if precision + recall == 0:
+        f_score = 0
+    else:
+        f_score = 2 * (precision * recall) / (precision + recall)
+    return {"precision": precision, "recall": recall, "f-score": f_score}
+
+
+def test_model(input_data: str = TEST_REVIEW):
+    #  Load saved trained model
+    loaded_model = spacy.load("model_artifacts")
+    parsed_text = loaded_model(input_data)
+    prediction = (
+        "Positive" if parsed_text.cats["pos"] > parsed_text.cats["neg"] else "Negative"
+    )
+    score = (
+        parsed_text.cats["pos"] if prediction == "Positive" else parsed_text.cats["neg"]
+    )
+    print(
+        f"Review text: {input_data}\nPredicted sentiment: {prediction}\tScore: {score}"
+    )
+
+
+def load_training_data(
+    data_directory: str = "aclImdb/train", split: float = 0.8, limit: int = 0
+) -> list:
+    # Load from files
+    reviews = []
+    for label in ["pos", "neg"]:
+        labeled_directory = f"{data_directory}/{label}"
+        for review in os.listdir(labeled_directory):
+            if review.endswith(".txt"):
+                with open(f"{labeled_directory}/{review}") as f:
+                    text = f.read()
+                    text = text.replace("<br />", "\n\n")
+                    if text.strip():
+                        spacy_label = {
+                            "cats": {"pos": "pos" == label, "neg": "neg" == label}
+                        }
+                        reviews.append((text, spacy_label))
+    # Shuffle
+    random.shuffle(reviews)
+    if limit:
+        reviews = reviews[:limit]
+    split = int(len(reviews) * split)
+    return reviews[:split], reviews[split:]
+
+
+if __name__ == "__main__":
+    train, test = load_training_data(limit=2500)
+    print("Training model")
+    train_model(train, test)
+    print("Testing model")
+    test_model()