
Commit 74778f3

Merge pull request #135 from realpython/kylestratis-patch-2
Create sentiment_analyzer.py
2 parents: 80a721e + 11afcaa

3 files changed: +199, -0 lines


nlp-sentiment-analysis/README.md

Lines changed: 38 additions & 0 deletions
# Use Sentiment Analysis With Python to Classify Reviews

Resources and materials for Real Python's [Use Sentiment Analysis With Python to Classify Reviews](https://realpython.com/use-sentiment-analysis-python-classify-movie-reviews/) tutorial.

## Installation

Create and activate a new virtual environment:

```shell
$ python -m venv .venv
$ source .venv/bin/activate
```

Install the Python dependencies into the active virtual environment:

```shell
(.venv) $ python -m pip install -r requirements.txt
```

Download the English model for spaCy:

```shell
(.venv) $ python -m spacy download en_core_web_sm
```

Download and extract the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/) compiled by [Andrew Maas](http://www.andrew-maas.net/):

```shell
$ curl -s https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz | tar xvz
```

## Usage

Train the model and get the sentiment of the movie review stored in the `TEST_REVIEW` variable:

```shell
(.venv) $ python sentiment_analyzer.py
```
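Not part of the committed README, but as an illustrative usage sketch: once the script has been run at least once (so that a trained model exists in the `model_artifacts` directory), you could classify a review of your own by importing the script's `test_model()` function. The review text below is made up for the example, and the import assumes you run Python from the `nlp-sentiment-analysis` directory.

```python
# Illustrative sketch (not in this commit): classify a custom review.
# Assumes sentiment_analyzer.py is on the import path and that train_model()
# has already saved a trained model to the "model_artifacts" directory.
from sentiment_analyzer import test_model

my_review = "A heartfelt story with wonderful performances from start to finish."
test_model(input_data=my_review)  # Prints the review, the predicted sentiment, and its score
```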
nlp-sentiment-analysis/requirements.txt

Lines changed: 2 additions & 0 deletions
pandas==1.1.2
spacy==2.3.2
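As an optional sanity check (not part of this commit), you could confirm that the pinned versions are the ones actually installed in the active virtual environment:

```python
# Optional sanity check: confirm the pinned dependency versions are installed.
import pandas
import spacy

print(pandas.__version__)  # Expected: 1.1.2
print(spacy.__version__)   # Expected: 2.3.2
```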
nlp-sentiment-analysis/sentiment_analyzer.py

Lines changed: 159 additions & 0 deletions
import os
import random

import spacy
from spacy.util import minibatch, compounding
import pandas as pd


TEST_REVIEW = """
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
doesn't matter. (The worst is sort of tedious - like Office Space with less
humor.)
"""


eval_list = []


def train_model(
    training_data: list, test_data: list, iterations: int = 20
) -> None:
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF-score")
        batch_sizes = compounding(
            4.0, 32.0, 1.001
        )  # A generator that yields an infinite series of batch sizes (see the sketch after this file)
        for i in range(iterations):
            print(f"Training iteration {i}")
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data,
                )
                # Collect per-iteration metrics so the summary DataFrame
                # built in the __main__ block isn't empty
                eval_list.append(evaluation_results)
                print(
                    f"{loss['textcat']}\t{evaluation_results['precision']}"
                    f"\t{evaluation_results['recall']}"
                    f"\t{evaluation_results['f-score']}"
                )

    # Save model
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")


def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
    reviews, labels = zip(*test_data)
    reviews = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because of presence in denominator
    true_negatives = 0
    false_negatives = 1e-8
    for i, review in enumerate(textcat.pipe(reviews)):
        true_label = labels[i]["cats"]
        for predicted_label, score in review.cats.items():
            # Every cats dictionary includes both labels. You can get all
            # the info you need with just the pos label.
            if predicted_label == "neg":
                continue
            if score >= 0.5 and true_label["pos"]:
                true_positives += 1
            elif score >= 0.5 and true_label["neg"]:
                false_positives += 1
            elif score < 0.5 and true_label["neg"]:
                true_negatives += 1
            elif score < 0.5 and true_label["pos"]:
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    if precision + recall == 0:
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}


def test_model(input_data: str = TEST_REVIEW):
    # Load saved trained model
    loaded_model = spacy.load("model_artifacts")
    # Generate prediction
    parsed_text = loaded_model(input_data)
    # Determine prediction to return
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "Positive"
        score = parsed_text.cats["pos"]
    else:
        prediction = "Negative"
        score = parsed_text.cats["neg"]
    print(
        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        f"\tScore: {score}"
    )


def load_training_data(
    data_directory: str = "aclImdb/train", split: float = 0.8, limit: int = 0
) -> tuple:
    # Load from files
    reviews = []
    for label in ["pos", "neg"]:
        labeled_directory = f"{data_directory}/{label}"
        for review in os.listdir(labeled_directory):
            if review.endswith(".txt"):
                with open(f"{labeled_directory}/{review}") as f:
                    text = f.read()
                    text = text.replace("<br />", "\n\n")
                    if text.strip():
                        spacy_label = {
                            "cats": {
                                "pos": "pos" == label,
                                "neg": "neg" == label,
                            }
                        }
                        reviews.append((text, spacy_label))
    random.shuffle(reviews)

    if limit:
        reviews = reviews[:limit]
    split = int(len(reviews) * split)
    return reviews[:split], reviews[split:]


if __name__ == "__main__":
    train, test = load_training_data(limit=25)
    print("Training model")
    train_model(train, test)
    df = pd.DataFrame(eval_list)
    df.plot()  # Plotting requires matplotlib to be installed alongside pandas
    print("Testing model")
    test_model()
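For reference, the `compounding(4.0, 32.0, 1.001)` call in `train_model()` is a spaCy utility that yields an ever-growing sequence of batch sizes, and `minibatch()` consumes one value from it for each batch it produces. A small, illustrative sketch of how the two interact under spacy==2.3.2 (not part of this commit; the toy data below is made up):

```python
# Illustrative sketch: how compounding() and minibatch() work together.
from itertools import islice

from spacy.util import compounding, minibatch

# compounding(start, stop, compound) yields start, start * compound, ..., capped at stop.
print(list(islice(compounding(4.0, 32.0, 1.001), 4)))  # approximately [4.0, 4.004, 4.008, 4.012]

# minibatch() reads the next size for every batch it yields, so batches slowly grow toward 32 items.
toy_data = [(f"review {i}", {"cats": {"pos": True, "neg": False}}) for i in range(10)]
batches = list(minibatch(toy_data, size=compounding(4.0, 32.0, 1.001)))
print([len(batch) for batch in batches])  # [4, 4, 2]
```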
