Merge pull request #1366 from mehul-m-prajapati/emotion

sanjay-kv · web-flow · commit 033994503483 · 2024-10-11T12:40:15.000+11:00
Added Emotion classification model
diff --git a/Classification Models/Emotion Classification/README.md b/Classification Models/Emotion Classification/README.md
@@ -0,0 +1,32 @@
+# Emotion Classification from Text
+
+This project implements an emotion classification model using a pre-trained transformer model (DistilBERT) to classify emotions based on text inputs. The model is trained on a dataset containing various emotional statements.
+
+## Table of Contents
+
+- [Overview](#overview)
+- [Dataset](#dataset)
+- [Installation](#installation)
+
+## Overview
+
+The goal of this project is to classify emotions expressed in text using natural language processing (NLP) techniques. We leverage the Hugging Face Transformers library to fine-tune a pre-trained DistilBERT model on our dataset.
+
+## Dataset
+
+The dataset used for training the model should have the following structure:
+
+| content                        | sentiment |
+|--------------------------------|-----------|
+| alonzo feels angry             | anger     |
+| alonzo feels sad               | sadness   |
+| alonzo feels terrified          | fear      |
+
+Make sure to place your dataset in the project directory and name it `emotion_data.csv`.
+
+## Installation
+
+To run this project, you'll need to install the required Python packages. You can do this using pip:
+
+```bash
+pip install transformers torch pandas scikit-learn
diff --git a/Classification Models/Emotion Classification/emotion.ipynb b/Classification Models/Emotion Classification/emotion.ipynb
@@ -0,0 +1,215 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 398
+        },
+        "id": "a0XVvs0_OqUu",
+        "outputId": "0c1a6d2a-ef35-40fb-8198-bd9427c5754b"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Unique labels: ['anger', 'sadness', 'fear', 'joy']\n",
+            "Categories (4, object): ['anger', 'fear', 'joy', 'sadness']\n",
+            "Encoded labels: [0 3 1 2]\n",
+            "Training set size: 6720\n",
+            "Validation set size: 1680\n"
+          ]
+        },
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+            "  warnings.warn(\n",
+            "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
+            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+            "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
+            "  warnings.warn(\n"
+          ]
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "\n",
+              "    <div>\n",
+              "      \n",
+              "      <progress value='1260' max='1260' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+              "      [1260/1260 39:01, Epoch 3/3]\n",
+              "    </div>\n",
+              "    <table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              " <tr style=\"text-align: left;\">\n",
+              "      <th>Epoch</th>\n",
+              "      <th>Training Loss</th>\n",
+              "      <th>Validation Loss</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <td>1</td>\n",
+              "      <td>0.002200</td>\n",
+              "      <td>0.001418</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <td>2</td>\n",
+              "      <td>0.000400</td>\n",
+              "      <td>0.000254</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <td>3</td>\n",
+              "      <td>0.000300</td>\n",
+              "      <td>0.000174</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table><p>"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "The sentiment classified is: joy\n"
+          ]
+        }
+      ],
+      "source": [
+        "import pandas as pd\n",
+        "import torch\n",
+        "from sklearn.model_selection import train_test_split\n",
+        "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
+        "\n",
+        "# Load the dataset\n",
+        "df = pd.read_csv('emotion_data.csv')\n",
+        "\n",
+        "# Remove rows with invalid sentiments\n",
+        "df = df[df['sentiment'].notna()]  # Remove rows with missing sentiment values\n",
+        "df = df[df['sentiment'].isin(df['sentiment'].unique())]  # Ensure only valid sentiments are included\n",
+        "\n",
+        "# Encode labels\n",
+        "df['sentiment'] = df['sentiment'].astype('category')\n",
+        "df['label'] = df['sentiment'].cat.codes\n",
+        "\n",
+        "# Print unique labels to verify\n",
+        "print(\"Unique labels:\", df['sentiment'].unique())\n",
+        "print(\"Encoded labels:\", df['label'].unique())\n",
+        "\n",
+        "# Split the dataset\n",
+        "train_texts, val_texts, train_labels, val_labels = train_test_split(\n",
+        "    df['content'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42\n",
+        ")\n",
+        "\n",
+        "# Check sizes of the datasets\n",
+        "print(\"Training set size:\", len(train_texts))\n",
+        "print(\"Validation set size:\", len(val_texts))\n",
+        "\n",
+        "# Load tokenizer and model\n",
+        "model_name = \"distilbert-base-uncased\"\n",
+        "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+        "model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(df['sentiment'].unique()))\n",
+        "\n",
+        "# Tokenization\n",
+        "train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)\n",
+        "val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)\n",
+        "\n",
+        "# Create dataset class\n",
+        "class EmotionDataset(torch.utils.data.Dataset):\n",
+        "    def __init__(self, encodings, labels):\n",
+        "        self.encodings = encodings\n",
+        "        self.labels = labels\n",
+        "\n",
+        "    def __getitem__(self, idx):\n",
+        "        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
+        "        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)\n",
+        "        return item\n",
+        "\n",
+        "    def __len__(self):\n",
+        "        return len(self.labels)\n",
+        "\n",
+        "# Create datasets\n",
+        "train_dataset = EmotionDataset(train_encodings, train_labels)\n",
+        "val_dataset = EmotionDataset(val_encodings, val_labels)\n",
+        "\n",
+        "# Set training arguments\n",
+        "training_args = TrainingArguments(\n",
+        "    output_dir='./results',\n",
+        "    num_train_epochs=3,\n",
+        "    per_device_train_batch_size=16,\n",
+        "    per_device_eval_batch_size=64,\n",
+        "    warmup_steps=500,\n",
+        "    weight_decay=0.01,\n",
+        "    logging_dir='./logs',\n",
+        "    logging_steps=10,\n",
+        "    evaluation_strategy='epoch'\n",
+        ")\n",
+        "\n",
+        "# Create Trainer\n",
+        "trainer = Trainer(\n",
+        "    model=model,\n",
+        "    args=training_args,\n",
+        "    train_dataset=train_dataset,\n",
+        "    eval_dataset=val_dataset\n",
+        ")\n",
+        "\n",
+        "# Train the model\n",
+        "try:\n",
+        "    trainer.train()\n",
+        "except RuntimeError as e:\n",
+        "    print(f\"Error during training: {e}\")\n",
+        "\n",
+        "# Save the model\n",
+        "trainer.save_model(\"emotion_classifier\")\n",
+        "\n",
+        "# Function to classify sentiments\n",
+        "def classify_sentiment(text):\n",
+        "    inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, padding=True, max_length=128)\n",
+        "    with torch.no_grad():\n",
+        "        outputs = model(**inputs)\n",
+        "    predictions = torch.argmax(outputs.logits, dim=-1)\n",
+        "    return df['sentiment'].cat.categories[predictions.item()]\n",
+        "\n",
+        "# Example usage\n",
+        "if __name__ == \"__main__\":\n",
+        "    text = \"alonzo feels extremely happy!\"\n",
+        "    sentiment = classify_sentiment(text)\n",
+        "    print(f\"The sentiment classified is: {sentiment}\")\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "hLUDgJr5Pnx8"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
diff --git a/Classification Models/Emotion Classification/emotion_data.csv b/Classification Models/Emotion Classification/emotion_data.csv