32 changes: 32 additions & 0 deletions Classification Models/Emotion Classification/README.md
@@ -0,0 +1,32 @@
# Emotion Classification from Text

This project fine-tunes a pre-trained transformer model (DistilBERT) to classify the emotion expressed in text inputs. The model is trained on a dataset of labeled emotional statements.

## Table of Contents

- [Overview](#overview)
- [Dataset](#dataset)
- [Installation](#installation)

## Overview

The goal of this project is to classify emotions expressed in text using natural language processing (NLP) techniques. We leverage the Hugging Face Transformers library to fine-tune a pre-trained DistilBERT model on our dataset.
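Once the notebook has been run and the fine-tuned model saved to the `emotion_classifier` directory, it can be reloaded for inference. A minimal sketch (the notebook saves only the model, not the tokenizer, so the base tokenizer is reloaded; the label order follows pandas' alphabetical category codes, as printed in the notebook output):

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The notebook saves only the model, so reload the matching base tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("emotion_classifier")

# pandas assigns category codes alphabetically
labels = ["anger", "fear", "joy", "sadness"]

inputs = tokenizer("alonzo feels extremely happy!", return_tensors="pt",
                   truncation=True, padding=True, max_length=128)
with torch.no_grad():
    logits = model(**inputs).logits
print(labels[logits.argmax(dim=-1).item()])  # expected: joy
```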

## Dataset

The dataset used for training the model should have the following structure:

| content | sentiment |
|--------------------------------|-----------|
| alonzo feels angry | anger |
| alonzo feels sad | sadness |
| alonzo feels terrified | fear |

Make sure to place your dataset in the project directory and name it `emotion_data.csv`.
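Before training, it is worth sanity-checking the file. A small sketch, assuming the two-column layout shown above:

```python
import pandas as pd

df = pd.read_csv("emotion_data.csv")

# The notebook expects exactly these two columns
assert {"content", "sentiment"} <= set(df.columns)

# Inspect the label distribution for obvious imbalance or typos
print(df["sentiment"].value_counts())
```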

## Installation

To run this project, you'll need to install the required Python packages. You can do this using pip:

```bash
pip install transformers torch pandas scikit-learn
```
215 changes: 215 additions & 0 deletions Classification Models/Emotion Classification/emotion.ipynb
@@ -0,0 +1,215 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 398
},
"id": "a0XVvs0_OqUu",
"outputId": "0c1a6d2a-ef35-40fb-8198-bd9427c5754b"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Unique labels: ['anger', 'sadness', 'fear', 'joy']\n",
"Categories (4, object): ['anger', 'fear', 'joy', 'sadness']\n",
"Encoded labels: [0 3 1 2]\n",
"Training set size: 6720\n",
"Validation set size: 1680\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
" warnings.warn(\n",
"Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"/usr/local/lib/python3.10/dist-packages/transformers/training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
" warnings.warn(\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.HTML object>"
],
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='1260' max='1260' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [1260/1260 39:01, Epoch 3/3]\n",
" </div>\n",
" <table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Epoch</th>\n",
" <th>Training Loss</th>\n",
" <th>Validation Loss</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.002200</td>\n",
" <td>0.001418</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.000400</td>\n",
" <td>0.000254</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.000300</td>\n",
" <td>0.000174</td>\n",
" </tr>\n",
" </tbody>\n",
"</table><p>"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"The sentiment classified is: joy\n"
]
}
],
"source": [
"import pandas as pd\n",
"import torch\n",
"from sklearn.model_selection import train_test_split\n",
"from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
"\n",
"# Load the dataset\n",
"df = pd.read_csv('emotion_data.csv')\n",
"\n",
"# Remove rows with invalid sentiments\n",
"df = df[df['sentiment'].notna()] # Remove rows with missing sentiment values\n",
"df = df[df['sentiment'].isin(df['sentiment'].unique())] # Ensure only valid sentiments are included\n",
"\n",
"# Encode labels\n",
"df['sentiment'] = df['sentiment'].astype('category')\n",
"df['label'] = df['sentiment'].cat.codes\n",
"\n",
"# Print unique labels to verify\n",
"print(\"Unique labels:\", df['sentiment'].unique())\n",
"print(\"Encoded labels:\", df['label'].unique())\n",
"\n",
"# Split the dataset\n",
"train_texts, val_texts, train_labels, val_labels = train_test_split(\n",
" df['content'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42\n",
")\n",
"\n",
"# Check sizes of the datasets\n",
"print(\"Training set size:\", len(train_texts))\n",
"print(\"Validation set size:\", len(val_texts))\n",
"\n",
"# Load tokenizer and model\n",
"model_name = \"distilbert-base-uncased\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(df['sentiment'].unique()))\n",
"\n",
"# Tokenization\n",
"train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)\n",
"val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)\n",
"\n",
"# Create dataset class\n",
"class EmotionDataset(torch.utils.data.Dataset):\n",
" def __init__(self, encodings, labels):\n",
" self.encodings = encodings\n",
" self.labels = labels\n",
"\n",
" def __getitem__(self, idx):\n",
" item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
" item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)\n",
" return item\n",
"\n",
" def __len__(self):\n",
" return len(self.labels)\n",
"\n",
"# Create datasets\n",
"train_dataset = EmotionDataset(train_encodings, train_labels)\n",
"val_dataset = EmotionDataset(val_encodings, val_labels)\n",
"\n",
"# Set training arguments\n",
"training_args = TrainingArguments(\n",
" output_dir='./results',\n",
" num_train_epochs=3,\n",
" per_device_train_batch_size=16,\n",
" per_device_eval_batch_size=64,\n",
" warmup_steps=500,\n",
" weight_decay=0.01,\n",
" logging_dir='./logs',\n",
" logging_steps=10,\n",
" evaluation_strategy='epoch'\n",
")\n",
"\n",
"# Create Trainer\n",
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=train_dataset,\n",
" eval_dataset=val_dataset\n",
")\n",
"\n",
"# Train the model\n",
"try:\n",
" trainer.train()\n",
"except RuntimeError as e:\n",
" print(f\"Error during training: {e}\")\n",
"\n",
"# Save the model\n",
"trainer.save_model(\"emotion_classifier\")\n",
"\n",
"# Function to classify sentiments\n",
"def classify_sentiment(text):\n",
" inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, padding=True, max_length=128)\n",
" with torch.no_grad():\n",
" outputs = model(**inputs)\n",
" predictions = torch.argmax(outputs.logits, dim=-1)\n",
" return df['sentiment'].cat.categories[predictions.item()]\n",
"\n",
"# Example usage\n",
"if __name__ == \"__main__\":\n",
" text = \"alonzo feels extremely happy!\"\n",
" sentiment = classify_sentiment(text)\n",
" print(f\"The sentiment classified is: {sentiment}\")\n"
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "hLUDgJr5Pnx8"
},
"execution_count": null,
"outputs": []
}
]
}