Skip to content

Commit 0339945

Browse files
authored
Merge pull request #1366 from mehul-m-prajapati/emotion
Added Emotion classification model
2 parents 0ce2bfc + d94d605 commit 0339945

File tree

3 files changed

+8888
-0
lines changed

3 files changed

+8888
-0
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Emotion Classification from Text
2+
3+
This project fine-tunes a pre-trained transformer model (DistilBERT) to classify the emotion expressed in a piece of text. The model is trained on a dataset of statements, each labeled with an emotion.
4+
5+
## Table of Contents
6+
7+
- [Overview](#overview)
8+
- [Dataset](#dataset)
9+
- [Installation](#installation)
10+
11+
## Overview
12+
13+
The goal of this project is to classify emotions expressed in text using natural language processing (NLP) techniques. We leverage the Hugging Face Transformers library to fine-tune a pre-trained DistilBERT model on our dataset.
14+
15+
## Dataset
16+
17+
The training dataset must be a CSV file with two columns: `content` (the input text) and `sentiment` (the emotion label). For example:
18+
19+
| content | sentiment |
20+
|--------------------------------|-----------|
21+
| alonzo feels angry | anger |
22+
| alonzo feels sad | sadness |
23+
| alonzo feels terrified | fear |
24+
25+
Make sure to place your dataset in the project directory and name it `emotion_data.csv`.
26+
27+
## Installation
28+
29+
To run this project, you'll need to install the required Python packages. You can do this using pip:
30+
31+
```bash
32+
pip install transformers torch pandas scikit-learn
```
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"provenance": []
7+
},
8+
"kernelspec": {
9+
"name": "python3",
10+
"display_name": "Python 3"
11+
},
12+
"language_info": {
13+
"name": "python"
14+
}
15+
},
16+
"cells": [
17+
{
18+
"cell_type": "code",
19+
"execution_count": 5,
20+
"metadata": {
21+
"colab": {
22+
"base_uri": "https://localhost:8080/",
23+
"height": 398
24+
},
25+
"id": "a0XVvs0_OqUu",
26+
"outputId": "0c1a6d2a-ef35-40fb-8198-bd9427c5754b"
27+
},
28+
"outputs": [
29+
{
30+
"output_type": "stream",
31+
"name": "stdout",
32+
"text": [
33+
"Unique labels: ['anger', 'sadness', 'fear', 'joy']\n",
34+
"Categories (4, object): ['anger', 'fear', 'joy', 'sadness']\n",
35+
"Encoded labels: [0 3 1 2]\n",
36+
"Training set size: 6720\n",
37+
"Validation set size: 1680\n"
38+
]
39+
},
40+
{
41+
"output_type": "stream",
42+
"name": "stderr",
43+
"text": [
44+
"/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
45+
" warnings.warn(\n",
46+
"Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
47+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
48+
"/usr/local/lib/python3.10/dist-packages/transformers/training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
49+
" warnings.warn(\n"
50+
]
51+
},
52+
{
53+
"output_type": "display_data",
54+
"data": {
55+
"text/plain": [
56+
"<IPython.core.display.HTML object>"
57+
],
58+
"text/html": [
59+
"\n",
60+
" <div>\n",
61+
" \n",
62+
" <progress value='1260' max='1260' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
63+
" [1260/1260 39:01, Epoch 3/3]\n",
64+
" </div>\n",
65+
" <table border=\"1\" class=\"dataframe\">\n",
66+
" <thead>\n",
67+
" <tr style=\"text-align: left;\">\n",
68+
" <th>Epoch</th>\n",
69+
" <th>Training Loss</th>\n",
70+
" <th>Validation Loss</th>\n",
71+
" </tr>\n",
72+
" </thead>\n",
73+
" <tbody>\n",
74+
" <tr>\n",
75+
" <td>1</td>\n",
76+
" <td>0.002200</td>\n",
77+
" <td>0.001418</td>\n",
78+
" </tr>\n",
79+
" <tr>\n",
80+
" <td>2</td>\n",
81+
" <td>0.000400</td>\n",
82+
" <td>0.000254</td>\n",
83+
" </tr>\n",
84+
" <tr>\n",
85+
" <td>3</td>\n",
86+
" <td>0.000300</td>\n",
87+
" <td>0.000174</td>\n",
88+
" </tr>\n",
89+
" </tbody>\n",
90+
"</table><p>"
91+
]
92+
},
93+
"metadata": {}
94+
},
95+
{
96+
"output_type": "stream",
97+
"name": "stdout",
98+
"text": [
99+
"The sentiment classified is: joy\n"
100+
]
101+
}
102+
],
103+
"source": [
104+
"import pandas as pd\n",
105+
"import torch\n",
106+
"from sklearn.model_selection import train_test_split\n",
107+
"from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
108+
"\n",
109+
"# Load the dataset\n",
110+
"df = pd.read_csv('emotion_data.csv')\n",
111+
"\n",
112+
"# Remove rows with invalid sentiments\n",
113+
"df = df[df['sentiment'].notna()] # Remove rows with missing sentiment values\n",
114+
"df = df[df['sentiment'].isin(df['sentiment'].unique())] # Ensure only valid sentiments are included\n",
115+
"\n",
116+
"# Encode labels\n",
117+
"df['sentiment'] = df['sentiment'].astype('category')\n",
118+
"df['label'] = df['sentiment'].cat.codes\n",
119+
"\n",
120+
"# Print unique labels to verify\n",
121+
"print(\"Unique labels:\", df['sentiment'].unique())\n",
122+
"print(\"Encoded labels:\", df['label'].unique())\n",
123+
"\n",
124+
"# Split the dataset\n",
125+
"train_texts, val_texts, train_labels, val_labels = train_test_split(\n",
126+
" df['content'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42\n",
127+
")\n",
128+
"\n",
129+
"# Check sizes of the datasets\n",
130+
"print(\"Training set size:\", len(train_texts))\n",
131+
"print(\"Validation set size:\", len(val_texts))\n",
132+
"\n",
133+
"# Load tokenizer and model\n",
134+
"model_name = \"distilbert-base-uncased\"\n",
135+
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
136+
"model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(df['sentiment'].unique()))\n",
137+
"\n",
138+
"# Tokenization\n",
139+
"train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)\n",
140+
"val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)\n",
141+
"\n",
142+
"# Create dataset class\n",
143+
"class EmotionDataset(torch.utils.data.Dataset):\n",
144+
" def __init__(self, encodings, labels):\n",
145+
" self.encodings = encodings\n",
146+
" self.labels = labels\n",
147+
"\n",
148+
" def __getitem__(self, idx):\n",
149+
" item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
150+
" item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)\n",
151+
" return item\n",
152+
"\n",
153+
" def __len__(self):\n",
154+
" return len(self.labels)\n",
155+
"\n",
156+
"# Create datasets\n",
157+
"train_dataset = EmotionDataset(train_encodings, train_labels)\n",
158+
"val_dataset = EmotionDataset(val_encodings, val_labels)\n",
159+
"\n",
160+
"# Set training arguments\n",
161+
"training_args = TrainingArguments(\n",
162+
" output_dir='./results',\n",
163+
" num_train_epochs=3,\n",
164+
" per_device_train_batch_size=16,\n",
165+
" per_device_eval_batch_size=64,\n",
166+
" warmup_steps=500,\n",
167+
" weight_decay=0.01,\n",
168+
" logging_dir='./logs',\n",
169+
" logging_steps=10,\n",
170+
" evaluation_strategy='epoch'\n",
171+
")\n",
172+
"\n",
173+
"# Create Trainer\n",
174+
"trainer = Trainer(\n",
175+
" model=model,\n",
176+
" args=training_args,\n",
177+
" train_dataset=train_dataset,\n",
178+
" eval_dataset=val_dataset\n",
179+
")\n",
180+
"\n",
181+
"# Train the model\n",
182+
"try:\n",
183+
" trainer.train()\n",
184+
"except RuntimeError as e:\n",
185+
" print(f\"Error during training: {e}\")\n",
186+
"\n",
187+
"# Save the model\n",
188+
"trainer.save_model(\"emotion_classifier\")\n",
189+
"\n",
190+
"# Function to classify sentiments\n",
191+
"def classify_sentiment(text):\n",
192+
" inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, padding=True, max_length=128)\n",
193+
" with torch.no_grad():\n",
194+
" outputs = model(**inputs)\n",
195+
" predictions = torch.argmax(outputs.logits, dim=-1)\n",
196+
" return df['sentiment'].cat.categories[predictions.item()]\n",
197+
"\n",
198+
"# Example usage\n",
199+
"if __name__ == \"__main__\":\n",
200+
" text = \"alonzo feels extremely happy!\"\n",
201+
" sentiment = classify_sentiment(text)\n",
202+
" print(f\"The sentiment classified is: {sentiment}\")\n"
203+
]
204+
},
205+
{
206+
"cell_type": "code",
207+
"source": [],
208+
"metadata": {
209+
"id": "hLUDgJr5Pnx8"
210+
},
211+
"execution_count": null,
212+
"outputs": []
213+
}
214+
]
215+
}

0 commit comments

Comments
 (0)