{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 398
        },
        "id": "a0XVvs0_OqUu",
        "outputId": "0c1a6d2a-ef35-40fb-8198-bd9427c5754b"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Unique labels: ['anger', 'sadness', 'fear', 'joy']\n",
            "Categories (4, object): ['anger', 'fear', 'joy', 'sadness']\n",
            "Encoded labels: [0 3 1 2]\n",
            "Training set size: 6720\n",
            "Validation set size: 1680\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
            "  warnings.warn(\n",
            "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']\n",
            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
            "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
            "  warnings.warn(\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='1260' max='1260' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [1260/1260 39:01, Epoch 3/3]\n",
              "    </div>\n",
              "    <table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: left;\">\n",
              "      <th>Epoch</th>\n",
              "      <th>Training Loss</th>\n",
              "      <th>Validation Loss</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <td>1</td>\n",
              "      <td>0.002200</td>\n",
              "      <td>0.001418</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>2</td>\n",
              "      <td>0.000400</td>\n",
              "      <td>0.000254</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>3</td>\n",
              "      <td>0.000300</td>\n",
              "      <td>0.000174</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table><p>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "The sentiment classified is: joy\n"
          ]
        }
      ],
      "source": [
        "import pandas as pd\n",
        "import torch\n",
        "from sklearn.model_selection import train_test_split\n",
        "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
        "\n",
        "# Load the dataset\n",
        "df = pd.read_csv('emotion_data.csv')\n",
        "\n",
        "# Remove rows with invalid sentiments\n",
        "df = df[df['sentiment'].notna()]  # Drop rows with missing sentiment values\n",
        "df = df[df['sentiment'].isin(['anger', 'fear', 'joy', 'sadness'])]  # Keep only the four expected emotion labels\n",
        "\n",
        "# Encode labels\n",
        "df['sentiment'] = df['sentiment'].astype('category')\n",
        "df['label'] = df['sentiment'].cat.codes\n",
        "\n",
        "# Print unique labels to verify\n",
        "print(\"Unique labels:\", df['sentiment'].unique())\n",
        "print(\"Encoded labels:\", df['label'].unique())\n",
123
+ " \n " ,
124
+ " # Split the dataset\n " ,
125
+ " train_texts, val_texts, train_labels, val_labels = train_test_split(\n " ,
126
+ " df['content'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42\n " ,
127
+ " )\n " ,
128
+ " \n " ,
129
+ " # Check sizes of the datasets\n " ,
130
+ " print(\" Training set size:\" , len(train_texts))\n " ,
131
+ " print(\" Validation set size:\" , len(val_texts))\n " ,
        "\n",
        "# Load tokenizer and model\n",
        "model_name = \"distilbert-base-uncased\"\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
        "model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(df['sentiment'].unique()))\n",
        "\n",
        "# Tokenization\n",
        "train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)\n",
        "val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)\n",
        "\n",
        "# Create dataset class\n",
        "class EmotionDataset(torch.utils.data.Dataset):\n",
        "    def __init__(self, encodings, labels):\n",
        "        self.encodings = encodings\n",
        "        self.labels = labels\n",
        "\n",
        "    def __getitem__(self, idx):\n",
        "        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
        "        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)\n",
        "        return item\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.labels)\n",
        "\n",
        "# Create datasets\n",
        "train_dataset = EmotionDataset(train_encodings, train_labels)\n",
        "val_dataset = EmotionDataset(val_encodings, val_labels)\n",
        "\n",
        "# Set training arguments\n",
        "training_args = TrainingArguments(\n",
        "    output_dir='./results',\n",
        "    num_train_epochs=3,\n",
        "    per_device_train_batch_size=16,\n",
        "    per_device_eval_batch_size=64,\n",
        "    warmup_steps=500,\n",
        "    weight_decay=0.01,\n",
        "    logging_dir='./logs',\n",
        "    logging_steps=10,\n",
        "    evaluation_strategy='epoch'  # Deprecated (see warning above); renamed to eval_strategy in transformers >= 4.46\n",
        ")\n",
        "\n",
        "# Create Trainer\n",
        "trainer = Trainer(\n",
        "    model=model,\n",
        "    args=training_args,\n",
        "    train_dataset=train_dataset,\n",
        "    eval_dataset=val_dataset\n",
        ")\n",
        "\n",
        "# Train the model\n",
        "try:\n",
        "    trainer.train()\n",
        "except RuntimeError as e:\n",
        "    print(f\"Error during training: {e}\")\n",
        "\n",
        "# Save the model\n",
        "trainer.save_model(\"emotion_classifier\")\n",
        "# Function to classify sentiments\n",
        "def classify_sentiment(text):\n",
        "    inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, padding=True, max_length=128)\n",
        "    inputs = {key: val.to(model.device) for key, val in inputs.items()}  # Match the model's device (may be GPU after training)\n",
        "    model.eval()  # Disable dropout; Trainer leaves the model in train mode\n",
        "    with torch.no_grad():\n",
        "        outputs = model(**inputs)\n",
        "    predictions = torch.argmax(outputs.logits, dim=-1)\n",
        "    return df['sentiment'].cat.categories[predictions.item()]\n",
        "\n",
        "# Example usage\n",
        "if __name__ == \"__main__\":\n",
        "    text = \"alonzo feels extremely happy!\"\n",
        "    sentiment = classify_sentiment(text)\n",
        "    print(f\"The sentiment classified is: {sentiment}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "hLUDgJr5Pnx8"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}