
Commit 18eb082

Merge pull request #1585 from okaditya84/master
Added sentiment analyzer program
2 parents 7d2be06 + 2df2a60

File tree

5 files changed: +385 −0 lines changed

Sentiment Analyser/README.md

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
# Sentiment Analyser

This is a sentiment analyser that takes in a sentence and returns the sentiment of the sentence.
It rates the emotions present in the sentence on a scale from 0 to 1.
It uses NLP and ML algorithms to do so.
The word2vec embeddings are trained on IMDB movie reviews, and the classifier is an RNN built on top of them.


## Setup instructions

- Install Python 3.6 or above.
- Install the required packages using the following command:
```bash
pip install -r requirements.txt
```
- First execute the word2vec.ipynb notebook to train the embeddings.
- Then execute the RNN(w2v).ipynb notebook to train and test the RNN classifier.
- Adjust the epochs and batch size based on your system configuration and the accuracy you want.
- The notebook will then create a CSV file with the actual and predicted sentiment of each test review.

## Output

Check the screenshot in this folder for the output.

## Author(s)

- [Aditya Jethani](https://github.com/okaditya84)
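Once both notebooks have run, scoring a new sentence with the saved artifacts can look roughly like the following minimal sketch (model.pkl, tokenizer.pkl, and maxlen=100 are taken from RNN(w2v).ipynb; the example sentence is an arbitrary assumption, not part of this PR):

```python
# Minimal inference sketch, assuming the model.pkl and tokenizer.pkl
# files written by RNN(w2v).ipynb and the same maxlen used in training.
import pickle

from tensorflow.keras.preprocessing.sequence import pad_sequences

model = pickle.load(open('model.pkl', 'rb'))
tokenizer = pickle.load(open('tokenizer.pkl', 'rb'))

sentence = "This movie was surprisingly good."   # arbitrary example input
seq = tokenizer.texts_to_sequences([sentence])
padded = pad_sequences(seq, maxlen=100)          # must match the training maxlen

score = float(model.predict(padded)[0][0])       # sigmoid output in [0, 1]
print(f"sentiment: {score:.3f} ({'positive' if score > 0.5 else 'negative'})")
```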

Sentiment Analyser/RNN(w2v).ipynb

Lines changed: 197 additions & 0 deletions
@@ -0,0 +1,197 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e5997f58",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/15\n",
      "196/196 [==============================] - 59s 272ms/step - loss: 0.6775 - acc: 0.5654 - val_loss: 0.6538 - val_acc: 0.6173 - lr: 0.0010\n",
      "Epoch 2/15\n",
      "196/196 [==============================] - 52s 265ms/step - loss: 0.6585 - acc: 0.6066 - val_loss: 0.6501 - val_acc: 0.6184 - lr: 0.0010\n",
      "Epoch 3/15\n",
      "196/196 [==============================] - 55s 280ms/step - loss: 0.6488 - acc: 0.6216 - val_loss: 0.6378 - val_acc: 0.6365 - lr: 0.0010\n",
      "Epoch 4/15\n",
      "196/196 [==============================] - 58s 294ms/step - loss: 0.6427 - acc: 0.6304 - val_loss: 0.6505 - val_acc: 0.6307 - lr: 0.0010\n",
      "Epoch 5/15\n",
      "196/196 [==============================] - ETA: 0s - loss: 0.6337 - acc: 0.6417\n",
      "Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.\n",
      "196/196 [==============================] - 57s 291ms/step - loss: 0.6337 - acc: 0.6417 - val_loss: 0.6780 - val_acc: 0.6249 - lr: 0.0010\n",
      "Epoch 6/15\n",
      "196/196 [==============================] - 55s 280ms/step - loss: 0.6225 - acc: 0.6509 - val_loss: 0.6244 - val_acc: 0.6507 - lr: 1.0000e-04\n",
      "Epoch 7/15\n",
      "196/196 [==============================] - 55s 280ms/step - loss: 0.6213 - acc: 0.6510 - val_loss: 0.6274 - val_acc: 0.6496 - lr: 1.0000e-04\n",
      "Epoch 8/15\n",
      "196/196 [==============================] - 55s 281ms/step - loss: 0.6181 - acc: 0.6549 - val_loss: 0.6220 - val_acc: 0.6522 - lr: 1.0000e-04\n",
      "Epoch 9/15\n",
      "196/196 [==============================] - 56s 285ms/step - loss: 0.6180 - acc: 0.6551 - val_loss: 0.6195 - val_acc: 0.6536 - lr: 1.0000e-04\n",
      "Epoch 10/15\n",
      "196/196 [==============================] - 56s 284ms/step - loss: 0.6165 - acc: 0.6585 - val_loss: 0.6242 - val_acc: 0.6512 - lr: 1.0000e-04\n",
      "Epoch 11/15\n",
      "196/196 [==============================] - 56s 287ms/step - loss: 0.6176 - acc: 0.6549 - val_loss: 0.6187 - val_acc: 0.6538 - lr: 1.0000e-04\n",
      "Epoch 12/15\n",
      "196/196 [==============================] - 56s 285ms/step - loss: 0.6148 - acc: 0.6585 - val_loss: 0.6178 - val_acc: 0.6574 - lr: 1.0000e-04\n",
      "Epoch 13/15\n",
      "196/196 [==============================] - 56s 287ms/step - loss: 0.6160 - acc: 0.6599 - val_loss: 0.6160 - val_acc: 0.6574 - lr: 1.0000e-04\n",
      "Epoch 14/15\n",
      "196/196 [==============================] - 54s 275ms/step - loss: 0.6142 - acc: 0.6572 - val_loss: 0.6156 - val_acc: 0.6571 - lr: 1.0000e-04\n",
      "Epoch 15/15\n",
      "196/196 [==============================] - 54s 274ms/step - loss: 0.6136 - acc: 0.6595 - val_loss: 0.6162 - val_acc: 0.6576 - lr: 1.0000e-04\n",
      "196/196 [==============================] - 7s 35ms/step - loss: 0.6162 - acc: 0.6576\n",
      "Test score: 0.6162243485450745\n",
      "Test accuracy: 0.6576399803161621\n"
     ]
    }
   ],
   "source": [
    "from keras.datasets import imdb\n",
    "from keras.models import Sequential\n",
    "from keras.layers import LSTM, Dense, Embedding, Dropout\n",
    "from keras.callbacks import EarlyStopping, ReduceLROnPlateau\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from gensim.models import Word2Vec\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "import pickle\n",
    "\n",
    "# Load the IMDB dataset and split it into training and test sets\n",
    "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)\n",
    "\n",
    "# Tokenize the text and convert it to sequences\n",
    "tokenizer = Tokenizer(num_words=10000)\n",
    "x_train_str = [str(text) for text in x_train]\n",
    "tokenizer.fit_on_texts(x_train_str)\n",
    "x_train = tokenizer.texts_to_sequences(x_train_str)\n",
    "x_test_str = [str(text) for text in x_test]\n",
    "x_test = tokenizer.texts_to_sequences(x_test_str)\n",
    "\n",
    "# Pad the sequences to a fixed length\n",
    "maxlen = 100\n",
    "x_train = pad_sequences(x_train, maxlen=maxlen)\n",
    "x_test = pad_sequences(x_test, maxlen=maxlen)\n",
    "\n",
    "# Load the pre-trained Word2Vec model\n",
    "w2v_model = Word2Vec.load('w2v_model.bin')\n",
    "\n",
    "# Create the embedding matrix from the Word2Vec vectors\n",
    "word_index = tokenizer.word_index\n",
    "embedding_matrix = np.zeros((len(word_index) + 1, 100))\n",
    "for word, i in word_index.items():\n",
    "    if word in w2v_model.wv.key_to_index:\n",
    "        embedding_matrix[i] = w2v_model.wv[word]\n",
    "\n",
    "# Define the model architecture (frozen embeddings, two stacked LSTMs)\n",
    "model = Sequential()\n",
    "model.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False))\n",
    "model.add(Dropout(0.2))\n",
    "model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))\n",
    "model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))\n",
    "model.add(Dense(1, activation='sigmoid'))\n",
    "\n",
    "# Compile the model\n",
    "model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\n",
    "\n",
    "# Define early stopping and learning rate reduction callbacks\n",
    "early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='min')\n",
    "reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1, mode='min')\n",
    "\n",
    "# Train the model\n",
    "history = model.fit(\n",
    "    x_train, y_train,\n",
    "    batch_size=128,\n",
    "    epochs=15,\n",
    "    validation_data=(x_test, y_test),\n",
    "    callbacks=[early_stopping, reduce_lr]\n",
    ")\n",
    "\n",
    "# Save the model and the tokenizer in pickle format\n",
    "pickle.dump(model, open('model.pkl', 'wb'))\n",
    "pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))\n",
    "\n",
    "# Evaluate the model on the test set\n",
    "score, acc = model.evaluate(x_test, y_test, batch_size=128)\n",
    "print('Test score:', score)\n",
    "print('Test accuracy:', acc)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7885e840",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "43521b94",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "782/782 [==============================] - 11s 14ms/step\n",
      "Saved results to CSV file.\n"
     ]
    }
   ],
   "source": [
    "# Get predicted probabilities on the test set\n",
    "y_pred_prob = model.predict(x_test)\n",
    "\n",
    "# Convert probabilities to classes by thresholding the single sigmoid\n",
    "# output at 0.5 (np.argmax over a one-column array would always return 0)\n",
    "y_pred = (y_pred_prob > 0.5).astype(int).flatten()\n",
    "\n",
    "# Convert the integer labels to sentiment strings\n",
    "sentiments = ['negative', 'positive']\n",
    "y_test_str = np.array([sentiments[label] for label in y_test])\n",
    "y_pred_str = np.array([sentiments[label] for label in y_pred])\n",
    "\n",
    "# Store the results in a CSV file\n",
    "results = pd.DataFrame({'Review': x_test_str, 'Actual Sentiment': y_test_str, 'Predicted Sentiment': y_pred_str})\n",
    "results.to_csv('imdb_sentiments.csv', index=False)\n",
    "\n",
    "print('Saved results to CSV file.')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92a7b5ad",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
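One caveat on the pickle step in the notebook above: pickling a compiled Keras model is version-sensitive and may fail to round-trip across Keras/TensorFlow releases. A hedged alternative sketch using Keras's own save/load API, assuming the `model` object from the notebook is still in scope (the model.h5 filename is illustrative, not part of this PR):

```python
# Alternative to pickle.dump(model, ...): Keras's native persistence API,
# which stores architecture, weights, and optimizer state together.
from tensorflow.keras.models import load_model

model.save('model.h5')             # 'model.h5' is an assumed filename
restored = load_model('model.h5')  # ready for predict() / evaluate()
restored.summary()
```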

Sentiment Analyser/requirements.txt

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
1+
absl-py==1.4.0
2+
asttokens==2.2.1
3+
astunparse==1.6.3
4+
backcall==0.2.0
5+
cachetools==5.3.0
6+
certifi==2022.12.7
7+
charset-normalizer==3.1.0
8+
colorama==0.4.6
9+
comm==0.1.2
10+
debugpy==1.6.6
11+
decorator==5.1.1
12+
executing==1.2.0
13+
flatbuffers==23.3.3
14+
gast==0.4.0
15+
gensim==4.3.1
16+
google-auth==2.16.2
17+
google-auth-oauthlib==0.4.6
18+
google-pasta==0.2.0
19+
grpcio==1.51.3
20+
h5py==3.8.0
21+
idna==3.4
22+
ipykernel==6.21.3
23+
ipython==8.11.0
24+
jax==0.4.6
25+
jedi==0.18.2
26+
jupyter_client==8.0.3
27+
jupyter_core==5.3.0
28+
keras==2.12.0
29+
libclang==15.0.6.1
30+
Markdown==3.4.1
31+
MarkupSafe==2.1.2
32+
matplotlib-inline==0.1.6
33+
nest-asyncio==1.5.6
34+
numpy==1.23.5
35+
oauthlib==3.2.2
36+
opt-einsum==3.3.0
37+
packaging==23.0
38+
pandas==1.5.3
39+
parso==0.8.3
40+
pickleshare==0.7.5
41+
platformdirs==3.1.1
42+
prompt-toolkit==3.0.38
43+
protobuf==4.22.1
44+
psutil==5.9.4
45+
pure-eval==0.2.2
46+
pyasn1==0.4.8
47+
pyasn1-modules==0.2.8
48+
Pygments==2.14.0
49+
python-dateutil==2.8.2
50+
pytz==2022.7.1
51+
pywin32==305
52+
pyzmq==25.0.1
53+
requests==2.28.2
54+
requests-oauthlib==1.3.1
55+
rsa==4.9
56+
scipy==1.10.1
57+
six==1.16.0
58+
smart-open==6.3.0
59+
stack-data==0.6.2
60+
tensorboard==2.12.0
61+
tensorboard-data-server==0.7.0
62+
tensorboard-plugin-wit==1.8.1
63+
tensorflow==2.12.0rc1
64+
tensorflow-estimator==2.12.0
65+
tensorflow-intel==2.12.0rc1
66+
tensorflow-io-gcs-filesystem==0.31.0
67+
termcolor==2.2.0
68+
tornado==6.2
69+
traitlets==5.9.0
70+
typing_extensions==4.5.0
71+
urllib3==1.26.15
72+
wcwidth==0.2.6
73+
Werkzeug==2.2.3
74+
wincertstore==0.2
75+
wrapt==1.14.1

Sentiment Analyser/word2vec.ipynb

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ex-oKOQ95wAu",
    "outputId": "0eef58ac-1fb5-4879-df27-bc9378bef581"
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Y0NOzxRF5zzT"
   },
   "outputs": [],
   "source": [
    "from gensim.models import Word2Vec\n",
    "from keras.datasets import imdb\n",
    "\n",
    "# Load the IMDB dataset\n",
    "(x_train, _), (x_test, _) = imdb.load_data(num_words=10000)\n",
    "\n",
    "# Convert the sequences of word indexes to lists of words.\n",
    "# Keras's IMDB indices are offset by 3 (0 = pad, 1 = start, 2 = unknown),\n",
    "# so the raw word ranks must be shifted to line up with the loaded data.\n",
    "word_index = imdb.get_word_index()\n",
    "index_to_word = {i + 3: word for word, i in word_index.items()}\n",
    "index_to_word[0] = '<PAD>'\n",
    "index_to_word[1] = '<START>'\n",
    "index_to_word[2] = '<UNK>'\n",
    "x_train = [[index_to_word.get(i, '') for i in seq] for seq in x_train]\n",
    "x_test = [[index_to_word.get(i, '') for i in seq] for seq in x_test]\n",
    "\n",
    "# Train the Word2Vec model\n",
    "w2v_model = Word2Vec(sentences=x_train + x_test, vector_size=100, window=5, min_count=1, workers=4, epochs=10)\n",
    "\n",
    "# Save the trained model to a file\n",
    "w2v_model.save('w2v_model.bin')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
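After word2vec.ipynb finishes, a quick sanity check of the saved embeddings might look like the following minimal sketch (the query word 'great' is an arbitrary choice, not part of this PR):

```python
# Sanity-check sketch for the trained embeddings: print the nearest
# neighbours of a common review word. 'great' is an arbitrary query;
# with min_count=1 it is virtually guaranteed to be in the vocabulary.
from gensim.models import Word2Vec

w2v_model = Word2Vec.load('w2v_model.bin')
for word, similarity in w2v_model.wv.most_similar('great', topn=5):
    print(f"{word}\t{similarity:.3f}")
```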
