Merge pull request #1585 from okaditya84/master

avinashkranjan · web-flow · commit 18eb082edabb · 2023-06-09T13:52:56.000+05:30
Added sentiment analyzer program
diff --git a/Sentiment Analyser/README.md b/Sentiment Analyser/README.md
@@ -0,0 +1,28 @@
+# Sentiment Analyser
+
+This is a sentiment analyser that takes in a sentence and returns its sentiment.
+It rates the emotion present in the sentence on a scale from 0 to 1.
+It uses NLP and ML algorithms to do so.
+The word embeddings are trained with Word2Vec on the IMDB movie reviews dataset.
+The model that is tested on top of them is an RNN (LSTM).
+
+
+
+## Setup instructions
+- Install Python 3.8 to 3.11 (the versions pinned in `requirements.txt`, e.g. TensorFlow 2.12, do not support older releases)
+- Install the required packages using the following command
+```bash
+pip install -r requirements.txt
+```
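+- First, execute the `word2vec.ipynb` notebook to train the Word2Vec model (it is saved as `w2v_model.bin`).
+- Then, execute the `RNN(w2v).ipynb` notebook to train and test the RNN model.
+- Change the number of epochs and the batch size based on your system configuration and the accuracy you want.
+- The notebook then creates a CSV file (`imdb_sentiments.csv`) with the actual and predicted sentiment of each test review.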
+
+## Output
+Check the screenshot in this folder for the output.
+
+## Author(s)
+
+- [Aditya Jethani](https://github.com/okaditya84)
+
diff --git a/Sentiment Analyser/RNN(w2v).ipynb b/Sentiment Analyser/RNN(w2v).ipynb
@@ -0,0 +1,197 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e5997f58",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1/15\n",
+      "196/196 [==============================] - 59s 272ms/step - loss: 0.6775 - acc: 0.5654 - val_loss: 0.6538 - val_acc: 0.6173 - lr: 0.0010\n",
+      "Epoch 2/15\n",
+      "196/196 [==============================] - 52s 265ms/step - loss: 0.6585 - acc: 0.6066 - val_loss: 0.6501 - val_acc: 0.6184 - lr: 0.0010\n",
+      "Epoch 3/15\n",
+      "196/196 [==============================] - 55s 280ms/step - loss: 0.6488 - acc: 0.6216 - val_loss: 0.6378 - val_acc: 0.6365 - lr: 0.0010\n",
+      "Epoch 4/15\n",
+      "196/196 [==============================] - 58s 294ms/step - loss: 0.6427 - acc: 0.6304 - val_loss: 0.6505 - val_acc: 0.6307 - lr: 0.0010\n",
+      "Epoch 5/15\n",
+      "196/196 [==============================] - ETA: 0s - loss: 0.6337 - acc: 0.6417\n",
+      "Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.\n",
+      "196/196 [==============================] - 57s 291ms/step - loss: 0.6337 - acc: 0.6417 - val_loss: 0.6780 - val_acc: 0.6249 - lr: 0.0010\n",
+      "Epoch 6/15\n",
+      "196/196 [==============================] - 55s 280ms/step - loss: 0.6225 - acc: 0.6509 - val_loss: 0.6244 - val_acc: 0.6507 - lr: 1.0000e-04\n",
+      "Epoch 7/15\n",
+      "196/196 [==============================] - 55s 280ms/step - loss: 0.6213 - acc: 0.6510 - val_loss: 0.6274 - val_acc: 0.6496 - lr: 1.0000e-04\n",
+      "Epoch 8/15\n",
+      "196/196 [==============================] - 55s 281ms/step - loss: 0.6181 - acc: 0.6549 - val_loss: 0.6220 - val_acc: 0.6522 - lr: 1.0000e-04\n",
+      "Epoch 9/15\n",
+      "196/196 [==============================] - 56s 285ms/step - loss: 0.6180 - acc: 0.6551 - val_loss: 0.6195 - val_acc: 0.6536 - lr: 1.0000e-04\n",
+      "Epoch 10/15\n",
+      "196/196 [==============================] - 56s 284ms/step - loss: 0.6165 - acc: 0.6585 - val_loss: 0.6242 - val_acc: 0.6512 - lr: 1.0000e-04\n",
+      "Epoch 11/15\n",
+      "196/196 [==============================] - 56s 287ms/step - loss: 0.6176 - acc: 0.6549 - val_loss: 0.6187 - val_acc: 0.6538 - lr: 1.0000e-04\n",
+      "Epoch 12/15\n",
+      "196/196 [==============================] - 56s 285ms/step - loss: 0.6148 - acc: 0.6585 - val_loss: 0.6178 - val_acc: 0.6574 - lr: 1.0000e-04\n",
+      "Epoch 13/15\n",
+      "196/196 [==============================] - 56s 287ms/step - loss: 0.6160 - acc: 0.6599 - val_loss: 0.6160 - val_acc: 0.6574 - lr: 1.0000e-04\n",
+      "Epoch 14/15\n",
+      "196/196 [==============================] - 54s 275ms/step - loss: 0.6142 - acc: 0.6572 - val_loss: 0.6156 - val_acc: 0.6571 - lr: 1.0000e-04\n",
+      "Epoch 15/15\n",
+      "196/196 [==============================] - 54s 274ms/step - loss: 0.6136 - acc: 0.6595 - val_loss: 0.6162 - val_acc: 0.6576 - lr: 1.0000e-04\n",
+      "196/196 [==============================] - 7s 35ms/step - loss: 0.6162 - acc: 0.6576\n",
+      "Test score: 0.6162243485450745\n",
+      "Test accuracy: 0.6576399803161621\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Train and evaluate an LSTM sentiment classifier on IMDB reviews, using the\n",
+    "# frozen Word2Vec vectors stored in 'w2v_model.bin' as the embedding layer.\n",
+    "# Writes 'model.pkl' and 'tokenizer.pkl' and prints the test loss/accuracy.\n",
+    "from keras.datasets import imdb\n",
+    "from keras.models import Sequential\n",
+    "from keras.layers import LSTM, Dense, Embedding, Dropout\n",
+    "from keras.callbacks import EarlyStopping, ReduceLROnPlateau\n",
+    "from keras.preprocessing.text import Tokenizer\n",
+    "from gensim.models import Word2Vec\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
+    "import pickle\n",
+    "\n",
+    "# Load the IMDB dataset and split it into training and test sets.\n",
+    "# Each review arrives as a list of integer word ids, not as text.\n",
+    "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)\n",
+    "\n",
+    "# Decode the ids back into words before tokenizing. Calling str() on the id\n",
+    "# lists would make the tokenizer learn the id numerals as its vocabulary, and\n",
+    "# the Word2Vec lookup below (which is keyed by words) would then miss.\n",
+    "# imdb.load_data() reserves ids 0-2 (padding, start, unknown) and shifts every\n",
+    "# id from imdb.get_word_index() by index_from=3, so the same offset is applied.\n",
+    "index_from = 3\n",
+    "index_to_word = {i + index_from: word for word, i in imdb.get_word_index().items()}\n",
+    "\n",
+    "\n",
+    "def decode_review(ids):\n",
+    "    # Reserved marker ids have no entry in index_to_word and are dropped.\n",
+    "    return ' '.join(index_to_word[i] for i in ids if i in index_to_word)\n",
+    "\n",
+    "\n",
+    "# Tokenize the text and convert it to sequences\n",
+    "tokenizer = Tokenizer(num_words=10000)\n",
+    "x_train_str = [decode_review(ids) for ids in x_train]\n",
+    "tokenizer.fit_on_texts(x_train_str)\n",
+    "x_train = tokenizer.texts_to_sequences(x_train_str)\n",
+    "x_test_str = [decode_review(ids) for ids in x_test]\n",
+    "x_test = tokenizer.texts_to_sequences(x_test_str)\n",
+    "\n",
+    "# Pad the sequences to a fixed length\n",
+    "maxlen = 100\n",
+    "x_train = pad_sequences(x_train, maxlen=maxlen)\n",
+    "x_test = pad_sequences(x_test, maxlen=maxlen)\n",
+    "\n",
+    "# Load pre-trained Word2Vec model\n",
+    "w2v_model = Word2Vec.load('w2v_model.bin')\n",
+    "\n",
+    "# Create embedding matrix: row i holds the Word2Vec vector of the word with\n",
+    "# tokenizer index i; words missing from the Word2Vec vocabulary stay all-zero.\n",
+    "embedding_dim = w2v_model.wv.vector_size\n",
+    "word_index = tokenizer.word_index\n",
+    "embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))\n",
+    "for word, i in word_index.items():\n",
+    "    if word in w2v_model.wv.key_to_index:\n",
+    "        embedding_matrix[i] = w2v_model.wv[word]\n",
+    "\n",
+    "# Define the model architecture\n",
+    "model = Sequential()\n",
+    "model.add(Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False))\n",
+    "model.add(Dropout(0.2))\n",
+    "model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))\n",
+    "model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))\n",
+    "model.add(Dense(1, activation='sigmoid'))\n",
+    "\n",
+    "# Compile the model\n",
+    "model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\n",
+    "\n",
+    "# Define early stopping and learning rate reduction callbacks\n",
+    "early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='min')\n",
+    "reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1, mode='min')\n",
+    "\n",
+    "# Train the model\n",
+    "# NOTE(review): the test set doubles as validation data, so early stopping and\n",
+    "# the learning-rate schedule are tuned on it and the test accuracy printed\n",
+    "# below is optimistic. Consider holding out part of the training set instead.\n",
+    "history = model.fit(\n",
+    "    x_train, y_train,\n",
+    "    batch_size=128,\n",
+    "    epochs=15,\n",
+    "    validation_data=(x_test, y_test),\n",
+    "    callbacks=[early_stopping, reduce_lr]\n",
+    ")\n",
+    "\n",
+    "# Save the model and the tokenizer in pickle format; the with-blocks make\n",
+    "# sure both files are flushed and closed.\n",
+    "with open('model.pkl', 'wb') as model_file:\n",
+    "    pickle.dump(model, model_file)\n",
+    "with open('tokenizer.pkl', 'wb') as tokenizer_file:\n",
+    "    pickle.dump(tokenizer, tokenizer_file)\n",
+    "\n",
+    "# Evaluate the model on the test set\n",
+    "score, acc = model.evaluate(x_test, y_test, batch_size=128)\n",
+    "print('Test score:', score)\n",
+    "print('Test accuracy:', acc)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7885e840",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "43521b94",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "782/782 [==============================] - 11s 14ms/step\n",
+      "Saved results to CSV file.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Predict the sentiment of every test review and save the actual and\n",
+    "# predicted labels to 'imdb_sentiments.csv'.\n",
+    "\n",
+    "# Get predicted probabilities on the test set.\n",
+    "# The model ends in Dense(1, activation='sigmoid'), so there is one column\n",
+    "# per review holding the probability of the positive class.\n",
+    "y_pred_prob = model.predict(x_test)\n",
+    "\n",
+    "# Convert probabilities to class labels (0 = negative, 1 = positive).\n",
+    "# np.argmax(..., axis=1) over a single column is always 0, which would label\n",
+    "# every review as negative, so threshold the probability at 0.5 instead.\n",
+    "y_pred = (y_pred_prob.ravel() > 0.5).astype(int)\n",
+    "\n",
+    "# Convert the integer labels to sentiment strings\n",
+    "sentiments = ['negative', 'positive']\n",
+    "y_test_str = np.array([sentiments[label] for label in y_test])\n",
+    "y_pred_str = np.array([sentiments[label] for label in y_pred])\n",
+    "\n",
+    "# Store the results in a CSV file\n",
+    "results = pd.DataFrame({'Review': x_test_str, 'Actual Sentiment': y_test_str, 'Predicted Sentiment': y_pred_str})\n",
+    "results.to_csv('imdb_sentiments.csv', index=False)\n",
+    "\n",
+    "print('Saved results to CSV file.')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "92a7b5ad",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Sentiment Analyser/Screenshot 2023-05-30 123233.png b/Sentiment Analyser/Screenshot 2023-05-30 123233.png
diff --git a/Sentiment Analyser/requirements.txt b/Sentiment Analyser/requirements.txt
@@ -0,0 +1,75 @@
+absl-py==1.4.0
+asttokens==2.2.1
+astunparse==1.6.3
+backcall==0.2.0
+cachetools==5.3.0
+certifi==2022.12.7
+charset-normalizer==3.1.0
+colorama==0.4.6
+comm==0.1.2
+debugpy==1.6.6
+decorator==5.1.1
+executing==1.2.0
+flatbuffers==23.3.3
+gast==0.4.0
+gensim==4.3.1
+google-auth==2.16.2
+google-auth-oauthlib==0.4.6
+google-pasta==0.2.0
+grpcio==1.51.3
+h5py==3.8.0
+idna==3.4
+ipykernel==6.21.3
+ipython==8.11.0
+jax==0.4.6
+jedi==0.18.2
+jupyter_client==8.0.3
+jupyter_core==5.3.0
+keras==2.12.0
+libclang==15.0.6.1
+Markdown==3.4.1
+MarkupSafe==2.1.2
+matplotlib-inline==0.1.6
+nest-asyncio==1.5.6
+numpy==1.23.5
+oauthlib==3.2.2
+opt-einsum==3.3.0
+packaging==23.0
+pandas==1.5.3
+parso==0.8.3
+pickleshare==0.7.5
+platformdirs==3.1.1
+prompt-toolkit==3.0.38
+protobuf==4.22.1
+psutil==5.9.4
+pure-eval==0.2.2
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+Pygments==2.14.0
+python-dateutil==2.8.2
+pytz==2022.7.1
+pywin32==305
+pyzmq==25.0.1
+requests==2.28.2
+requests-oauthlib==1.3.1
+rsa==4.9
+scipy==1.10.1
+six==1.16.0
+smart-open==6.3.0
+stack-data==0.6.2
+tensorboard==2.12.0
+tensorboard-data-server==0.7.0
+tensorboard-plugin-wit==1.8.1
+tensorflow==2.12.0rc1
+tensorflow-estimator==2.12.0
+tensorflow-intel==2.12.0rc1
+tensorflow-io-gcs-filesystem==0.31.0
+termcolor==2.2.0
+tornado==6.2
+traitlets==5.9.0
+typing_extensions==4.5.0
+urllib3==1.26.15
+wcwidth==0.2.6
+Werkzeug==2.2.3
+wincertstore==0.2
+wrapt==1.14.1
diff --git a/Sentiment Analyser/word2vec.ipynb b/Sentiment Analyser/word2vec.ipynb
@@ -0,0 +1,85 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "ex-oKOQ95wAu",
+    "outputId": "0eef58ac-1fb5-4879-df27-bc9378bef581"
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Y0NOzxRF5zzT"
+   },
+   "outputs": [],
+   "source": [
+    "# Train Word2Vec embeddings on the IMDB reviews and save them to\n",
+    "# 'w2v_model.bin'.\n",
+    "from gensim.models import Word2Vec\n",
+    "from keras.datasets import imdb\n",
+    "\n",
+    "# Load the IMDB dataset (each review is a list of integer word ids)\n",
+    "(x_train, _), (x_test, _) = imdb.load_data(num_words=10000)\n",
+    "\n",
+    "# Convert the sequences of word indexes to lists of words.\n",
+    "# imdb.load_data() reserves ids 0-2 (padding, start, unknown) and shifts every\n",
+    "# id from imdb.get_word_index() by index_from=3. Without the same offset here\n",
+    "# every id decodes to the wrong word and the vectors get the wrong labels.\n",
+    "index_from = 3\n",
+    "word_index = imdb.get_word_index()\n",
+    "index_to_word = {i + index_from: word for word, i in word_index.items()}\n",
+    "index_to_word[0] = '<PAD>'\n",
+    "index_to_word[1] = '<START>'\n",
+    "index_to_word[2] = '<UNK>'\n",
+    "x_train = [[index_to_word.get(i, '') for i in seq] for seq in x_train]\n",
+    "x_test = [[index_to_word.get(i, '') for i in seq] for seq in x_test]\n",
+    "\n",
+    "# Train the Word2Vec model on the train and test reviews together\n",
+    "w2v_model = Word2Vec(sentences=x_train + x_test, vector_size=100, window=5, min_count=1, workers=4, epochs=10)\n",
+    "\n",
+    "# Save the trained model to a file\n",
+    "w2v_model.save('w2v_model.bin')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}