
Commit ec01786

Merge pull request #2900 from smty2018/lstm
Spam Email Classification using LSTM
2 parents 7ab207e + 7187817 commit ec01786

File tree

3 files changed: +101858 lines, -0 lines
Lines changed: 240 additions & 0 deletions
New file (Jupyter notebook, Python 3 / Colab):

Cell 1: imports

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences

Cell 2: load the dataset and encode the labels

    # Map the string labels to integers: ham -> 0, spam -> 1
    data = pd.read_csv('spam_ham_dataset.csv')
    X = data['text']
    y = data['label'].map({'ham': 0, 'spam': 1})

Cell 3: train/test split

    # Hold out 20% of the emails for testing, with a fixed seed for reproducibility
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

Cell 4: tokenize and pad

    # Fit the tokenizer on the training texts only, then convert both splits
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_tr)
    X_tr_seq = tokenizer.texts_to_sequences(X_tr)
    X_te_seq = tokenizer.texts_to_sequences(X_te)

    # Pad/truncate every sequence to a fixed length of 100 tokens
    max_seq_len = 100
    X_tr_pad = pad_sequences(X_tr_seq, maxlen=max_seq_len, padding='post')
    X_te_pad = pad_sequences(X_te_seq, maxlen=max_seq_len, padding='post')

Cell 5: build, compile, and train the model

    vocab_size = len(tokenizer.word_index) + 1  # +1 for the reserved padding index 0
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_len))
    model.add(LSTM(64, return_sequences=True))  # pass the full sequence to the next LSTM
    model.add(Dropout(0.5))
    model.add(LSTM(64))                         # collapse the sequence into one vector
    model.add(Dense(1, activation='sigmoid'))   # probability of spam
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_tr_pad, y_tr, epochs=10, batch_size=32, validation_split=0.2)

Output:

    Epoch 1/10
    104/104 [==============================] - 35s 297ms/step - loss: 0.5534 - accuracy: 0.7482 - val_loss: 0.5267 - val_accuracy: 0.7705
    Epoch 2/10
    104/104 [==============================] - 28s 272ms/step - loss: 0.5402 - accuracy: 0.7582 - val_loss: 0.5110 - val_accuracy: 0.7742
    Epoch 3/10
    104/104 [==============================] - 29s 277ms/step - loss: 0.4195 - accuracy: 0.8210 - val_loss: 0.2600 - val_accuracy: 0.9118
    Epoch 4/10
    104/104 [==============================] - 28s 271ms/step - loss: 0.2672 - accuracy: 0.8951 - val_loss: 0.1177 - val_accuracy: 0.9614
    Epoch 5/10
    104/104 [==============================] - 28s 268ms/step - loss: 0.0643 - accuracy: 0.9797 - val_loss: 0.1393 - val_accuracy: 0.9650
    Epoch 6/10
    104/104 [==============================] - 28s 268ms/step - loss: 0.0444 - accuracy: 0.9879 - val_loss: 0.1399 - val_accuracy: 0.9710
    Epoch 7/10
    104/104 [==============================] - 29s 280ms/step - loss: 0.0451 - accuracy: 0.9912 - val_loss: 0.1501 - val_accuracy: 0.9674
    Epoch 8/10
    104/104 [==============================] - 28s 268ms/step - loss: 0.0311 - accuracy: 0.9946 - val_loss: 0.1582 - val_accuracy: 0.9686
    Epoch 9/10
    104/104 [==============================] - 29s 279ms/step - loss: 0.0275 - accuracy: 0.9952 - val_loss: 0.1492 - val_accuracy: 0.9710
    Epoch 10/10
    104/104 [==============================] - 29s 283ms/step - loss: 0.0249 - accuracy: 0.9955 - val_loss: 0.1553 - val_accuracy: 0.9698

Cell 6: model summary

    model.summary()

Output:

    Model: "sequential_2"
    _________________________________________________________________
     Layer (type)                Output Shape              Param #
    =================================================================
     embedding_2 (Embedding)     (None, 100, 128)          6625664
     lstm_4 (LSTM)               (None, 100, 64)           49408
     dropout_2 (Dropout)         (None, 100, 64)           0
     lstm_5 (LSTM)               (None, 64)                33024
     dense_2 (Dense)             (None, 1)                 65
    =================================================================
    Total params: 6,708,161
    Trainable params: 6,708,161
    Non-trainable params: 0
    _________________________________________________________________

Cell 7: evaluate on the held-out test set

    loss, acc = model.evaluate(X_te_pad, y_te)
    print(f"Test Loss: {loss:.4f}")
    print(f"Test Accuracy: {acc:.4f}")

Output:

    33/33 [==============================] - 1s 42ms/step - loss: 0.1547 - accuracy: 0.9700
    Test Loss: 0.1547
    Test Accuracy: 0.9700

Cell 8: classify a user-supplied email

    email_text = input("Enter an email text: ")

    sequence = tokenizer.texts_to_sequences([email_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_seq_len, padding='post')
    prediction = model.predict(padded_sequence)

    # predict() returns a (1, 1) array of spam probabilities
    if prediction[0][0] > 0.5:
        print("Prediction: Spam")
    else:
        print("Prediction: Ham")

Output:

    Enter an email text: you won 1 mill
    1/1 [==============================] - 1s 855ms/step
    Prediction: Spam
Lines changed: 24 additions & 0 deletions
Description:
The primary goal of this project is to develop a machine learning model that automatically classifies emails as either spam or legitimate (ham) based on their content. An LSTM (Long Short-Term Memory) model, a recurrent neural network (RNN) architecture, is employed to capture sequential patterns in the text, which makes it well suited to natural language processing tasks like this one.

Dataset

The project uses a labeled dataset of emails containing both spam and ham samples. The text is tokenized and padded before being fed to the LSTM model. Replace `'spam_ham_dataset.csv'` in the code with your actual dataset path; a minimal loading sketch is shown below.
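
The sketch assumes, as in the notebook, that the CSV provides a `text` column and a `label` column holding the string values 'ham' and 'spam':

    import pandas as pd
    from sklearn.model_selection import train_test_split

    data = pd.read_csv('spam_ham_dataset.csv')    # replace with your dataset path
    X = data['text']
    y = data['label'].map({'ham': 0, 'spam': 1})  # ham -> 0, spam -> 1

    # 80/20 train/test split with a fixed seed for reproducibility
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)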
Model Architecture

The LSTM model consists of the following components (a condensed sketch follows the list):

- `Embedding` layer: converts the integer-encoded vocabulary into dense vectors of a fixed size.
- First `LSTM` layer: captures sequential patterns, returning the full sequence rather than a single output.
- `Dropout` layer: helps prevent overfitting by randomly deactivating a fraction of input units during training.
- Second `LSTM` layer: aggregates the output of the previous LSTM layer into a single vector.
- `Dense` layer: a single output unit with sigmoid activation for binary classification.

The model is trained using binary cross-entropy loss and the Adam optimizer.
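
The sketch below condenses the notebook's architecture; it assumes `tokenizer` is the Tokenizer already fitted on the training texts:

    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

    max_seq_len = 100
    vocab_size = len(tokenizer.word_index) + 1  # +1 for the reserved padding index 0

    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_len),
        LSTM(64, return_sequences=True),  # pass the full sequence to the next LSTM
        Dropout(0.5),
        LSTM(64),                         # collapse the sequence into one vector
        Dense(1, activation='sigmoid'),   # outputs the probability of spam
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])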

Install required packages (scikit-learn is needed for the train/test split):

    pip install numpy pandas scikit-learn tensorflow
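
After training, a single email can be classified as follows (a minimal sketch based on the notebook's final cell; `model` and `tokenizer` are the trained model and the fitted tokenizer):

    from tensorflow.keras.preprocessing.sequence import pad_sequences

    email_text = "you won 1 mill"  # example input from the notebook
    sequence = tokenizer.texts_to_sequences([email_text])
    padded = pad_sequences(sequence, maxlen=100, padding='post')  # same length as in training
    prob = model.predict(padded)[0][0]  # sigmoid output: probability of spam
    print("Prediction: Spam" if prob > 0.5 else "Prediction: Ham")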