diff --git a/Natural Language Processing/Autocorrect/Autocorrect.ipynb b/Natural Language Processing/Autocorrect/Autocorrect.ipynb
new file mode 100644
index 000000000..dbd34b341
--- /dev/null
+++ b/Natural Language Processing/Autocorrect/Autocorrect.ipynb
@@ -0,0 +1,572 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6bbabd44-4865-477e-bd6e-08b1d5a83cf0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "import tensorflow.keras as keras\n",
+    "from tensorflow.keras.models import load_model\n",
+    "import random\n",
+    "import string\n",
+    "import pandas as pd\n",
+    "from tqdm import tqdm\n",
+    "import pickle\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib.ticker as ticker\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import unicodedata\n",
+    "import re\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import io\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "300c205d-e928-4fd3-ab28-088b0ac6433d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download and extract the spa-eng dataset\n",
+    "path_to_zip = tf.keras.utils.get_file(\n",
+    "    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',\n",
+    "    extract=True)\n",
+    "\n",
+    "path_to_file = os.path.dirname(path_to_zip)+\"/spa-eng/spa.txt\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "bc0c774c-e2b8-4752-9afe-002a5a8be54c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "may i borrow this book\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Converts the unicode file to ascii\n",
+    "def unicode_to_ascii(s):\n",
+    "    return ''.join(c for c in unicodedata.normalize('NFD', s)\n",
+    "                   if unicodedata.category(c) != 'Mn')\n",
+    "\n",
+    "\n",
+    "def preprocess_sentence(w):\n",
+    "    w = unicode_to_ascii(w.lower().strip())\n",
+    "\n",
+    "    # Add space between punctuation and words\n",
+    "    w = re.sub(r\"([?.!,¿])\", r\" \\\\1 \", w)\n",
+    "    # Collapse runs of spaces into a single space\n",
+    "    w = re.sub(r'[\" \"]+', \" \", w)\n",
+    "\n",
+    "    # Remove non-alphabetical characters\n",
+    "    w = re.sub(r\"[^a-zA-Z ]+\", \"\", w)\n",
+    "\n",
+    "    w = w.strip()\n",
+    "\n",
+    "    # Start and end tokens are added later, in noise()\n",
+    "    return w\n",
+    "\n",
+    "\n",
+    "en_sentence = u\"May I borrow this book?\"\n",
+    "print(preprocess_sentence(en_sentence))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c44800a6-b476-4807-a35e-fdacab0f11af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def noise(data):\n",
+    "    input_texts = []\n",
+    "    target_texts = []\n",
+    "    for line in data:\n",
+    "        input_text = line.lower()\n",
+    "        input_text = re.sub(r'[^a-zA-Z ]+', '', input_text)\n",
+    "        # Targets are wrapped in start ('\\\\t') and end ('\\\\n') tokens\n",
+    "        target_text = \"\\\\t\" + input_text + \"\\\\n\"\n",
+    "        # Keep one clean copy of the sentence...\n",
+    "        input_texts.append(input_text)\n",
+    "        target_texts.append(target_text)\n",
+    "        inp = input_text\n",
+    "        # ...and add two noisy copies; each has a 90% chance of replacing\n",
+    "        # every occurrence of one random character with a random letter\n",
+    "        for _ in range(2):\n",
+    "            input_text = inp\n",
+    "            for i in range(np.random.choice(np.arange(0, 2), p=[0.1, 0.9])):\n",
+    "                input_text = input_text.replace(random.choice(list(input_text)), random.choice(string.ascii_letters))\n",
+    "\n",
+    "            input_texts.append(input_text.lower())\n",
+    "            target_texts.append(target_text)\n",
+    "\n",
+    "    return input_texts, target_texts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "daf99c05-0fd0-4da0-a801-ff954896ea23",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_dataset(path, num_examples):\n",
+    "    lines = io.open(path, encoding='UTF-8').read().strip().split('\\\\n')\n",
+    "    # Keep only the English side of each tab-separated pair\n",
+    "    en = [preprocess_sentence(line.split('\\\\t')[0]) for line in lines[:num_examples]]\n",
+    "\n",
+    "    inp, targ = noise(en)\n",
+    "\n",
+    "    return inp, targ\n",
+    "\n",
+    "\n",
+    "def tokenize(lang):\n",
+    "    # Character-level tokenizer: each character gets its own index\n",
+    "    lang_tokenizer = keras.preprocessing.text.Tokenizer(filters='', char_level=True)\n",
+    "    lang_tokenizer.fit_on_texts(lang)\n",
+    "\n",
+    "    tensor = lang_tokenizer.texts_to_sequences(lang)\n",
+    "\n",
+    "    tensor = keras.preprocessing.sequence.pad_sequences(tensor, padding='post')\n",
+    "\n",
+    "    return tensor, lang_tokenizer\n",
+    "\n",
+    "\n",
+    "def load_dataset(path, num_examples=None):\n",
+    "    inp_lang, targ_lang = create_dataset(path, num_examples)\n",
+    "    print(inp_lang[55000])  # Sanity check: show one (possibly noisy) sample\n",
+    "    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)\n",
+    "    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)\n",
+    "\n",
+    "    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "4cd4438c-7454-4785-b7c3-4ee377aaa024",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "theell call\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Number of examples to train on\n",
+    "num_examples = 100000\n",
+    "input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)\n",
+    "\n",
+    "# Calculate max length of the tensors\n",
+    "max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "ad49d409-0685-4ca9-aad9-10d4a71a6b3a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "24000 24000 6000 6000\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create training and validation sets (80-20 split)\n",
+    "input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)\n",
+    "\n",
+    "# Show the length of each dataset\n",
+    "print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "63704928-b156-4171-a387-95ae32d1ccae",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Input Language; index to word mapping\n",
+      "13 ----> d\n",
+      "4 ----> o\n",
+      "1 ----> \n",
+      "18 ----> c\n",
+      "4 ----> o\n",
+      "11 ----> m\n",
+      "2 ----> e\n",
+      "1 ----> \n",
+      "6 ----> a\n",
+      "17 ----> g\n",
+      "6 ----> a\n",
+      "5 ----> i\n",
+      "9 ----> n\n",
+      "\n",
+      "Target Language; index to word mapping\n",
+      "3 ----> \t\n",
+      "15 ----> d\n",
+      "6 ----> o\n",
+      "1 ----> \n",
+      "20 ----> c\n",
+      "6 ----> o\n",
+      "13 ----> m\n",
+      "2 ----> e\n",
+      "1 ----> \n",
+      "8 ----> a\n",
+      "19 ----> g\n",
+      "8 ----> a\n",
+      "7 ----> i\n",
+      "11 ----> n\n",
+      "4 ----> \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "def convert(lang, tensor):\n",
+    "    for t in tensor:\n",
+    "        if t != 0:\n",
+    "            print(f'{t} ----> {lang.index_word[t]}')\n",
+    "\n",
+    "print(\"Input Language; index to word mapping\")\n",
+    "convert(inp_lang, input_tensor_train[5])\n",
+    "\n",
+    "print(\"\\\\nTarget Language; index to word mapping\")\n",
+    "convert(targ_lang, target_tensor_train[5])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "dcefafe8-6244-47a6-bb37-a168782e77d0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(64, 16) (64, 18)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Hyperparameters and data preparation\n",
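+    "# BATCH_SIZE is the number of sequences per training step, units is the\n",
+    "# GRU state size, and embedding_dim is the size of each character\n",
+    "# embedding; vocab sizes are offset by 1 because index 0 is padding.\n",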
"BUFFER_SIZE = len(input_tensor_train)\n", + "BATCH_SIZE = 64\n", + "steps_per_epoch = len(input_tensor_train) // BATCH_SIZE\n", + "embedding_dim = 256\n", + "units = 1024\n", + "vocab_inp_size = len(inp_lang.word_index) + 1\n", + "vocab_tar_size = len(targ_lang.word_index) + 1\n", + "\n", + "# Dataset pipeline\n", + "dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)\n", + "dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)\n", + "\n", + "example_input_batch, example_target_batch = next(iter(dataset))\n", + "print(example_input_batch.shape, example_target_batch.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "cfeed281-a0b2-46ee-b8b2-962eb85d6fda", + "metadata": {}, + "outputs": [], + "source": [ + "# Encoder model\n", + "class Encoder(tf.keras.Model):\n", + " def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):\n", + " super(Encoder, self).__init__()\n", + " self.batch_sz = batch_sz\n", + " self.enc_units = enc_units\n", + " self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n", + " self.gru = tf.keras.layers.GRU(self.enc_units,\n", + " return_sequences=True,\n", + " return_state=True,\n", + " recurrent_initializer='glorot_uniform')\n", + "\n", + " def call(self, x, hidden):\n", + " x = self.embedding(x)\n", + " output, state = self.gru(x, initial_state=hidden)\n", + " return output, state\n", + "\n", + " def initialize_hidden_state(self):\n", + " return tf.zeros((self.batch_sz, self.enc_units))\n", + "\n", + "encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6da15f50-f2c2-4147-9eac-54e69a95aba7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Encoder output shape: (batch size, sequence length, units) (64, 16, 1024)\n", + "Encoder Hidden state shape: (batch size, units) (64, 1024)\n" + ] + } + ], + "source": [ + "# Testing Encoder\n", + "sample_hidden = encoder.initialize_hidden_state()\n", + "sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)\n", + "print('Encoder output shape: (batch size, sequence length, units)', sample_output.shape)\n", + "print('Encoder Hidden state shape: (batch size, units)', sample_hidden.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "1588562a-73e7-4ac6-9575-6ee79e94faae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Attention result shape: (batch size, units) (64, 1024)\n", + "Attention weights shape: (batch size, sequence_length, 1) (64, 16, 1)\n" + ] + } + ], + "source": [ + "# Attention layer (Bahdanau Attention)\n", + "class BahdanauAttention(tf.keras.layers.Layer):\n", + " def __init__(self, units):\n", + " super(BahdanauAttention, self).__init__()\n", + " self.W1 = tf.keras.layers.Dense(units)\n", + " self.W2 = tf.keras.layers.Dense(units)\n", + " self.V = tf.keras.layers.Dense(1)\n", + "\n", + " def call(self, query, values):\n", + " query_with_time_axis = tf.expand_dims(query, 1)\n", + " score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))\n", + " attention_weights = tf.nn.softmax(score, axis=1)\n", + " context_vector = attention_weights * values\n", + " context_vector = tf.reduce_sum(context_vector, axis=1)\n", + "\n", + " return context_vector, attention_weights\n", + "\n", + "attention_layer = BahdanauAttention(10)\n", + "attention_result, attention_weights = 
attention_layer(sample_hidden, sample_output)\n", + "\n", + "print(\"Attention result shape: (batch size, units)\", attention_result.shape)\n", + "print(\"Attention weights shape: (batch size, sequence_length, 1)\", attention_weights.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "fd1218f6-3dca-4ee6-8a96-de7244f002f7", + "metadata": {}, + "outputs": [], + "source": [ + "# Decoder model\n", + "class Decoder(tf.keras.Model):\n", + " def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):\n", + " super(Decoder, self).__init__()\n", + " self.batch_sz = batch_sz\n", + " self.dec_units = dec_units\n", + " self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n", + " self.gru = tf.keras.layers.GRU(self.dec_units,\n", + " return_sequences=True,\n", + " return_state=True,\n", + " recurrent_initializer='glorot_uniform')\n", + " self.fc = tf.keras.layers.Dense(vocab_size)\n", + "\n", + " # Used for attention\n", + " self.attention = BahdanauAttention(self.dec_units)\n", + "\n", + " def call(self, x, hidden, enc_output):\n", + " context_vector, attention_weights = self.attention(hidden, enc_output)\n", + " x = self.embedding(x)\n", + " x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n", + " output, state = self.gru(x)\n", + " output = tf.reshape(output, (-1, output.shape[2]))\n", + " x = self.fc(output)\n", + "\n", + " return x, state, attention_weights\n", + "\n", + "decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "f0c25ed5-c3bb-4133-80a8-1331f2461864", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Decoder output shape: (batch_size, vocab size) (64, 30)\n" + ] + } + ], + "source": [ + "# Testing Decoder\n", + "sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)\n", + "print('Decoder output shape: (batch_size, vocab size)', sample_decoder_output.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "ded85b52-68fa-4591-a26b-fb6eda4c764c", + "metadata": {}, + "outputs": [], + "source": [ + "# Optimizer and loss function\n", + "optimizer = tf.keras.optimizers.Adam()\n", + "loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')\n", + "\n", + "# Loss function\n", + "def loss_function(real, pred):\n", + " mask = tf.math.logical_not(tf.math.equal(real, 0))\n", + " loss_ = loss_object(real, pred)\n", + " mask = tf.cast(mask, dtype=loss_.dtype)\n", + " loss_ *= mask\n", + " return tf.reduce_mean(loss_)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "6e55f472-68c5-4afe-b1b3-4218af4d4226", + "metadata": {}, + "outputs": [], + "source": [ + "# Training step\n", + "@tf.function\n", + "def train_step(inp, targ, enc_hidden):\n", + " loss = 0\n", + " with tf.GradientTape() as tape:\n", + " enc_output, enc_hidden = encoder(inp, enc_hidden)\n", + " dec_hidden = enc_hidden\n", + " dec_input = tf.expand_dims([targ_lang.word_index['\\t']] * BATCH_SIZE, 1)\n", + "\n", + " # Teacher forcing\n", + " for t in range(1, targ.shape[1]):\n", + " predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)\n", + " loss += loss_function(targ[:, t], predictions)\n", + " dec_input = tf.expand_dims(targ[:, t], 1) # Teacher forcing\n", + "\n", + " batch_loss = (loss / int(targ.shape[1]))\n", + " variables = encoder.trainable_variables + decoder.trainable_variables\n", + " 
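+    "    # Backpropagate through decoder, attention, and encoder in one pass\n",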
+    "    gradients = tape.gradient(loss, variables)\n",
+    "    optimizer.apply_gradients(zip(gradients, variables))\n",
+    "\n",
+    "    return batch_loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "f1b5c0a3-63c5-41f3-a62c-9300a41ad82e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1 Batch 0 Loss 2.5512\n",
+      "Epoch 1 Batch 100 Loss 1.6371\n",
+      "Epoch 1 Batch 200 Loss 0.8117\n",
+      "Epoch 1 Batch 300 Loss 0.5870\n",
+      "Epoch 1 Loss 1.1844\n",
+      "Time taken for 1 epoch 758.07 sec\n",
+      "\n",
+      "Epoch 2 Batch 0 Loss 0.2411\n",
+      "Epoch 2 Batch 100 Loss 0.3191\n",
+      "Epoch 2 Batch 200 Loss 0.1944\n",
+      "Epoch 2 Batch 300 Loss 0.2445\n",
+      "Epoch 2 Loss 0.3206\n",
+      "Time taken for 1 epoch 727.31 sec\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Training loop\n",
+    "EPOCHS = 2\n",
+    "for epoch in range(EPOCHS):\n",
+    "    start = time.time()\n",
+    "    enc_hidden = encoder.initialize_hidden_state()\n",
+    "    total_loss = 0\n",
+    "\n",
+    "    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):\n",
+    "        batch_loss = train_step(inp, targ, enc_hidden)\n",
+    "        total_loss += batch_loss\n",
+    "\n",
+    "        if batch % 100 == 0:\n",
+    "            print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')\n",
+    "\n",
+    "    print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')\n",
+    "    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\\\\n')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "670efd87-642a-4701-8b40-bf56a23cbdeb",
+   "metadata": {},
+   "source": [
+    "### Note: The loss shown here is relatively high because the model was trained on only a fraction of the dataset, for just two epochs.\n",
+    "### Training on more of the data (on a system with more RAM) should produce a noticeably stronger model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "d399ab70-ba6e-4b05-a878-96afb62c002d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the models (Encoder and Decoder) using the new `.keras` format\n",
+    "encoder.save('encoder_model.keras')\n",
+    "decoder.save('decoder_model.keras')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Natural Language Processing/Autocorrect/README.md b/Natural Language Processing/Autocorrect/README.md
new file mode 100644
index 000000000..28877fb43
--- /dev/null
+++ b/Natural Language Processing/Autocorrect/README.md
@@ -0,0 +1,30 @@
+# Neural Machine Translation (NMT)-Based Autocorrect Model
+
+This project is an implementation of an autocorrect system using a sequence-to-sequence neural machine translation (NMT) model with attention. The model is trained to correct spelling mistakes in sentences by learning to translate noisy text into correct English sentences.
+
+## Introduction
+The autocorrect model is built using a sequence-to-sequence (Seq2Seq) approach with an encoder-decoder architecture. Bahdanau attention is incorporated to help the model focus on relevant parts of the input sentence during decoding. This model aims to correct noisy or misspelled sentences by "translating" them into their corrected versions.
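+
+As a quick illustration, inference can be done with greedy character-by-character decoding, mirroring the training setup. The sketch below is hypothetical (the function name `correct` and the sample sentence are illustrative, not part of the notebook) and assumes the notebook's `encoder`, `decoder`, `inp_lang`, `targ_lang`, `units`, `max_length_inp`, and `max_length_targ` are in scope:
+
+```python
+import tensorflow as tf
+
+def correct(sentence):
+    # Character-tokenize and pad to the training-time input length
+    inputs = inp_lang.texts_to_sequences([sentence.lower()])
+    inputs = tf.keras.preprocessing.sequence.pad_sequences(
+        inputs, maxlen=max_length_inp, padding='post')
+    inputs = tf.convert_to_tensor(inputs)
+
+    result = ''
+    hidden = [tf.zeros((1, units))]
+    enc_out, dec_hidden = encoder(inputs, hidden)
+
+    # Decoding starts from the '\t' start token
+    dec_input = tf.expand_dims([targ_lang.word_index['\t']], 0)
+
+    for _ in range(max_length_targ):
+        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_out)
+        predicted_id = tf.argmax(predictions[0]).numpy()
+        char = targ_lang.index_word.get(predicted_id, '')
+        if char == '\n':  # end token
+            break
+        result += char
+        # Feed the prediction back as the next decoder input
+        dec_input = tf.expand_dims([predicted_id], 0)
+
+    return result
+
+print(correct('thes is a tesd'))
+```
+
+Decoding here is greedy (argmax at every step), which is one reason the model can repeat words (see Known Issues below); beam search is a common mitigation.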
+
+## Model Architecture
+
+The model consists of the following components:
+- **Encoder**: Processes the input sequence and outputs its hidden states.
+- **Bahdanau Attention**: Calculates attention weights for each input timestep, helping the decoder focus on the relevant parts of the input.
+- **Decoder**: Uses the encoder's context vectors and its own hidden states to generate the corrected output sequence.
+
+### Custom Objects
+These classes typically need to be supplied via `custom_objects` when reloading the saved models:
+- `Encoder`: Encodes the input sequence into context vectors.
+- `BahdanauAttention`: Computes attention weights to focus on important parts of the input.
+- `Decoder`: Generates the output sequence by attending to the encoder's context vectors.
+
+## Known Issues and Fixes
+- Training on the full dataset is slow, so only a portion of the data was used to train this model.
+- The model sometimes repeats words within a sentence.
+- A system with more RAM and a capable GPU can train on a much larger portion of the downloaded dataset, which should improve accuracy and precision.
+
+## Dependencies
+Install the following dependencies:
+```bash
+pip install tensorflow==2.13.0
+pip install numpy
+pip install pandas
+```