
Commit 0d4605b

update transformer documentation
1 parent f3bb060 commit 0d4605b

File tree: 1 file changed, +200 -19 lines

Tutorials/09_translation_transformer/transformer.ipynb

Lines changed: 200 additions & 19 deletions
@@ -115,12 +115,12 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"2023-07-13 16:32:44.194110: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+"2023-07-14 16:36:31.417479: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
 "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
-"2023-07-13 16:32:44.646582: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
-"2023-07-13 16:32:45.301857: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:45.315705: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:45.315853: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n"
+"2023-07-14 16:36:31.847184: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
+"2023-07-14 16:36:32.502173: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:32.515143: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:32.515309: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n"
 ]
 }
 ],
@@ -182,13 +182,13 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"2023-07-13 16:32:56.568097: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:56.568260: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:56.568369: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:56.941768: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:56.941932: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:56.942048: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:56.942142: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 17652 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:2b:00.0, compute capability: 8.6\n"
+"2023-07-14 16:36:32.683864: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:32.684043: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:32.684149: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:33.059900: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:33.060059: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:33.060173: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:33.060265: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 17421 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:2b:00.0, compute capability: 8.6\n"
 ]
 },
 {
@@ -588,8 +588,8 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"2023-07-13 16:33:49.284195: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.\n",
-"2023-07-13 16:33:49.391670: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600\n"
+"2023-07-14 16:36:34.267761: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.\n",
+"2023-07-14 16:36:34.365535: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600\n"
 ]
 }
 ],
@@ -645,7 +645,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 10,
+"execution_count": 9,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -694,9 +694,30 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 10,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"encoder_embeddings shape (1, 100, 512)\n"
+]
+},
+{
+"ename": "TypeError",
+"evalue": "GlobalSelfAttention.call() takes 2 positional arguments but 3 were given",
+"output_type": "error",
+"traceback": [
+"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+"Cell \u001b[0;32mIn[10], line 13\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mencoder_embeddings shape\u001b[39m\u001b[39m\"\u001b[39m, encoder_embeddings\u001b[39m.\u001b[39mshape)\n\u001b[1;32m 12\u001b[0m cross_attention_layer \u001b[39m=\u001b[39m GlobalSelfAttention(num_heads\u001b[39m=\u001b[39m\u001b[39m2\u001b[39m, key_dim\u001b[39m=\u001b[39m\u001b[39m512\u001b[39m)\n\u001b[0;32m---> 13\u001b[0m cross_attention_output \u001b[39m=\u001b[39m cross_attention_layer(decoder_embeddings, encoder_embeddings)\n\u001b[1;32m 15\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mglobal_self_attention_output shape\u001b[39m\u001b[39m\"\u001b[39m, cross_attention_output\u001b[39m.\u001b[39mshape)\n",
+"File \u001b[0;32m~/Personal/mltu/venv/lib/python3.10/site-packages/keras/utils/traceback_utils.py:70\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 67\u001b[0m filtered_tb \u001b[39m=\u001b[39m _process_traceback_frames(e\u001b[39m.\u001b[39m__traceback__)\n\u001b[1;32m 68\u001b[0m \u001b[39m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m 69\u001b[0m \u001b[39m# `tf.debugging.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m---> 70\u001b[0m \u001b[39mraise\u001b[39;00m e\u001b[39m.\u001b[39mwith_traceback(filtered_tb) \u001b[39mfrom\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 71\u001b[0m \u001b[39mfinally\u001b[39;00m:\n\u001b[1;32m 72\u001b[0m \u001b[39mdel\u001b[39;00m filtered_tb\n",
+"File \u001b[0;32m~/Personal/mltu/venv/lib/python3.10/site-packages/keras/utils/traceback_utils.py:96\u001b[0m, in \u001b[0;36minject_argument_info_in_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 94\u001b[0m bound_signature \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 95\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 96\u001b[0m \u001b[39mreturn\u001b[39;00m fn(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 97\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 98\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(e, \u001b[39m\"\u001b[39m\u001b[39m_keras_call_info_injected\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m 99\u001b[0m \u001b[39m# Only inject info for the innermost failing call\u001b[39;00m\n",
+"\u001b[0;31mTypeError\u001b[0m: GlobalSelfAttention.call() takes 2 positional arguments but 3 were given"
+]
+}
+],
 "source": [
 "encoder_vocab_size = 1000\n",
 "d_model = 512\n",
@@ -726,14 +747,174 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"### 3.5"
+"### 3.5 FeedForward layer\n",
+"\n",
+"Looking closer at the encoder and decoder layers, we can see that there is a `FeedForward` layer after each attention layer:\n",
+"\n",
+"![feedForward.png](https://www.tensorflow.org/images/tutorials/transformer/FeedForward.png)\n",
+"\n",
+"The `FeedForward` layer consists of two dense layers applied to each position separately and identically: the first expands the representation to `dff` units with a ReLU activation, and the second projects it back to `d_model`. Its output therefore has the same shape as its input, and it is added back to the original input through a residual connection followed by layer normalization.\n",
+"\n",
+"Let's implement this layer:"
+]
+},
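For reference, the computation the added cell implements is the standard position-wise feed-forward sublayer from "Attention Is All You Need", wrapped in the residual-and-normalize pattern:

$$
\mathrm{FFN}(x) = \max(0,\; x W_1 + b_1)\, W_2 + b_2, \qquad
\mathrm{out} = \mathrm{LayerNorm}\big(x + \mathrm{Dropout}(\mathrm{FFN}(x))\big)
$$

with $W_1 \in \mathbb{R}^{d_\text{model} \times d_\text{ff}}$ and $W_2 \in \mathbb{R}^{d_\text{ff} \times d_\text{model}}$, so the sequence shape is preserved.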
+{
+"cell_type": "code",
+"execution_count": 12,
+"metadata": {},
+"outputs": [],
+"source": [
+"class FeedForward(tf.keras.layers.Layer):\n",
+"    \"\"\"\n",
+"    A class that implements the feed-forward layer.\n",
+"\n",
+"    Methods:\n",
+"        call: Performs the forward pass of the layer.\n",
+"\n",
+"    Attributes:\n",
+"        seq (tf.keras.Sequential): The sequential layer that contains the two feed-forward layers and the dropout layer.\n",
+"        add (tf.keras.layers.Add): The Add layer.\n",
+"        layer_norm (tf.keras.layers.LayerNormalization): The LayerNormalization layer.\n",
+"    \"\"\"\n",
+"    def __init__(self, d_model: int, dff: int, dropout_rate: float=0.1):\n",
+"        \"\"\"\n",
+"        Constructor of the FeedForward layer.\n",
+"\n",
+"        Args:\n",
+"            d_model (int): The dimensionality of the model.\n",
+"            dff (int): The dimensionality of the feed-forward layer.\n",
+"            dropout_rate (float): The dropout rate.\n",
+"        \"\"\"\n",
+"        super().__init__()\n",
+"        self.seq = tf.keras.Sequential([\n",
+"            tf.keras.layers.Dense(dff, activation='relu'),\n",
+"            tf.keras.layers.Dense(d_model),\n",
+"            tf.keras.layers.Dropout(dropout_rate)\n",
+"        ])\n",
+"        self.add = tf.keras.layers.Add()\n",
+"        self.layer_norm = tf.keras.layers.LayerNormalization()\n",
+"\n",
+"    def call(self, x: tf.Tensor) -> tf.Tensor:\n",
+"        \"\"\"\n",
+"        The call function that performs the feed-forward operation.\n",
+"\n",
+"        Args:\n",
+"            x (tf.Tensor): The input sequence of shape (batch_size, seq_length, d_model).\n",
+"\n",
+"        Returns:\n",
+"            tf.Tensor: The output sequence of shape (batch_size, seq_length, d_model).\n",
+"        \"\"\"\n",
+"        x = self.add([x, self.seq(x)])\n",
+"        x = self.layer_norm(x)\n",
+"        return x"
 ]
 },
 {
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
-"source": []
+"source": [
+"Let's test the FeedForward layer. We will use the same random input as before. The output shape should be the same as the input shape."
+]
+},
+{
+"cell_type": "code",
+"execution_count": 13,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"encoder_embeddings shape (1, 100, 512)\n",
+"feed_forward_output shape (1, 100, 512)\n"
+]
+}
+],
+"source": [
+"encoder_vocab_size = 1000\n",
+"d_model = 512\n",
+"\n",
+"encoder_embedding_layer = PositionalEmbedding(encoder_vocab_size, d_model)\n",
+"\n",
+"random_encoder_input = np.random.randint(0, encoder_vocab_size, size=(1, 100))\n",
+"\n",
+"encoder_embeddings = encoder_embedding_layer(random_encoder_input)\n",
+"\n",
+"print(\"encoder_embeddings shape\", encoder_embeddings.shape)\n",
+"\n",
+"feed_forward_layer = FeedForward(d_model, dff=2048)\n",
+"feed_forward_output = feed_forward_layer(encoder_embeddings)\n",
+"\n",
+"print(\"feed_forward_output shape\", feed_forward_output.shape)"
+]
+},
+{
+"attachments": {},
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## 4. Implementing Encoder and Decoder layers\n",
+"### 4.1. EncoderLayer layer\n",
+"\n",
+"Great, now we have all the layers we need to implement the Encoder and Decoder layers. Let's start with the `EncoderLayer`. Why is it called `EncoderLayer`? Because it is a single layer of the Encoder: the Encoder is composed of multiple EncoderLayers stacked on top of each other, and the same goes for the Decoder:\n",
+"\n",
+"![encoderDecoder.png](https://www.tensorflow.org/images/tutorials/transformer/EncoderLayer.png)\n",
+"\n",
+"The EncoderLayer consists of two sublayers: a `MultiHeadAttention` layer (more specifically, a `GlobalSelfAttention` layer) and a `FeedForward` layer. Each of these sublayers has a residual connection around it, followed by a layer normalization. Residual connections help avoid the vanishing gradient problem in deep networks. Let's implement this layer:"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"class EncoderLayer(tf.keras.layers.Layer):\n",
+"    \"\"\"\n",
+"    A single layer of the Encoder. Usually there are multiple layers stacked on top of each other.\n",
+"\n",
+"    Methods:\n",
+"        call: Performs the forward pass of the layer.\n",
+"\n",
+"    Attributes:\n",
+"        self_attention (GlobalSelfAttention): The global self-attention layer.\n",
+"        ffn (FeedForward): The feed-forward layer.\n",
+"    \"\"\"\n",
+"    def __init__(self, d_model: int, num_heads: int, dff: int, dropout_rate: float=0.1):\n",
+"        \"\"\"\n",
+"        Constructor of the EncoderLayer.\n",
+"\n",
+"        Args:\n",
+"            d_model (int): The dimensionality of the model.\n",
+"            num_heads (int): The number of heads in the multi-head attention layer.\n",
+"            dff (int): The dimensionality of the feed-forward layer.\n",
+"            dropout_rate (float): The dropout rate.\n",
+"        \"\"\"\n",
+"        super().__init__()\n",
+"\n",
+"        self.self_attention = GlobalSelfAttention(\n",
+"            num_heads=num_heads,\n",
+"            key_dim=d_model,\n",
+"            dropout=dropout_rate\n",
+"        )\n",
+"\n",
+"        self.ffn = FeedForward(d_model, dff)\n",
+"\n",
+"    def call(self, x: tf.Tensor) -> tf.Tensor:\n",
+"        \"\"\"\n",
+"        The call function that performs the forward pass of the layer.\n",
+"\n",
+"        Args:\n",
+"            x (tf.Tensor): The input sequence of shape (batch_size, seq_length, d_model).\n",
+"\n",
+"        Returns:\n",
+"            tf.Tensor: The output sequence of shape (batch_size, seq_length, d_model).\n",
+"        \"\"\"\n",
+"        x = self.self_attention(x)\n",
+"        x = self.ffn(x)\n",
+"        return x"
+]
 },
 {
 "attachments": {},

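The commit stops at the EncoderLayer; the full Encoder that the added markdown describes ("the Encoder is composed of multiple EncoderLayers") would stack them on top of a PositionalEmbedding. A minimal sketch of that stacking, reusing the notebook's classes (the `Encoder` class below is an assumption following the TensorFlow transformer tutorial pattern, not code from this commit):

```python
import tensorflow as tf

class Encoder(tf.keras.layers.Layer):
    """Embeds token ids and runs them through a stack of EncoderLayers."""
    def __init__(self, num_layers: int, d_model: int, num_heads: int,
                 dff: int, vocab_size: int, dropout_rate: float = 0.1):
        super().__init__()
        # PositionalEmbedding and EncoderLayer are defined earlier in the notebook.
        self.pos_embedding = PositionalEmbedding(vocab_size, d_model)
        self.enc_layers = [
            EncoderLayer(d_model=d_model, num_heads=num_heads,
                         dff=dff, dropout_rate=dropout_rate)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        # x: token ids, shape (batch_size, seq_length)
        x = self.pos_embedding(x)   # -> (batch_size, seq_length, d_model)
        x = self.dropout(x)
        for enc_layer in self.enc_layers:
            x = enc_layer(x)        # shape preserved by each EncoderLayer
        return x
```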