
Commit 0d4605b

update transformer documentation
1 parent f3bb060 commit 0d4605b

File tree: 1 file changed, +200 -19 lines

Tutorials/09_translation_transformer/transformer.ipynb

Lines changed: 200 additions & 19 deletions
@@ -115,12 +115,12 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"2023-07-13 16:32:44.194110: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+"2023-07-14 16:36:31.417479: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
 "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
-"2023-07-13 16:32:44.646582: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
-"2023-07-13 16:32:45.301857: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:45.315705: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:45.315853: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n"
+"2023-07-14 16:36:31.847184: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
+"2023-07-14 16:36:32.502173: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:32.515143: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:32.515309: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n"
 ]
 }
 ],
@@ -182,13 +182,13 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"2023-07-13 16:32:56.568097: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:56.568260: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:56.568369: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:56.941768: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:56.941932: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:56.942048: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
-"2023-07-13 16:32:56.942142: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 17652 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:2b:00.0, compute capability: 8.6\n"
+"2023-07-14 16:36:32.683864: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:32.684043: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:32.684149: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:33.059900: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:33.060059: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:33.060173: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n",
+"2023-07-14 16:36:33.060265: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 17421 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:2b:00.0, compute capability: 8.6\n"
 ]
 },
 {
@@ -588,8 +588,8 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"2023-07-13 16:33:49.284195: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.\n",
-"2023-07-13 16:33:49.391670: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600\n"
+"2023-07-14 16:36:34.267761: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.\n",
+"2023-07-14 16:36:34.365535: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600\n"
 ]
 }
 ],
@@ -645,7 +645,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 10,
+"execution_count": 9,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -694,9 +694,30 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 10,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"encoder_embeddings shape (1, 100, 512)\n"
+]
+},
+{
+"ename": "TypeError",
+"evalue": "GlobalSelfAttention.call() takes 2 positional arguments but 3 were given",
+"output_type": "error",
+"traceback": [
+"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+"Cell \u001b[0;32mIn[10], line 13\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mencoder_embeddings shape\u001b[39m\u001b[39m\"\u001b[39m, encoder_embeddings\u001b[39m.\u001b[39mshape)\n\u001b[1;32m 12\u001b[0m cross_attention_layer \u001b[39m=\u001b[39m GlobalSelfAttention(num_heads\u001b[39m=\u001b[39m\u001b[39m2\u001b[39m, key_dim\u001b[39m=\u001b[39m\u001b[39m512\u001b[39m)\n\u001b[0;32m---> 13\u001b[0m cross_attention_output \u001b[39m=\u001b[39m cross_attention_layer(decoder_embeddings, encoder_embeddings)\n\u001b[1;32m 15\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mglobal_self_attention_output shape\u001b[39m\u001b[39m\"\u001b[39m, cross_attention_output\u001b[39m.\u001b[39mshape)\n",
+"File \u001b[0;32m~/Personal/mltu/venv/lib/python3.10/site-packages/keras/utils/traceback_utils.py:70\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 67\u001b[0m filtered_tb \u001b[39m=\u001b[39m _process_traceback_frames(e\u001b[39m.\u001b[39m__traceback__)\n\u001b[1;32m 68\u001b[0m \u001b[39m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m 69\u001b[0m \u001b[39m# `tf.debugging.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m---> 70\u001b[0m \u001b[39mraise\u001b[39;00m e\u001b[39m.\u001b[39mwith_traceback(filtered_tb) \u001b[39mfrom\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 71\u001b[0m \u001b[39mfinally\u001b[39;00m:\n\u001b[1;32m 72\u001b[0m \u001b[39mdel\u001b[39;00m filtered_tb\n",
+"File \u001b[0;32m~/Personal/mltu/venv/lib/python3.10/site-packages/keras/utils/traceback_utils.py:96\u001b[0m, in \u001b[0;36minject_argument_info_in_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 94\u001b[0m bound_signature \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 95\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 96\u001b[0m \u001b[39mreturn\u001b[39;00m fn(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 97\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 98\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(e, \u001b[39m\"\u001b[39m\u001b[39m_keras_call_info_injected\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m 99\u001b[0m \u001b[39m# Only inject info for the innermost failing call\u001b[39;00m\n",
+"\u001b[0;31mTypeError\u001b[0m: GlobalSelfAttention.call() takes 2 positional arguments but 3 were given"
+]
+}
+],
 "source": [
 "encoder_vocab_size = 1000\n",
 "d_model = 512\n",
@@ -726,14 +747,174 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"### 3.5"
+"### 3.5 FeedForward layer\n",
+"\n",
+"Looking closer at the encoder and decoder layers, we can see that there is a `FeedForward` layer after each attention layer:\n",
+"\n",
+"![feedForward.png](https://www.tensorflow.org/images/tutorials/transformer/FeedForward.png)\n",
+"\n",
+"The `FeedForward` layer consists of two dense layers applied to each position separately and identically: the first expands the representation to `dff` units with a ReLU activation, and the second projects it back to `d_model`. Its output therefore has the same shape as its input, and it is added back to the original input through a residual connection followed by layer normalization.\n",
+"\n",
+"Let's implement this layer:"
+]
+},
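For reference, the computation the added cell implements is the standard position-wise feed-forward sublayer from "Attention Is All You Need", wrapped in the residual-and-normalize pattern:

$$
\mathrm{FFN}(x) = \max(0,\; x W_1 + b_1)\, W_2 + b_2, \qquad
\mathrm{out} = \mathrm{LayerNorm}\big(x + \mathrm{Dropout}(\mathrm{FFN}(x))\big)
$$

with $W_1 \in \mathbb{R}^{d_\text{model} \times d_\text{ff}}$ and $W_2 \in \mathbb{R}^{d_\text{ff} \times d_\text{model}}$, so the sequence shape is preserved.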
+{
+"cell_type": "code",
+"execution_count": 12,
+"metadata": {},
+"outputs": [],
+"source": [
+"class FeedForward(tf.keras.layers.Layer):\n",
+"    \"\"\"\n",
+"    A class that implements the feed-forward layer.\n",
+"\n",
+"    Methods:\n",
+"        call: Performs the forward pass of the layer.\n",
+"\n",
+"    Attributes:\n",
+"        seq (tf.keras.Sequential): The sequential layer that contains the two feed-forward layers and the dropout layer.\n",
+"        add (tf.keras.layers.Add): The Add layer.\n",
+"        layer_norm (tf.keras.layers.LayerNormalization): The LayerNormalization layer.\n",
+"    \"\"\"\n",
+"    def __init__(self, d_model: int, dff: int, dropout_rate: float=0.1):\n",
+"        \"\"\"\n",
+"        Constructor of the FeedForward layer.\n",
+"\n",
+"        Args:\n",
+"            d_model (int): The dimensionality of the model.\n",
+"            dff (int): The dimensionality of the feed-forward layer.\n",
+"            dropout_rate (float): The dropout rate.\n",
+"        \"\"\"\n",
+"        super().__init__()\n",
+"        self.seq = tf.keras.Sequential([\n",
+"            tf.keras.layers.Dense(dff, activation='relu'),\n",
+"            tf.keras.layers.Dense(d_model),\n",
+"            tf.keras.layers.Dropout(dropout_rate)\n",
+"        ])\n",
+"        self.add = tf.keras.layers.Add()\n",
+"        self.layer_norm = tf.keras.layers.LayerNormalization()\n",
+"\n",
+"    def call(self, x: tf.Tensor) -> tf.Tensor:\n",
+"        \"\"\"\n",
+"        The call function that performs the feed-forward operation.\n",
+"\n",
+"        Args:\n",
+"            x (tf.Tensor): The input sequence of shape (batch_size, seq_length, d_model).\n",
+"\n",
+"        Returns:\n",
+"            tf.Tensor: The output sequence of shape (batch_size, seq_length, d_model).\n",
+"        \"\"\"\n",
+"        x = self.add([x, self.seq(x)])\n",
+"        x = self.layer_norm(x)\n",
+"        return x"
 ]
 },
 {
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
-"source": []
+"source": [
+"Let's test the FeedForward layer. We will use the same random input as before. The output shape should be the same as the input shape."
+]
+},
+{
+"cell_type": "code",
+"execution_count": 13,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"encoder_embeddings shape (1, 100, 512)\n",
+"feed_forward_output shape (1, 100, 512)\n"
+]
+}
+],
+"source": [
+"encoder_vocab_size = 1000\n",
+"d_model = 512\n",
+"\n",
+"encoder_embedding_layer = PositionalEmbedding(encoder_vocab_size, d_model)\n",
+"\n",
+"random_encoder_input = np.random.randint(0, encoder_vocab_size, size=(1, 100))\n",
+"\n",
+"encoder_embeddings = encoder_embedding_layer(random_encoder_input)\n",
+"\n",
+"print(\"encoder_embeddings shape\", encoder_embeddings.shape)\n",
+"\n",
+"feed_forward_layer = FeedForward(d_model, dff=2048)\n",
+"feed_forward_output = feed_forward_layer(encoder_embeddings)\n",
+"\n",
+"print(\"feed_forward_output shape\", feed_forward_output.shape)"
+]
+},
+{
+"attachments": {},
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## 4. Implementing Encoder and Decoder layers\n",
+"### 4.1. EncoderLayer layer\n",
+"\n",
+"Great, now we have all the layers we need to implement the Encoder and Decoder layers. Let's start with the `EncoderLayer`. Why is it called `EncoderLayer`? Because it is a single layer of the Encoder: the Encoder is composed of multiple EncoderLayers stacked on top of each other, and the same goes for the Decoder:\n",
+"\n",
+"![encoderDecoder.png](https://www.tensorflow.org/images/tutorials/transformer/EncoderLayer.png)\n",
+"\n",
+"The EncoderLayer consists of two sublayers: a `MultiHeadAttention` layer (more specifically, a `GlobalSelfAttention` layer) and a `FeedForward` layer. Each of these sublayers has a residual connection around it, followed by a layer normalization. Residual connections help avoid the vanishing gradient problem in deep networks. Let's implement this layer:"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"class EncoderLayer(tf.keras.layers.Layer):\n",
+"    \"\"\"\n",
+"    A single layer of the Encoder. Usually there are multiple layers stacked on top of each other.\n",
+"\n",
+"    Methods:\n",
+"        call: Performs the forward pass of the layer.\n",
+"\n",
+"    Attributes:\n",
+"        self_attention (GlobalSelfAttention): The global self-attention layer.\n",
+"        ffn (FeedForward): The feed-forward layer.\n",
+"    \"\"\"\n",
+"    def __init__(self, d_model: int, num_heads: int, dff: int, dropout_rate: float=0.1):\n",
+"        \"\"\"\n",
+"        Constructor of the EncoderLayer.\n",
+"\n",
+"        Args:\n",
+"            d_model (int): The dimensionality of the model.\n",
+"            num_heads (int): The number of heads in the multi-head attention layer.\n",
+"            dff (int): The dimensionality of the feed-forward layer.\n",
+"            dropout_rate (float): The dropout rate.\n",
+"        \"\"\"\n",
+"        super().__init__()\n",
+"\n",
+"        self.self_attention = GlobalSelfAttention(\n",
+"            num_heads=num_heads,\n",
+"            key_dim=d_model,\n",
+"            dropout=dropout_rate\n",
+"        )\n",
+"\n",
+"        self.ffn = FeedForward(d_model, dff)\n",
+"\n",
+"    def call(self, x: tf.Tensor) -> tf.Tensor:\n",
+"        \"\"\"\n",
+"        The call function that performs the forward pass of the layer.\n",
+"\n",
+"        Args:\n",
+"            x (tf.Tensor): The input sequence of shape (batch_size, seq_length, d_model).\n",
+"\n",
+"        Returns:\n",
+"            tf.Tensor: The output sequence of shape (batch_size, seq_length, d_model).\n",
+"        \"\"\"\n",
+"        x = self.self_attention(x)\n",
+"        x = self.ffn(x)\n",
+"        return x"
+]
 },
 {
 "attachments": {},

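The commit stops at the EncoderLayer; the full Encoder that the added markdown describes ("the Encoder is composed of multiple EncoderLayers") would stack them on top of a PositionalEmbedding. A minimal sketch of that stacking, reusing the notebook's classes (the `Encoder` class below is an assumption following the TensorFlow transformer tutorial pattern, not code from this commit):

```python
import tensorflow as tf

class Encoder(tf.keras.layers.Layer):
    """Embeds token ids and runs them through a stack of EncoderLayers."""
    def __init__(self, num_layers: int, d_model: int, num_heads: int,
                 dff: int, vocab_size: int, dropout_rate: float = 0.1):
        super().__init__()
        # PositionalEmbedding and EncoderLayer are defined earlier in the notebook.
        self.pos_embedding = PositionalEmbedding(vocab_size, d_model)
        self.enc_layers = [
            EncoderLayer(d_model=d_model, num_heads=num_heads,
                         dff=dff, dropout_rate=dropout_rate)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        # x: token ids, shape (batch_size, seq_length)
        x = self.pos_embedding(x)   # -> (batch_size, seq_length, d_model)
        x = self.dropout(x)
        for enc_layer in self.enc_layers:
            x = enc_layer(x)        # shape preserved by each EncoderLayer
        return x
```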