Commit b8f3831

mikaylagawarecki authored and pytorchmergebot committed
Link to transformer tutorial in transformer docs (pytorch#144425)
[Screenshot attached to the PR: rendered documentation note, 2025-01-08]

Pull Request resolved: pytorch#144425
Approved by: https://github.com/albanD
1 parent f2c1033 commit b8f3831

File tree

torch/nn/functional.py
torch/nn/modules/activation.py
torch/nn/modules/transformer.py

3 files changed: +35 −0 lines changed

torch/nn/functional.py

Lines changed: 5 additions & 0 deletions
@@ -6022,6 +6022,11 @@ def multi_head_attention_forward(
 ) -> Tuple[Tensor, Optional[Tensor]]:
     r"""Forward method for MultiHeadAttention.
 
+    .. note::
+        See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+        for an in depth discussion of the performant building blocks PyTorch offers for building your own
+        transformer layers.
+
     See :class:`torch.nn.MultiheadAttention` for details.
 
     Args:
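The functional form documented here is what nn.MultiheadAttention.forward dispatches to. As a rough sketch (not part of this diff; sizes are arbitrary), it can be called directly by borrowing a module's own projection parameters so the shapes line up:

import torch
import torch.nn.functional as F
from torch import nn

embed_dim, num_heads = 16, 4
mha = nn.MultiheadAttention(embed_dim, num_heads)  # default (seq, batch, embed) layout
q = k = v = torch.randn(5, 2, embed_dim)           # (L, N, E)

# Call the functional API with the module's projection weights.
attn_out, attn_weights = F.multi_head_attention_forward(
    q, k, v,
    embed_dim_to_check=embed_dim,
    num_heads=num_heads,
    in_proj_weight=mha.in_proj_weight,
    in_proj_bias=mha.in_proj_bias,
    bias_k=None,
    bias_v=None,
    add_zero_attn=False,
    dropout_p=0.0,
    out_proj_weight=mha.out_proj.weight,
    out_proj_bias=mha.out_proj.bias,
    training=False,
)
print(attn_out.shape)      # torch.Size([5, 2, 16])
print(attn_weights.shape)  # torch.Size([2, 5, 5]), averaged over heads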

torch/nn/modules/activation.py

Lines changed: 5 additions & 0 deletions
@@ -973,6 +973,11 @@ def _is_make_fx_tracing():
 class MultiheadAttention(Module):
     r"""Allows the model to jointly attend to information from different representation subspaces.
 
+    .. note::
+        See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+        for an in depth discussion of the performant building blocks PyTorch offers for building your own
+        transformer layers.
+
     Method described in the paper:
     `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
 
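For context, a minimal usage sketch of the module this note is attached to (not part of the commit; sizes chosen for illustration):

import torch
from torch import nn

mha = nn.MultiheadAttention(embed_dim=16, num_heads=4, batch_first=True)
x = torch.randn(2, 5, 16)      # (batch, seq, embed)
out, weights = mha(x, x, x)    # self-attention: query = key = value
print(out.shape)               # torch.Size([2, 5, 16])
print(weights.shape)           # torch.Size([2, 5, 5])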

torch/nn/modules/transformer.py

Lines changed: 25 additions & 0 deletions
@@ -57,6 +57,11 @@ def _get_seq_len(src: Tensor, batch_first: bool) -> Optional[int]:
 class Transformer(Module):
     r"""A transformer model.
 
+    .. note::
+        See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+        for an in depth discussion of the performant building blocks PyTorch offers for building your own
+        transformer layers.
+
     User is able to modify the attributes as needed. The architecture
     is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer,
     Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and
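As a quick reference for the class being annotated, a minimal encoder-decoder forward pass (illustrative only, not part of the diff):

import torch
from torch import nn

model = nn.Transformer(d_model=32, nhead=4, num_encoder_layers=2,
                       num_decoder_layers=2, batch_first=True)
src = torch.randn(2, 10, 32)   # (batch, source length, d_model)
tgt = torch.randn(2, 7, 32)    # (batch, target length, d_model)
out = model(src, tgt)
print(out.shape)               # torch.Size([2, 7, 32])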
@@ -305,6 +310,11 @@ def _reset_parameters(self):
 class TransformerEncoder(Module):
     r"""TransformerEncoder is a stack of N encoder layers.
 
+    .. note::
+        See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+        for an in depth discussion of the performant building blocks PyTorch offers for building your own
+        transformer layers.
+
     Users can build the BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters.
 
     Args:
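A minimal sketch of stacking encoder layers with this class (illustrative sizes, not from the commit):

import torch
from torch import nn

layer = nn.TransformerEncoderLayer(d_model=32, nhead=4, batch_first=True)
encoder = nn.TransformerEncoder(layer, num_layers=3)
src = torch.randn(2, 10, 32)   # (batch, seq, d_model)
print(encoder(src).shape)      # torch.Size([2, 10, 32])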
@@ -523,6 +533,11 @@ def forward(
 class TransformerDecoder(Module):
     r"""TransformerDecoder is a stack of N decoder layers.
 
+    .. note::
+        See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+        for an in depth discussion of the performant building blocks PyTorch offers for building your own
+        transformer layers.
+
     Args:
         decoder_layer: an instance of the TransformerDecoderLayer() class (required).
         num_layers: the number of sub-decoder-layers in the decoder (required).
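Likewise, a minimal decoder stack built from TransformerDecoderLayer (illustrative only):

import torch
from torch import nn

layer = nn.TransformerDecoderLayer(d_model=32, nhead=4, batch_first=True)
decoder = nn.TransformerDecoder(layer, num_layers=3)
tgt = torch.randn(2, 7, 32)        # (batch, target length, d_model)
memory = torch.randn(2, 10, 32)    # encoder output, (batch, source length, d_model)
print(decoder(tgt, memory).shape)  # torch.Size([2, 7, 32])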
@@ -615,6 +630,11 @@ def forward(
 class TransformerEncoderLayer(Module):
     r"""TransformerEncoderLayer is made up of self-attn and feedforward network.
 
+    .. note::
+        See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+        for an in depth discussion of the performant building blocks PyTorch offers for building your own
+        transformer layers.
+
     This standard encoder layer is based on the paper "Attention Is All You Need".
     Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
     Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
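A single encoder layer can also be used on its own; a minimal sketch (pre-norm chosen only for illustration):

import torch
from torch import nn

layer = nn.TransformerEncoderLayer(d_model=32, nhead=4, dim_feedforward=64,
                                   norm_first=True, batch_first=True)
x = torch.randn(2, 10, 32)     # (batch, seq, d_model)
print(layer(x).shape)          # torch.Size([2, 10, 32])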
@@ -931,6 +951,11 @@ def _ff_block(self, x: Tensor) -> Tensor:
 class TransformerDecoderLayer(Module):
     r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.
 
+    .. note::
+        See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+        for an in depth discussion of the performant building blocks PyTorch offers for building your own
+        transformer layers.
+
     This standard decoder layer is based on the paper "Attention Is All You Need".
     Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
     Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
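And a single decoder layer with a causal target mask, sketched for illustration (sizes are arbitrary):

import torch
from torch import nn

layer = nn.TransformerDecoderLayer(d_model=32, nhead=4, batch_first=True)
tgt = torch.randn(2, 7, 32)
memory = torch.randn(2, 10, 32)
tgt_mask = nn.Transformer.generate_square_subsequent_mask(7)  # causal mask over target positions
print(layer(tgt, memory, tgt_mask=tgt_mask).shape)  # torch.Size([2, 7, 32])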
