@@ -57,6 +57,11 @@ def _get_seq_len(src: Tensor, batch_first: bool) -> Optional[int]:
 class Transformer(Module):
     r"""A transformer model.
 
+    .. note::
+        See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+        for an in-depth discussion of the performant building blocks PyTorch offers for building your own
+        transformer layers.
+
     User is able to modify the attributes as needed. The architecture
     is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer,
     Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and
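The added note defers to the tutorial rather than showing usage, so a minimal sketch of driving `nn.Transformer` end to end may help readers skimming the diff. The `d_model`/`nhead` values are the documented defaults; the tensor sizes are arbitrary assumptions for illustration.

```python
import torch
import torch.nn as nn

# Minimal sketch; sizes are arbitrary illustrative choices.
model = nn.Transformer(d_model=512, nhead=8, num_encoder_layers=6,
                       num_decoder_layers=6, batch_first=True)
src = torch.rand(32, 10, 512)  # (batch, source length, d_model)
tgt = torch.rand(32, 20, 512)  # (batch, target length, d_model)
out = model(src, tgt)          # -> (32, 20, 512)
```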
@@ -305,6 +310,11 @@ def _reset_parameters(self):
 class TransformerEncoder(Module):
     r"""TransformerEncoder is a stack of N encoder layers.
 
+    .. note::
+        See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+        for an in-depth discussion of the performant building blocks PyTorch offers for building your own
+        transformer layers.
+
     Users can build the BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters.
 
     Args:
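The docstring's claim that a BERT-style model can be assembled "with corresponding parameters" is easy to make concrete. A minimal sketch follows; the hyperparameters roughly approximate BERT-base and are an assumption for illustration, not taken from this patch.

```python
import torch
import torch.nn as nn

# Hyperparameters roughly matching BERT-base; an illustrative assumption.
encoder_layer = nn.TransformerEncoderLayer(
    d_model=768, nhead=12, dim_feedforward=3072,
    activation="gelu", batch_first=True)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=12)

embeddings = torch.rand(8, 128, 768)  # (batch, sequence, d_model)
hidden = encoder(embeddings)          # same shape as the input
```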
@@ -523,6 +533,11 @@ def forward(
 class TransformerDecoder(Module):
     r"""TransformerDecoder is a stack of N decoder layers.
 
+    .. note::
+        See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+        for an in-depth discussion of the performant building blocks PyTorch offers for building your own
+        transformer layers.
+
     Args:
         decoder_layer: an instance of the TransformerDecoderLayer() class (required).
         num_layers: the number of sub-decoder-layers in the decoder (required).
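A decoder stack needs the encoder output (`memory`) and usually a causal mask on the target. A minimal sketch, with shapes chosen arbitrarily for illustration:

```python
import torch
import torch.nn as nn

decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=True)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)

memory = torch.rand(4, 10, 512)  # encoder output: (batch, source length, d_model)
tgt = torch.rand(4, 20, 512)     # (batch, target length, d_model)

# Causal mask so position i cannot attend to positions after i.
tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1))
out = decoder(tgt, memory, tgt_mask=tgt_mask)  # -> (4, 20, 512)
```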
@@ -615,6 +630,11 @@ def forward(
 class TransformerEncoderLayer(Module):
     r"""TransformerEncoderLayer is made up of self-attn and feedforward network.
 
+    .. note::
+        See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+        for an in-depth discussion of the performant building blocks PyTorch offers for building your own
+        transformer layers.
+
     This standard encoder layer is based on the paper "Attention Is All You Need".
     Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
     Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
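Since the docstring anchors on the paper's post-norm formulation, it is worth noting that a single layer also exposes the pre-norm variant via `norm_first`. A minimal sketch with arbitrary shapes:

```python
import torch
import torch.nn as nn

# norm_first=False (the default) matches the paper's post-norm layer;
# norm_first=True applies LayerNorm before attention and feedforward.
layer = nn.TransformerEncoderLayer(d_model=512, nhead=8,
                                   norm_first=True, batch_first=True)
x = torch.rand(2, 16, 512)  # (batch, sequence, d_model)
y = layer(x)                # -> (2, 16, 512)
```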
@@ -931,6 +951,11 @@ def _ff_block(self, x: Tensor) -> Tensor:
 class TransformerDecoderLayer(Module):
     r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.
 
+    .. note::
+        See `this tutorial <https://pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`_
+        for an in-depth discussion of the performant building blocks PyTorch offers for building your own
+        transformer layers.
+
     This standard decoder layer is based on the paper "Attention Is All You Need".
     Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
     Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
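Running one layer in isolation makes the sublayer order from the docstring (self-attn, multi-head cross-attn, then feedforward) concrete. A minimal sketch with arbitrary shapes:

```python
import torch
import torch.nn as nn

layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=True)

memory = torch.rand(2, 10, 512)  # stands in for an encoder's output
tgt = torch.rand(2, 16, 512)     # target-side activations

# Runs self-attention over tgt, cross-attention over memory, then the FFN.
out = layer(tgt, memory)         # -> (2, 16, 512)
```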