diff --git a/intermediate_source/transformer_building_blocks.py b/intermediate_source/transformer_building_blocks.py
index decaf0602f7..b0b7f96ae26 100644
--- a/intermediate_source/transformer_building_blocks.py
+++ b/intermediate_source/transformer_building_blocks.py
@@ -79,6 +79,9 @@
 # sequence lengths. They eliminate the need for the bug-prone practices of explicit
 # padding and masking (think ``key_padding_mask`` in ``nn.MultiHeadAttention``).
 #
+# .. warning::
+#    Nested tensors are not currently under active development. Use at your own risk.
+#
 # * `scaled_dot_product_attention `_
 #
 # ``scaled_dot_product_attention`` is a primitive for
diff --git a/unstable_source/nestedtensor.py b/unstable_source/nestedtensor.py
index 77f8a4cebe1..ccec8825793 100644
--- a/unstable_source/nestedtensor.py
+++ b/unstable_source/nestedtensor.py
@@ -3,6 +3,9 @@
 Getting Started with Nested Tensors
 ===============================================================
 
+.. warning::
+    Nested tensors are not currently under active development. Use at your own risk.
+
 Nested tensors generalize the shape of regular dense tensors, allowing for
 representation of ragged-sized data.
 
@@ -21,8 +24,6 @@
 they are invaluable for building transformers that can efficiently operate on ragged sequential inputs.
 Below, we present an implementation of multi-head attention using nested tensors
 that, combined usage of ``torch.compile``, out-performs operating naively on tensors with padding.
-
-Nested tensors are currently a prototype feature and are subject to change.
 """
 
 import numpy as np