diff --git a/intermediate_source/transformer_building_blocks.py b/intermediate_source/transformer_building_blocks.py
index decaf0602f7..84cfdf05f93 100644
--- a/intermediate_source/transformer_building_blocks.py
+++ b/intermediate_source/transformer_building_blocks.py
@@ -79,6 +79,10 @@
 # sequence lengths. They eliminate the need for the bug-prone practices of explicit
 # padding and masking (think ``key_padding_mask`` in ``nn.MultiHeadAttention``).
 #
+# ```{warning}
+# Nested tensors are not currently under active development. Use at your own risk.
+# ```
+#
 # * `scaled_dot_product_attention `_
 #
 # ``scaled_dot_product_attention`` is a primitive for
diff --git a/unstable_source/nestedtensor.py b/unstable_source/nestedtensor.py
index 77f8a4cebe1..9463328cbff 100644
--- a/unstable_source/nestedtensor.py
+++ b/unstable_source/nestedtensor.py
@@ -3,6 +3,8 @@
 Getting Started with Nested Tensors
 ===============================================================
 
+**Warning: Nested tensors are not currently under active development. Use at your own risk.**
+
 Nested tensors generalize the shape of regular dense tensors, allowing
 for representation of ragged-sized data.
 
@@ -21,8 +23,6 @@
 they are invaluable for building transformers that can efficiently operate on ragged sequential
 inputs. Below, we present an implementation of multi-head attention using nested tensors
 that, combined usage of ``torch.compile``, out-performs operating naively on tensors with padding.
-
-Nested tensors are currently a prototype feature and are subject to change.
 """
 
 import numpy as np
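
For context, the tutorial prose touched by this patch describes nested tensors as a way to represent ragged-sized data without explicit padding or masking. Below is a minimal sketch of that idea; it is not part of the patch itself and assumes a PyTorch build that ships the prototype ``torch.nested`` API with the ``torch.jagged`` layout.

import torch

# Two sequences of different lengths -- the kind of ragged input that would
# otherwise need padding plus a ``key_padding_mask``.
seq_a = torch.randn(3, 8)   # 3 tokens, embedding dim 8
seq_b = torch.randn(5, 8)   # 5 tokens, embedding dim 8

# Pack both sequences into one nested tensor; no padded dense tensor is materialized.
nt = torch.nested.nested_tensor([seq_a, seq_b], layout=torch.jagged)

print(nt.is_nested)  # True
print(nt.size(0))    # 2 constituent tensors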