4 changes: 4 additions & 0 deletions intermediate_source/transformer_building_blocks.py
@@ -79,6 +79,10 @@
# sequence lengths. They eliminate the need for the bug-prone practices of explicit
# padding and masking (think ``key_padding_mask`` in ``nn.MultiheadAttention``).
#
# .. warning::
#
#    Nested tensors are not currently under active development. Use at your own risk.
#
# * `scaled_dot_product_attention <https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html>`_
#
# ``scaled_dot_product_attention`` is a primitive for
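Not part of the diff, but as a minimal sketch of the workflow this tutorial text refers to: variable-length sequences are packed into a jagged-layout nested tensor and passed to ``scaled_dot_product_attention`` directly, with no padding and no ``key_padding_mask``. The shapes below are arbitrary, and a recent PyTorch build with jagged nested-tensor support (and, depending on the version, an SDPA backend that accepts nested inputs) is assumed.

```python
import torch
import torch.nn.functional as F

# Two sequences of different lengths -- no padding tokens, no key_padding_mask.
# Shapes are (seq_len, num_heads, head_dim); the sizes here are arbitrary.
seq_a = torch.randn(5, 8, 16)
seq_b = torch.randn(9, 8, 16)

# Pack the ragged batch into a jagged-layout nested tensor:
# logical shape (batch=2, ragged_seq_len, num_heads=8, head_dim=16).
nt = torch.nested.nested_tensor([seq_a, seq_b], layout=torch.jagged)

# scaled_dot_product_attention expects (batch, heads, seq, head_dim),
# so swap the ragged sequence dimension with the heads dimension.
q = k = v = nt.transpose(1, 2)

# Output keeps the same jagged (batch, heads, ragged_seq, head_dim) layout.
out = F.scaled_dot_product_attention(q, k, v)
```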
4 changes: 2 additions & 2 deletions unstable_source/nestedtensor.py
@@ -3,6 +3,8 @@
Getting Started with Nested Tensors
===============================================================
**Warning: Nested tensors are not currently under active development. Use at your own risk.**

Nested tensors generalize the shape of regular dense tensors, allowing for representation
of ragged-sized data.
@@ -21,8 +23,6 @@
they are invaluable for building transformers that can efficiently operate on ragged sequential
inputs. Below, we present an implementation of multi-head attention using nested tensors that,
when combined with ``torch.compile``, outperforms operating naively on tensors with padding.
Nested tensors are currently a prototype feature and are subject to change.
"""

import numpy as np
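For reference, a minimal sketch (not part of this diff) of the ragged data the docstring describes: a jagged nested tensor built from variable-length inputs flows through an ordinary module, and ``torch.compile`` is what the tutorial credits for beating the padded path. The sizes and the placeholder ``nn.Linear`` are arbitrary, and a recent PyTorch with jagged nested-tensor and ``torch.compile`` support is assumed.

```python
import torch

# Three variable-length feature sequences; no padding to a common length.
sequences = [torch.randn(n, 64) for n in (3, 7, 5)]
nt = torch.nested.nested_tensor(sequences, layout=torch.jagged)
print(nt.shape)  # e.g. torch.Size([3, j1, 64]); the middle dimension is ragged

# Jagged nested tensors work with standard modules, and torch.compile
# removes much of the nested-tensor bookkeeping overhead.
proj = torch.nn.Linear(64, 64)
compiled_proj = torch.compile(proj)
out = compiled_proj(nt)
print(out.shape)  # same ragged batch shape, last dimension still 64
```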