
Commit e4a05de

Skylion007 authored and pytorchmergebot committed
[BE][Ez]: Fix docs recommending inefficient tensor op order (pytorch#144270)
`detach().clone()` is faster than `.clone().detach()` since the gradients are not cloned. Let's update all the documentation and tests so that users do not use the inefficient op ordering.

Pull Request resolved: pytorch#144270
Approved by: https://github.com/awgu, https://github.com/XuehaiPan
1 parent 8d35333 commit e4a05de
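
For context on why the ordering matters: detaching first means the subsequent `clone()` runs outside the autograd graph, so no gradient bookkeeping is created and then discarded. A minimal sketch of the two spellings (illustrative, not part of the commit):

```python
import torch

x = torch.randn(1024, 1024, requires_grad=True)

# clone() first: the clone is recorded in x's autograd graph,
# and detach() then discards that bookkeeping.
slow = x.clone().detach()

# detach() first: the clone happens outside the autograd graph entirely.
fast = x.detach().clone()

# Both spellings produce the same grad-free leaf tensor.
assert torch.equal(slow, fast)
assert not slow.requires_grad and not fast.requires_grad
```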

7 files changed (+17, -15)

torch/_refs/__init__.py

Lines changed: 4 additions & 2 deletions

```diff
@@ -6607,8 +6607,10 @@ def tensor(data, *, dtype=None, device=None, pin_memory=False, requires_grad=Fal
     # TODO (or not): support names kwarg
     if isinstance(data, torch.Tensor):
         warnings.warn(
-            "To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() "
-            "or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor)"
+            "To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() "
+            "or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor)",
+            UserWarning,
+            stacklevel=2,
         )
     type_inference = dtype is None
     new_tensor = _internal_new_from_data(
```
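
Beyond the reordered message, this hunk adds an explicit `UserWarning` category and `stacklevel=2`, which makes Python attribute the warning to the caller of `tensor()` rather than to the `warnings.warn` call site. A generic illustration of `stacklevel`, using a hypothetical `api` function:

```python
import warnings

def api(x):
    # stacklevel=2 blames the line that called api(),
    # not this warnings.warn() line.
    warnings.warn("use the new spelling", UserWarning, stacklevel=2)
    return x

api(1)  # the reported filename/lineno point at this call site
```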

torch/_tensor_docs.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -57,9 +57,9 @@ def add_docstr_all(method, docstr):
 .. warning::
 
     When data is a tensor `x`, :func:`new_tensor()` reads out 'the data' from whatever it is passed,
-    and constructs a leaf variable. Therefore ``tensor.new_tensor(x)`` is equivalent to ``x.clone().detach()``
-    and ``tensor.new_tensor(x, requires_grad=True)`` is equivalent to ``x.clone().detach().requires_grad_(True)``.
-    The equivalents using ``clone()`` and ``detach()`` are recommended.
+    and constructs a leaf variable. Therefore ``tensor.new_tensor(x)`` is equivalent to ``x.detach().clone()``
+    and ``tensor.new_tensor(x, requires_grad=True)`` is equivalent to ``x.detach().clone().requires_grad_(True)``.
+    The equivalents using ``detach()`` and ``clone()`` are recommended.
 
 Args:
     data (array_like): The returned Tensor copies :attr:`data`.
```
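
A short sketch of the equivalence the docstring states, using placeholder tensors:

```python
import torch

x = torch.arange(3.0, requires_grad=True)
base = torch.empty(0)

a = base.new_tensor(x)  # reads the data out of x, builds a leaf
b = x.detach().clone()  # the recommended equivalent

assert torch.equal(a, b)
assert a.is_leaf and b.is_leaf and not a.requires_grad
```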

torch/_torch_docs.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -9034,8 +9034,8 @@ def merge_dicts(*dicts):
     When working with tensors prefer using :func:`torch.Tensor.clone`,
     :func:`torch.Tensor.detach`, and :func:`torch.Tensor.requires_grad_` for
     readability. Letting `t` be a tensor, ``torch.tensor(t)`` is equivalent to
-    ``t.clone().detach()``, and ``torch.tensor(t, requires_grad=True)``
-    is equivalent to ``t.clone().detach().requires_grad_(True)``.
+    ``t.detach().clone()``, and ``torch.tensor(t, requires_grad=True)``
+    is equivalent to ``t.detach().clone().requires_grad_(True)``.
 
 .. seealso::
```
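
The recommended spelling also makes the autograd semantics clear: the copy is a fresh leaf, so gradients do not flow back to the source. A small illustrative check:

```python
import torch

t = torch.ones(2, requires_grad=True)
copy = t.detach().clone().requires_grad_(True)

copy.sum().backward()
assert copy.grad is not None  # gradient lands on the copy...
assert t.grad is None         # ...and does not flow back to t
```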

torch/csrc/utils/tensor_new.cpp

Lines changed: 2 additions & 2 deletions

```diff
@@ -1458,8 +1458,8 @@ Tensor tensor_ctor(
   if (THPVariable_Check(data)) {
     auto ret = PyErr_WarnEx(
         PyExc_UserWarning,
-        "To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() "
-        "or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).",
+        "To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() "
+        "or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).",
         1);
     if (ret != 0)
       throw python_error();
```
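
This C++ path is where the warning users actually see from eager `torch.tensor(sourceTensor)` calls is emitted. A sketch of observing it from Python, assuming a build that emits the copy-construct warning:

```python
import warnings
import torch

src = torch.ones(3)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    torch.tensor(src)  # copy-constructing from a tensor warns

assert any(issubclass(w.category, UserWarning) for w in caught)
```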

torch/masked/maskedtensor/core.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -170,7 +170,7 @@ def __new__(cls, data, mask, requires_grad=False):
         if data.requires_grad:
             warnings.warn(
                 "It is not recommended to create a MaskedTensor with a tensor that requires_grad. "
-                "To avoid this, you can use data.clone().detach()",
+                "To avoid this, you can use data.detach().clone()",
                 UserWarning,
                 stacklevel=2,
             )
```
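
A sketch of following the warning's advice, assuming the prototype `torch.masked.masked_tensor` factory:

```python
import torch
from torch.masked import masked_tensor

data = torch.randn(4, requires_grad=True)
mask = torch.tensor([True, False, True, True])

# Passing `data` directly would trigger the warning above; detach and
# clone first so construction starts from a grad-free copy.
mt = masked_tensor(data.detach().clone(), mask)
```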

torch/testing/_internal/common_optimizers.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -2308,7 +2308,7 @@ def __init__(self, assert_eq_kwargs=None):
 
     def add(self, tensor):
         """
-        Add a clone().detach()'d version of the tensor
+        Add a detach().clone()'d version of the tensor
         """
         self.tensors.append(tensor.detach().clone())
 
```

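The helper stores defensive snapshots; a stripped-down re-creation of the pattern (hypothetical `Snapshots` class, not the actual test helper):

```python
import torch

class Snapshots:
    """Keep detached copies so later in-place updates don't affect them."""

    def __init__(self):
        self.tensors = []

    def add(self, tensor):
        # detach() first so clone() runs outside the autograd graph
        self.tensors.append(tensor.detach().clone())

snaps = Snapshots()
p = torch.zeros(3, requires_grad=True)
snaps.add(p)
with torch.no_grad():
    p.add_(1.0)
assert torch.equal(snaps.tensors[0], torch.zeros(3))  # snapshot unchanged
```
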
torch/testing/_internal/opinfo/definitions/nested.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -226,7 +226,7 @@ def _raggedness_matches(nt1, nt2):
 # as this causes autograd problems.
 def _clone(t):
     requires_grad = t.requires_grad
-    return t.clone().detach().requires_grad_(requires_grad)
+    return t.detach().clone().requires_grad_(requires_grad)
 
 
 # Helper function to update a sample with new kwargs / name
@@ -1316,10 +1316,10 @@ def _get_njts():
     # non-contiguous transposed
     yield njt.transpose(1, 3)
     # non-contiguous with holes
-    values = njt.values().clone().detach()
-    offsets = njt.offsets().clone().detach()
+    values = njt.values().detach().clone()
+    offsets = njt.offsets().detach().clone()
     # subtract 1 to cause holes
-    lengths = (offsets.diff() - 1).clone().detach()
+    lengths = (offsets.diff() - 1).detach().clone()
     yield torch.nested.nested_tensor_from_jagged(
         values=values,
         offsets=offsets,
```

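The `_clone` helper copies a tensor while preserving its `requires_grad` flag; a standalone sketch of the same idea (hypothetical name):

```python
import torch

def clone_preserving_grad(t):
    # detach() first, then clone(), then restore the original flag
    return t.detach().clone().requires_grad_(t.requires_grad)

a = torch.randn(2, requires_grad=True)
b = clone_preserving_grad(a)
assert b.requires_grad and b.is_leaf  # fresh autograd leaf, same flag
```
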