Move out super large one off foreach_copy test (pytorch#158880)

pytorchbot · janeyx99 · web-flow · commit c76b2356bc31 · 2025-07-22T19:12:15.000-04:00
Move out super large one off foreach_copy test (pytorch#156876) Pull Request resolved: pytorch#156876 Approved by: https://github.com/albanD, https://github.com/jeffdaily (cherry picked from commit 50b2069) Co-authored-by: Jane Xu <janeyx@meta.com>
diff --git a/test/test_foreach.py b/test/test_foreach.py
@@ -16,6 +16,7 @@
 from torch.testing._internal.common_device_type import (
     dtypes,
     instantiate_device_type_tests,
+    largeTensorTest,
     onlyCUDA,
     OpDTypes,
     ops,
@@ -1358,22 +1359,13 @@ def test_foreach_copy_with_multi_dtypes(self, device, dtype, op):
         # check (a) multi_tensor_apply is called and (b) numerical parity with for-loop and Tensor.copy_
         foreach_copy_ = ForeachFuncWrapper(op.inplace_variant)
 
-        tested_large_input = False
-
         for sample in op.sample_inputs(
             device, dtype, noncontiguous=False, allow_higher_dtype_scalars=True
         ):
             for src_dtype in floating_types_and(torch.half, torch.bfloat16):
                 if src_dtype == dtype:
                     continue
                 self_tensors = [t.clone() for t in sample.input]
-                if not tested_large_input:
-                    # see https://github.com/pytorch/pytorch/issues/156261
-                    self_tensors.append(
-                        torch.empty(2**31 + 1, device=device, dtype=dtype)
-                    )
-                    tested_large_input = True
-
                 src_tensors = [t.to(src_dtype) for t in self_tensors]
                 out = foreach_copy_(
                     (self_tensors, src_tensors), is_cuda=True, expect_fastpath=True
@@ -1385,6 +1377,17 @@ def test_foreach_copy_with_multi_dtypes(self, device, dtype, op):
                 for t, ref_t in zip(out, ref_out):
                     self.assertTrue(torch.equal(t, ref_t))
 
+    @onlyCUDA
+    @largeTensorTest("40GB", device="cuda")
+    def test_foreach_copy_with_multi_dtypes_large_input(self):
+        # see https://github.com/pytorch/pytorch/issues/156261
+        self_tensor = torch.empty(2**31 + 1, device="cuda", dtype=torch.float32)
+        src_tensor = torch.ones(2**31 + 1, device="cuda", dtype=torch.bfloat16)
+
+        torch._foreach_copy_([self_tensor], [src_tensor])
+        ref_out = torch.empty_like(self_tensor).copy_(src_tensor)
+        self.assertEqual(self_tensor, ref_out)
+
     @requires_cuda
     @ops(filter(lambda op: op.name == "_foreach_copy", foreach_binary_op_db))
     def test_foreach_copy_with_different_device_inputs(self, device, dtype, op):