Skip to content

Commit 2e42be0

Browse files
ngimel authored and pytorchmergebot committed
1 parent 551f104 commit 2e42be0

File tree

4 files changed

+61
-19
lines changed

4 files changed

+61
-19
lines changed

aten/src/ATen/native/TensorFactories.cpp

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1322,29 +1322,48 @@ Tensor randn_like(
13221322
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randperm ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13231323

13241324
namespace {
1325+
13251326
template <typename scalar_t>
13261327
void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) {
13271328
scalar_t* r__data = result.data_ptr<scalar_t>();
13281329

13291330
result.resize_({n});
13301331
int64_t r__stride_0 = result.stride(0);
13311332

1332-
at::parallel_for(
1333-
0,
1334-
n,
1335-
internal::GRAIN_SIZE,
1336-
[&r__data, &r__stride_0](int64_t p_begin, int64_t p_end) {
1337-
for (const auto i : c10::irange(p_begin, p_end)) {
1338-
r__data[i * r__stride_0] = static_cast<scalar_t>(i);
1339-
}
1340-
});
1341-
1342-
for (int64_t i = 0; i < n - 1; i++) {
1343-
// NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
1344-
int64_t z = generator->random() % (n - i);
1345-
scalar_t sav = r__data[i * r__stride_0];
1346-
r__data[i * r__stride_0] = r__data[(z + i) * r__stride_0];
1347-
r__data[(z + i) * r__stride_0] = sav;
1333+
// for small n, preserve old behavior
1334+
if (n < std::numeric_limits<uint32_t>::max() / 20) {
1335+
at::parallel_for(
1336+
0,
1337+
n,
1338+
internal::GRAIN_SIZE,
1339+
[&r__data, &r__stride_0](int64_t p_begin, int64_t p_end) {
1340+
for (const auto i : c10::irange(p_begin, p_end)) {
1341+
r__data[i * r__stride_0] = static_cast<scalar_t>(i);
1342+
}
1343+
});
1344+
1345+
for (int64_t i = 0; i < n - 1; i++) {
1346+
// NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
1347+
int64_t z = generator->random() % (n - i);
1348+
scalar_t sav = r__data[i * r__stride_0];
1349+
r__data[i * r__stride_0] = r__data[(z + i) * r__stride_0];
1350+
r__data[(z + i) * r__stride_0] = sav;
1351+
}
1352+
return;
1353+
}
1354+
1355+
// we need to pick a number uniformly distributed between 0 and n
1356+
// when n is of the same order of magnitude as the biggest number returned by
1357+
// random the % result is not uniformly distributed
1358+
// so we use random64(), you'd run out of RAM before you
1359+
// start seeing the skew
1360+
// use no-initialization Fischer-Yates variant
1361+
// https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_.22inside-out.22_algorithm
1362+
for (int64_t i = 0; i < n; i++) {
1363+
int64_t z = (int64_t)(generator->random64() % (i + 1));
1364+
r__data[i * r__stride_0] = i;
1365+
r__data[i * r__stride_0] = r__data[z * r__stride_0];
1366+
r__data[z * r__stride_0] = i;
13481367
}
13491368
}
13501369
} // namespace

test/test_sparse_csr.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1956,7 +1956,7 @@ def test_shape(d1, d2, d3, nnz, transposed, index_dtype):
19561956
@dtypesIfCUDA(*floating_and_complex_types_and(
19571957
*[torch.half] if SM53OrLater and TEST_CUSPARSE_GENERIC else [],
19581958
*[torch.bfloat16] if SM80OrLater and TEST_CUSPARSE_GENERIC else []))
1959-
@precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2})
1959+
@precisionOverride({torch.bfloat16: 3.5e-2, torch.float16: 1e-2})
19601960
def test_sparse_addmm(self, device, dtype):
19611961
def test_shape(m, n, p, nnz, broadcast, index_dtype, alpha_beta=None):
19621962
if alpha_beta is None:
@@ -2617,7 +2617,7 @@ def run_test(m, n, k, nnz, train):
26172617
@skipIfTorchDynamo()
26182618
@onlyCPU
26192619
@dtypes(torch.float32, torch.float64, torch.bfloat16, torch.float16)
2620-
@precisionOverride({torch.bfloat16: 0.01, torch.float16: 0.01})
2620+
@precisionOverride({torch.bfloat16: 0.02, torch.float16: 0.01})
26212621
def test_sparse_mm_reduce(self, device, dtype):
26222622
def run_test(m, n, k, nnz, reduce_type, index_dtype, train):
26232623
csr = self.genSparseCSRTensor((m, n), nnz, dtype=dtype, device=device, index_dtype=index_dtype)

test/test_tensor_creation_ops.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3576,6 +3576,29 @@ def test_randperm(self, device):
35763576
self.assertEqual(non_contiguous_tensor, res)
35773577
self.assertEqual(res.sort().values.long(), torch.arange(n, device=device))
35783578

3579+
3580+
@largeTensorTest("10GB", "cpu")
3581+
@largeTensorTest("40GB", "cuda")
3582+
@slowTest
3583+
def test_randperm_large(self, device):
3584+
# Test even distribution where rand32 might produce skewed "uniform" distribution
3585+
# n_items is chosen to not evenly divide 2**32 and be sufficiently large
3586+
# to easily detect skew
3587+
def decile(index, collection_size):
3588+
return index // (collection_size // 10)
3589+
3590+
n_items = 700_000_000
3591+
shuffled = torch.randperm(n_items, device=device)
3592+
interval = 1_000_000
3593+
shuffled_interval = shuffled[:interval]
3594+
# histogram implemented for float only
3595+
deciles = decile(shuffled_interval, shuffled.shape[0]).float().cpu()
3596+
hist, _ = deciles.histogram(10, range=(0, 10))
3597+
expected_bin = shuffled_interval.shape[0] / 10
3598+
expected_error = math.sqrt(expected_bin) / expected_bin * 3
3599+
error = (hist - expected_bin).abs().max() / expected_bin
3600+
self.assertTrue(error < expected_error, f"error {error} > {expected_error}")
3601+
35793602
# Test exceptions when device and generator types are incompatible
35803603
@onlyCUDA
35813604
@unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Produces inconsistent errors when run in fbcode.")

test/torch_np/test_random.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def test_1d(self, use_numpy):
8787
@parametrize("use_numpy", [True, False])
8888
def test_2d(self, use_numpy):
8989
# np.shuffle only shuffles the first axis
90-
ax = tnp.asarray([[1, 2, 3], [4, 5, 6]])
90+
ax = tnp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
9191
ox = ax.copy()
9292

9393
tnp.random.seed(1234)

0 commit comments

Comments
 (0)