
Commit aa69d73

hongxiayang authored and pytorchmergebot committed
[ROCm] fix torch.layer_norm invalid configuration problem when input is large tensor (pytorch#144007)
Fixes pytorch#136291

This PR fixes the `invalid configuration argument` error that occurs on ROCm when `torch.layer_norm` is called on a large input tensor:

```
  File "/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/nn/functional.py", line 2573, in layer_norm
    return torch.layer_norm
RuntimeError: HIP error: invalid configuration argument
```

After investigation, the cause is that the AMD compute language runtime checks whether `gridDim.x * blockDim.x` exceeds `std::numeric_limits<uint32_t>::max()`; if it does, the launch fails with the "invalid configuration argument" message. The fix splits the work into several chunks so that no single launch hits the failure condition, preserving correctness and completeness given the current implementation of `vectorized_layer_norm_kernel`.

Also adds a large-tensor layer_norm unit test, `test_layer_norm_large_tensor`, using the same shape `[16, 3000, 3000, 16]` as pytorch#136291, so the test can check the expected output values and confirm correctness. Future work may include layer_norm performance optimization and CK layer_norm integration.

Pull Request resolved: pytorch#144007
Approved by: https://github.com/eqy
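For a sense of scale, here is a back-of-the-envelope sketch (not part of the commit) of why a single launch overflows the 32-bit limit for the shape reported in pytorch#136291. The 64-wide wavefront is an assumption about the AMD GPUs involved; the actual kernel queries it at runtime via `at::cuda::warp_size()`.

```python
# Rough arithmetic behind the failure, assuming a 64-wide wavefront
# (typical for AMD GPUs; the kernel reads the real value at runtime).
# For input shape [16, 3000, 3000, 16] with normalized_shape=[16],
# the kernel sees M rows of N elements each.
M = 16 * 3000 * 3000        # 144,000,000 rows -> blocks.x
N = 16                      # elements per row
warp_size = 64              # threads.x on ROCm (assumed wavefront width)

launch_extent = M * warp_size          # gridDim.x * blockDim.x
uint32_max = 2**32 - 1

print(f"{launch_extent:,} > {uint32_max:,} ? {launch_extent > uint32_max}")
# 9,216,000,000 > 4,294,967,295 ? True  -> "invalid configuration argument"
```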
1 parent 6c54963 commit aa69d73

File tree

2 files changed: +56 -1 lines changed

aten/src/ATen/native/cuda/layer_norm_kernel.cu

Lines changed: 38 additions & 1 deletion
@@ -745,12 +745,49 @@ void launch_vectorized_layer_norm_kernel(
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int warp_size = at::cuda::warp_size();
   const dim3 threads(warp_size, num_threads() / warp_size, 1);
-  const dim3 blocks(M);
+  dim3 blocks(M);
+
+#ifdef USE_ROCM
+  uint64_t workgroupSize = static_cast<uint64_t>(blocks.x) * static_cast<uint64_t>(threads.x);
+  // A launch this large causes the invalid configuration problem
+  if (workgroupSize > std::numeric_limits<uint32_t>::max()) {
+    // Fix invalid configuration https://github.com/pytorch/pytorch/issues/136291
+    blocks.x = std::numeric_limits<uint32_t>::max() / threads.x;
+  }
+#endif
+
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(threads.y % 2 == 0 || threads.y == 1);
   int nshared = threads.y > 1 ? threads.y * 3/2 *sizeof(T_ACC) : 0;
   vectorized_layer_norm_kernel<<<blocks, threads, nshared, stream>>>(N, eps, X_data,
     gamma_data, beta_data, mean_data, rstd_data, Y_data);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+#ifdef USE_ROCM
+  // blocks.x now holds the max grid x dimension that avoids the invalid configuration error
+  // Fix invalid configuration https://github.com/pytorch/pytorch/issues/136291
+  // Ensure all elements are processed: prepare for the next round
+  int64_t remaining = M - blocks.x;
+  const T* X_data2 = X_data;
+  T_ACC* mean_data2 = mean_data;
+  T_ACC* rstd_data2 = rstd_data;
+  T* Y_data2 = Y_data;
+
+  while (remaining > 0) {
+    X_data2 += N * blocks.x;
+    mean_data2 += blocks.x;
+    rstd_data2 += blocks.x;
+    Y_data2 += N * blocks.x;
+
+    blocks.x = (remaining > blocks.x) ? blocks.x : remaining;
+
+    vectorized_layer_norm_kernel<<<blocks, threads, nshared, stream>>>(N, eps, X_data2,
+      gamma_data, beta_data, mean_data2, rstd_data2, Y_data2);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+    remaining -= blocks.x;
+  }
+#endif
+
 }
 
 template <typename T, typename T_ACC>
test/test_nn.py

Lines changed: 18 additions & 0 deletions
@@ -7139,6 +7139,24 @@ def test_layer_norm_eps(self):
         ln = torch.nn.LayerNorm(2, eps=1e-6, elementwise_affine=False)
         self.assertEqual(ln.forward(x), torch.zeros_like(x))
 
+    @largeTensorTest("40GB", device="cuda")
+    def test_layer_norm_large_tensor(self):
+        # test for https://github.com/pytorch/pytorch/issues/136291
+        device = torch.device("cuda")
+        b, n, dp = 16, 3000, 16
+        pairwise_repr = torch.randn(b, n, n, dp)
+
+        attn_bias_norm = nn.LayerNorm(dp).to(device=device)
+        pairwise_repr = pairwise_repr.to(dtype=torch.float32, device=device)
+        # we want a smaller copy to compare the results
+        pairwise_small = pairwise_repr[-1, -1, -1].detach().clone()
+        norm = attn_bias_norm(pairwise_repr)
+        norm_small = attn_bias_norm(pairwise_small)
+
+        self.assertEqual(norm.shape, torch.Size([16, 3000, 3000, 16]))
+        # Check output to make sure it is correct.
+        torch.testing.assert_close(norm_small, norm[-1, -1, -1])
+
     def test_padding_list(self):
         # Padding can be a list, or tuple (regression test for gh-54452)
         x = torch.randn(4, 8, 32, 32)

0 commit comments
