Commit b60120d

Revert "[ATen][CUDA] Implement 128 bit vectorization v2 (pytorch#145746)"
This reverts commit 81685d8. Reverted pytorch#145746 on behalf of https://github.com/ZainRizvi due to Sorry but this is breaking in trunk. See functorch/test_ops.py::TestOperatorsCUDA::test_jvp_nn_functional_multi_head_attention_forward_cuda_float32 [GH job link](https://github.com/pytorch/pytorch/actions/runs/13032483748/job/36358184032) [HUD commit link](https://hud.pytorch.org/pytorch/pytorch/commit/81685d81eb86595d169f55a564da26eaafb2ddf5) ([comment](pytorch#145746 (comment)))
1 parent 5215885 commit b60120d

8 files changed: +21 -74 lines

8 files changed

+21
-74
lines changed

aten/src/ATen/native/cuda/CUDAJitLoops.cuh

Lines changed: 4 additions & 16 deletions
@@ -49,8 +49,8 @@ struct JittedVecKernelCache {
   at::cuda::jit::NvrtcFunction vec1;
   at::cuda::jit::NvrtcFunction vec2;
   at::cuda::jit::NvrtcFunction vec4;
-  at::cuda::jit::NvrtcFunction vec8;
 #ifdef USE_ROCM
+  at::cuda::jit::NvrtcFunction vec8;
   at::cuda::jit::NvrtcFunction vec16;
 #endif

@@ -131,30 +131,18 @@ void launch_jitted_vectorized_kernel(
   int vec_size = at::cuda::jit::can_vectorize_up_to(
       desc, c10::ArrayRef<char*>(data.data(), data.size()));

-#ifndef USE_ROCM
-  const auto input_size = c10::scalarTypeToTypeMeta(desc.f_inputs_type).itemsize();
-  const int optimal_vec_size = 16 / static_cast<int>(input_size);
-  vec_size = std::min<int>(optimal_vec_size, vec_size);
-  // Here we purposely omit vec8 for 1-byte data because of a bug in NVCC
-  // that causes some numerical mismatches with uint8 on sm80 and sm90.
-  // TODO: Revisit this after CUDA 12.8 update.
-  if (input_size < 2) {
-    vec_size = std::min<int>(vec_size, 4);
-  }
-#endif
-
   // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements)
   // fn_ptr is set to the appropriate function based on the vec size and GPU used
   at::cuda::jit::NvrtcFunction* fn_ptr = nullptr;

 #ifdef USE_ROCM
   if (vec_size == 16) {
     fn_ptr = &fn_cache.vec16;
+  } else if (vec_size == 8) {
+    fn_ptr = &fn_cache.vec8;
   } else
 #endif
-  if (vec_size == 8) {
-    fn_ptr = &fn_cache.vec8;
-  } else if (vec_size == 4) {
+  if (vec_size == 4) {
     fn_ptr = &fn_cache.vec4;
   } else if (vec_size == 2) {
     fn_ptr = &fn_cache.vec2;
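For orientation, the dispatch pattern this file returns to looks roughly like the standalone sketch below: a runtime vec_size chosen by can_vectorize_up_to selects one slot of a small per-kernel cache of compiled NVRTC functions, and the 8- and 16-wide slots exist only on ROCm builds. The Function, KernelCache, and pick_slot names are hypothetical simplifications, not the actual ATen types.

// Hypothetical stand-ins for at::cuda::jit::NvrtcFunction and JittedVecKernelCache.
struct Function { void* handle = nullptr; };

struct KernelCache {
  Function vec1, vec2, vec4;
#ifdef USE_ROCM
  Function vec8, vec16;  // wider variants are ROCm-only after this revert
#endif
};

// Map the runtime vector size to the cached compilation slot, mirroring the
// if/else chain in launch_jitted_vectorized_kernel above.
inline Function* pick_slot(KernelCache& cache, int vec_size) {
#ifdef USE_ROCM
  if (vec_size == 16) return &cache.vec16;
  if (vec_size == 8)  return &cache.vec8;
#endif
  if (vec_size == 4) return &cache.vec4;
  if (vec_size == 2) return &cache.vec2;
  return &cache.vec1;  // scalar fallback
}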

aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 2 additions & 25 deletions
@@ -61,7 +61,6 @@ constexpr auto sum_of_sizes(args_t args, std::index_sequence<Is...>) {
   }
 }

-#ifdef USE_ROCM
 template <int io_sizes>
 constexpr auto elems_per_thread(){
   if constexpr (io_sizes == 1) {
@@ -72,16 +71,6 @@ constexpr auto elems_per_thread(){
     return 4;
   }
 }
-#else
-template <int io_sizes>
-constexpr auto elems_per_thread(){
-  if constexpr (io_sizes == 1) {
-    return 16;
-  } else {
-    return 8;
-  }
-}
-#endif

 template <int io_sizes>
 constexpr auto io_block_work_size() {
@@ -202,33 +191,21 @@ static inline void launch_vectorized_kernel(
   constexpr auto io_size = calc_io_size<func_t>();
   int64_t grid = (N + io_block_work_size<io_size>() - 1) / io_block_work_size<io_size>();
   auto stream = at::cuda::getCurrentCUDAStream();
-#ifdef USE_ROCM
   int vec_size = memory::can_vectorize_up_to<func_t>(data);
-#else
-  using cpp_type = typename function_traits<func_t>::result_type;
-  const uint16_t max_vec_size = memory::can_vectorize_up_to<func_t>(data);
-  uint16_t vec_size = 16 / static_cast<uint16_t>(sizeof(cpp_type));
-  vec_size = std::min<uint16_t>(vec_size, max_vec_size);
-  // Here we purposely omit vec8 for 1-byte data because of a bug in NVCC
-  // that causes some numerical mismatches with uint8 on sm80 and sm90.
-  // TODO: Revisit this after CUDA 12.8 update.
-  if (sizeof(cpp_type) < 2) {
-    vec_size = std::min<uint16_t>(vec_size, 4);
-  }
-#endif
+
   switch (vec_size) {
 #ifdef USE_ROCM
     case 16:
       vectorized_elementwise_kernel<16, func_t, array_t>
           <<<grid, num_threads(), 0, stream>>>(N, f, data);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
       break;
-#endif
     case 8:
       vectorized_elementwise_kernel<8, func_t, array_t>
           <<<grid, num_threads(), 0, stream>>>(N, f, data);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
       break;
+#endif
     case 4:
       vectorized_elementwise_kernel<4, func_t, array_t>
           <<<grid, num_threads(), 0, stream>>>(N, f, data);
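In byte terms, the non-ROCm change in this file means each thread moves at most 4 elements per vectorized access rather than a fixed 16 bytes, so only 4-byte element types still reach 128-bit loads. A small illustrative helper (not part of ATen) to make that arithmetic concrete:

#include <cstdio>

// Bytes moved by one vectorized access of vec_size elements of type T.
// Illustrative helper only; the kernel itself works with aligned_vector loads.
template <typename T>
constexpr int bytes_per_access(int vec_size) {
  return vec_size * static_cast<int>(sizeof(T));
}

int main() {
  // With vec_size capped at 4 on the CUDA path after the revert:
  std::printf("4-byte x4 -> %d bytes\n", bytes_per_access<float>(4));          // 16 (128-bit)
  std::printf("2-byte x4 -> %d bytes\n", bytes_per_access<short>(4));          //  8
  std::printf("1-byte x4 -> %d bytes\n", bytes_per_access<unsigned char>(4));  //  4
  return 0;
}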

aten/src/ATen/native/cuda/Dropout.cu

Lines changed: 1 addition & 1 deletion
@@ -218,7 +218,7 @@ int get_vector_size(at::Tensor self, at::Tensor ret, at::Tensor mask) {
   TORCH_INTERNAL_ASSERT(vec_size <= 16, "Value of VEC must be in [2, 4, 8, 16]");
 #else
   // make sure we don't break assumption that we can't have > 4 elements / thread
-  TORCH_INTERNAL_ASSERT(vec_size <= 8, "Value of VEC must be in [2, 4, 8]");
+  TORCH_INTERNAL_ASSERT(vec_size <= 4, "Value of VEC must be in [2, 4]");
 #endif
 }

aten/src/ATen/native/cuda/MemoryAccess.cuh

Lines changed: 1 addition & 5 deletions
@@ -351,19 +351,15 @@ inline C10_HOST_DEVICE int can_vectorize_up_to(const char *pointer) {
   uint64_t address = reinterpret_cast<uint64_t>(pointer);
   constexpr int vec2_alignment = std::alignment_of_v<aligned_vector<scalar_t, 2>>;
   constexpr int vec4_alignment = std::alignment_of_v<aligned_vector<scalar_t, 4>>;
-  constexpr int vec8_alignment = std::alignment_of_v<aligned_vector<scalar_t, 8>>;
 #ifdef USE_ROCM
+  constexpr int vec8_alignment = std::alignment_of_v<aligned_vector<scalar_t, 8>>;
   constexpr int vec16_alignment = std::alignment_of_v<aligned_vector<scalar_t, 16>>;
   constexpr int type_size = sizeof(scalar_t);
   if (type_size == 1 && (address % vec16_alignment == 0)) {
     return 16;
   } else if (type_size <= 2 && (address % vec8_alignment == 0)) {
     return 8;
   } else
-#else
-  if (address % vec8_alignment == 0) {
-    return 8;
-  } else
 #endif
   if (address % vec4_alignment == 0) {
     return 4;
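The check above derives the usable vector width from pointer alignment against aligned_vector types. Below is a host-only sketch of the same idea, using a generic AlignedVec stand-in rather than ATen's aligned_vector and capping the result at 4, as the non-ROCm branch now does; the function name and struct are illustrative, not ATen API.

#include <cstdint>

// Hypothetical stand-in for ATen's aligned_vector<scalar_t, N>: N elements of T
// carrying the alignment of the full vector.
template <typename T, int N>
struct alignas(sizeof(T) * N) AlignedVec {
  T val[N];
};

// Largest vector width whose alignment the pointer satisfies, capped at 4
// (mirroring the branch kept on non-ROCm builds by this revert).
template <typename T>
int can_vectorize_up_to_sketch(const char* pointer) {
  const auto address = reinterpret_cast<std::uintptr_t>(pointer);
  if (address % alignof(AlignedVec<T, 4>) == 0) return 4;
  if (address % alignof(AlignedVec<T, 2>) == 0) return 2;
  return 1;
}

For a 1-byte type this yields 4 whenever the address is 4-byte aligned, which matches the updated expectations in cuda_vectorized_test.cu below.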

aten/src/ATen/native/cuda/jit_utils.cpp

Lines changed: 3 additions & 8 deletions
@@ -932,6 +932,7 @@ void initializeCudaContext() {
   }
 }

+#ifdef USE_ROCM
 int calc_io_size(
     const int nInputs,
     const int nOutputs,
@@ -951,6 +952,7 @@ int calc_io_size(

   return 0;
 }
+#endif

 int calc_thread_work_size(
     const int nInputs,
@@ -969,14 +971,7 @@ int calc_thread_work_size(
   }
   return io_size;
 #else
-  auto io_size = at::cuda::jit::calc_io_size(nInputs, nOutputs, inputs_type, result_type);
-  TORCH_INTERNAL_ASSERT(io_size > 0);
-  if (io_size == 1) {
-    return 16;
-  } else {
-    return 8;
-  }
-  return io_size;
+  return JIT_THREAD_WORK_SIZE;
 #endif
 }

aten/src/ATen/native/cuda/jit_utils.h

Lines changed: 2 additions & 8 deletions
@@ -60,10 +60,6 @@ inline int can_vectorize_up_to(size_t default_alignment, void *pointer) {
   if ((default_alignment <= 2) && (ip % (8 * default_alignment) == 0)) {
     return 8;
   }
-#else
-  if (ip % (8 * default_alignment) == 0) {
-    return 8;
-  }
 #endif
   if (ip % (4 * default_alignment) == 0) {
     return 4;
@@ -92,17 +88,15 @@ inline int can_vectorize_up_to(const KernelDescriptor &desc, c10::ArrayRef<char*
 }

 //FIXME - this are defined in Loops.cuh, but including Loops.cuh here would lead to circular includes Loops.cuh -> CUDALoops.cuh -> jit_utils.h -> Loops.cuh
-#ifdef USE_ROCM
 #define JIT_THREAD_WORK_SIZE 4
-#else
-#define JIT_THREAD_WORK_SIZE 8
-#endif

+#ifdef USE_ROCM
 int calc_io_size(
     const int nInputs,
     const int nOutputs,
     const c10::ScalarType& inputs_type,
     const c10::ScalarType& result_type);
+#endif

 int calc_thread_work_size(
     const int nInputs,

aten/src/ATen/native/cuda/thread_constants.h

Lines changed: 1 addition & 4 deletions
@@ -12,14 +12,11 @@
 constexpr int num_threads() {
   return 256;
 }
-
-constexpr int thread_work_size() { return 4; }
 #else
 constexpr uint32_t num_threads() {
   return C10_WARP_SIZE * 4;
 }
-
-constexpr int thread_work_size() { return 8; }
 #endif

+constexpr int thread_work_size() { return 4; }
 constexpr int block_work_size() { return thread_work_size() * num_threads(); }
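With thread_work_size() back to a uniform 4 and num_threads() at 256 on the CUDA path, block_work_size() evaluates to 1024 elements per block, which is what the grid-size division in CUDALoops.cuh consumes. A tiny worked example under those assumptions (constant names are illustrative mirrors, not the ATen ones):

#include <cstdint>
#include <cstdio>

// Mirror of the non-ROCm constants above: 256 threads, 4 elements per thread.
constexpr int kNumThreads = 256;
constexpr int kThreadWorkSize = 4;
constexpr int kBlockWorkSize = kNumThreads * kThreadWorkSize;  // 1024 elements per block

int main() {
  const int64_t N = 1'000'000;  // elementwise problem size
  // Same ceiling division as the grid computation in launch_vectorized_kernel.
  const int64_t grid = (N + kBlockWorkSize - 1) / kBlockWorkSize;
  std::printf("%lld elements -> %lld blocks of %d threads\n",
              static_cast<long long>(N), static_cast<long long>(grid), kNumThreads);
  // Prints: 1000000 elements -> 977 blocks of 256 threads
  return 0;
}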

aten/src/ATen/test/cuda_vectorized_test.cu

Lines changed: 7 additions & 7 deletions
@@ -47,11 +47,11 @@ TEST(TestLoops, HasSameArgTypes) {
 TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
   char *ptr = reinterpret_cast<char *>(buffer1);

-  ASSERT_EQ(memory::can_vectorize_up_to<bool>(ptr), 8);
-  ASSERT_EQ(memory::can_vectorize_up_to<int8_t>(ptr), 8);
-  ASSERT_EQ(memory::can_vectorize_up_to<int16_t>(ptr), 8);
-  ASSERT_EQ(memory::can_vectorize_up_to<int>(ptr), 8);
-  ASSERT_EQ(memory::can_vectorize_up_to<int64_t>(ptr), 8);
+  ASSERT_EQ(memory::can_vectorize_up_to<bool>(ptr), 4);
+  ASSERT_EQ(memory::can_vectorize_up_to<int8_t>(ptr), 4);
+  ASSERT_EQ(memory::can_vectorize_up_to<int16_t>(ptr), 4);
+  ASSERT_EQ(memory::can_vectorize_up_to<int>(ptr), 4);
+  ASSERT_EQ(memory::can_vectorize_up_to<int64_t>(ptr), 4);

   ASSERT_EQ(memory::can_vectorize_up_to<bool>(ptr + 1), 1);
   ASSERT_EQ(memory::can_vectorize_up_to<int8_t>(ptr + 1), 1);
@@ -65,8 +65,8 @@ TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
   ASSERT_EQ(memory::can_vectorize_up_to<int16_t>(ptr + 4), 2);
   ASSERT_EQ(memory::can_vectorize_up_to<int>(ptr + 4), 1);

-  ASSERT_EQ(memory::can_vectorize_up_to<bool>(ptr + 8), 8);
-  ASSERT_EQ(memory::can_vectorize_up_to<int8_t>(ptr + 8), 8);
+  ASSERT_EQ(memory::can_vectorize_up_to<bool>(ptr + 8), 4);
+  ASSERT_EQ(memory::can_vectorize_up_to<int8_t>(ptr + 8), 4);
   ASSERT_EQ(memory::can_vectorize_up_to<int16_t>(ptr + 8), 4);
   ASSERT_EQ(memory::can_vectorize_up_to<int>(ptr + 8), 2);
   ASSERT_EQ(memory::can_vectorize_up_to<int64_t>(ptr + 8), 1);
