File tree Expand file tree Collapse file tree 1 file changed +4
-10
lines changed
aten/src/ATen/native/cuda Expand file tree Collapse file tree 1 file changed +4
-10
lines changed Original file line number Diff line number Diff line change @@ -227,7 +227,10 @@ static inline void launch_vectorized_kernel(
227227 auto stream = at::cuda::getCurrentCUDAStream ();
228228#ifdef USE_ROCM
229229 int vec_size = memory::can_vectorize_up_to<func_t >(data);
230- <<<<<<< HEAD
230+ // Similar check in vectorized_elementwise_kernel() as well. Both should be in sync.
231+ c10::DeviceIndex curDevice = -1 ;
232+ AT_CUDA_CHECK (c10::cuda::GetDevice (&curDevice));
233+ int tws = at::detail::getCUDAHooks ().isGPUArch (curDevice, {" gfx942" }) ? ((io_size >= 2 ) ? 8 : 16 ) : elems_per_thread<io_size>();
231234#else
232235 using cpp_type = typename function_traits<func_t >::result_type;
233236 const uint16_t max_vec_size = memory::can_vectorize_up_to<func_t >(data);
@@ -239,19 +242,10 @@ static inline void launch_vectorized_kernel(
239242 if constexpr (sizeof (cpp_type) < 2 ) {
240243 vec_size = std::min<uint16_t >(vec_size, 4 );
241244 }
242- #endif
243- =======
244- #ifdef USE_ROCM
245- // Similar check in vectorized_elementwise_kernel() as well. Both should be in sync.
246- c10::DeviceIndex curDevice = -1 ;
247- AT_CUDA_CHECK (c10::cuda::GetDevice(&curDevice));
248- int tws = at::detail::getCUDAHooks().isGPUArch(curDevice, {" gfx942" }) ? ((io_size >= 2 ) ? 8 : 16 ) : elems_per_thread<io_size>();
249- #else
250245 int tws = elems_per_thread<io_size>();
251246#endif
252247 int bws = tws * num_threads ();
253248 int64_t grid = (N + bws - 1 ) / bws;
254- >>>>>>> 2a63495dac ([ROCm] Improve vectorized elementwise kernel performance in MI300X (#2185))
255249 switch (vec_size) {
256250#ifdef USE_ROCM
257251 case 16 :
You can’t perform that action at this time.
0 commit comments