
Commit 212bd8f

[ROCm] Set thread_work_size to 16 for vectorized elementwise kernels (#2259)
* thread_work_size of 16 gives better performance with many workloads
1 parent 6797279 commit 212bd8f

1 file changed, +2 -2 lines changed


aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 2 additions & 2 deletions
@@ -147,7 +147,7 @@ __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
   constexpr auto io_size = calc_io_size<func_t>();
 #if defined(USE_ROCM) && defined(__gfx942__)
   // Similar check in launch_vectorized_kernel() as well. Both should be in sync.
-  constexpr int tws = (io_size >= 2) ? 8 : 16;
+  constexpr int tws = 16;
 #else
   constexpr int tws = elems_per_thread<io_size>();
 #endif
@@ -220,7 +220,7 @@ static inline void launch_vectorized_kernel(
   // Similar check in vectorized_elementwise_kernel() as well. Both should be in sync.
   c10::DeviceIndex curDevice = -1;
   AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice));
-  int tws = at::detail::getCUDAHooks().isGPUArch(curDevice, {"gfx942"}) ? ((io_size >= 2) ? 8 : 16) : elems_per_thread<io_size>();
+  int tws = at::detail::getCUDAHooks().isGPUArch(curDevice, {"gfx942"}) ? 16 : elems_per_thread<io_size>();
 #else
   int tws = elems_per_thread<io_size>();
 #endif

0 commit comments
