
Commit fb81400

jerrymannil authored and pragupta committed
[ROCm] Set thread_work_size to 16 for vectorized elementwise kernels (#2259)
* thread_work_size of 16 gives better performance with many workloads (cherry picked from commit 7edf50c)
1 parent 730c7e6 · commit fb81400
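For context, the tuned knob is the per-thread work size: each block of a vectorized elementwise kernel covers num_threads * thread_work_size elements, so a larger value means fewer, heavier blocks for the same N. The sketch below is a minimal illustration of that pattern, not the ATen kernel; the names and constants (elementwise_kernel_sketch, kNumThreads, kThreadWorkSize) are illustrative assumptions.

    // Minimal sketch of a thread_work_size-driven elementwise kernel (CUDA/HIP C++).
    // Not the ATen implementation; the constants below are assumptions for illustration.
    #include <cuda_runtime.h>

    constexpr int kNumThreads = 256;     // assumed threads per block
    constexpr int kThreadWorkSize = 16;  // the value this commit selects on gfx942

    template <typename scalar_t, typename func_t>
    __global__ void elementwise_kernel_sketch(int N, func_t f,
                                              const scalar_t* in, scalar_t* out) {
      // Each block owns a contiguous chunk of kNumThreads * kThreadWorkSize elements;
      // each thread walks its share of that chunk with a stride of kNumThreads.
      int base = blockIdx.x * kNumThreads * kThreadWorkSize + threadIdx.x;
      #pragma unroll
      for (int i = 0; i < kThreadWorkSize; ++i) {
        int idx = base + i * kNumThreads;
        if (idx < N) {
          out[idx] = f(in[idx]);
        }
      }
    }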

1 file changed: +4 −3

aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 4 additions & 3 deletions
@@ -226,8 +226,9 @@ C10_LAUNCH_BOUNDS_1(num_threads())
 __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
   using traits = function_traits<func_t>;
   constexpr auto io_size = calc_io_size<func_t>();
-#ifdef __gfx942__
-  constexpr int tws = (io_size >= 2) ? 8 : 16;
+#if defined(USE_ROCM) && defined(__gfx942__)
+  // Similar check in launch_vectorized_kernel() as well. Both should be in sync.
+  constexpr int tws = 16;
 #else
   constexpr int tws = elems_per_thread<io_size>();
 #endif
@@ -296,7 +297,7 @@ static inline void launch_vectorized_kernel(
   int vec_size = memory::can_vectorize_up_to<func_t>(data);
   c10::DeviceIndex curDevice = -1;
   AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice));
-  int tws = at::detail::getCUDAHooks().isGPUArch({"gfx942"}, curDevice) ? ((io_size >= 2) ? 8 : 16) : elems_per_thread<io_size>();
+  int tws = at::detail::getCUDAHooks().isGPUArch({"gfx942"}, curDevice) ? 16 : elems_per_thread<io_size>();
 #else
   using cpp_type = typename function_traits<func_t>::result_type;
   const uint16_t max_vec_size = memory::can_vectorize_up_to<func_t>(data);
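The launch-side effect of the new value, as a rough sketch (the 256-thread block size below is an assumption, not taken from this diff): raising tws from 8 to 16 doubles the elements covered per block and halves the grid for the same N.

    // Host-side launch-math sketch; 256 threads per block is an assumption.
    #include <cstdint>

    std::int64_t grid_for(std::int64_t N, int num_threads, int tws) {
      // Elements covered by one block = num_threads * tws.
      std::int64_t block_work_size = std::int64_t(num_threads) * tws;
      return (N + block_work_size - 1) / block_work_size;
    }

    // grid_for(1'048'576, 256, 16) == 256 blocks; with tws = 8 it would be 512.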
