Skip to content

Commit 7a876ff

Browse files
committed
Resolve merge conflicts
1 parent 83133ff commit 7a876ff

File tree

1 file changed

+4
-10
lines changed

1 file changed

+4
-10
lines changed

aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 4 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -227,7 +227,10 @@ static inline void launch_vectorized_kernel(
227227
auto stream = at::cuda::getCurrentCUDAStream();
228228
#ifdef USE_ROCM
229229
int vec_size = memory::can_vectorize_up_to<func_t>(data);
230-
<<<<<<< HEAD
230+
// Similar check in vectorized_elementwise_kernel() as well. Both should be in sync.
231+
c10::DeviceIndex curDevice = -1;
232+
AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice));
233+
int tws = at::detail::getCUDAHooks().isGPUArch(curDevice, {"gfx942"}) ? ((io_size >= 2) ? 8 : 16) : elems_per_thread<io_size>();
231234
#else
232235
using cpp_type = typename function_traits<func_t>::result_type;
233236
const uint16_t max_vec_size = memory::can_vectorize_up_to<func_t>(data);
@@ -239,19 +242,10 @@ static inline void launch_vectorized_kernel(
239242
if constexpr (sizeof(cpp_type) < 2) {
240243
vec_size = std::min<uint16_t>(vec_size, 4);
241244
}
242-
#endif
243-
=======
244-
#ifdef USE_ROCM
245-
// Similar check in vectorized_elementwise_kernel() as well. Both should be in sync.
246-
c10::DeviceIndex curDevice = -1;
247-
AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice));
248-
int tws = at::detail::getCUDAHooks().isGPUArch(curDevice, {"gfx942"}) ? ((io_size >= 2) ? 8 : 16) : elems_per_thread<io_size>();
249-
#else
250245
int tws = elems_per_thread<io_size>();
251246
#endif
252247
int bws = tws * num_threads();
253248
int64_t grid = (N + bws - 1) / bws;
254-
>>>>>>> 2a63495dac ([ROCm] Improve vectorized elementwise kernel performance in MI300X (#2185))
255249
switch (vec_size) {
256250
#ifdef USE_ROCM
257251
case 16:

0 commit comments

Comments
 (0)