5151
namespace at::native {
5353
54+ #ifdef USE_ROCM
55+ // Custom configuration for vectorized elementwise kernel
56+ // with template instantiation.
namespace vectorized_templated_config {

// Threads launched per block for the templated vectorized kernel.
constexpr int num_threads() {
  return 512;
}

// Elements each thread is responsible for.
constexpr int elems_per_thread() {
  return 32;
}

// Total elements one block processes: elems_per_thread() * num_threads().
constexpr int block_work_size() {
  return num_threads() * elems_per_thread();
}

} // namespace vectorized_templated_config
70+ #endif
5471
5572template <typename args_t , size_t ... Is>
5673constexpr auto sum_of_sizes (args_t args, std::index_sequence<Is...>) {
@@ -255,6 +272,139 @@ static inline void launch_vectorized_kernel(
255272 }
256273}
257274
275+ #ifdef USE_ROCM
276+ template <
277+ int vec_size,
278+ typename func_t ,
279+ typename array_t ,
280+ typename inp_calc_t ,
281+ typename out_calc_t ,
282+ typename loader_t ,
283+ typename storer_t ,
284+ typename OutputType,
285+ typename ... InputTypes>
286+ C10_LAUNCH_BOUNDS_1 (vectorized_templated_config::num_threads())
287+ __global__ void vectorized_templated_elementwise_kernel(
288+ int N,
289+ func_t f,
290+ array_t data,
291+ inp_calc_t inp_calc,
292+ out_calc_t out_calc,
293+ loader_t loader,
294+ storer_t storer) {
295+ int remaining =
296+ N - vectorized_templated_config::block_work_size () * blockIdx .x ;
297+ if (remaining <
298+ vectorized_templated_config::block_work_size ()) { // if this block handles
299+ // the reminder,
300+ // just do a naive unrolled loop
301+ auto policy = memory::policies::unroll_base<
302+ vectorized_templated_config::num_threads (),
303+ array_t ,
304+ inp_calc_t ,
305+ out_calc_t ,
306+ loader_t ,
307+ storer_t ,
308+ vectorized_templated_config::elems_per_thread ()>(
309+ data, remaining, inp_calc, out_calc, loader, storer);
310+ elementwise_kernel_helper (f, policy);
311+ } else { // if this block has a full `block_work_size` data to handle, use
312+ // vectorized memory access
313+ elementwise_kernel_helper (
314+ f,
315+ memory::policies::vectorized_templated<
316+ vec_size,
317+ array_t ,
318+ vectorized_templated_config::elems_per_thread (),
319+ vectorized_templated_config::num_threads (),
320+ OutputType,
321+ InputTypes...>(data));
322+ }
323+ }
324+
325+ // This function assume trivial 1d and supports template specialization
326+ // to avoid dynamic casting.
327+ // Input vectorization size is based on runtime information, i.e.
328+ // the actual data types of the input and output tensor and cannot
329+ // be determined using the functor type, as in regular non-templated
330+ // vectorized kernels. The caller is in charge of selecting the correct input
331+ // vectorization length.
332+ template <
333+ typename func_t ,
334+ typename array_t ,
335+ typename inp_calc_t ,
336+ typename out_calc_t ,
337+ typename loader_t ,
338+ typename storer_t ,
339+ typename OutputType,
340+ typename ... InputTypes>
341+ static inline void launch_vectorized_templated_kernel (
342+ int64_t N,
343+ const func_t & f,
344+ array_t data,
345+ inp_calc_t ic,
346+ out_calc_t oc,
347+ loader_t l,
348+ storer_t s) {
349+ TORCH_INTERNAL_ASSERT (N > 0 && N <= std::numeric_limits<int32_t >::max ());
350+ using traits = function_traits<func_t >;
351+ int64_t grid = (N + vectorized_templated_config::block_work_size () - 1 ) /
352+ vectorized_templated_config::block_work_size ();
353+ auto stream = at::cuda::getCurrentCUDAStream ();
354+ int vec_size = memory::can_vectorize_up_to<func_t >(data);
355+ switch (vec_size) {
356+ case 8 :
357+ vectorized_templated_elementwise_kernel<
358+ 8 ,
359+ func_t ,
360+ array_t ,
361+ inp_calc_t ,
362+ out_calc_t ,
363+ loader_t ,
364+ storer_t ,
365+ OutputType,
366+ InputTypes...>
367+ <<<grid, vectorized_templated_config::num_threads(), 0 , stream>>> (
368+ N, f, data, ic, oc, l, s);
369+ C10_CUDA_KERNEL_LAUNCH_CHECK ();
370+ break ;
371+ case 4 :
372+ vectorized_templated_elementwise_kernel<
373+ 4 ,
374+ func_t ,
375+ array_t ,
376+ inp_calc_t ,
377+ out_calc_t ,
378+ loader_t ,
379+ storer_t ,
380+ OutputType,
381+ InputTypes...>
382+ <<<grid, vectorized_templated_config::num_threads(), 0 , stream>>> (
383+ N, f, data, ic, oc, l, s);
384+ C10_CUDA_KERNEL_LAUNCH_CHECK ();
385+ break ;
386+ case 2 :
387+ vectorized_templated_elementwise_kernel<
388+ 2 ,
389+ func_t ,
390+ array_t ,
391+ inp_calc_t ,
392+ out_calc_t ,
393+ loader_t ,
394+ storer_t ,
395+ OutputType,
396+ InputTypes...>
397+ <<<grid, vectorized_templated_config::num_threads(), 0 , stream>>> (
398+ N, f, data, ic, oc, l, s);
399+ C10_CUDA_KERNEL_LAUNCH_CHECK ();
400+ break ;
401+ default :
402+ // vector size 1 is not handled as part of vectorize_templated kernel
403+ TORCH_INTERNAL_ASSERT (false , " Unexpected vectorization size" );
404+ }
405+ }
406+ #endif
407+
258408template <
259409 typename func_t ,
260410 typename array_t ,
@@ -500,6 +650,46 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
500650#endif
501651}
502652
653+ #ifdef USE_ROCM
namespace {
// Compile-time argument-type filter: accepts only functors of arity 2 whose
// argument types, inspected position by position, are all float. Used by
// gpu_kernel_impl to gate the templated vectorized kernel path.
template <typename TupleLike, size_t arity, size_t arg_num = 0>
struct check_types {
  constexpr static inline bool check() {
    if constexpr (arity != 2) {
      return false;
    } else if constexpr (std::is_same_v<
                             float,
                             std::tuple_element_t<arg_num, TupleLike>>) {
      // This position matches; inspect the next argument.
      return check_types<TupleLike, arity, arg_num + 1>::check();
    } else {
      return false;
    }
  }
};

// Bottom case: every argument position was inspected and matched, so the
// functor is accepted — except when there are no arguments (arity == 0).
template <typename TupleLike, size_t arity>
struct check_types<TupleLike, arity, arity> {
  constexpr static inline bool check() {
    return arity != 0;
  }
};

// Empty argument tuples are always rejected.
template <typename TupleLike>
struct check_types<TupleLike, 0, 0> {
  constexpr static inline bool check() {
    return false;
  }
};
} // namespace
691+ #endif
692+
503693template <typename func_t >
504694void gpu_kernel_impl (TensorIteratorBase& iter, const func_t & f) {
505695 if (!needs_dynamic_casting<func_t >::check (iter)) {
@@ -524,6 +714,45 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
524714
525715 if (contiguous) {
526716#ifdef USE_ROCM
717+ // Attempt to call specialized vectorized elementwise kernel
718+ // that enables interleaving.
719+ using float_map = c10::CppTypeToScalarType<float >;
720+ using bfloat16_map = c10::CppTypeToScalarType<BFloat16>;
721+ if (iter.ninputs () == 2 && iter.input_dtype (0 ) == float_map::value &&
722+ iter.input_dtype (1 ) == bfloat16_map::value &&
723+ memory::can_vectorize_up_to<func_t >(data) > 1 ) {
724+ // constexpr to reduce the amount of kernels (empty) generated for
725+ // vectorized templated elementwise and limit which functors are actually
726+ // applied to the load and store at compile time.
727+ using func_tuple = typename traits::ArgsTuple;
728+ if constexpr (
729+ std::is_same_v<float , arg0_t > && traits::arity == 2 &&
730+ check_types<func_tuple, traits::arity, 0 >::check ()) {
731+ auto input_offset_calculator = TrivialOffsetCalculator<traits::arity>();
732+ auto output_offset_calculator = TrivialOffsetCalculator<1 >();
733+ auto loader = memory::LoadWithCast<traits::arity>(iter);
734+ auto storer = memory::StoreWithCast<1 >(iter);
735+ launch_vectorized_templated_kernel<
736+ func_t ,
737+ std::array<char *, ntensors>,
738+ decltype (input_offset_calculator),
739+ decltype (output_offset_calculator),
740+ decltype (loader),
741+ decltype (storer),
742+ float ,
743+ float ,
744+ BFloat16>(
745+ numel,
746+ f,
747+ data,
748+ input_offset_calculator,
749+ output_offset_calculator,
750+ loader,
751+ storer);
752+ return ;
753+ }
754+ }
755+
527756 std::array<ScalarType, ntensors> dtypes;
528757 auto inner_strides = iter.get_inner_strides ();
529758 std::array<int , ntensors> strides;
0 commit comments