
Commit 1cb818b

carlobertolli authored and pruthvistony committed
[ROCM] Fix in-place aten sum with specialized templated kernels. (pytorch#151230)
We noticed a regression when doing aten.sum in-place (a += b) where the dtype of the output tensor is not the same as the functor's compute type.

Co-authored-by: Jerry Mannil <[email protected]>
Pull Request resolved: pytorch#151230
Approved by: https://github.com/jeffdaily
1 parent 6e62a7c commit 1cb818b
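
For context, a minimal repro along the lines the commit message describes (a sketch, not taken from the PR; the shape, the use of mean, and running on a ROCm build of libtorch are all illustrative assumptions): an in-place accumulation whose output dtype (bf16) is narrower than the binary functor's float compute type.

// Hypothetical repro sketch (not from the PR): in-place a += b where the
// output dtype (BFloat16) differs from the functor's compute type (float).
#include <torch/torch.h>
#include <iostream>

int main() {
  auto opts = torch::TensorOptions().device(torch::kCUDA);
  auto a = torch::ones({1 << 20}, opts.dtype(torch::kBFloat16));
  auto b = torch::ones({1 << 20}, opts.dtype(torch::kFloat));
  a += b;  // add computes in float, must store the result back as bf16
  std::cout << a.mean(torch::kFloat) << std::endl;  // expected: 2
  return 0;
}

Before this fix, the specialized ROCm kernel selected for this pattern stored the float results with the wrong element type and stride, corrupting the bf16 output.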

File tree: 2 files changed (+31, -17 lines)


aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 29 additions & 15 deletions
@@ -720,28 +720,41 @@ struct check_binary_functor_types_for_specialization<
 };
 
 // The following is a list of type specializations for vectorized_templated
-// elementwise kernel. It refers to the first and second runtime types of the
-// arguments of a binary functor.
-
+// elementwise kernel. The three types refer to runtime types of the output
+// tensor, first tensor argument, and the second tensor argument used for a
+// binary functor.
 constexpr std::array rt_binary_specializations = {
-    std::array<c10::ScalarType, 2>(
+    std::array<c10::ScalarType, 3>(
         {c10::CppTypeToScalarType<float>::value,
+         c10::CppTypeToScalarType<float>::value,
          c10::CppTypeToScalarType<BFloat16>::value}),
-    std::array<c10::ScalarType, 2>(
+    std::array<c10::ScalarType, 3>(
+        {c10::CppTypeToScalarType<float>::value,
+         c10::CppTypeToScalarType<BFloat16>::value,
+         c10::CppTypeToScalarType<float>::value}),
+    std::array<c10::ScalarType, 3>(
         {c10::CppTypeToScalarType<BFloat16>::value,
+         c10::CppTypeToScalarType<BFloat16>::value,
          c10::CppTypeToScalarType<float>::value}),
-    std::array<c10::ScalarType, 2>(
+    std::array<c10::ScalarType, 3>(
         {c10::CppTypeToScalarType<float>::value,
+         c10::CppTypeToScalarType<float>::value,
          c10::CppTypeToScalarType<Half>::value}),
-    std::array<c10::ScalarType, 2>(
+    std::array<c10::ScalarType, 3>(
+        {c10::CppTypeToScalarType<float>::value,
+         c10::CppTypeToScalarType<Half>::value,
+         c10::CppTypeToScalarType<float>::value}),
+    std::array<c10::ScalarType, 3>(
         {c10::CppTypeToScalarType<Half>::value,
+         c10::CppTypeToScalarType<Half>::value,
          c10::CppTypeToScalarType<float>::value})};
 
 bool check_binary_rt_types_for_specialization(TensorIteratorBase& iter) {
   if (iter.ninputs() != 2)
     return false;
   for (auto spec : rt_binary_specializations)
-    if (iter.input_dtype(0) == spec[0] && iter.input_dtype(1) == spec[1])
+    if (iter.dtype(0) == spec[0] && iter.input_dtype(0) == spec[1] &&
+        iter.input_dtype(1) == spec[2])
       return true;
   return false;
 }
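
To make the widened table concrete, here is a standalone sketch (stand-in enum and names, not PyTorch source) of how a (output, input0, input1) dtype triple is matched. For example, in-place a += b with a in bf16 and b in float yields the triple (BFloat16, BFloat16, Float), the third entry, which the old two-element table could not distinguish from a float-output case.

// Standalone sketch of the triple matching (stand-in types, not PyTorch source).
#include <algorithm>
#include <array>

enum class ScalarType { Float, Half, BFloat16 };

constexpr std::array<std::array<ScalarType, 3>, 6> specs{{
    {ScalarType::Float, ScalarType::Float, ScalarType::BFloat16},
    {ScalarType::Float, ScalarType::BFloat16, ScalarType::Float},
    {ScalarType::BFloat16, ScalarType::BFloat16, ScalarType::Float},
    {ScalarType::Float, ScalarType::Float, ScalarType::Half},
    {ScalarType::Float, ScalarType::Half, ScalarType::Float},
    {ScalarType::Half, ScalarType::Half, ScalarType::Float},
}};

// out: output tensor dtype; in0/in1: runtime dtypes of the two inputs.
bool is_specialized(ScalarType out, ScalarType in0, ScalarType in1) {
  return std::any_of(specs.begin(), specs.end(), [&](const auto& s) {
    return s[0] == out && s[1] == in0 && s[2] == in1;
  });
}

// e.g. a(bf16) += b(float): is_specialized(BFloat16, BFloat16, Float) == true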
@@ -756,6 +769,7 @@ struct type_specialized_kernel_launcher {
       typename loader_t,
       typename storer_t>
   static void apply(
+      ScalarType ret_t,
       ScalarType arg0_t,
       ScalarType arg1_t,
       int64_t numel,
@@ -765,22 +779,22 @@ struct type_specialized_kernel_launcher {
       out_calc_t output_offset_calculator,
       loader_t loader,
       storer_t storer) {
-    using traits = function_traits<func_t>;
-    using return_t = typename traits::result_type;
-    if (arg0_t == rt_binary_specializations[arg_index][0] &&
-        arg1_t == rt_binary_specializations[arg_index][1])
+    if (ret_t == rt_binary_specializations[arg_index][0] &&
+        arg0_t == rt_binary_specializations[arg_index][1] &&
+        arg1_t == rt_binary_specializations[arg_index][2])
       launch_vectorized_templated_kernel<
           func_t,
           array_t,
           inp_calc_t,
           out_calc_t,
           loader_t,
           storer_t,
-          return_t,
           decltype(c10::impl::ScalarTypeToCPPType<
                    rt_binary_specializations[arg_index][0]>::t),
           decltype(c10::impl::ScalarTypeToCPPType<
-                   rt_binary_specializations[arg_index][1]>::t)>(
+                   rt_binary_specializations[arg_index][1]>::t),
+          decltype(c10::impl::ScalarTypeToCPPType<
+                   rt_binary_specializations[arg_index][2]>::t)>(
           numel,
           f,
           data,
@@ -820,7 +834,6 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
 #ifdef USE_ROCM
   // Attempt to call specialized vectorized elementwise kernel
   // that enables interleaving.
-
   if (check_binary_rt_types_for_specialization(iter) &&
       memory::can_vectorize_up_to<func_t>(data) > 1) {
     // constexpr to reduce the amount of kernels generated for
@@ -848,6 +861,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
           type_specialized_kernel_launcher,
           rt_binary_specializations.size()>::
           with_args(
+              iter.dtype(0),
               iter.input_dtype(0),
               iter.input_dtype(1),
               numel,

aten/src/ATen/native/cuda/MemoryAccess.cuh

Lines changed: 2 additions & 2 deletions
@@ -407,8 +407,8 @@ struct vectorized_templated {
   // float(float,bfloat16) and functor add on float(float,float).
   template <typename scalar_t>
   __device__ inline void store(scalar_t* from, int idx) {
-    using vec_t = aligned_vector<scalar_t, vec_size>;
-    scalar_t* to = reinterpret_cast<scalar_t*>(data[0]) + block_work_size * idx;
+    using vec_t = aligned_vector<CastToT, vec_size>;
+    CastToT* to = reinterpret_cast<CastToT*>(data[0]) + block_work_size * idx;
     vec_t* to_ = reinterpret_cast<vec_t*>(to);
     int thread_idx = threadIdx.x;
 #pragma unroll
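
This MemoryAccess.cuh change is the heart of the fix: store() previously indexed the output pointer with the functor's scalar_t (e.g. 4-byte float), while data[0] actually points at storage of the output type CastToT (e.g. 2-byte bf16), so writes landed at the wrong stride with the wrong bit pattern. A minimal CUDA sketch of the corrected pattern (types, names, and the scalar loop are illustrative; the real code performs a single vectorized store through aligned_vector<CastToT, vec_size>):

// Illustrative CUDA sketch (not the PyTorch code): store float results into a
// bf16 output buffer by converting and indexing in the *output* element type.
#include <cuda_bf16.h>

template <int vec_size>
__device__ void store_cast(void* out_base, int offset, const float* results) {
  // Index in units of the output element type (2-byte bf16), not float.
  __nv_bfloat16* to = reinterpret_cast<__nv_bfloat16*>(out_base) + offset;
#pragma unroll
  for (int i = 0; i < vec_size; ++i) {
    to[i] = __float2bfloat16(results[i]);  // convert before storing
  }
}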
