
Commit f7bb487

Authored by xiaolil1, tye1, and gujinghui
Rebase with supporting int32 index to align with latest PyTorch index operator definition (#3749) (#3808)
* Enable int32 index tensor for index operator

---------

Co-authored-by: Ye Ting <[email protected]>
Co-authored-by: Jinghui <[email protected]>
1 parent c19d733 commit f7bb487
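
For context, a minimal Python sketch of the behavior this change enables: indexing with an int32 index tensor on the xpu device, matching what a recent PyTorch already accepts on CPU. The shapes are illustrative, and the xpu lines assume an XPU build of this extension is available.

import torch
# import intel_extension_for_pytorch  # assumed: registers the xpu device

probs = torch.ones(8, 16)
idx32 = torch.tensor([0, 3, 5], dtype=torch.int32)

out_cpu = probs[idx32]                        # int32 indices accepted on CPU in recent PyTorch
# out_xpu = probs.to("xpu")[idx32.to("xpu")]  # with this commit, the xpu path accepts them too
# assert torch.equal(out_cpu, out_xpu.cpu())
print(out_cpu.shape)  # torch.Size([3, 16])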

File tree

2 files changed: +251 -78 lines


csrc/gpu/aten/operators/Indexing.h

Lines changed: 242 additions & 78 deletions
@@ -346,7 +346,7 @@ static inline void launch_index_kernel(IdxConfig& cfg) {
   auto& queue = dpcppGetCurrentQueue();
   auto cgf = DPCPP_Q_CGF(__cgh) {
     IndexKernel<IdxConfig, TrivialOffCal, known_problem_inner> idx_ker(cfg);
-    __cgh.parallel_for(
+    __cgh.parallel_for<decltype(idx_ker)>(
         sycl::nd_range<2>(cfg.global_size(), cfg.group_size()), idx_ker);
   };
   DPCPP_Q_SUBMIT(queue, cgf);
@@ -510,6 +510,137 @@ static inline void _index_select_kernel(
   }
 }
 
+template <typename func_t, typename index_buf_type>
+struct DpcppSmallIndexKernelImplFunctor {
+  void operator()(sycl::nd_item<1> item_id) const {
+    auto local_id = item_id.get_local_id(0);
+    auto group_id = item_id.get_group(0);
+
+    // construct a indices_size table on SLM
+    for (int64_t local_index = local_id; local_index < indices_size;
+         local_index += wgroup_size) {
+      int64_t offset = 0;
+      for (size_t i = 0; i < num_indices; i++) {
+        // handle int32 index tensor according to the indice_size_bytes.
+        // we didn't use template parametor to avoid too many kernels' creation
+        // with numbers of input datatypes.
+        if (indice_size_bytes == 4) {
+          int32_t index =
+              *(int32_t*)(index_ptrs[i] + local_index * indice_size_bytes);
+          SYCL_KERNEL_ASSERT(
+              index >= -sizes[i] && index < sizes[i] && "index out of bounds");
+          if (index < 0) {
+            index += sizes[i];
+          }
+          offset += index * strides[i];
+        } else {
+          int64_t index =
+              *(int64_t*)(index_ptrs[i] + local_index * indice_size_bytes);
+          SYCL_KERNEL_ASSERT(
+              index >= -sizes[i] && index < sizes[i] && "index out of bounds");
+          if (index < 0) {
+            index += sizes[i];
+          }
+          offset += index * strides[i];
+        }
+      }
+      local_offset[local_index] = offset;
+    }
+
+    // calculate the number of workloads on each group
+    auto group_linear_id = group_id * group_numel;
+    auto group_numel_range = group_numel;
+    if (group_num_tail && group_id >= group_num) {
+      group_linear_id =
+          group_num * group_numel + (group_id - group_num) * group_numel_tail;
+      group_numel_range = group_numel_tail;
+    }
+    auto out_ptr = out_data;
+    auto in_ptr = in_data;
+    item_id.barrier(sycl::access::fence_space::local_space);
+
+    // compute the in/out/indices offsets and perform memory copy
+    for (int64_t local_index = local_id; local_index < group_numel_range;
+         local_index += wgroup_size) {
+      auto linear_id = group_linear_id + local_index;
+      auto out_offset = linear_id * element_size_bytes;
+      auto src_linear_id = linear_id / indices_size;
+      int64_t in_offset = 0;
+      for (int i = num_non_indices - 1; i > 0; --i) {
+        in_offset += (src_linear_id % src_sizes[i]) * src_strides[i];
+        src_linear_id /= src_sizes[i];
+      }
+      in_offset += src_linear_id * src_strides0;
+
+      auto offset = local_offset[local_index % indices_size];
+      f(out_ptr + out_offset, in_ptr + in_offset, offset);
+    }
+  }
+  DpcppSmallIndexKernelImplFunctor(
+      const func_t f_,
+      int64_t indices_size_,
+      int64_t group_num_tail_,
+      int64_t group_num_,
+      int64_t group_numel_,
+      int64_t group_numel_tail_,
+      int64_t wgroup_size_,
+      size_t num_non_indices_,
+      at::detail::Array<int64_t, MAX_TENSORINFO_DIMS> src_sizes_,
+      at::detail::Array<int64_t, MAX_TENSORINFO_DIMS> src_strides_,
+      int64_t src_strides0_,
+      size_t num_indices_,
+      at::detail::Array<int64_t, MAX_TENSORINFO_DIMS> sizes_,
+      at::detail::Array<int64_t, MAX_TENSORINFO_DIMS> strides_,
+      int64_t element_size_bytes_,
+      int64_t indice_size_bytes_,
+      char* out_data_,
+      char* in_data_,
+      at::detail::Array<index_buf_type, MAX_TENSORINFO_DIMS> index_ptrs_,
+      dpcpp_local_acc_t<int64_t, 1> local_offset_)
+      : f(f_),
+        indices_size(indices_size_),
+        group_num_tail(group_num_tail_),
+        group_num(group_num_),
+        group_numel(group_numel_),
+        group_numel_tail(group_numel_tail_),
+        wgroup_size(wgroup_size_),
+        num_non_indices(num_non_indices_),
+        src_sizes(src_sizes_),
+        src_strides(src_strides_),
+        src_strides0(src_strides0_),
+        num_indices(num_indices_),
+        sizes(sizes_),
+        strides(strides_),
+        element_size_bytes(element_size_bytes_),
+        indice_size_bytes(indice_size_bytes_),
+        out_data(out_data_),
+        in_data(in_data_),
+        index_ptrs(index_ptrs_),
+        local_offset(local_offset_) {}
+
+ private:
+  const func_t f;
+  int64_t indices_size;
+  int64_t group_num_tail;
+  int64_t group_num;
+  int64_t group_numel;
+  int64_t group_numel_tail;
+  int64_t wgroup_size;
+  size_t num_non_indices;
+  at::detail::Array<int64_t, MAX_TENSORINFO_DIMS> src_sizes;
+  at::detail::Array<int64_t, MAX_TENSORINFO_DIMS> src_strides;
+  int64_t src_strides0;
+  size_t num_indices;
+  at::detail::Array<int64_t, MAX_TENSORINFO_DIMS> sizes;
+  at::detail::Array<int64_t, MAX_TENSORINFO_DIMS> strides;
+  int64_t element_size_bytes;
+  int64_t indice_size_bytes;
+  char* out_data;
+  char* in_data;
+  at::detail::Array<index_buf_type, MAX_TENSORINFO_DIMS> index_ptrs;
+  dpcpp_local_acc_t<int64_t, 1> local_offset;
+};
+
 // DPCPP suggest: it’s possible (and even desirable) to oversubscribe tasks to
 // device;
 constexpr int OVER_SUBSCRIBE_DSS_FACTOR = 16;
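
As an editorial aside (not part of the diff): the gather arithmetic this functor performs — a base offset from the non-indexed leading dims plus index * stride for the indexed dims, with negative indices wrapped by the dim size — can be mirrored in a few lines of plain CPU Python. The shapes and index values below are illustrative.

import torch

x = torch.arange(2 * 3 * 4).reshape(2, 3, 4)        # contiguous input
idx = torch.tensor([3, 0, -1], dtype=torch.int32)   # indices into the last dim

ref = x[:, :, idx]                                  # advanced-indexing reference

# Manual gather mirroring the kernel's offset math:
#   offset = sum(non-indexed coords * their strides) + wrapped_index * indexed-dim stride
flat = x.reshape(-1)
s0, s1, s2 = x.stride()
out = torch.empty(2, 3, idx.numel(), dtype=x.dtype)
for b in range(2):
    for r in range(3):
        for k, j in enumerate(idx.tolist()):
            j = j + 4 if j < 0 else j               # same wrap as `index += sizes[i]`
            out[b, r, k] = flat[b * s0 + r * s1 + j * s2]

assert torch.equal(ref, out)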
@@ -572,64 +703,103 @@ void dpcpp_small_index_kernel_impl(
     }
 
     dpcpp_local_acc_t<int64_t, 1> local_offset(indices_size, __cgh);
-    auto kfn = DPCPP_Q_KFN(sycl::nd_item<1> item_id) {
-      auto local_id = item_id.get_local_id(0);
-      auto group_id = item_id.get_group(0);
-
-      // construct a indices_size table on SLM
-      for (int64_t local_index = local_id; local_index < indices_size;
-           local_index += wgroup_size) {
-        int64_t offset = 0;
-        for (size_t i = 0; i < num_indices; i++) {
-          int64_t index =
-              *(int64_t*)(index_ptrs[i] + local_index * indice_size_bytes);
-          SYCL_KERNEL_ASSERT(
-              index >= -sizes[i] && index < sizes[i] && "index out of bounds");
-          if (index < 0) {
-            index += sizes[i];
-          }
-          offset += index * strides[i];
-        }
-        local_offset[local_index] = offset;
-      }
-
-      // calculate the number of workloads on each group
-      auto group_linear_id = group_id * group_numel;
-      auto group_numel_range = group_numel;
-      if (group_num_tail && group_id >= group_num) {
-        group_linear_id =
-            group_num * group_numel + (group_id - group_num) * group_numel_tail;
-        group_numel_range = group_numel_tail;
-      }
-      auto out_ptr = out_data;
-      auto in_ptr = in_data;
-      item_id.barrier(sycl::access::fence_space::local_space);
-
-      // compute the in/out/indices offsets and perform memory copy
-      for (int64_t local_index = local_id; local_index < group_numel_range;
-           local_index += wgroup_size) {
-        auto linear_id = group_linear_id + local_index;
-        auto out_offset = linear_id * element_size_bytes;
-        auto src_linear_id = linear_id / indices_size;
-        int64_t in_offset = 0;
-        for (int i = num_non_indices - 1; i > 0; --i) {
-          in_offset += (src_linear_id % src_sizes[i]) * src_strides[i];
-          src_linear_id /= src_sizes[i];
-        }
-        in_offset += src_linear_id * src_strides0;
-
-        auto offset = local_offset[local_index % indices_size];
-        f(out_ptr + out_offset, in_ptr + in_offset, offset);
-      }
-    };
-    __cgh.parallel_for(
+    DpcppSmallIndexKernelImplFunctor<func_t, index_buf_type> kfn(
+        f,
+        indices_size,
+        group_num_tail,
+        group_num,
+        group_numel,
+        group_numel_tail,
+        wgroup_size,
+        num_non_indices,
+        src_sizes,
+        src_strides,
+        src_strides0,
+        num_indices,
+        sizes,
+        strides,
+        element_size_bytes,
+        indice_size_bytes,
+        out_data,
+        in_data,
+        index_ptrs,
+        local_offset);
+    __cgh.parallel_for<decltype(kfn)>(
         sycl::nd_range<1>(
             sycl::range<1>(global_size), sycl::range<1>(wgroup_size)),
         kfn);
   };
   DPCPP_Q_SUBMIT(dpcpp_queue, cgf);
 }
 
+template <
+    typename func_t,
+    typename index_buf_type,
+    typename OffsetCalculatorType>
+struct DpcppIndexKernelImplFunctor {
+  void operator()(sycl::item<1> item_id) const {
+    auto linear_idx = item_id.get_linear_id();
+    auto offsets = offset_calc.get(linear_idx);
+    auto out_ptr = out_data + offsets[0];
+    auto in_ptr = in_data + offsets[1];
+    int64_t offset = 0;
+    //#pragma unroll
+    for (size_t i = 0; i < num_indices; i++) {
+      // handle int32 index tensor according to the indice_size_bytes.
+      // we didn't use template parametor to avoid too many kernels' creation
+      // with numbers of input datatypes.
+      if (indice_size_bytes == 4) {
+        int32_t index = *(int32_t*)(index_ptrs[i] + offsets[2]);
+        SYCL_KERNEL_ASSERT(
+            index >= -sizes[i] && index < sizes[i] && "index out of bounds");
+        if (index < 0) {
+          index += sizes[i];
+        }
+        offset += index * strides[i];
+      } else {
+        int64_t index = *(int64_t*)(index_ptrs[i] + offsets[2]);
+        SYCL_KERNEL_ASSERT(
+            index >= -sizes[i] && index < sizes[i] && "index out of bounds");
+        if (index < 0) {
+          index += sizes[i];
+        }
+        offset += index * strides[i];
+      }
+    }
+    f(out_ptr, in_ptr, offset);
+  }
+  DpcppIndexKernelImplFunctor(
+      const func_t f_,
+      OffsetCalculatorType offset_calc_,
+      int64_t indice_size_bytes_,
+      char* out_data_,
+      char* in_data_,
+      size_t num_indices_,
+      at::detail::Array<index_buf_type, MAX_TENSORINFO_DIMS> index_ptrs_,
+      at::detail::Array<int64_t, MAX_TENSORINFO_DIMS> sizes_,
+      at::detail::Array<int64_t, MAX_TENSORINFO_DIMS> strides_)
+      : f(f_),
+        offset_calc(offset_calc_),
+        indice_size_bytes(indice_size_bytes_),
+        out_data(out_data_),
+        in_data(in_data_),
+        num_indices(num_indices_),
+        index_ptrs(index_ptrs_),
+        sizes(sizes_),
+        strides(strides_) {}
+
+ private:
+  const func_t f;
+  OffsetCalculatorType offset_calc;
+  int64_t indice_size_bytes;
+  char* out_data;
+  char* in_data;
+  size_t num_indices;
+  at::detail::Array<index_buf_type, MAX_TENSORINFO_DIMS> index_ptrs;
+  at::detail::Array<int64_t, MAX_TENSORINFO_DIMS> sizes;
+  at::detail::Array<int64_t, MAX_TENSORINFO_DIMS> strides;
+};
+
 template <typename func_t>
 void dpcpp_index_kernel_impl(
     TensorIterator& iter,
@@ -645,6 +815,8 @@ void dpcpp_index_kernel_impl(
     strides[i] = index_stride[i];
   }
 
+  int64_t indice_size_bytes = iter.tensor(2).element_size();
+
   auto& dpcpp_queue = dpcppGetCurrentQueue();
 
   auto cgf = DPCPP_Q_CGF(__cgh) {
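
The runtime branch keys off the index tensor's element size rather than a template parameter; a quick Python sketch of the distinction it relies on (editorial, not part of the diff):

import torch

print(torch.tensor([0], dtype=torch.int32).element_size())  # 4 -> int32 path
print(torch.tensor([0], dtype=torch.int64).element_size())  # 8 -> int64 path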
@@ -657,25 +829,17 @@ void dpcpp_index_kernel_impl(
     }
 
     auto offset_calc = make_offset_calculator<3>(iter);
-    auto kfn = DPCPP_Q_KFN(sycl::item<1> item_id) {
-      auto linear_idx = item_id.get_linear_id();
-      auto offsets = offset_calc.get(linear_idx);
-      auto out_ptr = out_data + offsets[0];
-      auto in_ptr = in_data + offsets[1];
-      int64_t offset = 0;
-      //#pragma unroll
-      for (size_t i = 0; i < num_indices; i++) {
-        int64_t index = *(int64_t*)(index_ptrs[i] + offsets[2]);
-        SYCL_KERNEL_ASSERT(
-            index >= -sizes[i] && index < sizes[i] && "index out of bounds");
-        if (index < 0) {
-          index += sizes[i];
-        }
-        offset += index * strides[i];
-      }
-      f(out_ptr, in_ptr, offset);
-    };
-    __cgh.parallel_for(sycl::range</*dim=*/1>(numel), kfn);
+    DpcppIndexKernelImplFunctor<func_t, index_buf_type, decltype(offset_calc)>
+        kfn(f,
+            offset_calc,
+            indice_size_bytes,
+            out_data,
+            in_data,
+            num_indices,
+            index_ptrs,
+            sizes,
+            strides);
+    __cgh.parallel_for<decltype(kfn)>(sycl::range</*dim=*/1>(numel), kfn);
   };
   DPCPP_Q_SUBMIT(dpcpp_queue, cgf);
 }
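
With both width branches in place, int32 and int64 index tensors are expected to select the same elements. A minimal CPU-only check, assuming a recent PyTorch that accepts int32 indices (editorial sketch, not part of the diff):

import torch

x = torch.randn(6, 5)
idx64 = torch.tensor([4, -1, 2])        # default int64 indices
idx32 = idx64.to(torch.int32)

assert torch.equal(x[idx64], x[idx32])  # same rows, same negative-index wrapping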
@@ -700,19 +864,19 @@ void dpcpp_index_kernel(
       num_indices == static_cast<size_t>(iter.ntensors()) - 2);
   TORCH_INTERNAL_ASSERT(num_indices <= MAX_TENSORINFO_DIMS);
 
-  // the dpcpp_small_index_kernel_impl is applied for last several successive
-  // dims indexing of an input tensor Taking 3-dims tensor input
+  // the dpcpp_small_index_kernel_impl is applied for last several
+  // successive dims indexing of an input tensor Taking 3-dims tensor input
   // (input.shape=[x,y,z]) for example: input[:,:,idx] or input[:,idx1,idx2]
   // when input tensor satisfies the following conditions, the
-  // small_index_kernel path will be selected: 1.there are common indices such
-  // as input[:,:,idx] and input[:,idx1,idx2] instead of
+  // small_index_kernel path will be selected: 1.there are common indices
+  // such as input[:,:,idx] and input[:,idx1,idx2] instead of
   // input[idx0,idx1,idx2], input[idx0,idx1,:], input[idx0,:,idx2],
   // input[idx0,:,:], input[:,idx1,:]
   // 2.the common indices numel should larger than 2 times of the
   // dpcppMaxComputeUnitSize (then we can get memory access benifit) 3.the
-  // workloads in each group should larger than the maximum number of workitem
-  // (ensure all the workitem activate) 4.the indices_table size should
-  // satisfied the SLM limit condition
+  // workloads in each group should larger than the maximum number of
+  // workitem (ensure all the workitem activate) 4.the indices_table size
+  // should satisfied the SLM limit condition
 
   // check whether the current case satisfying the condition 1
   // for 3-dims input:
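
A small Python illustration of condition 1 from the comment above, with hypothetical shapes: patterns whose leading dims are not indexed are candidates for the small-index path, while leading-dim indexing falls back to the general path. Whether the small path is actually taken still depends on conditions 2-4 (editorial sketch, not part of the diff).

import torch

x = torch.randn(4, 5, 6)
i = torch.tensor([0, 2], dtype=torch.int32)

a = x[:, :, i]   # common leading dims -> small-index-kernel candidate, shape (4, 5, 2)
b = x[:, i, i]   # common leading dim  -> candidate, shape (4, 2)
c = x[i]         # leading dim indexed -> general path, shape (2, 5, 6)
d = x[i, :, i]   # no common leading dims -> general path, shape (2, 5)
print(a.shape, b.shape, c.shape, d.shape)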

tests/gpu/regression/test_indexing.py

Lines changed: 9 additions & 0 deletions
@@ -67,3 +67,12 @@ def test_index_small_shape(self, dtype=torch.float):
         result_cpu = vertices_cpu[vert_filter_cpu_rand]
         result_xpu = vertices_xpu[vert_filter_xpu_rand]
         self.assertEqual(result_cpu, result_xpu.cpu())
+
+    def test_index_int32(self):
+        probs = torch.ones((256, 50272), dtype=torch.float32)
+        indice = torch.range(0, 255, dtype=torch.int32)
+
+        out_cpu = probs[indice]
+        out_xpu = probs.xpu()[indice.xpu()]
+
+        self.assertEqual(out_xpu.to("cpu"), out_cpu)
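
For reference, the same check written with torch.arange, since torch.range is deprecated in current PyTorch; this is an editorial sketch, not part of the committed test.

import torch

probs = torch.ones((256, 50272), dtype=torch.float32)
indice = torch.arange(0, 256, dtype=torch.int32)  # values 0..255, as torch.range(0, 255) yields

out_cpu = probs[indice]
# out_xpu = probs.xpu()[indice.xpu()]             # requires an XPU device with this extension
# assert torch.equal(out_cpu, out_xpu.cpu())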
