Skip to content

Commit e71d223

Browse files
authored
Use syclMaxWorkGroupSize instead of dpcppMaxWorkGroupSize in OP take (#4925) (#4953)
* Use syclMaxWorkGroupSize instead of dpcppMaxWorkGroupSize in OP take
* Fix comments
1 parent 141ac2f commit e71d223

File tree

3 files changed

+24
-9
lines changed

3 files changed

+24
-9
lines changed

csrc/gpu/aten/operators/Indexing.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1366,10 +1366,6 @@ void take_dpcpp(Tensor& dst, const Tensor& src, const Tensor& index) {
13661366
idx_info.collapseDims();
13671367

13681368
auto& dpcpp_queue = dpcppGetCurrentQueue();
1369-
auto dev_id = dpcppGetDeviceIdOfCurrentQueue();
1370-
auto wgroup_size = dpcppMaxWorkGroupSize(dev_id);
1371-
auto wgroup_range = (dst_num_elem + wgroup_size - 1) / wgroup_size;
1372-
13731369
auto cgf = DPCPP_Q_CGF(cgh) {
13741370
auto src_data = src.data_ptr<scalar_t>();
13751371
auto dst_data = dst.data_ptr<scalar_t>();
@@ -1384,6 +1380,8 @@ void take_dpcpp(Tensor& dst, const Tensor& src, const Tensor& index) {
13841380
src_data,
13851381
dst_data,
13861382
idx_data);
1383+
auto wgroup_size = dpcppMaxWorkGroupSize(kfn);
1384+
auto wgroup_range = (dst_num_elem + wgroup_size - 1) / wgroup_size;
13871385

13881386
cgh.parallel_for<decltype(kfn)>(
13891387
sycl::nd_range<1>({wgroup_range * wgroup_size}, {wgroup_size}), kfn);

csrc/gpu/runtime/Utils.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,28 @@ static inline sycl::queue& dpcppGetCurrentQueue() {
2121
return at::xpu::getCurrentXPUStream().queue();
2222
}
2323

24+
template <class KernelClass>
25+
static int64_t dpcppMaxWorkGroupSize(
26+
at::DeviceIndex dev_id = dpcppGetDeviceIdOfCurrentQueue()) {
27+
auto q = c10::xpu::getCurrentXPUStream(dev_id).queue();
28+
auto ctx = q.get_context();
29+
auto dev = q.get_device();
30+
31+
auto kid = ::sycl::get_kernel_id<KernelClass>();
32+
auto kbundle =
33+
::sycl::get_kernel_bundle<::sycl::bundle_state::executable>(ctx, {kid});
34+
35+
::sycl::kernel k = kbundle.get_kernel(kid);
36+
return k.get_info<::sycl::info::kernel_device_specific::work_group_size>(dev);
37+
}
38+
39+
template <class KernelClass>
40+
static int64_t dpcppMaxWorkGroupSize(
41+
KernelClass /*kfn*/,
42+
at::DeviceIndex dev_id = dpcppGetDeviceIdOfCurrentQueue()) {
43+
return dpcppMaxWorkGroupSize<KernelClass>(dev_id);
44+
}
45+
2446
static inline int64_t dpcppMaxWorkGroupSize(
2547
DeviceId dev_id = dpcppGetDeviceIdOfCurrentQueue()) {
2648
auto* dev_prop = at::xpu::getDeviceProperties(dev_id);

tests/gpu/examples/test_take.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,12 @@
22
from torch.testing._internal.common_utils import TestCase
33

44
import intel_extension_for_pytorch # noqa
5-
import pytest
65

76
cpu_device = torch.device("cpu")
87
dpcpp_device = torch.device("xpu")
98

109

1110
class TestNNMethod(TestCase):
12-
@pytest.mark.skip(
13-
reason="PT2.5: Total number of work-items in a work-group cannot exceed 512 for this kernel \
14-
-54 (PI_ERROR_INVALID_WORK_GROUP_SIZE)"
15-
)
1611
def test_take(self, dtype=torch.float):
1712
src = torch.rand(2, 3)
1813
print(src)

0 commit comments

Comments (0)