Adopt templatized approach to avoid having to grab kernel_bundle

joeatodd · joeatodd · commit 0a44c80898e9 · 2024-10-22T16:08:42.000+01:00
Also reorder params to match the rest of syclcompat
diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
@@ -1574,23 +1574,23 @@ public:
 ```
 
 SYCLcompat provides a wrapper API `max_active_work_groups_per_cu` providing
-'work-groups per compute unit' semantics. It takes a `sycl::kernel` object, a
-`sycl::queue`, a work-groups size represented by either `sycl::range<Dim>` or
-`syclcompat::dim3`, and the local memory size in bytes. The function returns
-the maximum number of work-groups which can be executed per compute unit. May
-return *zero* even when below resource limits (i.e. returning `0` does not
-imply the kernel cannot execute).
+'work-groups per compute unit' semantics. It is templated on the kernel name,
+and takes a work-group size (`sycl::range<Dim>` or `syclcompat::dim3`), the
+local memory size in bytes, and an optional `sycl::queue` (the default queue
+if omitted). It returns the maximum number of work-groups which can be executed
+per compute unit. May return *zero* even when below resource limits (i.e.
+returning `0` does not imply the kernel cannot execute).
 ```cpp
 namespace syclcompat{
 template <class KernelName>
-size_t max_active_work_groups_per_cu(KernelName kernel, sycl::queue q,
-                                     syclcompat::dim3 wg_dim3,
-                                     size_t local_mem_size);
+size_t max_active_work_groups_per_cu(
+    syclcompat::dim3 wg_dim3, size_t local_mem_size,
+    sycl::queue queue = syclcompat::get_default_queue());
 
 template <class KernelName, int RangeDim>
-size_t max_active_work_groups_per_cu(KernelName kernel, sycl::queue q,
-                                     sycl::range<RangeDim> wg_range,
-                                     size_t local_mem_size);
+size_t max_active_work_groups_per_cu(
+    sycl::range<RangeDim> wg_range, size_t local_mem_size,
+    sycl::queue queue = syclcompat::get_default_queue());
 }
 ```
 
diff --git a/sycl/include/syclcompat/util.hpp b/sycl/include/syclcompat/util.hpp
@@ -921,39 +921,43 @@ class group : public group_base<dimensions> {
 } // namespace experimental
 
 // Calculate the number of work-groups per compute unit
-// \param [in] kernel SYCL kernel to calculate for
+// \tparam KernelName SYCL kernel name to calculate for
-// \param [in] q SYCL queue used to execute kernel
+// \param [in] queue SYCL queue used to execute kernel
 // \param [in] wg_dim3 dim3 representing work-group shape
 // \param [in] local_mem_size Local memory usage per work-group in bytes
 // \return size_t representing maximum work-groups per compute unit
 template <class KernelName>
-size_t max_active_work_groups_per_cu(KernelName kernel, sycl::queue q,
-                                     syclcompat::dim3 wg_dim3,
-                                     size_t local_mem_size) {
+size_t max_active_work_groups_per_cu(
+    syclcompat::dim3 wg_dim3, size_t local_mem_size,
+    sycl::queue queue = syclcompat::get_default_queue()) {
   namespace syclex = sycl::ext::oneapi::experimental;
   // max_num_work_groups only supports range<3>
+  auto ctx = queue.get_context();
+  auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx);
+  auto kernel = bundle.template get_kernel<KernelName>();
   sycl::range<3> wg_range_3d(wg_dim3);
   size_t max_wgs = kernel.template ext_oneapi_get_info<
-      syclex::info::kernel_queue_specific::max_num_work_groups>(q, wg_range_3d,
+      syclex::info::kernel_queue_specific::max_num_work_groups>(queue, wg_range_3d,
                                                                 local_mem_size);
   size_t max_compute_units =
-      q.get_device().get_info<sycl::info::device::max_compute_units>();
+      queue.get_device().get_info<sycl::info::device::max_compute_units>();
   // Spec dictates max_compute_units > 0, so no need to catch div 0
   return max_wgs / max_compute_units;
 }
 
 // Calculate the number of work-groups per compute unit
-// \param [in] kernel SYCL kernel to calculate for
+// \tparam KernelName SYCL kernel name to calculate for
+// \tparam RangeDim the dimension of the sycl::range
-// \param [in] q SYCL queue used to execute kernel
+// \param [in] queue SYCL queue used to execute kernel
 // \param [in] wg_range SYCL work-group range
 // \param [in] local_mem_size Local memory usage per work-group in bytes
 // \return size_t representing maximum work-groups per compute unit
 template <class KernelName, int RangeDim>
-size_t max_active_work_groups_per_cu(KernelName kernel, sycl::queue q,
-                                     sycl::range<RangeDim> wg_range,
-                                     size_t local_mem_size) {
-  return max_active_work_groups_per_cu(kernel, q, syclcompat::dim3(wg_range),
-                                       local_mem_size);
+size_t max_active_work_groups_per_cu(
+    sycl::range<RangeDim> wg_range, size_t local_mem_size,
+    sycl::queue queue = syclcompat::get_default_queue()) {
+  return max_active_work_groups_per_cu<KernelName>(syclcompat::dim3(wg_range),
+                                                   local_mem_size, queue);
 }
 
 /// If x <= 2, then return a pointer to the default queue;
diff --git a/sycl/test-e2e/syclcompat/util/max_active_work_groups_per_cu.cpp b/sycl/test-e2e/syclcompat/util/max_active_work_groups_per_cu.cpp
@@ -1,4 +1,3 @@
-
 /***************************************************************************
  *
  *  Copyright (C) Codeplay Software Ltd.
@@ -66,23 +65,22 @@ void test_max_active_work_groups_per_cu(sycl::queue q,
   if constexpr (!KernelName<RangeDim>::has_local_mem)
     assert(local_mem_size == 0 && "Bad test setup");
 
-  auto ctx = q.get_context();
-  auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx);
-  auto kernel = bundle.template get_kernel<KernelName<RangeDim>>();
-
-  size_t max_per_cu = syclcompat::max_active_work_groups_per_cu(
-      kernel, q, wg_range, local_mem_size);
-
+  size_t max_per_cu = syclcompat::max_active_work_groups_per_cu<KernelName<RangeDim>>(
+      wg_range, local_mem_size, q);
+ 
   // Check we get the same result passing equivalent dim3
   syclcompat::dim3 wg_dim3{wg_range};
-  size_t max_per_cu_dim3 = syclcompat::max_active_work_groups_per_cu(
-      kernel, q, wg_dim3, local_mem_size);
+  size_t max_per_cu_dim3 = syclcompat::max_active_work_groups_per_cu<KernelName<RangeDim>>(
+      wg_dim3, local_mem_size, q);
   assert(max_per_cu == max_per_cu_dim3);
 
   // Compare w/ reference impl
   size_t max_compute_units =
       q.get_device().get_info<sycl::info::device::max_compute_units>();
   namespace syclex = sycl::ext::oneapi::experimental;
+  auto ctx = q.get_context();
+  auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx);
+  auto kernel = bundle.template get_kernel<KernelName<RangeDim>>();
   size_t max_wgs = kernel.template ext_oneapi_get_info<
       syclex::info::kernel_queue_specific::max_num_work_groups>(
       q, sycl::range<3>{syclcompat::dim3{wg_range}}, local_mem_size);