[NFC][SYCL][Reduction] Make detail::reduction_parallel_for "callable" by tests (#7405)

aelovikov-intel · web-flow · commit 33db95c75b71 · 2022-11-16T09:18:44.000+01:00
Eliminate the "std::shared_ptr<detail::queue_impl> Queue" param from the
detail::reduction_parallel_for overloads and make them handler's friends
instead, so that I can write tests/benchmarks referencing those directly
to bypass reduction strategy auto-selection.
diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
@@ -2036,8 +2036,8 @@ class __SYCL_EXPORT handler {
       ext::oneapi::experimental::is_property_list<PropertiesT>::value>
   parallel_for(range<Dims> Range, PropertiesT Properties, Reduction Redu,
                _KERNELFUNCPARAM(KernelFunc)) {
-    detail::reduction_parallel_for<KernelName>(*this, MQueue, Range, Properties,
-                                               Redu, std::move(KernelFunc));
+    detail::reduction_parallel_for<KernelName>(*this, Range, Properties, Redu,
+                                               std::move(KernelFunc));
   }
 
   template <typename KernelName = detail::auto_name, typename KernelType,
@@ -2057,7 +2057,7 @@ class __SYCL_EXPORT handler {
       detail::AreAllButLastReductions<RestT...>::value &&
       ext::oneapi::experimental::is_property_list<PropertiesT>::value>
   parallel_for(nd_range<Dims> Range, PropertiesT Properties, RestT &&...Rest) {
-    detail::reduction_parallel_for<KernelName>(*this, MQueue, Range, Properties,
+    detail::reduction_parallel_for<KernelName>(*this, Range, Properties,
                                                std::forward<RestT>(Rest)...);
   }
 
@@ -2519,6 +2519,19 @@ class __SYCL_EXPORT handler {
   template <class FunctorTy>
   friend void detail::reduction::withAuxHandler(handler &CGH, FunctorTy Func);
 
+  template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
+            typename PropertiesT, typename KernelType, typename Reduction>
+  friend void detail::reduction_parallel_for(handler &CGH, range<Dims> Range,
+                                             PropertiesT Properties,
+                                             Reduction Redu,
+                                             KernelType KernelFunc);
+
+  template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
+            typename PropertiesT, typename... RestT>
+  friend void
+  detail::reduction_parallel_for(handler &CGH, nd_range<Dims> NDRange,
+                                 PropertiesT Properties, RestT... Rest);
+
 #ifndef __SYCL_DEVICE_ONLY__
   friend void detail::associateWithHandler(handler &,
                                            detail::AccessorBaseHost *,
diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp
@@ -2304,11 +2304,9 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {
 
 template <typename KernelName, reduction::strategy Strategy, int Dims,
           typename PropertiesT, typename... RestT>
-void reduction_parallel_for(handler &CGH,
-                            std::shared_ptr<detail::queue_impl> Queue,
-                            nd_range<Dims> NDRange, PropertiesT Properties,
-                            RestT... Rest) {
-  NDRangeReduction<Strategy>::template run<KernelName>(CGH, Queue, NDRange,
+void reduction_parallel_for(handler &CGH, nd_range<Dims> NDRange,
+                            PropertiesT Properties, RestT... Rest) {
+  NDRangeReduction<Strategy>::template run<KernelName>(CGH, CGH.MQueue, NDRange,
                                                        Properties, Rest...);
 }
 
@@ -2317,10 +2315,9 @@ reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
 
 template <typename KernelName, reduction::strategy Strategy, int Dims,
           typename PropertiesT, typename KernelType, typename Reduction>
-void reduction_parallel_for(handler &CGH,
-                            std::shared_ptr<detail::queue_impl> Queue,
-                            range<Dims> Range, PropertiesT Properties,
-                            Reduction Redu, KernelType KernelFunc) {
+void reduction_parallel_for(handler &CGH, range<Dims> Range,
+                            PropertiesT Properties, Reduction Redu,
+                            KernelType KernelFunc) {
   // Before running the kernels, check that device has enough local memory
   // to hold local arrays required for the tree-reduction algorithm.
   constexpr bool IsTreeReduction =
@@ -2331,13 +2328,13 @@ void reduction_parallel_for(handler &CGH,
 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
       __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
 #else
-      reduGetMaxNumConcurrentWorkGroups(Queue);
+      reduGetMaxNumConcurrentWorkGroups(CGH.MQueue);
 #endif
 
   // TODO: currently the preferred work group size is determined for the given
   // queue/device, while it is safer to use queries to the kernel pre-compiled
   // for the device.
-  size_t PrefWGSize = reduGetPreferredWGSize(Queue, OneElemSize);
+  size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize);
 
   size_t NWorkItems = Range.size();
   size_t WGSize = std::min(NWorkItems, PrefWGSize);
@@ -2387,8 +2384,8 @@ void reduction_parallel_for(handler &CGH,
       return reduction::strategy::range_basic;
   }();
 
-  reduction_parallel_for<KernelName, StrategyToUse>(
-      CGH, Queue, NDRange, Properties, Redu, UpdatedKernelFunc);
+  reduction_parallel_for<KernelName, StrategyToUse>(CGH, NDRange, Properties,
+                                                    Redu, UpdatedKernelFunc);
 }
 } // namespace detail
 
diff --git a/sycl/include/sycl/reduction_forward.hpp b/sycl/include/sycl/reduction_forward.hpp
@@ -48,18 +48,15 @@ template <typename KernelName,
           reduction::strategy Strategy = reduction::strategy::auto_select,
           int Dims, typename PropertiesT, typename KernelType,
           typename Reduction>
-void reduction_parallel_for(handler &CGH,
-                            std::shared_ptr<detail::queue_impl> Queue,
-                            range<Dims> Range, PropertiesT Properties,
-                            Reduction Redu, KernelType KernelFunc);
+void reduction_parallel_for(handler &CGH, range<Dims> Range,
+                            PropertiesT Properties, Reduction Redu,
+                            KernelType KernelFunc);
 
 template <typename KernelName,
           reduction::strategy Strategy = reduction::strategy::auto_select,
           int Dims, typename PropertiesT, typename... RestT>
-void reduction_parallel_for(handler &CGH,
-                            std::shared_ptr<detail::queue_impl> Queue,
-                            nd_range<Dims> NDRange, PropertiesT Properties,
-                            RestT... Rest);
+void reduction_parallel_for(handler &CGH, nd_range<Dims> NDRange,
+                            PropertiesT Properties, RestT... Rest);
 
 template <typename T> struct IsReduction;
 template <typename FirstT, typename... RestT> struct AreAllButLastReductions;