@@ -834,10 +834,6 @@ using __sycl_init_mem_for =
834834 std::conditional_t <std::is_same_v<KernelName, auto_name>, auto_name,
835835 reduction::InitMemKrn<KernelName>>;
836836
837- __SYCL_EXPORT void
838- addCounterInit (handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
839- std::shared_ptr<int > &Counter);
840-
841837template <typename T, class BinaryOperation , int Dims, size_t Extent,
842838 bool ExplicitIdentity, typename RedOutVar>
843839class reduction_impl_algo {
@@ -995,7 +991,7 @@ class reduction_impl_algo {
995991 accessor Mem{*Buf, CGH};
996992 Func (Mem);
997993
998- reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
994+ HandlerAccess::postProcess (CGH, [&](handler &CopyHandler) {
999995 // MSVC (19.32.31329) has problems compiling the line below when used
1000996 // as a host compiler in c++17 mode (but not in c++latest)
1001997 // accessor Mem{*Buf, CopyHandler};
@@ -1071,19 +1067,16 @@ class reduction_impl_algo {
10711067 // On discrete (vs. integrated) GPUs it's faster to initialize memory with an
10721068 // extra kernel than copy it from the host.
10731069 auto getGroupsCounterAccDiscrete (handler &CGH) {
1074- queue q = createSyclObjFromImpl<queue>(CGH.MQueue );
1075- device Dev = q.get_device ();
1070+ queue q = CGH.getQueue ();
10761071 auto Deleter = [=](auto *Ptr) { free (Ptr, q); };
10771072
10781073 std::shared_ptr<int > Counter (malloc_device<int >(1 , q), Deleter);
10791074 CGH.addReduction (Counter);
10801075
1081- #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
1082- std::shared_ptr<detail::queue_impl> Queue (CGH.MQueue );
1083- #else
1084- std::shared_ptr<detail::queue_impl> &Queue = CGH.MQueue ;
1085- #endif
1086- addCounterInit (CGH, Queue, Counter);
1076+ HandlerAccess::preProcess (CGH, q,
1077+ [Counter = Counter.get ()](handler &AuxHandler) {
1078+ AuxHandler.memset (Counter, 0 , sizeof (int ));
1079+ });
10871080
10881081 return Counter.get ();
10891082 }
@@ -1178,20 +1171,6 @@ auto make_reduction(RedOutVar RedVar, RestTy &&...Rest) {
11781171
11791172namespace reduction {
11801173inline void finalizeHandler (handler &CGH) { CGH.finalize (); }
1181- template <class FunctorTy > void withAuxHandler (handler &CGH, FunctorTy Func) {
1182- #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
1183- detail::EventImplPtr E = CGH.finalize ();
1184- #else
1185- event E = CGH.finalize ();
1186- #endif
1187- handler AuxHandler (CGH.MQueue , CGH.eventNeeded ());
1188- if (!createSyclObjFromImpl<queue>(CGH.MQueue ).is_in_order ())
1189- AuxHandler.depends_on (E);
1190- AuxHandler.copyCodeLoc (CGH);
1191- Func (AuxHandler);
1192- CGH.MLastEvent = AuxHandler.finalize ();
1193- return ;
1194- }
11951174} // namespace reduction
11961175
11971176// This method is used for implementation of parallel_for accepting 1 reduction.
@@ -1785,7 +1764,7 @@ struct NDRangeReduction<
17851764 " the reduction." );
17861765 size_t NWorkItems = NDRange.get_group_range ().size ();
17871766 while (NWorkItems > 1 ) {
1788- reduction::withAuxHandler (CGH, [&](handler &AuxHandler) {
1767+ HandlerAccess::postProcess (CGH, [&](handler &AuxHandler) {
17891768 size_t NElements = Reduction::num_elements;
17901769 size_t NWorkGroups;
17911770 size_t WGSize = reduComputeWGSize (NWorkItems, MaxWGSize, NWorkGroups);
@@ -1837,7 +1816,7 @@ struct NDRangeReduction<
18371816 } // end while (NWorkItems > 1)
18381817
18391818 if constexpr (Reduction::is_usm) {
1840- reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
1819+ HandlerAccess::postProcess (CGH, [&](handler &CopyHandler) {
18411820 reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
18421821 });
18431822 }
@@ -1969,7 +1948,7 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
19691948 size_t WGSize = reduComputeWGSize (NWorkItems, MaxWGSize, NWorkGroups);
19701949
19711950 auto Rest = [&](auto KernelTag) {
1972- reduction::withAuxHandler (CGH, [&](handler &AuxHandler) {
1951+ HandlerAccess::postProcess (CGH, [&](handler &AuxHandler) {
19731952 // We can deduce IsOneWG from the tag type.
19741953 constexpr bool IsOneWG =
19751954 std::is_same_v<std::remove_reference_t <decltype (KernelTag)>,
@@ -2650,7 +2629,7 @@ template <> struct NDRangeReduction<reduction::strategy::multi> {
26502629
26512630 size_t NWorkItems = NDRange.get_group_range ().size ();
26522631 while (NWorkItems > 1 ) {
2653- reduction::withAuxHandler (CGH, [&](handler &AuxHandler) {
2632+ HandlerAccess::postProcess (CGH, [&](handler &AuxHandler) {
26542633 NWorkItems = reduAuxCGFunc<KernelName, decltype (KernelFunc)>(
26552634 AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
26562635 });
0 commit comments