@@ -793,6 +793,43 @@ template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func) {
793793}
794794} // namespace reduction
795795
796+ // This method is used for implementation of parallel_for accepting 1 reduction.
797+ // TODO: remove this method when everything is switched to general algorithm
798+ // implementing arbitrary number of reductions in parallel_for().
799+ // / Copies the final reduction result kept in read-write accessor to user's
800+ // / accessor. This method is not called for user's read-write accessors
801+ // / requiring update-write to it.
802+ template <typename KernelName, class Reduction >
803+ std::enable_if_t <!Reduction::is_usm>
804+ reduSaveFinalResultToUserMem (handler &CGH, Reduction &Redu) {
805+ auto InAcc = Redu.getReadAccToPreviousPartialReds (CGH);
806+ associateWithHandler (CGH, &Redu.getUserRedVar (), access::target::device);
807+ CGH.copy (InAcc, Redu.getUserRedVar ());
808+ }
809+
810+ // This method is used for implementation of parallel_for accepting 1 reduction.
811+ // TODO: remove this method when everything is switched to general algorithm
812+ // implementing arbitrary number of reductions in parallel_for().
813+ // / Copies the final reduction result kept in read-write accessor to user's
814+ // / USM memory.
815+ template <typename KernelName, class Reduction >
816+ std::enable_if_t <Reduction::is_usm>
817+ reduSaveFinalResultToUserMem (handler &CGH, Reduction &Redu) {
818+ size_t NElements = Reduction::num_elements;
819+ auto InAcc = Redu.getReadAccToPreviousPartialReds (CGH);
820+ auto UserVarPtr = Redu.getUserRedVar ();
821+ bool IsUpdateOfUserVar = !Redu.initializeToIdentity ();
822+ auto BOp = Redu.getBinaryOperation ();
823+ CGH.single_task <KernelName>([=] {
824+ for (int i = 0 ; i < NElements; ++i) {
825+ if (IsUpdateOfUserVar)
826+ UserVarPtr[i] = BOp (UserVarPtr[i], InAcc.get_pointer ()[i]);
827+ else
828+ UserVarPtr[i] = InAcc.get_pointer ()[i];
829+ }
830+ });
831+ }
832+
796833// / A helper to pass undefined (sycl::detail::auto_name) names unmodified. We
797834// / must do that to avoid name collisions.
798835template <template <typename ...> class Namer , class KernelName , class ... Ts>
@@ -834,7 +871,7 @@ template <class KernelName> struct RangeFastAtomics;
834871} // namespace reduction
835872template <typename KernelName, typename KernelType, int Dims,
836873 typename PropertiesT, class Reduction >
837- bool reduCGFuncForRangeFastAtomics (handler &CGH, KernelType KernelFunc,
874+ void reduCGFuncForRangeFastAtomics (handler &CGH, KernelType KernelFunc,
838875 const range<Dims> &Range,
839876 const nd_range<1 > &NDRange,
840877 PropertiesT Properties, Reduction &Redu) {
@@ -871,7 +908,11 @@ bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
871908 Reducer.template atomic_combine (&Out[0 ]);
872909 }
873910 });
874- return Reduction::is_usm || Redu.initializeToIdentity ();
911+
912+ if (Reduction::is_usm || Redu.initializeToIdentity ())
913+ reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
914+ reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
915+ });
875916}
876917
877918namespace reduction {
@@ -881,7 +922,7 @@ template <class KernelName, class NWorkGroupsFinished> struct RangeFastReduce;
881922} // namespace reduction
882923template <typename KernelName, typename KernelType, int Dims,
883924 typename PropertiesT, class Reduction >
884- bool reduCGFuncForRangeFastReduce (handler &CGH, KernelType KernelFunc,
925+ void reduCGFuncForRangeFastReduce (handler &CGH, KernelType KernelFunc,
885926 const range<Dims> &Range,
886927 const nd_range<1 > &NDRange,
887928 PropertiesT Properties, Reduction &Redu) {
@@ -972,9 +1013,6 @@ bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
9721013 Rest (Redu.getReadWriteAccessorToInitializedGroupsCounter (CGH));
9731014 else
9741015 Rest (Redu.getGroupsCounterAccDiscrete (CGH));
975-
976- // We've updated user's variable, no extra work needed.
977- return false ;
9781016}
9791017
9801018namespace reduction {
@@ -984,7 +1022,7 @@ template <class KernelName> struct RangeBasic;
9841022} // namespace reduction
9851023template <typename KernelName, typename KernelType, int Dims,
9861024 typename PropertiesT, class Reduction >
987- bool reduCGFuncForRangeBasic (handler &CGH, KernelType KernelFunc,
1025+ void reduCGFuncForRangeBasic (handler &CGH, KernelType KernelFunc,
9881026 const range<Dims> &Range,
9891027 const nd_range<1 > &NDRange, PropertiesT Properties,
9901028 Reduction &Redu) {
@@ -1088,14 +1126,18 @@ bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
10881126 }
10891127 }
10901128 });
1091- return Reduction::is_usm;
1129+
1130+ if (Reduction::is_usm)
1131+ reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
1132+ reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1133+ });
10921134}
10931135
10941136// / Returns "true" if the result has to be saved to user's variable by
10951137// / reduSaveFinalResultToUserMem.
10961138template <typename KernelName, typename KernelType, int Dims,
10971139 typename PropertiesT, class Reduction >
1098- bool reduCGFuncForRange (handler &CGH, KernelType KernelFunc,
1140+ void reduCGFuncForRange (handler &CGH, KernelType KernelFunc,
10991141 const range<Dims> &Range, size_t MaxWGSize,
11001142 uint32_t NumConcurrentWorkGroups,
11011143 PropertiesT Properties, Reduction &Redu) {
@@ -1110,14 +1152,14 @@ bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
11101152 nd_range<1 > NDRange{range<1 >{NDRItems}, range<1 >{WGSize}};
11111153
11121154 if constexpr (Reduction::has_fast_reduce)
1113- return reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range,
1114- NDRange, Properties, Redu);
1155+ reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range, NDRange ,
1156+ Properties, Redu);
11151157 else if constexpr (Reduction::has_fast_atomics)
1116- return reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range,
1117- NDRange, Properties, Redu);
1158+ reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range, NDRange ,
1159+ Properties, Redu);
11181160 else
1119- return reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
1120- Properties, Redu);
1161+ reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
1162+ Properties, Redu);
11211163}
11221164
11231165namespace reduction {
@@ -1158,6 +1200,12 @@ void reduCGFuncForNDRangeBothFastReduceAndAtomics(handler &CGH,
11581200 if (NDIt.get_local_linear_id () == 0 )
11591201 Reducer.atomic_combine (&Out[0 ]);
11601202 });
1203+
1204+ if (Reduction::is_usm || Redu.initializeToIdentity ()) {
1205+ reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
1206+ reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1207+ });
1208+ }
11611209}
11621210
11631211namespace reduction {
@@ -1242,6 +1290,12 @@ void reduCGFuncForNDRangeFastAtomicsOnly(handler &CGH, KernelType KernelFunc,
12421290 Reducer.atomic_combine (&Out[0 ]);
12431291 }
12441292 });
1293+
1294+ if (Reduction::is_usm || Redu.initializeToIdentity ()) {
1295+ reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
1296+ reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1297+ });
1298+ }
12451299}
12461300
12471301namespace reduction {
@@ -1544,43 +1598,6 @@ size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
15441598 return NWorkGroups;
15451599}
15461600
1547- // This method is used for implementation of parallel_for accepting 1 reduction.
1548- // TODO: remove this method when everything is switched to general algorithm
1549- // implementing arbitrary number of reductions in parallel_for().
1550- // / Copies the final reduction result kept in read-write accessor to user's
1551- // / accessor. This method is not called for user's read-write accessors
1552- // / requiring update-write to it.
1553- template <typename KernelName, class Reduction >
1554- std::enable_if_t <!Reduction::is_usm>
1555- reduSaveFinalResultToUserMem (handler &CGH, Reduction &Redu) {
1556- auto InAcc = Redu.getReadAccToPreviousPartialReds (CGH);
1557- associateWithHandler (CGH, &Redu.getUserRedVar (), access::target::device);
1558- CGH.copy (InAcc, Redu.getUserRedVar ());
1559- }
1560-
1561- // This method is used for implementation of parallel_for accepting 1 reduction.
1562- // TODO: remove this method when everything is switched to general algorithm
1563- // implementing arbitrary number of reductions in parallel_for().
1564- // / Copies the final reduction result kept in read-write accessor to user's
1565- // / USM memory.
1566- template <typename KernelName, class Reduction >
1567- std::enable_if_t <Reduction::is_usm>
1568- reduSaveFinalResultToUserMem (handler &CGH, Reduction &Redu) {
1569- size_t NElements = Reduction::num_elements;
1570- auto InAcc = Redu.getReadAccToPreviousPartialReds (CGH);
1571- auto UserVarPtr = Redu.getUserRedVar ();
1572- bool IsUpdateOfUserVar = !Redu.initializeToIdentity ();
1573- auto BOp = Redu.getBinaryOperation ();
1574- CGH.single_task <KernelName>([=] {
1575- for (int i = 0 ; i < NElements; ++i) {
1576- if (IsUpdateOfUserVar)
1577- UserVarPtr[i] = BOp (UserVarPtr[i], InAcc.get_pointer ()[i]);
1578- else
1579- UserVarPtr[i] = InAcc.get_pointer ()[i];
1580- }
1581- });
1582- }
1583-
15841601// / For the given 'Reductions' types pack and indices enumerating them this
15851602// / function either creates new temporary accessors for partial sums (if IsOneWG
15861603// / is false) or returns user's accessor/USM-pointer if (IsOneWG is true).
@@ -2220,13 +2237,8 @@ void reduction_parallel_for(handler &CGH,
22202237 // queue/device, while it is safer to use queries to the kernel pre-compiled
22212238 // for the device.
22222239 size_t PrefWGSize = reduGetPreferredWGSize (Queue, OneElemSize);
2223- if (reduCGFuncForRange<KernelName>(CGH, KernelFunc, Range, PrefWGSize,
2224- NumConcurrentWorkGroups, Properties,
2225- Redu)) {
2226- reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
2227- reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
2228- });
2229- }
2240+ reduCGFuncForRange<KernelName>(CGH, KernelFunc, Range, PrefWGSize,
2241+ NumConcurrentWorkGroups, Properties, Redu);
22302242}
22312243
22322244template <typename KernelName, int Dims, typename PropertiesT,
@@ -2317,43 +2329,24 @@ void reduction_parallel_for(handler &CGH,
23172329 nd_range<Dims> Range, PropertiesT Properties,
23182330 Reduction Redu, KernelType KernelFunc) {
23192331 if constexpr (Reduction::has_float64_atomics) {
2320- device D = detail::getDeviceFromHandler (CGH);
2321-
2322- if (D.has (aspect::atomic64)) {
2323- reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName>(
2332+ if (detail::getDeviceFromHandler (CGH).has (aspect::atomic64))
2333+ return reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName>(
23242334 CGH, KernelFunc, Range, Properties, Redu);
2325- } else {
2326- reduction_parallel_for_basic_impl<KernelName>(
2327- CGH, Queue, Range, Properties, Redu, KernelFunc);
2328- return ;
2329- }
2335+
2336+ return reduction_parallel_for_basic_impl<KernelName>(
2337+ CGH, Queue, Range, Properties, Redu, KernelFunc);
23302338 } else if constexpr (Reduction::has_fast_atomics) {
23312339 if constexpr (Reduction::has_fast_reduce) {
2332- reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName, KernelType>(
2340+ return reduCGFuncForNDRangeBothFastReduceAndAtomics<KernelName,
2341+ KernelType>(
23332342 CGH, KernelFunc, Range, Properties, Redu);
23342343 } else {
2335- reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
2344+ return reduCGFuncForNDRangeFastAtomicsOnly<KernelName, KernelType>(
23362345 CGH, KernelFunc, Range, Properties, Redu);
23372346 }
23382347 } else {
2339- reduction_parallel_for_basic_impl<KernelName>(CGH, Queue, Range, Properties,
2340- Redu, KernelFunc);
2341- return ;
2342- }
2343-
2344- // If the reduction variable must be initialized with the identity value
2345- // before the kernel run, then an additional working accessor is created,
2346- // initialized with the identity value and used in the kernel. That
2347- // working accessor is then copied to user's accessor or USM pointer after
2348- // the kernel run.
2349- // For USM pointers without initialize_to_identity properties the same
2350- // scheme with working accessor is used as re-using user's USM pointer in
2351- // the kernel would require creation of another variant of user's kernel,
2352- // which does not seem efficient.
2353- if (Reduction::is_usm || Redu.initializeToIdentity ()) {
2354- reduction::withAuxHandler (CGH, [&](handler &CopyHandler) {
2355- reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
2356- });
2348+ return reduction_parallel_for_basic_impl<KernelName>(
2349+ CGH, Queue, Range, Properties, Redu, KernelFunc);
23572350 }
23582351}
23592352
0 commit comments