@@ -837,55 +837,25 @@ using __sycl_reduction_kernel =
     std::conditional_t<std::is_same<KernelName, auto_name>::value, auto_name,
                        Namer<KernelName, Ts...>>;
 
-/// Called in device code. This function iterates through the index space
-/// by assigning contiguous chunks to each work-group, then iterating
-/// through each chunk using a stride equal to the work-group's local range,
-/// which gives much better performance than using a stride equal to 1.
-/// For each index, the given \p F function/functor is called and the
-/// reduction value held in \p Reducer is accumulated in those calls.
-template <typename KernelFunc, int Dims, typename ReducerT>
-void reductionLoop(const range<Dims> &Range, const size_t PerGroup,
-                   ReducerT &Reducer, const nd_item<1> &NdId, KernelFunc &F) {
-  // Divide into contiguous chunks and assign each chunk to a group.
-  // Rely on the precomputed division to avoid repeating expensive operations.
-  // TODO: Some devices may prefer alternative remainder handling.
-  auto Group = NdId.get_group();
-  size_t GroupId = Group.get_group_linear_id();
-  size_t NumGroups = Group.get_group_linear_range();
-  bool LastGroup = (GroupId == NumGroups - 1);
-  size_t GroupStart = GroupId * PerGroup;
-  size_t GroupEnd = LastGroup ? Range.size() : (GroupStart + PerGroup);
-
-  // Loop over the contiguous chunk.
-  size_t Start = GroupStart + NdId.get_local_id(0);
-  size_t End = GroupEnd;
-  size_t Stride = NdId.get_local_range(0);
-  for (size_t I = Start; I < End; I += Stride)
-    F(getDelinearizedId(Range, I), Reducer);
-}
-
 namespace reduction {
 namespace main_krn {
 template <class KernelName> struct RangeFastAtomics;
 } // namespace main_krn
 } // namespace reduction
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
+template <typename KernelName, typename KernelType, typename PropertiesT,
+          class Reduction>
 void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
-                                   const range<Dims> &Range,
                                    const nd_range<1> &NDRange,
                                    PropertiesT Properties, Reduction &Redu) {
   size_t NElements = Reduction::num_elements;
   auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
   local_accessor<typename Reduction::result_type, 1> GroupSum{NElements, CGH};
   using Name = __sycl_reduction_kernel<reduction::main_krn::RangeFastAtomics,
                                        KernelName>;
-  size_t NWorkGroups = NDRange.get_group_range().size();
-  size_t PerGroup = Range.size() / NWorkGroups;
   CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
     // Call the user's function. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer;
-    reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);
+    KernelFunc(NDId, Reducer);
 
     // The work-group cooperates to initialize multiple reduction variables.
     auto LID = NDId.get_local_id(0);
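Editor's note on the pattern above: the chunk-then-stride traversal that `reductionLoop` implemented (and that reappears below as the `UpdatedKernelFunc` lambda) is easiest to see in isolation. Here is a minimal host-side sketch of the same index math with hypothetical sizes; the names and sizes are illustrative only, not part of the runtime:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Hypothetical sizes: 10 items split across 2 groups of 2 work-items each.
  const std::size_t NumItems = 10, NumGroups = 2, LocalRange = 2;
  const std::size_t PerGroup = NumItems / NumGroups; // division done once
  for (std::size_t GroupId = 0; GroupId < NumGroups; ++GroupId) {
    const bool LastGroup = (GroupId == NumGroups - 1);
    const std::size_t GroupStart = GroupId * PerGroup;
    const std::size_t GroupEnd = LastGroup ? NumItems : GroupStart + PerGroup;
    // Each simulated work-item walks its group's chunk with a stride equal
    // to the local range, so neighbouring items touch adjacent indices.
    for (std::size_t LocalId = 0; LocalId < LocalRange; ++LocalId)
      for (std::size_t I = GroupStart + LocalId; I < GroupEnd; I += LocalRange)
        std::printf("group %zu item %zu -> index %zu\n", GroupId, LocalId, I);
  }
  return 0;
}
```

With `PerGroup = 5` and a local range of 2, group 0 covers indices 0-4 and the last group covers 5-9: each group's chunk stays contiguous while adjacent work-items read adjacent indices, which is what the removed doc comment means by "much better performance than using a stride equal to 1".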
@@ -920,10 +890,9 @@ namespace main_krn {
 template <class KernelName, class NWorkGroupsFinished> struct RangeFastReduce;
 } // namespace main_krn
 } // namespace reduction
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
+template <typename KernelName, typename KernelType, typename PropertiesT,
+          class Reduction>
 void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
-                                  const range<Dims> &Range,
                                   const nd_range<1> &NDRange,
                                   PropertiesT Properties, Reduction &Redu) {
   size_t NElements = Reduction::num_elements;
@@ -941,13 +910,13 @@ void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
   auto Rest = [&](auto NWorkGroupsFinished) {
     local_accessor<int, 1> DoReducePartialSumsInLastWG{1, CGH};
 
-    using Name = __sycl_reduction_kernel<reduction::main_krn::RangeFastReduce,
-                                         KernelName, decltype(NWorkGroupsFinished)>;
-    size_t PerGroup = Range.size() / NWorkGroups;
+    using Name =
+        __sycl_reduction_kernel<reduction::main_krn::RangeFastReduce,
+                                KernelName, decltype(NWorkGroupsFinished)>;
     CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
       // Call the user's function. Reducer.MValue gets initialized there.
       typename Reduction::reducer_type Reducer;
-      reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);
+      KernelFunc(NDId, Reducer);
 
       typename Reduction::binary_operation BOp;
       auto Group = NDId.get_group();
@@ -1020,10 +989,9 @@ namespace main_krn {
 template <class KernelName> struct RangeBasic;
 } // namespace main_krn
 } // namespace reduction
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
+template <typename KernelName, typename KernelType, typename PropertiesT,
+          class Reduction>
 void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
-                             const range<Dims> &Range,
                              const nd_range<1> &NDRange, PropertiesT Properties,
                              Reduction &Redu) {
   size_t NElements = Reduction::num_elements;
@@ -1045,11 +1013,10 @@ void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
   auto BOp = Redu.getBinaryOperation();
   using Name =
       __sycl_reduction_kernel<reduction::main_krn::RangeBasic, KernelName>;
-  size_t PerGroup = Range.size() / NWorkGroups;
   CGH.parallel_for<Name>(NDRange, Properties, [=](nd_item<1> NDId) {
     // Call the user's function. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer(Identity, BOp);
-    reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);
+    KernelFunc(NDId, Reducer);
 
     // If there are multiple values, reduce each separately. This prevents
     // local memory from scaling with the number of elements.
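The two comment lines above refer to the work-group reduction performed in the unchanged remainder of this kernel: staging one element slot at a time keeps the local scratch buffer at `WGSize` entries instead of `WGSize * NElements`. A host-side sketch of a single tree-reduction pass over such a buffer, assuming a power-of-two group size (the real device code must additionally insert group barriers between steps and handle non-power-of-two sizes):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical per-work-item partial sums for one element slot.
  std::vector<int> LocalReds = {3, 1, 4, 1, 5, 9, 2, 6};
  // Halve the active range each step; on a device, a work-group barrier
  // would separate the steps so every read sees the previous step's writes.
  for (std::size_t Step = LocalReds.size() / 2; Step > 0; Step /= 2)
    for (std::size_t LID = 0; LID < Step; ++LID)
      LocalReds[LID] += LocalReds[LID + Step];
  std::printf("group result = %d\n", LocalReds[0]); // prints 31
  return 0;
}
```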
@@ -1133,35 +1100,6 @@ void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
   });
 }
 
-/// Computes the launch nd_range for the given \p Range and dispatches to the
-/// reduction implementation matching \p Reduction's capabilities.
-template <typename KernelName, typename KernelType, int Dims,
-          typename PropertiesT, class Reduction>
-void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
-                        const range<Dims> &Range, size_t MaxWGSize,
-                        uint32_t NumConcurrentWorkGroups,
-                        PropertiesT Properties, Reduction &Redu) {
-  size_t NWorkItems = Range.size();
-  size_t WGSize = std::min(NWorkItems, MaxWGSize);
-  size_t NWorkGroups = NWorkItems / WGSize;
-  if (NWorkItems % WGSize)
-    NWorkGroups++;
-  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
-  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
-  size_t NDRItems = NWorkGroups * WGSize;
-  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
-
-  if constexpr (Reduction::has_fast_reduce)
-    reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                             Properties, Redu);
-  else if constexpr (Reduction::has_fast_atomics)
-    reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                              Properties, Redu);
-  else
-    reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                        Properties, Redu);
-}
-
 namespace reduction {
 namespace main_krn {
 template <class KernelName> struct NDRangeBothFastReduceAndAtomics;
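For context, the dispatcher deleted above sized the launch as a ceiling division of the item count by the work-group size, clamped to `NumConcurrentWorkGroups`; the identical arithmetic now lives inline in `reduction_parallel_for` (last hunk below). A worked example with assumed input values:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  // Assumed inputs: 1000 work-items, preferred WG size 256, 8 concurrent WGs.
  std::size_t NWorkItems = 1000, MaxWGSize = 256, NumConcurrentWorkGroups = 8;
  std::size_t WGSize = std::min(NWorkItems, MaxWGSize); // 256
  std::size_t NWorkGroups = NWorkItems / WGSize;        // 3
  if (NWorkItems % WGSize)                              // 1000 % 256 != 0,
    NWorkGroups++;                                      // so round up to 4
  NWorkGroups = std::min(NWorkGroups, NumConcurrentWorkGroups); // still 4
  std::size_t NDRItems = NWorkGroups * WGSize;          // 1024 >= 1000
  std::printf("nd_range: %zu items in %zu groups of %zu\n", NDRItems,
              NWorkGroups, WGSize);
  return 0;
}
```

Each of the 4 groups then covers `PerGroup = 1000 / 4 = 250` flat indices; any remainder from that integer division is absorbed by the last group inside `UpdatedKernelFunc`.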
@@ -2233,12 +2171,57 @@ void reduction_parallel_for(handler &CGH,
 #else
       reduGetMaxNumConcurrentWorkGroups(Queue);
 #endif
+
   // TODO: currently the preferred work-group size is determined for the
   // given queue/device, while it would be safer to use queries on the kernel
   // pre-compiled for the device.
   size_t PrefWGSize = reduGetPreferredWGSize(Queue, OneElemSize);
-  reduCGFuncForRange<KernelName>(CGH, KernelFunc, Range, PrefWGSize,
-                                 NumConcurrentWorkGroups, Properties, Redu);
+
+  size_t NWorkItems = Range.size();
+  size_t WGSize = std::min(NWorkItems, PrefWGSize);
+  size_t NWorkGroups = NWorkItems / WGSize;
+  if (NWorkItems % WGSize)
+    NWorkGroups++;
+  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
+  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
+  size_t NDRItems = NWorkGroups * WGSize;
+  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
+
+  size_t PerGroup = Range.size() / NWorkGroups;
+  // Iterate through the index space by assigning contiguous chunks to each
+  // work-group, then iterating through each chunk using a stride equal to
+  // the work-group's local range, which gives much better performance than
+  // using a stride equal to 1. For each index, the original KernelFunc is
+  // called and the reduction value held in \p Reducer is accumulated in
+  // those calls.
+  auto UpdatedKernelFunc = [=](auto NDId, auto &Reducer) {
+    // Divide into contiguous chunks and assign each chunk to a group.
+    // Rely on the precomputed division to avoid repeating expensive
+    // operations.
+    // TODO: Some devices may prefer alternative remainder handling.
+    auto Group = NDId.get_group();
+    size_t GroupId = Group.get_group_linear_id();
+    size_t NumGroups = Group.get_group_linear_range();
+    bool LastGroup = (GroupId == NumGroups - 1);
+    size_t GroupStart = GroupId * PerGroup;
+    size_t GroupEnd = LastGroup ? Range.size() : (GroupStart + PerGroup);
+
+    // Loop over the contiguous chunk.
+    size_t Start = GroupStart + NDId.get_local_id(0);
+    size_t End = GroupEnd;
+    size_t Stride = NDId.get_local_range(0);
+    for (size_t I = Start; I < End; I += Stride)
+      KernelFunc(getDelinearizedId(Range, I), Reducer);
+  };
+
+  if constexpr (Reduction::has_fast_reduce)
+    reduCGFuncForRangeFastReduce<KernelName>(CGH, UpdatedKernelFunc, NDRange,
+                                             Properties, Redu);
+  else if constexpr (Reduction::has_fast_atomics)
+    reduCGFuncForRangeFastAtomics<KernelName>(CGH, UpdatedKernelFunc, NDRange,
+                                              Properties, Redu);
+  else
+    reduCGFuncForRangeBasic<KernelName>(CGH, UpdatedKernelFunc, NDRange,
+                                        Properties, Redu);
 }
 
 template <typename KernelName, int Dims, typename PropertiesT,
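A note on `getDelinearizedId`, which the new `UpdatedKernelFunc` calls: it is internal to these headers and not shown in this diff. All the wrapper relies on is that it maps the flat index `I` back to a `Dims`-dimensional id, presumably in row-major order (last dimension fastest), so the user's functor sees the same ids a plain `parallel_for` over `Range` would produce. A hypothetical 2-D illustration of such a mapping, an assumption about its behavior rather than the actual implementation:

```cpp
#include <cstddef>
#include <cstdio>

// Hypothetical stand-in for the internal getDelinearizedId in the 2-D case:
// recover a row-major (row, column) id from a flattened index.
struct Id2D {
  std::size_t Row, Col;
};

Id2D delinearize2D(std::size_t Cols, std::size_t Flat) {
  return {Flat / Cols, Flat % Cols};
}

int main() {
  const std::size_t Rows = 3, Cols = 4;
  for (std::size_t I = 0; I < Rows * Cols; ++I) {
    Id2D Id = delinearize2D(Cols, I);
    std::printf("flat %2zu -> (%zu, %zu)\n", I, Id.Row, Id.Col);
  }
  return 0;
}
```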