@@ -676,14 +676,6 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
676676
677677 RedOutVar &getUserRedVar () { return MRedOut; }
678678
679- static inline result_type *getOutPointer (result_type *OutPtr) {
680- return OutPtr;
681- }
682- template <class AccessorType >
683- static inline result_type *getOutPointer (const AccessorType &OutAcc) {
684- return OutAcc.get_pointer ().get ();
685- }
686-
687679private:
688680 // Array reduction is performed element-wise to avoid stack growth, hence
689681 // 1-dimensional always.
@@ -885,7 +877,7 @@ bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
885877 for (size_t E = 0 ; E < NElements; ++E) {
886878 Reducer.getElement (E) = GroupSum[E];
887879 }
888- Reducer.template atomic_combine (Reduction::getOutPointer ( Out) );
880+ Reducer.template atomic_combine (& Out[ 0 ] );
889881 }
890882 });
891883 return Reduction::is_usm || Redu.initializeToIdentity ();
@@ -937,12 +929,11 @@ bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
937929 RedElem = reduce_over_group (Group, RedElem, BOp);
938930 if (LID == 0 ) {
939931 if (NWorkGroups == 1 ) {
940- auto &OutElem = Reduction::getOutPointer (Out)[E];
941932 // Can avoid using partial sum and write the final result
942933 // immediately.
943934 if (IsUpdateOfUserVar)
944- RedElem = BOp (RedElem, OutElem );
945- OutElem = RedElem;
935+ RedElem = BOp (RedElem, Out[E] );
936+ Out[E] = RedElem;
946937 } else {
947938 PartialSums[NDId.get_group_linear_id () * NElements + E] =
948939 Reducer.getElement (E);
@@ -968,16 +959,15 @@ bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
968959 // Reduce each result separately
969960 // TODO: Opportunity to parallelize across elements.
970961 for (int E = 0 ; E < NElements; ++E) {
971- auto &OutElem = Reduction::getOutPointer (Out)[E];
972962 auto LocalSum = Reducer.getIdentity ();
973963 for (size_t I = LID; I < NWorkGroups; I += WGSize)
974964 LocalSum = BOp (LocalSum, PartialSums[I * NElements + E]);
975965 auto Result = reduce_over_group (Group, LocalSum, BOp);
976966
977967 if (LID == 0 ) {
978968 if (IsUpdateOfUserVar)
979- Result = BOp (Result, OutElem );
980- OutElem = Result;
969+ Result = BOp (Result, Out[E] );
970+ Out[E] = Result;
981971 }
982972 }
983973 }
@@ -1061,10 +1051,9 @@ bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
10611051 if (LID == 0 ) {
10621052 auto V = BOp (LocalReds[0 ], LocalReds[WGSize]);
10631053 if (NWorkGroups == 1 && IsUpdateOfUserVar)
1064- V = BOp (V, Reduction::getOutPointer ( Out) [E]);
1054+ V = BOp (V, Out[E]);
10651055 // if NWorkGroups == 1, then PartialSums and Out point to the same memory.
1066- Reduction::getOutPointer (
1067- PartialSums)[NDId.get_group_linear_id () * NElements + E] = V;
1056+ PartialSums[NDId.get_group_linear_id () * NElements + E] = V;
10681057 }
10691058 }
10701059
@@ -1085,9 +1074,7 @@ bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
10851074 for (int E = 0 ; E < NElements; ++E) {
10861075 auto LocalSum = Identity;
10871076 for (size_t I = LID; I < NWorkGroups; I += WGSize)
1088- LocalSum =
1089- BOp (LocalSum,
1090- Reduction::getOutPointer (PartialSums)[I * NElements + E]);
1077+ LocalSum = BOp (LocalSum, PartialSums[I * NElements + E]);
10911078
10921079 LocalReds[LID] = LocalSum;
10931080 if (LID == 0 )
@@ -1106,8 +1093,8 @@ bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
11061093 if (LID == 0 ) {
11071094 auto V = BOp (LocalReds[0 ], LocalReds[WGSize]);
11081095 if (IsUpdateOfUserVar)
1109- V = BOp (V, Reduction::getOutPointer ( Out) [E]);
1110- Reduction::getOutPointer ( Out) [E] = V;
1096+ V = BOp (V, Out[E]);
1097+ Out[E] = V;
11111098 }
11121099 }
11131100 }
@@ -1179,7 +1166,7 @@ void reduCGFuncForNDRangeBothFastReduceAndAtomics(handler &CGH,
11791166 reduce_over_group (NDIt.get_group (), Reducer.getElement (E), BOp);
11801167 }
11811168 if (NDIt.get_local_linear_id () == 0 )
1182- Reducer.atomic_combine (Reduction::getOutPointer ( Out) );
1169+ Reducer.atomic_combine (& Out[ 0 ] );
11831170 });
11841171}
11851172
@@ -1260,7 +1247,7 @@ void reduCGFuncForNDRangeFastAtomicsOnly(handler &CGH, bool IsPow2WG,
12601247 }
12611248
12621249 if (LID == 0 ) {
1263- Reducer.atomic_combine (Reduction::getOutPointer ( Out) );
1250+ Reducer.atomic_combine (& Out[ 0 ] );
12641251 }
12651252 });
12661253}
@@ -1306,8 +1293,8 @@ void reduCGFuncForNDRangeFastReduceOnly(handler &CGH, KernelType KernelFunc,
13061293 PSum = reduce_over_group (NDIt.get_group (), PSum, BOp);
13071294 if (NDIt.get_local_linear_id () == 0 ) {
13081295 if (IsUpdateOfUserVar)
1309- PSum = BOp (Reduction::getOutPointer ( Out) [E], PSum);
1310- Reduction::getOutPointer ( Out) [WGID * NElements + E] = PSum;
1296+ PSum = BOp (Out[E], PSum);
1297+ Out[WGID * NElements + E] = PSum;
13111298 }
13121299 }
13131300 });
@@ -1387,8 +1374,8 @@ void reduCGFuncForNDRangeBasic(handler &CGH, bool IsPow2WG,
13871374 typename Reduction::result_type PSum =
13881375 IsPow2WG ? LocalReds[0 ] : BOp (LocalReds[0 ], LocalReds[WGSize]);
13891376 if (IsUpdateOfUserVar)
1390- PSum = BOp (*( Reduction::getOutPointer ( Out)) , PSum);
1391- Reduction::getOutPointer ( Out) [GrID * NElements + E] = PSum;
1377+ PSum = BOp (Out[ 0 ] , PSum);
1378+ Out[GrID * NElements + E] = PSum;
13921379 }
13931380
13941381 // Ensure item 0 is finished with LocalReds before next iteration
@@ -1438,8 +1425,8 @@ void reduAuxCGFuncFastReduceImpl(handler &CGH, bool UniformWG,
14381425 PSum = reduce_over_group (NDIt.get_group (), PSum, BOp);
14391426 if (NDIt.get_local_linear_id () == 0 ) {
14401427 if (IsUpdateOfUserVar)
1441- PSum = BOp (Reduction::getOutPointer ( Out) [E], PSum);
1442- Reduction::getOutPointer ( Out) [WGID * NElements + E] = PSum;
1428+ PSum = BOp (Out[E], PSum);
1429+ Out[WGID * NElements + E] = PSum;
14431430 }
14441431 }
14451432 });
@@ -1515,8 +1502,8 @@ void reduAuxCGFuncNoFastReduceNorAtomicImpl(handler &CGH, bool UniformPow2WG,
15151502 typename Reduction::result_type PSum =
15161503 UniformPow2WG ? LocalReds[0 ] : BOp (LocalReds[0 ], LocalReds[WGSize]);
15171504 if (IsUpdateOfUserVar)
1518- PSum = BOp (*( Reduction::getOutPointer ( Out)) , PSum);
1519- Reduction::getOutPointer ( Out) [GrID * NElements + E] = PSum;
1505+ PSum = BOp (Out[ 0 ] , PSum);
1506+ Out[GrID * NElements + E] = PSum;
15201507 }
15211508
15221509 // Ensure item 0 is finished with LocalReds before next iteration
@@ -1738,24 +1725,20 @@ void writeReduSumsToOutAccs(
17381725 // Add the initial value of user's variable to the final result.
17391726 if (IsOneWG)
17401727 std::tie (std::get<Is>(LocalAccs)[0 ]...) = std::make_tuple (std::get<Is>(
1741- BOPs)(std::get<Is>(LocalAccs)[0 ],
1742- IsInitializeToIdentity[Is]
1743- ? std::get<Is>(IdentityVals)
1744- : std::tuple_element_t <Is, std::tuple<Reductions...>>::
1745- getOutPointer (std::get<Is>(OutAccs))[0 ])...);
1728+ BOPs)(std::get<Is>(LocalAccs)[0 ], IsInitializeToIdentity[Is]
1729+ ? std::get<Is>(IdentityVals)
1730+ : std::get<Is>(OutAccs)[0 ])...);
17461731
17471732 if (Pow2WG) {
17481733 // The partial sums for the work-group are stored in 0-th elements of local
17491734 // accessors. Simply write those sums to output accessors.
1750- std::tie (std::tuple_element_t <Is, std::tuple<Reductions...>>::getOutPointer (
1751- std::get<Is>(OutAccs))[OutAccIndex]...) =
1735+ std::tie (std::get<Is>(OutAccs)[OutAccIndex]...) =
17521736 std::make_tuple (std::get<Is>(LocalAccs)[0 ]...);
17531737 } else {
17541738 // Each of local accessors keeps two partial sums: in 0-th and WGsize-th
17551739 // elements. Combine them into final partial sums and write to output
17561740 // accessors.
1757- std::tie (std::tuple_element_t <Is, std::tuple<Reductions...>>::getOutPointer (
1758- std::get<Is>(OutAccs))[OutAccIndex]...) =
1741+ std::tie (std::get<Is>(OutAccs)[OutAccIndex]...) =
17591742 std::make_tuple (std::get<Is>(BOPs)(std::get<Is>(LocalAccs)[0 ],
17601743 std::get<Is>(LocalAccs)[WGSize])...);
17611744 }
@@ -1922,23 +1905,21 @@ void reduCGFuncImplArrayHelper(bool Pow2WG, bool IsOneWG, nd_item<Dims> NDIt,
19221905 if (LID == 0 ) {
19231906 if (IsOneWG) {
19241907 LocalReds[0 ] =
1925- BOp (LocalReds[0 ], IsInitializeToIdentity
1926- ? Identity
1927- : Reduction::getOutPointer (Out)[E]);
1908+ BOp (LocalReds[0 ], IsInitializeToIdentity ? Identity : Out[E]);
19281909 }
19291910
19301911 size_t GrID = NDIt.get_group_linear_id ();
1931- if (Pow2WG) {
1932- // The partial sums for the work-group are stored in 0-th elements of
1933- // local accessors. Simply write those sums to output accessors.
1934- Reduction::getOutPointer (Out)[GrID * NElements + E] = LocalReds[ 0 ];
1935- } else {
1936- // Each of local accessors keeps two partial sums: in 0-th and WGsize-th
1937- // elements. Combine them into final partial sums and write to output
1938- // accessors.
1939- Reduction::getOutPointer (Out)[GrID * NElements + E] =
1940- BOp (LocalReds[ 0 ], LocalReds[WGSize]);
1941- }
1912+ Out[GrID * NElements + E] =
1913+ Pow2WG ?
1914+ // The partial sums for the work-group are stored in 0-th
1915+ // elements of local accessors. Simply write those sums to
1916+ // output accessors.
1917+ LocalReds[ 0 ]
1918+ :
1919+ // Each of local accessors keeps two partial sums: in 0-th
1920+ // and WGsize-th elements. Combine them into final partial
1921+ // sums and write to output accessors.
1922+ BOp (LocalReds[ 0 ], LocalReds[WGSize]);
19421923 }
19431924
19441925 // Ensure item 0 is finished with LocalReds before next iteration
@@ -2080,7 +2061,7 @@ void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
20802061 }
20812062
20822063 if (NDIt.get_local_linear_id () == 0 ) {
2083- Reducer.atomic_combine (Reduction::getOutPointer ( Out) );
2064+ Reducer.atomic_combine (& Out[ 0 ] );
20842065 }
20852066 });
20862067}
@@ -2189,23 +2170,21 @@ void reduAuxCGFuncImplArrayHelper(bool UniformPow2WG, bool IsOneWG,
21892170 if (LID == 0 ) {
21902171 if (IsOneWG) {
21912172 LocalReds[0 ] =
2192- BOp (LocalReds[0 ], IsInitializeToIdentity
2193- ? Identity
2194- : Reduction::getOutPointer (Out)[E]);
2173+ BOp (LocalReds[0 ], IsInitializeToIdentity ? Identity : Out[E]);
21952174 }
21962175
21972176 size_t GrID = NDIt.get_group_linear_id ();
2198- if (UniformPow2WG) {
2199- // The partial sums for the work-group are stored in 0-th elements of
2200- // local accessors. Simply write those sums to output accessors.
2201- Reduction::getOutPointer (Out)[GrID * NElements + E] = LocalReds[ 0 ];
2202- } else {
2203- // Each of local accessors keeps two partial sums: in 0-th and WGsize-th
2204- // elements. Combine them into final partial sums and write to output
2205- // accessors.
2206- Reduction::getOutPointer (Out)[GrID * NElements + E] =
2207- BOp (LocalReds[ 0 ], LocalReds[WGSize]);
2208- }
2177+ Out[GrID * NElements + E] =
2178+ UniformPow2WG ?
2179+ // The partial sums for the work-group are stored in
2180+ // 0-th elements of local accessors. Simply write those
2181+ // sums to output accessors.
2182+ LocalReds[ 0 ]
2183+ :
2184+ // Each of local accessors keeps two partial sums: in
2185+ // 0-th and WGsize-th elements. Combine them into final
2186+ // partial sums and write to output accessors.
2187+ BOp (LocalReds[ 0 ], LocalReds[WGSize]);
22092188 }
22102189
22112190 // Ensure item 0 is finished with LocalReds before next iteration
0 commit comments