@@ -1932,45 +1932,6 @@ void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
1932
1932
Rest (createReduOutAccs<false >(NWorkGroups, CGH, ReduTuple, ReduIndices));
1933
1933
}
1934
1934
1935
- namespace reduction {
1936
- namespace main_krn {
1937
- template <class KernelName > struct NDRangeAtomic64 ;
1938
- } // namespace main_krn
1939
- } // namespace reduction
1940
-
1941
- // Specialization for devices with the atomic64 aspect, which guarantees 64 bit
1942
- // floating point support for atomic reduction operation.
1943
- template <typename KernelName, typename KernelType, int Dims,
1944
- typename PropertiesT, class Reduction >
1945
- void reduCGFuncAtomic64 (handler &CGH, KernelType KernelFunc,
1946
- const nd_range<Dims> &Range, PropertiesT Properties,
1947
- Reduction &Redu) {
1948
- auto Out = Redu.getReadWriteAccessorToInitializedMem (CGH);
1949
- static_assert (
1950
- Reduction::has_float64_atomics,
1951
- " Only suitable for reductions that have FP64 atomic operations." );
1952
- size_t NElements = Reduction::num_elements;
1953
- using Name =
1954
- __sycl_reduction_kernel<reduction::main_krn::NDRangeAtomic64, KernelName>;
1955
- CGH.parallel_for <Name>(Range, Properties, [=](nd_item<Dims> NDIt) {
1956
- // Call user's function. Reducer.MValue gets initialized there.
1957
- typename Reduction::reducer_type Reducer;
1958
- KernelFunc (NDIt, Reducer);
1959
-
1960
- // If there are multiple values, reduce each separately
1961
- // reduce_over_group is only defined for each T, not for span<T, ...>
1962
- for (int E = 0 ; E < NElements; ++E) {
1963
- typename Reduction::binary_operation BOp;
1964
- Reducer.getElement (E) =
1965
- reduce_over_group (NDIt.get_group (), Reducer.getElement (E), BOp);
1966
- }
1967
-
1968
- if (NDIt.get_local_linear_id () == 0 ) {
1969
- Reducer.atomic_combine (&Out[0 ]);
1970
- }
1971
- });
1972
- }
1973
-
1974
1935
template <typename ... Reductions, size_t ... Is>
1975
1936
void associateReduAccsWithHandler (handler &CGH,
1976
1937
std::tuple<Reductions...> &ReduTuple,
@@ -2386,8 +2347,8 @@ void reduction_parallel_for(handler &CGH,
2386
2347
device D = detail::getDeviceFromHandler (CGH);
2387
2348
2388
2349
if (D.has (aspect::atomic64)) {
2389
- reduCGFuncAtomic64 <KernelName>(CGH, KernelFunc, Range, Properties,
2390
- Redu);
2350
+ reduCGFuncForNDRangeBothFastReduceAndAtomics <KernelName>(
2351
+ CGH, KernelFunc, Range, Properties, Redu);
2391
2352
} else {
2392
2353
// Resort to basic implementation as well.
2393
2354
reduction_parallel_for_basic_impl<KernelName>(
0 commit comments