Skip to content

Commit 7fa607f

Browse files
[SYCL][ABI Break] Remove ext::oneapi::reduction (#6634)
This PR also move the implementation from the extension header to the regular sycl/reduction.hpp. There might be some simplifications in the implementation enabled by that but they are left for a future PR.
1 parent 351b123 commit 7fa607f

File tree

9 files changed

+2513
-2682
lines changed

9 files changed

+2513
-2682
lines changed

sycl/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ set(SYCL_MINOR_VERSION 7)
3030
set(SYCL_PATCH_VERSION 0)
3131
# Don't forget to re-enable sycl_symbols_windows.dump once we leave ABI-breaking
3232
# window!
33-
set(SYCL_DEV_ABI_VERSION 10)
33+
set(SYCL_DEV_ABI_VERSION 11)
3434
if (SYCL_ADD_DEV_VERSION_POSTFIX)
3535
set(SYCL_VERSION_POSTFIX "-${SYCL_DEV_ABI_VERSION}")
3636
endif()

sycl/include/sycl/ext/oneapi/reduction.hpp

Lines changed: 0 additions & 2552 deletions
This file was deleted.

sycl/include/sycl/handler.hpp

Lines changed: 25 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -234,11 +234,6 @@ class RoundedRangeKernelWithKH {
234234
KernelType KernelFunc;
235235
};
236236

237-
} // namespace detail
238-
239-
namespace ext {
240-
namespace oneapi {
241-
namespace detail {
242237
template <typename T, class BinaryOperation, int Dims, size_t Extent,
243238
typename RedOutVar>
244239
class reduction_impl_algo;
@@ -320,8 +315,6 @@ template <class FunctorTy>
320315
event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost,
321316
FunctorTy Func);
322317
} // namespace detail
323-
} // namespace oneapi
324-
} // namespace ext
325318

326319
/// Command group handler class.
327320
///
@@ -468,8 +461,7 @@ class __SYCL_EXPORT handler {
468461
}
469462

470463
template <class FunctorTy>
471-
friend event
472-
ext::oneapi::detail::withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
464+
friend event detail::withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
473465
bool IsHost, FunctorTy Func);
474466
/// }@
475467

@@ -1618,20 +1610,18 @@ class __SYCL_EXPORT handler {
16181610
#ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
16191611
__SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
16201612
#else
1621-
ext::oneapi::detail::reduGetMaxNumConcurrentWorkGroups(MQueue);
1613+
detail::reduGetMaxNumConcurrentWorkGroups(MQueue);
16221614
#endif
16231615
// TODO: currently the preferred work group size is determined for the given
16241616
// queue/device, while it is safer to use queries to the kernel pre-compiled
16251617
// for the device.
1626-
size_t PrefWGSize =
1627-
ext::oneapi::detail::reduGetPreferredWGSize(MQueue, OneElemSize);
1628-
if (ext::oneapi::detail::reduCGFuncForRange<KernelName>(
1629-
*this, KernelFunc, Range, PrefWGSize, NumConcurrentWorkGroups,
1630-
Redu)) {
1618+
size_t PrefWGSize = detail::reduGetPreferredWGSize(MQueue, OneElemSize);
1619+
if (detail::reduCGFuncForRange<KernelName>(*this, KernelFunc, Range,
1620+
PrefWGSize,
1621+
NumConcurrentWorkGroups, Redu)) {
16311622
this->finalize();
16321623
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
1633-
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1634-
CopyHandler, Redu);
1624+
detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
16351625
});
16361626
}
16371627
}
@@ -1657,8 +1647,8 @@ class __SYCL_EXPORT handler {
16571647

16581648
if (D.has(aspect::atomic64)) {
16591649

1660-
ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
1661-
Range, Redu);
1650+
detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc, Range,
1651+
Redu);
16621652
} else {
16631653
// Resort to basic implementation as well.
16641654
parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
@@ -1667,8 +1657,7 @@ class __SYCL_EXPORT handler {
16671657
} else {
16681658
// Use fast sycl::atomic operations to update reduction variable at the
16691659
// end of each work-group work.
1670-
ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
1671-
Redu);
1660+
detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
16721661
}
16731662
// If the reduction variable must be initialized with the identity value
16741663
// before the kernel run, then an additional working accessor is created,
@@ -1682,8 +1671,7 @@ class __SYCL_EXPORT handler {
16821671
if (Reduction::is_usm || Redu.initializeToIdentity()) {
16831672
this->finalize();
16841673
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
1685-
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1686-
CopyHandler, Redu);
1674+
detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
16871675
});
16881676
}
16891677
}
@@ -1719,8 +1707,7 @@ class __SYCL_EXPORT handler {
17191707
// TODO: currently the maximal work group size is determined for the given
17201708
// queue/device, while it may be safer to use queries to the kernel compiled
17211709
// for the device.
1722-
size_t MaxWGSize =
1723-
ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
1710+
size_t MaxWGSize = detail::reduGetMaxWGSize(MQueue, OneElemSize);
17241711
if (Range.get_local_range().size() > MaxWGSize)
17251712
throw sycl::runtime_error("The implementation handling parallel_for with"
17261713
" reduction requires work group size not bigger"
@@ -1729,7 +1716,7 @@ class __SYCL_EXPORT handler {
17291716
PI_ERROR_INVALID_WORK_GROUP_SIZE);
17301717

17311718
// 1. Call the kernel that includes user's lambda function.
1732-
ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
1719+
detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
17331720
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
17341721
this->finalize();
17351722

@@ -1749,15 +1736,14 @@ class __SYCL_EXPORT handler {
17491736
size_t NWorkItems = Range.get_group_range().size();
17501737
while (NWorkItems > 1) {
17511738
MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
1752-
NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
1739+
NWorkItems = detail::reduAuxCGFunc<KernelName, KernelType>(
17531740
AuxHandler, NWorkItems, MaxWGSize, Redu);
17541741
});
17551742
} // end while (NWorkItems > 1)
17561743

17571744
if (Reduction::is_usm || Reduction::is_dw_acc) {
17581745
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
1759-
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1760-
CopyHandler, Redu);
1746+
detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
17611747
});
17621748
}
17631749
}
@@ -1798,46 +1784,42 @@ class __SYCL_EXPORT handler {
17981784
// c) Repeat the steps (a) and (b) to get one final sum.
17991785
template <typename KernelName = detail::auto_name, int Dims,
18001786
typename... RestT>
1801-
std::enable_if_t<
1802-
(sizeof...(RestT) >= 3 &&
1803-
ext::oneapi::detail::AreAllButLastReductions<RestT...>::value)>
1787+
std::enable_if_t<(sizeof...(RestT) >= 3 &&
1788+
detail::AreAllButLastReductions<RestT...>::value)>
18041789
parallel_for(nd_range<Dims> Range, RestT... Rest) {
18051790
std::tuple<RestT...> ArgsTuple(Rest...);
18061791
constexpr size_t NumArgs = sizeof...(RestT);
18071792
auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
18081793
auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
1809-
auto ReduTuple =
1810-
ext::oneapi::detail::tuple_select_elements(ArgsTuple, ReduIndices);
1794+
auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
18111795

18121796
size_t LocalMemPerWorkItem =
1813-
ext::oneapi::detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices);
1797+
detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices);
18141798
// TODO: currently the maximal work group size is determined for the given
18151799
// queue/device, while it is safer to use queries to the kernel compiled
18161800
// for the device.
1817-
size_t MaxWGSize =
1818-
ext::oneapi::detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem);
1801+
size_t MaxWGSize = detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem);
18191802
if (Range.get_local_range().size() > MaxWGSize)
18201803
throw sycl::runtime_error("The implementation handling parallel_for with"
18211804
" reduction requires work group size not bigger"
18221805
" than " +
18231806
std::to_string(MaxWGSize),
18241807
PI_ERROR_INVALID_WORK_GROUP_SIZE);
18251808

1826-
ext::oneapi::detail::reduCGFuncMulti<KernelName>(*this, KernelFunc, Range,
1827-
ReduTuple, ReduIndices);
1809+
detail::reduCGFuncMulti<KernelName>(*this, KernelFunc, Range, ReduTuple,
1810+
ReduIndices);
18281811
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
18291812
this->finalize();
18301813

18311814
size_t NWorkItems = Range.get_group_range().size();
18321815
while (NWorkItems > 1) {
18331816
MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
1834-
NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName,
1835-
decltype(KernelFunc)>(
1817+
NWorkItems = detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
18361818
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
18371819
});
18381820
} // end while (NWorkItems > 1)
18391821

1840-
auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem(
1822+
auto CopyEvent = detail::reduSaveFinalResultToUserMem(
18411823
QueueCopy, MIsHost, ReduTuple, ReduIndices);
18421824
if (CopyEvent)
18431825
MLastEvent = *CopyEvent;
@@ -2641,7 +2623,7 @@ class __SYCL_EXPORT handler {
26412623
// in handler from reduction methods.
26422624
template <typename T, class BinaryOperation, int Dims, size_t Extent,
26432625
typename RedOutVar>
2644-
friend class ext::oneapi::detail::reduction_impl_algo;
2626+
friend class detail::reduction_impl_algo;
26452627

26462628
#ifndef __SYCL_DEVICE_ONLY__
26472629
friend void detail::associateWithHandler(handler &,

sycl/include/sycl/queue.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1003,8 +1003,7 @@ class __SYCL_EXPORT queue {
10031003
/// const KernelType &KernelFunc".
10041004
template <typename KernelName = detail::auto_name, int Dims,
10051005
typename... RestT>
1006-
std::enable_if_t<
1007-
ext::oneapi::detail::AreAllButLastReductions<RestT...>::value, event>
1006+
std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value, event>
10081007
parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
10091008
// Actual code location needs to be captured from KernelInfo object.
10101009
const detail::code_location CodeLoc = {};
@@ -1134,8 +1133,7 @@ class __SYCL_EXPORT queue {
11341133
/// \param Range specifies the global work space of the kernel
11351134
/// \param KernelFunc is the Kernel functor or lambda
11361135
template <typename KernelName, int Dims, typename... RestT>
1137-
std::enable_if_t<
1138-
ext::oneapi::detail::AreAllButLastReductions<RestT...>::value, event>
1136+
std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value, event>
11391137
parallel_for_impl(range<Dims> Range, RestT &&...Rest) {
11401138
// Actual code location needs to be captured from KernelInfo object.
11411139
const detail::code_location CodeLoc = {};

0 commit comments

Comments
 (0)