@@ -234,11 +234,6 @@ class RoundedRangeKernelWithKH {
   KernelType KernelFunc;
 };
 
-} // namespace detail
-
-namespace ext {
-namespace oneapi {
-namespace detail {
 template <typename T, class BinaryOperation, int Dims, size_t Extent,
           typename RedOutVar>
 class reduction_impl_algo;
@@ -320,8 +315,6 @@ template <class FunctorTy>
 event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost,
                      FunctorTy Func);
 } // namespace detail
-} // namespace oneapi
-} // namespace ext
 
 /// Command group handler class.
 ///
@@ -468,8 +461,7 @@ class __SYCL_EXPORT handler {
   }
 
   template <class FunctorTy>
-  friend event
-  ext::oneapi::detail::withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
+  friend event detail::withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
                                       bool IsHost, FunctorTy Func);
   /// }@
 
@@ -1618,20 +1610,18 @@ class __SYCL_EXPORT handler {
 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
         __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
 #else
-        ext::oneapi::detail::reduGetMaxNumConcurrentWorkGroups(MQueue);
+        detail::reduGetMaxNumConcurrentWorkGroups(MQueue);
 #endif
     // TODO: currently the preferred work group size is determined for the given
     // queue/device, while it is safer to use queries to the kernel pre-compiled
     // for the device.
-    size_t PrefWGSize =
-        ext::oneapi::detail::reduGetPreferredWGSize(MQueue, OneElemSize);
-    if (ext::oneapi::detail::reduCGFuncForRange<KernelName>(
-            *this, KernelFunc, Range, PrefWGSize, NumConcurrentWorkGroups,
-            Redu)) {
+    size_t PrefWGSize = detail::reduGetPreferredWGSize(MQueue, OneElemSize);
+    if (detail::reduCGFuncForRange<KernelName>(*this, KernelFunc, Range,
+                                               PrefWGSize,
+                                               NumConcurrentWorkGroups, Redu)) {
       this->finalize();
       MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
-        ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
-            CopyHandler, Redu);
+        detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
       });
     }
   }
@@ -1657,8 +1647,8 @@ class __SYCL_EXPORT handler {
 
       if (D.has(aspect::atomic64)) {
 
-        ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
-                                                            Range, Redu);
+        detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc, Range,
+                                               Redu);
       } else {
         // Resort to basic implementation as well.
         parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
@@ -1667,8 +1657,7 @@ class __SYCL_EXPORT handler {
     } else {
       // Use fast sycl::atomic operations to update reduction variable at the
       // end of each work-group work.
-      ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
-                                                  Redu);
+      detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
     }
     // If the reduction variable must be initialized with the identity value
     // before the kernel run, then an additional working accessor is created,
@@ -1682,8 +1671,7 @@ class __SYCL_EXPORT handler {
     if (Reduction::is_usm || Redu.initializeToIdentity()) {
       this->finalize();
       MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
-        ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
-            CopyHandler, Redu);
+        detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
       });
     }
   }
@@ -1719,8 +1707,7 @@ class __SYCL_EXPORT handler {
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it may be safer to use queries to the kernel compiled
     // for the device.
-    size_t MaxWGSize =
-        ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
+    size_t MaxWGSize = detail::reduGetMaxWGSize(MQueue, OneElemSize);
     if (Range.get_local_range().size() > MaxWGSize)
       throw sycl::runtime_error("The implementation handling parallel_for with"
                                 " reduction requires work group size not bigger"
@@ -1729,7 +1716,7 @@ class __SYCL_EXPORT handler {
                                 PI_ERROR_INVALID_WORK_GROUP_SIZE);
 
     // 1. Call the kernel that includes user's lambda function.
-    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
+    detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
     std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
     this->finalize();
 
@@ -1749,15 +1736,14 @@ class __SYCL_EXPORT handler {
     size_t NWorkItems = Range.get_group_range().size();
     while (NWorkItems > 1) {
       MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
-        NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
+        NWorkItems = detail::reduAuxCGFunc<KernelName, KernelType>(
             AuxHandler, NWorkItems, MaxWGSize, Redu);
       });
     } // end while (NWorkItems > 1)
 
     if (Reduction::is_usm || Reduction::is_dw_acc) {
       MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
-        ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
-            CopyHandler, Redu);
+        detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
       });
     }
   }
@@ -1798,46 +1784,42 @@ class __SYCL_EXPORT handler {
   // c) Repeat the steps (a) and (b) to get one final sum.
   template <typename KernelName = detail::auto_name, int Dims,
             typename... RestT>
-  std::enable_if_t<
-      (sizeof...(RestT) >= 3 &&
-       ext::oneapi::detail::AreAllButLastReductions<RestT...>::value)>
+  std::enable_if_t<(sizeof...(RestT) >= 3 &&
+                    detail::AreAllButLastReductions<RestT...>::value)>
   parallel_for(nd_range<Dims> Range, RestT... Rest) {
     std::tuple<RestT...> ArgsTuple(Rest...);
     constexpr size_t NumArgs = sizeof...(RestT);
     auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
     auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
-    auto ReduTuple =
-        ext::oneapi::detail::tuple_select_elements(ArgsTuple, ReduIndices);
+    auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
 
     size_t LocalMemPerWorkItem =
-        ext::oneapi::detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices);
+        detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices);
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it is safer to use queries to the kernel compiled
     // for the device.
-    size_t MaxWGSize =
-        ext::oneapi::detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem);
+    size_t MaxWGSize = detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem);
     if (Range.get_local_range().size() > MaxWGSize)
       throw sycl::runtime_error("The implementation handling parallel_for with"
                                 " reduction requires work group size not bigger"
                                 " than " +
                                     std::to_string(MaxWGSize),
                                 PI_ERROR_INVALID_WORK_GROUP_SIZE);
 
-    ext::oneapi::detail::reduCGFuncMulti<KernelName>(*this, KernelFunc, Range,
-                                                     ReduTuple, ReduIndices);
+    detail::reduCGFuncMulti<KernelName>(*this, KernelFunc, Range, ReduTuple,
+                                        ReduIndices);
     std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
     this->finalize();
 
     size_t NWorkItems = Range.get_group_range().size();
     while (NWorkItems > 1) {
       MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
-        NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName,
-                                                        decltype(KernelFunc)>(
+        NWorkItems = detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
             AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
       });
     } // end while (NWorkItems > 1)
 
-    auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem(
+    auto CopyEvent = detail::reduSaveFinalResultToUserMem(
         QueueCopy, MIsHost, ReduTuple, ReduIndices);
     if (CopyEvent)
       MLastEvent = *CopyEvent;
@@ -2641,7 +2623,7 @@ class __SYCL_EXPORT handler {
   // in handler from reduction methods.
   template <typename T, class BinaryOperation, int Dims, size_t Extent,
             typename RedOutVar>
-  friend class ext::oneapi::detail::reduction_impl_algo;
+  friend class detail::reduction_impl_algo;
 
 #ifndef __SYCL_DEVICE_ONLY__
   friend void detail::associateWithHandler(handler &,
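For reference, below is a minimal sketch (not part of this patch) of the user-facing reduction whose compilation goes through the handler::parallel_for overloads and detail::reduCGFunc* helpers renamed above. It assumes a SYCL 2020 toolchain such as DPC++; the buffer names and the 64-item work-group size are illustrative only.

// Hypothetical usage example, assuming a SYCL 2020 implementation (e.g. DPC++).
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  sycl::queue Q;
  constexpr size_t N = 1024;
  std::vector<int> Data(N, 1);
  int Sum = 0;
  {
    sycl::buffer<int> DataBuf(Data.data(), sycl::range<1>(N));
    sycl::buffer<int> SumBuf(&Sum, sycl::range<1>(1));
    Q.submit([&](sycl::handler &CGH) {
      sycl::accessor In(DataBuf, CGH, sycl::read_only);
      // sycl::reduction binds SumBuf to a plus<> combiner; passing it to the
      // nd_range parallel_for selects the reduction overloads patched above.
      CGH.parallel_for(sycl::nd_range<1>{sycl::range<1>{N}, sycl::range<1>{64}},
                       sycl::reduction(SumBuf, CGH, sycl::plus<int>()),
                       [=](sycl::nd_item<1> It, auto &Acc) {
                         Acc += In[It.get_global_id(0)];
                       });
    });
  } // Buffer destruction waits for the kernel and copies the result to Sum.
  return Sum == static_cast<int>(N) ? 0 : 1;
}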