diff --git a/sycl/include/sycl/detail/nd_range_view.hpp b/sycl/include/sycl/detail/nd_range_view.hpp new file mode 100644 index 0000000000000..ea7f403f171b3 --- /dev/null +++ b/sycl/include/sycl/detail/nd_range_view.hpp @@ -0,0 +1,57 @@ +//==---- nd_range_view.hpp --- SYCL iteration with reference to ranges ---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace sycl { +inline namespace _V1 { +namespace detail { + +class NDRDescT; + +// The structure to keep dimension and references to ranges unified for +// all dimensions. +class nd_range_view { + +public: + nd_range_view() = default; + nd_range_view(const nd_range_view &Desc) = default; + nd_range_view(nd_range_view &&Desc) = default; + nd_range_view &operator=(const nd_range_view &Desc) = default; + nd_range_view &operator=(nd_range_view &&Desc) = default; + + template + nd_range_view(sycl::range &N, bool SetNumWorkGroups = false) + : MGlobalSize(&(N[0])), MSetNumWorkGroups(SetNumWorkGroups), + MDims{size_t(Dims_)} {} + + template + nd_range_view(sycl::range &GlobalSize, sycl::id &Offset) + : MGlobalSize(&(GlobalSize[0])), MOffset(&(Offset[0])), + MDims{size_t(Dims_)} {} + + template + nd_range_view(sycl::nd_range &ExecutionRange) + : MGlobalSize(&(ExecutionRange.globalSize[0])), + MLocalSize(&(ExecutionRange.localSize[0])), + MOffset(&(ExecutionRange.offset[0])), MDims{size_t(Dims_)} {} + + sycl::detail::NDRDescT toNDRDescT() const; + + const size_t *MGlobalSize = nullptr; + const size_t *MLocalSize = nullptr; + const size_t *MOffset = nullptr; + bool MSetNumWorkGroups = false; + size_t MDims = 0; +}; + +} // namespace detail +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index d8d46d2a27814..6b2c4f65c7a85 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -362,6 +362,24 @@ class RoundedRangeKernelWithKH { } }; +template ::value> * = nullptr> +auto getRangeRoundedKernelLambda(KernelType KernelFunc, range UserRange) { + return detail::RoundedRangeKernelWithKH{ + UserRange, KernelFunc}; +} + +template ::value> * = nullptr> +auto getRangeRoundedKernelLambda(KernelType KernelFunc, range UserRange) { + return detail::RoundedRangeKernel{ + UserRange, KernelFunc}; +} + using std::enable_if_t; using sycl::detail::queue_impl; @@ -1218,7 +1236,7 @@ class __SYCL_EXPORT handler { if (HasRoundedRange) { using NameWT = typename detail::get_kernel_wrapper_name_t::name; auto Wrapper = - getRangeRoundedKernelLambda( + detail::getRangeRoundedKernelLambda( KernelFunc, UserRange); using KName = std::conditional_t::value, @@ -3265,26 +3283,6 @@ class __SYCL_EXPORT handler { void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor, size_t &MinRange); - template ::value> * = nullptr> - auto getRangeRoundedKernelLambda(KernelType KernelFunc, - range UserRange) { - return detail::RoundedRangeKernelWithKH{UserRange, KernelFunc}; - } - - template ::value> * = nullptr> - auto getRangeRoundedKernelLambda(KernelType KernelFunc, - range UserRange) { - return detail::RoundedRangeKernel{ - UserRange, KernelFunc}; - } - #ifndef __INTEL_PREVIEW_BREAKING_CHANGES const std::shared_ptr &getContextImplPtr() const; #endif diff --git a/sycl/include/sycl/nd_range.hpp b/sycl/include/sycl/nd_range.hpp index e4ff4881be17a..30816b8a4b354 100644 --- a/sycl/include/sycl/nd_range.hpp +++ b/sycl/include/sycl/nd_range.hpp @@ -15,6 +15,10 @@ namespace sycl { inline namespace _V1 { +namespace detail { +class nd_range_view; +} + /// Defines the iteration domain of both the work-groups and the overall /// dispatch. /// @@ -65,6 +69,8 @@ template class nd_range { bool operator!=(const nd_range &rhs) const { return !(*this == rhs); } + + friend class sycl::_V1::detail::nd_range_view; }; } // namespace _V1 diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index f7bff57c2df9a..070880f48682a 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -21,6 +21,7 @@ #include // for checkValueRange #include // for is_queue_info_... #include // for KernelInfo +#include #include #include // for OwnerLessBase #include // for device @@ -40,6 +41,7 @@ #include // for nd_range #include // for property_list #include // for range +#include #include // for sycl::span #include // for size_t @@ -64,18 +66,16 @@ template auto get_native(const SyclObjectT &Obj) -> backend_return_t; -template event __SYCL_EXPORT submit_kernel_direct_with_event_impl( - const queue &Queue, const nd_range &Range, + const queue &Queue, const detail::nd_range_view &RangeView, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); -template void __SYCL_EXPORT submit_kernel_direct_without_event_impl( - const queue &Queue, const nd_range &Range, + const queue &Queue, const detail::nd_range_view &RangeView, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, @@ -162,118 +162,41 @@ class __SYCL_EXPORT SubmissionInfo { } // namespace v1 -template struct TransformUserItemType { + using type = std::conditional_t< + std::is_convertible_v, LambdaArgType>, nd_item, + std::conditional_t, LambdaArgType>, + item, LambdaArgType>>; +}; + +template auto submit_kernel_direct( - const queue &Queue, const nd_range &Range, + const queue &Queue, detail::nd_range_view RangeView, KernelTypeUniversalRef &&KernelFunc, sycl::span DepEvents, const PropertiesT &ExtraProps = ext::oneapi::experimental::empty_properties_t{}, - const detail::code_location &CodeLoc = detail::code_location::current()) { - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - - using KernelType = - std::remove_const_t>; - - using NameT = - typename detail::get_kernel_name_t::name; - - detail::KernelWrapper::wrap(KernelFunc); - - HostKernelRef - HostKernel(std::forward(KernelFunc)); - - // Instantiating the kernel on the host improves debugging. - // Passing this pointer to another translation unit prevents optimization. -#ifndef NDEBUG - // TODO: call library to prevent dropping call due to optimization. - (void) - detail::GetInstantiateKernelOnHostPtr(); -#endif - - detail::DeviceKernelInfo *DeviceKernelInfoPtr = - &detail::getDeviceKernelInfo(); - constexpr auto Info = detail::CompileTimeKernelInfo; - - assert(Info.Name != std::string_view{} && "Kernel must have a name!"); - - static_assert( - Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize, - "Unexpected kernel lambda size. This can be caused by an " - "external host compiler producing a lambda with an " - "unexpected layout. This is a limitation of the compiler." - "In many cases the difference is related to capturing constexpr " - "variables. In such cases removing constexpr specifier aligns the " - "captures between the host compiler and the device compiler." - "\n" - "In case of MSVC, passing " - "-fsycl-host-compiler-options='/std:c++latest' " - "might also help."); - - detail::KernelPropertyHolderStructTy ParsedProperties; - if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< - const KernelType &>::value) { - // Merge properties via get() and manually specified properties. - // get() method is used for specifying kernel properties but properties - // passed via launch_config (ExtraProps) should be kernel launch properties. - // They are mutually exclusive, so there should not be any conflict when - // merging properties. merge_properties() throws if there's a conflict. - auto MergedProps = - sycl::ext::oneapi::experimental::detail::merge_properties( - ExtraProps, - KernelFunc.get(ext::oneapi::experimental::properties_tag{})); - - ParsedProperties = extractKernelProperties(MergedProps); - } else { - ParsedProperties = extractKernelProperties(ExtraProps); - } - - if constexpr (EventNeeded) { - return submit_kernel_direct_with_event_impl( - Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents, - ParsedProperties, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); - } else { - submit_kernel_direct_without_event_impl( - Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents, - ParsedProperties, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); - } -} + const detail::code_location &CodeLoc = detail::code_location::current()); template auto submit_kernel_direct_parallel_for( - const queue &Queue, const nd_range &Range, + const queue &Queue, nd_range Range, KernelTypeUniversalRef &&KernelFunc, sycl::span DepEvents = {}, const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{}, - const detail::code_location &CodeLoc = detail::code_location::current()) { - - using KernelType = - std::remove_const_t>; + const detail::code_location &CodeLoc = detail::code_location::current()); - using LambdaArgType = - sycl::detail::lambda_arg_type>; - static_assert( - std::is_convertible_v, LambdaArgType>, - "Kernel argument of a sycl::parallel_for with sycl::nd_range " - "must be either sycl::nd_item or be convertible from sycl::nd_item"); - using TransformedArgType = sycl::nd_item; - -#ifndef __SYCL_DEVICE_ONLY__ - detail::checkValueRange(Range); -#endif - - return submit_kernel_direct( - Queue, Range, std::forward(KernelFunc), DepEvents, - Props, CodeLoc); -} +template +auto submit_kernel_direct_parallel_for( + const queue &Queue, range Range, KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents = {}, + const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{}, + const detail::code_location &CodeLoc = detail::code_location::current()); template DepEvents = {}, const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{}, - const detail::code_location &CodeLoc = detail::code_location::current()) { - - return submit_kernel_direct( - Queue, nd_range<1>{1, 1}, - std::forward(KernelFunc), DepEvents, Props, - CodeLoc); -} + const detail::code_location &CodeLoc = detail::code_location::current()); } // namespace detail @@ -3984,11 +3899,27 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = std::remove_const_t< + std::remove_reference_t>>>; + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename detail::TransformUserItemType::type>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + return detail::submit_kernel_direct_parallel_for( + *this, Range, Rest..., {}, Properties, TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); + } } /// parallel_for_impl with a kernel represented as a lambda + range that @@ -4018,12 +3949,29 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = std::remove_const_t< + std::remove_reference_t>>>; + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename detail::TransformUserItemType::type>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + return detail::submit_kernel_direct_parallel_for( + *this, Range, Rest..., sycl::span(&DepEvent, 1), + Properties, TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); + } } /// parallel_for_impl with a kernel represented as a lambda + range that @@ -4055,12 +4003,29 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { PropertiesT Properties, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = std::remove_const_t< + std::remove_reference_t>>>; + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename detail::TransformUserItemType::type>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + return detail::submit_kernel_direct_parallel_for( + *this, Range, Rest..., DepEvents, Properties, + TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); + } } /// parallel_for_impl version with a kernel represented as a lambda + range @@ -4102,6 +4067,235 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } }; +namespace detail { + +template +auto submit_kernel_direct(const queue &Queue, detail::nd_range_view RangeView, + KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, + const PropertiesT &ExtraProps, + const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + + using KernelType = + std::remove_const_t>; + + detail::KernelWrapper::wrap(KernelFunc); + + HostKernelRef + HostKernel(std::forward(KernelFunc)); + + // Instantiating the kernel on the host improves debugging. + // Passing this pointer to another translation unit prevents optimization. +#ifndef NDEBUG + // TODO: call library to prevent dropping call due to optimization. + (void) + detail::GetInstantiateKernelOnHostPtr(); +#endif + + detail::DeviceKernelInfo *DeviceKernelInfoPtr = + &detail::getDeviceKernelInfo(); + constexpr auto Info = detail::CompileTimeKernelInfo; + + assert(Info.Name != std::string_view{} && "Kernel must have a name!"); + + static_assert( + Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize, + "Unexpected kernel lambda size. This can be caused by an " + "external host compiler producing a lambda with an " + "unexpected layout. This is a limitation of the compiler." + "In many cases the difference is related to capturing constexpr " + "variables. In such cases removing constexpr specifier aligns the " + "captures between the host compiler and the device compiler." + "\n" + "In case of MSVC, passing " + "-fsycl-host-compiler-options='/std:c++latest' " + "might also help."); + + detail::KernelPropertyHolderStructTy ParsedProperties; + if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< + const KernelType &>::value) { + // Merge properties via get() and manually specified properties. + // get() method is used for specifying kernel properties but properties + // passed via launch_config (ExtraProps) should be kernel launch properties. + // They are mutually exclusive, so there should not be any conflict when + // merging properties. merge_properties() throws if there's a conflict. + auto MergedProps = + sycl::ext::oneapi::experimental::detail::merge_properties( + ExtraProps, + KernelFunc.get(ext::oneapi::experimental::properties_tag{})); + + ParsedProperties = extractKernelProperties(MergedProps); + } else { + ParsedProperties = extractKernelProperties(ExtraProps); + } + + if constexpr (EventNeeded) { + return submit_kernel_direct_with_event_impl( + Queue, RangeView, HostKernel, DeviceKernelInfoPtr, DepEvents, + ParsedProperties, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } else { + submit_kernel_direct_without_event_impl( + Queue, RangeView, HostKernel, DeviceKernelInfoPtr, DepEvents, + ParsedProperties, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } +} + +template +auto submit_kernel_direct_parallel_for(const queue &Queue, nd_range Range, + KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, + const PropertiesT &Props, + const detail::code_location &CodeLoc) { + + using KernelType = + std::remove_const_t>; + using NameT = + typename detail::get_kernel_name_t::name; + + using LambdaArgType = + sycl::detail::lambda_arg_type>; + static_assert( + std::is_convertible_v, LambdaArgType>, + "Kernel argument of a sycl::parallel_for with sycl::nd_range " + "must be either sycl::nd_item or be convertible from sycl::nd_item"); + using TransformedArgType = sycl::nd_item; + +#ifndef __SYCL_DEVICE_ONLY__ + detail::checkValueRange(Range); +#endif + + return submit_kernel_direct( + Queue, detail::nd_range_view(Range), + std::forward(KernelFunc), DepEvents, Props, + CodeLoc); +} + +template +auto submit_kernel_direct_parallel_for(const queue &Queue, range Range, + KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, + const PropertiesT &Props, + const detail::code_location &CodeLoc) { + +#ifndef __SYCL_DEVICE_ONLY__ + if (!range_size_fits_in_size_t(Range)) + throw sycl::exception(make_error_code(errc::runtime), + "The total number of work-items in " + "a range must fit within size_t"); +#endif + + using KernelType = + std::remove_const_t>; + using NameT = + typename detail::get_kernel_name_t::name; + using LambdaArgType = sycl::detail::lambda_arg_type>; + + // If 1D kernel argument is an integral type, convert it to sycl::item<1> + // If user type is convertible from sycl::item/sycl::nd_item, use + // sycl::item/sycl::nd_item to transport item information + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename TransformUserItemType::type>; + + static_assert(!std::is_same_v>, + "Kernel argument cannot have a sycl::nd_item type in " + "sycl::parallel_for with sycl::range"); + + static_assert(std::is_convertible_v, LambdaArgType> || + std::is_convertible_v, LambdaArgType>, + "sycl::parallel_for(sycl::range) kernel must have the " + "first argument of sycl::item type, or of a type which is " + "implicitly convertible from sycl::item"); + + using RefLambdaArgType = std::add_lvalue_reference_t; + static_assert( + (std::is_invocable_v), + "SYCL kernel lambda/functor has an unexpected signature, it should be " + "invocable with sycl::item"); + + // Range rounding can be disabled by the user. + // Range rounding is supported only for newer SYCL standards. +#if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) && \ + SYCL_LANGUAGE_VERSION >= 202012L + auto [RoundedRange, HasRoundedRange] = + detail::getRoundedRange(Range, Queue.get_device()); + if (HasRoundedRange) { + using NameWT = typename detail::get_kernel_wrapper_name_t::name; + auto Wrapper = + detail::getRangeRoundedKernelLambda( + KernelFunc, Range); + + using KTypeWrapper = decltype(Wrapper); + using KName = std::conditional_t::value, + KTypeWrapper, NameWT>; +#ifndef __SYCL_DEVICE_ONLY__ + // We are executing over the rounded range, but there are still + // items/ids that are are constructed in ther range rounded + // kernel use items/ids in the user range, which means that + // __SYCL_ASSUME_INT can still be violated. So check the bounds + // of the user range, instead of the rounded range. + detail::checkValueRange(Range); +#endif + return submit_kernel_direct( + Queue, detail::nd_range_view(Range), std::move(Wrapper), DepEvents, + Props, CodeLoc); + } else +#endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ && + // SYCL_LANGUAGE_VERSION >= 202012L + { +#ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ +#ifndef __SYCL_DEVICE_ONLY__ + detail::checkValueRange(Range); +#endif + return submit_kernel_direct( + Queue, detail::nd_range_view(Range), + std::forward(KernelFunc), DepEvents, Props, + CodeLoc); + +#else + (void)Range; + (void)Props; + (void)KernelFunc; +#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + } +} + +template +auto submit_kernel_direct_single_task(const queue &Queue, + KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, + const PropertiesT &Props, + const detail::code_location &CodeLoc) { + + using KernelType = + std::remove_const_t>; + using NameT = + typename detail::get_kernel_name_t::name; + + return submit_kernel_direct( + Queue, detail::nd_range_view(), + std::forward(KernelFunc), DepEvents, Props, + CodeLoc); +} +} // namespace detail + } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/range_rounding.hpp b/sycl/include/sycl/range_rounding.hpp new file mode 100644 index 0000000000000..cf54004db4d73 --- /dev/null +++ b/sycl/include/sycl/range_rounding.hpp @@ -0,0 +1,161 @@ +//==----------- range_rounding.hpp --- SYCL range rounding utils -----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include // for size_t + +namespace sycl { +inline namespace _V1 { + +namespace detail { + +void __SYCL_EXPORT GetRangeRoundingSettings(size_t &MinFactor, + size_t &GoodFactor, + size_t &MinRange); + +std::tuple, bool> + __SYCL_EXPORT getMaxWorkGroups_v2(const device &Device); + +bool __SYCL_EXPORT DisableRangeRounding(); + +bool __SYCL_EXPORT RangeRoundingTrace(); + +template +std::tuple, bool> getRoundedRange(range UserRange, + const device &Device) { + range RoundedRange = UserRange; + // Disable the rounding-up optimizations under these conditions: + // 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set. + // 2. The kernel is provided via an interoperability method (this uses a + // different code path). + // 3. The range is already a multiple of the rounding factor. + // + // Cases 2 and 3 could be supported with extra effort. + // As an optimization for the common case it is an + // implementation choice to not support those scenarios. + // Note that "this_item" is a free function, i.e. not tied to any + // specific id or item. When concurrent parallel_fors are executing + // on a device it is difficult to tell which parallel_for the call is + // being made from. One could replicate portions of the + // call-graph to make this_item calls kernel-specific but this is + // not considered worthwhile. + + // Perform range rounding if rounding-up is enabled. + if (DisableRangeRounding()) + return {range{}, false}; + + // Range should be a multiple of this for reasonable performance. + size_t MinFactorX = 16; + // Range should be a multiple of this for improved performance. + size_t GoodFactor = 32; + // Range should be at least this to make rounding worthwhile. + size_t MinRangeX = 1024; + + // Check if rounding parameters have been set through environment: + // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange + GetRangeRoundingSettings(MinFactorX, GoodFactor, MinRangeX); + + // In SYCL, each dimension of a global range size is specified by + // a size_t, which can be up to 64 bits. All backends should be + // able to accept a kernel launch with a 32-bit global range size + // (i.e. do not throw an error). The OpenCL CPU backend will + // accept every 64-bit global range, but the GPU backends will not + // generally accept every 64-bit global range. So, when we get a + // non-32-bit global range, we wrap the old kernel in a new kernel + // that has each work item peform multiple invocations the old + // kernel in a 32-bit global range. + id MaxNWGs = [&] { + auto [MaxWGs, HasMaxWGs] = getMaxWorkGroups_v2(Device); + if (!HasMaxWGs) { + id Default; + for (int i = 0; i < Dims; ++i) + Default[i] = (std::numeric_limits::max)(); + return Default; + } + + id IdResult; + size_t Limit = (std::numeric_limits::max)(); + for (int i = 0; i < Dims; ++i) + IdResult[i] = (std::min)(Limit, MaxWGs[Dims - i - 1]); + return IdResult; + }(); + auto M = (std::numeric_limits::max)(); + range MaxRange; + for (int i = 0; i < Dims; ++i) { + auto DesiredSize = MaxNWGs[i] * GoodFactor; + MaxRange[i] = + DesiredSize <= M ? DesiredSize : (M / GoodFactor) * GoodFactor; + } + + bool DidAdjust = false; + auto Adjust = [&](int Dim, size_t Value) { + if (RangeRoundingTrace()) + std::cout << "parallel_for range adjusted at dim " << Dim << " from " + << RoundedRange[Dim] << " to " << Value << std::endl; + RoundedRange[Dim] = Value; + DidAdjust = true; + }; + +#ifdef __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ + size_t GoodExpFactor = 1; + switch (Dims) { + case 1: + GoodExpFactor = 32; // Make global range multiple of {32} + break; + case 2: + GoodExpFactor = 16; // Make global range multiple of {16, 16} + break; + case 3: + GoodExpFactor = 8; // Make global range multiple of {8, 8, 8} + break; + } + + // Check if rounding parameters have been set through environment: + // SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange + GetRangeRoundingSettings(MinFactorX, GoodExpFactor, MinRangeX); + + for (auto i = 0; i < Dims; ++i) + if (UserRange[i] % GoodExpFactor) { + Adjust(i, ((UserRange[i] / GoodExpFactor) + 1) * GoodExpFactor); + } +#else + // Perform range rounding if there are sufficient work-items to + // need rounding and the user-specified range is not a multiple of + // a "good" value. + if (RoundedRange[0] % MinFactorX != 0 && RoundedRange[0] >= MinRangeX) { + // It is sufficient to round up just the first dimension. + // Multiplying the rounded-up value of the first dimension + // by the values of the remaining dimensions (if any) + // will yield a rounded-up value for the total range. + Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor); + } +#endif // __SYCL_EXP_PARALLEL_FOR_RANGE_ROUNDING__ +#ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + // If we are forcing range rounding kernels to be used, we always want the + // rounded range kernel to be generated, even if rounding isn't needed + DidAdjust = true; +#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + + for (int i = 0; i < Dims; ++i) + if (RoundedRange[i] > MaxRange[i]) + Adjust(i, MaxRange[i]); + + if (!DidAdjust) + return {range{}, false}; + return {RoundedRange, true}; +} + +} // namespace detail +} // namespace _V1 +} // namespace sycl \ No newline at end of file diff --git a/sycl/source/detail/ndrange_desc.hpp b/sycl/source/detail/ndrange_desc.hpp index 30108d729db31..5426f2e09d43f 100644 --- a/sycl/source/detail/ndrange_desc.hpp +++ b/sycl/source/detail/ndrange_desc.hpp @@ -32,51 +32,62 @@ class NDRDescT { NDRDescT(const NDRDescT &Desc) = default; NDRDescT(NDRDescT &&Desc) = default; - template - NDRDescT(sycl::range N, bool SetNumWorkGroups) : Dims{size_t(Dims_)} { + NDRDescT(const size_t *N, bool SetNumWorkGroups, int DimsVal) + : Dims{size_t(DimsVal)} { if (SetNumWorkGroups) { - for (size_t I = 0; I < Dims_; ++I) { + for (size_t I = 0; I < Dims; ++I) { NumWorkGroups[I] = N[I]; } } else { - for (size_t I = 0; I < Dims_; ++I) { + for (size_t I = 0; I < Dims; ++I) { GlobalSize[I] = N[I]; } - for (int I = Dims_; I < 3; ++I) { + for (int I = Dims; I < 3; ++I) { GlobalSize[I] = 1; } } } template - NDRDescT(sycl::range NumWorkItems, sycl::range LocalSizes, - sycl::id Offset) - : Dims{size_t(Dims_)} { - for (size_t I = 0; I < Dims_; ++I) { + NDRDescT(sycl::range N, bool SetNumWorkGroups) + : NDRDescT(&(N[0]), SetNumWorkGroups, Dims_) {} + + NDRDescT(const size_t *NumWorkItems, const size_t *LocalSizes, + const size_t *Offset, int DimsVal) + : Dims{size_t(DimsVal)} { + for (size_t I = 0; I < Dims; ++I) { GlobalSize[I] = NumWorkItems[I]; LocalSize[I] = LocalSizes[I]; GlobalOffset[I] = Offset[I]; } - for (int I = Dims_; I < 3; ++I) { + for (int I = Dims; I < 3; ++I) { LocalSize[I] = LocalSizes[0] ? 1 : 0; } - for (int I = Dims_; I < 3; ++I) { + for (int I = Dims; I < 3; ++I) { GlobalSize[I] = 1; } } template - NDRDescT(sycl::range NumWorkItems, sycl::id Offset) - : Dims{size_t(Dims_)} { - for (size_t I = 0; I < Dims_; ++I) { + NDRDescT(sycl::range NumWorkItems, sycl::range LocalSizes, + sycl::id Offset) + : NDRDescT(&(NumWorkItems[0]), &(LocalSizes[0]), &(Offset[0]), Dims_) {} + + NDRDescT(const size_t *NumWorkItems, const size_t *Offset, int DimsVal) + : Dims{size_t(DimsVal)} { + for (size_t I = 0; I < Dims; ++I) { GlobalSize[I] = NumWorkItems[I]; GlobalOffset[I] = Offset[I]; } } + template + NDRDescT(sycl::range NumWorkItems, sycl::id Offset) + : NDRDescT(&(NumWorkItems[0]), &(Offset[0]), Dims_) {} + template NDRDescT(sycl::nd_range ExecutionRange) : NDRDescT(ExecutionRange.get_global_range(), diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 5b7bfb5e90fae..1e01acce7f457 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -126,6 +127,18 @@ prepareSYCLEventAssociatedWithQueue(detail::queue_impl &QueueImpl) { return detail::createSyclObjFromImpl(EventImpl); } +sycl::detail::NDRDescT nd_range_view::toNDRDescT() const { + if (!MGlobalSize) { + return NDRDescT(nd_range<1>{1, 1}); + } else if (MLocalSize) { + return NDRDescT(MGlobalSize, MLocalSize, MOffset, MDims); + } else if (MOffset) { + return NDRDescT(MGlobalSize, MOffset, MDims); + } else { + return NDRDescT(MGlobalSize, MSetNumWorkGroups, MDims); + } +} + const std::vector & queue_impl::getExtendDependencyList(const std::vector &DepEvents, std::vector &MutableVec, @@ -637,6 +650,8 @@ queue_impl::submit_direct(bool CallerNeedsEvent, detail::CG::StorageInitHelper CGData; std::unique_lock Lock(MMutex); + NestedCallsTracker tracker; + // Used by queue_empty() and getLastEvent() MEmpty.store(false, std::memory_order_release); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 7c793b619ecab..87667c4ff1ef1 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -360,29 +360,29 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } - template event submit_kernel_direct_with_event( - const nd_range &Range, detail::HostKernelRefBase &HostKernel, + const detail::nd_range_view &RangeView, + detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl( - NDRDescT{Range}, HostKernel, DeviceKernelInfo, + RangeView.toNDRDescT(), HostKernel, DeviceKernelInfo, /*CallerNeedsEvent*/ true, DepEvents, Props, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - template void submit_kernel_direct_without_event( - const nd_range &Range, detail::HostKernelRefBase &HostKernel, + const detail::nd_range_view &RangeView, + detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - /*CallerNeedsEvent*/ false, DepEvents, Props, - CodeLoc, IsTopCodeLoc); + submit_kernel_direct_impl( + RangeView.toNDRDescT(), HostKernel, DeviceKernelInfo, + /*CallerNeedsEvent*/ false, DepEvents, Props, CodeLoc, IsTopCodeLoc); } void submit_without_event(const detail::type_erased_cgfo_ty &CGF, diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index a58b4b234ab3a..d096f814dd2e4 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -20,8 +20,8 @@ namespace sycl { inline namespace _V1 { -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES namespace detail { +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES SubmissionInfo::SubmissionInfo() : impl{std::make_shared()} {} @@ -58,10 +58,41 @@ const ext::oneapi::experimental::event_mode_enum & SubmissionInfo::EventMode() const { return impl->MEventMode; } -} // namespace detail #endif // __INTEL_PREVIEW_BREAKING_CHANGES +void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor, + size_t &MinRange) { + SYCLConfig::GetSettings( + MinFactor, GoodFactor, MinRange); +} + +std::tuple, bool> +getMaxWorkGroups_v2(const device &Device) { + std::array UrResult = {}; + auto &DeviceImpl = getSyclObjImpl(Device); + + auto Ret = DeviceImpl->getAdapter().call_nocheck( + DeviceImpl->getHandleRef(), + UrInfoCode< + ext::oneapi::experimental::info::device::max_work_groups<3>>::value, + sizeof(UrResult), &UrResult, nullptr); + if (Ret == UR_RESULT_SUCCESS) { + return {UrResult, true}; + } + return {std::array{0, 0, 0}, false}; +} + +bool DisableRangeRounding() { + return SYCLConfig::get(); +} + +bool RangeRoundingTrace() { + return SYCLConfig::get(); +} + +} // namespace detail + queue::queue(const context &SyclContext, const device_selector &DeviceSelector, const async_handler &AsyncHandler, const property_list &PropList) { const std::vector Devs = SyclContext.get_devices(); @@ -463,80 +494,30 @@ void queue::ext_oneapi_set_external_event(const event &external_event) { const property_list &queue::getPropList() const { return impl->getPropList(); } -template event submit_kernel_direct_with_event_impl( - const queue &Queue, const nd_range &Range, + const queue &Queue, const detail::nd_range_view &RangeView, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { return getSyclObjImpl(Queue)->submit_kernel_direct_with_event( - Range, HostKernel, DeviceKernelInfo, DepEvents, Props, CodeLoc, + RangeView, HostKernel, DeviceKernelInfo, DepEvents, Props, CodeLoc, IsTopCodeLoc); } -template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>( - const queue &Queue, const nd_range<1> &Range, - detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - sycl::span DepEvents, - const detail::KernelPropertyHolderStructTy &Props, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); - -template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>( - const queue &Queue, const nd_range<2> &Range, - detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - sycl::span DepEvents, - const detail::KernelPropertyHolderStructTy &Props, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); - -template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>( - const queue &Queue, const nd_range<3> &Range, - detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - sycl::span DepEvents, - const detail::KernelPropertyHolderStructTy &Props, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); - -template void submit_kernel_direct_without_event_impl( - const queue &Queue, const nd_range &Range, + const queue &Queue, const detail::nd_range_view &RangeView, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, sycl::span DepEvents, const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { getSyclObjImpl(Queue)->submit_kernel_direct_without_event( - Range, HostKernel, DeviceKernelInfo, DepEvents, Props, CodeLoc, + RangeView, HostKernel, DeviceKernelInfo, DepEvents, Props, CodeLoc, IsTopCodeLoc); } -template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>( - const queue &Queue, const nd_range<1> &Range, - detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - sycl::span DepEvents, - const detail::KernelPropertyHolderStructTy &Props, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); - -template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>( - const queue &Queue, const nd_range<2> &Range, - detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - sycl::span DepEvents, - const detail::KernelPropertyHolderStructTy &Props, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); - -template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>( - const queue &Queue, const nd_range<3> &Range, - detail::HostKernelRefBase &HostKernel, - detail::DeviceKernelInfo *DeviceKernelInfo, - sycl::span DepEvents, - const detail::KernelPropertyHolderStructTy &Props, - const detail::code_location &CodeLoc, bool IsTopCodeLoc); - } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp index 82f8477a10962..d0770a56696be 100644 --- a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp +++ b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp @@ -23,7 +23,8 @@ int main(int argc, char **argv) { kernel<0> krn0; q.parallel_for(sycl::range<1>{1}, krn0); - assert(copy_count == 1); + // The kernel is copied on the scheduler-based path only + assert(copy_count == 0); assert(move_count == 0); copy_count = 0; diff --git a/sycl/test/abi/layout_nd_range_view.cpp b/sycl/test/abi/layout_nd_range_view.cpp new file mode 100644 index 0000000000000..2a1d0693dab26 --- /dev/null +++ b/sycl/test/abi/layout_nd_range_view.cpp @@ -0,0 +1,19 @@ +// RUN: %clangxx -fsycl -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s +// REQUIRES: linux +// UNSUPPORTED: libcxx + +// clang-format off + +#include + + +SYCL_EXTERNAL void nd_range_view(sycl::detail::nd_range_view) {} +// CHECK: 0 | class sycl::detail::nd_range_view +// CHECK-NEXT: 0 | const size_t * MGlobalSize +// CHECK-NEXT: 8 | const size_t * MLocalSize +// CHECK-NEXT: 16 | const size_t * MOffset +// CHECK-NEXT: 24 | _Bool MSetNumWorkGroups +// CHECK-NEXT: 32 | size_t MDims +// CHECK-NEXT: | [sizeof=40, dsize=40, align=8, +// CHECK-NEXT: | nvsize=40, nvalign=8] diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index e42759431d374..9b7a17a78a54a 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -2985,12 +2985,8 @@ _ZN4sycl3_V121__isgreaterequal_implEdd _ZN4sycl3_V121__isgreaterequal_implEff _ZN4sycl3_V122accelerator_selector_vERKNS0_6deviceE _ZN4sycl3_V128verifyUSMAllocatorPropertiesERKNS0_13property_listE -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoENS0_4spanIKS2_Lm18446744073709551615EEERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNSA_13code_locationEb -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoENS0_4spanIKS2_Lm18446744073709551615EEERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNSA_13code_locationEb -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoENS0_4spanIKS2_Lm18446744073709551615EEERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNSA_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNS9_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNS9_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSK_5intel12experimental12cache_configENSM_17use_root_sync_keyENSM_23work_group_progress_keyENSM_22sub_group_progress_keyENSM_22work_item_progress_keyENSM_4cuda12cluster_sizeILi1EEENSW_ILi2EEENSW_ILi3EEEEEERKNS9_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implERKNS0_5queueERKNS0_6detail13nd_range_viewERNS4_17HostKernelRefBaseEPNS4_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS4_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSI_5intel12experimental12cache_configENSK_17use_root_sync_keyENSK_23work_group_progress_keyENSK_22sub_group_progress_keyENSK_22work_item_progress_keyENSK_4cuda12cluster_sizeILi1EEENSU_ILi2EEENSU_ILi3EEEEEERKNS4_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implERKNS0_5queueERKNS0_6detail13nd_range_viewERNS4_17HostKernelRefBaseEPNS4_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS4_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSI_5intel12experimental12cache_configENSK_17use_root_sync_keyENSK_23work_group_progress_keyENSK_22sub_group_progress_keyENSK_22work_item_progress_keyENSK_4cuda12cluster_sizeILi1EEENSU_ILi2EEENSU_ILi3EEEEEERKNS4_13code_locationEb _ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv _ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE _ZN4sycl3_V13ext5intel12experimental9pipe_base18get_pipe_name_implEPKv @@ -3318,11 +3314,13 @@ _ZN4sycl3_V16detail17HostProfilingInfo3endEv _ZN4sycl3_V16detail17HostProfilingInfo5startEv _ZN4sycl3_V16detail17device_global_map3addEPKvPKc _ZN4sycl3_V16detail17reduComputeWGSizeEmmRm +_ZN4sycl3_V16detail18RangeRoundingTraceEv _ZN4sycl3_V16detail18get_kernel_id_implENS1_11string_viewE _ZN4sycl3_V16detail18make_kernel_bundleEmRKNS0_7contextENS0_12bundle_stateENS0_7backendE _ZN4sycl3_V16detail18make_kernel_bundleEmRKNS0_7contextEbNS0_12bundle_stateENS0_7backendE _ZN4sycl3_V16detail18stringifyErrorCodeEi _ZN4sycl3_V16detail19getDeviceKernelInfoERKNS1_27compile_time_kernel_info_v123CompileTimeKernelInfoTyE +_ZN4sycl3_V16detail19getMaxWorkGroups_v2ERKNS0_6deviceE _ZN4sycl3_V16detail19kernel_bundle_plain21ext_oneapi_get_kernelENS1_11string_viewE _ZN4sycl3_V16detail19kernel_bundle_plain21ext_oneapi_has_kernelENS1_11string_viewE _ZN4sycl3_V16detail19kernel_bundle_plain28ext_oneapi_has_device_globalENS1_11string_viewE @@ -3330,6 +3328,7 @@ _ZN4sycl3_V16detail19kernel_bundle_plain30ext_oneapi_get_raw_kernel_nameENS1_11s _ZN4sycl3_V16detail19kernel_bundle_plain32set_specialization_constant_implEPKcPvm _ZN4sycl3_V16detail19kernel_bundle_plain33ext_oneapi_get_device_global_sizeENS1_11string_viewE _ZN4sycl3_V16detail19kernel_bundle_plain36ext_oneapi_get_device_global_addressENS1_11string_viewERKNS0_6deviceE +_ZN4sycl3_V16detail20DisableRangeRoundingEv _ZN4sycl3_V16detail20associateWithHandlerERNS0_7handlerEPNS1_16AccessorBaseHostENS0_6access6targetE _ZN4sycl3_V16detail20associateWithHandlerERNS0_7handlerEPNS1_28SampledImageAccessorBaseHostENS0_12image_targetE _ZN4sycl3_V16detail20associateWithHandlerERNS0_7handlerEPNS1_30UnsampledImageAccessorBaseHostENS0_12image_targetE @@ -3355,6 +3354,7 @@ _ZN4sycl3_V16detail22reduGetPreferredWGSizeERNS0_7handlerEm _ZN4sycl3_V16detail22reduGetPreferredWGSizeERSt10shared_ptrINS1_10queue_implEEm _ZN4sycl3_V16detail22removeDuplicateDevicesERKSt6vectorINS0_6deviceESaIS3_EE _ZN4sycl3_V16detail23constructorNotificationEPvS2_NS0_6access6targetENS3_4modeERKNS1_13code_locationE +_ZN4sycl3_V16detail24GetRangeRoundingSettingsERmS2_S2_ _ZN4sycl3_V16detail24find_device_intersectionERKSt6vectorINS0_13kernel_bundleILNS0_12bundle_stateE1EEESaIS5_EE _ZN4sycl3_V16detail26createKernelNameBasedCacheEv _ZN4sycl3_V16detail26isDeviceGlobalUsedInKernelEPKv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 1db98f1b5cf9d..0c071dd4d33c6 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -286,12 +286,6 @@ ??$is_image_handle_supported@Usampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVdevice@34@AEBVcontext@34@@Z ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z -??$submit_kernel_direct_with_event_impl@$00@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_with_event_impl@$01@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_with_event_impl@$02@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_without_event_impl@$00@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z -??$submit_kernel_direct_without_event_impl@$01@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z -??$submit_kernel_direct_without_event_impl@$02@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@V?$span@$$CBVevent@_V1@sycl@@$0?0@01@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z ??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z ??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z ??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z @@ -4484,6 +4478,8 @@ ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@_N@Z ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@@Z ?submit_impl_and_postprocess@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@AEBV?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@6@_N@Z +?submit_kernel_direct_with_event_impl@_V1@sycl@@YA?AVevent@12@AEBVqueue@12@AEBVnd_range_view@detail@12@AEAVHostKernelRefBase@612@PEAVDeviceKernelInfo@612@V?$span@$$CBVevent@_V1@sycl@@$0?0@12@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@612@AEBUcode_location@612@_N@Z +?submit_kernel_direct_without_event_impl@_V1@sycl@@YAXAEBVqueue@12@AEBVnd_range_view@detail@12@AEAVHostKernelRefBase@512@PEAVDeviceKernelInfo@512@V?$span@$$CBVevent@_V1@sycl@@$0?0@12@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@512@AEBUcode_location@512@_N@Z ?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@AEBVtype_erased_cgfo_ty@detail@23@AEBVSubmissionInfo@623@AEBUcode_location@623@_N@Z ?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@AEBVtype_erased_cgfo_ty@detail@23@AEBVSubmissionInfo@v1@623@AEBUcode_location@623@_N@Z ?submit_with_event_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBVSubmissionInfo@detail@23@AEBUcode_location@823@_N@Z diff --git a/sycl/test/abi/symbol_size_alignment.cpp b/sycl/test/abi/symbol_size_alignment.cpp index 63286da1b9786..36bf933ce6b28 100644 --- a/sycl/test/abi/symbol_size_alignment.cpp +++ b/sycl/test/abi/symbol_size_alignment.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -58,6 +59,7 @@ int main() { #endif check, 16, 8>(); check(); + check(); check(); #ifdef __SYCL_DEVICE_ONLY__ check, 4, 4>(); diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index 5174c8a29bb6b..2d0cc14cd5c26 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -98,6 +98,7 @@ // CHECK-NEXT: nd_item.hpp // CHECK-NEXT: nd_range.hpp // CHECK-NEXT: detail/id_queries_fit_in_int.hpp +// CHECK-NEXT: detail/nd_range_view.hpp // CHECK-NEXT: detail/optional.hpp // CHECK-NEXT: device.hpp // CHECK-NEXT: detail/string_view.hpp @@ -151,6 +152,7 @@ // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp +// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: feature_test.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp index 76570a99bdda7..846bd0ed4a436 100644 --- a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp @@ -102,6 +102,7 @@ // CHECK-NEXT: nd_item.hpp // CHECK-NEXT: nd_range.hpp // CHECK-NEXT: detail/id_queries_fit_in_int.hpp +// CHECK-NEXT: detail/nd_range_view.hpp // CHECK-NEXT: detail/optional.hpp // CHECK-NEXT: device.hpp // CHECK-NEXT: detail/string_view.hpp @@ -155,5 +156,6 @@ // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp +// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp index 50abdf954cca0..e6b167d5102a0 100644 --- a/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp @@ -180,8 +180,10 @@ // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp // CHECK-NEXT: queue.hpp +// CHECK-NEXT: detail/nd_range_view.hpp // CHECK-NEXT: detail/optional.hpp // CHECK-NEXT: ext/oneapi/experimental/event_mode_property.hpp +// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: usm.hpp // CHECK-NEXT: usm/usm_pointer_info.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp index f2feef5bd9871..b24f3577906ae 100644 --- a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp @@ -122,6 +122,7 @@ // CHECK-NEXT: nd_item.hpp // CHECK-NEXT: nd_range.hpp // CHECK-NEXT: detail/id_queries_fit_in_int.hpp +// CHECK-NEXT: detail/nd_range_view.hpp // CHECK-NEXT: detail/optional.hpp // CHECK-NEXT: event.hpp // CHECK-NEXT: exception_list.hpp @@ -170,6 +171,7 @@ // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp +// CHECK-NEXT: range_rounding.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: usm/usm_pointer_info.hpp // CHECK-NEXT: usm/usm_allocator.hpp diff --git a/sycl/unittests/scheduler/CMakeLists.txt b/sycl/unittests/scheduler/CMakeLists.txt index afc0e185eb7c0..9041793ecdaf2 100644 --- a/sycl/unittests/scheduler/CMakeLists.txt +++ b/sycl/unittests/scheduler/CMakeLists.txt @@ -22,4 +22,5 @@ add_sycl_unittest(SchedulerTests OBJECT AccessorDefaultCtor.cpp HostTaskAndBarrier.cpp BarrierDependencies.cpp + NdRangeViewUsage.cpp ) diff --git a/sycl/unittests/scheduler/NdRangeViewUsage.cpp b/sycl/unittests/scheduler/NdRangeViewUsage.cpp new file mode 100644 index 0000000000000..dd1e0659cb607 --- /dev/null +++ b/sycl/unittests/scheduler/NdRangeViewUsage.cpp @@ -0,0 +1,118 @@ +//==---- NdRangeViewUsage.cpp --- Check nd_range_view ------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include + +#include + +template +void TestNdRangeView(sycl::range global, sycl::range local, + sycl::id offset) { + { + sycl::nd_range nd_range{global, local, offset}; + sycl::detail::nd_range_view r{nd_range}; + ASSERT_EQ(r.MDims, size_t{dims}); + for (int d = 0; d < dims; d++) { + ASSERT_EQ(r.MGlobalSize[d], global[d]); + ASSERT_EQ(r.MLocalSize[d], local[d]); + ASSERT_EQ(r.MOffset[d], offset[d]); + } + + sycl::detail::NDRDescT NDRDesc = r.toNDRDescT(); + ASSERT_EQ(NDRDesc.Dims, size_t{dims}); + for (int d = 0; d < dims; d++) { + ASSERT_EQ(NDRDesc.GlobalSize[d], global[d]); + ASSERT_EQ(NDRDesc.LocalSize[d], local[d]); + ASSERT_EQ(NDRDesc.GlobalOffset[d], offset[d]); + } + + for (int d = dims; d < 3; d++) { + ASSERT_EQ(NDRDesc.GlobalSize[d], 1UL); + ASSERT_EQ(NDRDesc.LocalSize[d], 1UL); + } + } + { + sycl::detail::nd_range_view r{global, offset}; + ASSERT_EQ(r.MDims, size_t{dims}); + for (int d = 0; d < dims; d++) { + ASSERT_EQ(r.MGlobalSize[d], global[d]); + ASSERT_EQ(r.MOffset[d], offset[d]); + } + ASSERT_EQ(r.MLocalSize, nullptr); + + sycl::detail::NDRDescT NDRDesc = r.toNDRDescT(); + ASSERT_EQ(NDRDesc.Dims, size_t{dims}); + for (int d = 0; d < dims; d++) { + ASSERT_EQ(NDRDesc.GlobalSize[d], global[d]); + ASSERT_EQ(NDRDesc.GlobalOffset[d], offset[d]); + } + for (int d = dims; d < 3; d++) { + ASSERT_EQ(NDRDesc.GlobalSize[d], 0UL); + ASSERT_EQ(NDRDesc.GlobalOffset[d], 0UL); + } + for (int d = 0; d < 3; d++) { + ASSERT_EQ(NDRDesc.LocalSize[d], 0UL); + } + } + { + sycl::detail::nd_range_view r{global, true}; + ASSERT_EQ(r.MDims, size_t{dims}); + for (int d = 0; d < dims; d++) { + ASSERT_EQ(r.MGlobalSize[d], global[d]); + ASSERT_EQ(r.MSetNumWorkGroups, true); + } + ASSERT_EQ(r.MLocalSize, nullptr); + ASSERT_EQ(r.MOffset, nullptr); + + sycl::detail::NDRDescT NDRDesc = r.toNDRDescT(); + ASSERT_EQ(NDRDesc.Dims, size_t{dims}); + for (int d = 0; d < dims; d++) { + ASSERT_EQ(NDRDesc.NumWorkGroups[d], global[d]); + } + for (int d = dims; d < 3; d++) { + ASSERT_EQ(NDRDesc.NumWorkGroups[d], 0UL); + } + for (int d = 0; d < 3; d++) { + ASSERT_EQ(NDRDesc.LocalSize[d], 0UL); + ASSERT_EQ(NDRDesc.GlobalSize[d], 0UL); + ASSERT_EQ(NDRDesc.GlobalOffset[d], 0UL); + } + } + { + sycl::detail::nd_range_view r{global, false}; + ASSERT_EQ(r.MDims, size_t{dims}); + for (int d = 0; d < dims; d++) { + ASSERT_EQ(r.MGlobalSize[d], global[d]); + ASSERT_EQ(r.MSetNumWorkGroups, false); + } + ASSERT_EQ(r.MLocalSize, nullptr); + ASSERT_EQ(r.MOffset, nullptr); + + sycl::detail::NDRDescT NDRDesc = r.toNDRDescT(); + ASSERT_EQ(NDRDesc.Dims, size_t{dims}); + for (int d = 0; d < dims; d++) { + ASSERT_EQ(NDRDesc.GlobalSize[d], global[d]); + } + for (int d = dims; d < 3; d++) { + ASSERT_EQ(NDRDesc.GlobalSize[d], 1UL); + } + for (int d = 0; d < 3; d++) { + ASSERT_EQ(NDRDesc.LocalSize[d], 0UL); + ASSERT_EQ(NDRDesc.NumWorkGroups[d], 0UL); + ASSERT_EQ(NDRDesc.GlobalOffset[d], 0UL); + } + } +} + +TEST(RangesRefUsage, RangesRefUsage) { + TestNdRangeView(sycl::range<1>{1024}, sycl::range<1>{64}, sycl::id<1>{10}); + TestNdRangeView(sycl::range<2>{1024, 512}, sycl::range<2>{64, 32}, + sycl::id<2>{10, 5}); + TestNdRangeView(sycl::range<3>{1024, 512, 256}, sycl::range<3>{64, 32, 16}, + sycl::id<3>{10, 5, 2}); +}