diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index a80ddc9feb83f..05b76bd0e1b5f 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -253,23 +256,179 @@ struct KernelWrapper< } }; // KernelWrapper struct -struct KernelLaunchPropertyWrapper { - template - static void parseProperties([[maybe_unused]] PropertyProcessor h, - [[maybe_unused]] const KernelType &KernelFunc) { -#ifndef __SYCL_DEVICE_ONLY__ - // If there are properties provided by get method then process them. - if constexpr (ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod::value) { +// This namespace encapsulates everything related to parsing kernel launch +// properties. +inline namespace kernel_launch_properties_v1 { + +// This struct is used to store kernel launch properties. +// std::optional is used to indicate that the property is not set. +// This struct is used to pass kernel launch properties across the ABI +// boundary. +struct KernelLaunchPropertiesTy { + // Modeled after ur_kernel_cache_config_t + enum class StableKernelCacheConfig : int32_t { + Default = 0, + LargeSLM = 1, + LargeData = 2 + }; + + struct ScopeForwardProgressProperty { + std::optional + Guarantee; + std::optional ExecScope; + std::optional + CoordinationScope; + }; + + std::optional MCacheConfig = std::nullopt; + std::optional MIsCooperative = std::nullopt; + std::optional MWorkGroupMemorySize = std::nullopt; + std::optional MUsesClusterLaunch = std::nullopt; + size_t MClusterDims = 0; + std::array MClusterSize = {0, 0, 0}; + + // Forward progress guarantee properties for work_item, sub_group and + // work_group scopes. We need to store them for validation later. + std::array MForwardProgressProperties; +}; + +template +constexpr KernelLaunchPropertiesTy +processKernelLaunchProperties(PropertiesT Props) { + using namespace sycl::ext::oneapi::experimental; + using namespace sycl::ext::oneapi::experimental::detail; + KernelLaunchPropertiesTy retval; + + // Process Kernel cache configuration property. + { + if constexpr (PropertiesT::template has_property< + sycl::ext::intel::experimental::cache_config_key>()) { + auto Config = Props.template get_property< + sycl::ext::intel::experimental::cache_config_key>(); + if (Config == sycl::ext::intel::experimental::large_slm) { + retval.MCacheConfig = + KernelLaunchPropertiesTy::StableKernelCacheConfig::LargeSLM; + } else if (Config == sycl::ext::intel::experimental::large_data) { + retval.MCacheConfig = + KernelLaunchPropertiesTy::StableKernelCacheConfig::LargeData; + } + } else { + std::ignore = Props; + } + } + + // Process Kernel cooperative property. + { + if constexpr (PropertiesT::template has_property()) + retval.MIsCooperative = true; + } - h->template processProperties< - detail::CompileTimeKernelInfo.IsESIMD>( - KernelFunc.get(ext::oneapi::experimental::properties_tag{})); + // Process device progress properties. + { + if constexpr (PropertiesT::template has_property< + work_group_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[0].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[0].ExecScope = + execution_scope::work_group; + retval.MForwardProgressProperties[0].CoordinationScope = + prop.coordinationScope; } -#endif + if constexpr (PropertiesT::template has_property< + sub_group_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[1].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[1].ExecScope = + execution_scope::sub_group; + retval.MForwardProgressProperties[1].CoordinationScope = + prop.coordinationScope; + } + if constexpr (PropertiesT::template has_property< + work_item_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[2].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[2].ExecScope = + execution_scope::work_item; + retval.MForwardProgressProperties[2].CoordinationScope = + prop.coordinationScope; + } + } + + // Process work group scratch memory property. + { + if constexpr (PropertiesT::template has_property< + work_group_scratch_size>()) { + auto WorkGroupMemSize = + Props.template get_property(); + retval.MWorkGroupMemorySize = WorkGroupMemSize.size; + } + } + + // Parse cluster properties. + { + constexpr std::size_t ClusterDim = getClusterDim(); + if constexpr (ClusterDim > 0) { + static_assert(ClusterDim <= 3, + "Only 1D, 2D, and 3D cluster launch is supported."); + + auto ClusterSize = + Props.template get_property>() + .get_cluster_size(); + + retval.MUsesClusterLaunch = true; + retval.MClusterDims = ClusterDim; + + for (size_t dim = 0; dim < ClusterDim; dim++) + retval.MClusterSize[dim] = ClusterSize[dim]; + } + } + + return retval; +} + +/// Note: it is important that this function *does not* depend on kernel +/// name or kernel type, because then it will be instantiated for every +/// kernel, even though body of those instantiated functions could be almost +/// the same, thus unnecessary increasing compilation time. +template +constexpr KernelLaunchPropertiesTy processKernelProperties(PropertiesT Props) { + static_assert(ext::oneapi::experimental::is_property_list::value, + "Template type is not a property list."); + static_assert( + !PropertiesT::template has_property< + sycl::ext::intel::experimental::fp_control_key>() || + (PropertiesT::template has_property< + sycl::ext::intel::experimental::fp_control_key>() && + IsESIMDKernel), + "Floating point control property is supported for ESIMD kernels only."); + static_assert( + !PropertiesT::template has_property< + sycl::ext::oneapi::experimental::indirectly_callable_key>(), + "indirectly_callable property cannot be applied to SYCL kernels"); + + return processKernelLaunchProperties(Props); +} + +// Returns KernelLaunchPropertiesTy or std::nullopt based on whether the +// kernel functor has a get method that returns properties. +template +constexpr std::optional +parseProperties([[maybe_unused]] const KernelType &KernelFunc) { +#ifndef __SYCL_DEVICE_ONLY__ + // If there are properties provided by get method then process them. + if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< + const KernelType &>::value) { + + return processKernelProperties( + KernelFunc.get(ext::oneapi::experimental::properties_tag{})); } -}; // KernelLaunchPropertyWrapper struct +#endif + // If there are no properties provided by get method then return empty + // optional. + return std::nullopt; +} +} // namespace kernel_launch_properties_v1 } // namespace detail } // namespace _V1 diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index e393bd626d4d6..9aaf817d99973 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -259,17 +259,12 @@ template void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { - // TODO The handler-less path does not support reductions, kernel - // function properties and kernel functions with the kernel_handler - // type argument yet. + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(ReductionsT) == 0 && - !(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - detail::submit_kernel_direct(std::move(Q), empty_properties_t{}, - Range, KernelObj); + detail::submit_kernel_direct(std::move(Q), Range, KernelObj); } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, @@ -296,13 +291,25 @@ template void nd_launch(queue Q, launch_config, Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { - // TODO This overload of the nd_launch function takes the kernel function - // properties, which are not yet supported for the handler-less path, - // so it only supports handler based submission for now - submit(std::move(Q), [&](handler &CGH) { - nd_launch(CGH, Config, KernelObj, - std::forward(Reductions)...); - }); + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(ReductionsT) == 0 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item>::value)) { + + ext::oneapi::experimental::detail::LaunchConfigAccess, + Properties> + LaunchConfigAccess(Config); + + detail::submit_kernel_direct( + std::move(Q), LaunchConfigAccess.getRange(), KernelObj, + LaunchConfigAccess.getProperties()); + } else { + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Config, KernelObj, + std::forward(Reductions)...); + }); + } } template diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 67f21bc05857f..30629f0c5589e 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -877,6 +877,7 @@ class __SYCL_EXPORT handler { } } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void verifyDeviceHasProgressGuarantee( sycl::ext::oneapi::experimental::forward_progress_guarantee guarantee, sycl::ext::oneapi::experimental::execution_scope threadScope, @@ -901,64 +902,9 @@ class __SYCL_EXPORT handler { /// Stores information about kernel properties into the handler. template void processLaunchProperties(PropertiesT Props) { - if constexpr (PropertiesT::template has_property< - sycl::ext::intel::experimental::cache_config_key>()) { - auto Config = Props.template get_property< - sycl::ext::intel::experimental::cache_config_key>(); - if (Config == sycl::ext::intel::experimental::large_slm) { - setKernelCacheConfig(StableKernelCacheConfig::LargeSLM); - } else if (Config == sycl::ext::intel::experimental::large_data) { - setKernelCacheConfig(StableKernelCacheConfig::LargeData); - } - } else { - (void)Props; - } - - constexpr bool UsesRootSync = PropertiesT::template has_property< - sycl::ext::oneapi::experimental::use_root_sync_key>(); - if (UsesRootSync) { - setKernelIsCooperative(UsesRootSync); - } - if constexpr (PropertiesT::template has_property< - sycl::ext::oneapi::experimental:: - work_group_progress_key>()) { - auto prop = Props.template get_property< - sycl::ext::oneapi::experimental::work_group_progress_key>(); - verifyDeviceHasProgressGuarantee( - prop.guarantee, - sycl::ext::oneapi::experimental::execution_scope::work_group, - prop.coordinationScope); - } - if constexpr (PropertiesT::template has_property< - sycl::ext::oneapi::experimental:: - sub_group_progress_key>()) { - auto prop = Props.template get_property< - sycl::ext::oneapi::experimental::sub_group_progress_key>(); - verifyDeviceHasProgressGuarantee( - prop.guarantee, - sycl::ext::oneapi::experimental::execution_scope::sub_group, - prop.coordinationScope); - } - if constexpr (PropertiesT::template has_property< - sycl::ext::oneapi::experimental:: - work_item_progress_key>()) { - auto prop = Props.template get_property< - sycl::ext::oneapi::experimental::work_item_progress_key>(); - verifyDeviceHasProgressGuarantee( - prop.guarantee, - sycl::ext::oneapi::experimental::execution_scope::work_item, - prop.coordinationScope); - } - - if constexpr (PropertiesT::template has_property< - sycl::ext::oneapi::experimental:: - work_group_scratch_size>()) { - auto WorkGroupMemSize = Props.template get_property< - sycl::ext::oneapi::experimental::work_group_scratch_size>(); - setKernelWorkGroupMem(WorkGroupMemSize.size); - } - - checkAndSetClusterRange(Props); + detail::KernelLaunchPropertiesTy ParsedProp = + detail::processKernelLaunchProperties(Props); + setKernelLaunchProperties(ParsedProp); } /// Process kernel properties. @@ -973,23 +919,11 @@ class __SYCL_EXPORT handler { bool IsESIMDKernel, typename PropertiesT = ext::oneapi::experimental::empty_properties_t> void processProperties(PropertiesT Props) { - static_assert( - ext::oneapi::experimental::is_property_list::value, - "Template type is not a property list."); - static_assert( - !PropertiesT::template has_property< - sycl::ext::intel::experimental::fp_control_key>() || - (PropertiesT::template has_property< - sycl::ext::intel::experimental::fp_control_key>() && - IsESIMDKernel), - "Floating point control property is supported for ESIMD kernels only."); - static_assert( - !PropertiesT::template has_property< - sycl::ext::oneapi::experimental::indirectly_callable_key>(), - "indirectly_callable property cannot be applied to SYCL kernels"); - - processLaunchProperties(Props); + detail::KernelLaunchPropertiesTy ParsedProp = + detail::processKernelProperties(Props); + setKernelLaunchProperties(ParsedProp); } +#endif // INTEL_PREVIEW_BREAKING_CHANGES /// Checks whether it is possible to copy the source shape to the destination /// shape(the shapes are described by the accessor ranges) by using @@ -1297,8 +1231,10 @@ class __SYCL_EXPORT handler { decltype(Wrapper), TransformedArgType, PropertiesT>::wrap(Wrapper); - detail::KernelLaunchPropertyWrapper::parseProperties(this, - Wrapper); + if (auto prop = detail::parseProperties(Wrapper)) { + setKernelLaunchProperties(*prop); + } + #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); // We are executing over the rounded range, but there are still @@ -1322,11 +1258,15 @@ class __SYCL_EXPORT handler { // kernel is generated detail::KernelWrapper::wrap(KernelFunc); - detail::KernelLaunchPropertyWrapper::parseProperties(this, - KernelFunc); + if (auto prop = + detail::parseProperties(KernelFunc)) { + setKernelLaunchProperties(*prop); + } #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); - processProperties(Props); + detail::KernelLaunchPropertiesTy ProcessedProps = + detail::processKernelProperties(Props); + setKernelLaunchProperties(ProcessedProps); detail::checkValueRange(UserRange); setNDRangeDescriptor(std::move(UserRange)); StoreLambda( @@ -1355,7 +1295,9 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NumWorkItems); setNDRangeDescriptor(std::move(NumWorkItems)); - processLaunchProperties(Props); + detail::KernelLaunchPropertiesTy ParsedProp = + detail::processKernelLaunchProperties(Props); + setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); #endif } @@ -1378,7 +1320,9 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NDRange); setNDRangeDescriptor(std::move(NDRange)); - processLaunchProperties(Props); + detail::KernelLaunchPropertiesTy ParsedProp = + detail::processKernelLaunchProperties(Props); + setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); #endif } @@ -1395,12 +1339,13 @@ class __SYCL_EXPORT handler { using NameT = typename detail::get_kernel_name_t::name; (void)Props; + constexpr auto Info = detail::CompileTimeKernelInfo; detail::KernelWrapper::wrap(KernelFunc); - detail::KernelLaunchPropertyWrapper::parseProperties(this, - KernelFunc); + if (auto prop = detail::parseProperties(KernelFunc)) { + setKernelLaunchProperties(*prop); + } #ifndef __SYCL_DEVICE_ONLY__ - constexpr auto Info = detail::CompileTimeKernelInfo; if constexpr (WrapAsVal == detail::WrapAs::single_task) { throwOnKernelParameterMisuse(Info); } @@ -1416,7 +1361,9 @@ class __SYCL_EXPORT handler { } StoreLambda(std::move(KernelFunc)); - processProperties(Props); + detail::KernelLaunchPropertiesTy ProcessedProps = + detail::processKernelProperties(Props); + setKernelLaunchProperties(ProcessedProps); #endif } @@ -1439,8 +1386,9 @@ class __SYCL_EXPORT handler { (void)Kernel; detail::KernelWrapper::wrap(KernelFunc); - detail::KernelLaunchPropertyWrapper::parseProperties(this, - KernelFunc); + if (auto prop = detail::parseProperties(KernelFunc)) { + setKernelLaunchProperties(*prop); + } #ifndef __SYCL_DEVICE_ONLY__ constexpr auto Info = detail::CompileTimeKernelInfo; if constexpr (WrapAsVal == detail::WrapAs::single_task) { @@ -1467,7 +1415,9 @@ class __SYCL_EXPORT handler { "the kernel name must match the name of the lambda"); } StoreLambda(std::move(KernelFunc)); - processProperties(Props); + detail::KernelLaunchPropertiesTy ProcessedProps = + detail::processKernelProperties(Props); + setKernelLaunchProperties(ProcessedProps); #endif } #endif // __INTEL_PREVIEW_BREAKING_CHANGES @@ -3490,7 +3440,9 @@ class __SYCL_EXPORT handler { bool IsDeviceImageScoped, size_t NumBytes, size_t Offset); - // Changing values in this will break ABI/API. +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + // Modeled after ur_kernel_cache_config_t + // Used as an argument to setKernelCacheConfig that's part of the ABI. enum class StableKernelCacheConfig : int32_t { Default = 0, LargeSLM = 1, @@ -3503,15 +3455,17 @@ class __SYCL_EXPORT handler { void setKernelIsCooperative(bool); // Set using cuda thread block cluster launch flag and set the launch bounds. -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims); -#endif void setKernelClusterLaunch(sycl::range<3> ClusterSize); void setKernelClusterLaunch(sycl::range<2> ClusterSize); void setKernelClusterLaunch(sycl::range<1> ClusterSize); // Set the request work group memory size (work_group_static ext). void setKernelWorkGroupMem(size_t Size); +#endif + + void setKernelLaunchProperties( + const detail::KernelLaunchPropertiesTy &KernelLaunchProperties); // Various checks that are only meaningful for host compilation, because they // result in runtime errors (i.e. exceptions being thrown). To save time @@ -3668,7 +3622,6 @@ class __SYCL_EXPORT handler { void instantiateKernelOnHost(void *InstantiateKernelOnHostPtr); friend class detail::HandlerAccess; - friend struct detail::KernelLaunchPropertyWrapper; #ifdef __INTEL_PREVIEW_BREAKING_CHANGES __SYCL_DLL_LOCAL detail::handler_impl *get_impl() { return impl; } diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index b04fac17a6f9c..b422964f3a3a1 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -157,16 +157,15 @@ template r, range<1> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - // TODO The handler-less path does not support kernel function properties - // and kernel functions with the kernel_handler type argument yet. + // TODO The handler-less path does not support kernel functions with the + // kernel_handler type argument yet. if constexpr (!(ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod< const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<1>>::value)) { - detail::submit_kernel_direct( - q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<1>(r, size), std::forward(k)); + detail::submit_kernel_direct(q, nd_range<1>(r, size), + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -178,16 +177,12 @@ template r, range<2> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - // TODO The handler-less path does not support kernel function properties - // and kernel functions with the kernel_handler type argument yet. - if constexpr (!(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && - !(detail::KernelLambdaHasKernelHandlerArgT< + // TODO The handler-less path does not support kernel functions with the + // kernel_handler type argument yet. + if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<2>>::value)) { - detail::submit_kernel_direct( - q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<2>(r, size), std::forward(k)); + detail::submit_kernel_direct(q, nd_range<2>(r, size), + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -199,16 +194,12 @@ template r, range<3> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - // TODO The handler-less path does not support kernel function properties - // and kernel functions with the kernel_handler type argument yet. - if constexpr (!(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && - !(detail::KernelLambdaHasKernelHandlerArgT< + // TODO The handler-less path does not support kernel functions with the + // kernel_handler type argument yet. + if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<3>>::value)) { - detail::submit_kernel_direct( - q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<3>(r, size), std::forward(k)); + detail::submit_kernel_direct(q, nd_range<3>(r, size), + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a1c7e16ccfe07..b6da07ed9e9a5 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -67,6 +67,7 @@ event __SYCL_EXPORT submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -74,6 +75,7 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); namespace detail { @@ -157,17 +159,13 @@ class __SYCL_EXPORT SubmissionInfo { } // namespace v1 template + typename PropertiesT = ext::oneapi::experimental::empty_properties_t, + typename KernelTypeUniversalRef, int Dims> auto submit_kernel_direct( - const queue &Queue, PropertiesT Props, const nd_range &Range, + const queue &Queue, const nd_range &Range, KernelTypeUniversalRef &&KernelFunc, + PropertiesT ExtraProps = ext::oneapi::experimental::empty_properties_t{}, const detail::code_location &CodeLoc = detail::code_location::current()) { - // TODO Properties not supported yet - (void)Props; - static_assert( - std::is_same_v, - "Setting properties not supported yet for no-CGH kernel submit."); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = @@ -189,6 +187,28 @@ auto submit_kernel_direct( HostKernelRef HostKernel(std::forward(KernelFunc)); + // Get Kernel Launch properties. User can specify properties either + // via specifying get(property_tag{}) method in kernel type or by using + // launch_config API or by explicitly passing them in call to + // parallel_for (deprecated API). + // ExtraProps are properties passed explicitly or via launch_config. + + // Assumption: If user specify properties via launch_config or explicitly + // then we don't check for properties specified via get() method. + KernelLaunchPropertiesTy parsedProps; + if constexpr (std::is_same_v) { + // Use properties passed via. get() method. + if constexpr (ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod::value) { + auto prop = KernelFunc.get(ext::oneapi::experimental::properties_tag{}); + parsedProps = detail::processKernelProperties(prop); + } + } else { + // Use ExtraProps + parsedProps = detail::processKernelProperties(ExtraProps); + } + // Instantiating the kernel on the host improves debugging. // Passing this pointer to another translation unit prevents optimization. #ifndef NDEBUG @@ -202,11 +222,11 @@ auto submit_kernel_direct( if constexpr (EventNeeded) { return submit_kernel_direct_with_event_impl( - Queue, Range, HostKernel, DeviceKernelInfoPtr, + Queue, Range, HostKernel, DeviceKernelInfoPtr, parsedProps, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } else { submit_kernel_direct_without_event_impl( - Queue, Range, HostKernel, DeviceKernelInfoPtr, + Queue, Range, HostKernel, DeviceKernelInfoPtr, parsedProps, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } } @@ -3256,11 +3276,24 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = std::tuple_element_t<0, std::tuple>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item>::value)) { + + // FIXME: Can it happen that user defined both get() and properties? + // If so, we should use MergedProperties instead of Properties here. + return detail::submit_kernel_direct( + *this, Range, Rest..., Properties, TlsCodeLocCapture.query()); + } else + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -3277,18 +3310,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; - // TODO The handler-less path does not support reductions, kernel - // function properties and kernel functions with the kernel_handler - // type argument yet. + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(RestT) == 1 && - !(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct( - *this, ext::oneapi::experimental::empty_properties_t{}, Range, - Rest..., TlsCodeLocCapture.query()); + *this, Range, Rest..., + ext::oneapi::experimental::empty_properties_t{}, + TlsCodeLocCapture.query()); } else { return submit( [&](handler &CGH) { diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index 7ba849dc33f1f..f78845e223057 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -8,12 +8,14 @@ #pragma once +#include #include #include #include #include #include +#include #include @@ -107,6 +109,7 @@ class KernelData { void setDeviceKernelInfoPtr(DeviceKernelInfo *Ptr) { MDeviceKernelInfoPtr = Ptr; } + #ifndef __INTEL_PREVIEW_BREAKING_CHANGES void setKernelInfo(void *KernelFuncPtr, int KernelNumArgs, KernelParamDescGetterT KernelParamDescGetter, @@ -134,6 +137,7 @@ class KernelData { return MDeviceKernelInfoPtr->usesAssert(); } + // Kernel launch properties getter and setters. ur_kernel_cache_config_t getKernelCacheConfig() const { return MKernelCacheConfig; } @@ -163,6 +167,88 @@ class KernelData { MKernelWorkGroupMemorySize = Size; } + void validateAndSetKernelLaunchProperties( + const detail::KernelLaunchPropertiesTy &Kprop, bool HasGraph, + const device_impl &dev) { + using execScope = ext::oneapi::experimental::execution_scope; + + // Validate properties before setting. + if (HasGraph) { + if (Kprop.MWorkGroupMemorySize) { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "Setting work group scratch memory size is not yet supported " + "for use with the SYCL Graph extension."); + } + + if (Kprop.MUsesClusterLaunch && *Kprop.MUsesClusterLaunch) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Cluster launch is not yet supported " + "for use with the SYCL Graph extension."); + } + } + + // Validate and set forward progress guarantees. + for (int i = 0; i < 3; i++) { + if (Kprop.MForwardProgressProperties[i].Guarantee.has_value()) { + + if (!dev.supportsForwardProgress( + *Kprop.MForwardProgressProperties[i].Guarantee, + *Kprop.MForwardProgressProperties[i].ExecScope, + *Kprop.MForwardProgressProperties[i].CoordinationScope)) { + throw sycl::exception( + sycl::make_error_code(errc::feature_not_supported), + "The device associated with the queue does not support the " + "requested forward progress guarantee."); + } + + auto execScope = *Kprop.MForwardProgressProperties[i].ExecScope; + // If we are here, the device supports the guarantee required but there + // is a caveat in that if the guarantee required is a concurrent + // guarantee, then we most likely also need to enable cooperative launch + // of the kernel. That is, although the device supports the required + // guarantee, some setup work is needed to truly make the device provide + // that guarantee at runtime. Otherwise, we will get the default + // guarantee which is weaker than concurrent. Same reasoning applies for + // sub_group but not for work_item. + // TODO: Further design work is probably needed to reflect this behavior + // in Unified Runtime. + if ((execScope == execScope::work_group || + execScope == execScope::sub_group) && + (*Kprop.MForwardProgressProperties[i].Guarantee == + ext::oneapi::experimental::forward_progress_guarantee:: + concurrent)) { + setCooperative(true); + } + } + } + + if (Kprop.MIsCooperative) + setCooperative(*Kprop.MIsCooperative); + + if (Kprop.MCacheConfig) { + // KernelLaunchPropertiesTy::StableKernelCacheConfig is modeled after + // ur_kernel_cache_config_t, so this cast is safe. + setKernelCacheConfig( + static_cast(*Kprop.MCacheConfig)); + } + + if (Kprop.MWorkGroupMemorySize) + setKernelWorkGroupMemorySize(*Kprop.MWorkGroupMemorySize); + + if (Kprop.MUsesClusterLaunch && *Kprop.MUsesClusterLaunch) { + if (Kprop.MClusterDims == 1) + setClusterDimensions(sycl::range<1>{Kprop.MClusterSize[0]}); + else if (Kprop.MClusterDims == 2) + setClusterDimensions( + sycl::range<2>{Kprop.MClusterSize[0], Kprop.MClusterSize[1]}); + else if (Kprop.MClusterDims == 3) + setClusterDimensions(sycl::range<3>{Kprop.MClusterSize[0], + Kprop.MClusterSize[1], + Kprop.MClusterSize[2]}); + } + } + KernelNameStrRefT getKernelName() const { assert(MDeviceKernelInfoPtr); return static_cast(MDeviceKernelInfoPtr->Name); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 2b18a4fb6e28f..2881ff60583da 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -567,6 +567,7 @@ EventImplPtr queue_impl::submit_command_to_graph( EventImplPtr queue_impl::submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; @@ -575,6 +576,12 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( KData.setKernelFunc(HostKernel.getPtr()); KData.setNDRDesc(NDRDesc); + // Validate and set kernel launch properties. + KData.validateAndSetKernelLaunchProperties( + Props, getCommandGraph() != nullptr, /*HasGraph?*/ + getDeviceImpl() /*device_impl*/ + ); + auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData, bool SchedulerBypass) -> EventImplPtr { if (SchedulerBypass) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 49da7aee8c448..f57297d9f3b35 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -363,10 +363,11 @@ class queue_impl : public std::enable_shared_from_this { event submit_kernel_direct_with_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - true, CodeLoc, IsTopCodeLoc); + true, Props, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } @@ -374,9 +375,10 @@ class queue_impl : public std::enable_shared_from_this { void submit_kernel_direct_without_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - false, CodeLoc, IsTopCodeLoc); + false, Props, CodeLoc, IsTopCodeLoc); } void submit_without_event(const detail::type_erased_cgfo_ty &CGF, @@ -929,6 +931,7 @@ class queue_impl : public std::enable_shared_from_this { EventImplPtr submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 26477c99be62c..07ec95f8ed39b 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1752,10 +1752,12 @@ static bool checkContextSupports(detail::context_impl &ContextImpl, return SupportsOp; } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void handler::verifyDeviceHasProgressGuarantee( sycl::ext::oneapi::experimental::forward_progress_guarantee guarantee, sycl::ext::oneapi::experimental::execution_scope threadScope, sycl::ext::oneapi::experimental::execution_scope coordinationScope) { + using execution_scope = sycl::ext::oneapi::experimental::execution_scope; using forward_progress = sycl::ext::oneapi::experimental::forward_progress_guarantee; @@ -1797,6 +1799,7 @@ void handler::verifyDeviceHasProgressGuarantee( } } } +#endif bool handler::supportsUSMMemcpy2D() { if (impl->get_graph_or_null()) @@ -1910,6 +1913,13 @@ void handler::memcpyFromHostOnlyDeviceGlobal(void *Dest, }); } +void handler::setKernelLaunchProperties( + const detail::KernelLaunchPropertiesTy &Kprop) { + impl->MKernelData.validateAndSetKernelLaunchProperties( + Kprop, getCommandGraph() != nullptr /*hasGraph?*/, + impl->get_device() /*device_impl*/); +} + #ifndef __INTEL_PREVIEW_BREAKING_CHANGES const std::shared_ptr & handler::getContextImplPtr() const { @@ -1927,6 +1937,7 @@ detail::context_impl &handler::getContextImpl() const { return impl->get_queue().getContextImpl(); } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void handler::setKernelCacheConfig(handler::StableKernelCacheConfig Config) { switch (Config) { case handler::StableKernelCacheConfig::Default: @@ -1945,7 +1956,6 @@ void handler::setKernelIsCooperative(bool KernelIsCooperative) { impl->MKernelData.setCooperative(KernelIsCooperative); } -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims) { throwIfGraphAssociated< syclex::detail::UnsupportedGraphFeatures:: @@ -1961,7 +1971,6 @@ void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims) { impl->MKernelData.setClusterDimensions(ClusterSize); } } -#endif void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize) { throwIfGraphAssociated< @@ -1989,6 +1998,7 @@ void handler::setKernelWorkGroupMem(size_t Size) { sycl_ext_oneapi_work_group_scratch_memory>(); impl->MKernelData.setKernelWorkGroupMemorySize(Size); } +#endif // __INTEL_PREVIEW_BREAKING_CHANGES void handler::ext_oneapi_graph( ext::oneapi::experimental::command_graph< diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index f34da47852266..c97798b80bdba 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -476,27 +476,31 @@ event submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { return getSyclObjImpl(Queue)->submit_kernel_direct_with_event( - Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); + Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); } template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -504,27 +508,31 @@ void submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { getSyclObjImpl(Queue)->submit_kernel_direct_without_event( - Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); + Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); } template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); } // namespace _V1 diff --git a/sycl/test/extensions/properties/non_esimd_kernel_fp_control.cpp b/sycl/test/extensions/properties/non_esimd_kernel_fp_control.cpp index 46d11eccdfe54..e6910484bf52f 100644 --- a/sycl/test/extensions/properties/non_esimd_kernel_fp_control.cpp +++ b/sycl/test/extensions/properties/non_esimd_kernel_fp_control.cpp @@ -20,7 +20,7 @@ struct ESIMDKernel { int main(void) { queue q; - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.+}}: Floating point control property is supported for ESIMD kernels only.}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.+}}: Floating point control property is supported for ESIMD kernels only.}} syclex::properties properties7{ intelex::fp_control}; @@ -28,7 +28,7 @@ int main(void) { cgh.single_task(properties7, [=]() {}); }); - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.+}}: Floating point control property is supported for ESIMD kernels only.}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.+}}: Floating point control property is supported for ESIMD kernels only.}} ESIMDKernel Kern; q.submit([&](handler &cgh) { cgh.parallel_for(range<1>(1), Kern); }); diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index f4c33d1ed938f..ab84fc91c3260 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -138,6 +138,8 @@ // CHECK-NEXT: detail/kernel_launch_helper.hpp // CHECK-NEXT: ext/intel/experimental/fp_control_kernel_properties.hpp // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp +// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp +// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp // CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp // CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp @@ -149,9 +151,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp // CHECK-NEXT: ext/oneapi/interop_common.hpp // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp -// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp -// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp // CHECK-NEXT: feature_test.hpp diff --git a/sycl/test/virtual-functions/properties-negative.cpp b/sycl/test/virtual-functions/properties-negative.cpp index b8e1b75f1d9a9..0ef06b3652ad1 100644 --- a/sycl/test/virtual-functions/properties-negative.cpp +++ b/sycl/test/virtual-functions/properties-negative.cpp @@ -17,15 +17,15 @@ int main() { oneapi::properties props_int{oneapi::indirectly_callable_in}; oneapi::properties props_user{oneapi::indirectly_callable_in}; - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} q.single_task(props_empty, [=]() {}); // When both "props_empty" and "props_void" are in use, we won't see the // static assert firing for the second one, because there will be only one // instantiation of handler::processProperties. q.single_task(props_void, [=]() {}); - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} q.single_task(props_int, [=]() {}); - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} q.single_task(props_user, [=]() {}); return 0;