From 399c0a55eed5000afe3c070ccc2fdc7a0daa9205 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Mon, 13 Oct 2025 23:27:55 +0200 Subject: [PATCH 1/4] Add support for kernel launch properties in no-handler path --- .../sycl/detail/kernel_launch_helper.hpp | 215 +++++++++++++++++- .../oneapi/experimental/enqueue_functions.hpp | 37 +-- sycl/include/sycl/handler.hpp | 151 +++++------- .../sycl/khr/free_function_commands.hpp | 37 ++- sycl/include/sycl/queue.hpp | 80 +++++-- sycl/source/detail/kernel_data.hpp | 66 ++++++ sycl/source/detail/queue_impl.cpp | 7 + sycl/source/detail/queue_impl.hpp | 7 +- sycl/source/handler.cpp | 57 ++--- sycl/source/queue.cpp | 12 +- .../non_esimd_kernel_fp_control.cpp | 4 +- .../include_deps/sycl_detail_core.hpp.cpp | 4 +- .../virtual-functions/properties-negative.cpp | 6 +- 13 files changed, 470 insertions(+), 213 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index a80ddc9feb83f..781a992cef03f 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -253,21 +256,219 @@ struct KernelWrapper< } }; // KernelWrapper struct -struct KernelLaunchPropertyWrapper { - template - static void parseProperties([[maybe_unused]] PropertyProcessor h, - [[maybe_unused]] const KernelType &KernelFunc) { +// This struct is inherited by sycl::handler. +class KernelLaunchPropertyWrapper { +public: + // This struct is used to store kernel launch properties. + // std::optional is used to indicate that the property is not set. + // In some code paths, kernel launch properties are set multiple times + // for the same kernel, that is why using std::optional to avoid overriding + // previously set properties. + struct KernelLaunchPropertiesT { + + struct ScopeForwardProgressProperty { + std::optional + Guarantee; + std::optional ExecScope; + std::optional + CoordinationScope; + }; + + std::optional MCacheConfig = std::nullopt; + std::optional MIsCooperative = std::nullopt; + std::optional MWorkGroupMemorySize = std::nullopt; + std::optional MUsesClusterLaunch = std::nullopt; + size_t MClusterDims = 0; + std::array MClusterSize = {0, 0, 0}; + + // Forward progress guarantee properties for work_item, sub_group and + // work_group scopes. + // Indexed by ExecutionScope enum. + std::array MForwardProgressProperties; + + KernelLaunchPropertiesT() = default; + + // TODO: Do you even need this? + KernelLaunchPropertiesT( + ur_kernel_cache_config_t _CacheConfig, bool _IsCooperative, + uint32_t _WorkGroupMemorySize, bool _UsesClusterLaunch, + size_t _ClusterDims, std::array _ClusterSize, + std::array _ForwardProgressProperties) + : MCacheConfig(_CacheConfig), MIsCooperative(_IsCooperative), + MWorkGroupMemorySize(_WorkGroupMemorySize), + MUsesClusterLaunch(_UsesClusterLaunch), MClusterDims(_ClusterDims), + MClusterSize(_ClusterSize), + MForwardProgressProperties(_ForwardProgressProperties) {} + }; // struct KernelLaunchPropertiesT + + /// Process runtime kernel properties. + /// + /// Stores information about kernel properties into the handler. + template + static KernelLaunchPropertiesT + processKernelLaunchProperties(PropertiesT Props) { + using namespace sycl::ext::oneapi::experimental; + using namespace sycl::ext::oneapi::experimental::detail; + KernelLaunchPropertiesT retval; + + // Process Kernel cache configuration property. + { + if constexpr (PropertiesT::template has_property< + sycl::ext::intel::experimental::cache_config_key>()) { + auto Config = Props.template get_property< + sycl::ext::intel::experimental::cache_config_key>(); + if (Config == sycl::ext::intel::experimental::large_slm) { + retval.MCacheConfig = UR_KERNEL_CACHE_CONFIG_LARGE_SLM; + } else if (Config == sycl::ext::intel::experimental::large_data) { + retval.MCacheConfig = UR_KERNEL_CACHE_CONFIG_LARGE_DATA; + } + } else { + std::ignore = Props; + } + } + + // Process Kernel cooperative property. + { + if constexpr (PropertiesT::template has_property()) + retval.MIsCooperative = true; + } + + // Process device progress properties. + { + using forward_progress = + sycl::ext::oneapi::experimental::forward_progress_guarantee; + if constexpr (PropertiesT::template has_property< + work_group_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[0].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[0].ExecScope = + execution_scope::work_group; + retval.MForwardProgressProperties[0].CoordinationScope = + prop.coordinationScope; + + // If we are here, the device supports the guarantee required but there + // is a caveat in that if the guarantee required is a concurrent + // guarantee, then we most likely also need to enable cooperative launch + // of the kernel. That is, although the device supports the required + // guarantee, some setup work is needed to truly make the device provide + // that guarantee at runtime. Otherwise, we will get the default + // guarantee which is weaker than concurrent. Same reasoning applies for + // sub_group but not for work_item. + // TODO: Further design work is probably needed to reflect this behavior + // in Unified Runtime. + if constexpr (prop.guarantee == forward_progress::concurrent) + retval.MIsCooperative = true; + } + if constexpr (PropertiesT::template has_property< + sub_group_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[1].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[1].ExecScope = + execution_scope::sub_group; + retval.MForwardProgressProperties[1].CoordinationScope = + prop.coordinationScope; + + // Same reasoning as above for work_group applies here. + if constexpr (prop.guarantee == forward_progress::concurrent) + retval.MIsCooperative = true; + } + if constexpr (PropertiesT::template has_property< + work_item_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[2].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[2].ExecScope = + execution_scope::work_item; + retval.MForwardProgressProperties[2].CoordinationScope = + prop.coordinationScope; + } + } + + // Process work group scratch memory property. + { + if constexpr (PropertiesT::template has_property< + work_group_scratch_size>()) { + auto WorkGroupMemSize = + Props.template get_property(); + retval.MWorkGroupMemorySize = WorkGroupMemSize.size; + } + } + + // Parse cluster properties. + { + constexpr std::size_t ClusterDim = getClusterDim(); + if constexpr (ClusterDim > 0) { + + auto ClusterSize = + Props.template get_property>() + .get_cluster_size(); + retval.MUsesClusterLaunch = true; + retval.MClusterDims = ClusterDim; + if (ClusterDim == 1) { + retval.MClusterSize[0] = ClusterSize[0]; + } else if (ClusterDim == 2) { + retval.MClusterSize[0] = ClusterSize[0]; + retval.MClusterSize[1] = ClusterSize[1]; + } else if (ClusterDim == 3) { + retval.MClusterSize[0] = ClusterSize[0]; + retval.MClusterSize[1] = ClusterSize[1]; + retval.MClusterSize[2] = ClusterSize[2]; + } else { + assert(ClusterDim <= 3 && + "Only 1D, 2D, and 3D cluster launch is supported."); + } + } + } + + return retval; + } + + /// Process kernel properties. + /// + /// Stores information about kernel properties into the handler. + /// + /// Note: it is important that this function *does not* depend on kernel + /// name or kernel type, because then it will be instantiated for every + /// kernel, even though body of those instantiated functions could be almost + /// the same, thus unnecessary increasing compilation time. + template < + bool IsESIMDKernel, + typename PropertiesT = ext::oneapi::experimental::empty_properties_t> + static KernelLaunchPropertiesT processKernelProperties(PropertiesT Props) { + static_assert( + ext::oneapi::experimental::is_property_list::value, + "Template type is not a property list."); + static_assert( + !PropertiesT::template has_property< + sycl::ext::intel::experimental::fp_control_key>() || + (PropertiesT::template has_property< + sycl::ext::intel::experimental::fp_control_key>() && + IsESIMDKernel), + "Floating point control property is supported for ESIMD kernels only."); + static_assert( + !PropertiesT::template has_property< + sycl::ext::oneapi::experimental::indirectly_callable_key>(), + "indirectly_callable property cannot be applied to SYCL kernels"); + + return processKernelLaunchProperties(Props); + } + + // Returns KernelLaunchPropertiesT or std::nullopt based on whether the + // kernel functor has a get method that returns properties. + template + static std::optional + parseProperties([[maybe_unused]] const KernelType &KernelFunc) { #ifndef __SYCL_DEVICE_ONLY__ // If there are properties provided by get method then process them. if constexpr (ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod::value) { - h->template processProperties< - detail::CompileTimeKernelInfo.IsESIMD>( + return processKernelProperties( KernelFunc.get(ext::oneapi::experimental::properties_tag{})); } #endif + // If there are no properties provided by get method then return empty + // optional. + return std::nullopt; } }; // KernelLaunchPropertyWrapper struct diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index e393bd626d4d6..49aa3a252682e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -259,17 +259,12 @@ template void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { - // TODO The handler-less path does not support reductions, kernel - // function properties and kernel functions with the kernel_handler - // type argument yet. + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(ReductionsT) == 0 && - !(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - detail::submit_kernel_direct(std::move(Q), empty_properties_t{}, - Range, KernelObj); + detail::submit_kernel_direct(std::move(Q), Range, KernelObj); } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, @@ -296,13 +291,25 @@ template void nd_launch(queue Q, launch_config, Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { - // TODO This overload of the nd_launch function takes the kernel function - // properties, which are not yet supported for the handler-less path, - // so it only supports handler based submission for now - submit(std::move(Q), [&](handler &CGH) { - nd_launch(CGH, Config, KernelObj, - std::forward(Reductions)...); - }); + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(ReductionsT) == 0 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item>::value)) { + + ext::oneapi::experimental::detail::LaunchConfigAccess, + Properties> + ConfigAccess(Config); + + detail::submit_kernel_direct(std::move(Q), + ConfigAccess.getRange(), KernelObj, + Config.getProperties()); + } else { + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Config, KernelObj, + std::forward(Reductions)...); + }); + } } template diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 67f21bc05857f..78d58c80d6b4f 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -877,6 +877,7 @@ class __SYCL_EXPORT handler { } } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void verifyDeviceHasProgressGuarantee( sycl::ext::oneapi::experimental::forward_progress_guarantee guarantee, sycl::ext::oneapi::experimental::execution_scope threadScope, @@ -901,64 +902,10 @@ class __SYCL_EXPORT handler { /// Stores information about kernel properties into the handler. template void processLaunchProperties(PropertiesT Props) { - if constexpr (PropertiesT::template has_property< - sycl::ext::intel::experimental::cache_config_key>()) { - auto Config = Props.template get_property< - sycl::ext::intel::experimental::cache_config_key>(); - if (Config == sycl::ext::intel::experimental::large_slm) { - setKernelCacheConfig(StableKernelCacheConfig::LargeSLM); - } else if (Config == sycl::ext::intel::experimental::large_data) { - setKernelCacheConfig(StableKernelCacheConfig::LargeData); - } - } else { - (void)Props; - } - - constexpr bool UsesRootSync = PropertiesT::template has_property< - sycl::ext::oneapi::experimental::use_root_sync_key>(); - if (UsesRootSync) { - setKernelIsCooperative(UsesRootSync); - } - if constexpr (PropertiesT::template has_property< - sycl::ext::oneapi::experimental:: - work_group_progress_key>()) { - auto prop = Props.template get_property< - sycl::ext::oneapi::experimental::work_group_progress_key>(); - verifyDeviceHasProgressGuarantee( - prop.guarantee, - sycl::ext::oneapi::experimental::execution_scope::work_group, - prop.coordinationScope); - } - if constexpr (PropertiesT::template has_property< - sycl::ext::oneapi::experimental:: - sub_group_progress_key>()) { - auto prop = Props.template get_property< - sycl::ext::oneapi::experimental::sub_group_progress_key>(); - verifyDeviceHasProgressGuarantee( - prop.guarantee, - sycl::ext::oneapi::experimental::execution_scope::sub_group, - prop.coordinationScope); - } - if constexpr (PropertiesT::template has_property< - sycl::ext::oneapi::experimental:: - work_item_progress_key>()) { - auto prop = Props.template get_property< - sycl::ext::oneapi::experimental::work_item_progress_key>(); - verifyDeviceHasProgressGuarantee( - prop.guarantee, - sycl::ext::oneapi::experimental::execution_scope::work_item, - prop.coordinationScope); - } - - if constexpr (PropertiesT::template has_property< - sycl::ext::oneapi::experimental:: - work_group_scratch_size>()) { - auto WorkGroupMemSize = Props.template get_property< - sycl::ext::oneapi::experimental::work_group_scratch_size>(); - setKernelWorkGroupMem(WorkGroupMemSize.size); - } - - checkAndSetClusterRange(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = + detail::KernelLaunchPropertyWrapper::processKernelLaunchProperties< + PropertiesT>(Props); + setKernelLaunchProperties(ParsedProp); } /// Process kernel properties. @@ -973,23 +920,12 @@ class __SYCL_EXPORT handler { bool IsESIMDKernel, typename PropertiesT = ext::oneapi::experimental::empty_properties_t> void processProperties(PropertiesT Props) { - static_assert( - ext::oneapi::experimental::is_property_list::value, - "Template type is not a property list."); - static_assert( - !PropertiesT::template has_property< - sycl::ext::intel::experimental::fp_control_key>() || - (PropertiesT::template has_property< - sycl::ext::intel::experimental::fp_control_key>() && - IsESIMDKernel), - "Floating point control property is supported for ESIMD kernels only."); - static_assert( - !PropertiesT::template has_property< - sycl::ext::oneapi::experimental::indirectly_callable_key>(), - "indirectly_callable property cannot be applied to SYCL kernels"); - - processLaunchProperties(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = + detail::KernelLaunchPropertyWrapper::processKernelProperties< + IsESIMDKernel>(Props); + setKernelLaunchProperties(ParsedProp); } +#endif // INTEL_PREVIEW_BREAKING_CHANGES /// Checks whether it is possible to copy the source shape to the destination /// shape(the shapes are described by the accessor ranges) by using @@ -1297,8 +1233,11 @@ class __SYCL_EXPORT handler { decltype(Wrapper), TransformedArgType, PropertiesT>::wrap(Wrapper); - detail::KernelLaunchPropertyWrapper::parseProperties(this, - Wrapper); + if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties< + KName, Info.IsESIMD>(Wrapper)) { + setKernelLaunchProperties(*prop); + } + #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); // We are executing over the rounded range, but there are still @@ -1322,11 +1261,17 @@ class __SYCL_EXPORT handler { // kernel is generated detail::KernelWrapper::wrap(KernelFunc); - detail::KernelLaunchPropertyWrapper::parseProperties(this, - KernelFunc); + if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties< + NameT, Info.IsESIMD>(KernelFunc)) { + setKernelLaunchProperties(*prop); + } #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); - processProperties(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT + ProcessedProps = + detail::KernelLaunchPropertyWrapper::processKernelProperties< + Info.IsESIMD, PropertiesT>(Props); + setKernelLaunchProperties(ProcessedProps); detail::checkValueRange(UserRange); setNDRangeDescriptor(std::move(UserRange)); StoreLambda( @@ -1355,7 +1300,10 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NumWorkItems); setNDRangeDescriptor(std::move(NumWorkItems)); - processLaunchProperties(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = + detail::KernelLaunchPropertyWrapper::processKernelLaunchProperties< + PropertiesT>(Props); + setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); #endif } @@ -1378,7 +1326,10 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NDRange); setNDRangeDescriptor(std::move(NDRange)); - processLaunchProperties(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = + detail::KernelLaunchPropertyWrapper::processKernelLaunchProperties< + PropertiesT>(Props); + setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); #endif } @@ -1395,12 +1346,14 @@ class __SYCL_EXPORT handler { using NameT = typename detail::get_kernel_name_t::name; (void)Props; + constexpr auto Info = detail::CompileTimeKernelInfo; detail::KernelWrapper::wrap(KernelFunc); - detail::KernelLaunchPropertyWrapper::parseProperties(this, - KernelFunc); + if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties< + NameT, Info.IsESIMD>(KernelFunc)) { + setKernelLaunchProperties(*prop); + } #ifndef __SYCL_DEVICE_ONLY__ - constexpr auto Info = detail::CompileTimeKernelInfo; if constexpr (WrapAsVal == detail::WrapAs::single_task) { throwOnKernelParameterMisuse(Info); } @@ -1416,7 +1369,11 @@ class __SYCL_EXPORT handler { } StoreLambda(std::move(KernelFunc)); - processProperties(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT + ProcessedProps = + detail::KernelLaunchPropertyWrapper::processKernelProperties< + Info.IsESIMD, PropertiesT>(Props); + setKernelLaunchProperties(ProcessedProps); #endif } @@ -1439,8 +1396,10 @@ class __SYCL_EXPORT handler { (void)Kernel; detail::KernelWrapper::wrap(KernelFunc); - detail::KernelLaunchPropertyWrapper::parseProperties(this, - KernelFunc); + if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties( + KernelFunc)) { + setKernelLaunchProperties(*prop); + } #ifndef __SYCL_DEVICE_ONLY__ constexpr auto Info = detail::CompileTimeKernelInfo; if constexpr (WrapAsVal == detail::WrapAs::single_task) { @@ -1467,7 +1426,11 @@ class __SYCL_EXPORT handler { "the kernel name must match the name of the lambda"); } StoreLambda(std::move(KernelFunc)); - processProperties(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT + ProcessedProps = + detail::KernelLaunchPropertyWrapper::processKernelProperties< + Info.IsESIMD, PropertiesT>(Props); + setKernelLaunchProperties(ProcessedProps); #endif } #endif // __INTEL_PREVIEW_BREAKING_CHANGES @@ -3490,7 +3453,9 @@ class __SYCL_EXPORT handler { bool IsDeviceImageScoped, size_t NumBytes, size_t Offset); - // Changing values in this will break ABI/API. +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + // Modeled after ur_kernel_cache_config_t + // Used as an argument to setKernelCacheConfig that's part of the ABI. enum class StableKernelCacheConfig : int32_t { Default = 0, LargeSLM = 1, @@ -3503,15 +3468,18 @@ class __SYCL_EXPORT handler { void setKernelIsCooperative(bool); // Set using cuda thread block cluster launch flag and set the launch bounds. -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims); -#endif void setKernelClusterLaunch(sycl::range<3> ClusterSize); void setKernelClusterLaunch(sycl::range<2> ClusterSize); void setKernelClusterLaunch(sycl::range<1> ClusterSize); // Set the request work group memory size (work_group_static ext). void setKernelWorkGroupMem(size_t Size); +#endif + + void setKernelLaunchProperties( + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT + &KernelLaunchProperties); // Various checks that are only meaningful for host compilation, because they // result in runtime errors (i.e. exceptions being thrown). To save time @@ -3668,7 +3636,6 @@ class __SYCL_EXPORT handler { void instantiateKernelOnHost(void *InstantiateKernelOnHostPtr); friend class detail::HandlerAccess; - friend struct detail::KernelLaunchPropertyWrapper; #ifdef __INTEL_PREVIEW_BREAKING_CHANGES __SYCL_DLL_LOCAL detail::handler_impl *get_impl() { return impl; } diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index b04fac17a6f9c..b422964f3a3a1 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -157,16 +157,15 @@ template r, range<1> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - // TODO The handler-less path does not support kernel function properties - // and kernel functions with the kernel_handler type argument yet. + // TODO The handler-less path does not support kernel functions with the + // kernel_handler type argument yet. if constexpr (!(ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod< const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<1>>::value)) { - detail::submit_kernel_direct( - q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<1>(r, size), std::forward(k)); + detail::submit_kernel_direct(q, nd_range<1>(r, size), + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -178,16 +177,12 @@ template r, range<2> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - // TODO The handler-less path does not support kernel function properties - // and kernel functions with the kernel_handler type argument yet. - if constexpr (!(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && - !(detail::KernelLambdaHasKernelHandlerArgT< + // TODO The handler-less path does not support kernel functions with the + // kernel_handler type argument yet. + if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<2>>::value)) { - detail::submit_kernel_direct( - q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<2>(r, size), std::forward(k)); + detail::submit_kernel_direct(q, nd_range<2>(r, size), + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -199,16 +194,12 @@ template r, range<3> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - // TODO The handler-less path does not support kernel function properties - // and kernel functions with the kernel_handler type argument yet. - if constexpr (!(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && - !(detail::KernelLambdaHasKernelHandlerArgT< + // TODO The handler-less path does not support kernel functions with the + // kernel_handler type argument yet. + if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<3>>::value)) { - detail::submit_kernel_direct( - q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<3>(r, size), std::forward(k)); + detail::submit_kernel_direct(q, nd_range<3>(r, size), + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a1c7e16ccfe07..6553fda0ed709 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -67,6 +67,7 @@ event __SYCL_EXPORT submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -74,6 +75,7 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); namespace detail { @@ -157,17 +159,13 @@ class __SYCL_EXPORT SubmissionInfo { } // namespace v1 template + typename PropertiesT = ext::oneapi::experimental::empty_properties_t, + typename KernelTypeUniversalRef, int Dims> auto submit_kernel_direct( - const queue &Queue, PropertiesT Props, const nd_range &Range, + const queue &Queue, const nd_range &Range, KernelTypeUniversalRef &&KernelFunc, + PropertiesT ExtraProps = ext::oneapi::experimental::empty_properties_t{}, const detail::code_location &CodeLoc = detail::code_location::current()) { - // TODO Properties not supported yet - (void)Props; - static_assert( - std::is_same_v, - "Setting properties not supported yet for no-CGH kernel submit."); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = @@ -189,6 +187,32 @@ auto submit_kernel_direct( HostKernelRef HostKernel(std::forward(KernelFunc)); + // Get Kernel Launch properties. User can specify properties either + // via specifying get(property_tag{}) method in kernel type or by using + // launch_config API or by explicitly passing them in call to + // parallel_for (deprecated API). + // ExtraProps are properties passed explicitly or via launch_config. + + // Asumption: If user specify properties via launch_config or explicitly + // then we don't check for properties specified via get() method. + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT parsedProps; + if constexpr (std::is_same_v) { + // Use properties passed via. get() method. + if constexpr (ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod::value) { + auto prop = KernelFunc.get(ext::oneapi::experimental::properties_tag{}); + parsedProps = + detail::KernelLaunchPropertyWrapper::processKernelProperties( + prop); + } + } else { + // Use ExtraProps + parsedProps = + detail::KernelLaunchPropertyWrapper::processKernelProperties( + ExtraProps); + } + // Instantiating the kernel on the host improves debugging. // Passing this pointer to another translation unit prevents optimization. #ifndef NDEBUG @@ -202,11 +226,11 @@ auto submit_kernel_direct( if constexpr (EventNeeded) { return submit_kernel_direct_with_event_impl( - Queue, Range, HostKernel, DeviceKernelInfoPtr, + Queue, Range, HostKernel, DeviceKernelInfoPtr, parsedProps, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } else { submit_kernel_direct_without_event_impl( - Queue, Range, HostKernel, DeviceKernelInfoPtr, + Queue, Range, HostKernel, DeviceKernelInfoPtr, parsedProps, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } } @@ -3256,11 +3280,24 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = std::tuple_element_t<0, std::tuple>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item>::value)) { + + // FIXME: Can it happen that user defined both get() and properties? + // If so, we should use MergedProperties instead of Properties here. + return detail::submit_kernel_direct( + *this, Range, Rest..., Properties, TlsCodeLocCapture.query()); + } else + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -3277,18 +3314,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; - // TODO The handler-less path does not support reductions, kernel - // function properties and kernel functions with the kernel_handler - // type argument yet. + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(RestT) == 1 && - !(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct( - *this, ext::oneapi::experimental::empty_properties_t{}, Range, - Rest..., TlsCodeLocCapture.query()); + *this, Range, Rest..., + ext::oneapi::experimental::empty_properties_t{}, + TlsCodeLocCapture.query()); } else { return submit( [&](handler &CGH) { diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index 7ba849dc33f1f..037341b9c0691 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -13,7 +13,10 @@ #include #include +#include + #include +#include #include @@ -107,6 +110,7 @@ class KernelData { void setDeviceKernelInfoPtr(DeviceKernelInfo *Ptr) { MDeviceKernelInfoPtr = Ptr; } + #ifndef __INTEL_PREVIEW_BREAKING_CHANGES void setKernelInfo(void *KernelFuncPtr, int KernelNumArgs, KernelParamDescGetterT KernelParamDescGetter, @@ -134,6 +138,7 @@ class KernelData { return MDeviceKernelInfoPtr->usesAssert(); } + // Kernel launch properties getter and setters. ur_kernel_cache_config_t getKernelCacheConfig() const { return MKernelCacheConfig; } @@ -163,6 +168,67 @@ class KernelData { MKernelWorkGroupMemorySize = Size; } + void validateAndSetKernelLaunchProperties( + const KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Kprop, + bool HasGraph, const device_impl &dev) { + + // Validate properties before setting. + { + if (HasGraph) { + if (Kprop.MWorkGroupMemorySize) { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "Setting work group scratch memory size is not yet supported " + "for use with the SYCL Graph extension."); + } + + if (Kprop.MUsesClusterLaunch && *Kprop.MUsesClusterLaunch) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Cluster launch is not yet supported " + "for use with the SYCL Graph extension."); + } + } + + for (int i = 0; i < 3; i++) { + if (Kprop.MForwardProgressProperties[i].Guarantee.has_value()) { + + if (!dev.supportsForwardProgress( + *Kprop.MForwardProgressProperties[i].Guarantee, + *Kprop.MForwardProgressProperties[i].ExecScope, + *Kprop.MForwardProgressProperties[i].CoordinationScope)) { + // TODO: Make the error message more descriptive. + throw sycl::exception( + sycl::make_error_code(errc::feature_not_supported), + "The device associated with the queue does not support the " + "requested forward progress guarantee."); + } + } + } + } + + // Set properties. + if (Kprop.MIsCooperative) + setCooperative(*Kprop.MIsCooperative); + + if (Kprop.MCacheConfig) + setKernelCacheConfig(*Kprop.MCacheConfig); + + if (Kprop.MWorkGroupMemorySize) + setKernelWorkGroupMemorySize(*Kprop.MWorkGroupMemorySize); + + if (Kprop.MUsesClusterLaunch && *Kprop.MUsesClusterLaunch) { + if (Kprop.MClusterDims == 1) + setClusterDimensions(sycl::range<1>{Kprop.MClusterSize[0]}); + else if (Kprop.MClusterDims == 2) + setClusterDimensions( + sycl::range<2>{Kprop.MClusterSize[0], Kprop.MClusterSize[1]}); + else if (Kprop.MClusterDims == 3) + setClusterDimensions(sycl::range<3>{Kprop.MClusterSize[0], + Kprop.MClusterSize[1], + Kprop.MClusterSize[2]}); + } + } + KernelNameStrRefT getKernelName() const { assert(MDeviceKernelInfoPtr); return static_cast(MDeviceKernelInfoPtr->Name); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 2b18a4fb6e28f..621045c567834 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -567,6 +567,7 @@ EventImplPtr queue_impl::submit_command_to_graph( EventImplPtr queue_impl::submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; @@ -575,6 +576,12 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( KData.setKernelFunc(HostKernel.getPtr()); KData.setNDRDesc(NDRDesc); + // Validate and set kernel launch properties. + KData.validateAndSetKernelLaunchProperties( + Props, getCommandGraph() != nullptr, /*HasGraph?*/ + getDeviceImpl() /*device_impl*/ + ); + auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData, bool SchedulerBypass) -> EventImplPtr { if (SchedulerBypass) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 49da7aee8c448..4528752ba95f4 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -363,10 +363,11 @@ class queue_impl : public std::enable_shared_from_this { event submit_kernel_direct_with_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - true, CodeLoc, IsTopCodeLoc); + true, Props, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } @@ -374,9 +375,10 @@ class queue_impl : public std::enable_shared_from_this { void submit_kernel_direct_without_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - false, CodeLoc, IsTopCodeLoc); + false, Props, CodeLoc, IsTopCodeLoc); } void submit_without_event(const detail::type_erased_cgfo_ty &CGF, @@ -929,6 +931,7 @@ class queue_impl : public std::enable_shared_from_this { EventImplPtr submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 26477c99be62c..2aacd56fda15d 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1752,51 +1752,17 @@ static bool checkContextSupports(detail::context_impl &ContextImpl, return SupportsOp; } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void handler::verifyDeviceHasProgressGuarantee( sycl::ext::oneapi::experimental::forward_progress_guarantee guarantee, sycl::ext::oneapi::experimental::execution_scope threadScope, sycl::ext::oneapi::experimental::execution_scope coordinationScope) { - using execution_scope = sycl::ext::oneapi::experimental::execution_scope; - using forward_progress = - sycl::ext::oneapi::experimental::forward_progress_guarantee; - const bool supported = impl->get_device().supportsForwardProgress( - guarantee, threadScope, coordinationScope); - if (threadScope == execution_scope::work_group) { - if (!supported) { - throw sycl::exception( - sycl::errc::feature_not_supported, - "Required progress guarantee for work groups is not " - "supported by this device."); - } - // If we are here, the device supports the guarantee required but there is a - // caveat in that if the guarantee required is a concurrent guarantee, then - // we most likely also need to enable cooperative launch of the kernel. That - // is, although the device supports the required guarantee, some setup work - // is needed to truly make the device provide that guarantee at runtime. - // Otherwise, we will get the default guarantee which is weaker than - // concurrent. Same reasoning applies for sub_group but not for work_item. - // TODO: Further design work is probably needed to reflect this behavior in - // Unified Runtime. - if (guarantee == forward_progress::concurrent) - setKernelIsCooperative(true); - } else if (threadScope == execution_scope::sub_group) { - if (!supported) { - throw sycl::exception(sycl::errc::feature_not_supported, - "Required progress guarantee for sub groups is not " - "supported by this device."); - } - // Same reasoning as above. - if (guarantee == forward_progress::concurrent) - setKernelIsCooperative(true); - } else { // threadScope is execution_scope::work_item otherwise undefined - // behavior - if (!supported) { - throw sycl::exception(sycl::errc::feature_not_supported, - "Required progress guarantee for work items is not " - "supported by this device."); - } - } + + // FIXME! + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT Kprop; + setKernelLaunchProperties(Kprop); } +#endif bool handler::supportsUSMMemcpy2D() { if (impl->get_graph_or_null()) @@ -1910,6 +1876,13 @@ void handler::memcpyFromHostOnlyDeviceGlobal(void *Dest, }); } +void handler::setKernelLaunchProperties( + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Kprop) { + impl->MKernelData.validateAndSetKernelLaunchProperties( + Kprop, getCommandGraph() != nullptr /*hasGraph?*/, + impl->get_device() /*device_impl*/); +} + #ifndef __INTEL_PREVIEW_BREAKING_CHANGES const std::shared_ptr & handler::getContextImplPtr() const { @@ -1927,6 +1900,7 @@ detail::context_impl &handler::getContextImpl() const { return impl->get_queue().getContextImpl(); } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void handler::setKernelCacheConfig(handler::StableKernelCacheConfig Config) { switch (Config) { case handler::StableKernelCacheConfig::Default: @@ -1945,7 +1919,6 @@ void handler::setKernelIsCooperative(bool KernelIsCooperative) { impl->MKernelData.setCooperative(KernelIsCooperative); } -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims) { throwIfGraphAssociated< syclex::detail::UnsupportedGraphFeatures:: @@ -1961,7 +1934,6 @@ void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims) { impl->MKernelData.setClusterDimensions(ClusterSize); } } -#endif void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize) { throwIfGraphAssociated< @@ -1989,6 +1961,7 @@ void handler::setKernelWorkGroupMem(size_t Size) { sycl_ext_oneapi_work_group_scratch_memory>(); impl->MKernelData.setKernelWorkGroupMemorySize(Size); } +#endif // __INTEL_PREVIEW_BREAKING_CHANGES void handler::ext_oneapi_graph( ext::oneapi::experimental::command_graph< diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index f34da47852266..73a53a01abda2 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -476,27 +476,31 @@ event submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { return getSyclObjImpl(Queue)->submit_kernel_direct_with_event( - Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); + Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); } template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -504,27 +508,31 @@ void submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { getSyclObjImpl(Queue)->submit_kernel_direct_without_event( - Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); + Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); } template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); } // namespace _V1 diff --git a/sycl/test/extensions/properties/non_esimd_kernel_fp_control.cpp b/sycl/test/extensions/properties/non_esimd_kernel_fp_control.cpp index 46d11eccdfe54..e6910484bf52f 100644 --- a/sycl/test/extensions/properties/non_esimd_kernel_fp_control.cpp +++ b/sycl/test/extensions/properties/non_esimd_kernel_fp_control.cpp @@ -20,7 +20,7 @@ struct ESIMDKernel { int main(void) { queue q; - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.+}}: Floating point control property is supported for ESIMD kernels only.}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.+}}: Floating point control property is supported for ESIMD kernels only.}} syclex::properties properties7{ intelex::fp_control}; @@ -28,7 +28,7 @@ int main(void) { cgh.single_task(properties7, [=]() {}); }); - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.+}}: Floating point control property is supported for ESIMD kernels only.}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.+}}: Floating point control property is supported for ESIMD kernels only.}} ESIMDKernel Kern; q.submit([&](handler &cgh) { cgh.parallel_for(range<1>(1), Kern); }); diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index f4c33d1ed938f..ab84fc91c3260 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -138,6 +138,8 @@ // CHECK-NEXT: detail/kernel_launch_helper.hpp // CHECK-NEXT: ext/intel/experimental/fp_control_kernel_properties.hpp // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp +// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp +// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp // CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp // CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp @@ -149,9 +151,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp // CHECK-NEXT: ext/oneapi/interop_common.hpp // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp -// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp -// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp // CHECK-NEXT: feature_test.hpp diff --git a/sycl/test/virtual-functions/properties-negative.cpp b/sycl/test/virtual-functions/properties-negative.cpp index b8e1b75f1d9a9..0ef06b3652ad1 100644 --- a/sycl/test/virtual-functions/properties-negative.cpp +++ b/sycl/test/virtual-functions/properties-negative.cpp @@ -17,15 +17,15 @@ int main() { oneapi::properties props_int{oneapi::indirectly_callable_in}; oneapi::properties props_user{oneapi::indirectly_callable_in}; - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} q.single_task(props_empty, [=]() {}); // When both "props_empty" and "props_void" are in use, we won't see the // static assert firing for the second one, because there will be only one // instantiation of handler::processProperties. q.single_task(props_void, [=]() {}); - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} q.single_task(props_int, [=]() {}); - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} q.single_task(props_user, [=]() {}); return 0; From db1c46d3bc32f9733442b5844a31a2c5cd13b58e Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Tue, 14 Oct 2025 00:16:14 +0200 Subject: [PATCH 2/4] Minor fixes --- .../sycl/detail/kernel_launch_helper.hpp | 30 +++---------- sycl/source/detail/kernel_data.hpp | 3 +- sycl/source/handler.cpp | 43 +++++++++++++++++-- 3 files changed, 47 insertions(+), 29 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index 781a992cef03f..bd203d58850cb 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -256,7 +256,8 @@ struct KernelWrapper< } }; // KernelWrapper struct -// This struct is inherited by sycl::handler. +// This class encapsulates everything related to parsing kernel launch +// properties. class KernelLaunchPropertyWrapper { public: // This struct is used to store kernel launch properties. @@ -264,8 +265,9 @@ class KernelLaunchPropertyWrapper { // In some code paths, kernel launch properties are set multiple times // for the same kernel, that is why using std::optional to avoid overriding // previously set properties. + // This struct is used to pass kernel launch properties across the ABI + // boundary. struct KernelLaunchPropertiesT { - struct ScopeForwardProgressProperty { std::optional Guarantee; @@ -282,28 +284,11 @@ class KernelLaunchPropertyWrapper { std::array MClusterSize = {0, 0, 0}; // Forward progress guarantee properties for work_item, sub_group and - // work_group scopes. - // Indexed by ExecutionScope enum. + // work_group scopes. We need to store them for validation later. std::array MForwardProgressProperties; - - KernelLaunchPropertiesT() = default; - - // TODO: Do you even need this? - KernelLaunchPropertiesT( - ur_kernel_cache_config_t _CacheConfig, bool _IsCooperative, - uint32_t _WorkGroupMemorySize, bool _UsesClusterLaunch, - size_t _ClusterDims, std::array _ClusterSize, - std::array _ForwardProgressProperties) - : MCacheConfig(_CacheConfig), MIsCooperative(_IsCooperative), - MWorkGroupMemorySize(_WorkGroupMemorySize), - MUsesClusterLaunch(_UsesClusterLaunch), MClusterDims(_ClusterDims), - MClusterSize(_ClusterSize), - MForwardProgressProperties(_ForwardProgressProperties) {} - }; // struct KernelLaunchPropertiesT + }; /// Process runtime kernel properties. - /// - /// Stores information about kernel properties into the handler. template static KernelLaunchPropertiesT processKernelLaunchProperties(PropertiesT Props) { @@ -423,9 +408,6 @@ class KernelLaunchPropertyWrapper { } /// Process kernel properties. - /// - /// Stores information about kernel properties into the handler. - /// /// Note: it is important that this function *does not* depend on kernel /// name or kernel type, because then it will be instantiated for every /// kernel, even though body of those instantiated functions could be almost diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index 037341b9c0691..d73229a909fa9 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -8,13 +8,12 @@ #pragma once +#include #include #include #include #include -#include - #include #include diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 2aacd56fda15d..619f8b121aa6d 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1758,9 +1758,46 @@ void handler::verifyDeviceHasProgressGuarantee( sycl::ext::oneapi::experimental::execution_scope threadScope, sycl::ext::oneapi::experimental::execution_scope coordinationScope) { - // FIXME! - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT Kprop; - setKernelLaunchProperties(Kprop); + using execution_scope = sycl::ext::oneapi::experimental::execution_scope; + using forward_progress = + sycl::ext::oneapi::experimental::forward_progress_guarantee; + const bool supported = impl->get_device().supportsForwardProgress( + guarantee, threadScope, coordinationScope); + if (threadScope == execution_scope::work_group) { + if (!supported) { + throw sycl::exception( + sycl::errc::feature_not_supported, + "Required progress guarantee for work groups is not " + "supported by this device."); + } + // If we are here, the device supports the guarantee required but there is a + // caveat in that if the guarantee required is a concurrent guarantee, then + // we most likely also need to enable cooperative launch of the kernel. That + // is, although the device supports the required guarantee, some setup work + // is needed to truly make the device provide that guarantee at runtime. + // Otherwise, we will get the default guarantee which is weaker than + // concurrent. Same reasoning applies for sub_group but not for work_item. + // TODO: Further design work is probably needed to reflect this behavior in + // Unified Runtime. + if (guarantee == forward_progress::concurrent) + setKernelIsCooperative(true); + } else if (threadScope == execution_scope::sub_group) { + if (!supported) { + throw sycl::exception(sycl::errc::feature_not_supported, + "Required progress guarantee for sub groups is not " + "supported by this device."); + } + // Same reasoning as above. + if (guarantee == forward_progress::concurrent) + setKernelIsCooperative(true); + } else { // threadScope is execution_scope::work_item otherwise undefined + // behavior + if (!supported) { + throw sycl::exception(sycl::errc::feature_not_supported, + "Required progress guarantee for work items is not " + "supported by this device."); + } + } } #endif From 5effee7d26b96e95faef14487630fff65e17b626 Mon Sep 17 00:00:00 2001 From: Udit Kumar Agarwal Date: Mon, 13 Oct 2025 15:17:44 -0700 Subject: [PATCH 3/4] Update sycl/include/sycl/queue.hpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- sycl/include/sycl/queue.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 6553fda0ed709..c61706cfeab4b 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -193,7 +193,7 @@ auto submit_kernel_direct( // parallel_for (deprecated API). // ExtraProps are properties passed explicitly or via launch_config. - // Asumption: If user specify properties via launch_config or explicitly + // Assumption: If user specify properties via launch_config or explicitly // then we don't check for properties specified via get() method. detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT parsedProps; if constexpr (std::is_same_v Date: Wed, 15 Oct 2025 01:11:32 +0200 Subject: [PATCH 4/4] Address feedback --- .../sycl/detail/kernel_launch_helper.hpp | 330 ++++++++---------- .../oneapi/experimental/enqueue_functions.hpp | 8 +- sycl/include/sycl/handler.hpp | 54 ++- sycl/include/sycl/queue.hpp | 14 +- sycl/source/detail/kernel_data.hpp | 83 +++-- sycl/source/detail/queue_impl.cpp | 2 +- sycl/source/detail/queue_impl.hpp | 6 +- sycl/source/handler.cpp | 2 +- sycl/source/queue.cpp | 16 +- 9 files changed, 247 insertions(+), 268 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index bd203d58850cb..05b76bd0e1b5f 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -256,203 +256,179 @@ struct KernelWrapper< } }; // KernelWrapper struct -// This class encapsulates everything related to parsing kernel launch +// This namespace encapsulates everything related to parsing kernel launch // properties. -class KernelLaunchPropertyWrapper { -public: - // This struct is used to store kernel launch properties. - // std::optional is used to indicate that the property is not set. - // In some code paths, kernel launch properties are set multiple times - // for the same kernel, that is why using std::optional to avoid overriding - // previously set properties. - // This struct is used to pass kernel launch properties across the ABI - // boundary. - struct KernelLaunchPropertiesT { - struct ScopeForwardProgressProperty { - std::optional - Guarantee; - std::optional ExecScope; - std::optional - CoordinationScope; - }; - - std::optional MCacheConfig = std::nullopt; - std::optional MIsCooperative = std::nullopt; - std::optional MWorkGroupMemorySize = std::nullopt; - std::optional MUsesClusterLaunch = std::nullopt; - size_t MClusterDims = 0; - std::array MClusterSize = {0, 0, 0}; +inline namespace kernel_launch_properties_v1 { + +// This struct is used to store kernel launch properties. +// std::optional is used to indicate that the property is not set. +// This struct is used to pass kernel launch properties across the ABI +// boundary. +struct KernelLaunchPropertiesTy { + // Modeled after ur_kernel_cache_config_t + enum class StableKernelCacheConfig : int32_t { + Default = 0, + LargeSLM = 1, + LargeData = 2 + }; - // Forward progress guarantee properties for work_item, sub_group and - // work_group scopes. We need to store them for validation later. - std::array MForwardProgressProperties; + struct ScopeForwardProgressProperty { + std::optional + Guarantee; + std::optional ExecScope; + std::optional + CoordinationScope; }; - /// Process runtime kernel properties. - template - static KernelLaunchPropertiesT - processKernelLaunchProperties(PropertiesT Props) { - using namespace sycl::ext::oneapi::experimental; - using namespace sycl::ext::oneapi::experimental::detail; - KernelLaunchPropertiesT retval; - - // Process Kernel cache configuration property. - { - if constexpr (PropertiesT::template has_property< - sycl::ext::intel::experimental::cache_config_key>()) { - auto Config = Props.template get_property< - sycl::ext::intel::experimental::cache_config_key>(); - if (Config == sycl::ext::intel::experimental::large_slm) { - retval.MCacheConfig = UR_KERNEL_CACHE_CONFIG_LARGE_SLM; - } else if (Config == sycl::ext::intel::experimental::large_data) { - retval.MCacheConfig = UR_KERNEL_CACHE_CONFIG_LARGE_DATA; - } - } else { - std::ignore = Props; - } - } + std::optional MCacheConfig = std::nullopt; + std::optional MIsCooperative = std::nullopt; + std::optional MWorkGroupMemorySize = std::nullopt; + std::optional MUsesClusterLaunch = std::nullopt; + size_t MClusterDims = 0; + std::array MClusterSize = {0, 0, 0}; - // Process Kernel cooperative property. - { - if constexpr (PropertiesT::template has_property()) - retval.MIsCooperative = true; - } + // Forward progress guarantee properties for work_item, sub_group and + // work_group scopes. We need to store them for validation later. + std::array MForwardProgressProperties; +}; - // Process device progress properties. - { - using forward_progress = - sycl::ext::oneapi::experimental::forward_progress_guarantee; - if constexpr (PropertiesT::template has_property< - work_group_progress_key>()) { - auto prop = Props.template get_property(); - retval.MForwardProgressProperties[0].Guarantee = prop.guarantee; - retval.MForwardProgressProperties[0].ExecScope = - execution_scope::work_group; - retval.MForwardProgressProperties[0].CoordinationScope = - prop.coordinationScope; - - // If we are here, the device supports the guarantee required but there - // is a caveat in that if the guarantee required is a concurrent - // guarantee, then we most likely also need to enable cooperative launch - // of the kernel. That is, although the device supports the required - // guarantee, some setup work is needed to truly make the device provide - // that guarantee at runtime. Otherwise, we will get the default - // guarantee which is weaker than concurrent. Same reasoning applies for - // sub_group but not for work_item. - // TODO: Further design work is probably needed to reflect this behavior - // in Unified Runtime. - if constexpr (prop.guarantee == forward_progress::concurrent) - retval.MIsCooperative = true; - } - if constexpr (PropertiesT::template has_property< - sub_group_progress_key>()) { - auto prop = Props.template get_property(); - retval.MForwardProgressProperties[1].Guarantee = prop.guarantee; - retval.MForwardProgressProperties[1].ExecScope = - execution_scope::sub_group; - retval.MForwardProgressProperties[1].CoordinationScope = - prop.coordinationScope; - - // Same reasoning as above for work_group applies here. - if constexpr (prop.guarantee == forward_progress::concurrent) - retval.MIsCooperative = true; - } - if constexpr (PropertiesT::template has_property< - work_item_progress_key>()) { - auto prop = Props.template get_property(); - retval.MForwardProgressProperties[2].Guarantee = prop.guarantee; - retval.MForwardProgressProperties[2].ExecScope = - execution_scope::work_item; - retval.MForwardProgressProperties[2].CoordinationScope = - prop.coordinationScope; +template +constexpr KernelLaunchPropertiesTy +processKernelLaunchProperties(PropertiesT Props) { + using namespace sycl::ext::oneapi::experimental; + using namespace sycl::ext::oneapi::experimental::detail; + KernelLaunchPropertiesTy retval; + + // Process Kernel cache configuration property. + { + if constexpr (PropertiesT::template has_property< + sycl::ext::intel::experimental::cache_config_key>()) { + auto Config = Props.template get_property< + sycl::ext::intel::experimental::cache_config_key>(); + if (Config == sycl::ext::intel::experimental::large_slm) { + retval.MCacheConfig = + KernelLaunchPropertiesTy::StableKernelCacheConfig::LargeSLM; + } else if (Config == sycl::ext::intel::experimental::large_data) { + retval.MCacheConfig = + KernelLaunchPropertiesTy::StableKernelCacheConfig::LargeData; } + } else { + std::ignore = Props; } + } - // Process work group scratch memory property. - { - if constexpr (PropertiesT::template has_property< - work_group_scratch_size>()) { - auto WorkGroupMemSize = - Props.template get_property(); - retval.MWorkGroupMemorySize = WorkGroupMemSize.size; - } - } + // Process Kernel cooperative property. + { + if constexpr (PropertiesT::template has_property()) + retval.MIsCooperative = true; + } - // Parse cluster properties. - { - constexpr std::size_t ClusterDim = getClusterDim(); - if constexpr (ClusterDim > 0) { - - auto ClusterSize = - Props.template get_property>() - .get_cluster_size(); - retval.MUsesClusterLaunch = true; - retval.MClusterDims = ClusterDim; - if (ClusterDim == 1) { - retval.MClusterSize[0] = ClusterSize[0]; - } else if (ClusterDim == 2) { - retval.MClusterSize[0] = ClusterSize[0]; - retval.MClusterSize[1] = ClusterSize[1]; - } else if (ClusterDim == 3) { - retval.MClusterSize[0] = ClusterSize[0]; - retval.MClusterSize[1] = ClusterSize[1]; - retval.MClusterSize[2] = ClusterSize[2]; - } else { - assert(ClusterDim <= 3 && - "Only 1D, 2D, and 3D cluster launch is supported."); - } - } + // Process device progress properties. + { + if constexpr (PropertiesT::template has_property< + work_group_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[0].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[0].ExecScope = + execution_scope::work_group; + retval.MForwardProgressProperties[0].CoordinationScope = + prop.coordinationScope; } + if constexpr (PropertiesT::template has_property< + sub_group_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[1].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[1].ExecScope = + execution_scope::sub_group; + retval.MForwardProgressProperties[1].CoordinationScope = + prop.coordinationScope; + } + if constexpr (PropertiesT::template has_property< + work_item_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[2].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[2].ExecScope = + execution_scope::work_item; + retval.MForwardProgressProperties[2].CoordinationScope = + prop.coordinationScope; + } + } - return retval; + // Process work group scratch memory property. + { + if constexpr (PropertiesT::template has_property< + work_group_scratch_size>()) { + auto WorkGroupMemSize = + Props.template get_property(); + retval.MWorkGroupMemorySize = WorkGroupMemSize.size; + } } - /// Process kernel properties. - /// Note: it is important that this function *does not* depend on kernel - /// name or kernel type, because then it will be instantiated for every - /// kernel, even though body of those instantiated functions could be almost - /// the same, thus unnecessary increasing compilation time. - template < - bool IsESIMDKernel, - typename PropertiesT = ext::oneapi::experimental::empty_properties_t> - static KernelLaunchPropertiesT processKernelProperties(PropertiesT Props) { - static_assert( - ext::oneapi::experimental::is_property_list::value, - "Template type is not a property list."); - static_assert( - !PropertiesT::template has_property< - sycl::ext::intel::experimental::fp_control_key>() || - (PropertiesT::template has_property< - sycl::ext::intel::experimental::fp_control_key>() && - IsESIMDKernel), - "Floating point control property is supported for ESIMD kernels only."); - static_assert( - !PropertiesT::template has_property< - sycl::ext::oneapi::experimental::indirectly_callable_key>(), - "indirectly_callable property cannot be applied to SYCL kernels"); + // Parse cluster properties. + { + constexpr std::size_t ClusterDim = getClusterDim(); + if constexpr (ClusterDim > 0) { + static_assert(ClusterDim <= 3, + "Only 1D, 2D, and 3D cluster launch is supported."); + + auto ClusterSize = + Props.template get_property>() + .get_cluster_size(); - return processKernelLaunchProperties(Props); + retval.MUsesClusterLaunch = true; + retval.MClusterDims = ClusterDim; + + for (size_t dim = 0; dim < ClusterDim; dim++) + retval.MClusterSize[dim] = ClusterSize[dim]; + } } - // Returns KernelLaunchPropertiesT or std::nullopt based on whether the - // kernel functor has a get method that returns properties. - template - static std::optional - parseProperties([[maybe_unused]] const KernelType &KernelFunc) { + return retval; +} + +/// Note: it is important that this function *does not* depend on kernel +/// name or kernel type, because then it will be instantiated for every +/// kernel, even though body of those instantiated functions could be almost +/// the same, thus unnecessary increasing compilation time. +template +constexpr KernelLaunchPropertiesTy processKernelProperties(PropertiesT Props) { + static_assert(ext::oneapi::experimental::is_property_list::value, + "Template type is not a property list."); + static_assert( + !PropertiesT::template has_property< + sycl::ext::intel::experimental::fp_control_key>() || + (PropertiesT::template has_property< + sycl::ext::intel::experimental::fp_control_key>() && + IsESIMDKernel), + "Floating point control property is supported for ESIMD kernels only."); + static_assert( + !PropertiesT::template has_property< + sycl::ext::oneapi::experimental::indirectly_callable_key>(), + "indirectly_callable property cannot be applied to SYCL kernels"); + + return processKernelLaunchProperties(Props); +} + +// Returns KernelLaunchPropertiesTy or std::nullopt based on whether the +// kernel functor has a get method that returns properties. +template +constexpr std::optional +parseProperties([[maybe_unused]] const KernelType &KernelFunc) { #ifndef __SYCL_DEVICE_ONLY__ - // If there are properties provided by get method then process them. - if constexpr (ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod::value) { + // If there are properties provided by get method then process them. + if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< + const KernelType &>::value) { - return processKernelProperties( - KernelFunc.get(ext::oneapi::experimental::properties_tag{})); - } -#endif - // If there are no properties provided by get method then return empty - // optional. - return std::nullopt; + return processKernelProperties( + KernelFunc.get(ext::oneapi::experimental::properties_tag{})); } -}; // KernelLaunchPropertyWrapper struct +#endif + // If there are no properties provided by get method then return empty + // optional. + return std::nullopt; +} +} // namespace kernel_launch_properties_v1 } // namespace detail } // namespace _V1 diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 49aa3a252682e..9aaf817d99973 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -299,11 +299,11 @@ void nd_launch(queue Q, launch_config, Properties> Config, ext::oneapi::experimental::detail::LaunchConfigAccess, Properties> - ConfigAccess(Config); + LaunchConfigAccess(Config); - detail::submit_kernel_direct(std::move(Q), - ConfigAccess.getRange(), KernelObj, - Config.getProperties()); + detail::submit_kernel_direct( + std::move(Q), LaunchConfigAccess.getRange(), KernelObj, + LaunchConfigAccess.getProperties()); } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Config, KernelObj, diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 78d58c80d6b4f..30629f0c5589e 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -902,9 +902,8 @@ class __SYCL_EXPORT handler { /// Stores information about kernel properties into the handler. template void processLaunchProperties(PropertiesT Props) { - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = - detail::KernelLaunchPropertyWrapper::processKernelLaunchProperties< - PropertiesT>(Props); + detail::KernelLaunchPropertiesTy ParsedProp = + detail::processKernelLaunchProperties(Props); setKernelLaunchProperties(ParsedProp); } @@ -920,9 +919,8 @@ class __SYCL_EXPORT handler { bool IsESIMDKernel, typename PropertiesT = ext::oneapi::experimental::empty_properties_t> void processProperties(PropertiesT Props) { - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = - detail::KernelLaunchPropertyWrapper::processKernelProperties< - IsESIMDKernel>(Props); + detail::KernelLaunchPropertiesTy ParsedProp = + detail::processKernelProperties(Props); setKernelLaunchProperties(ParsedProp); } #endif // INTEL_PREVIEW_BREAKING_CHANGES @@ -1233,8 +1231,7 @@ class __SYCL_EXPORT handler { decltype(Wrapper), TransformedArgType, PropertiesT>::wrap(Wrapper); - if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties< - KName, Info.IsESIMD>(Wrapper)) { + if (auto prop = detail::parseProperties(Wrapper)) { setKernelLaunchProperties(*prop); } @@ -1261,16 +1258,14 @@ class __SYCL_EXPORT handler { // kernel is generated detail::KernelWrapper::wrap(KernelFunc); - if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties< - NameT, Info.IsESIMD>(KernelFunc)) { + if (auto prop = + detail::parseProperties(KernelFunc)) { setKernelLaunchProperties(*prop); } #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT - ProcessedProps = - detail::KernelLaunchPropertyWrapper::processKernelProperties< - Info.IsESIMD, PropertiesT>(Props); + detail::KernelLaunchPropertiesTy ProcessedProps = + detail::processKernelProperties(Props); setKernelLaunchProperties(ProcessedProps); detail::checkValueRange(UserRange); setNDRangeDescriptor(std::move(UserRange)); @@ -1300,9 +1295,8 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NumWorkItems); setNDRangeDescriptor(std::move(NumWorkItems)); - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = - detail::KernelLaunchPropertyWrapper::processKernelLaunchProperties< - PropertiesT>(Props); + detail::KernelLaunchPropertiesTy ParsedProp = + detail::processKernelLaunchProperties(Props); setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); #endif @@ -1326,9 +1320,8 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NDRange); setNDRangeDescriptor(std::move(NDRange)); - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = - detail::KernelLaunchPropertyWrapper::processKernelLaunchProperties< - PropertiesT>(Props); + detail::KernelLaunchPropertiesTy ParsedProp = + detail::processKernelLaunchProperties(Props); setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); #endif @@ -1349,8 +1342,7 @@ class __SYCL_EXPORT handler { constexpr auto Info = detail::CompileTimeKernelInfo; detail::KernelWrapper::wrap(KernelFunc); - if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties< - NameT, Info.IsESIMD>(KernelFunc)) { + if (auto prop = detail::parseProperties(KernelFunc)) { setKernelLaunchProperties(*prop); } #ifndef __SYCL_DEVICE_ONLY__ @@ -1369,10 +1361,8 @@ class __SYCL_EXPORT handler { } StoreLambda(std::move(KernelFunc)); - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT - ProcessedProps = - detail::KernelLaunchPropertyWrapper::processKernelProperties< - Info.IsESIMD, PropertiesT>(Props); + detail::KernelLaunchPropertiesTy ProcessedProps = + detail::processKernelProperties(Props); setKernelLaunchProperties(ProcessedProps); #endif } @@ -1396,8 +1386,7 @@ class __SYCL_EXPORT handler { (void)Kernel; detail::KernelWrapper::wrap(KernelFunc); - if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties( - KernelFunc)) { + if (auto prop = detail::parseProperties(KernelFunc)) { setKernelLaunchProperties(*prop); } #ifndef __SYCL_DEVICE_ONLY__ @@ -1426,10 +1415,8 @@ class __SYCL_EXPORT handler { "the kernel name must match the name of the lambda"); } StoreLambda(std::move(KernelFunc)); - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT - ProcessedProps = - detail::KernelLaunchPropertyWrapper::processKernelProperties< - Info.IsESIMD, PropertiesT>(Props); + detail::KernelLaunchPropertiesTy ProcessedProps = + detail::processKernelProperties(Props); setKernelLaunchProperties(ProcessedProps); #endif } @@ -3478,8 +3465,7 @@ class __SYCL_EXPORT handler { #endif void setKernelLaunchProperties( - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT - &KernelLaunchProperties); + const detail::KernelLaunchPropertiesTy &KernelLaunchProperties); // Various checks that are only meaningful for host compilation, because they // result in runtime errors (i.e. exceptions being thrown). To save time diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index c61706cfeab4b..b6da07ed9e9a5 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -67,7 +67,7 @@ event __SYCL_EXPORT submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -75,7 +75,7 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); namespace detail { @@ -195,22 +195,18 @@ auto submit_kernel_direct( // Assumption: If user specify properties via launch_config or explicitly // then we don't check for properties specified via get() method. - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT parsedProps; + KernelLaunchPropertiesTy parsedProps; if constexpr (std::is_same_v) { // Use properties passed via. get() method. if constexpr (ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod::value) { auto prop = KernelFunc.get(ext::oneapi::experimental::properties_tag{}); - parsedProps = - detail::KernelLaunchPropertyWrapper::processKernelProperties( - prop); + parsedProps = detail::processKernelProperties(prop); } } else { // Use ExtraProps - parsedProps = - detail::KernelLaunchPropertyWrapper::processKernelProperties( - ExtraProps); + parsedProps = detail::processKernelProperties(ExtraProps); } // Instantiating the kernel on the host improves debugging. diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index d73229a909fa9..f78845e223057 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -168,49 +168,70 @@ class KernelData { } void validateAndSetKernelLaunchProperties( - const KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Kprop, - bool HasGraph, const device_impl &dev) { + const detail::KernelLaunchPropertiesTy &Kprop, bool HasGraph, + const device_impl &dev) { + using execScope = ext::oneapi::experimental::execution_scope; // Validate properties before setting. - { - if (HasGraph) { - if (Kprop.MWorkGroupMemorySize) { - throw sycl::exception( - sycl::make_error_code(errc::invalid), - "Setting work group scratch memory size is not yet supported " - "for use with the SYCL Graph extension."); - } + if (HasGraph) { + if (Kprop.MWorkGroupMemorySize) { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "Setting work group scratch memory size is not yet supported " + "for use with the SYCL Graph extension."); + } - if (Kprop.MUsesClusterLaunch && *Kprop.MUsesClusterLaunch) { - throw sycl::exception(sycl::make_error_code(errc::invalid), - "Cluster launch is not yet supported " - "for use with the SYCL Graph extension."); - } + if (Kprop.MUsesClusterLaunch && *Kprop.MUsesClusterLaunch) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Cluster launch is not yet supported " + "for use with the SYCL Graph extension."); } + } - for (int i = 0; i < 3; i++) { - if (Kprop.MForwardProgressProperties[i].Guarantee.has_value()) { - - if (!dev.supportsForwardProgress( - *Kprop.MForwardProgressProperties[i].Guarantee, - *Kprop.MForwardProgressProperties[i].ExecScope, - *Kprop.MForwardProgressProperties[i].CoordinationScope)) { - // TODO: Make the error message more descriptive. - throw sycl::exception( - sycl::make_error_code(errc::feature_not_supported), - "The device associated with the queue does not support the " - "requested forward progress guarantee."); - } + // Validate and set forward progress guarantees. + for (int i = 0; i < 3; i++) { + if (Kprop.MForwardProgressProperties[i].Guarantee.has_value()) { + + if (!dev.supportsForwardProgress( + *Kprop.MForwardProgressProperties[i].Guarantee, + *Kprop.MForwardProgressProperties[i].ExecScope, + *Kprop.MForwardProgressProperties[i].CoordinationScope)) { + throw sycl::exception( + sycl::make_error_code(errc::feature_not_supported), + "The device associated with the queue does not support the " + "requested forward progress guarantee."); + } + + auto execScope = *Kprop.MForwardProgressProperties[i].ExecScope; + // If we are here, the device supports the guarantee required but there + // is a caveat in that if the guarantee required is a concurrent + // guarantee, then we most likely also need to enable cooperative launch + // of the kernel. That is, although the device supports the required + // guarantee, some setup work is needed to truly make the device provide + // that guarantee at runtime. Otherwise, we will get the default + // guarantee which is weaker than concurrent. Same reasoning applies for + // sub_group but not for work_item. + // TODO: Further design work is probably needed to reflect this behavior + // in Unified Runtime. + if ((execScope == execScope::work_group || + execScope == execScope::sub_group) && + (*Kprop.MForwardProgressProperties[i].Guarantee == + ext::oneapi::experimental::forward_progress_guarantee:: + concurrent)) { + setCooperative(true); } } } - // Set properties. if (Kprop.MIsCooperative) setCooperative(*Kprop.MIsCooperative); - if (Kprop.MCacheConfig) - setKernelCacheConfig(*Kprop.MCacheConfig); + if (Kprop.MCacheConfig) { + // KernelLaunchPropertiesTy::StableKernelCacheConfig is modeled after + // ur_kernel_cache_config_t, so this cast is safe. + setKernelCacheConfig( + static_cast(*Kprop.MCacheConfig)); + } if (Kprop.MWorkGroupMemorySize) setKernelWorkGroupMemorySize(*Kprop.MWorkGroupMemorySize); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 621045c567834..2881ff60583da 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -567,7 +567,7 @@ EventImplPtr queue_impl::submit_command_to_graph( EventImplPtr queue_impl::submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 4528752ba95f4..f57297d9f3b35 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -363,7 +363,7 @@ class queue_impl : public std::enable_shared_from_this { event submit_kernel_direct_with_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, @@ -375,7 +375,7 @@ class queue_impl : public std::enable_shared_from_this { void submit_kernel_direct_without_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, false, Props, CodeLoc, IsTopCodeLoc); @@ -931,7 +931,7 @@ class queue_impl : public std::enable_shared_from_this { EventImplPtr submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 619f8b121aa6d..07ec95f8ed39b 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1914,7 +1914,7 @@ void handler::memcpyFromHostOnlyDeviceGlobal(void *Dest, } void handler::setKernelLaunchProperties( - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Kprop) { + const detail::KernelLaunchPropertiesTy &Kprop) { impl->MKernelData.validateAndSetKernelLaunchProperties( Kprop, getCommandGraph() != nullptr /*hasGraph?*/, impl->get_device() /*device_impl*/); diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 73a53a01abda2..c97798b80bdba 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -476,7 +476,7 @@ event submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { return getSyclObjImpl(Queue)->submit_kernel_direct_with_event( Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); @@ -486,21 +486,21 @@ template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -508,7 +508,7 @@ void submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { getSyclObjImpl(Queue)->submit_kernel_direct_without_event( Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); @@ -518,21 +518,21 @@ template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); } // namespace _V1