From e664798e5785ba1752e83a4c4d135e35d9ce8acc Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 29 Jul 2024 20:06:07 +0100 Subject: [PATCH 01/33] [SYCL] Fix use of removed ArchType enum (#14833) Two concurrent PRs added a new use of and simultaneously removed this enum. Commit 63c61d85 added a new use, while dc37699b was trying to delete it. --- .../passes/kernel-fusion/SYCLSpecConstMaterializer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp index b1f54ee21b78d..3637930d72f8f 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp +++ b/sycl-fusion/passes/kernel-fusion/SYCLSpecConstMaterializer.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #define DEBUG_TYPE "sycl-spec-const-materializer" @@ -298,9 +299,8 @@ PreservedAnalyses SYCLSpecConstMaterializer::run(Function &F, // Invariant: This pass is only intended to operate on SYCL kernels being // compiled to either `nvptx{,64}-nvidia-cuda`, or `amdgcn-amd-amdhsa` // triples. - auto AT = TargetHelpers::getArchType(*Mod); - if (TargetHelpers::ArchType::Cuda != AT && - TargetHelpers::ArchType::AMDHSA != AT) { + Triple T(Mod->getTargetTriple()); + if (!T.isNVPTX() && !T.isAMDGCN()) { LLVM_DEBUG(dbgs() << "Unsupported architecture\n"); return PreservedAnalyses::all(); } From 599fcd01696a14ff5068780cc8fafafcee9b729f Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Tue, 30 Jul 2024 09:03:29 +0100 Subject: [PATCH 02/33] [SYCL][COMPAT] New launch API to enable passing kernel & launch properties (#14441) This PR defines a new user-facing struct `launch_strategy`, and two new `launch` overloads (currently in `syclcompat::experimental`) which accept a `launch_strategy`. ## Extensions & Properties This work builds on top of the [kernel_properties](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_properties.asciidoc) and [enqueue_functions](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc) extensions. The latter defines APIs for passing `launch_properties` as part of a `launch_config` object. These are the `parallel_for` and `nd_launch` overloads used by the new `launch`. See the [note](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc#launch-configuration) in the Launch configuration section which describes how `kernel_properties` must be passed via a `get(properties_tag)` method of a kernel functor. ## Local Memory Note also that in order to properly handle local memory, we **must** construct the `KernelFunctor` object within the `cgh` lambda, passing in a `local_accessor` to the constructor. Then within `KernelFunctor::operator()` (the SYCL 'kernel') we can at last grab the local memory pointer with `local_acc.get_multi_ptr()`, since CUDA-style device functions expect to receive their dynamic local memory as a `char *`. --------- Signed-off-by: Joe Todd --- sycl/doc/syclcompat/README.md | 236 +++++----- sycl/include/syclcompat/launch.hpp | 133 ++---- .../syclcompat/launch_experimental.hpp | 105 ----- sycl/include/syclcompat/launch_policy.hpp | 254 +++++++++++ sycl/include/syclcompat/syclcompat.hpp | 1 - sycl/include/syclcompat/traits.hpp | 209 +++++++++ .../syclcompat/launch/kernel_properties.cpp | 64 +++ sycl/test-e2e/syclcompat/launch/launch.cpp | 410 +----------------- .../syclcompat/launch/launch_policy.cpp | 359 +++++++++++++++ .../syclcompat/launch/launch_policy_lmem.cpp | 275 ++++++++++++ .../launch/launch_policy_lmem_neg.cpp | 60 +++ .../syclcompat/launch/launch_policy_neg.cpp | 191 ++++++++ .../syclcompat/launch/launch_properties.cpp | 106 +++++ 13 files changed, 1695 insertions(+), 708 deletions(-) delete mode 100644 sycl/include/syclcompat/launch_experimental.hpp create mode 100644 sycl/include/syclcompat/launch_policy.hpp create mode 100644 sycl/test-e2e/syclcompat/launch/kernel_properties.cpp create mode 100644 sycl/test-e2e/syclcompat/launch/launch_policy.cpp create mode 100644 sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp create mode 100644 sycl/test-e2e/syclcompat/launch/launch_policy_lmem_neg.cpp create mode 100644 sycl/test-e2e/syclcompat/launch/launch_policy_neg.cpp create mode 100644 sycl/test-e2e/syclcompat/launch/launch_properties.cpp diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md index 127df2d17cac9..6dd8708afeb62 100644 --- a/sycl/doc/syclcompat/README.md +++ b/sycl/doc/syclcompat/README.md @@ -42,7 +42,14 @@ Specifically, this library depends on the following SYCL extensions: ../extensions/supported/sycl_ext_oneapi_assert.asciidoc) * [sycl_ext_oneapi_enqueue_barrier]( ../extensions/supported/sycl_ext_oneapi_enqueue_barrier.asciidoc) -* [sycl_ext_oneapi_usm_device_read_only](../extensions/supported/sycl_ext_oneapi_usm_device_read_only.asciidoc) +* [sycl_ext_oneapi_usm_device_read_only]( + ../extensions/supported/sycl_ext_oneapi_usm_device_read_only.asciidoc) +* [sycl_ext_oneapi_properties]( + ../extensions/experimental/sycl_ext_oneapi_properties.asciidoc) +* [sycl_ext_oneapi_enqueue_functions]( + ../extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc) +* [sycl_ext_oneapi_kernel_properties]( + ../extensions/experimental/sycl_ext_oneapi_kernel_properties.asciidoc) If available, the following extensions extend SYCLcompat functionality: @@ -206,44 +213,6 @@ These translate any kernel dimensions from one convention to the other. An example of an equivalent SYCL call for a 3D kernel using `compat` is `syclcompat::global_id::x() == get_global_id(2)`. -### Local Memory - -When using `compat` functions, there are two distinct interfaces to allocate -device local memory. The first interface uses the _sycl_ext_oneapi_local_memory_ -extension to leverage local memory defined at compile time. -_sycl_ext_oneapi_local_memory_ is accessed through the following wrapper: - -``` c++ -namespace syclcompat { - -template auto *local_mem(); - -} // syclcompat -``` - -`syclcompat::local_mem()` can be used as illustrated in the example -below. - -```c++ -// Sample kernel -using namespace syclcompat; -template -void local_mem_2d(int *d_A) { - // Local memory extension wrapper, size defined at compile-time - auto As = local_mem(); - int id_x = local_id::x(); - int id_y = local_id::y(); - As[id_y][id_x] = id_x * BLOCK_SIZE + id_y; - wg_barrier(); - int val = As[BLOCK_SIZE - id_y - 1][BLOCK_SIZE - id_x - 1]; - d_A[global_id::y() * BLOCK_SIZE + global_id::x()] = val; -} -``` - -The second interface allows users to allocate device local memory at runtime. -SYCLcompat provides this functionality through its kernel launch interface, -`launch`, defined in the following section. - ### launch SYCLcompat provides a kernel `launch` interface which accepts a function that @@ -254,7 +223,7 @@ device _function_ with the use of an `auto F` template parameter, and a variadic `Args` for the function's arguments. Various overloads for `launch` exist to permit the user to launch on a -specific `queue`, or to define dynamically sized device local memory. +specific `queue`, or to describe the range as either `nd_range` or `dim3, dim3`. ``` c++ namespace syclcompat { @@ -273,22 +242,6 @@ template sycl::event launch(const dim3 &grid, const dim3 &threads, sycl::queue q, Args... args); -template -sycl::event launch(const sycl::nd_range &range, size_t mem_size, - sycl::queue q, Args... args); - -template -sycl::event launch(const sycl::nd_range &range, size_t mem_size, - Args... args); - -template -sycl::event launch(const dim3 &grid, const dim3 &threads, - size_t mem_size, sycl::queue q, Args... args); - -template -sycl::event launch(const dim3 &grid, const dim3 &threads, - size_t mem_size, Args... args); - } // syclcompat ``` @@ -313,67 +266,156 @@ auto range = sycl::nd_range<3>{blocksPerGrid * threadsPerBlock, syclcompat::launch(range, d_A, d_B, d_C, n); ``` -For dynamic local memory allocation, `launch` injects a pointer to a -local `char *` accessor of `mem_size` as the last argument of the kernel -function. For example, the previous function named `vectorAdd` can be modified -with the following signature, which adds a `char *` pointer to access local -memory inside the kernel: +Note that since `syclcompat::launch` accepts a device function, the kernel +lambda is constructed by SYCLcompat internally. This means that, for +example, `sycl::local_accessor`s cannot be declared. Instead, users wishing to +use local memory should launch with a `launch_policy` object as described below. -``` c++ -void vectorAdd(const float *A, const float *B, float *C, int n, - char *local_mem); +#### launch_policy + +In addition to the simple `syclcompat::launch` interface described above, +SYCLcompat provides a more flexible (`experimental`) interface to `launch` a +kernel with a given `launch_policy`. By constructing and passing a +`launch_policy`, users can pass `sycl::ext::oneapi::experimental::properties` +associated with the kernel or launch, as well as request **local memory** for +the kernel. + +In order to disambiguate the variadic constructor of `launch_policy`, the +following wrapper structs are defined. The `kernel_properties` and +`launch_properties` wrappers can be constructed *either* with a variadc set of +properties, or with an existing `sycl_exp::properties` object. + +```cpp +namespace syclcompat::experimental { +namespace sycl_exp = sycl::ext::oneapi::experimental; + +// Wrapper for kernel sycl_exp::properties +template struct kernel_properties { + using Props = Properties; + template + kernel_properties(Props... properties); + template + kernel_properties(sycl_exp::properties properties) + Properties props; +}; + +// Wrapper for launch sycl_exp::properties +template struct launch_properties { + using Props = Properties; + template + launch_properties(Props... properties); + template + launch_properties(sycl_exp::properties properties) + Properties props; +}; + +// Wrapper for local memory size +struct local_mem_size { + local_mem_size(size_t size = 0); + size_t size; +}; + +} //namespace syclcompat::experimental ``` -Then, `vectorAdd` can be launched like this: +The constructors of `launch_policy` are variadic, accepting any form of range +(`nd_range`, `range`, `dim3`, `dim3, dim3`), followed by zero or more of +`local_memory_size`, `kernel_properties`, and `launch_properties`: ``` c++ -syclcompat::launch(blocksPerGrid, threadsPerBlock, mem_size, d_A, - d_B, d_C, n); +namespace syclcompat::experimental { +namespace sycl_exp = sycl::ext::oneapi::experimental; + +// launch_policy is constructed by the user & passed to `compat_exp::launch` +template +class launch_policy { +public: + using KPropsT = KProps; + using LPropsT = LProps; + using RangeT = Range; + static constexpr bool HasLocalMem = LocalMem; + + template + launch_policy(Range range, Ts... ts); + + template + launch_policy(dim3 global_range, Ts... ts); + + template + launch_policy(dim3 global_range, dim3 local_range, Ts... ts); + + KProps get_kernel_properties(); + LProps get_launch_properties(); + size_t get_local_mem_size(); + Range get_range(); +}; +} //namespace syclcompat::experimental ``` -or this: +The `launch` overloads accepting a `launch_policy` are: + +```cpp +namespace syclcompat::experimental { + +template +sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args); + +template +sycl::event launch(LaunchPolicy launch_policy, Args... args); +} //namespace syclcompat::experimental -``` c++ -auto range = sycl::nd_range<3>{globalSize, localSize}; -syclcompat::launch(range, mem_size, d_A, d_B, d_C, n); ``` -This `launch` interface allows users to define an internal memory pool, or -scratchpad, that can then be reinterpreted as the datatype required by the user -within the kernel function. +For local memory, `launch` injects a `char *` pointer to the beginning +of a local accessor of the requested `local_mem_size` as the last argument of +the kernel function. This `char *` can then be reinterpreted as the datatype +required by the user within the kernel function. -To launch a kernel with a specified sub-group size, overloads similar to above -`launch` functions are present in the `syclcompat::experimental` namespace, -which accept SubgroupSize as a template parameter and can be called as -`launch` +For example, the previous function named `vectorAdd` can be modified +with the following signature, which adds a `char *` pointer to access local +memory inside the kernel: -```cpp +``` c++ +void vectorAdd(const float *A, const float *B, float *C, int n, + char *local_mem); +``` + +Then, the new `vectorAdd` can be launched like this: + +``` c++ +using syclcompat::experimental; +launch_policy policy{blocksPerGrid, threadsPerBlock, + local_mem_size(nbytes)}; +launch(policy, d_A, d_B, d_C, n); +``` -template -sycl::event launch(sycl::nd_range<3> launch_range, std::size_t local_memory_size, - sycl::queue queue, Args... args); +To request a different cache/local memory split on supported hardware: -template -sycl::event launch(sycl::nd_range launch_range, std::size_t local_memory_size, - Args... args); +```c++ +using syclcompat::experimental; +namespace sycl_intel_exp = sycl::ext::intel::experimental; -template -sycl::event launch(::syclcompat::dim3 grid_dim, ::syclcompat::dim3 block_dim, - std::size_t local_memory_size, Args... args); +sycl_intel_exp::cache_config cache_config{ + sycl_intel_exp::large_slm}; +kernel_properties kernel_props{cache_config}; +launch_policy policy{blocksPerGrid, threadsPerBlock, + local_mem_size(nbytes), kernel_props}; +launch(policy, d_A, d_B, d_C, n); +``` -template -sycl::event launch(sycl::nd_range<3> launch_range, sycl::queue queue, - Args... args); +To request a certain cluster dimension on supported hardware: -template -sycl::event launch(sycl::nd_range launch_range, - Args... args); +```c++ +using syclcompat::experimental; +namespace sycl_exp = sycl::ext::oneapi::experimental; -template -sycl::event launch(::syclcompat::dim3 grid_dim, ::syclcompat::dim3 block_dim, - Args... args); +sycl_exp::cuda::cluster_size cluster_dims(cluster_range); +launch_policy policy{blocksPerGrid, threadsPerBlock, + local_mem_size(nbytes), + launch_properties{cluster_dims}}; +launch(policy, d_A, d_B, d_C, n); ``` ### Utilities diff --git a/sycl/include/syclcompat/launch.hpp b/sycl/include/syclcompat/launch.hpp index 503f29ff8b91f..eb5d774bc12d3 100644 --- a/sycl/include/syclcompat/launch.hpp +++ b/sycl/include/syclcompat/launch.hpp @@ -31,6 +31,7 @@ #include #include +#include namespace syclcompat { @@ -67,26 +68,6 @@ launch(const sycl::nd_range<3> &range, sycl::queue q, Args... args) { range, [=](sycl::nd_item<3>) { [[clang::always_inline]] F(args...); }); } -template -sycl::event launch(const sycl::nd_range<3> &range, size_t mem_size, - sycl::queue q, Args... args) { - static_assert(detail::getArgumentCount(F) == sizeof...(args) + 1, - "Wrong number of arguments to SYCL kernel"); - - using F_t = decltype(F); - using f_return_t = typename std::invoke_result_t; - static_assert(std::is_same::value, - "SYCL kernels should return void"); - - return q.submit([&](sycl::handler &cgh) { - auto local_acc = sycl::local_accessor(mem_size, cgh); - cgh.parallel_for(range, [=](sycl::nd_item<3>) { - auto local_mem = local_acc.get_pointer(); - [[clang::always_inline]] F(args..., local_mem); - }); - }); -} - } // namespace detail template @@ -137,87 +118,47 @@ launch(const dim3 &grid, const dim3 &threads, Args... args) { return launch(grid, threads, get_default_queue(), args...); } -/// Launches a kernel with the templated F param and arguments on a -/// device specified by the given nd_range and SYCL queue. -/// @tparam F SYCL kernel to be executed, expects signature F(T* local_mem, -/// Args... args). -/// @tparam Dim nd_range dimension number. -/// @tparam Args Types of the arguments to be passed to the kernel. -/// @param range Nd_range specifying the work group and global sizes for the -/// kernel. -/// @param q The SYCL queue on which to execute the kernel. -/// @param mem_size The size, in number of bytes, of the local -/// memory to be allocated for kernel. -/// @param args The arguments to be passed to the kernel. -/// @return A SYCL event object that can be used to synchronize with the -/// kernel's execution. -template -sycl::event launch(const sycl::nd_range &range, size_t mem_size, - sycl::queue q, Args... args) { - return detail::launch(detail::transform_nd_range(range), mem_size, q, - args...); +} // namespace syclcompat + +namespace syclcompat::experimental { + +namespace detail { + +template +sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args) { + static_assert(syclcompat::args_compatible, + "Mismatch between device function signature and supplied " + "arguments. Have you correctly handled local memory/char*?"); + + sycl_exp::launch_config config(launch_policy.get_range(), + launch_policy.get_launch_properties()); + + return sycl_exp::submit_with_event(q, [&](sycl::handler &cgh) { + auto KernelFunctor = build_kernel_functor(cgh, launch_policy, args...); + if constexpr (syclcompat::detail::is_range_v< + typename LaunchPolicy::RangeT>) { + parallel_for(cgh, config, KernelFunctor); + } else { + static_assert( + syclcompat::detail::is_nd_range_v); + nd_launch(cgh, config, KernelFunctor); + } + }); } -/// Launches a kernel with the templated F param and arguments on a -/// device specified by the given nd_range using theSYCL default queue. -/// @tparam F SYCL kernel to be executed, expects signature F(T* local_mem, -/// Args... args). -/// @tparam Dim nd_range dimension number. -/// @tparam Args Types of the arguments to be passed to the kernel. -/// @param range Nd_range specifying the work group and global sizes for the -/// kernel. -/// @param mem_size The size, in number of bytes, of the local -/// memory to be allocated for kernel. -/// @param args The arguments to be passed to the kernel. -/// @return A SYCL event object that can be used to synchronize with the -/// kernel's execution. -template -sycl::event launch(const sycl::nd_range &range, size_t mem_size, - Args... args) { - return launch(range, mem_size, get_default_queue(), args...); } -/// Launches a kernel with the templated F param and arguments on a -/// device with a user-specified grid and block dimensions following the -/// standard of other programming models using a user-defined SYCL queue. -/// @tparam F SYCL kernel to be executed, expects signature F(T* local_mem, -/// Args... args). -/// @tparam Dim nd_range dimension number. -/// @tparam Args Types of the arguments to be passed to the kernel. -/// @param grid Grid dimensions represented with an (x, y, z) iteration space. -/// @param threads Block dimensions represented with an (x, y, z) iteration -/// space. -/// @param mem_size The size, in number of bytes, of the local -/// memory to be allocated for kernel. -/// @param args The arguments to be passed to the kernel. -/// @return A SYCL event object that can be used to synchronize with the -/// kernel's execution. -template -sycl::event launch(const dim3 &grid, const dim3 &threads, size_t mem_size, - sycl::queue q, Args... args) { - return launch(sycl::nd_range<3>{grid * threads, threads}, mem_size, q, - args...); + +template +sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args) { + static_assert(detail::is_launch_policy_v); + return detail::launch(launch_policy, q, args...); } -/// Launches a kernel with the templated F param and arguments on a -/// device with a user-specified grid and block dimensions following the -/// standard of other programming models using the default SYCL queue. -/// @tparam F SYCL kernel to be executed, expects signature F(T* local_mem, -/// Args... args). -/// @tparam Dim nd_range dimension number. -/// @tparam Args Types of the arguments to be passed to the kernel. -/// @param grid Grid dimensions represented with an (x, y, z) iteration space. -/// @param threads Block dimensions represented with an (x, y, z) iteration -/// space. -/// @param mem_size The size, in number of bytes, of the -/// local memory to be allocated. -/// @param args The arguments to be passed to the kernel. -/// @return A SYCL event object that can be used to synchronize with the -/// kernel's execution. -template -sycl::event launch(const dim3 &grid, const dim3 &threads, size_t mem_size, - Args... args) { - return launch(grid, threads, mem_size, get_default_queue(), args...); +template +sycl::event launch(LaunchPolicy launch_policy, Args... args) { + static_assert(detail::is_launch_policy_v); + return launch(launch_policy, get_default_queue(), args...); } -} // namespace syclcompat +} // namespace syclcompat::experimental diff --git a/sycl/include/syclcompat/launch_experimental.hpp b/sycl/include/syclcompat/launch_experimental.hpp deleted file mode 100644 index 3074c8c20371e..0000000000000 --- a/sycl/include/syclcompat/launch_experimental.hpp +++ /dev/null @@ -1,105 +0,0 @@ -/*************************************************************************** - * - * Copyright (C) Codeplay Software Ltd. - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM - * Exceptions. See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SYCLcompat - * - * launch_experimental.hpp - * - * Description: - * Launch Overloads with accepting required subgroup size - **************************************************************************/ - -#pragma once - -#include -#include -#include - -namespace syclcompat { -namespace experimental { - -//================================================================================================// -// Overloads using Local Memory // -//================================================================================================// - -template -std::enable_if_t, sycl::event> -launch(sycl::nd_range<3> launch_range, std::size_t local_memory_size, - sycl::queue queue, Args... args) { - return queue.submit([&](sycl::handler &cgh) { - sycl::local_accessor loc(local_memory_size, cgh); - cgh.parallel_for( - launch_range, - [=](sycl::nd_item<3> it) [[sycl::reqd_sub_group_size(SubgroupSize)]] { - [[clang::always_inline]] F( - args..., loc.get_multi_ptr()); - }); - }); -} - -template -std::enable_if_t, sycl::event> -launch(sycl::nd_range launch_range, std::size_t local_memory_size, - Args... args) { - return launch( - ::syclcompat::detail::transform_nd_range(launch_range), local_memory_size, - ::syclcompat::get_default_queue(), args...); -} - -template -std::enable_if_t, sycl::event> -launch(::syclcompat::dim3 grid_dim, ::syclcompat::dim3 block_dim, - std::size_t local_memory_size, Args... args) { - return launch( - ::syclcompat::detail::transform_nd_range(sycl::nd_range( - sycl::range<3>(grid_dim * block_dim), sycl::range<3>(block_dim))), - local_memory_size, ::syclcompat::get_default_queue(), args...); -} - -//================================================================================================// -// Overloads not using Local Memory // -//================================================================================================// - -template -std::enable_if_t, sycl::event> -launch(sycl::nd_range<3> launch_range, sycl::queue queue, Args... args) { - return queue.submit([&](sycl::handler &cgh) { - cgh.parallel_for(launch_range, - [=](sycl::nd_item<3> it) - [[sycl::reqd_sub_group_size(SubgroupSize)]] { - [[clang::always_inline]] F(args...); - }); - }); -} - -template -std::enable_if_t, sycl::event> -launch(sycl::nd_range launch_range, Args... args) { - return launch( - ::syclcompat::detail::transform_nd_range(launch_range), - ::syclcompat::get_default_queue(), args...); -} - -template -std::enable_if_t, sycl::event> -launch(::syclcompat::dim3 grid_dim, ::syclcompat::dim3 block_dim, - Args... args) { - return launch( - ::syclcompat::detail::transform_nd_range(sycl::nd_range( - sycl::range<3>(grid_dim * block_dim), sycl::range<3>(block_dim))), - ::syclcompat::get_default_queue(), args...); -} - -} // namespace experimental -} // namespace syclcompat diff --git a/sycl/include/syclcompat/launch_policy.hpp b/sycl/include/syclcompat/launch_policy.hpp new file mode 100644 index 0000000000000..1c5f6ed3e97d6 --- /dev/null +++ b/sycl/include/syclcompat/launch_policy.hpp @@ -0,0 +1,254 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL compatibility extension + * + * launch.hpp + * + * Description: + * launch functionality for the SYCL compatibility extension + **************************************************************************/ + +#pragma once + +#include "sycl/ext/oneapi/experimental/enqueue_functions.hpp" +#include "sycl/ext/oneapi/properties/properties.hpp" +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace syclcompat { +namespace experimental { + +namespace sycl_exp = sycl::ext::oneapi::experimental; + +// Wrapper for kernel sycl_exp::properties +template struct kernel_properties { + static_assert(sycl_exp::is_property_list_v); + using Props = Properties; + + template + kernel_properties(Props... properties) : props{properties...} {} + + template + kernel_properties(sycl_exp::properties properties) + : props{properties} {} + + Properties props; +}; + +template ::value, void>> +kernel_properties(Props... props) + -> kernel_properties; + +template +kernel_properties(sycl_exp::properties props) + -> kernel_properties>; + +// Wrapper for launch sycl_exp::properties +template struct launch_properties { + static_assert(sycl_exp::is_property_list_v); + using Props = Properties; + + template + launch_properties(Props... properties) : props{properties...} {} + + template + launch_properties(sycl_exp::properties properties) + : props{properties} {} + + Properties props; +}; + +template ::value, void>> +launch_properties(Props... props) + -> launch_properties; + +template +launch_properties(sycl_exp::properties props) + -> launch_properties>; + +// Wrapper for local memory size +struct local_mem_size { + local_mem_size(size_t size = 0) : size{size} {}; + size_t size; +}; + +// launch_policy is constructed by the user & passed to `compat_exp::launch` +template +class launch_policy { + static_assert(sycl_exp::is_property_list_v); + static_assert(sycl_exp::is_property_list_v); + static_assert(syclcompat::detail::is_range_or_nd_range_v); + static_assert(syclcompat::detail::is_nd_range_v || !LocalMem, + "sycl::range kernel launches are incompatible with local " + "memory usage!"); + +public: + using KPropsT = KProps; + using LPropsT = LProps; + using RangeT = Range; + static constexpr bool HasLocalMem = LocalMem; + +private: + launch_policy() = default; + + template + launch_policy(Ts... ts) + : _kernel_properties{detail::property_getter< + kernel_properties, kernel_properties, std::tuple>()( + std::tuple(ts...))}, + _launch_properties{detail::property_getter< + launch_properties, launch_properties, std::tuple>()( + std::tuple(ts...))}, + _local_mem_size{ + detail::local_mem_getter>()( + std::tuple(ts...))} { + check_variadic_args(ts...); + } + + template void check_variadic_args(Ts...) { + static_assert( + std::conjunction_v, + detail::is_launch_properties, + detail::is_local_mem_size>...>, + "Received an unexpected argument to ctor. Did you forget to wrap " + "in " + "compat::kernel_properties, launch_properties, local_mem_size?"); + } + +public: + template + launch_policy(Range range, Ts... ts) : launch_policy(ts...) { + _range = range; + check_variadic_args(ts...); + } + + template + launch_policy(dim3 global_range, Ts... ts) : launch_policy(ts...) { + _range = Range{global_range}; + check_variadic_args(ts...); + } + + template + launch_policy(dim3 global_range, dim3 local_range, Ts... ts) + : launch_policy(ts...) { + _range = Range{global_range * local_range, local_range}; + check_variadic_args(ts...); + } + + KProps get_kernel_properties() { return _kernel_properties.props; } + LProps get_launch_properties() { return _launch_properties.props; } + size_t get_local_mem_size() { return _local_mem_size.size; } + Range get_range() { return _range; } + +private: + Range _range; + kernel_properties _kernel_properties; + launch_properties _launch_properties; + local_mem_size _local_mem_size; +}; + +// Deduction guides for launch_policy +template +launch_policy(Range, Ts...) -> launch_policy< + Range, detail::properties_or_empty, + detail::properties_or_empty, + detail::has_type>::value>; + +template +launch_policy(sycl::range, sycl::range, Ts...) -> launch_policy< + sycl::nd_range, detail::properties_or_empty, + detail::properties_or_empty, + detail::has_type>::value>; + +template +launch_policy(dim3, Ts...) -> launch_policy< + sycl::range<3>, detail::properties_or_empty, + detail::properties_or_empty, + detail::has_type>::value>; + +template +launch_policy(dim3, dim3, Ts...) -> launch_policy< + sycl::nd_range<3>, detail::properties_or_empty, + detail::properties_or_empty, + detail::has_type>::value>; + +namespace detail { + +template +struct KernelFunctor { + KernelFunctor(KProps kernel_props, Args... args) + : _kernel_properties{kernel_props}, + _argument_tuple(std::make_tuple(args...)) {} + + KernelFunctor(KProps kernel_props, sycl::local_accessor local_acc, + Args... args) + : _kernel_properties{kernel_props}, _local_acc{local_acc}, + _argument_tuple(std::make_tuple(args...)) {} + + auto get(sycl_exp::properties_tag) { return _kernel_properties; } + + __syclcompat_inline__ void + operator()(syclcompat::detail::range_to_item_t) const { + if constexpr (HasLocalMem) { + char *local_mem_ptr = static_cast( + _local_acc.template get_multi_ptr().get()); + std::apply( + [lmem_ptr = local_mem_ptr](auto &&...args) { F(args..., lmem_ptr); }, + _argument_tuple); + } else { + std::apply([](auto &&...args) { F(args...); }, _argument_tuple); + } + } + + KProps _kernel_properties; + std::tuple _argument_tuple; + std::conditional_t, std::monostate> + _local_acc; // monostate for empty type +}; + +//==================================================================== +// This helper function avoids 2 nested `if constexpr` in detail::launch +template +auto build_kernel_functor(sycl::handler &cgh, LaunchPolicy launch_policy, + Args... args) + -> KernelFunctor { + if constexpr (LaunchPolicy::HasLocalMem) { + sycl::local_accessor local_memory( + launch_policy.get_local_mem_size(), cgh); + return KernelFunctor( + launch_policy.get_kernel_properties(), local_memory, args...); + } else { + return KernelFunctor( + launch_policy.get_kernel_properties(), args...); + } +} + +} // namespace detail +} // namespace experimental +} // namespace syclcompat diff --git a/sycl/include/syclcompat/syclcompat.hpp b/sycl/include/syclcompat/syclcompat.hpp index 401b5681d40dd..8c5f693794948 100644 --- a/sycl/include/syclcompat/syclcompat.hpp +++ b/sycl/include/syclcompat/syclcompat.hpp @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/sycl/include/syclcompat/traits.hpp b/sycl/include/syclcompat/traits.hpp index f992c67bae8ca..2f389ccf79484 100644 --- a/sycl/include/syclcompat/traits.hpp +++ b/sycl/include/syclcompat/traits.hpp @@ -23,6 +23,10 @@ #pragma once #include +#include +#include +#include +#include #include namespace syclcompat { @@ -41,4 +45,209 @@ template struct arith { }; template using arith_t = typename arith::type; +// Traits to check device function signature matches args (with or without local +// mem) +template +struct device_fn_invocable : std::is_invocable {}; + +template +struct device_fn_lmem_invocable + : std::is_invocable {}; + +template +constexpr inline bool args_compatible = + std::conditional_t, + device_fn_invocable>::value; + +namespace detail { + +// Trait for identifying sycl::range and sycl::nd_range. +template struct is_range : std::false_type {}; +template struct is_range> : std::true_type {}; + +template constexpr bool is_range_v = is_range::value; + +template struct is_nd_range : std::false_type {}; +template struct is_nd_range> : std::true_type {}; + +template constexpr bool is_nd_range_v = is_nd_range::value; + +template +constexpr bool is_range_or_nd_range_v = + std::disjunction_v, is_nd_range>; + +// Trait range_to_item_t to convert nd_range -> nd_item, range -> item +template struct range_to_item_map; +template struct range_to_item_map> { + using ItemT = sycl::nd_item; +}; +template struct range_to_item_map> { + using ItemT = sycl::item; +}; + +template +using range_to_item_t = typename range_to_item_map::ItemT; + +} // namespace detail + +// Forward decls +namespace experimental { + +template struct kernel_properties; +template struct launch_properties; +struct local_mem_size; + +template +class launch_policy; +} // namespace experimental + +namespace experimental::detail { + +// Helper for tuple_template_index +template