diff --git a/sycl/include/syclcompat/launch_policy.hpp b/sycl/include/syclcompat/launch_policy.hpp index 1c5f6ed3e97d6..f6a30fc46db1e 100644 --- a/sycl/include/syclcompat/launch_policy.hpp +++ b/sycl/include/syclcompat/launch_policy.hpp @@ -192,6 +192,17 @@ launch_policy(dim3, dim3, Ts...) -> launch_policy< detail::has_type>::value>; namespace detail { +// Custom std::apply helpers to enable inlining +template +__syclcompat_inline__ constexpr void apply_expand(F f, Tuple t, + std::index_sequence) { + [[clang::always_inline]] f(get(t)...); +} + +template +__syclcompat_inline__ constexpr void apply_helper(F f, Tuple t) { + apply_expand(f, t, std::make_index_sequence{}>{}); +} template @@ -211,12 +222,16 @@ struct KernelFunctor { operator()(syclcompat::detail::range_to_item_t) const { if constexpr (HasLocalMem) { char *local_mem_ptr = static_cast( - _local_acc.template get_multi_ptr().get()); - std::apply( - [lmem_ptr = local_mem_ptr](auto &&...args) { F(args..., lmem_ptr); }, + _local_acc.template get_multi_ptr() + .get()); + apply_helper( + [lmem_ptr = local_mem_ptr](auto &&...args) { + [[clang::always_inline]] F(args..., lmem_ptr); + }, _argument_tuple); } else { - std::apply([](auto &&...args) { F(args...); }, _argument_tuple); + apply_helper([](auto &&...args) { [[clang::always_inline]] F(args...); }, + _argument_tuple); } } diff --git a/sycl/test/syclcompat/launch/kernel_properties.cpp b/sycl/test/syclcompat/launch/kernel_properties.cpp index f17571fae0c2d..78920c62c5347 100644 --- a/sycl/test/syclcompat/launch/kernel_properties.cpp +++ b/sycl/test/syclcompat/launch/kernel_properties.cpp @@ -23,7 +23,7 @@ // We need hardware which can support at least 2 sub-group sizes, since that // hardware (presumably) supports the `intel_reqd_sub_group_size` attribute. // REQUIRES: sg-32 && sg-16 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} -o - | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s #include #include #include diff --git a/sycl/test/syclcompat/launch/launch_inlining.cpp b/sycl/test/syclcompat/launch/launch_inlining.cpp new file mode 100644 index 0000000000000..a224837139a56 --- /dev/null +++ b/sycl/test/syclcompat/launch/launch_inlining.cpp @@ -0,0 +1,97 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCLcompat API + * + * launch_inlining.cpp + * + * Description: + * Ensure kernels are inlined + **************************************************************************/ +// RUN: %clangxx -fsycl -fgpu-inline-threshold=0 %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s +// We set -fgpu-inline-threshold=0 to disable heuristic inlining for the +// purposes of the test +#include +#include +#include +#include + +namespace compat_exp = syclcompat::experimental; +namespace sycl_exp = sycl::ext::oneapi::experimental; +namespace sycl_intel_exp = sycl::ext::intel::experimental; + +static constexpr int LOCAL_MEM_SIZE = 1024; + +// CHECK: define {{.*}}spir_kernel{{.*}}write_mem_kernel{{.*}} { +// CHECK-NOT: call {{.*}}write_mem_kernel +// CHECK: } + +template void write_mem_kernel(T *data, int num_elements) { + const int id = + sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0); + if (id < num_elements) { + data[id] = static_cast(id); + } +}; + +// CHECK: define {{.*}}spir_kernel{{.*}}dynamic_local_mem_typed_kernel{{.*}} { +// CHECK-NOT: call {{.*}}dynamic_local_mem_typed_kernel +// CHECK: } +template +void dynamic_local_mem_typed_kernel(T *data, char *local_mem) { + constexpr size_t num_elements = LOCAL_MEM_SIZE / sizeof(T); + T *typed_local_mem = reinterpret_cast(local_mem); + + const int id = + sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0); + if (id < num_elements) { + typed_local_mem[id] = static_cast(id); + } + sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_work_group<1>()); + if (id < num_elements) { + data[id] = typed_local_mem[num_elements - id - 1]; + } +}; + +int test_write_mem() { + compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32}, + syclcompat::dim3{32}); + + const int memsize = 1024; + int *d_a = (int *)syclcompat::malloc(memsize); + compat_exp::launch>(my_dim3_config, d_a, + memsize / sizeof(int)) + .wait(); + + syclcompat::free(d_a); + return 0; +} + +int test_lmem_launch() { + int local_mem_size = LOCAL_MEM_SIZE; + + size_t num_elements = local_mem_size / sizeof(int); + int *d_a = (int *)syclcompat::malloc(local_mem_size); + + compat_exp::launch_policy my_config( + sycl::nd_range<1>{{256}, {256}}, + compat_exp::local_mem_size(local_mem_size)); + + compat_exp::launch>(my_config, d_a) + .wait(); + + syclcompat::free(d_a); + + return 0; +}