From 7dc5195cd57c5bfb0441e5046f7946269aef289a Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 31 Oct 2024 09:02:43 +0000 Subject: [PATCH 01/10] Ensure `launch`ed kernels are fully inlined --- sycl/include/syclcompat/launch_policy.hpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/sycl/include/syclcompat/launch_policy.hpp b/sycl/include/syclcompat/launch_policy.hpp index 1c5f6ed3e97d6..7b60d269c89c3 100644 --- a/sycl/include/syclcompat/launch_policy.hpp +++ b/sycl/include/syclcompat/launch_policy.hpp @@ -192,6 +192,18 @@ launch_policy(dim3, dim3, Ts...) -> launch_policy< detail::has_type>::value>; namespace detail { +// Custom std::apply helpers to enable inlining +template +__syclcompat_inline__ constexpr void +apply_expand(F f, Tuple t, std::index_sequence) { + [[clang::always_inline]] f(get(t)...); +} + +template +__syclcompat_inline__ constexpr void +apply_helper(F f, Tuple t) { + apply_expand(f, t, std::make_index_sequence{}>{}); +} template @@ -212,11 +224,11 @@ struct KernelFunctor { if constexpr (HasLocalMem) { char *local_mem_ptr = static_cast( _local_acc.template get_multi_ptr().get()); - std::apply( - [lmem_ptr = local_mem_ptr](auto &&...args) { F(args..., lmem_ptr); }, - _argument_tuple); + apply_helper( + [lmem_ptr = local_mem_ptr](auto &&...args) { [[clang::always_inline]] F(args..., lmem_ptr); }, + _argument_tuple); } else { - std::apply([](auto &&...args) { F(args...); }, _argument_tuple); + apply_helper([](auto &&...args) { [[clang::always_inline]] F(args...); }, _argument_tuple); } } From 5fefc0eceab7f3af79df13f61cb818aba6a70ba1 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 31 Oct 2024 09:37:34 +0000 Subject: [PATCH 02/10] Formatting --- sycl/include/syclcompat/launch_policy.hpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/sycl/include/syclcompat/launch_policy.hpp b/sycl/include/syclcompat/launch_policy.hpp index 7b60d269c89c3..f6a30fc46db1e 100644 --- a/sycl/include/syclcompat/launch_policy.hpp +++ b/sycl/include/syclcompat/launch_policy.hpp @@ -194,15 +194,14 @@ launch_policy(dim3, dim3, Ts...) -> launch_policy< namespace detail { // Custom std::apply helpers to enable inlining template -__syclcompat_inline__ constexpr void -apply_expand(F f, Tuple t, std::index_sequence) { - [[clang::always_inline]] f(get(t)...); +__syclcompat_inline__ constexpr void apply_expand(F f, Tuple t, + std::index_sequence) { + [[clang::always_inline]] f(get(t)...); } template -__syclcompat_inline__ constexpr void -apply_helper(F f, Tuple t) { - apply_expand(f, t, std::make_index_sequence{}>{}); +__syclcompat_inline__ constexpr void apply_helper(F f, Tuple t) { + apply_expand(f, t, std::make_index_sequence{}>{}); } template ) const { if constexpr (HasLocalMem) { char *local_mem_ptr = static_cast( - _local_acc.template get_multi_ptr().get()); + _local_acc.template get_multi_ptr() + .get()); apply_helper( - [lmem_ptr = local_mem_ptr](auto &&...args) { [[clang::always_inline]] F(args..., lmem_ptr); }, - _argument_tuple); + [lmem_ptr = local_mem_ptr](auto &&...args) { + [[clang::always_inline]] F(args..., lmem_ptr); + }, + _argument_tuple); } else { - apply_helper([](auto &&...args) { [[clang::always_inline]] F(args...); }, _argument_tuple); + apply_helper([](auto &&...args) { [[clang::always_inline]] F(args...); }, + _argument_tuple); } } From c96eca6ba7840af76940af3f9e5f9382d1ed904b Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 31 Oct 2024 20:19:04 +0000 Subject: [PATCH 03/10] Fix kernel_properties.cpp test --- sycl/test/syclcompat/launch/kernel_properties.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/syclcompat/launch/kernel_properties.cpp b/sycl/test/syclcompat/launch/kernel_properties.cpp index f17571fae0c2d..78920c62c5347 100644 --- a/sycl/test/syclcompat/launch/kernel_properties.cpp +++ b/sycl/test/syclcompat/launch/kernel_properties.cpp @@ -23,7 +23,7 @@ // We need hardware which can support at least 2 sub-group sizes, since that // hardware (presumably) supports the `intel_reqd_sub_group_size` attribute. // REQUIRES: sg-32 && sg-16 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} -o - | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s #include #include #include From 457f307283f1a7268481e4c01caa7baad6e208d3 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 31 Oct 2024 20:24:16 +0000 Subject: [PATCH 04/10] Test for inlining --- .../syclcompat/launch/launch_inlining.cpp | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 sycl/test/syclcompat/launch/launch_inlining.cpp diff --git a/sycl/test/syclcompat/launch/launch_inlining.cpp b/sycl/test/syclcompat/launch/launch_inlining.cpp new file mode 100644 index 0000000000000..3b3ce6f31a068 --- /dev/null +++ b/sycl/test/syclcompat/launch/launch_inlining.cpp @@ -0,0 +1,108 @@ +/*************************************************************************** + * + * Copyright (C) Codeplay Software Ltd. + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM + * Exceptions. See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCLcompat API + * + * launch_inlining.cpp + * + * Description: + * Ensure kernels are fully inlined + **************************************************************************/ + +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fgpu-inline-threshold=0 %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s +//TODO(joe): update_cc_test_checks.py +#include +#include +#include +#include + +namespace compat_exp = syclcompat::experimental; +namespace sycl_exp = sycl::ext::oneapi::experimental; +namespace sycl_intel_exp = sycl::ext::intel::experimental; + +static constexpr int LOCAL_MEM_SIZE = 1024; + +template +T dummy_fn(T input){ + return -input; +} + +// CHECK: define {{.*}}spir_kernel{{.*}}write_mem_kernel{{.*}} { +// CHECK-NOT: call {{.*}}write_mem_kernel +// CHECK-NOT: call {{.*}}dummy_fn +// CHECK: } + +template void write_mem_kernel(T *data, int num_elements) { + const int id = + sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0); + if (id < num_elements) { + data[id] = dummy_fn(static_cast(id)); + } +}; + +// CHECK: define {{.*}}spir_kernel{{.*}}dynamic_local_mem_typed_kernel{{.*}} { +// CHECK-NOT: call {{.*}}dynamic_local_mem_typed_kernel +// CHECK-NOT: call {{.*}}dummy_fn +// CHECK: } +template +void dynamic_local_mem_typed_kernel(T *data, char *local_mem) { + constexpr size_t num_elements = LOCAL_MEM_SIZE / sizeof(T); + T *typed_local_mem = reinterpret_cast(local_mem); + + const int id = + sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0); + if (id < num_elements) { + typed_local_mem[id] = static_cast(id); + } + sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_work_group<1>()); + if (id < num_elements) { + data[id] = dummy_fn(typed_local_mem[num_elements - id - 1]); + } +}; + +int test_write_mem() { + compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32}); + + const int memsize = 1024; + int *d_a = (int *)syclcompat::malloc(memsize); + compat_exp::launch>(my_dim3_config, d_a, + memsize / sizeof(int)) + .wait(); + + syclcompat::free(d_a); + return 0; +} + +int test_lmem_launch() { + using T = int; + // A property constructed at runtime: + sycl_intel_exp::cache_config my_cache_config{sycl_intel_exp::large_slm}; + + int local_mem_size = LOCAL_MEM_SIZE; // rt value + + size_t num_elements = local_mem_size / sizeof(T); + T *d_a = (T *)syclcompat::malloc(local_mem_size); + + compat_exp::launch_policy my_config( + sycl::nd_range<1>{{256}, {256}}, + compat_exp::local_mem_size(local_mem_size)); + + compat_exp::launch>(my_config, d_a) + .wait(); + + syclcompat::free(d_a); + + return 0; +} + From 89eec993bf02b710a4e3ff2339e2ded7dd573e71 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 31 Oct 2024 21:40:23 +0000 Subject: [PATCH 05/10] Simplify test --- sycl/test/syclcompat/launch/launch_inlining.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/sycl/test/syclcompat/launch/launch_inlining.cpp b/sycl/test/syclcompat/launch/launch_inlining.cpp index 3b3ce6f31a068..cea979166c2ee 100644 --- a/sycl/test/syclcompat/launch/launch_inlining.cpp +++ b/sycl/test/syclcompat/launch/launch_inlining.cpp @@ -85,14 +85,10 @@ int test_write_mem() { } int test_lmem_launch() { - using T = int; - // A property constructed at runtime: - sycl_intel_exp::cache_config my_cache_config{sycl_intel_exp::large_slm}; + int local_mem_size = LOCAL_MEM_SIZE; - int local_mem_size = LOCAL_MEM_SIZE; // rt value - - size_t num_elements = local_mem_size / sizeof(T); - T *d_a = (T *)syclcompat::malloc(local_mem_size); + size_t num_elements = local_mem_size / sizeof(int); + int *d_a = (int *)syclcompat::malloc(local_mem_size); compat_exp::launch_policy my_config( sycl::nd_range<1>{{256}, {256}}, From f640e40690a59acb391f4df6df79d8d0d76edd80 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 31 Oct 2024 22:09:10 +0000 Subject: [PATCH 06/10] Provide work-group & global size RangeRoundedKernel can prevent inlining --- sycl/test/syclcompat/launch/launch_inlining.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/syclcompat/launch/launch_inlining.cpp b/sycl/test/syclcompat/launch/launch_inlining.cpp index cea979166c2ee..65d35d68d15af 100644 --- a/sycl/test/syclcompat/launch/launch_inlining.cpp +++ b/sycl/test/syclcompat/launch/launch_inlining.cpp @@ -72,7 +72,7 @@ void dynamic_local_mem_typed_kernel(T *data, char *local_mem) { }; int test_write_mem() { - compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32}); + compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32}, syclcompat::dim3{32}); const int memsize = 1024; int *d_a = (int *)syclcompat::malloc(memsize); From fa563c22a082d2cecfcdb283b1e67d233810ed28 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Fri, 1 Nov 2024 09:02:42 +0000 Subject: [PATCH 07/10] inline dummy_fn --- sycl/test/syclcompat/launch/launch_inlining.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/syclcompat/launch/launch_inlining.cpp b/sycl/test/syclcompat/launch/launch_inlining.cpp index 65d35d68d15af..b1e11186ddac7 100644 --- a/sycl/test/syclcompat/launch/launch_inlining.cpp +++ b/sycl/test/syclcompat/launch/launch_inlining.cpp @@ -34,7 +34,7 @@ namespace sycl_intel_exp = sycl::ext::intel::experimental; static constexpr int LOCAL_MEM_SIZE = 1024; template -T dummy_fn(T input){ +__syclcompat_inline__ T dummy_fn(T input){ return -input; } From 719ff7410e5c576aaa3e3c6ea520e3ac92d672dc Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Tue, 5 Nov 2024 16:48:09 +0000 Subject: [PATCH 08/10] Remove dummy_fn We are testing for kernel inlining, dummy_fn inlining is a separate issue --- sycl/test/syclcompat/launch/launch_inlining.cpp | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/sycl/test/syclcompat/launch/launch_inlining.cpp b/sycl/test/syclcompat/launch/launch_inlining.cpp index b1e11186ddac7..433285c74dadb 100644 --- a/sycl/test/syclcompat/launch/launch_inlining.cpp +++ b/sycl/test/syclcompat/launch/launch_inlining.cpp @@ -17,11 +17,10 @@ * launch_inlining.cpp * * Description: - * Ensure kernels are fully inlined + * Ensure kernels are inlined **************************************************************************/ - // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fgpu-inline-threshold=0 %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s -//TODO(joe): update_cc_test_checks.py +// We set -fgpu-inline-threshold=0 to disable heuristic inlining for the purposes of the test #include #include #include @@ -33,27 +32,20 @@ namespace sycl_intel_exp = sycl::ext::intel::experimental; static constexpr int LOCAL_MEM_SIZE = 1024; -template -__syclcompat_inline__ T dummy_fn(T input){ - return -input; -} - // CHECK: define {{.*}}spir_kernel{{.*}}write_mem_kernel{{.*}} { // CHECK-NOT: call {{.*}}write_mem_kernel -// CHECK-NOT: call {{.*}}dummy_fn // CHECK: } template void write_mem_kernel(T *data, int num_elements) { const int id = sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0); if (id < num_elements) { - data[id] = dummy_fn(static_cast(id)); + data[id] = static_cast(id); } }; // CHECK: define {{.*}}spir_kernel{{.*}}dynamic_local_mem_typed_kernel{{.*}} { // CHECK-NOT: call {{.*}}dynamic_local_mem_typed_kernel -// CHECK-NOT: call {{.*}}dummy_fn // CHECK: } template void dynamic_local_mem_typed_kernel(T *data, char *local_mem) { @@ -67,7 +59,7 @@ void dynamic_local_mem_typed_kernel(T *data, char *local_mem) { } sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_work_group<1>()); if (id < num_elements) { - data[id] = dummy_fn(typed_local_mem[num_elements - id - 1]); + data[id] = typed_local_mem[num_elements - id - 1]; } }; From 3e43f4c548b20cda2c27c75f49b9b3b22c7aef6f Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Tue, 5 Nov 2024 19:23:07 +0000 Subject: [PATCH 09/10] Formatting --- sycl/test/syclcompat/launch/launch_inlining.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sycl/test/syclcompat/launch/launch_inlining.cpp b/sycl/test/syclcompat/launch/launch_inlining.cpp index 433285c74dadb..55d6aeb996338 100644 --- a/sycl/test/syclcompat/launch/launch_inlining.cpp +++ b/sycl/test/syclcompat/launch/launch_inlining.cpp @@ -20,7 +20,8 @@ * Ensure kernels are inlined **************************************************************************/ // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fgpu-inline-threshold=0 %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s -// We set -fgpu-inline-threshold=0 to disable heuristic inlining for the purposes of the test +// We set -fgpu-inline-threshold=0 to disable heuristic inlining for the +// purposes of the test #include #include #include @@ -64,12 +65,13 @@ void dynamic_local_mem_typed_kernel(T *data, char *local_mem) { }; int test_write_mem() { - compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32}, syclcompat::dim3{32}); + compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32}, + syclcompat::dim3{32}); const int memsize = 1024; int *d_a = (int *)syclcompat::malloc(memsize); compat_exp::launch>(my_dim3_config, d_a, - memsize / sizeof(int)) + memsize / sizeof(int)) .wait(); syclcompat::free(d_a); @@ -93,4 +95,3 @@ int test_lmem_launch() { return 0; } - From 24259038ef42ab13c1f988309e07ce93001ea475 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 6 Nov 2024 10:12:38 +0000 Subject: [PATCH 10/10] Don't specify (unused) target triple --- sycl/test/syclcompat/launch/launch_inlining.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/syclcompat/launch/launch_inlining.cpp b/sycl/test/syclcompat/launch/launch_inlining.cpp index 55d6aeb996338..a224837139a56 100644 --- a/sycl/test/syclcompat/launch/launch_inlining.cpp +++ b/sycl/test/syclcompat/launch/launch_inlining.cpp @@ -19,7 +19,7 @@ * Description: * Ensure kernels are inlined **************************************************************************/ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fgpu-inline-threshold=0 %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s +// RUN: %clangxx -fsycl -fgpu-inline-threshold=0 %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s // We set -fgpu-inline-threshold=0 to disable heuristic inlining for the // purposes of the test #include