Skip to content

Commit 5edbabb

Browse files
authored
Merge branch 'sycl' into fabio/fix_level_zero_updates
2 parents e9e6af3 + dc181bb commit 5edbabb

File tree

10 files changed

+310
-11
lines changed

10 files changed

+310
-11
lines changed
Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
# commit f01741af022cfe82afcb026b9aa0be251eb6a497
2-
# Merge: 004d2474 85bb5f62
3-
# Author: Callum Fare <callum@codeplay.com>
4-
# Date: Tue Nov 5 13:39:53 2024 +0000
5-
# Merge pull request #2260 from nrspruit/refactor_l0_default_init
6-
# [L0] Refactor to remove default constructor inits
1+
# commit 3edf99755ce2af3b53102a7d8438e0fe969efac3
2+
# Merge: 5955bad3 0b968661
3+
# Author: Ross Brunton <ross@codeplay.com>
4+
# Date: Wed Nov 6 11:07:29 2024 +0000
5+
# Merge pull request #2082 from RossBrunton/ross/multiadapt
6+
# [CI] Add "loader" support to conformance testing
77
set(UNIFIED_RUNTIME_TAG 8e032a7f10acb0c07fb17338e96f3fcd72ef781c)

sycl/include/syclcompat/launch_policy.hpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,17 @@ launch_policy(dim3, dim3, Ts...) -> launch_policy<
192192
detail::has_type<local_mem_size, std::tuple<Ts...>>::value>;
193193

194194
namespace detail {
195+
// Custom std::apply helpers to enable inlining
196+
template <class F, class Tuple, size_t... Is>
197+
__syclcompat_inline__ constexpr void apply_expand(F f, Tuple t,
198+
std::index_sequence<Is...>) {
199+
[[clang::always_inline]] f(get<Is>(t)...);
200+
}
201+
202+
template <class F, class Tuple>
203+
__syclcompat_inline__ constexpr void apply_helper(F f, Tuple t) {
204+
apply_expand(f, t, std::make_index_sequence<std::tuple_size<Tuple>{}>{});
205+
}
195206

196207
template <auto F, typename Range, typename KProps, bool HasLocalMem,
197208
typename... Args>
@@ -211,12 +222,16 @@ struct KernelFunctor {
211222
operator()(syclcompat::detail::range_to_item_t<Range>) const {
212223
if constexpr (HasLocalMem) {
213224
char *local_mem_ptr = static_cast<char *>(
214-
_local_acc.template get_multi_ptr<sycl::access::decorated::no>().get());
215-
std::apply(
216-
[lmem_ptr = local_mem_ptr](auto &&...args) { F(args..., lmem_ptr); },
225+
_local_acc.template get_multi_ptr<sycl::access::decorated::no>()
226+
.get());
227+
apply_helper(
228+
[lmem_ptr = local_mem_ptr](auto &&...args) {
229+
[[clang::always_inline]] F(args..., lmem_ptr);
230+
},
217231
_argument_tuple);
218232
} else {
219-
std::apply([](auto &&...args) { F(args...); }, _argument_tuple);
233+
apply_helper([](auto &&...args) { [[clang::always_inline]] F(args...); },
234+
_argument_tuple);
220235
}
221236
}
222237

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
#define GRAPH_E2E_EXPLICIT
9+
10+
#include "../Inputs/local_accessor.cpp"
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// Tests basic adding of nodes with local accessors,
2+
// and submission of the graph.
3+
4+
#include "../graph_common.hpp"
5+
6+
int main() {
7+
queue Queue{};
8+
9+
using T = int;
10+
11+
const size_t LocalSize = 128;
12+
13+
std::vector<T> DataA(Size), DataB(Size), DataC(Size);
14+
15+
std::iota(DataA.begin(), DataA.end(), 10);
16+
17+
std::vector<T> ReferenceA(DataA);
18+
19+
exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
20+
21+
T *PtrA = malloc_device<T>(Size, Queue);
22+
23+
Queue.copy(DataA.data(), PtrA, Size);
24+
Queue.wait_and_throw();
25+
26+
auto node = add_node(Graph, Queue, [&](handler &CGH) {
27+
local_accessor<T, 1> LocalMem(LocalSize, CGH);
28+
29+
CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
30+
LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2;
31+
PtrA[Item.get_global_linear_id()] += LocalMem[Item.get_local_linear_id()];
32+
});
33+
});
34+
35+
auto GraphExec = Graph.finalize();
36+
37+
for (unsigned n = 0; n < Iterations; n++) {
38+
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
39+
}
40+
41+
Queue.wait_and_throw();
42+
43+
Queue.copy(PtrA, DataA.data(), Size);
44+
Queue.wait_and_throw();
45+
46+
free(PtrA, Queue);
47+
48+
for (size_t i = 0; i < Size; i++) {
49+
T Ref = 10 + i + (i * 2);
50+
check_value(i, Ref, ReferenceA[i], "PtrA");
51+
}
52+
53+
return 0;
54+
}
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
// Tests whole graph update of nodes with local accessors,
2+
// and submission of the graph.
3+
4+
#include "../graph_common.hpp"
5+
6+
using T = int;
7+
8+
auto add_graph_node(
9+
exp_ext::command_graph<exp_ext::graph_state::modifiable> &Graph,
10+
queue &Queue, size_t Size, size_t LocalSize, T *Ptr) {
11+
return add_node(Graph, Queue, [&](handler &CGH) {
12+
local_accessor<T, 1> LocalMem(LocalSize, CGH);
13+
14+
CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
15+
LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2;
16+
Ptr[Item.get_global_linear_id()] +=
17+
LocalMem[Item.get_local_linear_id()] + Item.get_local_range(0);
18+
});
19+
});
20+
}
21+
int main() {
22+
queue Queue{};
23+
24+
const size_t LocalSize = 128;
25+
26+
std::vector<T> DataA(Size), DataB(Size);
27+
28+
std::iota(DataA.begin(), DataA.end(), 10);
29+
std::iota(DataB.begin(), DataB.end(), 10);
30+
31+
std::vector<T> ReferenceA(DataA), ReferenceB(DataB);
32+
33+
exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};
34+
35+
T *PtrA = malloc_device<T>(Size, Queue);
36+
T *PtrB = malloc_device<T>(Size, Queue);
37+
38+
Queue.copy(DataA.data(), PtrA, Size);
39+
Queue.copy(DataB.data(), PtrB, Size);
40+
Queue.wait_and_throw();
41+
42+
auto NodeA = add_graph_node(GraphA, Queue, Size, LocalSize / 2, PtrA);
43+
44+
auto GraphExecA = GraphA.finalize(exp_ext::property::graph::updatable{});
45+
46+
// Create second graph for whole graph update with a different local size
47+
exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()};
48+
auto NodeB = add_graph_node(GraphB, Queue, Size, LocalSize, PtrB);
49+
50+
// Execute graphs before updating and check outputs
51+
for (unsigned n = 0; n < Iterations; n++) {
52+
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); });
53+
}
54+
55+
Queue.wait_and_throw();
56+
57+
Queue.copy(PtrA, DataA.data(), Size);
58+
Queue.copy(PtrB, DataB.data(), Size);
59+
Queue.wait_and_throw();
60+
61+
for (size_t i = 0; i < Size; i++) {
62+
T RefA = 10 + i + (i * 2) + LocalSize / 2;
63+
T RefB = 10 + i;
64+
check_value(i, RefA, ReferenceA[i], "PtrA");
65+
check_value(i, RefB, ReferenceB[i], "PtrB");
66+
}
67+
68+
// Update GraphExecA using whole graph update
69+
70+
GraphExecA.update(GraphB);
71+
72+
// Execute graphs again and check outputs
73+
for (unsigned n = 0; n < Iterations; n++) {
74+
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); });
75+
}
76+
77+
Queue.wait_and_throw();
78+
79+
Queue.copy(PtrA, DataA.data(), Size);
80+
Queue.copy(PtrB, DataB.data(), Size);
81+
Queue.wait_and_throw();
82+
83+
for (size_t i = 0; i < Size; i++) {
84+
T RefA = 10 + i + (i * 2) + LocalSize / 2;
85+
T RefB = 10 + i + (i * 2) + LocalSize;
86+
check_value(i, RefA, ReferenceA[i], "PtrA");
87+
check_value(i, RefB, ReferenceB[i], "PtrB");
88+
}
89+
90+
free(PtrA, Queue);
91+
free(PtrB, Queue);
92+
return 0;
93+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
#define GRAPH_E2E_RECORD_REPLAY
9+
10+
#include "../Inputs/local_accessor.cpp"
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
#define GRAPH_E2E_EXPLICIT
9+
10+
#include "../../Inputs/whole_update_local_acc.cpp"
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
#define GRAPH_E2E_RECORD_REPLAY
9+
10+
#include "../../Inputs/whole_update_local_acc.cpp"

sycl/test/syclcompat/launch/kernel_properties.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
// We need hardware which can support at least 2 sub-group sizes, since that
2424
// hardware (presumably) supports the `intel_reqd_sub_group_size` attribute.
2525
// REQUIRES: sg-32 && sg-16
26-
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} -o - | FileCheck %s
26+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s
2727
#include <sycl/ext/oneapi/kernel_properties/properties.hpp>
2828
#include <sycl/detail/core.hpp>
2929
#include <sycl/ext/oneapi/properties/properties.hpp>
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/***************************************************************************
2+
*
3+
* Copyright (C) Codeplay Software Ltd.
4+
*
5+
* Part of the LLVM Project, under the Apache License v2.0 with LLVM
6+
* Exceptions. See https://llvm.org/LICENSE.txt for license information.
7+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*
15+
* SYCLcompat API
16+
*
17+
* launch_inlining.cpp
18+
*
19+
* Description:
20+
* Ensure kernels are inlined
21+
**************************************************************************/
22+
// RUN: %clangxx -fsycl -fgpu-inline-threshold=0 %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s
23+
// We set -fgpu-inline-threshold=0 to disable heuristic inlining for the
24+
// purposes of the test
25+
#include <sycl/detail/core.hpp>
26+
#include <sycl/group_barrier.hpp>
27+
#include <syclcompat/launch.hpp>
28+
#include <syclcompat/memory.hpp>
29+
30+
namespace compat_exp = syclcompat::experimental;
31+
namespace sycl_exp = sycl::ext::oneapi::experimental;
32+
namespace sycl_intel_exp = sycl::ext::intel::experimental;
33+
34+
static constexpr int LOCAL_MEM_SIZE = 1024;
35+
36+
// CHECK: define {{.*}}spir_kernel{{.*}}write_mem_kernel{{.*}} {
37+
// CHECK-NOT: call {{.*}}write_mem_kernel
38+
// CHECK: }
39+
40+
template <typename T> void write_mem_kernel(T *data, int num_elements) {
41+
const int id =
42+
sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0);
43+
if (id < num_elements) {
44+
data[id] = static_cast<T>(id);
45+
}
46+
};
47+
48+
// CHECK: define {{.*}}spir_kernel{{.*}}dynamic_local_mem_typed_kernel{{.*}} {
49+
// CHECK-NOT: call {{.*}}dynamic_local_mem_typed_kernel
50+
// CHECK: }
51+
template <typename T>
52+
void dynamic_local_mem_typed_kernel(T *data, char *local_mem) {
53+
constexpr size_t num_elements = LOCAL_MEM_SIZE / sizeof(T);
54+
T *typed_local_mem = reinterpret_cast<T *>(local_mem);
55+
56+
const int id =
57+
sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0);
58+
if (id < num_elements) {
59+
typed_local_mem[id] = static_cast<T>(id);
60+
}
61+
sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_work_group<1>());
62+
if (id < num_elements) {
63+
data[id] = typed_local_mem[num_elements - id - 1];
64+
}
65+
};
66+
67+
int test_write_mem() {
68+
compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32},
69+
syclcompat::dim3{32});
70+
71+
const int memsize = 1024;
72+
int *d_a = (int *)syclcompat::malloc(memsize);
73+
compat_exp::launch<write_mem_kernel<int>>(my_dim3_config, d_a,
74+
memsize / sizeof(int))
75+
.wait();
76+
77+
syclcompat::free(d_a);
78+
return 0;
79+
}
80+
81+
int test_lmem_launch() {
82+
int local_mem_size = LOCAL_MEM_SIZE;
83+
84+
size_t num_elements = local_mem_size / sizeof(int);
85+
int *d_a = (int *)syclcompat::malloc(local_mem_size);
86+
87+
compat_exp::launch_policy my_config(
88+
sycl::nd_range<1>{{256}, {256}},
89+
compat_exp::local_mem_size(local_mem_size));
90+
91+
compat_exp::launch<dynamic_local_mem_typed_kernel<int>>(my_config, d_a)
92+
.wait();
93+
94+
syclcompat::free(d_a);
95+
96+
return 0;
97+
}

0 commit comments

Comments
 (0)