Skip to content

Commit bbcf65c

Browse files
author
Konrad Kusiak
authored
[SYCL][GRAPH] Add graph E2E tests which use free function extension (#16159)
Dynamic parameters in SYCL Graphs should be used for now with free_function_kernels extension since then the order of kernel arguments is well defined. The tests are a copy of few selected ones from `Graph/Update` and modified such that they use the extension. They live in `Graph/Update/FreeFunctionKernels`. Since the free function kernels are not yet implemented on CUDA, we should keep for now both copies of the tests (with and without free functions).
1 parent f018e76 commit bbcf65c

16 files changed

+905
-1
lines changed

sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ Jack Kirk, Codeplay +
5858
Ronan Keryell, AMD +
5959
Andrey Alekseenko, KTH Royal Institute of Technology +
6060
Fábio Mestre, Codeplay +
61+
Konrad Kusiak, Codeplay +
6162

6263
== Dependencies
6364

@@ -1979,10 +1980,16 @@ can be used adding nodes to a graph when creating a graph from queue recording.
19791980
New methods are also defined that enable submitting an executable graph,
19801981
e.g. directly to a queue without returning an event.
19811982

1983+
==== sycl_ext_oneapi_free_function_kernels
1984+
1985+
`sycl_ext_oneapi_free_function_kernels`, defined in
1986+
link:../proposed/sycl_ext_oneapi_free_function_kernels.asciidoc[sycl_ext_oneapi_free_function_kernels]
1987+
can be used with SYCL Graphs.
1988+
19821989
== Examples and Usage Guide
19831990

19841991
Detailed code examples and usage guidelines are provided in the
1985-
link:../../SYCLGraphUsageGuide.md[SYCL Graph Usage Guide].
1992+
link:../../syclgraph/SYCLGraphUsageGuide.md[SYCL Graph Usage Guide].
19861993

19871994
== Future Direction [[future-direction]]
19881995

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
//
8+
// XFAIL: cuda
9+
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
10+
11+
#define GRAPH_E2E_EXPLICIT
12+
13+
#include "../Inputs/free_function_kernels.cpp"
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// Tests compatibility with free function kernels extension
2+
3+
#include "../graph_common.hpp"
4+
5+
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
6+
void ff_0(int *Ptr) {
7+
for (size_t i{0}; i < Size; ++i) {
8+
Ptr[i] = i;
9+
}
10+
}
11+
12+
int main() {
13+
queue Queue{};
14+
context Ctxt{Queue.get_context()};
15+
16+
exp_ext::command_graph Graph{Ctxt, Queue.get_device()};
17+
18+
int *PtrA = malloc_device<int>(Size, Queue);
19+
20+
std::vector<int> HostDataA(Size);
21+
22+
Queue.memset(PtrA, 0, Size * sizeof(int)).wait();
23+
24+
#ifndef __SYCL_DEVICE_ONLY__
25+
kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(Ctxt);
26+
kernel_id Kernel_id = exp_ext::get_kernel_id<ff_0>();
27+
kernel Kernel = Bundle.get_kernel(Kernel_id);
28+
auto KernelNode = Graph.add([&](handler &cgh) {
29+
cgh.set_arg(0, PtrA);
30+
cgh.single_task(Kernel);
31+
});
32+
33+
auto ExecGraph = Graph.finalize();
34+
35+
Queue.ext_oneapi_graph(ExecGraph).wait();
36+
37+
Queue.copy(PtrA, HostDataA.data(), Size).wait();
38+
for (size_t i = 0; i < Size; i++) {
39+
assert(HostDataA[i] == i);
40+
}
41+
#endif
42+
sycl::free(PtrA, Queue);
43+
44+
return 0;
45+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
//
8+
// XFAIL: cuda
9+
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
10+
11+
#define GRAPH_E2E_RECORD_REPLAY
12+
13+
#include "../Inputs/free_function_kernels.cpp"
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#pragma once
2+
3+
#include "../../graph_common.hpp"
4+
#include "sycl/ext/oneapi/kernel_properties/properties.hpp"
5+
#include "sycl/kernel_bundle.hpp"
6+
#include <sycl/ext/oneapi/free_function_queries.hpp>
7+
8+
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
9+
void ff_0(int *Ptr) {
10+
for (size_t i{0}; i < Size; ++i) {
11+
Ptr[i] = i;
12+
}
13+
}
14+
15+
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
16+
void ff_1(int *Ptr) {
17+
for (size_t i{0}; i < Size; ++i) {
18+
Ptr[i] += i;
19+
}
20+
}
21+
22+
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
23+
void ff_2(int *Ptr, size_t Size, size_t NumKernelLoops) {
24+
for (size_t j{0}; j < NumKernelLoops; j++) {
25+
for (size_t i{0}; i < Size; i++) {
26+
Ptr[i] += i;
27+
}
28+
}
29+
}
30+
31+
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<3>))
32+
void ff_3(int *Ptr) {
33+
size_t GlobalID =
34+
ext::oneapi::this_work_item::get_nd_item<3>().get_global_linear_id();
35+
Ptr[GlobalID] = GlobalID;
36+
}
37+
38+
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<3>))
39+
void ff_4(int *Ptr) {
40+
size_t GlobalID =
41+
ext::oneapi::this_work_item::get_nd_item<3>().get_global_linear_id();
42+
Ptr[GlobalID] *= 2;
43+
}
44+
45+
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<1>))
46+
void ff_5(int *PtrA, int *PtrB, int *PtrC) {
47+
size_t GlobalID =
48+
ext::oneapi::this_work_item::get_nd_item<1>().get_global_id();
49+
PtrC[GlobalID] += PtrA[GlobalID] * PtrB[GlobalID];
50+
}
51+
52+
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
53+
void ff_6(int *Ptr, int ScalarValue) {
54+
for (size_t i{0}; i < Size; ++i) {
55+
Ptr[i] = ScalarValue;
56+
}
57+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
//
8+
// XFAIL: cuda
9+
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
10+
11+
// Tests updating a graph node before finalization
12+
13+
#include "../../graph_common.hpp"
14+
#include "free_function_kernels.hpp"
15+
16+
int main() {
17+
queue Queue{};
18+
context Ctxt{Queue.get_context()};
19+
20+
exp_ext::command_graph Graph{Ctxt, Queue.get_device()};
21+
22+
int *PtrA = malloc_device<int>(Size, Queue);
23+
int *PtrB = malloc_device<int>(Size, Queue);
24+
25+
std::vector<int> HostDataA(Size);
26+
std::vector<int> HostDataB(Size);
27+
28+
Queue.memset(PtrA, 0, Size * sizeof(int)).wait();
29+
Queue.memset(PtrB, 0, Size * sizeof(int)).wait();
30+
31+
exp_ext::dynamic_parameter InputParam(Graph, PtrA);
32+
33+
#ifndef __SYCL_DEVICE_ONLY__
34+
kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(Ctxt);
35+
kernel_id Kernel_id = exp_ext::get_kernel_id<ff_0>();
36+
kernel Kernel = Bundle.get_kernel(Kernel_id);
37+
auto KernelNode = Graph.add([&](handler &cgh) {
38+
cgh.set_arg(0, InputParam);
39+
cgh.single_task(Kernel);
40+
});
41+
// Swap PtrB to be the input
42+
InputParam.update(PtrB);
43+
44+
auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
45+
46+
// Only PtrB should be filled with values
47+
Queue.ext_oneapi_graph(ExecGraph).wait();
48+
49+
Queue.copy(PtrA, HostDataA.data(), Size).wait();
50+
Queue.copy(PtrB, HostDataB.data(), Size).wait();
51+
for (size_t i = 0; i < Size; i++) {
52+
assert(HostDataA[i] == 0);
53+
assert(HostDataB[i] == i);
54+
}
55+
#endif
56+
sycl::free(PtrA, Queue);
57+
sycl::free(PtrB, Queue);
58+
59+
return 0;
60+
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
//
8+
// XFAIL: cuda
9+
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
10+
11+
// Tests creating multiple executable graphs from the same modifiable graph and
12+
// only updating one of them.
13+
14+
#include "../../graph_common.hpp"
15+
#include "free_function_kernels.hpp"
16+
17+
int main() {
18+
queue Queue{};
19+
context Ctxt{Queue.get_context()};
20+
21+
exp_ext::command_graph Graph{Ctxt, Queue.get_device()};
22+
23+
int *PtrA = malloc_device<int>(Size, Queue);
24+
int *PtrB = malloc_device<int>(Size, Queue);
25+
26+
std::vector<int> HostDataA(Size);
27+
std::vector<int> HostDataB(Size);
28+
29+
Queue.memset(PtrA, 0, Size * sizeof(int)).wait();
30+
Queue.memset(PtrB, 0, Size * sizeof(int)).wait();
31+
32+
exp_ext::dynamic_parameter InputParam(Graph, PtrA);
33+
34+
#ifndef __SYCL_DEVICE_ONLY__
35+
kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(Ctxt);
36+
kernel_id Kernel_id = exp_ext::get_kernel_id<ff_1>();
37+
kernel Kernel = Bundle.get_kernel(Kernel_id);
38+
auto KernelNode = Graph.add([&](handler &cgh) {
39+
cgh.set_arg(0, InputParam);
40+
cgh.single_task(Kernel);
41+
});
42+
43+
auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
44+
auto ExecGraph2 = Graph.finalize(exp_ext::property::graph::updatable{});
45+
46+
// PtrA values should be modified twice
47+
Queue.ext_oneapi_graph(ExecGraph).wait();
48+
Queue.ext_oneapi_graph(ExecGraph2).wait();
49+
50+
Queue.copy(PtrA, HostDataA.data(), Size).wait();
51+
Queue.copy(PtrB, HostDataB.data(), Size).wait();
52+
for (size_t i = 0; i < Size; i++) {
53+
assert(HostDataA[i] == i * 2);
54+
assert(HostDataB[i] == 0);
55+
}
56+
57+
// Swap PtrB to be the input
58+
InputParam.update(PtrB);
59+
// Only update ExecGraph, which should now modify PtrB while ExecGraph2
60+
// modifies PtrA still
61+
ExecGraph.update(KernelNode);
62+
Queue.ext_oneapi_graph(ExecGraph).wait();
63+
Queue.ext_oneapi_graph(ExecGraph2).wait();
64+
65+
Queue.copy(PtrA, HostDataA.data(), Size).wait();
66+
Queue.copy(PtrB, HostDataB.data(), Size).wait();
67+
for (size_t i = 0; i < Size; i++) {
68+
// A should have been modified 3 times by now, B only once
69+
assert(HostDataA[i] == i * 3);
70+
assert(HostDataB[i] == i);
71+
}
72+
#endif
73+
sycl::free(PtrA, Queue);
74+
sycl::free(PtrB, Queue);
75+
76+
return 0;
77+
}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
//
8+
// XFAIL: cuda
9+
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
10+
11+
// Tests that updating a graph is ordered with respect to previous executions of
12+
// the graph which may be in flight.
13+
14+
#include "../../graph_common.hpp"
15+
#include "free_function_kernels.hpp"
16+
17+
int main() {
18+
queue Queue{};
19+
context Ctxt{Queue.get_context()};
20+
21+
// Use a large N to try and make the kernel slow
22+
const size_t N = 1 << 16;
23+
// Loop inside kernel to make even slower (too large N runs out of memory)
24+
const size_t NumKernelLoops = 4;
25+
const size_t NumSubmitLoops = 8;
26+
27+
exp_ext::command_graph Graph{Ctxt, Queue.get_device()};
28+
29+
int *PtrA = malloc_device<int>(N, Queue);
30+
int *PtrB = malloc_device<int>(N, Queue);
31+
32+
std::vector<int> HostDataA(N);
33+
std::vector<int> HostDataB(N);
34+
35+
Queue.memset(PtrA, 0, N * sizeof(int)).wait();
36+
Queue.memset(PtrB, 0, N * sizeof(int)).wait();
37+
38+
exp_ext::dynamic_parameter InputParam(Graph, PtrA);
39+
40+
#ifndef __SYCL_DEVICE_ONLY__
41+
kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(Ctxt);
42+
kernel_id Kernel_id = exp_ext::get_kernel_id<ff_2>();
43+
kernel Kernel = Bundle.get_kernel(Kernel_id);
44+
auto KernelNode = Graph.add([&](handler &cgh) {
45+
cgh.set_arg(0, InputParam);
46+
cgh.set_arg(1, N);
47+
cgh.set_arg(2, NumKernelLoops);
48+
cgh.single_task(Kernel);
49+
});
50+
51+
auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
52+
53+
// Submit a bunch of graphs without waiting
54+
for (size_t i = 0; i < NumSubmitLoops; i++) {
55+
Queue.ext_oneapi_graph(ExecGraph);
56+
}
57+
58+
// Swap PtrB to be the input
59+
InputParam.update(PtrB);
60+
61+
ExecGraph.update(KernelNode);
62+
63+
// Submit another set of graphs then wait on all submissions
64+
for (size_t i = 0; i < NumSubmitLoops; i++) {
65+
Queue.ext_oneapi_graph(ExecGraph);
66+
}
67+
Queue.wait_and_throw();
68+
69+
Queue.copy(PtrA, HostDataA.data(), N).wait();
70+
Queue.copy(PtrB, HostDataB.data(), N).wait();
71+
for (size_t i = 0; i < N; i++) {
72+
assert(HostDataA[i] == i * NumKernelLoops * NumSubmitLoops);
73+
assert(HostDataB[i] == i * NumKernelLoops * NumSubmitLoops);
74+
}
75+
#endif
76+
sycl::free(PtrA, Queue);
77+
sycl::free(PtrB, Queue);
78+
79+
return 0;
80+
}

0 commit comments

Comments
 (0)