Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ Jack Kirk, Codeplay +
Ronan Keryell, AMD +
Andrey Alekseenko, KTH Royal Institute of Technology +
Fábio Mestre, Codeplay +
Konrad Kusiak, Codeplay +

== Dependencies

Expand Down Expand Up @@ -1979,10 +1980,16 @@ can be used adding nodes to a graph when creating a graph from queue recording.
New methods are also defined that enable submitting an executable graph,
e.g. directly to a queue without returning an event.

==== sycl_ext_oneapi_free_function_kernels

`sycl_ext_oneapi_free_function_kernels`, defined in
link:../proposed/sycl_ext_oneapi_free_function_kernels.asciidoc[sycl_ext_oneapi_free_function_kernels]
can be used with SYCL Graphs.

== Examples and Usage Guide

Detailed code examples and usage guidelines are provided in the
link:../../SYCLGraphUsageGuide.md[SYCL Graph Usage Guide].
link:../../syclgraph/SYCLGraphUsageGuide.md[SYCL Graph Usage Guide].

== Future Direction [[future-direction]]

Expand Down
13 changes: 13 additions & 0 deletions sycl/test-e2e/Graph/Explicit/free_function_kernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//
// XFAIL: cuda
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004

#define GRAPH_E2E_EXPLICIT

#include "../Inputs/free_function_kernels.cpp"
44 changes: 44 additions & 0 deletions sycl/test-e2e/Graph/Inputs/free_function_kernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Tests compatibility with free function kernels extension

#include "../graph_common.hpp"

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
void ff_0(int *ptr) {
for (size_t i{0}; i < Size; ++i) {
ptr[i] = i;
}
}

int main() {
queue Queue{};
context ctxt{Queue.get_context()};

exp_ext::command_graph Graph{ctxt, Queue.get_device()};

int *PtrA = malloc_device<int>(Size, Queue);

std::vector<int> HostDataA(Size);

Queue.memset(PtrA, 0, Size * sizeof(int)).wait();

#ifndef __SYCL_DEVICE_ONLY__
kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(ctxt);
kernel_id Kernel_id = exp_ext::get_kernel_id<ff_0>();
kernel Kernel = Bundle.get_kernel(Kernel_id);
auto KernelNode = Graph.add([&](handler &cgh) {
cgh.set_arg(0, PtrA);
cgh.single_task(Kernel);
});

auto ExecGraph = Graph.finalize();

Queue.ext_oneapi_graph(ExecGraph).wait();

Queue.copy(PtrA, HostDataA.data(), Size).wait();
for (size_t i = 0; i < Size; i++) {
assert(HostDataA[i] == i);
}
sycl::free(PtrA, Queue);
#endif
return 0;
}
13 changes: 13 additions & 0 deletions sycl/test-e2e/Graph/RecordReplay/free_function_kernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//
// XFAIL: cuda
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004

#define GRAPH_E2E_RECORD_REPLAY

#include "../Inputs/free_function_kernels.cpp"
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#include "../../graph_common.hpp"
#include "sycl/ext/oneapi/kernel_properties/properties.hpp"
#include "sycl/kernel_bundle.hpp"
#include <sycl/ext/oneapi/free_function_queries.hpp>

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
void ff_0(int *ptr) {
for (size_t i{0}; i < Size; ++i) {
ptr[i] = i;
}
}

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
void ff_1(int *ptr) {
for (size_t i{0}; i < Size; ++i) {
ptr[i] += i;
}
}

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
void ff_2(int *ptr, size_t size, size_t numKernelLoops) {
for (size_t j = 0; j < numKernelLoops; j++) {
for (size_t i = 0; i < size; i++) {
ptr[i] += i;
}
}
}

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<3>))
void ff_3(int *ptr) {
size_t GlobalID =
ext::oneapi::this_work_item::get_nd_item<3>().get_global_linear_id();
ptr[GlobalID] = GlobalID;
}

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<3>))
void ff_4(int *ptr) {
size_t GlobalID =
ext::oneapi::this_work_item::get_nd_item<3>().get_global_linear_id();
ptr[GlobalID] *= 2;
}

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<1>))
void ff_5(int *ptrA, int *ptrB, int *ptrC) {
size_t id = ext::oneapi::this_work_item::get_nd_item<1>().get_global_id();
ptrC[id] += ptrA[id] * ptrB[id];
}

SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
void ff_6(int *ptr, int scalarValue) {
for (size_t i{0}; i < Size; ++i) {
ptr[i] = scalarValue;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//
// XFAIL: cuda
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004

// Tests updating a graph node before finalization

#include "../../graph_common.hpp"
#include "free_function_kernels.hpp"

int main() {
queue Queue{};
context ctxt{Queue.get_context()};

exp_ext::command_graph Graph{ctxt, Queue.get_device()};

int *PtrA = malloc_device<int>(Size, Queue);
int *PtrB = malloc_device<int>(Size, Queue);

std::vector<int> HostDataA(Size);
std::vector<int> HostDataB(Size);

Queue.memset(PtrA, 0, Size * sizeof(int)).wait();
Queue.memset(PtrB, 0, Size * sizeof(int)).wait();

exp_ext::dynamic_parameter InputParam(Graph, PtrA);

#ifndef __SYCL_DEVICE_ONLY__
kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(ctxt);
kernel_id Kernel_id = exp_ext::get_kernel_id<ff_0>();
kernel Kernel = Bundle.get_kernel(Kernel_id);
auto KernelNode = Graph.add([&](handler &cgh) {
cgh.set_arg(0, InputParam);
cgh.single_task(Kernel);
});
// Swap PtrB to be the input
InputParam.update(PtrB);

auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});

// Only PtrB should be filled with values
Queue.ext_oneapi_graph(ExecGraph).wait();

Queue.copy(PtrA, HostDataA.data(), Size).wait();
Queue.copy(PtrB, HostDataB.data(), Size).wait();
for (size_t i = 0; i < Size; i++) {
assert(HostDataA[i] == 0);
assert(HostDataB[i] == i);
}
sycl::free(PtrA, Queue);
sycl::free(PtrB, Queue);
#endif
return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//
// XFAIL: cuda
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004

// Tests creating multiple executable graphs from the same modifiable graph and
// only updating one of them.

#include "../../graph_common.hpp"
#include "free_function_kernels.hpp"

int main() {
queue Queue{};
context ctxt{Queue.get_context()};

exp_ext::command_graph Graph{ctxt, Queue.get_device()};

int *PtrA = malloc_device<int>(Size, Queue);
int *PtrB = malloc_device<int>(Size, Queue);

std::vector<int> HostDataA(Size);
std::vector<int> HostDataB(Size);

Queue.memset(PtrA, 0, Size * sizeof(int)).wait();
Queue.memset(PtrB, 0, Size * sizeof(int)).wait();

exp_ext::dynamic_parameter InputParam(Graph, PtrA);

#ifndef __SYCL_DEVICE_ONLY__
kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(ctxt);
kernel_id Kernel_id = exp_ext::get_kernel_id<ff_1>();
kernel Kernel = Bundle.get_kernel(Kernel_id);
auto KernelNode = Graph.add([&](handler &cgh) {
cgh.set_arg(0, InputParam);
cgh.single_task(Kernel);
});

auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
auto ExecGraph2 = Graph.finalize(exp_ext::property::graph::updatable{});

// PtrA values should be modified twice
Queue.ext_oneapi_graph(ExecGraph).wait();
Queue.ext_oneapi_graph(ExecGraph2).wait();

Queue.copy(PtrA, HostDataA.data(), Size).wait();
Queue.copy(PtrB, HostDataB.data(), Size).wait();
for (size_t i = 0; i < Size; i++) {
assert(HostDataA[i] == i * 2);
assert(HostDataB[i] == 0);
}

// Swap PtrB to be the input
InputParam.update(PtrB);
// Only update ExecGraph, which should now modify PtrB while ExecGraph2
// modifies PtrA still
ExecGraph.update(KernelNode);
Queue.ext_oneapi_graph(ExecGraph).wait();
Queue.ext_oneapi_graph(ExecGraph2).wait();

Queue.copy(PtrA, HostDataA.data(), Size).wait();
Queue.copy(PtrB, HostDataB.data(), Size).wait();
for (size_t i = 0; i < Size; i++) {
// A should have been modified 3 times by now, B only once
assert(HostDataA[i] == i * 3);
assert(HostDataB[i] == i);
}
sycl::free(PtrA, Queue);
sycl::free(PtrB, Queue);
#endif
return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//
// XFAIL: cuda
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004

// Tests that updating a graph is ordered with respect to previous executions of
// the graph which may be in flight.

#include "../../graph_common.hpp"
#include "free_function_kernels.hpp"

int main() {
queue Queue{};
context ctxt{Queue.get_context()};

// Use a large N to try and make the kernel slow
const size_t N = 1 << 16;
// Loop inside kernel to make even slower (too large N runs out of memory)
const size_t NumKernelLoops = 4;
const size_t NumSubmitLoops = 8;

exp_ext::command_graph Graph{ctxt, Queue.get_device()};

int *PtrA = malloc_device<int>(N, Queue);
int *PtrB = malloc_device<int>(N, Queue);

std::vector<int> HostDataA(N);
std::vector<int> HostDataB(N);

Queue.memset(PtrA, 0, N * sizeof(int)).wait();
Queue.memset(PtrB, 0, N * sizeof(int)).wait();

exp_ext::dynamic_parameter InputParam(Graph, PtrA);

#ifndef __SYCL_DEVICE_ONLY__
kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(ctxt);
kernel_id Kernel_id = exp_ext::get_kernel_id<ff_2>();
kernel Kernel = Bundle.get_kernel(Kernel_id);
auto KernelNode = Graph.add([&](handler &cgh) {
cgh.set_arg(0, InputParam);
cgh.set_arg(1, N);
cgh.set_arg(2, NumKernelLoops);
cgh.single_task(Kernel);
});

auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});

// Submit a bunch of graphs without waiting
for (size_t i = 0; i < NumSubmitLoops; i++) {
Queue.ext_oneapi_graph(ExecGraph);
}

// Swap PtrB to be the input
InputParam.update(PtrB);

ExecGraph.update(KernelNode);

// Submit another set of graphs then wait on all submissions
for (size_t i = 0; i < NumSubmitLoops; i++) {
Queue.ext_oneapi_graph(ExecGraph);
}
Queue.wait_and_throw();

Queue.copy(PtrA, HostDataA.data(), N).wait();
Queue.copy(PtrB, HostDataB.data(), N).wait();
for (size_t i = 0; i < N; i++) {
assert(HostDataA[i] == i * NumKernelLoops * NumSubmitLoops);
assert(HostDataB[i] == i * NumKernelLoops * NumSubmitLoops);
}
sycl::free(PtrA, Queue);
sycl::free(PtrB, Queue);
#endif
return 0;
}
Loading
Loading