diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 5dff0396f07fb..a5596b6e96158 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -58,6 +58,7 @@ Jack Kirk, Codeplay + Ronan Keryell, AMD + Andrey Alekseenko, KTH Royal Institute of Technology + Fábio Mestre, Codeplay + +Konrad Kusiak, Codeplay + == Dependencies @@ -1979,10 +1980,16 @@ can be used adding nodes to a graph when creating a graph from queue recording. New methods are also defined that enable submitting an executable graph, e.g. directly to a queue without returning an event. +==== sycl_ext_oneapi_free_function_kernels + +`sycl_ext_oneapi_free_function_kernels`, defined in +link:../proposed/sycl_ext_oneapi_free_function_kernels.asciidoc[sycl_ext_oneapi_free_function_kernels] +can be used with SYCL Graphs. + == Examples and Usage Guide Detailed code examples and usage guidelines are provided in the -link:../../SYCLGraphUsageGuide.md[SYCL Graph Usage Guide]. +link:../../syclgraph/SYCLGraphUsageGuide.md[SYCL Graph Usage Guide]. == Future Direction [[future-direction]] diff --git a/sycl/test-e2e/Graph/Explicit/free_function_kernels.cpp b/sycl/test-e2e/Graph/Explicit/free_function_kernels.cpp new file mode 100644 index 0000000000000..3784ebbd800a6 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/free_function_kernels.cpp @@ -0,0 +1,13 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/free_function_kernels.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/free_function_kernels.cpp b/sycl/test-e2e/Graph/Inputs/free_function_kernels.cpp new file mode 100644 index 0000000000000..5a3e9e3304d01 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/free_function_kernels.cpp @@ -0,0 +1,45 @@ +// Tests compatibility with free function kernels extension + +#include "../graph_common.hpp" + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel)) +void ff_0(int *Ptr) { + for (size_t i{0}; i < Size; ++i) { + Ptr[i] = i; + } +} + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, PtrA); + cgh.single_task(Kernel); + }); + + auto ExecGraph = Graph.finalize(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i); + } +#endif + sycl::free(PtrA, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/free_function_kernels.cpp b/sycl/test-e2e/Graph/RecordReplay/free_function_kernels.cpp new file mode 100644 index 0000000000000..ad003e575e044 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/free_function_kernels.cpp @@ -0,0 +1,13 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/free_function_kernels.cpp" diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/free_function_kernels.hpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/free_function_kernels.hpp new file mode 100644 index 0000000000000..d319d819abdad --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/free_function_kernels.hpp @@ -0,0 +1,57 @@ +#pragma once + +#include "../../graph_common.hpp" +#include "sycl/ext/oneapi/kernel_properties/properties.hpp" +#include "sycl/kernel_bundle.hpp" +#include + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel)) +void ff_0(int *Ptr) { + for (size_t i{0}; i < Size; ++i) { + Ptr[i] = i; + } +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel)) +void ff_1(int *Ptr) { + for (size_t i{0}; i < Size; ++i) { + Ptr[i] += i; + } +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel)) +void ff_2(int *Ptr, size_t Size, size_t NumKernelLoops) { + for (size_t j{0}; j < NumKernelLoops; j++) { + for (size_t i{0}; i < Size; i++) { + Ptr[i] += i; + } + } +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<3>)) +void ff_3(int *Ptr) { + size_t GlobalID = + ext::oneapi::this_work_item::get_nd_item<3>().get_global_linear_id(); + Ptr[GlobalID] = GlobalID; +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<3>)) +void ff_4(int *Ptr) { + size_t GlobalID = + ext::oneapi::this_work_item::get_nd_item<3>().get_global_linear_id(); + Ptr[GlobalID] *= 2; +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<1>)) +void ff_5(int *PtrA, int *PtrB, int *PtrC) { + size_t GlobalID = + ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(); + PtrC[GlobalID] += PtrA[GlobalID] * PtrB[GlobalID]; +} + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel)) +void ff_6(int *Ptr, int ScalarValue) { + for (size_t i{0}; i < Size; ++i) { + Ptr[i] = ScalarValue; + } +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_before_finalize.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_before_finalize.cpp new file mode 100644 index 0000000000000..5d0ac58cf0de9 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_before_finalize.cpp @@ -0,0 +1,60 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a graph node before finalization + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrB, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(Kernel); + }); + // Swap PtrB to be the input + InputParam.update(PtrB); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // Only PtrB should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == 0); + assert(HostDataB[i] == i); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_multiple_exec_graphs.cpp new file mode 100644 index 0000000000000..269a5b5373638 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_multiple_exec_graphs.cpp @@ -0,0 +1,77 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests creating multiple executable graphs from the same modifiable graph and +// only updating one of them. + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrB, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(Kernel); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + auto ExecGraph2 = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA values should be modified twice + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.ext_oneapi_graph(ExecGraph2).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + // Only update ExecGraph, which should now modify PtrB while ExecGraph2 + // modifies PtrA still + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + Queue.ext_oneapi_graph(ExecGraph2).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + // A should have been modified 3 times by now, B only once + assert(HostDataA[i] == i * 3); + assert(HostDataB[i] == i); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ordering.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ordering.cpp new file mode 100644 index 0000000000000..4d6aa6445cd0e --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ordering.cpp @@ -0,0 +1,80 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests that updating a graph is ordered with respect to previous executions of +// the graph which may be in flight. + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + // Use a large N to try and make the kernel slow + const size_t N = 1 << 16; + // Loop inside kernel to make even slower (too large N runs out of memory) + const size_t NumKernelLoops = 4; + const size_t NumSubmitLoops = 8; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.set_arg(1, N); + cgh.set_arg(2, NumKernelLoops); + cgh.single_task(Kernel); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // Submit a bunch of graphs without waiting + for (size_t i = 0; i < NumSubmitLoops; i++) { + Queue.ext_oneapi_graph(ExecGraph); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + + ExecGraph.update(KernelNode); + + // Submit another set of graphs then wait on all submissions + for (size_t i = 0; i < NumSubmitLoops; i++) { + Queue.ext_oneapi_graph(ExecGraph); + } + Queue.wait_and_throw(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == i * NumKernelLoops * NumSubmitLoops); + assert(HostDataB[i] == i * NumKernelLoops * NumSubmitLoops); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr.cpp new file mode 100644 index 0000000000000..64fc1408cee4a --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr.cpp @@ -0,0 +1,70 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a graph node using index-based explicit update + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrB, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(Kernel); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_3D.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_3D.cpp new file mode 100644 index 0000000000000..644478f9ddfb4 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_3D.cpp @@ -0,0 +1,87 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a 3D ND-Range graph kernel node using index-based explicit +// update + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + const range<3> GlobalWorkSize(1, 2, 2); + const range<3> LocalWorkSize(1, 2, 2); + const size_t N = GlobalWorkSize[0] * GlobalWorkSize[1] * GlobalWorkSize[2]; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(N, Queue); + int *PtrB = malloc_device(N, Queue); + + std::vector HostDataA(N); + std::vector HostDataB(N); + + Queue.memset(PtrA, 0, N * sizeof(int)).wait(); + Queue.memset(PtrB, 0, N * sizeof(int)).wait(); + + exp_ext::dynamic_parameter DynParam(Graph, PtrA); + + nd_range<3> NDRange{GlobalWorkSize, LocalWorkSize}; + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id_A = exp_ext::get_kernel_id(); + kernel Kernel_A = Bundle.get_kernel(Kernel_id_A); + auto NodeA = Graph.add([&](handler &cgh) { + cgh.set_arg(0, DynParam); + cgh.parallel_for(NDRange, Kernel_A); + }); + + kernel_id Kernel_id_B = exp_ext::get_kernel_id(); + kernel Kernel_B = Bundle.get_kernel(Kernel_id_B); + auto NodeB = Graph.add( + [&](handler &cgh) { + cgh.set_arg(0, DynParam); + cgh.parallel_for(NDRange, Kernel_B); + }, + exp_ext::property::node::depends_on{NodeA}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + assert(HostDataA[i] == (i * 2)); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input/output + DynParam.update(PtrB); + ExecGraph.update({NodeA, NodeB}); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), N).wait(); + Queue.copy(PtrB, HostDataB.data(), N).wait(); + for (size_t i = 0; i < N; i++) { + const size_t Ref = i * 2; + assert(HostDataA[i] == Ref); + assert(HostDataB[i] == Ref); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_double_update.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_double_update.cpp new file mode 100644 index 0000000000000..1c66a03eac4d3 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_double_update.cpp @@ -0,0 +1,81 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a graph node using index-based explicit update + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + int *PtrUnused = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + std::vector HostDataUnused(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrB, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrUnused, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(Kernel); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + Queue.copy(PtrUnused, HostDataUnused.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == 0); + assert(HostDataUnused[i] == 0); + } + + // Swap PtrUnused to be the input, then swap to PtrB without executing + InputParam.update(PtrUnused); + InputParam.update(PtrB); + + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + Queue.copy(PtrUnused, HostDataUnused.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i); + assert(HostDataB[i] == i); + // Check that PtrUnused was never actually used in a kernel + assert(HostDataUnused[i] == 0); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrUnused, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_multiple_nodes.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_multiple_nodes.cpp new file mode 100644 index 0000000000000..1143b1edf855a --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_multiple_nodes.cpp @@ -0,0 +1,80 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a single dynamic parameter which is registered with multiple +// graph nodes + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrB, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id_A = exp_ext::get_kernel_id(); + kernel Kernel_A = Bundle.get_kernel(Kernel_id_A); + auto KernelNodeA = Graph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(Kernel_A); + }); + + kernel_id Kernel_id_B = exp_ext::get_kernel_id(); + kernel Kernel_B = Bundle.get_kernel(Kernel_id_B); + auto KernelNodeB = Graph.add( + [&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(Kernel_B); + }, + exp_ext::property::node::depends_on{KernelNodeA}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } + + // Swap PtrB to be the input + InputParam.update(PtrB); + ExecGraph.update({KernelNodeA, KernelNodeB}); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == i * 2); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_multiple_params.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_multiple_params.cpp new file mode 100644 index 0000000000000..76c66a27838e1 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_multiple_params.cpp @@ -0,0 +1,88 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating multiple parameters to a singlegraph node using index-based +// explicit update + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + int *PtrC = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + std::vector HostDataC(Size); + std::vector OutData(Size); + + std::iota(HostDataA.begin(), HostDataA.end(), 10); + std::iota(HostDataB.begin(), HostDataB.end(), 100); + + Queue.memcpy(PtrA, HostDataA.data(), Size * sizeof(int)).wait(); + Queue.memcpy(PtrB, HostDataB.data(), Size * sizeof(int)).wait(); + Queue.memset(PtrC, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter ParamA(Graph, PtrA); + exp_ext::dynamic_parameter ParamB(Graph, PtrB); + exp_ext::dynamic_parameter ParamOut(Graph, PtrC); + + nd_range<1> NDRange{Size, 32}; + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, ParamA); + cgh.set_arg(1, ParamB); + cgh.set_arg(2, ParamOut); + cgh.parallel_for(NDRange, Kernel); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + // Copy to output data to preserve original data for verifying += op + Queue.copy(PtrC, OutData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(OutData[i] == HostDataC[i] + (HostDataA[i] * HostDataB[i])); + } + + // Update C's host data + HostDataC = OutData; + + // Swap PtrB to be the input + ParamOut.update(PtrB); + ParamB.update(PtrC); + + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + // Copy to output data to preserve original data for verifying += op + Queue.copy(PtrB, OutData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(OutData[i] == HostDataB[i] + (HostDataA[i] * HostDataC[i])); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + sycl::free(PtrC, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_subgraph.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_subgraph.cpp new file mode 100644 index 0000000000000..d8b4083fed1e7 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_ptr_subgraph.cpp @@ -0,0 +1,79 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a graph node in an executable graph that was used as a +// subgraph node in another executable graph is not reflected in the graph +// containing the subgraph node. + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + exp_ext::command_graph SubGraph{Ctxt, Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.memset(PtrB, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(SubGraph, PtrA); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id SubKernel_id = exp_ext::get_kernel_id(); + kernel SubKernel = Bundle.get_kernel(SubKernel_id); + auto SubKernelNode = SubGraph.add([&](handler &cgh) { + cgh.set_arg(0, InputParam); + cgh.single_task(SubKernel); + }); + + auto SubExecGraph = SubGraph.finalize(exp_ext::property::graph::updatable{}); + + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, PtrA); + cgh.single_task(Kernel); + }); + + Graph.add([&](handler &cgh) { cgh.ext_oneapi_graph(SubExecGraph); }, + exp_ext::property::node::depends_on{KernelNode}); + + // Finalize the parent graph with the original values + auto ExecGraph = Graph.finalize(); + + // Swap PtrB to be the input + InputParam.update(PtrB); + // Update the executable graph that was used as a subgraph with the new value, + // this should not affect ExecGraph + SubExecGraph.update(SubKernelNode); + // Only PtrA should be filled with values + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + Queue.copy(PtrB, HostDataB.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == i * 2); + assert(HostDataB[i] == 0); + } +#endif + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_scalar.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_scalar.cpp new file mode 100644 index 0000000000000..e69e4097e4b50 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/update_with_indices_scalar.cpp @@ -0,0 +1,65 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating a graph node scalar argument using index-based explicit update + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + queue Queue{}; + context Ctxt{Queue.get_context()}; + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + + int *DeviceData = malloc_device(Size, Queue); + + int ScalarValue = 17; + + std::vector HostData(Size); + + Queue.memset(DeviceData, 0, Size * sizeof(int)).wait(); + + exp_ext::dynamic_parameter InputParam(Graph, ScalarValue); + +#ifndef __SYCL_DEVICE_ONLY__ + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id Kernel_id = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(Kernel_id); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, DeviceData); + cgh.set_arg(1, InputParam); + cgh.single_task(Kernel); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + // DeviceData should be filled with current ScalarValue (17) + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(DeviceData, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostData[i] == 17); + } + + // Update ScalarValue to be 99 instead + InputParam.update(99); + ExecGraph.update(KernelNode); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(DeviceData, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostData[i] == 99); + } +#endif + sycl::free(DeviceData, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/graph_common.hpp b/sycl/test-e2e/Graph/graph_common.hpp index d60049a71cca3..4b5a4c6c9b41f 100644 --- a/sycl/test-e2e/Graph/graph_common.hpp +++ b/sycl/test-e2e/Graph/graph_common.hpp @@ -1,3 +1,5 @@ +#pragma once + #include #include