intel · sarnex · Nov 27, 2024 · Nov 22, 2024 · Nov 22, 2024 · Nov 25, 2024
@@ -58,6 +58,7 @@ Jack Kirk, Codeplay +
 Ronan Keryell, AMD +
 Andrey Alekseenko, KTH Royal Institute of Technology +
 Fábio Mestre, Codeplay +
+Konrad Kusiak, Codeplay +
 
 == Dependencies
 
@@ -1979,10 +1980,16 @@ can be used adding nodes to a graph when creating a graph from queue recording.
 New methods are also defined that enable submitting an executable graph, 
 e.g. directly to a queue without returning an event.
 
+==== sycl_ext_oneapi_free_function_kernels
+
+`sycl_ext_oneapi_free_function_kernels`, defined in 
+link:../proposed/sycl_ext_oneapi_free_function_kernels.asciidoc[sycl_ext_oneapi_free_function_kernels]
+can be used with SYCL Graphs. 
+
 == Examples and Usage Guide
 
 Detailed code examples and usage guidelines are provided in the
-link:../../SYCLGraphUsageGuide.md[SYCL Graph Usage Guide].
+link:../../syclgraph/SYCLGraphUsageGuide.md[SYCL Graph Usage Guide].
 
 == Future Direction [[future-direction]]
 

@@ -0,0 +1,13 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+//
+// XFAIL: cuda
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/free_function_kernels.cpp"
@@ -0,0 +1,44 @@
+// Tests compatibility with free function kernels extension
+
+#include "../graph_common.hpp"
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
+void ff_0(int *ptr) {
+  for (size_t i{0}; i < Size; ++i) {
+    ptr[i] = i;
+  }
+}
+
+int main() {
+  queue Queue{};
+  context ctxt{Queue.get_context()};
+
+  exp_ext::command_graph Graph{ctxt, Queue.get_device()};
+
+  int *PtrA = malloc_device<int>(Size, Queue);
+
+  std::vector<int> HostDataA(Size);
+
+  Queue.memset(PtrA, 0, Size * sizeof(int)).wait();
+
+#ifndef __SYCL_DEVICE_ONLY__
+  kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(ctxt);
+  kernel_id Kernel_id = exp_ext::get_kernel_id<ff_0>();
+  kernel Kernel = Bundle.get_kernel(Kernel_id);
+  auto KernelNode = Graph.add([&](handler &cgh) {
+    cgh.set_arg(0, PtrA);
+    cgh.single_task(Kernel);
+  });
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+
+  Queue.copy(PtrA, HostDataA.data(), Size).wait();
+  for (size_t i = 0; i < Size; i++) {
+    assert(HostDataA[i] == i);
+  }
+  sycl::free(PtrA, Queue);
+#endif
+  return 0;
+}
@@ -0,0 +1,13 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+//
+// XFAIL: cuda
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/free_function_kernels.cpp"
@@ -0,0 +1,54 @@
+#include "../../graph_common.hpp"
+#include "sycl/ext/oneapi/kernel_properties/properties.hpp"
+#include "sycl/kernel_bundle.hpp"
+#include <sycl/ext/oneapi/free_function_queries.hpp>
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
+void ff_0(int *ptr) {
+  for (size_t i{0}; i < Size; ++i) {
+    ptr[i] = i;
+  }
+}
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
+void ff_1(int *ptr) {
+  for (size_t i{0}; i < Size; ++i) {
+    ptr[i] += i;
+  }
+}
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
+void ff_2(int *ptr, size_t size, size_t numKernelLoops) {
+  for (size_t j = 0; j < numKernelLoops; j++) {
+    for (size_t i = 0; i < size; i++) {
+      ptr[i] += i;
+    }
+  }
+}
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<3>))
+void ff_3(int *ptr) {
+  size_t GlobalID =
+      ext::oneapi::this_work_item::get_nd_item<3>().get_global_linear_id();
+  ptr[GlobalID] = GlobalID;
+}
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<3>))
+void ff_4(int *ptr) {
+  size_t GlobalID =
+      ext::oneapi::this_work_item::get_nd_item<3>().get_global_linear_id();
+  ptr[GlobalID] *= 2;
+}
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<1>))
+void ff_5(int *ptrA, int *ptrB, int *ptrC) {
+  size_t id = ext::oneapi::this_work_item::get_nd_item<1>().get_global_id();
+  ptrC[id] += ptrA[id] * ptrB[id];
+}
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel))
+void ff_6(int *ptr, int scalarValue) {
+  for (size_t i{0}; i < Size; ++i) {
+    ptr[i] = scalarValue;
+  }
+}
@@ -0,0 +1,59 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+//
+// XFAIL: cuda
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
+
+// Tests updating a graph node before finalization
+
+#include "../../graph_common.hpp"
+#include "free_function_kernels.hpp"
+
+int main() {
+  queue Queue{};
+  context ctxt{Queue.get_context()};
+
+  exp_ext::command_graph Graph{ctxt, Queue.get_device()};
+
+  int *PtrA = malloc_device<int>(Size, Queue);
+  int *PtrB = malloc_device<int>(Size, Queue);
+
+  std::vector<int> HostDataA(Size);
+  std::vector<int> HostDataB(Size);
+
+  Queue.memset(PtrA, 0, Size * sizeof(int)).wait();
+  Queue.memset(PtrB, 0, Size * sizeof(int)).wait();
+
+  exp_ext::dynamic_parameter InputParam(Graph, PtrA);
+
+#ifndef __SYCL_DEVICE_ONLY__
+  kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(ctxt);
+  kernel_id Kernel_id = exp_ext::get_kernel_id<ff_0>();
+  kernel Kernel = Bundle.get_kernel(Kernel_id);
+  auto KernelNode = Graph.add([&](handler &cgh) {
+    cgh.set_arg(0, InputParam);
+    cgh.single_task(Kernel);
+  });
+  // Swap PtrB to be the input
+  InputParam.update(PtrB);
+
+  auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
+
+  // Only PtrB should be filled with values
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+
+  Queue.copy(PtrA, HostDataA.data(), Size).wait();
+  Queue.copy(PtrB, HostDataB.data(), Size).wait();
+  for (size_t i = 0; i < Size; i++) {
+    assert(HostDataA[i] == 0);
+    assert(HostDataB[i] == i);
+  }
+  sycl::free(PtrA, Queue);
+  sycl::free(PtrB, Queue);
+#endif
+  return 0;
+}
@@ -0,0 +1,76 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+//
+// XFAIL: cuda
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
+
+// Tests creating multiple executable graphs from the same modifiable graph and
+// only updating one of them.
+
+#include "../../graph_common.hpp"
+#include "free_function_kernels.hpp"
+
+int main() {
+  queue Queue{};
+  context ctxt{Queue.get_context()};
+
+  exp_ext::command_graph Graph{ctxt, Queue.get_device()};
+
+  int *PtrA = malloc_device<int>(Size, Queue);
+  int *PtrB = malloc_device<int>(Size, Queue);
+
+  std::vector<int> HostDataA(Size);
+  std::vector<int> HostDataB(Size);
+
+  Queue.memset(PtrA, 0, Size * sizeof(int)).wait();
+  Queue.memset(PtrB, 0, Size * sizeof(int)).wait();
+
+  exp_ext::dynamic_parameter InputParam(Graph, PtrA);
+
+#ifndef __SYCL_DEVICE_ONLY__
+  kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(ctxt);
+  kernel_id Kernel_id = exp_ext::get_kernel_id<ff_1>();
+  kernel Kernel = Bundle.get_kernel(Kernel_id);
+  auto KernelNode = Graph.add([&](handler &cgh) {
+    cgh.set_arg(0, InputParam);
+    cgh.single_task(Kernel);
+  });
+
+  auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
+  auto ExecGraph2 = Graph.finalize(exp_ext::property::graph::updatable{});
+
+  // PtrA values should be modified twice
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.ext_oneapi_graph(ExecGraph2).wait();
+
+  Queue.copy(PtrA, HostDataA.data(), Size).wait();
+  Queue.copy(PtrB, HostDataB.data(), Size).wait();
+  for (size_t i = 0; i < Size; i++) {
+    assert(HostDataA[i] == i * 2);
+    assert(HostDataB[i] == 0);
+  }
+
+  // Swap PtrB to be the input
+  InputParam.update(PtrB);
+  // Only update ExecGraph, which should now modify PtrB while ExecGraph2
+  // modifies PtrA still
+  ExecGraph.update(KernelNode);
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+  Queue.ext_oneapi_graph(ExecGraph2).wait();
+
+  Queue.copy(PtrA, HostDataA.data(), Size).wait();
+  Queue.copy(PtrB, HostDataB.data(), Size).wait();
+  for (size_t i = 0; i < Size; i++) {
+    // A should have been modified 3 times by now, B only once
+    assert(HostDataA[i] == i * 3);
+    assert(HostDataB[i] == i);
+  }
+  sycl::free(PtrA, Queue);
+  sycl::free(PtrB, Queue);
+#endif
+  return 0;
+}
@@ -0,0 +1,79 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+//
+// XFAIL: cuda
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004
+
+// Tests that updating a graph is ordered with respect to previous executions of
+// the graph which may be in flight.
+
+#include "../../graph_common.hpp"
+#include "free_function_kernels.hpp"
+
+int main() {
+  queue Queue{};
+  context ctxt{Queue.get_context()};
+
+  // Use a large N to try and make the kernel slow
+  const size_t N = 1 << 16;
+  // Loop inside kernel to make even slower (too large N runs out of memory)
+  const size_t NumKernelLoops = 4;
+  const size_t NumSubmitLoops = 8;
+
+  exp_ext::command_graph Graph{ctxt, Queue.get_device()};
+
+  int *PtrA = malloc_device<int>(N, Queue);
+  int *PtrB = malloc_device<int>(N, Queue);
+
+  std::vector<int> HostDataA(N);
+  std::vector<int> HostDataB(N);
+
+  Queue.memset(PtrA, 0, N * sizeof(int)).wait();
+  Queue.memset(PtrB, 0, N * sizeof(int)).wait();
+
+  exp_ext::dynamic_parameter InputParam(Graph, PtrA);
+
+#ifndef __SYCL_DEVICE_ONLY__
+  kernel_bundle Bundle = get_kernel_bundle<bundle_state::executable>(ctxt);
+  kernel_id Kernel_id = exp_ext::get_kernel_id<ff_2>();
+  kernel Kernel = Bundle.get_kernel(Kernel_id);
+  auto KernelNode = Graph.add([&](handler &cgh) {
+    cgh.set_arg(0, InputParam);
+    cgh.set_arg(1, N);
+    cgh.set_arg(2, NumKernelLoops);
+    cgh.single_task(Kernel);
+  });
+
+  auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
+
+  // Submit a bunch of graphs without waiting
+  for (size_t i = 0; i < NumSubmitLoops; i++) {
+    Queue.ext_oneapi_graph(ExecGraph);
+  }
+
+  // Swap PtrB to be the input
+  InputParam.update(PtrB);
+
+  ExecGraph.update(KernelNode);
+
+  // Submit another set of graphs then wait on all submissions
+  for (size_t i = 0; i < NumSubmitLoops; i++) {
+    Queue.ext_oneapi_graph(ExecGraph);
+  }
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, HostDataA.data(), N).wait();
+  Queue.copy(PtrB, HostDataB.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    assert(HostDataA[i] == i * NumKernelLoops * NumSubmitLoops);
+    assert(HostDataB[i] == i * NumKernelLoops * NumSubmitLoops);
+  }
+  sycl::free(PtrA, Queue);
+  sycl::free(PtrB, Queue);
+#endif
+  return 0;
+}