From f8b0dcbedff943e131777b5d219fd5b24c907547 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Tue, 29 Oct 2024 16:45:58 +0000 Subject: [PATCH 1/3] [SYCL][Graph]Fix and add E2E tests for using local accessors in graphs - Update UR tag for fix to updating local accessors on CUDA/HIP - Add e2e tests covering local accessor usage --- sycl/cmake/modules/FetchUnifiedRuntime.cmake | 2 +- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 2 +- .../Graph/Explicit/local_accessor.cpp | 10 ++ sycl/test-e2e/Graph/Inputs/local_accessor.cpp | 54 +++++++++++ .../Graph/Inputs/whole_update_local_acc.cpp | 93 +++++++++++++++++++ .../Graph/RecordReplay/local_accessor.cpp | 10 ++ .../Explicit/whole_update_local_acc.cpp | 10 ++ .../RecordReplay/whole_update_local_acc.cpp | 10 ++ 8 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 sycl/test-e2e/Graph/Explicit/local_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Inputs/local_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp create mode 100644 sycl/test-e2e/Graph/RecordReplay/local_accessor.cpp create mode 100644 sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp create mode 100644 sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 72841724fa01d..417eab25f73b1 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/bensuo/unified-runtime.git") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 6f3f57255c75b..cec734cb7aa27 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -4,4 +4,4 @@ # Date: Thu Oct 31 14:05:55 2024 +0000 # Merge pull request #2228 from nrspruit/copy_engine_refactor # [L0] Refactor Copy Engine Usage checks for Performance -set(UNIFIED_RUNTIME_TAG 3d58884b4939d9bd095c917f8dd823ac8486684c) +set(UNIFIED_RUNTIME_TAG b7d78ba6de853103e4bb6c8dddfe43ad3e65b3a9) diff --git a/sycl/test-e2e/Graph/Explicit/local_accessor.cpp b/sycl/test-e2e/Graph/Explicit/local_accessor.cpp new file mode 100644 index 0000000000000..fbeb2c6a5ef5c --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/local_accessor.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/local_accessor.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp new file mode 100644 index 0000000000000..8b6fb2404d77d --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp @@ -0,0 +1,54 @@ +// Tests basic adding of nodes with local accessors, +// and submission of the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + using T = int; + + const size_t LocalSize = 128; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 10); + + std::vector ReferenceA(DataA); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.wait_and_throw(); + + auto node = add_node(Graph, Queue, [&](handler &CGH) { + local_accessor localMem(LocalSize, CGH); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + localMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; + PtrA[Item.get_global_linear_id()] += localMem[Item.get_local_linear_id()]; + }); + }); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + + for (size_t i = 0; i < Size; i++) { + T Ref = 10 + i + (i * 2); + (check_value(i, Ref, ReferenceA[i], "PtrA")); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp new file mode 100644 index 0000000000000..ed08095f68552 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp @@ -0,0 +1,93 @@ +// Tests whole graph update of nodes with local accessors, +// and submission of the graph. + +#include "../graph_common.hpp" + +using T = int; + +auto add_graph_node( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, size_t LocalSize, T *Ptr) { + return add_node(Graph, Queue, [&](handler &CGH) { + local_accessor localMem(LocalSize, CGH); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + localMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; + Ptr[Item.get_global_linear_id()] += + localMem[Item.get_local_linear_id()] + Item.get_local_range(0); + }); + }); +} +int main() { + queue Queue{}; + + const size_t LocalSize = 128; + + std::vector DataA(Size), DataB(Size); + + std::iota(DataA.begin(), DataA.end(), 10); + std::iota(DataB.begin(), DataB.end(), 10); + + std::vector ReferenceA(DataA), ReferenceB(DataB); + + exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.wait_and_throw(); + + auto NodeA = add_graph_node(GraphA, Queue, Size, LocalSize / 2, PtrA); + + auto GraphExecA = GraphA.finalize(exp_ext::property::graph::updatable{}); + + // Create second graph for whole graph update with a different local size + exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; + auto NodeB = add_graph_node(GraphB, Queue, Size, LocalSize, PtrB); + + // Execute graphs before updating and check outputs + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + T RefA = 10 + i + (i * 2) + LocalSize / 2; + T RefB = 10 + i; + (check_value(i, RefA, ReferenceA[i], "PtrA")); + (check_value(i, RefB, ReferenceB[i], "PtrB")); + } + + // Update GraphExecA using whole graph update + + GraphExecA.update(GraphB); + + // Execute graphs again and check outputs + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + T RefA = 10 + i + (i * 2) + LocalSize / 2; + T RefB = 10 + i + (i * 2) + LocalSize; + (check_value(i, RefA, ReferenceA[i], "PtrA")); + (check_value(i, RefB, ReferenceB[i], "PtrB")); + } + + free(PtrA, Queue); + free(PtrB, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/local_accessor.cpp b/sycl/test-e2e/Graph/RecordReplay/local_accessor.cpp new file mode 100644 index 0000000000000..245983f67da4a --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/local_accessor.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/local_accessor.cpp" diff --git a/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp new file mode 100644 index 0000000000000..1db9905457ae7 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../../Inputs/whole_update_local_acc.cpp" diff --git a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp new file mode 100644 index 0000000000000..03645b2f19bfd --- /dev/null +++ b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../../Inputs/whole_update_local_acc.cpp" From 27cef1408aac2ee8769283f4d7eea87f90fd8a74 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Wed, 30 Oct 2024 17:45:41 +0000 Subject: [PATCH 2/3] Update tag and address minor issues in new tests --- sycl/test-e2e/Graph/Inputs/local_accessor.cpp | 8 ++++---- .../Graph/Inputs/whole_update_local_acc.cpp | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp index 8b6fb2404d77d..b3ac9fde67b6e 100644 --- a/sycl/test-e2e/Graph/Inputs/local_accessor.cpp +++ b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp @@ -24,11 +24,11 @@ int main() { Queue.wait_and_throw(); auto node = add_node(Graph, Queue, [&](handler &CGH) { - local_accessor localMem(LocalSize, CGH); + local_accessor LocalMem(LocalSize, CGH); CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { - localMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; - PtrA[Item.get_global_linear_id()] += localMem[Item.get_local_linear_id()]; + LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; + PtrA[Item.get_global_linear_id()] += LocalMem[Item.get_local_linear_id()]; }); }); @@ -47,7 +47,7 @@ int main() { for (size_t i = 0; i < Size; i++) { T Ref = 10 + i + (i * 2); - (check_value(i, Ref, ReferenceA[i], "PtrA")); + check_value(i, Ref, ReferenceA[i], "PtrA"); } return 0; diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp index ed08095f68552..100792a2e4762 100644 --- a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp +++ b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp @@ -9,12 +9,12 @@ auto add_graph_node( exp_ext::command_graph &Graph, queue &Queue, size_t Size, size_t LocalSize, T *Ptr) { return add_node(Graph, Queue, [&](handler &CGH) { - local_accessor localMem(LocalSize, CGH); + local_accessor LocalMem(LocalSize, CGH); CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { - localMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; + LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; Ptr[Item.get_global_linear_id()] += - localMem[Item.get_local_linear_id()] + Item.get_local_range(0); + LocalMem[Item.get_local_linear_id()] + Item.get_local_range(0); }); }); } @@ -61,8 +61,8 @@ int main() { for (size_t i = 0; i < Size; i++) { T RefA = 10 + i + (i * 2) + LocalSize / 2; T RefB = 10 + i; - (check_value(i, RefA, ReferenceA[i], "PtrA")); - (check_value(i, RefB, ReferenceB[i], "PtrB")); + check_value(i, RefA, ReferenceA[i], "PtrA"); + check_value(i, RefB, ReferenceB[i], "PtrB"); } // Update GraphExecA using whole graph update @@ -83,8 +83,8 @@ int main() { for (size_t i = 0; i < Size; i++) { T RefA = 10 + i + (i * 2) + LocalSize / 2; T RefB = 10 + i + (i * 2) + LocalSize; - (check_value(i, RefA, ReferenceA[i], "PtrA")); - (check_value(i, RefB, ReferenceB[i], "PtrB")); + check_value(i, RefA, ReferenceA[i], "PtrA"); + check_value(i, RefB, ReferenceB[i], "PtrB"); } free(PtrA, Queue); From 06ae1003bd9a394d0ec9dba9072f6b3cb1f7d621 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 6 Nov 2024 10:51:25 +0000 Subject: [PATCH 3/3] Update UR tag --- sycl/cmake/modules/FetchUnifiedRuntime.cmake | 2 +- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 417eab25f73b1..72841724fa01d 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/bensuo/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index cec734cb7aa27..02b92c4b1a2dd 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 3d58884b4939d9bd095c917f8dd823ac8486684c -# Merge: 6ade245e b0bd146a -# Author: aarongreig -# Date: Thu Oct 31 14:05:55 2024 +0000 -# Merge pull request #2228 from nrspruit/copy_engine_refactor -# [L0] Refactor Copy Engine Usage checks for Performance -set(UNIFIED_RUNTIME_TAG b7d78ba6de853103e4bb6c8dddfe43ad3e65b3a9) +# commit 5955bad3afc49612676d7c00566a3ac6f074c63c +# Merge: f01741af b7d78ba6 +# Author: Callum Fare +# Date: Wed Nov 6 10:46:25 2024 +0000 +# Merge pull request #2264 from Bensuo/ben/cmdbuf-local-arg-fix +# [CMDBUF] Fix incorrect handling of shared local mem args in CUDA/HIP +set(UNIFIED_RUNTIME_TAG 5955bad3afc49612676d7c00566a3ac6f074c63c)