-
Notifications
You must be signed in to change notification settings - Fork 790
[SYCL][Graph] Add E2E test for recording handlerless queue submissions #20420
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
mmichel11
wants to merge
7
commits into
intel:sycl
Choose a base branch
from
reble:matt/graph_handlerless_e2e
base: sycl
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+119
−0
Open
Changes from 6 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
d7fbed5
Add graph test for handlerless enqueue recording in function call
mmichel11 f9794f4
Add out-of-order queue test and general cleanup
mmichel11 61e8133
Add leak check to test header
mmichel11 cddcf89
Remove sycl.hpp include in e2e test
mmichel11 8997a0f
Remove prefetch / memadvise to support ocl backend testing
mmichel11 f6cd93a
clang-format header order
mmichel11 4a5a10f
Remove unneeded includes
mmichel11 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
122 changes: 122 additions & 0 deletions
122
sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
// RUN: %{build} -o %t.out | ||
// RUN: %{run} %t.out | ||
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG | ||
// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} | ||
|
||
// Test recording of several handlerless SYCL queue APIs (memset, fill, copy, | ||
// memcpy, parallel_for/nd_launch, single_task) inside a single function call | ||
// while graph recording is active. All operations are USM-based and occur via | ||
// eventless queue free functions or eventful queue shortcuts to bypass the | ||
// handler path. Recording is performed over a non-inlined function call. | ||
|
||
#include "../graph_common.hpp" | ||
#include <cstdint> | ||
#include <cstring> | ||
#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp> | ||
#include <sycl/properties/all_properties.hpp> | ||
#include <vector> | ||
|
||
// noinline is important as we have previously caught functional issues with | ||
// kernel argument capture only when the function being recorded is not inlined. | ||
__attribute__((noinline)) void | ||
recordHandlerLessOps(sycl::queue &Q, uint32_t *A, uint32_t *B, uint32_t *C, | ||
uint32_t *D, uint32_t *E, size_t N, unsigned char Pattern, | ||
uint32_t FillValue, bool InOrderQueue) { | ||
size_t WorkGroupSize = 16; | ||
sycl::nd_range<1> KernelRange{sycl::range<1>{N}, | ||
sycl::range<1>{WorkGroupSize}}; | ||
auto DoubleKernelLambda = [=](sycl::nd_item<1> item) { | ||
const size_t i = item.get_global_linear_id(); | ||
C[i] = B[i] * 2; | ||
}; | ||
auto SingleTaskKernel = [=]() { C[0] = 999; }; | ||
// Test eventless free functions with in-order queue and eventful shortcuts | ||
// with out-of-order queue. | ||
if (InOrderQueue) { | ||
exp_ext::memset(Q, A, Pattern, N * sizeof(uint32_t)); | ||
exp_ext::fill(Q, D, FillValue, N); | ||
exp_ext::copy(Q, D, E, N); | ||
exp_ext::memcpy(Q, B, A, N * sizeof(uint32_t)); | ||
exp_ext::nd_launch(Q, KernelRange, DoubleKernelLambda); | ||
exp_ext::single_task(Q, SingleTaskKernel); | ||
} else { | ||
auto e1 = Q.memset(A, Pattern, N * sizeof(uint32_t)); | ||
auto e2 = Q.fill(D, FillValue, N); | ||
Q.copy(D, E, N, e2); | ||
auto e4 = Q.memcpy(B, A, N * sizeof(uint32_t), e1); | ||
auto e6 = Q.parallel_for(KernelRange, e4, DoubleKernelLambda); | ||
Q.single_task(e6, SingleTaskKernel); | ||
} | ||
} | ||
|
||
int main() { | ||
const size_t N = 64; | ||
const unsigned char Pattern = 42; | ||
const uint32_t FillValue = 7; | ||
auto getQueue = [](bool InOrder) { | ||
if (InOrder) { | ||
return sycl::queue{ | ||
sycl::property_list{sycl::property::queue::in_order{}}}; | ||
} else { | ||
return sycl::queue{}; | ||
} | ||
}; | ||
|
||
for (uint32_t i = 0; i <= 1; ++i) { | ||
const bool InOrderQueue = static_cast<bool>(i); | ||
sycl::queue Q = getQueue(InOrderQueue); | ||
uint32_t *A = sycl::malloc_device<uint32_t>(N, Q); | ||
uint32_t *B = sycl::malloc_device<uint32_t>(N, Q); | ||
uint32_t *C = sycl::malloc_device<uint32_t>(N, Q); | ||
|
||
uint32_t *D = sycl::malloc_device<uint32_t>(N, Q); | ||
uint32_t *E = sycl::malloc_device<uint32_t>(N, Q); | ||
|
||
// Host memory for verification | ||
std::vector<uint32_t> C_host(N); | ||
std::vector<uint32_t> E_host(N); | ||
|
||
Q.memset(A, 0, N * sizeof(uint32_t)); | ||
Q.memset(B, 0, N * sizeof(uint32_t)); | ||
Q.memset(C, 0, N * sizeof(uint32_t)); | ||
Q.memset(D, 0, N * sizeof(uint32_t)); | ||
Q.memset(E, 0, N * sizeof(uint32_t)); | ||
Q.wait_and_throw(); | ||
|
||
exp_ext::command_graph Graph{Q.get_context(), Q.get_device()}; | ||
Graph.begin_recording(Q); | ||
recordHandlerLessOps(Q, A, B, C, D, E, N, Pattern, FillValue, InOrderQueue); | ||
Graph.end_recording(); | ||
|
||
auto Exec = Graph.finalize(); | ||
Q.ext_oneapi_graph(Exec); | ||
Q.wait_and_throw(); | ||
|
||
// Copy device memory to host for verification | ||
Q.memcpy(E_host.data(), E, N * sizeof(uint32_t)); | ||
Q.memcpy(C_host.data(), C, N * sizeof(uint32_t)); | ||
Q.wait_and_throw(); | ||
|
||
// Validate copy from D -> E | ||
for (size_t i = 0; i < N; ++i) { | ||
assert(check_value(i, FillValue, E_host[i], "E")); | ||
} | ||
|
||
// Validate final values in C | ||
assert(check_value(0, static_cast<uint32_t>(999), C_host[0], "C")); | ||
uint32_t DoublePatternUint = 0; | ||
std::memset(&DoublePatternUint, Pattern, sizeof(uint32_t)); | ||
uint32_t DoublePatternUintDoubled = DoublePatternUint * 2; | ||
for (size_t i = 1; i < N; ++i) { | ||
assert(check_value(i, DoublePatternUintDoubled, C_host[i], "C")); | ||
} | ||
|
||
sycl::free(A, Q); | ||
sycl::free(B, Q); | ||
sycl::free(C, Q); | ||
sycl::free(D, Q); | ||
sycl::free(E, Q); | ||
} | ||
|
||
return 0; | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks quite verbose compared to other related tests. Can you please check whether the includes are actually required? If so, we might want to update
graph_common.hpp
...There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The vector, stdint, and string includes are not necessary, so I removed them.
`enqueue_functions.hpp` and `all_properties.hpp` are required here but not used across all tests, so I didn't add them to `graph_common.hpp`.
If we'd rather just have any needed includes in `graph_common.hpp`, then I can move those there.