Skip to content

Commit a276b6e

Browse files
[SYCL] Event-less APIs synchronization with the scheduler (#19987)
This change makes the commands submitted to the scheduler unconditionally associated with an event (for both event and event-less APIs), for kernel submission. For other commands, the event can be skipped if the scheduler bypass condition is true (the scheduler bypass itself is not supported for commands other than the kernel submission), if the queue supports discarding the events and the event was not requested. The reason for this change is, that some commands might already be scheduled and waiting for the submission, so all the kernel submission commands subsequently submitted to the scheduler must return an event, which is then used to order the commands by the in-order type queue and avoid scheduler-bypass flow in such a case. On the other hand, if the scheduler bypass condition is true for a command other than the kernel submission, the event dependencies are safe for scheduler bypass, so the event is not needed. --------- Co-authored-by: Sergey Semenov <[email protected]>
1 parent 33c11a6 commit a276b6e

File tree

2 files changed

+99
-25
lines changed

2 files changed

+99
-25
lines changed

sycl/source/handler.cpp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -511,7 +511,7 @@ event handler::finalize() {
511511

512512
// TODO checking the size of the events vector and avoiding the call is more
513513
// efficient here at this point
514-
const bool KernelFastPath =
514+
const bool KernelSchedulerBypass =
515515
(Queue && !Graph && !impl->MSubgraphNode && !Queue->hasCommandGraph() &&
516516
!impl->CGData.MRequirements.size() && !MStreamStorage.size() &&
517517
(impl->CGData.MEvents.size() == 0 ||
@@ -521,7 +521,7 @@ event handler::finalize() {
521521
// Extract arguments from the kernel lambda, if required.
522522
// Skipping this is currently limited to simple kernels on the fast path.
523523
if (type == detail::CGType::Kernel && impl->MKernelData.getKernelFuncPtr() &&
524-
(!KernelFastPath || impl->MKernelData.hasSpecialCaptures())) {
524+
(!KernelSchedulerBypass || impl->MKernelData.hasSpecialCaptures())) {
525525
impl->MKernelData.extractArgsAndReqsFromLambda();
526526
}
527527

@@ -633,7 +633,7 @@ event handler::finalize() {
633633
}
634634
}
635635

636-
if (KernelFastPath) {
636+
if (KernelSchedulerBypass) {
637637
// if user does not add a new dependency to the dependency graph, i.e.
638638
// the graph is not changed, then this faster path is used to submit
639639
// kernel bypassing scheduler and avoiding CommandGroup, Command objects
@@ -879,9 +879,18 @@ event handler::finalize() {
879879
#endif
880880
}
881881

882-
bool DiscardEvent = !impl->MEventNeeded && Queue &&
883-
Queue->supportsDiscardingPiEvents() &&
884-
CommandGroup->getRequirements().size() == 0;
882+
// For kernel submission, regardless of whether an event has been requested,
883+
// the scheduler needs to generate an event so the commands are properly
884+
// ordered (for in-order queue) and synchronized with a barrier (for
885+
// out-of-order queue). The event can only be skipped for the scheduler bypass
886+
// path.
887+
//
888+
// For commands other than kernel submission, if an event has not been
889+
// requested, the queue supports events discarding, and the scheduler
890+
// could have been bypassed (not supported yet), the event can be skipped.
891+
bool DiscardEvent =
892+
(type != detail::CGType::Kernel && KernelSchedulerBypass &&
893+
!impl->MEventNeeded && Queue->supportsDiscardingPiEvents());
885894

886895
detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG(
887896
std::move(CommandGroup), *Queue, !DiscardEvent);

sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp

Lines changed: 84 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222
using namespace sycl;
2323

24+
namespace oneapiext = ext::oneapi::experimental;
25+
2426
size_t GEventsWaitCounter = 0;
2527

2628
inline ur_result_t redefinedEventsWaitWithBarrier(void *pParams) {
@@ -49,20 +51,20 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) {
4951
}
5052

5153
enum class CommandType { KERNEL = 1, MEMSET = 2, HOST_TASK = 3 };
52-
std::vector<std::pair<CommandType, size_t>> ExecutedCommands;
54+
std::vector<std::tuple<CommandType, size_t, size_t>> ExecutedCommands;
5355

5456
inline ur_result_t customEnqueueKernelLaunchWithArgsExp(void *pParams) {
5557
auto params =
5658
*static_cast<ur_enqueue_kernel_launch_with_args_exp_params_t *>(pParams);
57-
ExecutedCommands.push_back(
58-
{CommandType::KERNEL, *params.pnumEventsInWaitList});
59+
ExecutedCommands.push_back({CommandType::KERNEL, *params.pnumEventsInWaitList,
60+
**params.ppGlobalWorkSize});
5961
return UR_RESULT_SUCCESS;
6062
}
6163

6264
inline ur_result_t customEnqueueUSMFill(void *pParams) {
6365
auto params = *static_cast<ur_enqueue_usm_fill_params_t *>(pParams);
6466
ExecutedCommands.push_back(
65-
{CommandType::MEMSET, *params.pnumEventsInWaitList});
67+
{CommandType::MEMSET, *params.pnumEventsInWaitList, 0});
6668
return UR_RESULT_SUCCESS;
6769
}
6870

@@ -92,6 +94,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) {
9294
CGH.host_task([&] {
9395
std::unique_lock<std::mutex> lk(CvMutex);
9496
Cv.wait(lk, [&ready] { return ready; });
97+
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
9598
});
9699
});
97100

@@ -113,11 +116,15 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) {
113116

114117
InOrderQueue.wait();
115118

116-
ASSERT_EQ(ExecutedCommands.size(), 2u);
117-
EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET);
118-
EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
119-
EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
120-
EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
119+
ASSERT_EQ(ExecutedCommands.size(), 3u);
120+
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
121+
CommandType::HOST_TASK);
122+
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
123+
CommandType::MEMSET);
124+
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
125+
EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/,
126+
CommandType::KERNEL);
127+
EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u);
121128
}
122129

123130
TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {
@@ -142,6 +149,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {
142149
CGH.host_task([&] {
143150
std::unique_lock<std::mutex> lk(CvMutex);
144151
Cv.wait(lk, [&ready] { return ready; });
152+
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
145153
});
146154
});
147155

@@ -159,11 +167,15 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {
159167

160168
InOrderQueue.wait();
161169

162-
ASSERT_EQ(ExecutedCommands.size(), 2u);
163-
EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET);
164-
EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
165-
EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
166-
EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
170+
ASSERT_EQ(ExecutedCommands.size(), 3u);
171+
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
172+
CommandType::HOST_TASK);
173+
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
174+
CommandType::MEMSET);
175+
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
176+
EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/,
177+
CommandType::KERNEL);
178+
EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u);
167179
}
168180

169181
TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
@@ -186,7 +198,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
186198
CGH.host_task([&] {
187199
std::unique_lock<std::mutex> lk(CvMutex);
188200
Cv.wait(lk, [&ready] { return ready; });
189-
ExecutedCommands.push_back({CommandType::HOST_TASK, 0});
201+
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
190202
});
191203
});
192204

@@ -202,8 +214,61 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) {
202214
InOrderQueue.wait();
203215

204216
ASSERT_EQ(ExecutedCommands.size(), 2u);
205-
EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::HOST_TASK);
206-
EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
207-
EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
208-
EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
217+
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
218+
CommandType::HOST_TASK);
219+
EXPECT_EQ(std::get<1>(ExecutedCommands[0]) /*EventsCount*/, 0u);
220+
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
221+
CommandType::KERNEL);
222+
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
223+
}
224+
225+
TEST_F(SchedulerTest, InOrderQueueCrossDepsEnqueueFunctions) {
226+
ExecutedCommands.clear();
227+
sycl::unittest::UrMock<> Mock;
228+
mock::getCallbacks().set_before_callback(
229+
"urEnqueueKernelLaunchWithArgsExp",
230+
&customEnqueueKernelLaunchWithArgsExp);
231+
232+
sycl::platform Plt = sycl::platform();
233+
234+
context Ctx{Plt};
235+
queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()};
236+
237+
std::mutex CvMutex;
238+
std::condition_variable Cv;
239+
bool ready = false;
240+
241+
InOrderQueue.submit([&](sycl::handler &CGH) {
242+
CGH.host_task([&] {
243+
std::unique_lock<std::mutex> lk(CvMutex);
244+
Cv.wait(lk, [&ready] { return ready; });
245+
ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0});
246+
});
247+
});
248+
249+
oneapiext::nd_launch<TestKernel>(
250+
InOrderQueue, nd_range<1>{range<1>{32}, range<1>{32}}, [](nd_item<1>) {});
251+
252+
oneapiext::nd_launch<TestKernel>(
253+
InOrderQueue, nd_range<1>{range<1>{64}, range<1>{32}}, [](nd_item<1>) {});
254+
255+
{
256+
std::unique_lock<std::mutex> lk(CvMutex);
257+
ready = true;
258+
}
259+
Cv.notify_one();
260+
261+
InOrderQueue.wait();
262+
263+
ASSERT_EQ(ExecutedCommands.size(), 3u);
264+
EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/,
265+
CommandType::HOST_TASK);
266+
EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/,
267+
CommandType::KERNEL);
268+
EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u);
269+
EXPECT_EQ(std::get<2>(ExecutedCommands[1]) /*GlobalWorkSize*/, 32u);
270+
EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/,
271+
CommandType::KERNEL);
272+
EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u);
273+
EXPECT_EQ(std::get<2>(ExecutedCommands[2]) /*GlobalWorkSize*/, 64u);
209274
}

0 commit comments

Comments
 (0)