Skip to content

Commit f165b29

Browse files
committed
Fix discarded event dependency regression in multi-threading
Signed-off-by: Larsen, Steffen <[email protected]>
1 parent ae29eb5 commit f165b29

File tree

3 files changed

+62
-36
lines changed

3 files changed

+62
-36
lines changed

sycl/source/detail/queue_impl.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -308,8 +308,9 @@ void queue_impl::addEvent(const event &Event) {
308308
addSharedEvent(Event);
309309
}
310310
// As long as the queue supports urQueueFinish we only need to store events
311-
// for unenqueued commands and host tasks.
312-
else if (MEmulateOOO || EImpl->getHandle() == nullptr) {
311+
// for undiscarded, unenqueued commands and host tasks.
312+
else if (MEmulateOOO ||
313+
(EImpl->getHandle() == nullptr && !EImpl->isDiscarded())) {
313314
std::weak_ptr<event_impl> EventWeakPtr{EImpl};
314315
std::lock_guard<std::mutex> Lock{MMutex};
315316
MEventsWeak.push_back(std::move(EventWeakPtr));

sycl/source/detail/scheduler/commands.cpp

Lines changed: 22 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -3055,9 +3055,12 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
30553055
ur_event_handle_t *Event = DiscardUrEvent ? nullptr : &UREvent;
30563056
detail::EventImplPtr EventImpl = DiscardUrEvent ? nullptr : MEvent;
30573057

3058-
// If we are discarding the UR event, we also need to mark the result event.
3059-
if (DiscardUrEvent)
3060-
MEvent->setStateDiscarded();
3058+
auto SetEventHandleOrDiscard = [&]() {
3059+
if (Event)
3060+
MEvent->setHandle(*Event);
3061+
else
3062+
MEvent->setStateDiscarded();
3063+
};
30613064

30623065
switch (MCommandGroup->getType()) {
30633066

@@ -3192,8 +3195,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
31923195
Result != UR_RESULT_SUCCESS)
31933196
return Result;
31943197

3195-
if (Event)
3196-
MEvent->setHandle(*Event);
3198+
SetEventHandleOrDiscard();
31973199
return UR_RESULT_SUCCESS;
31983200
}
31993201
case CGType::FillUSM: {
@@ -3204,8 +3206,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
32043206
Result != UR_RESULT_SUCCESS)
32053207
return Result;
32063208

3207-
if (Event)
3208-
MEvent->setHandle(*Event);
3209+
SetEventHandleOrDiscard();
32093210
return UR_RESULT_SUCCESS;
32103211
}
32113212
case CGType::PrefetchUSM: {
@@ -3216,8 +3217,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
32163217
Result != UR_RESULT_SUCCESS)
32173218
return Result;
32183219

3219-
if (Event)
3220-
MEvent->setHandle(*Event);
3220+
SetEventHandleOrDiscard();
32213221
return UR_RESULT_SUCCESS;
32223222
}
32233223
case CGType::AdviseUSM: {
@@ -3229,8 +3229,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
32293229
Result != UR_RESULT_SUCCESS)
32303230
return Result;
32313231

3232-
if (Event)
3233-
MEvent->setHandle(*Event);
3232+
SetEventHandleOrDiscard();
32343233
return UR_RESULT_SUCCESS;
32353234
}
32363235
case CGType::Copy2DUSM: {
@@ -3242,8 +3241,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
32423241
Result != UR_RESULT_SUCCESS)
32433242
return Result;
32443243

3245-
if (Event)
3246-
MEvent->setHandle(*Event);
3244+
SetEventHandleOrDiscard();
32473245
return UR_RESULT_SUCCESS;
32483246
}
32493247
case CGType::Fill2DUSM: {
@@ -3255,8 +3253,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
32553253
Result != UR_RESULT_SUCCESS)
32563254
return Result;
32573255

3258-
if (Event)
3259-
MEvent->setHandle(*Event);
3256+
SetEventHandleOrDiscard();
32603257
return UR_RESULT_SUCCESS;
32613258
}
32623259
case CGType::Memset2DUSM: {
@@ -3268,8 +3265,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
32683265
Result != UR_RESULT_SUCCESS)
32693266
return Result;
32703267

3271-
if (Event)
3272-
MEvent->setHandle(*Event);
3268+
SetEventHandleOrDiscard();
32733269
return UR_RESULT_SUCCESS;
32743270
}
32753271
case CGType::CodeplayHostTask: {
@@ -3409,8 +3405,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
34093405
MQueue->getAdapter()->call<UrApiKind::urEnqueueNativeCommandExp>(
34103406
MQueue->getHandleRef(), InteropFreeFunc, &CustomOpData, ReqMems.size(),
34113407
ReqMems.data(), nullptr, RawEvents.size(), RawEvents.data(), Event);
3412-
if (Event)
3413-
MEvent->setHandle(*Event);
3408+
SetEventHandleOrDiscard();
34143409
return UR_RESULT_SUCCESS;
34153410
}
34163411
case CGType::Barrier: {
@@ -3420,8 +3415,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
34203415
MEvent->setHostEnqueueTime();
34213416
Adapter->call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
34223417
MQueue->getHandleRef(), 0, nullptr, Event);
3423-
if (Event)
3424-
MEvent->setHandle(*Event);
3418+
SetEventHandleOrDiscard();
34253419
return UR_RESULT_SUCCESS;
34263420
}
34273421
case CGType::BarrierWaitlist: {
@@ -3438,8 +3432,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
34383432
MEvent->setHostEnqueueTime();
34393433
Adapter->call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
34403434
MQueue->getHandleRef(), UrEvents.size(), &UrEvents[0], Event);
3441-
if (Event)
3442-
MEvent->setHandle(*Event);
3435+
SetEventHandleOrDiscard();
34433436
return UR_RESULT_SUCCESS;
34443437
}
34453438
case CGType::ProfilingTag: {
@@ -3486,8 +3479,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
34863479
Adapter->call<UrApiKind::urEventRelease>(PostTimestampBarrierEvent);
34873480
}
34883481

3489-
if (Event)
3490-
MEvent->setHandle(*Event);
3482+
SetEventHandleOrDiscard();
34913483
return UR_RESULT_SUCCESS;
34923484
}
34933485
case CGType::CopyToDeviceGlobal: {
@@ -3500,8 +3492,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
35003492
Result != UR_RESULT_SUCCESS)
35013493
return Result;
35023494

3503-
if (Event)
3504-
MEvent->setHandle(*Event);
3495+
SetEventHandleOrDiscard();
35053496
return UR_RESULT_SUCCESS;
35063497
}
35073498
case CGType::CopyFromDeviceGlobal: {
@@ -3515,8 +3506,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
35153506
Result != UR_RESULT_SUCCESS)
35163507
return Result;
35173508

3518-
if (Event)
3519-
MEvent->setHandle(*Event);
3509+
SetEventHandleOrDiscard();
35203510
return UR_RESULT_SUCCESS;
35213511
}
35223512
case CGType::ReadWriteHostPipe: {
@@ -3547,8 +3537,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
35473537
CmdBufferCG->MCommandBuffer, MQueue->getHandleRef(),
35483538
RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
35493539
Event);
3550-
if (Event)
3551-
MEvent->setHandle(*Event);
3540+
SetEventHandleOrDiscard();
35523541

35533542
return Err;
35543543
}
@@ -3564,8 +3553,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
35643553
Result != UR_RESULT_SUCCESS)
35653554
return Result;
35663555

3567-
if (Event)
3568-
MEvent->setHandle(*Event);
3556+
SetEventHandleOrDiscard();
35693557

35703558
return UR_RESULT_SUCCESS;
35713559
}
@@ -3608,7 +3596,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
36083596
ur_result_t Result = Adapter->call_nocheck<UrApiKind::urEnqueueEventsWait>(
36093597
MQueue->getHandleRef(), RawEvents.size(),
36103598
RawEvents.size() ? &RawEvents[0] : nullptr, &Event);
3611-
MEvent->setHandle(Event);
3599+
SetEventHandleOrDiscard();
36123600
return Result;
36133601
}
36143602
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// REQUIRES: aspect-usm_device_allocations
2+
// RUN: %{build} %threads_lib -o %t.out
3+
// RUN: %{run} %t.out
4+
5+
// Regression test for a case where parallel work with enqueue functions
6+
// discarding their results would cause implicit waits on discarded events.
7+
8+
#include <sycl/detail/core.hpp>
9+
#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
10+
#include <sycl/properties/all_properties.hpp>
11+
#include <sycl/usm.hpp>
12+
#include <thread>
13+
14+
void threadFunction(int) {
15+
sycl::queue Q{{sycl::property::queue::in_order()}};
16+
17+
constexpr int Size = 128 * 128 * 128;
18+
int *DevMem = sycl::malloc_device<int>(Size, Q);
19+
20+
sycl::ext::oneapi::experimental::submit(
21+
Q, [&](sycl::handler &cgh) { cgh.fill<int>(DevMem, 1, Size); });
22+
Q.wait_and_throw();
23+
24+
sycl::free(DevMem, Q);
25+
}
26+
27+
int main() {
28+
constexpr size_t NThreads = 2;
29+
std::array<std::thread, NThreads> Threads;
30+
31+
for (size_t I = 0; I < NThreads; I++)
32+
Threads[I] = std::thread{threadFunction, I};
33+
for (size_t I = 0; I < NThreads; I++)
34+
Threads[I].join();
35+
36+
return 0;
37+
}

0 commit comments

Comments
 (0)