Skip to content

Commit a44e81b

Browse files
authored
Merge pull request #1546 from pbalcer/fix-queue-locking
fix queue locking behavior when creating event lists
2 parents 9d5865e + e4ecd60 commit a44e81b

File tree

1 file changed

+25
-11
lines changed

1 file changed

+25
-11
lines changed

source/adapters/level_zero/event.cpp

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1289,16 +1289,26 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
12891289
}
12901290

12911291
auto Queue = EventList[I]->UrQueue;
1292-
if (Queue) {
1293-
// The caller of createAndRetainUrZeEventList must already hold
1294-
// a lock of the CurQueue. Additionally lock the Queue if it
1295-
// is different from CurQueue.
1296-
// TODO: rework this to avoid deadlock when another thread is
1297-
// locking the same queues but in a different order.
1298-
auto Lock = ((Queue == CurQueue)
1299-
? std::unique_lock<ur_shared_mutex>()
1300-
: std::unique_lock<ur_shared_mutex>(Queue->Mutex));
13011292

1293+
auto CurQueueDevice = CurQueue->Device;
1294+
std::optional<std::unique_lock<ur_shared_mutex>> QueueLock =
1295+
std::nullopt;
1296+
// The caller of createAndRetainUrZeEventList must already hold
1297+
// a lock of the CurQueue. However, if the CurQueue is different
1298+
// then the Event's Queue, we need to drop that lock and
1299+
// acquire the Event's Queue lock. This is done to avoid a lock
1300+
// ordering issue.
1301+
// For the rest of this scope, CurQueue cannot be accessed.
1302+
// TODO: This solution is very error-prone. This requires a refactor
1303+
// to either have fine-granularity locks inside of the queues or
1304+
// to move any operations on queues other than CurQueue out
1305+
// of this scope.
1306+
if (Queue && Queue != CurQueue) {
1307+
CurQueue->Mutex.unlock();
1308+
QueueLock = std::unique_lock<ur_shared_mutex>(Queue->Mutex);
1309+
}
1310+
1311+
if (Queue) {
13021312
// If the event that is going to be waited is in an open batch
13031313
// different from where this next command is going to be added,
13041314
// then we have to force execute of that open command-list
@@ -1341,7 +1351,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
13411351
}
13421352

13431353
ur_command_list_ptr_t CommandList;
1344-
if (Queue && Queue->Device != CurQueue->Device) {
1354+
if (Queue && Queue->Device != CurQueueDevice) {
13451355
// Get a command list prior to acquiring an event lock.
13461356
// This prevents a potential deadlock with recursive
13471357
// event locks.
@@ -1351,7 +1361,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
13511361

13521362
std::shared_lock<ur_shared_mutex> Lock(EventList[I]->Mutex);
13531363

1354-
if (Queue && Queue->Device != CurQueue->Device &&
1364+
if (Queue && Queue->Device != CurQueueDevice &&
13551365
!EventList[I]->IsMultiDevice) {
13561366
ze_event_handle_t MultiDeviceZeEvent = nullptr;
13571367
ur_event_handle_t MultiDeviceEvent;
@@ -1386,6 +1396,10 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
13861396
this->UrEventList[TmpListLength]->RefCount.increment();
13871397
}
13881398

1399+
if (QueueLock.has_value()) {
1400+
QueueLock.reset();
1401+
CurQueue->Mutex.lock();
1402+
}
13891403
TmpListLength += 1;
13901404
}
13911405
}

0 commit comments

Comments
 (0)