@@ -1289,16 +1289,26 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
1289
1289
}
1290
1290
1291
1291
auto Queue = EventList[I]->UrQueue ;
1292
- if (Queue) {
1293
- // The caller of createAndRetainUrZeEventList must already hold
1294
- // a lock of the CurQueue. Additionally lock the Queue if it
1295
- // is different from CurQueue.
1296
- // TODO: rework this to avoid deadlock when another thread is
1297
- // locking the same queues but in a different order.
1298
- auto Lock = ((Queue == CurQueue)
1299
- ? std::unique_lock<ur_shared_mutex>()
1300
- : std::unique_lock<ur_shared_mutex>(Queue->Mutex ));
1301
1292
1293
+ auto CurQueueDevice = CurQueue->Device ;
1294
+ std::optional<std::unique_lock<ur_shared_mutex>> QueueLock =
1295
+ std::nullopt;
1296
+ // The caller of createAndRetainUrZeEventList must already hold
1297
+ // a lock of the CurQueue. However, if the CurQueue is different
1298
+ // than the Event's Queue, we need to drop that lock and
1299
+ // acquire the Event's Queue lock. This is done to avoid a lock
1300
+ // ordering issue.
1301
+ // For the rest of this scope, CurQueue cannot be accessed.
1302
+ // TODO: This solution is very error-prone. This requires a refactor
1303
+ // to either have fine-granularity locks inside of the queues or
1304
+ // to move any operations on queues other than CurQueue out
1305
+ // of this scope.
1306
+ if (Queue && Queue != CurQueue) {
1307
+ CurQueue->Mutex .unlock ();
1308
+ QueueLock = std::unique_lock<ur_shared_mutex>(Queue->Mutex );
1309
+ }
1310
+
1311
+ if (Queue) {
1302
1312
// If the event that is going to be waited is in an open batch
1303
1313
// different from where this next command is going to be added,
1304
1314
// then we have to force execute of that open command-list
@@ -1341,7 +1351,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
1341
1351
}
1342
1352
1343
1353
ur_command_list_ptr_t CommandList;
1344
- if (Queue && Queue->Device != CurQueue-> Device ) {
1354
+ if (Queue && Queue->Device != CurQueueDevice ) {
1345
1355
// Get a command list prior to acquiring an event lock.
1346
1356
// This prevents a potential deadlock with recursive
1347
1357
// event locks.
@@ -1351,7 +1361,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
1351
1361
1352
1362
std::shared_lock<ur_shared_mutex> Lock (EventList[I]->Mutex );
1353
1363
1354
- if (Queue && Queue->Device != CurQueue-> Device &&
1364
+ if (Queue && Queue->Device != CurQueueDevice &&
1355
1365
!EventList[I]->IsMultiDevice ) {
1356
1366
ze_event_handle_t MultiDeviceZeEvent = nullptr ;
1357
1367
ur_event_handle_t MultiDeviceEvent;
@@ -1386,6 +1396,10 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
1386
1396
this ->UrEventList [TmpListLength]->RefCount .increment ();
1387
1397
}
1388
1398
1399
+ if (QueueLock.has_value ()) {
1400
+ QueueLock.reset ();
1401
+ CurQueue->Mutex .lock ();
1402
+ }
1389
1403
TmpListLength += 1 ;
1390
1404
}
1391
1405
}
0 commit comments