1313#include " common.hpp"
1414#include " usm.hpp"
1515
// Event callback that frees a heap-allocated "deleter info" object once the
// event it was attached to fires.
//
// Template parameter:
//   T - concrete type of the object passed through pUserData. Elsewhere in
//       this file the callback is instantiated with AllocDeleterCallbackInfo
//       and AllocDeleterCallbackInfoWithQueue and registered through
//       clSetEventCallback for CL_COMPLETE.
//
// Parameters:
//   event     - the event the callback fired for; it is released here.
//   (cl_int)  - unnamed second argument, not used by this function.
//   pUserData - pointer that is cast to T* and deleted. The objects passed in
//               this file are created with `new`.
//
// NOTE(review): the actual resource cleanup presumably happens in T's
// destructor, which is not visible in this excerpt - confirm in usm.hpp.
// NOTE(review): this span is a diff rendering; the `-` line is the old cast to
// a fixed type and the `+` line is its templated replacement.
16+ template <class T >
1617void AllocDeleterCallback (cl_event event, cl_int, void *pUserData) {
// Release the event first; the return value of clReleaseEvent is not checked.
1718 clReleaseEvent (event);
18- auto Info = static_cast <AllocDeleterCallbackInfo *>(pUserData);
// Recover the typed pointer so that `delete` runs the destructor of T.
19+ auto Info = static_cast <T *>(pUserData);
1920 delete Info;
2021}
2122
@@ -301,7 +302,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
301302 auto Info = new AllocDeleterCallbackInfo (USMFree, CLContext, HostBuffer);
302303
303304 ClErr =
304- clSetEventCallback (CopyEvent, CL_COMPLETE, AllocDeleterCallback, Info);
305+ clSetEventCallback (CopyEvent, CL_COMPLETE,
306+ AllocDeleterCallback<AllocDeleterCallbackInfo>, Info);
305307 if (ClErr != CL_SUCCESS) {
306308 // We can attempt to recover gracefully by attempting to wait for the copy
307309 // to finish and deleting the info struct here.
@@ -361,6 +363,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
361363 sizeof (cl_device_id), &DstDevice, nullptr ));
362364
363365 if ((SrcDevice && DstDevice) && SrcDevice != DstDevice) {
366+ // We need a queue associated with each device, so first figure out which
367+ // one we weren't given.
368+ cl_device_id QueueDevice = nullptr ;
369+ CL_RETURN_ON_FAILURE (clGetCommandQueueInfo (
370+ cl_adapter::cast<cl_command_queue>(hQueue), CL_QUEUE_DEVICE,
371+ sizeof (QueueDevice), &QueueDevice, nullptr ));
372+
373+ cl_command_queue MissingQueue = nullptr , SrcQueue = nullptr ,
374+ DstQueue = nullptr ;
375+ if (QueueDevice == SrcDevice) {
376+ MissingQueue = clCreateCommandQueue (CLContext, DstDevice, 0 , &CLErr);
377+ SrcQueue = cl_adapter::cast<cl_command_queue>(hQueue);
378+ DstQueue = MissingQueue;
379+ } else {
380+ MissingQueue = clCreateCommandQueue (CLContext, SrcDevice, 0 , &CLErr);
381+ DstQueue = cl_adapter::cast<cl_command_queue>(hQueue);
382+ SrcQueue = MissingQueue;
383+ }
384+ CL_RETURN_ON_FAILURE (CLErr);
385+
364386 cl_event HostCopyEvent = nullptr , FinalCopyEvent = nullptr ;
365387 clHostMemAllocINTEL_fn HostMemAlloc = nullptr ;
366388 UR_RETURN_ON_FAILURE (cl_ext::getExtFuncFromContext<clHostMemAllocINTEL_fn>(
@@ -387,19 +409,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
387409 };
388410
389411 UR_RETURN_ON_FAILURE (checkCLErr (USMMemcpy (
390- cl_adapter::cast<cl_command_queue>(hQueue), blocking, HostAlloc, pSrc,
391- size, numEventsInWaitList,
412+ SrcQueue, blocking, HostAlloc, pSrc, size, numEventsInWaitList,
392413 cl_adapter::cast<const cl_event *>(phEventWaitList), &HostCopyEvent)));
393414
394- UR_RETURN_ON_FAILURE (checkCLErr (
395- USMMemcpy (cl_adapter::cast<cl_command_queue>(hQueue) , blocking, pDst,
396- HostAlloc, size, 1 , &HostCopyEvent, &FinalCopyEvent)));
415+ UR_RETURN_ON_FAILURE (
416+ checkCLErr ( USMMemcpy (DstQueue , blocking, pDst, HostAlloc, size, 1 ,
417+ &HostCopyEvent, &FinalCopyEvent)));
397418
398419 // If this is a blocking operation we can do our cleanup immediately,
399420 // otherwise we need to defer it to an event callback.
400421 if (blocking) {
401422 CL_RETURN_ON_FAILURE (USMFree (CLContext, HostAlloc));
402423 CL_RETURN_ON_FAILURE (clReleaseEvent (HostCopyEvent));
424+ CL_RETURN_ON_FAILURE (clReleaseCommandQueue (MissingQueue));
403425 if (phEvent) {
404426 *phEvent = cl_adapter::cast<ur_event_handle_t >(FinalCopyEvent);
405427 } else {
@@ -414,11 +436,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
414436 }
415437
416438 // This self destructs taking the event and allocation with it.
417- auto DeleterInfo =
418- new AllocDeleterCallbackInfo{ USMFree, CLContext, HostAlloc} ;
439+ auto DeleterInfo = new AllocDeleterCallbackInfoWithQueue (
440+ USMFree, CLContext, HostAlloc, MissingQueue) ;
419441
420- CLErr = clSetEventCallback (HostCopyEvent, CL_COMPLETE,
421- AllocDeleterCallback, DeleterInfo);
442+ CLErr = clSetEventCallback (
443+ HostCopyEvent, CL_COMPLETE,
444+ AllocDeleterCallback<AllocDeleterCallbackInfoWithQueue>, DeleterInfo);
422445
423446 if (CLErr != CL_SUCCESS) {
424447 // We can attempt to recover gracefully by attempting to wait for the
0 commit comments