Commit ecbff5d
Author: Hugh Delaney
1 parent 78d0203 commit ecbff5d

Several changes to kernel launch

- Don't automatically initialize the event vector unless extra events are needed for mem args.
- Record an event when it is needed for mem args, even if phEvent is nullptr.
- Turn migrateMemoryToDeviceIfNeeded into an async enqueue. This is needed because all the previous work that the memory migration depends on is synchronized with on a stream, so the memory transfers must be dispatched on that same stream.
- Add a helper function to check whether a list contains an element.
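
The helper mentioned in the last bullet appears at the call sites below as listContainsElem; its definition lives in one of the seven changed files but is not part of this excerpt. A minimal sketch of the shape the call sites imply (the template form is an assumption, not the committed code):

// Sketch only: shape inferred from the call sites in enqueue.cpp, where it
// is passed (numEventsInWaitList, phEventWaitList, MemDepEvent).
template <typename T>
bool listContainsElem(uint32_t NumElems, const T *List, const T &Elem) {
  for (uint32_t i = 0; i < NumElems; ++i) {
    if (List[i] == Elem)
      return true;
  }
  return false;
}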

File tree: 7 files changed, +196 -103 lines changed


source/adapters/cuda/enqueue.cpp

Lines changed: 50 additions & 23 deletions
@@ -414,8 +414,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
   UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
 
-  std::vector<ur_event_handle_t> DepEvents(
-      phEventWaitList, phEventWaitList + numEventsInWaitList);
+  std::vector<ur_event_handle_t> MemMigrationEvents;
   std::vector<std::pair<ur_mem_handle_t, ur_lock>> MemMigrationLocks;
 
   // phEventWaitList only contains events that are handed to UR by the SYCL
@@ -427,9 +426,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   for (auto &MemArg : hKernel->Args.MemObjArgs) {
     bool PushBack = false;
     if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj;
-        MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(),
-                                 MemDepEvent) == DepEvents.end()) {
-      DepEvents.push_back(MemDepEvent);
+        MemDepEvent && !listContainsElem(numEventsInWaitList, phEventWaitList,
+                                         MemDepEvent)) {
+      MemMigrationEvents.push_back(MemDepEvent);
       PushBack = true;
     }
     if ((MemArg.AccessFlags &
@@ -477,19 +476,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     CUstream CuStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
-    if (DepEvents.size()) {
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(),
-                                       DepEvents.data()));
-    }
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
+                                     phEventWaitList));
 
     // For memory migration across devices in the same context
     if (hQueue->getContext()->Devices.size() > 1) {
+      if (MemMigrationEvents.size()) {
+        UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream,
+                                         MemMigrationEvents.size(),
+                                         MemMigrationEvents.data()));
+      }
       for (auto &MemArg : hKernel->Args.MemObjArgs) {
-        migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
+        enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(),
+                                             CuStream);
       }
     }
 
-    if (phEvent) {
+    if (phEvent || MemMigrationEvents.size()) {
       RetImplEvent =
           std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
               UR_COMMAND_KERNEL_LAUNCH, hQueue, CuStream, StreamToken));
@@ -522,8 +525,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
+    } else if (MemMigrationEvents.size()) {
+      UR_CHECK_ERROR(RetImplEvent->record());
+      for (auto &MemArg : hKernel->Args.MemObjArgs) {
+        // If no event is passed to entry point, we still need to have an event
+        // if ur_mem_handle_t s are used. Here we give ownership of the event
+        // to the ur_mem_handle_t
+        if (MemArg.AccessFlags &
+            (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) {
+          MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.release());
+        }
+      }
     }
-
   } catch (ur_result_t Err) {
     return Err;
   }
@@ -603,8 +616,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }
   }
 
-  std::vector<ur_event_handle_t> DepEvents(
-      phEventWaitList, phEventWaitList + numEventsInWaitList);
+  std::vector<ur_event_handle_t> MemMigrationEvents;
   std::vector<std::pair<ur_mem_handle_t, ur_lock>> MemMigrationLocks;
 
   // phEventWaitList only contains events that are handed to UR by the SYCL
@@ -616,9 +628,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   for (auto &MemArg : hKernel->Args.MemObjArgs) {
     bool PushBack = false;
     if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj;
-        MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(),
-                                 MemDepEvent) == DepEvents.end()) {
-      DepEvents.push_back(MemDepEvent);
+        MemDepEvent && !listContainsElem(numEventsInWaitList, phEventWaitList,
+                                         MemDepEvent)) {
+      MemMigrationEvents.push_back(MemDepEvent);
       PushBack = true;
     }
     if ((MemArg.AccessFlags &
@@ -666,19 +678,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     CUstream CuStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
-    if (DepEvents.size()) {
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(),
-                                       DepEvents.data()));
-    }
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
+                                     phEventWaitList));
 
     // For memory migration across devices in the same context
     if (hQueue->getContext()->Devices.size() > 1) {
+      if (MemMigrationEvents.size()) {
+        UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream,
+                                         MemMigrationEvents.size(),
+                                         MemMigrationEvents.data()));
+      }
       for (auto &MemArg : hKernel->Args.MemObjArgs) {
-        migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
+        enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(),
+                                             CuStream);
       }
     }
 
-    if (phEvent) {
+    if (phEvent || MemMigrationEvents.size()) {
       RetImplEvent =
           std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
               UR_COMMAND_KERNEL_LAUNCH, hQueue, CuStream, StreamToken));
@@ -724,6 +740,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
+    } else if (MemMigrationEvents.size()) {
+      UR_CHECK_ERROR(RetImplEvent->record());
+      for (auto &MemArg : hKernel->Args.MemObjArgs) {
+        // If no event is passed to entry point, we still need to have an event
+        // if ur_mem_handle_t s are used. Here we give ownership of the event
+        // to the ur_mem_handle_t
+        if (MemArg.AccessFlags &
+            (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) {
+          MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.release());
+        }
+      }
     }
 
   } catch (ur_result_t Err) {
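
Why the migration must be enqueued on CuStream rather than performed eagerly: the wait list and the mem-arg dependency events are ordered onto that stream via enqueueEventsWait, and CUDA's stream-ordered semantics then guarantee the copies run after them. A sketch of the underlying driver-API pattern (not adapter code; all identifiers are placeholders assumed to be created by the caller):

#include <cuda.h>

void orderCopyAfterWrite(CUevent WriteDone, CUstream ProducerStream,
                         CUstream ConsumerStream, CUdeviceptr Dst,
                         CUdeviceptr Src, size_t Size) {
  // Producer marks the end of its write with an event.
  cuEventRecord(WriteDone, ProducerStream);
  // Consumer stream waits on that event; the async copy enqueued next on the
  // same stream cannot begin until the wait is satisfied, and a kernel
  // launched afterwards on ConsumerStream is ordered behind the copy.
  cuStreamWaitEvent(ConsumerStream, WriteDone, 0);
  cuMemcpyDtoDAsync(Dst, Src, Size, ConsumerStream);
}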

source/adapters/cuda/memory.cpp

Lines changed: 32 additions & 19 deletions
@@ -12,6 +12,7 @@
 
 #include "common.hpp"
 #include "context.hpp"
+#include "enqueue.hpp"
 #include "memory.hpp"
 
 /// Creates a UR Memory object using a CUDA memory allocation.
@@ -238,7 +239,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
   try {
     if (PerformInitialCopy) {
       for (const auto &Device : hContext->getDevices()) {
-        UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device));
+        // Synchronous behaviour is best in this case
+        ScopedContext Active(Device);
+        CUstream Stream{0}; // Use default stream
+        UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(),
+                                                            Device, Stream));
+        UR_CHECK_ERROR(cuStreamSynchronize(Stream));
       }
     }
 
@@ -496,27 +502,29 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
 }
 
 namespace {
-ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem,
-                                  ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem,
+                                         ur_device_handle_t hDevice,
+                                         CUstream Stream) {
   auto &Buffer = std::get<BufferMem>(Mem->Mem);
   if (Mem->LastEventWritingToMemObj == nullptr) {
     // Device allocation being initialized from host for the first time
     if (Buffer.HostPtr) {
-      UR_CHECK_ERROR(
-          cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size));
+      UR_CHECK_ERROR(cuMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr,
+                                       Buffer.Size, Stream));
     }
   } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
              hDevice) {
-    UR_CHECK_ERROR(cuMemcpyDtoD(
+    UR_CHECK_ERROR(cuMemcpyDtoDAsync(
         Buffer.getPtr(hDevice),
        Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
-        Buffer.Size));
+        Buffer.Size, Stream));
   }
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
-                                 ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem,
+                                        ur_device_handle_t hDevice,
+                                        CUstream Stream) {
   auto &Image = std::get<SurfaceMem>(Mem->Mem);
   // When a dimension isn't used image_desc has the size set to 1
   size_t PixelSizeBytes = Image.PixelTypeSizeBytes *
@@ -550,21 +558,24 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
   if (Mem->LastEventWritingToMemObj == nullptr) {
     if (Image.HostPtr) {
       if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
-        UR_CHECK_ERROR(
-            cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
+        UR_CHECK_ERROR(cuMemcpyHtoAAsync(ImageArray, 0, Image.HostPtr,
+                                         ImageSizeBytes, Stream));
       } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
         CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
         CpyDesc2D.srcHost = Image.HostPtr;
-        UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+        UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream));
       } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
         CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
         CpyDesc3D.srcHost = Image.HostPtr;
-        UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+        UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream));
       }
     }
   } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
             hDevice) {
     if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+      // Blocking wait needed since we need to sync LastEventWritingToMemObj's
+      // queue, as well as the current queue with LastEventWritingToMemObj
+      UR_CHECK_ERROR(urEventWait(1, &Mem->LastEventWritingToMemObj));
       // FIXME: 1D memcpy from DtoD going through the host.
       UR_CHECK_ERROR(cuMemcpyAtoH(
           Image.HostPtr,
@@ -574,13 +585,15 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
       UR_CHECK_ERROR(
           cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+      CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE;
       CpyDesc2D.srcArray = Image.getArray(
           Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+      UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+      CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE;
       CpyDesc3D.srcArray = Image.getArray(
           Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+      UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream));
     }
   }
   return UR_RESULT_SUCCESS;
@@ -589,8 +602,8 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
 
 // If calling this entry point it is necessary to lock the memoryMigrationMutex
 // beforehand
-ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
-                                          const ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(
+    ur_mem_handle_t Mem, const ur_device_handle_t hDevice, CUstream Stream) {
   UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   // Device allocation has already been initialized with most up to date
   // data in buffer
@@ -601,9 +614,9 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
 
   ScopedContext Active(hDevice);
   if (Mem->isBuffer()) {
-    UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream));
   } else {
-    UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream));
   }
 
   Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex(
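
Callers that still want the old blocking semantics compose the new async entry point with a stream synchronize, exactly as urMemImageCreate now does. The same pattern extracted into a hypothetical wrapper (migrateBlocking is not part of this commit; Mem and Device stand in for real handles):

// Hypothetical convenience wrapper; mirrors the urMemImageCreate pattern.
ur_result_t migrateBlocking(ur_mem_handle_t Mem, ur_device_handle_t Device) {
  ScopedContext Active(Device); // make Device's CUDA context current
  CUstream Stream{0};           // legacy default stream, as in the diff
  UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(Mem, Device, Stream));
  UR_CHECK_ERROR(cuStreamSynchronize(Stream)); // block until the copies land
  return UR_RESULT_SUCCESS;
}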

source/adapters/cuda/memory.hpp

Lines changed: 6 additions & 0 deletions
@@ -20,6 +20,12 @@
 #include "device.hpp"
 #include "event.hpp"
 
+ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+                                           const ur_device_handle_t);
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
+                                                 const ur_device_handle_t,
+                                                 CUstream);
+
 // Handler for plain, pointer-based CUDA allocations
 struct BufferMem {
