@@ -659,6 +659,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
659659 enqueueEventsWait (hQueue, Stream, numEventsInWaitList, phEventWaitList);
660660
661661 // We have to use a different copy function for each image dimensionality.
662+ // All the async copy functions should be treated as synchronous because of
663+ // the explicit call to cuStreamSynchronize at the end.
662664
663665 if (imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE) {
664666 if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
@@ -893,12 +895,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
893895 cpy_desc.Depth = pImageDesc->arraySize ;
894896 UR_CHECK_ERROR (cuMemcpy3DAsync (&cpy_desc, Stream));
895897 }
898+ // Synchronization is required here to handle the case of copying data
899+ // from
900+ // host to device, then device to device and finally device to host.
901+ // Without it, there is a risk of the copies not being executed in the
902+ // intended order.
903+ cuStreamSynchronize (Stream);
896904 }
897- // Synchronization is required here to handle the case of copying data from
898- // host to device, then device to device and finally device to host.
899- // Without it, there is a risk of the copies not being executed in the
900- // intended order.
901- cuStreamSynchronize (Stream);
905+
902906 if (phEvent) {
903907 auto NewEvent = ur_event_handle_t_::makeNative (UR_COMMAND_MEM_IMAGE_COPY,
904908 hQueue, Stream);
0 commit comments