@@ -690,12 +690,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
690690 }
691691 } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
692692 CUDA_MEMCPY2D cpy_desc = {};
693- cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
694- cpy_desc.srcHost = pSrc;
695693 cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
696694 cpy_desc.srcY = srcOffset.y ;
697695 cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
698696 cpy_desc.dstY = dstOffset.y ;
697+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
698+ cpy_desc.srcHost = pSrc;
699699 cpy_desc.srcPitch = hostExtent.width * PixelSizeBytes;
700700 if (pImageDesc->rowPitch == 0 ) {
701701 cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
@@ -781,8 +781,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
781781 cpy_desc.srcY = srcOffset.y ;
782782 cpy_desc.dstXInBytes = dstOffset.x ;
783783 cpy_desc.dstY = dstOffset.y ;
784- cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
785- cpy_desc.dstHost = pDst;
786784 if (pImageDesc->rowPitch == 0 ) {
787785 cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
788786 cpy_desc.srcArray = (CUarray)pSrc;
@@ -792,6 +790,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
792790 cpy_desc.srcPitch = pImageDesc->rowPitch ;
793791 cpy_desc.srcDevice = (CUdeviceptr)pSrc;
794792 }
793+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
794+ cpy_desc.dstHost = pDst;
795795 cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
796796 cpy_desc.Height = copyExtent.height ;
797797 UR_CHECK_ERROR (cuMemcpy2DAsync (&cpy_desc, Stream));
@@ -831,10 +831,79 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
831831 UR_CHECK_ERROR (cuMemcpy3DAsync (&cpy_desc, Stream));
832832 }
833833 } else {
834- // / imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE
835- // / TODO: implemet device to device copy
836- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
834+ // imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE
835+
836+ // All the following async copy function calls should be treated as
837+ // synchronous because of the explicit call to cuStreamSynchronize at
838+ // the end
839+ if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
840+ CUDA_MEMCPY2D cpy_desc = {};
841+ cpy_desc.srcXInBytes = srcOffset.x ;
842+ cpy_desc.srcY = 0 ;
843+ cpy_desc.dstXInBytes = dstOffset.x ;
844+ cpy_desc.dstY = 0 ;
845+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
846+ cpy_desc.srcArray = (CUarray)pSrc;
847+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
848+ cpy_desc.dstArray = (CUarray)pDst;
849+ cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
850+ cpy_desc.Height = 1 ;
851+ UR_CHECK_ERROR (cuMemcpy2DAsync (&cpy_desc, Stream));
852+ } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
853+ CUDA_MEMCPY2D cpy_desc = {};
854+ cpy_desc.srcXInBytes = srcOffset.x ;
855+ cpy_desc.srcY = srcOffset.y ;
856+ cpy_desc.dstXInBytes = dstOffset.x ;
857+ cpy_desc.dstY = dstOffset.y ;
858+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
859+ cpy_desc.srcArray = (CUarray)pSrc;
860+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
861+ cpy_desc.dstArray = (CUarray)pDst;
862+ cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
863+ cpy_desc.Height = copyExtent.height ;
864+ UR_CHECK_ERROR (cuMemcpy2DAsync (&cpy_desc, Stream));
865+ } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
866+ CUDA_MEMCPY3D cpy_desc = {};
867+ cpy_desc.srcXInBytes = srcOffset.x ;
868+ cpy_desc.srcY = srcOffset.y ;
869+ cpy_desc.srcZ = srcOffset.z ;
870+ cpy_desc.dstXInBytes = dstOffset.x ;
871+ cpy_desc.dstY = dstOffset.y ;
872+ cpy_desc.dstZ = dstOffset.z ;
873+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
874+ cpy_desc.srcArray = (CUarray)pSrc;
875+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
876+ cpy_desc.dstArray = (CUarray)pDst;
877+ cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
878+ cpy_desc.Height = copyExtent.height ;
879+ cpy_desc.Depth = copyExtent.depth ;
880+ UR_CHECK_ERROR (cuMemcpy3DAsync (&cpy_desc, Stream));
881+ } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D_ARRAY ||
882+ pImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY ||
883+ pImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) {
884+ CUDA_MEMCPY3D cpy_desc = {};
885+ cpy_desc.srcXInBytes = srcOffset.x ;
886+ cpy_desc.srcY = srcOffset.y ;
887+ cpy_desc.srcZ = srcOffset.z ;
888+ cpy_desc.dstXInBytes = dstOffset.x ;
889+ cpy_desc.dstY = dstOffset.y ;
890+ cpy_desc.dstZ = dstOffset.z ;
891+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
892+ cpy_desc.srcArray = (CUarray)pSrc;
893+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
894+ cpy_desc.dstArray = (CUarray)pDst;
895+ cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
896+ cpy_desc.Height = std::max (uint64_t {1 }, copyExtent.height );
897+ cpy_desc.Depth = pImageDesc->arraySize ;
898+ UR_CHECK_ERROR (cuMemcpy3DAsync (&cpy_desc, Stream));
899+ }
900+ // Synchronization is required here to handle the case of copying data
901+ // from host to device, then device to device and finally device to host.
902+ // Without it, there is a risk of the copies not being executed in the
903+ // intended order.
904+ cuStreamSynchronize (Stream);
837905 }
906+
838907 if (phEvent) {
839908 auto NewEvent = ur_event_handle_t_::makeNative (UR_COMMAND_MEM_IMAGE_COPY,
840909 hQueue, Stream);
0 commit comments