@@ -690,12 +690,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
690
690
}
691
691
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
692
692
CUDA_MEMCPY2D cpy_desc = {};
693
- cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
694
- cpy_desc.srcHost = pSrc;
695
693
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
696
694
cpy_desc.srcY = srcOffset.y ;
697
695
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
698
696
cpy_desc.dstY = dstOffset.y ;
697
+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
698
+ cpy_desc.srcHost = pSrc;
699
699
cpy_desc.srcPitch = hostExtent.width * PixelSizeBytes;
700
700
if (pImageDesc->rowPitch == 0 ) {
701
701
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
@@ -781,8 +781,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
781
781
cpy_desc.srcY = srcOffset.y ;
782
782
cpy_desc.dstXInBytes = dstOffset.x ;
783
783
cpy_desc.dstY = dstOffset.y ;
784
- cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
785
- cpy_desc.dstHost = pDst;
786
784
if (pImageDesc->rowPitch == 0 ) {
787
785
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
788
786
cpy_desc.srcArray = (CUarray)pSrc;
@@ -792,6 +790,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
792
790
cpy_desc.srcPitch = pImageDesc->rowPitch ;
793
791
cpy_desc.srcDevice = (CUdeviceptr)pSrc;
794
792
}
793
+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
794
+ cpy_desc.dstHost = pDst;
795
795
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
796
796
cpy_desc.Height = copyExtent.height ;
797
797
UR_CHECK_ERROR (cuMemcpy2DAsync (&cpy_desc, Stream));
@@ -831,10 +831,74 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
831
831
UR_CHECK_ERROR (cuMemcpy3DAsync (&cpy_desc, Stream));
832
832
}
833
833
} else {
834
- // / imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE
835
- // / TODO: implemet device to device copy
836
- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
834
+ // imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE
835
+ if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
836
+ CUDA_MEMCPY2D cpy_desc = {};
837
+ cpy_desc.srcXInBytes = srcOffset.x ;
838
+ cpy_desc.srcY = 0 ;
839
+ cpy_desc.dstXInBytes = dstOffset.x ;
840
+ cpy_desc.dstY = 0 ;
841
+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
842
+ cpy_desc.srcArray = (CUarray)pSrc;
843
+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
844
+ cpy_desc.dstArray = (CUarray)pDst;
845
+ cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
846
+ cpy_desc.Height = 1 ;
847
+ UR_CHECK_ERROR (cuMemcpy2DAsync (&cpy_desc, Stream));
848
+ } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
849
+ CUDA_MEMCPY2D cpy_desc = {};
850
+ cpy_desc.srcXInBytes = srcOffset.x ;
851
+ cpy_desc.srcY = srcOffset.y ;
852
+ cpy_desc.dstXInBytes = dstOffset.x ;
853
+ cpy_desc.dstY = dstOffset.y ;
854
+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
855
+ cpy_desc.srcArray = (CUarray)pSrc;
856
+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
857
+ cpy_desc.dstArray = (CUarray)pDst;
858
+ cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
859
+ cpy_desc.Height = copyExtent.height ;
860
+ UR_CHECK_ERROR (cuMemcpy2DAsync (&cpy_desc, Stream));
861
+ } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
862
+ CUDA_MEMCPY3D cpy_desc = {};
863
+ cpy_desc.srcXInBytes = srcOffset.x ;
864
+ cpy_desc.srcY = srcOffset.y ;
865
+ cpy_desc.srcZ = srcOffset.z ;
866
+ cpy_desc.dstXInBytes = dstOffset.x ;
867
+ cpy_desc.dstY = dstOffset.y ;
868
+ cpy_desc.dstZ = dstOffset.z ;
869
+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
870
+ cpy_desc.srcArray = (CUarray)pSrc;
871
+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
872
+ cpy_desc.dstArray = (CUarray)pDst;
873
+ cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
874
+ cpy_desc.Height = copyExtent.height ;
875
+ cpy_desc.Depth = copyExtent.depth ;
876
+ UR_CHECK_ERROR (cuMemcpy3DAsync (&cpy_desc, Stream));
877
+ } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D_ARRAY ||
878
+ pImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY ||
879
+ pImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) {
880
+ CUDA_MEMCPY3D cpy_desc = {};
881
+ cpy_desc.srcXInBytes = srcOffset.x ;
882
+ cpy_desc.srcY = srcOffset.y ;
883
+ cpy_desc.srcZ = srcOffset.z ;
884
+ cpy_desc.dstXInBytes = dstOffset.x ;
885
+ cpy_desc.dstY = dstOffset.y ;
886
+ cpy_desc.dstZ = dstOffset.z ;
887
+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
888
+ cpy_desc.srcArray = (CUarray)pSrc;
889
+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
890
+ cpy_desc.dstArray = (CUarray)pDst;
891
+ cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
892
+ cpy_desc.Height = std::max (uint64_t {1 }, copyExtent.height );
893
+ cpy_desc.Depth = pImageDesc->arraySize ;
894
+ UR_CHECK_ERROR (cuMemcpy3DAsync (&cpy_desc, Stream));
895
+ }
837
896
}
897
+ // Synchronization is required here to handle the case of copying data from
898
+ // host to device, then device to device and finally device to host.
899
+ // Without it, there is a risk of the copies not being executed in the
900
+ // intended order.
901
+ cuStreamSynchronize (Stream);
838
902
if (phEvent) {
839
903
auto NewEvent = ur_event_handle_t_::makeNative (UR_COMMAND_MEM_IMAGE_COPY,
840
904
hQueue, Stream);
0 commit comments