From 0e7d052ca676d889aa50df73c798ce587b9e7300 Mon Sep 17 00:00:00 2001 From: cllol <825832365@qq.com> Date: Mon, 4 Jul 2022 21:20:43 +0800 Subject: [PATCH 1/3] Expose cuda malloc/free async api --- cpm_kernels/library/cudart.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/cpm_kernels/library/cudart.py b/cpm_kernels/library/cudart.py index 477e6bf..3e114c5 100644 --- a/cpm_kernels/library/cudart.py +++ b/cpm_kernels/library/cudart.py @@ -375,10 +375,21 @@ def cudaMalloc(size : int) -> ctypes.c_void_p: checkCUDAStatus(cuda.cudaMalloc(ctypes.byref(ptr), size)) return ptr +@cuda.bind("cudaMallocAsync", [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t, cudaStream_t], cudaError_t) +def cudaMallocAsync(size : int, stream : cudaStream_t) -> ctypes.c_void_p: + ptr = ctypes.c_void_p() + checkCUDAStatus(cuda.cudaMallocAsync(ctypes.byref(ptr), size, stream)) + return ptr + @cuda.bind("cudaFree", [ctypes.c_void_p], cudaError_t) def cudaFree(ptr : ctypes.c_void_p) -> None: checkCUDAStatus(cuda.cudaFree(ptr)) +@cuda.bind("cudaFreeAsync", [ctypes.c_void_p, cudaStream_t], cudaError_t) +def cudaFreeAsync(ptr : ctypes.c_void_p, stream : cudaStream_t) -> None: + checkCUDAStatus(cuda.cudaFreeAsync(ptr, stream)) + return ptr + @cuda.bind("cudaMallocHost", [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t], cudaError_t) def cudaMallocHost(size : int) -> ctypes.c_void_p: ptr = ctypes.c_void_p() From 45c84842bc7f29e74e7f85b16aa8c915bb0a29e3 Mon Sep 17 00:00:00 2001 From: cllol <825832365@qq.com> Date: Mon, 4 Jul 2022 21:46:17 +0800 Subject: [PATCH 2/3] Update cudart.py --- cpm_kernels/library/cudart.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/cpm_kernels/library/cudart.py b/cpm_kernels/library/cudart.py index 3e114c5..c923b14 100644 --- a/cpm_kernels/library/cudart.py +++ b/cpm_kernels/library/cudart.py @@ -298,6 +298,11 @@ cudaMemcpyDefault = 4 # Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing +# For version compatible +MALLOC_AYNC_SUPPORT = False +GET_FUNC_BY_SYMBOL_SUPPORT = False + + class dim3(ctypes.Structure): _fields_ = [ ('x', ctypes.c_uint), @@ -340,6 +345,8 @@ def cudaDriverGetVersion() -> int: try: version = cudaRuntimeGetVersion() + if version >= 11200: MALLOC_AYNC_SUPPORT = True + if version >= 11000: GET_FUNC_BY_SYMBOL_SUPPORT = True except RuntimeError: version = 0 @@ -375,20 +382,21 @@ def cudaMalloc(size : int) -> ctypes.c_void_p: checkCUDAStatus(cuda.cudaMalloc(ctypes.byref(ptr), size)) return ptr -@cuda.bind("cudaMallocAsync", [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t, cudaStream_t], cudaError_t) -def cudaMallocAsync(size : int, stream : cudaStream_t) -> ctypes.c_void_p: - ptr = ctypes.c_void_p() - checkCUDAStatus(cuda.cudaMallocAsync(ctypes.byref(ptr), size, stream)) - return ptr - @cuda.bind("cudaFree", [ctypes.c_void_p], cudaError_t) def cudaFree(ptr : ctypes.c_void_p) -> None: checkCUDAStatus(cuda.cudaFree(ptr)) -@cuda.bind("cudaFreeAsync", [ctypes.c_void_p, cudaStream_t], cudaError_t) -def cudaFreeAsync(ptr : ctypes.c_void_p, stream : cudaStream_t) -> None: - checkCUDAStatus(cuda.cudaFreeAsync(ptr, stream)) - return ptr +if MALLOC_AYNC_SUPPORT: + @cuda.bind("cudaMallocAsync", [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t, cudaStream_t], cudaError_t) + def cudaMallocAsync(size : int, stream : cudaStream_t) -> ctypes.c_void_p: + ptr = ctypes.c_void_p() + checkCUDAStatus(cuda.cudaMallocAsync(ctypes.byref(ptr), size, stream)) + return ptr + + @cuda.bind("cudaFreeAsync", [ctypes.c_void_p, cudaStream_t], cudaError_t) + def cudaFreeAsync(ptr : ctypes.c_void_p, stream : cudaStream_t) -> None: + checkCUDAStatus(cuda.cudaFreeAsync(ptr, stream)) + return ptr @cuda.bind("cudaMallocHost", [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t], cudaError_t) def cudaMallocHost(size : int) -> ctypes.c_void_p: @@ -485,7 +493,7 @@ def cudaLaunchKernel( kernelParams = None checkCUDAStatus(cuda.cudaLaunchKernel(func, gridDim, blockDim, kernelParams, sharedMem, stream)) -if version >= 11000: +if GET_FUNC_BY_SYMBOL_SUPPORT: @cuda.bind("cudaGetFuncBySymbol", [ctypes.POINTER(ctypes.c_void_p), ctypes.c_void_p], cudaError_t) def cudaGetFuncBySymbol(func : ctypes.c_void_p) -> ctypes.c_void_p: ret = ctypes.c_void_p() From b883ebb6291399a005b00088f365872be96de6fd Mon Sep 17 00:00:00 2001 From: cllol <825832365@qq.com> Date: Mon, 4 Jul 2022 21:53:30 +0800 Subject: [PATCH 3/3] fix small error --- cpm_kernels/library/cudart.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cpm_kernels/library/cudart.py b/cpm_kernels/library/cudart.py index c923b14..2162ab4 100644 --- a/cpm_kernels/library/cudart.py +++ b/cpm_kernels/library/cudart.py @@ -396,7 +396,6 @@ def cudaMallocAsync(size : int, stream : cudaStream_t) -> ctypes.c_void_p: @cuda.bind("cudaFreeAsync", [ctypes.c_void_p, cudaStream_t], cudaError_t) def cudaFreeAsync(ptr : ctypes.c_void_p, stream : cudaStream_t) -> None: checkCUDAStatus(cuda.cudaFreeAsync(ptr, stream)) - return ptr @cuda.bind("cudaMallocHost", [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t], cudaError_t) def cudaMallocHost(size : int) -> ctypes.c_void_p: