diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4 index 67059a8c851..97f0ee31237 100644 --- a/config/opal_check_cuda.m4 +++ b/config/opal_check_cuda.m4 @@ -1,5 +1,6 @@ dnl -*- shell-script -*- dnl +dnl Copyright (c) 2024 NVIDIA Corporation. All rights reserved. dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana dnl University Research and Technology dnl Corporation. All rights reserved. @@ -113,6 +114,12 @@ AS_IF([test "$opal_check_cuda_happy"="yes"], [#include <$opal_cuda_incdir/cuda.h>]), []) +# If we have CUDA support, check to see if we have support for cuMemCreate memory on host NUMA. +AS_IF([test "$opal_check_cuda_happy"="yes"], + [AC_CHECK_DECL([CU_MEM_LOCATION_TYPE_HOST_NUMA], [CUDA_VMM_SUPPORT=1], [CUDA_VMM_SUPPORT=0], + [#include <$opal_cuda_incdir/cuda.h>])], + []) + AC_MSG_CHECKING([if have cuda support]) if test "$opal_check_cuda_happy" = "yes"; then AC_MSG_RESULT([yes (-I$opal_cuda_incdir)]) @@ -134,6 +141,10 @@ AM_CONDITIONAL([OPAL_cuda_sync_memops], [test "x$CUDA_SYNC_MEMOPS" = "x1"]) AC_DEFINE_UNQUOTED([OPAL_CUDA_SYNC_MEMOPS],$CUDA_SYNC_MEMOPS, [Whether we have CUDA CU_POINTER_ATTRIBUTE_SYNC_MEMOPS support available]) +AM_CONDITIONAL([OPAL_cuda_vmm_support], [test "x$CUDA_VMM_SUPPORT" = "x1"]) +AC_DEFINE_UNQUOTED([OPAL_CUDA_VMM_SUPPORT],$CUDA_VMM_SUPPORT, + [Whether we have CU_MEM_LOCATION_TYPE_HOST_NUMA support available]) + AM_CONDITIONAL([OPAL_cuda_get_attributes], [test "x$CUDA_GET_ATTRIBUTES" = "x1"]) AC_DEFINE_UNQUOTED([OPAL_CUDA_GET_ATTRIBUTES],$CUDA_GET_ATTRIBUTES, [Whether we have CUDA cuPointerGetAttributes function available]) diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index b8689dbf9cd..8d4c7033b32 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. @@ -106,6 +107,13 @@ struct cudaFunctionTable { int (*cuStreamDestroy)(CUstream); #if OPAL_CUDA_GET_ATTRIBUTES int (*cuPointerGetAttributes)(unsigned int, CUpointer_attribute *, void **, CUdeviceptr); +#if OPAL_CUDA_VMM_SUPPORT + int (*cuDeviceGetCount)(int*); + int (*cuMemRelease)(CUmemGenericAllocationHandle); + int (*cuMemRetainAllocationHandle)(CUmemGenericAllocationHandle*, void*); + int (*cuMemGetAllocationPropertiesFromHandle)(CUmemAllocationProp *, CUmemGenericAllocationHandle); + int (*cuMemGetAccess)(unsigned long long*, const CUmemLocation*, CUdeviceptr); +#endif #endif /* OPAL_CUDA_GET_ATTRIBUTES */ }; typedef struct cudaFunctionTable cudaFunctionTable_t; @@ -479,6 +487,13 @@ int mca_common_cuda_stage_one_init(void) #if OPAL_CUDA_GET_ATTRIBUTES OPAL_CUDA_DLSYM(libcuda_handle, cuPointerGetAttributes); #endif /* OPAL_CUDA_GET_ATTRIBUTES */ +#if OPAL_CUDA_VMM_SUPPORT + OPAL_CUDA_DLSYM(libcuda_handle, cuDeviceGetCount); + OPAL_CUDA_DLSYM(libcuda_handle, cuMemRelease); + OPAL_CUDA_DLSYM(libcuda_handle, cuMemRetainAllocationHandle); + OPAL_CUDA_DLSYM(libcuda_handle, cuMemGetAllocationPropertiesFromHandle); + OPAL_CUDA_DLSYM(libcuda_handle, cuMemGetAccess); +#endif return 0; } @@ -1730,13 +1745,77 @@ static float mydifftime(opal_timer_t ts_start, opal_timer_t ts_end) { } #endif /* OPAL_ENABLE_DEBUG */ +static int mca_common_cuda_check_vmm(CUdeviceptr dbuf, CUmemorytype *mem_type) +{ +#if OPAL_CUDA_VMM_SUPPORT + static int device_count = -1; + CUmemAllocationProp prop; + CUmemLocation location; + CUresult result; + unsigned long long flags; + CUmemGenericAllocationHandle alloc_handle; + + if (device_count == -1) { + result = cuFunc.cuDeviceGetCount(&device_count); + if (result != CUDA_SUCCESS) { + return 0; + } + } + + result = cuFunc.cuMemRetainAllocationHandle(&alloc_handle, (void*)dbuf); + if (result != CUDA_SUCCESS) { + return 0; + } + + result = cuFunc.cuMemGetAllocationPropertiesFromHandle(&prop, alloc_handle); + if (result != CUDA_SUCCESS) { + cuFunc.cuMemRelease(alloc_handle); + return 0; + } + + if (prop.location.type == CU_MEM_LOCATION_TYPE_DEVICE) { + *mem_type = CU_MEMORYTYPE_DEVICE; + cuFunc.cuMemRelease(alloc_handle); + return 1; + } + + if (prop.location.type == CU_MEM_LOCATION_TYPE_HOST_NUMA) { + /* check if device has access */ + for (int i = 0; i < device_count; i++) { + location.type = CU_MEM_LOCATION_TYPE_DEVICE; + location.id = i; + result = cuFunc.cuMemGetAccess(&flags, &location, dbuf); + if ((CUDA_SUCCESS == result) && + (CU_MEM_ACCESS_FLAGS_PROT_READWRITE == flags)) { + *mem_type = CU_MEMORYTYPE_DEVICE; + cuFunc.cuMemRelease(alloc_handle); + return 1; + } + } + } + + /* host must have access as device access possibility is exhausted */ + *mem_type = CU_MEMORYTYPE_HOST; + cuFunc.cuMemRelease(alloc_handle); + return 1; + +#endif + + return 0; +} + /* Routines that get plugged into the opal datatype code */ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t *convertor) { int res; + int is_vmm = 0; + CUmemorytype vmm_mem_type = 0; CUmemorytype memType = 0; CUdeviceptr dbuf = (CUdeviceptr)pUserBuf; CUcontext ctx = NULL, memCtx = NULL; + + is_vmm = mca_common_cuda_check_vmm(dbuf, &vmm_mem_type); + #if OPAL_CUDA_GET_ATTRIBUTES uint32_t isManaged = 0; /* With CUDA 7.0, we can get multiple attributes with a single call */ @@ -1763,8 +1842,12 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t * just assume it is not. */ return 0; } else if (memType == CU_MEMORYTYPE_HOST) { - /* Host memory, nothing to do here */ - return 0; + if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) { + memType = CU_MEMORYTYPE_DEVICE; + } else { + /* Host memory, nothing to do here */ + return 0; + } } else if (memType == 0) { /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */ return 0; @@ -1779,8 +1862,12 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t * just assume it is not. */ return 0; } else if (memType == CU_MEMORYTYPE_HOST) { - /* Host memory, nothing to do here */ - return 0; + if (is_vmm && (vmm_mem_type == CU_MEMORYTYPE_DEVICE)) { + memType = CU_MEMORYTYPE_DEVICE; + } else { + /* Host memory, nothing to do here */ + return 0; + } } /* Must be a device pointer */ assert(memType == CU_MEMORYTYPE_DEVICE); @@ -1806,6 +1893,16 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t return OPAL_ERROR; } #endif /* OPAL_CUDA_GET_ATTRIBUTES */ + if (is_vmm) { + /* This function is expected to set context if pointer is device + * accessible but VMM allocations have NULL context associated + * which cannot be set against the calling thread */ + opal_output(0, + "CUDA: unable to set context with the given pointer" + "ptr=%p aborting...", dbuf); + return OPAL_ERROR; + } + res = cuFunc.cuCtxSetCurrent(memCtx); if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { opal_output(0, "CUDA: error calling cuCtxSetCurrent: "