diff --git a/ompi/mca/coll/cuda/coll_cuda_allreduce.c b/ompi/mca/coll/cuda/coll_cuda_allreduce.c index 416c9c7fa8f..56ee52a8269 100644 --- a/ompi/mca/coll/cuda/coll_cuda_allreduce.c +++ b/ompi/mca/coll/cuda/coll_cuda_allreduce.c @@ -17,7 +17,7 @@ #include "ompi/op/op.h" #include "opal/datatype/opal_convertor.h" -#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/util/opal_cuda_copy.h" /* * allreduce_intra diff --git a/ompi/mca/coll/cuda/coll_cuda_exscan.c b/ompi/mca/coll/cuda/coll_cuda_exscan.c index 5f736697fe0..60293b75e2f 100644 --- a/ompi/mca/coll/cuda/coll_cuda_exscan.c +++ b/ompi/mca/coll/cuda/coll_cuda_exscan.c @@ -17,7 +17,7 @@ #include "ompi/op/op.h" #include "opal/datatype/opal_convertor.h" -#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/util/opal_cuda_copy.h" int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce.c b/ompi/mca/coll/cuda/coll_cuda_reduce.c index 5d82667b6bb..6308730d0a0 100644 --- a/ompi/mca/coll/cuda/coll_cuda_reduce.c +++ b/ompi/mca/coll/cuda/coll_cuda_reduce.c @@ -17,7 +17,7 @@ #include "ompi/op/op.h" #include "opal/datatype/opal_convertor.h" -#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/util/opal_cuda_copy.h" /* * reduce_log_inter diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c index 907257b0da8..3b9b6dd3b26 100644 --- a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c +++ b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c @@ -17,7 +17,7 @@ #include "ompi/op/op.h" #include "opal/datatype/opal_convertor.h" -#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/util/opal_cuda_copy.h" /* * reduce_scatter_block diff --git a/ompi/mca/coll/cuda/coll_cuda_scan.c b/ompi/mca/coll/cuda/coll_cuda_scan.c index 4e7300c12f8..fbae72d2934 100644 --- a/ompi/mca/coll/cuda/coll_cuda_scan.c +++ b/ompi/mca/coll/cuda/coll_cuda_scan.c @@ -17,7 
+17,7 @@ #include "ompi/op/op.h" #include "opal/datatype/opal_convertor.h" -#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/util/opal_cuda_copy.h" /* * scan diff --git a/ompi/mca/coll/libnbc/nbc_internal.h b/ompi/mca/coll/libnbc/nbc_internal.h index 51a8a6369ce..2e5f66ae41e 100644 --- a/ompi/mca/coll/libnbc/nbc_internal.h +++ b/ompi/mca/coll/libnbc/nbc_internal.h @@ -32,7 +32,7 @@ #include "coll_libnbc.h" #if OPAL_CUDA_SUPPORT #include "opal/datatype/opal_convertor.h" -#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/util/opal_cuda_copy.h" #endif /* OPAL_CUDA_SUPPORT */ #include "ompi/include/ompi/constants.h" #include "ompi/request/request.h" diff --git a/ompi/mca/common/ompio/common_ompio_buffer.c b/ompi/mca/common/ompio/common_ompio_buffer.c index dbd7e30e6b4..71d14983161 100644 --- a/ompi/mca/common/ompio/common_ompio_buffer.c +++ b/ompi/mca/common/ompio/common_ompio_buffer.c @@ -20,6 +20,7 @@ #include "ompi_config.h" #include "opal/datatype/opal_convertor.h" +#include "opal/util/opal_cuda_copy.h" #include "opal/mca/common/cuda/common_cuda.h" #include "opal/util/sys_limits.h" diff --git a/ompi/mca/mtl/base/mtl_base_datatype.h b/ompi/mca/mtl/base/mtl_base_datatype.h index 544ca32abc7..2f181c90c3c 100644 --- a/ompi/mca/mtl/base/mtl_base_datatype.h +++ b/ompi/mca/mtl/base/mtl_base_datatype.h @@ -28,6 +28,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" #include "opal/datatype/opal_convertor.h" +#include "opal/util/opal_cuda_copy.h" #endif #ifndef MTL_BASE_DATATYPE_H_INCLUDED diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index e122663db07..f0605d160af 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -53,6 +53,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" #include "opal/datatype/opal_convertor.h" +#include "opal/util/opal_cuda_copy.h" #endif BEGIN_C_DECLS diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c 
b/ompi/mca/osc/rdma/osc_rdma_component.c index cdc71ad3056..aab1f44381d 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -51,7 +51,7 @@ #include "opal/util/printf.h" #include "opal/util/sys_limits.h" #if OPAL_CUDA_SUPPORT -#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/util/opal_cuda_copy.h" #endif /* OPAL_CUDA_SUPPORT */ #include "opal/util/info_subscriber.h" #include "opal/mca/mpool/base/base.h" diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index ed7ec7d4360..9a3d5bf9f4b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -55,6 +55,7 @@ #include "pml_ob1_hdr.h" #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" +#include "opal/util/opal_cuda_copy.h" #endif /* OPAL_CUDA_SUPPORT */ OBJ_CLASS_INSTANCE( mca_pml_ob1_buffer_t, diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 64910b72c40..a99b2e74dde 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -46,6 +46,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" +#include "opal/util/opal_cuda_copy.h" #endif /* OPAL_CUDA_SUPPORT */ #if OPAL_CUDA_SUPPORT diff --git a/opal/Makefile.am b/opal/Makefile.am index ca90f302860..d3df8a25489 100644 --- a/opal/Makefile.am +++ b/opal/Makefile.am @@ -22,10 +22,8 @@ # $HEADER$ # -if OPAL_cuda_support SUBDIRS = \ include \ - mca/common/cuda \ datatype \ etc \ util \ @@ -39,36 +37,12 @@ SUBDIRS = \ # therefore make distclean will fail). DIST_SUBDIRS = \ include \ - mca/common/cuda \ datatype \ etc \ util \ mca/base \ $(MCA_opal_FRAMEWORKS_SUBDIRS) \ $(MCA_opal_FRAMEWORK_COMPONENT_ALL_SUBDIRS) -else -SUBDIRS = \ - include \ - datatype \ - etc \ - util \ - mca/base \ - $(MCA_opal_FRAMEWORKS_SUBDIRS) \ - $(MCA_opal_FRAMEWORK_COMPONENT_STATIC_SUBDIRS) \ - . 
\ - $(MCA_opal_FRAMEWORK_COMPONENT_DSO_SUBDIRS) -# libltdl is included by variable because if --disable-dlopen was -# used, there will be no generated Makefile in that directory (and -# therefore make distclean will fail). -DIST_SUBDIRS = \ - include \ - datatype \ - etc \ - util \ - mca/base \ - $(MCA_opal_FRAMEWORKS_SUBDIRS) \ - $(MCA_opal_FRAMEWORK_COMPONENT_ALL_SUBDIRS) -endif # Build the main OPAL library lib_LTLIBRARIES = lib@OPAL_LIB_NAME@.la diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 365ddc45a1e..7dfe2616f3e 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -39,7 +39,7 @@ #include "opal/datatype/opal_datatype_internal.h" #include "opal/datatype/opal_datatype_prototypes.h" #if OPAL_CUDA_SUPPORT -# include "opal/mca/common/cuda/common_cuda.h" +# include "opal/util/opal_cuda_copy.h" # define MEMCPY_CUDA(DST, SRC, BLENGTH, CONVERTOR) \ CONVERTOR->cbmemcpy((DST), (SRC), (BLENGTH), (CONVERTOR)) #endif diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c index 465f479e46c..77a51b65788 100644 --- a/opal/datatype/opal_datatype_copy.c +++ b/opal/datatype/opal_datatype_copy.c @@ -73,7 +73,7 @@ static size_t opal_datatype_memop_block_size = 128 * 1024; #include "opal_datatype_copy.h" #if OPAL_CUDA_SUPPORT -# include "opal/mca/common/cuda/common_cuda.h" +# include "opal/util/opal_cuda_copy.h" # undef MEM_OP_NAME # define MEM_OP_NAME non_overlap_cuda diff --git a/opal/datatype/opal_datatype_pack_unpack_predefined.h b/opal/datatype/opal_datatype_pack_unpack_predefined.h index 261f2d48adf..a1d6ccde732 100644 --- a/opal/datatype/opal_datatype_pack_unpack_predefined.h +++ b/opal/datatype/opal_datatype_pack_unpack_predefined.h @@ -54,7 +54,7 @@ #define OPAL_DATATYPE_PACK_UNPACK_PREDEFINED_H_HAS_BEEN_INCLUDED #include "opal_config.h" -#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/util/opal_cuda_copy.h" #include /* Improve predefined pack/unpack performance 
using mpich methods. diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 2fdc4b100e3..fc302c690dd 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -34,6 +34,7 @@ #include "opal/align.h" #include "opal/datatype/opal_convertor.h" #include "opal/util/argv.h" +#include "opal/util/opal_cuda_copy.h" #include "opal/util/output.h" #include "opal/util/printf.h" #include "opal/util/proc.h" @@ -2054,256 +2055,4 @@ void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg) } } -static bool initialized = false; -int opal_cuda_verbose = 0; -static int opal_cuda_enabled = 0; /* Starts out disabled */ -static int opal_cuda_output = 0; -static void opal_cuda_support_init(void); -static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL; -static opal_common_cuda_function_table_t ftable; - -/* This function allows the common cuda code to register an - * initialization function that gets called the first time an attempt - * is made to send or receive a GPU pointer. This allows us to delay - * some CUDA initialization until after MPI_Init(). - */ -void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)) -{ - common_cuda_initialization_function = fptr; -} - -/** - * This function is called when a convertor is instantiated. It has to call - * the opal_cuda_support_init() function once to figure out if CUDA support - * is enabled or not. If CUDA is not enabled, then short circuit out - * for all future calls. 
- */ -void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf) -{ - /* Only do the initialization on the first GPU access */ - if (!initialized) { - opal_cuda_support_init(); - } - - /* This is needed to handle case where convertor is not fully initialized - * like when trying to do a sendi with convertor on the statck */ - convertor->cbmemcpy = (memcpy_fct_t) &opal_cuda_memcpy; - - /* If not enabled, then nothing else to do */ - if (!opal_cuda_enabled) { - return; - } - - if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) { - convertor->flags |= CONVERTOR_CUDA; - } -} - -/* Checks the type of pointer - * - * @param dest One pointer to check - * @param source Another pointer to check - */ -bool opal_cuda_check_bufs(char *dest, char *src) -{ - /* Only do the initialization on the first GPU access */ - if (!initialized) { - opal_cuda_support_init(); - } - - if (!opal_cuda_enabled) { - return false; - } - - if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) { - return true; - } else { - return false; - } -} - -/* - * With CUDA enabled, all contiguous copies will pass through this function. - * Therefore, the first check is to see if the convertor is a GPU buffer. - * Note that if there is an error with any of the CUDA calls, the program - * aborts as there is no recovering. - */ - -/* Checks the type of pointer - * - * @param buf check one pointer providing a convertor. - * Provides aditional information, e.g. managed vs. unmanaged GPU buffer - */ -bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor) -{ - /* Only do the initialization on the first GPU access */ - if (!initialized) { - opal_cuda_support_init(); - } - - if (!opal_cuda_enabled) { - return false; - } - - return (ftable.gpu_is_gpu_buffer(buf, convertor)); -} - -/* - * This function allocates a buffer using either cuMemAlloc - * or malloc, depending on if the convertor flag CONVERTOR_CUDA - * is set. 
- * - * @param size Size of buffer to be allocated - * @param convertor The convertor with flags describing if the buf - * should be a Host or Cuda buffer. - * - * @returns void * A pointer to the newly allocated buffer. - */ -void *opal_cuda_malloc(size_t size, opal_convertor_t *convertor) -{ - int res; - void *buffer; - if (!(convertor->flags & CONVERTOR_CUDA)) { - return malloc(size); - } - res = ftable.gpu_malloc(buffer, size); - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemAlloc: size=%d", (int) size); - abort(); - } else { - return buffer; - } -} - -/* - * This function frees a buffer using either cuMemFree() or free(), - * depending on if the convertor flag CONVERTOR_CUDA is set. - * - * @param buffer Pointer to buffer to be freed - * @param convertor The convertor with flags describing if the buf - * should be a Host or Cuda buffer. - * - */ -void opal_cuda_free(void *buffer, opal_convertor_t *convertor) -{ - int res; - if (!(convertor->flags & CONVERTOR_CUDA)) { - free(buffer); - return; - } - res = ftable.gpu_free(buffer); - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemFree: ptr=%p", buffer); - abort(); - } - return; -} - -/* - * With CUDA enabled, all contiguous copies will pass through this function. - * Therefore, the first check is to see if the convertor is a GPU buffer. - * Note that if there is an error with any of the CUDA calls, the program - * aborts as there is no recovering. 
- */ - -void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t *convertor) -{ - int res; - - if (!(convertor->flags & CONVERTOR_CUDA)) { - return memcpy(dest, src, size); - } - - if (convertor->flags & CONVERTOR_CUDA_ASYNC) { - res = ftable.gpu_cu_memcpy_async(dest, (void *) src, size, convertor); - } else { - res = ftable.gpu_cu_memcpy(dest, (void *) src, size); - } - - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, - (int) size); - abort(); - } else { - return dest; - } -} - -/* - * This function is needed in cases where we do not have contiguous - * datatypes. The current code has macros that cannot handle a convertor - * argument to the memcpy call. - */ -void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size) -{ - int res; - res = ftable.gpu_cu_memcpy(dest, src, size); - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, - (int) size); - abort(); - } else { - return dest; - } -} - -/* - * In some cases, need an implementation of memmove. This is not fast, but - * it is not often needed. - */ -void *opal_cuda_memmove(void *dest, void *src, size_t size) -{ - int res; - - res = ftable.gpu_memmove(dest, src, size); - if (res != 0) { - opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d", res, dest, - src, (int) size); - abort(); - } - return dest; -} - -/** - * This function gets called once to check if the program is running in a cuda - * environment. - */ -static void opal_cuda_support_init(void) -{ - if (initialized) { - return; - } - - /* Set different levels of verbosity in the cuda related code. */ - opal_cuda_output = opal_output_open(NULL); - opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose); - - /* Callback into the common cuda initialization routine. 
This is only - * set if some work had been done already in the common cuda code.*/ - if (NULL != common_cuda_initialization_function) { - if (0 == common_cuda_initialization_function(&ftable)) { - opal_cuda_enabled = 1; - } - } - - if (1 == opal_cuda_enabled) { - opal_output_verbose(10, opal_cuda_output, - "CUDA: enabled successfully, CUDA device pointers will work"); - } else { - opal_output_verbose(10, opal_cuda_output, - "CUDA: not enabled, CUDA device pointers will not work"); - } - - initialized = true; -} - -/** - * Tell the convertor that copies will be asynchronous CUDA copies. The - * flags are cleared when the convertor is reinitialized. - */ -void opal_cuda_set_copy_function_async(opal_convertor_t *convertor, void *stream) -{ - convertor->flags |= CONVERTOR_CUDA_ASYNC; - convertor->stream = stream; -} #endif /* OPAL_CUDA_GDR_SUPPORT */ diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index 0889551640d..80781047f06 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -111,29 +111,4 @@ static inline int32_t opal_convertor_cuda_need_buffers(opal_convertor_t *pConver pConvertor->flags |= cudaflag; /* Restore CUDA flag */ return retval; } - -/* Structure to hold CUDA support functions that gets filled in when the - * common cuda code is initialized. This removes any dependency on - * in the opal cuda datatype code. 
*/ -struct opal_common_cuda_function_table { - int (*gpu_is_gpu_buffer)(const void *, opal_convertor_t *); - int (*gpu_cu_memcpy_async)(void *, const void *, size_t, opal_convertor_t *); - int (*gpu_cu_memcpy)(void *, const void *, size_t); - int (*gpu_memmove)(void *, void *, size_t); - int (*gpu_malloc)(void *, size_t); - int (*gpu_free)(void *); -}; -typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t; - -void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf); -bool opal_cuda_check_bufs(char *dest, char *src); -bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor); -void *opal_cuda_malloc(size_t size, opal_convertor_t *convertor); -void opal_cuda_free(void *buffer, opal_convertor_t *convertor); -void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t *convertor); -void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size); -void *opal_cuda_memmove(void *dest, void *src, size_t size); -void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)); -void opal_cuda_set_copy_function_async(opal_convertor_t *convertor, void *stream); - #endif /* OPAL_MCA_COMMON_CUDA_H */ diff --git a/opal/util/Makefile.am b/opal/util/Makefile.am index 4a457d7e1b3..e09818182f0 100644 --- a/opal/util/Makefile.am +++ b/opal/util/Makefile.am @@ -124,6 +124,12 @@ if OPAL_COMPILE_TIMING libopalutil_la_SOURCES += timings.c endif +# If we have cuda support, modify file list and flags +if OPAL_cuda_support +libopalutil_la_SOURCES += opal_cuda_copy.c +headers += opal_cuda_copy.h +endif + libopalutil_la_LIBADD = \ keyval/libopalutilkeyval.la libopalutil_la_DEPENDENCIES = \ diff --git a/opal/util/opal_cuda_copy.c b/opal/util/opal_cuda_copy.c new file mode 100644 index 00000000000..26482b553f0 --- /dev/null +++ b/opal/util/opal_cuda_copy.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved. 
+ * Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "opal_config.h"
+
+#include
+#include
+#include
+
+#include "opal/align.h"
+#include "opal/util/output.h"
+#include "opal/datatype/opal_convertor.h"
+#include "opal/util/opal_cuda_copy.h"
+
+static bool initialized = false; /* set to true at the end of opal_cuda_support_init() */
+int opal_cuda_verbose = 0; /* verbosity level applied to opal_cuda_output */
+static int opal_cuda_enabled = 0; /* Starts out disabled; set to 1 when the init callback returns 0 */
+static int opal_cuda_output = 0; /* stream id returned by opal_output_open() */
+static void opal_cuda_support_init(void);
+static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL;
+static opal_common_cuda_function_table_t ftable; /* filled in by the registered init callback */
+
+/* Registers the callback that opal_cuda_support_init() later invokes to fill
+ * in the CUDA function table. The callback runs lazily, the first time a
+ * convertor is set up or a buffer is checked. This allows us to delay
+ * some CUDA initialization until after MPI_Init().
+ */
+void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *))
+{
+    common_cuda_initialization_function = fptr; /* replaces any earlier registration */
+}
+
+/**
+ * This function is called when a convertor is instantiated. It has to call
+ * the opal_cuda_support_init() function once to figure out if CUDA support
+ * is enabled or not. If CUDA is not enabled, then short circuit out
+ * for all future calls.
+ */
+void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf)
+{
+    /* Run the one-time CUDA support probe if it has not happened yet */
+    if (!initialized) {
+        opal_cuda_support_init();
+    }
+
+    /* This is needed to handle case where convertor is not fully initialized
+     * like when trying to do a sendi with convertor on the stack */
+    convertor->cbmemcpy = (memcpy_fct_t) &opal_cuda_memcpy;
+
+    /* If not enabled, then nothing else to do */
+    if (!opal_cuda_enabled) {
+        return;
+    }
+
+    if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) {
+        convertor->flags |= CONVERTOR_CUDA; /* the registered checker reported a GPU buffer */
+    }
+}
+
+/* Returns true when either pointer is reported as a GPU buffer; returns
+ * false when neither is, or when CUDA support is not enabled.
+ * @param dest One pointer to check
+ * @param src Another pointer to check
+ */
+bool opal_cuda_check_bufs(char *dest, char *src)
+{
+    /* Run the one-time CUDA support probe if it has not happened yet */
+    if (!initialized) {
+        opal_cuda_support_init();
+    }
+
+    if (!opal_cuda_enabled) {
+        return false;
+    }
+
+    /* No convertor is available here, so the checker is called with NULL */
+    if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) {
+        return true;
+    } else {
+        return false;
+    }
+}
+
+/*
+ * NOTE(review): a copy-path description (contiguous copies, abort on CUDA
+ * error) was placed here, but it documents opal_cuda_memcpy() below, which
+ * carries the same text. The function that follows only classifies a
+ * pointer and performs no copy.
+ */
+
+/* Returns true when buf is reported as a GPU buffer; returns false otherwise
+ * or when CUDA support is not enabled.
+ * @param buf check one pointer providing a convertor.
+ * Provides additional information, e.g. managed vs. unmanaged GPU buffer
+ */
+bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor)
+{
+    /* Run the one-time CUDA support probe if it has not happened yet */
+    if (!initialized) {
+        opal_cuda_support_init();
+    }
+
+    if (!opal_cuda_enabled) {
+        return false;
+    }
+
+    return (ftable.gpu_is_gpu_buffer(buf, convertor));
+}
+
+/*
+ * This function allocates a buffer using either cuMemAlloc
+ * or malloc, depending on if the convertor flag CONVERTOR_CUDA
+ * is set.
+ * + * @param size Size of buffer to be allocated + * @param convertor The convertor with flags describing if the buf + * should be a Host or Cuda buffer. + * + * @returns void * A pointer to the newly allocated buffer. + */ +void *opal_cuda_malloc(size_t size, opal_convertor_t *convertor) +{ + int res; + void *buffer; + if (!(convertor->flags & CONVERTOR_CUDA)) { + return malloc(size); + } + res = ftable.gpu_malloc(buffer, size); + if (res != 0) { + opal_output(0, "CUDA: Error in cuMemAlloc: size=%d", (int) size); + abort(); + } else { + return buffer; + } +} + +/* + * This function frees a buffer using either cuMemFree() or free(), + * depending on if the convertor flag CONVERTOR_CUDA is set. + * + * @param buffer Pointer to buffer to be freed + * @param convertor The convertor with flags describing if the buf + * should be a Host or Cuda buffer. + * + */ +void opal_cuda_free(void *buffer, opal_convertor_t *convertor) +{ + int res; + if (!(convertor->flags & CONVERTOR_CUDA)) { + free(buffer); + return; + } + res = ftable.gpu_free(buffer); + if (res != 0) { + opal_output(0, "CUDA: Error in cuMemFree: ptr=%p", buffer); + abort(); + } + return; +} + +/* + * With CUDA enabled, all contiguous copies will pass through this function. + * Therefore, the first check is to see if the convertor is a GPU buffer. + * Note that if there is an error with any of the CUDA calls, the program + * aborts as there is no recovering. 
+ */ + +void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t *convertor) +{ + int res; + + if (!(convertor->flags & CONVERTOR_CUDA)) { + return memcpy(dest, src, size); + } + + if (convertor->flags & CONVERTOR_CUDA_ASYNC) { + res = ftable.gpu_cu_memcpy_async(dest, (void *) src, size, convertor); + } else { + res = ftable.gpu_cu_memcpy(dest, (void *) src, size); + } + + if (res != 0) { + opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, + (int) size); + abort(); + } else { + return dest; + } +} + +/* + * This function is needed in cases where we do not have contiguous + * datatypes. The current code has macros that cannot handle a convertor + * argument to the memcpy call. + */ +void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size) +{ + int res; + res = ftable.gpu_cu_memcpy(dest, src, size); + if (res != 0) { + opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, + (int) size); + abort(); + } else { + return dest; + } +} + +/* + * In some cases, need an implementation of memmove. This is not fast, but + * it is not often needed. + */ +void *opal_cuda_memmove(void *dest, void *src, size_t size) +{ + int res; + + res = ftable.gpu_memmove(dest, src, size); + if (res != 0) { + opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d", res, dest, + src, (int) size); + abort(); + } + return dest; +} + +/** + * This function gets called once to check if the program is running in a cuda + * environment. + */ +static void opal_cuda_support_init(void) +{ + if (initialized) { + return; + } + + /* Set different levels of verbosity in the cuda related code. */ + opal_cuda_output = opal_output_open(NULL); + opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose); + + /* Callback into the common cuda initialization routine. 
This is only + * set if some work had been done already in the common cuda code.*/ + if (NULL != common_cuda_initialization_function) { + if (0 == common_cuda_initialization_function(&ftable)) { + opal_cuda_enabled = 1; + } + } + + if (1 == opal_cuda_enabled) { + opal_output_verbose(10, opal_cuda_output, + "CUDA: enabled successfully, CUDA device pointers will work"); + } else { + opal_output_verbose(10, opal_cuda_output, + "CUDA: not enabled, CUDA device pointers will not work"); + } + + initialized = true; +} + +/** + * Tell the convertor that copies will be asynchronous CUDA copies. The + * flags are cleared when the convertor is reinitialized. + */ +void opal_cuda_set_copy_function_async(opal_convertor_t *convertor, void *stream) +{ + convertor->flags |= CONVERTOR_CUDA_ASYNC; + convertor->stream = stream; +} diff --git a/opal/util/opal_cuda_copy.h b/opal/util/opal_cuda_copy.h new file mode 100644 index 00000000000..d9065f4a82d --- /dev/null +++ b/opal/util/opal_cuda_copy.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All Rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OPAL_CUDA_COPY_H +#define OPAL_CUDA_COPY_H + +/* Structure to hold CUDA support functions that gets filled in when the + * common cuda code is initialized. This removes any dependency on + * in the opal cuda datatype code. 
*/
+struct opal_common_cuda_function_table {
+    int (*gpu_is_gpu_buffer)(const void *, opal_convertor_t *); /* callers treat nonzero as "is a GPU buffer" */
+    int (*gpu_cu_memcpy_async)(void *, const void *, size_t, opal_convertor_t *); /* callers abort on nonzero */
+    int (*gpu_cu_memcpy)(void *, const void *, size_t); /* callers abort on nonzero */
+    int (*gpu_memmove)(void *, void *, size_t); /* callers abort on nonzero */
+    int (*gpu_malloc)(void *, size_t); /* callers abort on nonzero; TODO confirm whether arg 1 is the address that receives the new pointer */
+    int (*gpu_free)(void *); /* callers abort on nonzero */
+};
+typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t;
+/* NOTE(review): nothing is included above, so bool, size_t and opal_convertor_t must be declared by the includer. */
+void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf);
+bool opal_cuda_check_bufs(char *dest, char *src);
+bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor);
+void *opal_cuda_malloc(size_t size, opal_convertor_t *convertor);
+void opal_cuda_free(void *buffer, opal_convertor_t *convertor);
+void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t *convertor);
+void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size);
+void *opal_cuda_memmove(void *dest, void *src, size_t size);
+void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *));
+void opal_cuda_set_copy_function_async(opal_convertor_t *convertor, void *stream);
+
+#endif /* OPAL_CUDA_COPY_H */