diff --git a/ompi/mca/coll/cuda/coll_cuda_allreduce.c b/ompi/mca/coll/cuda/coll_cuda_allreduce.c
index 416c9c7fa8f..56ee52a8269 100644
--- a/ompi/mca/coll/cuda/coll_cuda_allreduce.c
+++ b/ompi/mca/coll/cuda/coll_cuda_allreduce.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/util/opal_cuda_copy.h"
 
 /*
  *	allreduce_intra
diff --git a/ompi/mca/coll/cuda/coll_cuda_exscan.c b/ompi/mca/coll/cuda/coll_cuda_exscan.c
index 5f736697fe0..60293b75e2f 100644
--- a/ompi/mca/coll/cuda/coll_cuda_exscan.c
+++ b/ompi/mca/coll/cuda/coll_cuda_exscan.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/util/opal_cuda_copy.h"
 
 int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype,
diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce.c b/ompi/mca/coll/cuda/coll_cuda_reduce.c
index 5d82667b6bb..6308730d0a0 100644
--- a/ompi/mca/coll/cuda/coll_cuda_reduce.c
+++ b/ompi/mca/coll/cuda/coll_cuda_reduce.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/util/opal_cuda_copy.h"
 
 /*
  *	reduce_log_inter
diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
index 907257b0da8..3b9b6dd3b26 100644
--- a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
+++ b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/util/opal_cuda_copy.h"
 
 /*
  *	reduce_scatter_block
diff --git a/ompi/mca/coll/cuda/coll_cuda_scan.c b/ompi/mca/coll/cuda/coll_cuda_scan.c
index 4e7300c12f8..fbae72d2934 100644
--- a/ompi/mca/coll/cuda/coll_cuda_scan.c
+++ b/ompi/mca/coll/cuda/coll_cuda_scan.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/util/opal_cuda_copy.h"
 
 /*
  *	scan
diff --git a/ompi/mca/coll/libnbc/nbc_internal.h b/ompi/mca/coll/libnbc/nbc_internal.h
index 51a8a6369ce..2e5f66ae41e 100644
--- a/ompi/mca/coll/libnbc/nbc_internal.h
+++ b/ompi/mca/coll/libnbc/nbc_internal.h
@@ -32,7 +32,7 @@
 #include "coll_libnbc.h"
 #if OPAL_CUDA_SUPPORT
 #include "opal/datatype/opal_convertor.h"
-#include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/util/opal_cuda_copy.h"
 #endif /* OPAL_CUDA_SUPPORT */
 #include "ompi/include/ompi/constants.h"
 #include "ompi/request/request.h"
diff --git a/ompi/mca/common/ompio/common_ompio_buffer.c b/ompi/mca/common/ompio/common_ompio_buffer.c
index dbd7e30e6b4..71d14983161 100644
--- a/ompi/mca/common/ompio/common_ompio_buffer.c
+++ b/ompi/mca/common/ompio/common_ompio_buffer.c
@@ -20,6 +20,7 @@
 #include "ompi_config.h"
 
 #include "opal/datatype/opal_convertor.h"
+#include "opal/util/opal_cuda_copy.h"
 #include "opal/mca/common/cuda/common_cuda.h"
 #include "opal/util/sys_limits.h"
 
diff --git a/ompi/mca/mtl/base/mtl_base_datatype.h b/ompi/mca/mtl/base/mtl_base_datatype.h
index 544ca32abc7..2f181c90c3c 100644
--- a/ompi/mca/mtl/base/mtl_base_datatype.h
+++ b/ompi/mca/mtl/base/mtl_base_datatype.h
@@ -28,6 +28,7 @@
 #if OPAL_CUDA_SUPPORT
 #include "opal/mca/common/cuda/common_cuda.h"
 #include "opal/datatype/opal_convertor.h"
+#include "opal/util/opal_cuda_copy.h"
 #endif
 
 #ifndef MTL_BASE_DATATYPE_H_INCLUDED
diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h
index e122663db07..f0605d160af 100644
--- a/ompi/mca/mtl/ofi/mtl_ofi.h
+++ b/ompi/mca/mtl/ofi/mtl_ofi.h
@@ -53,6 +53,7 @@
 #if OPAL_CUDA_SUPPORT
 #include "opal/mca/common/cuda/common_cuda.h"
 #include "opal/datatype/opal_convertor.h"
+#include "opal/util/opal_cuda_copy.h"
 #endif
 
 BEGIN_C_DECLS
diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c
index cdc71ad3056..aab1f44381d 100644
--- a/ompi/mca/osc/rdma/osc_rdma_component.c
+++ b/ompi/mca/osc/rdma/osc_rdma_component.c
@@ -51,7 +51,7 @@
 #include "opal/util/printf.h"
 #include "opal/util/sys_limits.h"
 #if OPAL_CUDA_SUPPORT
-#include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/util/opal_cuda_copy.h"
 #endif /* OPAL_CUDA_SUPPORT */
 #include "opal/util/info_subscriber.h"
 #include "opal/mca/mpool/base/base.h"
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
index ed7ec7d4360..9a3d5bf9f4b 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
@@ -55,6 +55,7 @@
 #include "pml_ob1_hdr.h"
 #if OPAL_CUDA_SUPPORT
 #include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/util/opal_cuda_copy.h"
 #endif /* OPAL_CUDA_SUPPORT */
 
 OBJ_CLASS_INSTANCE( mca_pml_ob1_buffer_t,
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index 64910b72c40..a99b2e74dde 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -46,6 +46,7 @@
 
 #if OPAL_CUDA_SUPPORT
 #include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/util/opal_cuda_copy.h"
 #endif /* OPAL_CUDA_SUPPORT */
 
 #if OPAL_CUDA_SUPPORT
diff --git a/opal/Makefile.am b/opal/Makefile.am
index ca90f302860..d3df8a25489 100644
--- a/opal/Makefile.am
+++ b/opal/Makefile.am
@@ -22,10 +22,8 @@
 # $HEADER$
 #
 
-if OPAL_cuda_support
 SUBDIRS = \
 	include \
-        mca/common/cuda \
         datatype \
         etc \
         util \
@@ -39,36 +37,12 @@ SUBDIRS = \
 # therefore make distclean will fail).
 DIST_SUBDIRS = \
 	include \
-        mca/common/cuda \
         datatype \
         etc \
 	util \
 	mca/base \
 	$(MCA_opal_FRAMEWORKS_SUBDIRS) \
 	$(MCA_opal_FRAMEWORK_COMPONENT_ALL_SUBDIRS)
-else
-SUBDIRS = \
-	include \
-        datatype \
-        etc \
-        util \
-	mca/base \
-	$(MCA_opal_FRAMEWORKS_SUBDIRS) \
-	$(MCA_opal_FRAMEWORK_COMPONENT_STATIC_SUBDIRS) \
-        . \
-	$(MCA_opal_FRAMEWORK_COMPONENT_DSO_SUBDIRS)
-# libltdl is included by variable because if --disable-dlopen was
-# used, there will be no generated Makefile in that directory (and
-# therefore make distclean will fail).
-DIST_SUBDIRS = \
-	include \
-        datatype \
-        etc \
-	util \
-	mca/base \
-	$(MCA_opal_FRAMEWORKS_SUBDIRS) \
-	$(MCA_opal_FRAMEWORK_COMPONENT_ALL_SUBDIRS)
-endif
 
 # Build the main OPAL library
 lib_LTLIBRARIES = lib@OPAL_LIB_NAME@.la
diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c
index 365ddc45a1e..7dfe2616f3e 100644
--- a/opal/datatype/opal_convertor.c
+++ b/opal/datatype/opal_convertor.c
@@ -39,7 +39,7 @@
 #include "opal/datatype/opal_datatype_internal.h"
 #include "opal/datatype/opal_datatype_prototypes.h"
 #if OPAL_CUDA_SUPPORT
-#    include "opal/mca/common/cuda/common_cuda.h"
+#    include "opal/util/opal_cuda_copy.h"
 #    define MEMCPY_CUDA(DST, SRC, BLENGTH, CONVERTOR) \
         CONVERTOR->cbmemcpy((DST), (SRC), (BLENGTH), (CONVERTOR))
 #endif
diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c
index 465f479e46c..77a51b65788 100644
--- a/opal/datatype/opal_datatype_copy.c
+++ b/opal/datatype/opal_datatype_copy.c
@@ -73,7 +73,7 @@ static size_t opal_datatype_memop_block_size = 128 * 1024;
 #include "opal_datatype_copy.h"
 
 #if OPAL_CUDA_SUPPORT
-#    include "opal/mca/common/cuda/common_cuda.h"
+#    include "opal/util/opal_cuda_copy.h"
 
 #    undef MEM_OP_NAME
 #    define MEM_OP_NAME non_overlap_cuda
diff --git a/opal/datatype/opal_datatype_pack_unpack_predefined.h b/opal/datatype/opal_datatype_pack_unpack_predefined.h
index 261f2d48adf..a1d6ccde732 100644
--- a/opal/datatype/opal_datatype_pack_unpack_predefined.h
+++ b/opal/datatype/opal_datatype_pack_unpack_predefined.h
@@ -54,7 +54,7 @@
 #define OPAL_DATATYPE_PACK_UNPACK_PREDEFINED_H_HAS_BEEN_INCLUDED
 
 #include "opal_config.h"
-#include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/util/opal_cuda_copy.h"
 #include <stdint.h>
 
 /*  Improve predefined pack/unpack performance using mpich methods.
diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c
index 2fdc4b100e3..fc302c690dd 100644
--- a/opal/mca/common/cuda/common_cuda.c
+++ b/opal/mca/common/cuda/common_cuda.c
@@ -34,6 +34,7 @@
 #include "opal/align.h"
 #include "opal/datatype/opal_convertor.h"
 #include "opal/util/argv.h"
+#include "opal/util/opal_cuda_copy.h"
 #include "opal/util/output.h"
 #include "opal/util/printf.h"
 #include "opal/util/proc.h"
@@ -2054,256 +2055,4 @@ void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg)
     }
 }
 
-static bool initialized = false;
-int opal_cuda_verbose = 0;
-static int opal_cuda_enabled = 0; /* Starts out disabled */
-static int opal_cuda_output = 0;
-static void opal_cuda_support_init(void);
-static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL;
-static opal_common_cuda_function_table_t ftable;
-
-/* This function allows the common cuda code to register an
- * initialization function that gets called the first time an attempt
- * is made to send or receive a GPU pointer.  This allows us to delay
- * some CUDA initialization until after MPI_Init().
- */
-void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *))
-{
-    common_cuda_initialization_function = fptr;
-}
-
-/**
- * This function is called when a convertor is instantiated.  It has to call
- * the opal_cuda_support_init() function once to figure out if CUDA support
- * is enabled or not.  If CUDA is not enabled, then short circuit out
- * for all future calls.
- */
-void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf)
-{
-    /* Only do the initialization on the first GPU access */
-    if (!initialized) {
-        opal_cuda_support_init();
-    }
-
-    /* This is needed to handle case where convertor is not fully initialized
-     * like when trying to do a sendi with convertor on the statck */
-    convertor->cbmemcpy = (memcpy_fct_t) &opal_cuda_memcpy;
-
-    /* If not enabled, then nothing else to do */
-    if (!opal_cuda_enabled) {
-        return;
-    }
-
-    if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) {
-        convertor->flags |= CONVERTOR_CUDA;
-    }
-}
-
-/* Checks the type of pointer
- *
- * @param dest   One pointer to check
- * @param source Another pointer to check
- */
-bool opal_cuda_check_bufs(char *dest, char *src)
-{
-    /* Only do the initialization on the first GPU access */
-    if (!initialized) {
-        opal_cuda_support_init();
-    }
-
-    if (!opal_cuda_enabled) {
-        return false;
-    }
-
-    if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) {
-        return true;
-    } else {
-        return false;
-    }
-}
-
-/*
- * With CUDA enabled, all contiguous copies will pass through this function.
- * Therefore, the first check is to see if the convertor is a GPU buffer.
- * Note that if there is an error with any of the CUDA calls, the program
- * aborts as there is no recovering.
- */
-
-/* Checks the type of pointer
- *
- * @param buf   check one pointer providing a convertor.
- *  Provides aditional information, e.g. managed vs. unmanaged GPU buffer
- */
-bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor)
-{
-    /* Only do the initialization on the first GPU access */
-    if (!initialized) {
-        opal_cuda_support_init();
-    }
-
-    if (!opal_cuda_enabled) {
-        return false;
-    }
-
-    return (ftable.gpu_is_gpu_buffer(buf, convertor));
-}
-
-/*
- * This function allocates a buffer using either cuMemAlloc
- * or malloc, depending on if the convertor flag CONVERTOR_CUDA
- * is set.
- *
- * @param size       Size of buffer to be allocated
- * @param convertor  The convertor with flags describing if the buf
- *                   should be a Host or Cuda buffer.
- *
- * @returns void *   A pointer to the newly allocated buffer.
- */
-void *opal_cuda_malloc(size_t size, opal_convertor_t *convertor)
-{
-    int res;
-    void *buffer;
-    if (!(convertor->flags & CONVERTOR_CUDA)) {
-        return malloc(size);
-    }
-    res = ftable.gpu_malloc(buffer, size);
-    if (res != 0) {
-        opal_output(0, "CUDA: Error in cuMemAlloc: size=%d", (int) size);
-        abort();
-    } else {
-        return buffer;
-    }
-}
-
-/*
- * This function frees a buffer using either cuMemFree() or free(),
- * depending on if the convertor flag CONVERTOR_CUDA is set.
- *
- * @param buffer     Pointer to buffer to be freed
- * @param convertor  The convertor with flags describing if the buf
- *                   should be a Host or Cuda buffer.
- *
- */
-void opal_cuda_free(void *buffer, opal_convertor_t *convertor)
-{
-    int res;
-    if (!(convertor->flags & CONVERTOR_CUDA)) {
-        free(buffer);
-        return;
-    }
-    res = ftable.gpu_free(buffer);
-    if (res != 0) {
-        opal_output(0, "CUDA: Error in cuMemFree: ptr=%p", buffer);
-        abort();
-    }
-    return;
-}
-
-/*
- * With CUDA enabled, all contiguous copies will pass through this function.
- * Therefore, the first check is to see if the convertor is a GPU buffer.
- * Note that if there is an error with any of the CUDA calls, the program
- * aborts as there is no recovering.
- */
-
-void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t *convertor)
-{
-    int res;
-
-    if (!(convertor->flags & CONVERTOR_CUDA)) {
-        return memcpy(dest, src, size);
-    }
-
-    if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
-        res = ftable.gpu_cu_memcpy_async(dest, (void *) src, size, convertor);
-    } else {
-        res = ftable.gpu_cu_memcpy(dest, (void *) src, size);
-    }
-
-    if (res != 0) {
-        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src,
-                    (int) size);
-        abort();
-    } else {
-        return dest;
-    }
-}
-
-/*
- * This function is needed in cases where we do not have contiguous
- * datatypes.  The current code has macros that cannot handle a convertor
- * argument to the memcpy call.
- */
-void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size)
-{
-    int res;
-    res = ftable.gpu_cu_memcpy(dest, src, size);
-    if (res != 0) {
-        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src,
-                    (int) size);
-        abort();
-    } else {
-        return dest;
-    }
-}
-
-/*
- * In some cases, need an implementation of memmove.  This is not fast, but
- * it is not often needed.
- */
-void *opal_cuda_memmove(void *dest, void *src, size_t size)
-{
-    int res;
-
-    res = ftable.gpu_memmove(dest, src, size);
-    if (res != 0) {
-        opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d", res, dest,
-                    src, (int) size);
-        abort();
-    }
-    return dest;
-}
-
-/**
- * This function gets called once to check if the program is running in a cuda
- * environment.
- */
-static void opal_cuda_support_init(void)
-{
-    if (initialized) {
-        return;
-    }
-
-    /* Set different levels of verbosity in the cuda related code. */
-    opal_cuda_output = opal_output_open(NULL);
-    opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose);
-
-    /* Callback into the common cuda initialization routine. This is only
-     * set if some work had been done already in the common cuda code.*/
-    if (NULL != common_cuda_initialization_function) {
-        if (0 == common_cuda_initialization_function(&ftable)) {
-            opal_cuda_enabled = 1;
-        }
-    }
-
-    if (1 == opal_cuda_enabled) {
-        opal_output_verbose(10, opal_cuda_output,
-                            "CUDA: enabled successfully, CUDA device pointers will work");
-    } else {
-        opal_output_verbose(10, opal_cuda_output,
-                            "CUDA: not enabled, CUDA device pointers will not work");
-    }
-
-    initialized = true;
-}
-
-/**
- * Tell the convertor that copies will be asynchronous CUDA copies.  The
- * flags are cleared when the convertor is reinitialized.
- */
-void opal_cuda_set_copy_function_async(opal_convertor_t *convertor, void *stream)
-{
-    convertor->flags |= CONVERTOR_CUDA_ASYNC;
-    convertor->stream = stream;
-}
 #endif /* OPAL_CUDA_GDR_SUPPORT */
diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h
index 0889551640d..80781047f06 100644
--- a/opal/mca/common/cuda/common_cuda.h
+++ b/opal/mca/common/cuda/common_cuda.h
@@ -111,29 +111,4 @@ static inline int32_t opal_convertor_cuda_need_buffers(opal_convertor_t *pConver
     pConvertor->flags |= cudaflag; /* Restore CUDA flag */
     return retval;
 }
-
-/* Structure to hold CUDA support functions that gets filled in when the
- * common cuda code is initialized.  This removes any dependency on <cuda.h>
- * in the opal cuda datatype code. */
-struct opal_common_cuda_function_table {
-    int (*gpu_is_gpu_buffer)(const void *, opal_convertor_t *);
-    int (*gpu_cu_memcpy_async)(void *, const void *, size_t, opal_convertor_t *);
-    int (*gpu_cu_memcpy)(void *, const void *, size_t);
-    int (*gpu_memmove)(void *, void *, size_t);
-    int (*gpu_malloc)(void *, size_t);
-    int (*gpu_free)(void *);
-};
-typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t;
-
-void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf);
-bool opal_cuda_check_bufs(char *dest, char *src);
-bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor);
-void *opal_cuda_malloc(size_t size, opal_convertor_t *convertor);
-void opal_cuda_free(void *buffer, opal_convertor_t *convertor);
-void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t *convertor);
-void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size);
-void *opal_cuda_memmove(void *dest, void *src, size_t size);
-void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *));
-void opal_cuda_set_copy_function_async(opal_convertor_t *convertor, void *stream);
-
 #endif /* OPAL_MCA_COMMON_CUDA_H */
diff --git a/opal/util/Makefile.am b/opal/util/Makefile.am
index 4a457d7e1b3..e09818182f0 100644
--- a/opal/util/Makefile.am
+++ b/opal/util/Makefile.am
@@ -124,6 +124,12 @@ if OPAL_COMPILE_TIMING
 libopalutil_la_SOURCES += timings.c
 endif
 
+# If we have cuda support, add the CUDA copy helpers to the file lists
+if OPAL_cuda_support
+libopalutil_la_SOURCES += opal_cuda_copy.c
+headers += opal_cuda_copy.h
+endif
+
 libopalutil_la_LIBADD = \
         keyval/libopalutilkeyval.la
 libopalutil_la_DEPENDENCIES = \
diff --git a/opal/util/opal_cuda_copy.c b/opal/util/opal_cuda_copy.c
new file mode 100644
index 00000000000..26482b553f0
--- /dev/null
+++ b/opal/util/opal_cuda_copy.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2011-2014 NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates.  All Rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "opal_config.h"
+
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "opal/align.h"
+#include "opal/util/output.h"
+#include "opal/datatype/opal_convertor.h"
+#include "opal/util/opal_cuda_copy.h"
+
+static bool initialized = false; /* Set once opal_cuda_support_init() has run */
+int opal_cuda_verbose = 0; /* Verbosity applied to opal_cuda_output at init */
+static int opal_cuda_enabled = 0; /* Starts out disabled */
+static int opal_cuda_output = 0; /* Stream id returned by opal_output_open() */
+static void opal_cuda_support_init(void);
+static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL;
+static opal_common_cuda_function_table_t ftable; /* Handed to the init callback */
+
+/* Register the callback that opal_cuda_support_init() invokes, at most once,
+ * the first time one of the buffer checks in this file is reached.  The
+ * callback is handed the function table and has to return 0 for CUDA
+ * handling to be enabled.  Registering it up front allows us to delay
+ * some CUDA initialization until after MPI_Init(). */
+void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *))
+{
+    common_cuda_initialization_function = fptr;
+}
+
+/**
+ * Hook run every time a convertor is prepared for a user buffer.  The first
+ * invocation triggers opal_cuda_support_init(), which determines once
+ * whether CUDA support is enabled.  When it is not, every later call
+ * short circuits: the buffer is never inspected and only the copy callback
+ * gets installed.
+ */
+void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf)
+{
+    /* One-time lazy setup, performed on the first call */
+    if (!initialized) {
+        opal_cuda_support_init();
+    }
+
+    /* Install the copy callback unconditionally.  This covers convertors
+     * that are not fully initialized, such as a convertor living on the
+     * stack during a sendi. */
+    convertor->cbmemcpy = (memcpy_fct_t) &opal_cuda_memcpy;
+
+    /* Tag the convertor as CUDA only when support is enabled and the
+     * registered callback reports the user buffer as a GPU buffer.  The
+     * short-circuit keeps the callback from running while disabled. */
+    if (opal_cuda_enabled && ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) {
+        convertor->flags |= CONVERTOR_CUDA;
+    }
+}
+
+/* Report whether at least one of two pointers refers to a GPU buffer.
+ *
+ * @param dest   One pointer to check
+ * @param src    Another pointer to check
+ *
+ * @returns false when CUDA support is not enabled; otherwise true if the
+ *          registered callback identifies either pointer as a GPU buffer.
+ */
+bool opal_cuda_check_bufs(char *dest, char *src)
+{
+    /* One-time lazy setup, performed on the first call */
+    if (!initialized) {
+        opal_cuda_support_init();
+    }
+
+    /* While CUDA is disabled the function table is not consulted */
+    if (!opal_cuda_enabled) {
+        return false;
+    }
+
+    return ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL);
+}
+
+/* Check a single pointer, together with the convertor it belongs to.
+ *
+ * @param buf        The pointer to check
+ * @param convertor  Convertor handed through to the registered callback;
+ *                   provides additional information, e.g. managed vs.
+ *                   unmanaged GPU buffer
+ *
+ * @returns false when CUDA support is not enabled; otherwise the callback's
+ *          verdict on whether buf is a GPU buffer.
+ */
+bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor)
+{
+    bool on_gpu = false;
+
+    /* One-time lazy setup, performed on the first call */
+    if (!initialized) {
+        opal_cuda_support_init();
+    }
+
+    /* Only consult the function table once CUDA support is enabled */
+    if (opal_cuda_enabled) {
+        on_gpu = ftable.gpu_is_gpu_buffer(buf, convertor);
+    }
+    return on_gpu;
+}
+
+/*
+ * This function allocates a buffer using either cuMemAlloc (through the
+ * registered callback) or malloc, depending on if the convertor flag
+ * CONVERTOR_CUDA is set.  A failing GPU allocation is logged and aborts.
+ *
+ * @param size       Size of buffer to be allocated
+ * @param convertor  The convertor with flags describing if the buf
+ *                   should be a Host or Cuda buffer.
+ *
+ * @returns void *   A pointer to the newly allocated buffer.
+ */
+void *opal_cuda_malloc(size_t size, opal_convertor_t *convertor)
+{
+    void *buffer = NULL;
+
+    if (!(convertor->flags & CONVERTOR_CUDA)) {
+        return malloc(size);
+    }
+    /* Pass the address of buffer: handed over by value, the callback could
+     * never report the new pointer back (assumes an out-parameter; confirm). */
+    if (0 != ftable.gpu_malloc(&buffer, size)) {
+        opal_output(0, "CUDA: Error in cuMemAlloc: size=%zu", size);
+        abort();
+    }
+    return buffer;
+}
+
+/*
+ * Release a buffer using either cuMemFree() (through the registered
+ * callback) or free(), depending on if the convertor flag CONVERTOR_CUDA
+ * is set.  A failing GPU release is logged and aborts the process.
+ *
+ * @param buffer     Pointer to buffer to be freed
+ * @param convertor  The convertor with flags describing if the buf
+ *                   should be a Host or Cuda buffer.
+ *
+ */
+void opal_cuda_free(void *buffer, opal_convertor_t *convertor)
+{
+    if (convertor->flags & CONVERTOR_CUDA) {
+        /* GPU buffer: a failure here is treated as unrecoverable */
+        if (0 != ftable.gpu_free(buffer)) {
+            opal_output(0, "CUDA: Error in cuMemFree: ptr=%p", buffer);
+            abort();
+        }
+    } else {
+        /* Host buffer: plain libc release */
+        free(buffer);
+    }
+}
+
+/*
+ * Copy routine installed as the convertor's cbmemcpy callback, so with CUDA
+ * enabled all contiguous copies will pass through this function.  Host-only
+ * convertors fall straight through to memcpy().  Note that if there is an
+ * error with any of the CUDA calls, the program aborts as there is no
+ * recovering.
+ */
+
+void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t *convertor)
+{
+    int rc;
+
+    /* Fast path: the convertor was not flagged as handling a GPU buffer */
+    if (0 == (convertor->flags & CONVERTOR_CUDA)) {
+        return memcpy(dest, src, size);
+    }
+
+    /* Pick the asynchronous variant when the convertor asked for it */
+    rc = (convertor->flags & CONVERTOR_CUDA_ASYNC)
+             ? ftable.gpu_cu_memcpy_async(dest, src, size, convertor)
+             : ftable.gpu_cu_memcpy(dest, src, size);
+    if (0 != rc) {
+        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", rc, dest, src,
+                    (int) size);
+        abort();
+    }
+
+    return dest;
+}
+
+/*
+ * Synchronous copy that takes no convertor.  It is needed for datatypes
+ * that are not contiguous, where the copy macros in use cannot pass a
+ * convertor argument along.  Always goes through the GPU copy callback.
+ */
+void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size)
+{
+    const int rc = ftable.gpu_cu_memcpy(dest, src, size);
+
+    if (0 != rc) {
+        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", rc, dest, src,
+                    (int) size);
+        abort();
+    }
+
+    return dest;
+}
+
+/*
+ * memmove() counterpart routed through the registered GPU callback.  This
+ * is not fast, but it is not often needed.  Failure aborts the process.
+ */
+void *opal_cuda_memmove(void *dest, void *src, size_t size)
+{
+    const int rc = ftable.gpu_memmove(dest, src, size);
+
+    if (0 == rc) {
+        return dest;
+    }
+
+    opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d", rc, dest, src,
+                (int) size);
+    abort();
+}
+
+/**
+ * One-shot probe that decides whether CUDA handling is active.  Every call
+ * after the first one returns immediately.
+ */
+static void opal_cuda_support_init(void)
+{
+    if (initialized) {
+        return;
+    }
+
+    /* Open a dedicated output stream so the cuda related code can be given
+     * its own verbosity level. */
+    opal_cuda_output = opal_output_open(NULL);
+    opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose);
+
+    /* Call back into the common cuda initialization routine, if one was
+     * registered; it has to return 0 for CUDA support to be switched on. */
+    if (NULL != common_cuda_initialization_function
+        && 0 == common_cuda_initialization_function(&ftable)) {
+        opal_cuda_enabled = 1;
+    }
+
+    if (1 != opal_cuda_enabled) {
+        opal_output_verbose(10, opal_cuda_output,
+                            "CUDA: not enabled, CUDA device pointers will not work");
+    } else {
+        opal_output_verbose(10, opal_cuda_output,
+                            "CUDA: enabled successfully, CUDA device pointers will work");
+    }
+
+    initialized = true;
+}
+
+/**
+ * Record the stream on the convertor and mark it so that its copies will be
+ * asynchronous CUDA copies.  The flags are cleared when the convertor is
+ * reinitialized.
+ */
+void opal_cuda_set_copy_function_async(opal_convertor_t *convertor, void *stream)
+{
+    convertor->stream = stream;
+    convertor->flags |= CONVERTOR_CUDA_ASYNC;
+}
diff --git a/opal/util/opal_cuda_copy.h b/opal/util/opal_cuda_copy.h
new file mode 100644
index 00000000000..d9065f4a82d
--- /dev/null
+++ b/opal/util/opal_cuda_copy.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2011-2014 NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates.  All Rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OPAL_CUDA_COPY_H
+#define OPAL_CUDA_COPY_H
+
+/* CUDA support callbacks, filled in when the common cuda code is initialized; this keeps
+ * <cuda.h> out of the opal cuda datatype code.  NOTE(review): this header includes nothing,
+ * so opal_convertor_t, bool and size_t must already be declared by whoever includes it. */
+struct opal_common_cuda_function_table {
+    int (*gpu_is_gpu_buffer)(const void *, opal_convertor_t *); /* nonzero: GPU buffer */
+    int (*gpu_cu_memcpy_async)(void *, const void *, size_t, opal_convertor_t *); /* 0: ok */
+    int (*gpu_cu_memcpy)(void *, const void *, size_t); /* 0: ok */
+    int (*gpu_memmove)(void *, void *, size_t); /* 0: ok */
+    int (*gpu_malloc)(void *, size_t); /* 0: ok */
+    int (*gpu_free)(void *); /* 0: ok */
+};
+typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t;
+
+void mca_cuda_convertor_init(opal_convertor_t *convertor, const void *pUserBuf);
+bool opal_cuda_check_bufs(char *dest, char *src);
+bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor);
+void *opal_cuda_malloc(size_t size, opal_convertor_t *convertor);
+void opal_cuda_free(void *buffer, opal_convertor_t *convertor);
+void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t *convertor);
+void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size);
+void *opal_cuda_memmove(void *dest, void *src, size_t size);
+void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *));
+void opal_cuda_set_copy_function_async(opal_convertor_t *convertor, void *stream);
+
+#endif /* OPAL_CUDA_COPY_H */