- 
                Notifications
    You must be signed in to change notification settings 
- Fork 928
Add CUDA/HIP implementations of reduction operators #12569
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
4b8da14
              13aeecf
              bc5c3a1
              c2c5aec
              606f778
              37c5dad
              4d4d629
              9fe6351
              60cc5aa
              46fbda1
              730102b
              c200c02
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
| dnl -*- autoconf -*- | ||
| dnl | ||
| dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana | ||
| dnl University Research and Technology | ||
| dnl Corporation. All rights reserved. | ||
| dnl Copyright (c) 2004-2005 The University of Tennessee and The University | ||
| dnl of Tennessee Research Foundation. All rights | ||
| dnl reserved. | ||
| dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, | ||
| dnl University of Stuttgart. All rights reserved. | ||
| dnl Copyright (c) 2004-2005 The Regents of the University of California. | ||
| dnl All rights reserved. | ||
| dnl Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved. | ||
| dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. | ||
| dnl Copyright (c) 2009 IBM Corporation. All rights reserved. | ||
| dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights | ||
| dnl reserved. | ||
| dnl Copyright (c) 2009-2011 Oak Ridge National Labs. All rights reserved. | ||
| dnl Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved. | ||
| dnl Copyright (c) 2015 Research Organization for Information Science | ||
| dnl and Technology (RIST). All rights reserved. | ||
| dnl Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. | ||
| dnl $COPYRIGHT$ | ||
| dnl | ||
| dnl Additional copyrights may follow | ||
| dnl | ||
| dnl $HEADER$ | ||
| dnl | ||
|  | ||
|  | ||
| # OPAL_CHECK_CUDART(prefix, [action-if-found], [action-if-not-found]) | ||
| # -------------------------------------------------------- | ||
| # check if CUDA runtime library support can be found. sets prefix_{CPPFLAGS, | ||
| # LDFLAGS, LIBS} as needed and runs action-if-found if there is | ||
| # support, otherwise executes action-if-not-found | ||
|  | ||
| # | ||
| # Check for CUDA support | ||
| # | ||
| AC_DEFUN([OPAL_CHECK_CUDART],[ | ||
|     OPAL_VAR_SCOPE_PUSH([cudart_save_CPPFLAGS cudart_save_LDFLAGS cudart_save_LIBS]) | ||
|  | ||
|     # Remember the caller's flags so the probes below leave no trace | ||
|     cudart_save_CPPFLAGS="$CPPFLAGS" | ||
|     cudart_save_LDFLAGS="$LDFLAGS" | ||
|     cudart_save_LIBS="$LIBS" | ||
|  | ||
|     # Start from a known state so every later test sees either yes or no | ||
|     opal_check_cudart_happy=no | ||
|  | ||
|     # | ||
|     # Check to see if the user provided paths for CUDART | ||
|     # | ||
|     AC_ARG_WITH([cudart], | ||
|                 [AS_HELP_STRING([--with-cudart=DIR], | ||
|                                 [Path to the CUDA runtime library and header files])]) | ||
|     AC_ARG_WITH([cudart-libdir], | ||
|                 [AS_HELP_STRING([--with-cudart-libdir=DIR], | ||
|                                 [Search for CUDA runtime libraries in DIR])]) | ||
|  | ||
|     #################################### | ||
|     #### Check for CUDA runtime library | ||
|     #################################### | ||
|     # The result of this check is printed before AC_CHECK_HEADERS runs | ||
|     # because that macro emits its own checking/result pair | ||
|     AC_MSG_CHECKING([if --with-cudart is set]) | ||
|     AS_IF([test "x$with_cudart" = "xno" || test "x$with_cudart" = "x"], | ||
|           [AC_MSG_RESULT([not set (--with-cudart=$with_cudart)])], | ||
|           [AS_IF([test ! -d "$with_cudart"], | ||
|                  [AC_MSG_RESULT([not found]) | ||
|                   AC_MSG_WARN([Directory $with_cudart not found])], | ||
|                  [AC_MSG_RESULT([found ($with_cudart)]) | ||
|                   OPAL_FLAGS_APPEND_UNIQ([CPPFLAGS], [-I$with_cudart/include]) | ||
|                   AC_CHECK_HEADERS([cuda_runtime.h], | ||
|                                    [opal_check_cudart_happy=yes | ||
|                                     opal_cudart_incdir="$with_cudart/include"], | ||
|                                    [AC_MSG_WARN([Could not find cuda_runtime.h in $with_cudart/include])])])]) | ||
|     CPPFLAGS="$cudart_save_CPPFLAGS" | ||
|  | ||
|     # try include path relative to nvcc | ||
|     # This fallback is only attempted when --with-cudart was not given at all | ||
|     AS_IF([test "$opal_check_cudart_happy" = "no" && test "x$with_cudart" = "x"], | ||
|           [AC_PATH_PROG([nvcc_bin], [nvcc], ["not-found"]) | ||
|            AS_IF([test "$nvcc_bin" = "not-found"], | ||
|                  [AC_MSG_WARN([Could not find nvcc binary])], | ||
|                  [nvcc_dirname=`AS_DIRNAME([$nvcc_bin])` | ||
|                   OPAL_FLAGS_APPEND_UNIQ([CPPFLAGS], [-I$nvcc_dirname/../include]) | ||
|                   AC_CHECK_HEADERS([cuda_runtime.h], | ||
|                                    [opal_check_cudart_happy=yes | ||
|                                     with_cudart=$nvcc_dirname/../ | ||
|                                     opal_cudart_incdir="$with_cudart/include"])])]) | ||
|     CPPFLAGS="$cudart_save_CPPFLAGS" | ||
|  | ||
|     # Default the library directory to lib64 below the selected prefix | ||
|     # but only once a prefix has actually been validated | ||
|     AS_IF([test "$opal_check_cudart_happy" = "yes" && test "x$with_cudart_libdir" = "x"], | ||
|           [with_cudart_libdir=$with_cudart/lib64/]) | ||
|  | ||
|     # Confirm that the runtime library links and provides cudaMalloc | ||
|     AS_IF([test "$opal_check_cudart_happy" = "yes"], | ||
|           [OAC_CHECK_PACKAGE([cudart], | ||
|                              [$1], | ||
|                              [cuda_runtime.h], | ||
|                              [cudart], | ||
|                              [cudaMalloc], | ||
|                              [opal_check_cudart_happy="yes"], | ||
|                              [opal_check_cudart_happy="no"])]) | ||
|  | ||
|  | ||
|     AC_MSG_CHECKING([if have cuda runtime library support]) | ||
|     AS_IF([test "$opal_check_cudart_happy" = "yes"], | ||
|           [AC_MSG_RESULT([yes (-I$opal_cudart_incdir)]) | ||
|            CUDART_SUPPORT=1 | ||
|            common_cudart_CPPFLAGS="-I$opal_cudart_incdir"], | ||
|           [AC_MSG_RESULT([no]) | ||
|            CUDART_SUPPORT=0]) | ||
|     AC_SUBST([common_cudart_CPPFLAGS]) | ||
|  | ||
|  | ||
|     OPAL_SUMMARY_ADD([Accelerators], [CUDART support], [], [$opal_check_cudart_happy]) | ||
|     AM_CONDITIONAL([OPAL_cudart_support], [test "x$CUDART_SUPPORT" = "x1"]) | ||
|     AC_DEFINE_UNQUOTED([OPAL_CUDART_SUPPORT], [$CUDART_SUPPORT], | ||
|                        [Whether we have cuda runtime library support]) | ||
|  | ||
|     CPPFLAGS="$cudart_save_CPPFLAGS" | ||
|     LDFLAGS="$cudart_save_LDFLAGS" | ||
|     LIBS="$cudart_save_LIBS" | ||
|  | ||
|     # Run the caller supplied action-if-found or action-if-not-found | ||
|     AS_IF([test "$opal_check_cudart_happy" = "yes"], | ||
|           [$2], | ||
|           [$3]) | ||
|  | ||
|     OPAL_VAR_SCOPE_POP | ||
| ])dnl | ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,48 @@ | ||
| dnl -*- autoconf -*- | ||
| dnl | ||
| dnl Copyright (c) 2024 Stony Brook University. All rights reserved. | ||
| dnl | ||
| dnl $COPYRIGHT$ | ||
| dnl | ||
| dnl Additional copyrights may follow | ||
| dnl | ||
| dnl $HEADER$ | ||
| dnl | ||
|  | ||
| dnl | ||
| dnl Check for HIPCC and bail out if HIPCC was requested but not found | ||
| dnl Options provided: | ||
| dnl --with-hipcc[=path/to/hipcc]: provide a path to HIPCC | ||
| dnl --enable-hipcc: require HIPCC, bail out if not found | ||
| dnl | ||
|  | ||
| AC_DEFUN([OPAL_CHECK_HIPCC],[ | ||
|  | ||
|     AC_ARG_ENABLE([hipcc], | ||
|         [AS_HELP_STRING([--enable-hipcc], | ||
|             [Force configure to fail if hipcc is not found (hipcc is used to build HIP operator support).])]) | ||
|  | ||
|     AC_ARG_WITH([hipcc], | ||
|         [AS_HELP_STRING([--with-hipcc=PATH], | ||
|             [Path to the HIP compiler])]) | ||
|  | ||
|     # --with-hipcc given without a value yields "yes" and --without-hipcc | ||
|     # yields "no"; neither is a compiler path, so only a real value is used | ||
|     AS_IF([test -n "$with_hipcc" && test "$with_hipcc" != "yes" && test "$with_hipcc" != "no"], | ||
|           [HIPCC=$with_hipcc]) | ||
|  | ||
|     # try to find hipcc in PATH | ||
|     AS_IF([test -z "$HIPCC"], | ||
|           [AC_PATH_PROG([HIPCC], [hipcc], [])]) | ||
|  | ||
|     # disable support if explicitly specified | ||
|     AS_IF([test "$enable_hipcc" = "no" || test "$with_hipcc" = "no"], | ||
|           [HIPCC=]) | ||
|  | ||
|     # a missing compiler is fatal only when --enable-hipcc was requested | ||
|     AS_IF([test -z "$HIPCC" && test "$enable_hipcc" = "yes"], | ||
|           [AC_MSG_WARN([A suitable HIP compiler was not found, but --enable-hipcc=yes was specified]) | ||
|            AC_MSG_ERROR([Cannot continue])]) | ||
|  | ||
|     OPAL_SUMMARY_ADD([Accelerators], [HIPCC compiler], [], [$HIPCC (flags: $HIPCCFLAGS)]) | ||
|  | ||
|     AC_ARG_VAR([HIPCC], [AMD HIP compiler]) | ||
|     AC_ARG_VAR([HIPCCFLAGS], [AMD HIP compiler flags]) | ||
|  | ||
| ]) | 
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,56 @@ | ||||||
| dnl -*- autoconf -*- | ||||||
| dnl | ||||||
| dnl Copyright (c) 2024 Stony Brook University. All rights reserved. | ||||||
| dnl | ||||||
| dnl $COPYRIGHT$ | ||||||
| dnl | ||||||
| dnl Additional copyrights may follow | ||||||
| dnl | ||||||
| dnl $HEADER$ | ||||||
| dnl | ||||||
|  | ||||||
| dnl | ||||||
| dnl Check for NVCC and bail out if NVCC was requested but not found | ||||||
| dnl Options provided: | ||||||
| dnl --with-nvcc[=path/to/nvcc]: provide a path to NVCC | ||||||
| dnl --enable-nvcc: require NVCC, bail out if not found | ||||||
| dnl --nvcc-compute-arch: request a specific compute | ||||||
| dnl architecture for the operator | ||||||
| dnl kernels | ||||||
| dnl | ||||||
|  | ||||||
| AC_DEFUN([OPAL_CHECK_NVCC],[ | ||||||
|     AC_ARG_ENABLE([nvcc], | ||||||
|         [AS_HELP_STRING([--enable-nvcc], | ||||||
|             [Force configure to fail if CUDA nvcc is not found (CUDA nvcc is used to build CUDA operator support).])]) | ||||||
|     AC_ARG_WITH([nvcc], | ||||||
|         [AS_HELP_STRING([--with-nvcc=PATH], | ||||||
|             [Path to the CUDA compiler])]) | ||||||
|     # --with-nvcc given without a value yields "yes" and --without-nvcc | ||||||
|     # yields "no"; neither is a compiler path, so only a real value is used | ||||||
|     AS_IF([test -n "$with_nvcc" && test "$with_nvcc" != "yes" && test "$with_nvcc" != "no"], | ||||||
|           [NVCC=$with_nvcc]) | ||||||
|     # try to find nvcc in PATH | ||||||
|     AS_IF([test -z "$NVCC"], | ||||||
|           [AC_PATH_PROG([NVCC], [nvcc], [])]) | ||||||
|     # disable usage of NVCC if explicitly specified | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 
        Suggested change
       
 | ||||||
|     AS_IF([test "$enable_nvcc" = "no" || test "$with_nvcc" = "no"], | ||||||
|           [NVCC=]) | ||||||
|     # prepend C++17 standard, allow override by user | ||||||
|     # the value must be quoted, otherwise the shell would treat the first | ||||||
|     # word as a temporary assignment and try to run c++17 as a command | ||||||
|     AS_IF([test -n "$NVCCFLAGS"], | ||||||
|           [NVCCFLAGS="--std c++17 $NVCCFLAGS"], | ||||||
|           [NVCCFLAGS="--std c++17"]) | ||||||
|     # a missing compiler is fatal only when --enable-nvcc was requested | ||||||
|     AS_IF([test -z "$NVCC" && test "$enable_nvcc" = "yes"], | ||||||
|           [AC_MSG_WARN([A suitable CUDA compiler was not found, but --enable-nvcc=yes was specified]) | ||||||
|            AC_MSG_ERROR([Cannot continue])]) | ||||||
|     OPAL_SUMMARY_ADD([Accelerators], [NVCC compiler], [], [$NVCC (flags: $NVCCFLAGS)]) | ||||||
|     AC_ARG_VAR([NVCC], [NVIDIA CUDA compiler]) | ||||||
|     AC_ARG_VAR([NVCCFLAGS], [NVIDIA CUDA compiler flags]) | ||||||
| ]) | ||||||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -3,7 +3,7 @@ | |
| * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana | ||
| * University Research and Technology | ||
| * Corporation. All rights reserved. | ||
| * Copyright (c) 2004-2009 The University of Tennessee and The University | ||
| * Copyright (c) 2004-2023 The University of Tennessee and The University | ||
| * of Tennessee Research Foundation. All rights | ||
| * reserved. | ||
| * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, | ||
|  | @@ -152,22 +152,50 @@ int ompi_op_base_op_select(ompi_op_t *op) | |
| } | ||
|  | ||
| /* Copy over the non-NULL pointers */ | ||
| for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { | ||
|         
                  bosilca marked this conversation as resolved.
              Show resolved
            Hide resolved | ||
| /* 2-buffer variants */ | ||
| if (NULL != avail->ao_module->opm_fns[i]) { | ||
| OBJ_RELEASE(op->o_func.intrinsic.modules[i]); | ||
| op->o_func.intrinsic.fns[i] = avail->ao_module->opm_fns[i]; | ||
| op->o_func.intrinsic.modules[i] = avail->ao_module; | ||
| OBJ_RETAIN(avail->ao_module); | ||
| if (avail->ao_module->opm_device_enabled) { | ||
| if (NULL == op->o_device_op) { | ||
| op->o_device_op = calloc(1, sizeof(*op->o_device_op)); | ||
| } | ||
|  | ||
| /* 3-buffer variants */ | ||
| if (NULL != avail->ao_module->opm_3buff_fns[i]) { | ||
| OBJ_RELEASE(op->o_3buff_intrinsic.modules[i]); | ||
| op->o_3buff_intrinsic.fns[i] = | ||
| avail->ao_module->opm_3buff_fns[i]; | ||
| op->o_3buff_intrinsic.modules[i] = avail->ao_module; | ||
| OBJ_RETAIN(avail->ao_module); | ||
| for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adding an entire new type of ompi_op just to cater to the need for a stream is kind of ugly. I understand the desire to make them as flexible as possible, but in the context of MPI we handle a very restricted number of streams, and we expect the MPI_Op to always execute in a single stream. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally, we will come to a point where the user can provide us with a stream. We would then operate on that stream, so it makes sense to pass a stream into the operator. Are you suggesting we use a default stream across all of OMPI? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The user might configure some streams in OMPI, but not a stream per invocation of an MPI_Op. A stream per communicator would be a good addition, and we will figure out how to pass it down to operations not using communicators (such as the MPI_Op). But adding it as an explicit argument creates two MPI_Op API. I don't have a better idea right now, it is just that this approach requires too much code modification for very little added benefit. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. An alternative (the only I can think of) to explicit API pass-through is thread-local variables. That is hidden state, ugly and error-prone. In fact, we want to have both host-side and device-side incarnations of ops side-by-side because we don't know whether the user will pass us host or device buffers. So even if they had the same signature we would want to store them separately. 
I'm not sure that it would simplify in any meaningful way then. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I thought about thread-local but as you said it is error-prone and unsafe. I was more inclined toward a context-level storage solution, such as a communicator or maybe the collective itself, but something higher level than the MPI_Op. The reason is that at the end we will want to be able to orchestrate (and take advantage) of the dependencies between different parts of the same collective, and this is more natural if they share a stream. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The question of how the streams ends up in MPI is an interesting one (and I am favoring communicators as well). Somehow it needs to come from the high-level to the operator and I still favor the direct way of passing it as an argument. I just realized that when adding the  There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think you need to bump the version of the module/component struct as the other function pointer has the same signature. | ||
| /* 2-buffer variants */ | ||
| if (NULL != avail->ao_module->opm_stream_fns[i]) { | ||
| if (NULL != op->o_device_op->do_intrinsic.modules[i]) { | ||
| OBJ_RELEASE(op->o_device_op->do_intrinsic.modules[i]); | ||
| } | ||
| op->o_device_op->do_intrinsic.fns[i] = avail->ao_module->opm_stream_fns[i]; | ||
| op->o_device_op->do_intrinsic.modules[i] = avail->ao_module; | ||
| OBJ_RETAIN(avail->ao_module); | ||
| } | ||
|  | ||
| /* 3-buffer variants */ | ||
| if (NULL != avail->ao_module->opm_3buff_stream_fns[i]) { | ||
| if (NULL != op->o_device_op->do_3buff_intrinsic.modules[i]) { | ||
| OBJ_RELEASE(op->o_device_op->do_3buff_intrinsic.modules[i]); | ||
| } | ||
| op->o_device_op->do_3buff_intrinsic.fns[i] = | ||
| avail->ao_module->opm_3buff_stream_fns[i]; | ||
| op->o_device_op->do_3buff_intrinsic.modules[i] = avail->ao_module; | ||
| OBJ_RETAIN(avail->ao_module); | ||
| } | ||
| } | ||
| } else { | ||
| for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) { | ||
| /* 2-buffer variants */ | ||
| if (NULL != avail->ao_module->opm_fns[i]) { | ||
| OBJ_RELEASE(op->o_func.intrinsic.modules[i]); | ||
| op->o_func.intrinsic.fns[i] = avail->ao_module->opm_fns[i]; | ||
| op->o_func.intrinsic.modules[i] = avail->ao_module; | ||
| OBJ_RETAIN(avail->ao_module); | ||
| } | ||
|  | ||
| /* 3-buffer variants */ | ||
| if (NULL != avail->ao_module->opm_3buff_fns[i]) { | ||
| OBJ_RELEASE(op->o_3buff_intrinsic.modules[i]); | ||
| op->o_3buff_intrinsic.fns[i] = | ||
| avail->ao_module->opm_3buff_fns[i]; | ||
| op->o_3buff_intrinsic.modules[i] = avail->ao_module; | ||
| OBJ_RETAIN(avail->ao_module); | ||
| } | ||
| } | ||
| } | ||
|  | ||
|  | ||
Uh oh!
There was an error while loading. Please reload this page.