diff --git a/config/ompi_check_ubcl.m4 b/config/ompi_check_ubcl.m4 new file mode 100644 index 00000000000..d44a6e4b6cc --- /dev/null +++ b/config/ompi_check_ubcl.m4 @@ -0,0 +1,57 @@ +# -*- shell-script -*- +# +# Copyright (C) 2015-2017 Mellanox Technologies, Inc. +# All rights reserved. +# Copyright (c) 2015 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# Copyright (c) 2016 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved. +# Copyright (c) 2024-2025 Bull S.A.S. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# OMPI_CHECK_UBCL(prefix, [action-if-found], [action-if-not-found]) +# -------------------------------------------------------- +# check if UBCL support can be found. sets prefix_{CPPFLAGS, +# as needed and runs action-if-found if there is +# support, otherwise executes action-if-not-found +AC_DEFUN([OMPI_CHECK_UBCL],[ + OPAL_VAR_SCOPE_PUSH([ompi_check_ubcl_dir ompi_check_ubcl_happy]) + + m4_ifblank([$1], [m4_fatal([First argument to OMPI_CHECK_UBCL cannot be blank])]) + + AC_ARG_WITH([ubcl], + [AC_HELP_STRING([--with-ubcl(=DIR)], + [Build with UBCL support])]) + + # UBCL is dlopen'd to avoid direct link to libubcl.so. + # OAC_CHECK_PACKAGE would add this explicit link, so it cannot be used. 
+ # OPAL_CHECK_WITHDIR prints an error if the given path is invalid + OPAL_CHECK_WITHDIR([ubcl], [$with_ubcl], [include/ubcl_api.h]) + + AS_IF([test "$with_ubcl" == "no"], + [ompi_check_ubcl_happy="no"], + + [test -z "$with_ubcl"], + [ompi_check_ubcl_happy="no"], + + [ompi_check_ubcl_happy="yes" + $1_CPPFLAGS="${$1_CPPFLAGS} -I$with_ubcl/include/" + AC_MSG_NOTICE([$1_CPPFLAGS is set to: ${$1_CPPFLAGS}])]) + + + OPAL_SUMMARY_ADD([Transports],[UBCL],[],[$ompi_check_ubcl_happy]) + + AS_IF([test "$ompi_check_ubcl_happy" = "yes"], + [$2], + [$3]) + + OPAL_VAR_SCOPE_POP +]) + diff --git a/ompi/mca/common/ubcl/Makefile.am b/ompi/mca/common/ubcl/Makefile.am new file mode 100644 index 00000000000..0cd4eb083ef --- /dev/null +++ b/ompi/mca/common/ubcl/Makefile.am @@ -0,0 +1,94 @@ +# Copyright (c) 2025 Bull SAS. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +#AM_CPPFLAGS = $(common_ubcl_CPPFLAGS) + +common_ubcl_sources = \ + common_ubcl.c \ + common_ubcl.h + +lib_LTLIBRARIES = +noinst_LTLIBRARIES = + +#Common component naming is forced by MCA_PROCESS_COMPONENT in config/opal_mca.m4 +# to lib${_LIB_NAME}mca_common_ubcl.la but OMPI_LIB_NAME does not exist +# so let's hope that no other project name is empty or there are no other common +comp_inst = libmca_common_ubcl.la +comp_noinst = libmca_common_ubcl_noinst.la + +if MCA_BUILD_ompi_common_ubcl_DSO +lib_LTLIBRARIES += $(comp_inst) +else +noinst_LTLIBRARIES += $(comp_noinst) +endif + +libmca_common_ubcl_la_SOURCES = $(common_ubcl_sources) +libmca_common_ubcl_la_CFLAGS = $(common_ubcl_CFLAGS) +libmca_common_ubcl_la_CPPFLAGS = $(common_ubcl_CPPFLAGS) +libmca_common_ubcl_la_LDFLAGS = $(common_ubcl_LDFLAGS) +libmca_common_ubcl_la_LIBADD = $(common_ubcl_LIBS) \ + $(OPAL_TOP_BUILDDIR)/opal/mca/common/ubcl/lib@OPAL_LIB_NAME@mca_common_ubcl.la + +libmca_common_ubcl_noinst_la_SOURCES = $(common_ubcl_sources) + +# Conditionally install the header files + +if WANT_INSTALL_HEADERS +ompidir = 
$(ompiincludedir)/$(subdir) +ompi_HEADERS = common_ubcl.h +endif + + +# This library is linked against various MCA components. +# There's two cases: +# +# 1. libmca_common_ubcl.la is a shared library. By linking that shared +# library to all components that need it, the OS linker will +# automatically load it into the process as necessary, and there will +# only be one copy (i.e., all the components will share *one* copy of +# the code and data). +# +# 2. libmca_common_ubcl.la is a static library. In this case, it will +# be rolled up into the top-level libmpi.la. It will also be rolled +# into each component, but then the component will also be rolled up +# into the upper-level libmpi.la. Linkers universally know how to +# "figure this out" so that we end up with only one copy of the code +# and data. +# +# As per above, we'll either have an installable or noinst result. +# The installable one should follow the same MCA prefix naming rules +# (i.e., libmca__.la). The noinst one can be named +# whatever it wants, although libmca___noinst.la is +# recommended. + +# To simplify components that link to this library, we will *always* +# have an output libtool library named libmca__.la -- even +# for case 2) described above (i.e., so there's no conditional logic +# necessary in component Makefile.am's that link to this library). +# Hence, if we're creating a noinst version of this library (i.e., +# case 2), we sym link it to the libmca__.la name +# (libtool will do the Right Things under the covers). See the +# all-local and clean-local rules, below, for how this is effected. +# These two rules will sym link the "noinst" libtool library filename +# to the installable libtool library filename in the case where we are +# compiling this component statically (case 2), described above). 
+V=0 +OMPI_V_LN_SCOMP = $(ompi__v_LN_SCOMP_$V) +ompi__v_LN_SCOMP_ = $(ompi__v_LN_SCOMP_$AM_DEFAULT_VERBOSITY) +ompi__v_LN_SCOMP_0 = @echo " LN_S " `basename $(comp_inst)`; + +all-local: + $(OMPI_V_LN_SCOMP) if test -z "$(lib_LTLIBRARIES)"; then \ + rm -f "$(comp_inst)"; \ + $(LN_S) "$(comp_noinst)" "$(comp_inst)"; \ + fi + +clean-local: + if test -z "$(lib_LTLIBRARIES)"; then \ + rm -f "$(comp_inst)"; \ + fi diff --git a/ompi/mca/common/ubcl/common_ubcl.c b/ompi/mca/common/ubcl/common_ubcl.c new file mode 100644 index 00000000000..0e95c01d66a --- /dev/null +++ b/ompi/mca/common/ubcl/common_ubcl.c @@ -0,0 +1,173 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include + +#include "ompi/communicator/communicator.h" +#include "ompi/errhandler/errhandler.h" +#include "ompi/include/mpi.h" +#include "ompi/runtime/mpiruntime.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/mca/pml/pml_constants.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "opal/util/output.h" + +/* Default ompi_common_ubcl values */ +mca_ompi_common_ubcl_component_t mca_ompi_common_ubcl_component = { + .n_addr = 32, +}; + +static int mca_common_ubcl_find_rank(const struct ompi_communicator_t *comm, const uint64_t wrank) +{ + mca_pml_ubcl_comm_t *pml_comm = comm->c_pml_comm; + + if (NULL == comm->c_pml_comm) { + common_ubcl_error("UBCL error: no translation array in comm"); + abort(); + } + + for (uint32_t i = 0; i < pml_comm->size; i++) { + if (pml_comm->array[i] == wrank) { + return i; + } + } + + common_ubcl_error("UBCL error irank translation"); + + return 0; +} + +int mca_common_ubcl_get_mpi_rank(const int rank, const struct ompi_communicator_t *comm, + const uint64_t ubcl_rank) +{ + if (OMPI_ANY_SOURCE == rank) { + return mca_common_ubcl_find_rank(comm, 
ubcl_rank); + } else { + return rank; + } +} + +void mca_common_ubcl_status_to_ompi(ompi_status_public_t *status, + ubcl_status_t ubcl_status, + struct ompi_communicator_t *comm, int rank) +{ + if (MPI_STATUS_IGNORE != status) { + status->_cancelled = 0; //TODO output the information of cancel + status->_ucount = ubcl_status.size; + status->MPI_TAG = (int) ubcl_status.tag; + status->MPI_SOURCE = mca_common_ubcl_get_mpi_rank(rank, comm, ubcl_status.remote); + } +} + +int ubcl_error_to_ompi(ubcl_error_t code) +{ + int ret; + switch (code) { + case UBCL_SUCCESS: + ret = OPAL_SUCCESS; + break; + case UBCL_ERROR: + ret = OPAL_ERROR; + break; + case UBCL_ERR_RESOURCE_BUSY: + ret = OPAL_ERR_RESOURCE_BUSY; + break; + case UBCL_ERR_OUT_OF_RESOURCE: + ret = OPAL_ERR_OUT_OF_RESOURCE; + break; + case UBCL_ERR_NOT_IMPLEMENTED: + ret = OPAL_ERR_NOT_IMPLEMENTED; + break; + case UBCL_ERR_NOT_AVAILABLE: + ret = OPAL_ERR_NOT_AVAILABLE; + break; + case UBCL_ERR_TEMP_OUT_OF_RESOURCE: + ret = OPAL_ERR_TEMP_OUT_OF_RESOURCE; + break; + case UBCL_ERR_ARG_INVALID: + ret = OPAL_ERR_BAD_PARAM; + break; + case UBCL_ERR_TOO_LATE: + ret = OPAL_ERR_TIMEOUT; + break; + case UBCL_ERR_TRUNCATE: + ret = MPI_ERR_TRUNCATE; + break; + default: + ret = OPAL_ERROR; + break; + } + + return ret; +} + +void _mca_common_ubcl_error(char *filename, int line, int err, + char abort, int verbose, int output, + int is_init, int comp_verbose, + char *comp_name, char *format, ...) 
+{ + int n_addr = 0; + void **stack_buffer = NULL; + char **stack = NULL; + + stack_buffer = malloc(sizeof(void *) * mca_ompi_common_ubcl_component.n_addr); + n_addr = backtrace(stack_buffer, mca_ompi_common_ubcl_component.n_addr); + stack = backtrace_symbols(stack_buffer, n_addr); + + int char_per_line = 256; + int n_char = char_per_line * n_addr + 1024; + char *msg = malloc(n_char * sizeof(char)); + + if (NULL == stack || NULL == msg) { + /* Output small error */ + opal_output_verbose(verbose, output, + "========\n== ERROR: Not enough memory while outputting error...\n== " + "%s encountered an error (%d) at %s:%d\n========\n", + comp_name, err, filename, line); + } else { + /* Output full error */ + int current = 0; + current += snprintf(msg + current, n_char - current, + "========\n== %s encountered an error (%d) at %s:%d\n== %s:\n\t", + comp_name, err, filename, line, abort ? "ERROR" : "WARNING"); + va_list arglist; + va_start(arglist, format); + current += vsnprintf(msg + current, n_char - current, format, arglist); + va_end(arglist); + + current += snprintf(msg + current, n_char - current, "\n== STACK:\n"); + + for (int i = 0; i < n_addr; i++) { + size_t min_char = char_per_line < (n_char - current) ? 
char_per_line : n_char - current; + current += snprintf(msg + current, min_char, "= [%2d] %s\n", i, + stack[i]); + } + + if (is_init && output > 0) { + opal_output_verbose(verbose, output, + "%s========", msg); + } else if (abort || comp_verbose >= verbose) { + fprintf(stderr, "%s\n", msg); + fflush(stderr); + } + } + + if (abort) { + OMPI_ERRHANDLER_INVOKE(&ompi_mpi_comm_world.comm, err, stack[0]); + ompi_mpi_abort(&ompi_mpi_comm_world.comm, err); + } + + free(stack_buffer); + free(stack); + free(msg); +} diff --git a/ompi/mca/common/ubcl/common_ubcl.h b/ompi/mca/common/ubcl/common_ubcl.h new file mode 100644 index 00000000000..d23ccf3152d --- /dev/null +++ b/ompi/mca/common/ubcl/common_ubcl.h @@ -0,0 +1,44 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OMPI_MCA_COMMON_UBCL_H +#define OMPI_MCA_COMMON_UBCL_H + +#include + +#include "ompi/communicator/communicator.h" +#include "ompi/include/mpi.h" +#include "opal/mca/common/ubcl/common_ubcl.h" + +/* Holds common variable used in multiple UBCL components */ +struct mca_ompi_common_ubcl_component_s { + int n_addr; /**< Max number of void * addresses in printed stack*/ +}; +typedef struct mca_ompi_common_ubcl_component_s mca_ompi_common_ubcl_component_t; +extern mca_ompi_common_ubcl_component_t mca_ompi_common_ubcl_component; + +int mca_common_ubcl_get_mpi_rank(const int rank, const struct ompi_communicator_t *comm, + const uint64_t ubcl_rank); +void mca_common_ubcl_status_to_ompi(ompi_status_public_t *status, + ubcl_status_t ubcl_status, + struct ompi_communicator_t *comm, int rank); +int ubcl_error_to_ompi(ubcl_error_t code); +/* UBCL rank is on 61 bits, ompi jobid is 32bits, vpid must be truncated to 29bits */ +#define COMMON_UBCL_VPID_MAX (((1 << 29) - 1)) /* We need 3 bits for UBCL rank */ +#define PML_UBCL_JOBID_MAX (OPAL_JOBID_MAX) + +/* Error and 
warning output function used by UBCL components */ +void _mca_common_ubcl_error(char *filename, int line, int err, char abort, int verbose, + int output, int is_init, int comp_verbose, char *comp_name, + char *format, ...); + + +#endif /* OMPI_MCA_COMMON_UBCL_H */ + diff --git a/ompi/mca/common/ubcl/configure.m4 b/ompi/mca/common/ubcl/configure.m4 new file mode 100644 index 00000000000..42ba29cf67a --- /dev/null +++ b/ompi/mca/common/ubcl/configure.m4 @@ -0,0 +1,29 @@ +# -*- shell-script -*- +# +# Copyright (c) 2025 Bull S.A.S. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AC_DEFUN([MCA_ompi_common_ubcl_CONFIG],[ + AC_CONFIG_FILES([ompi/mca/common/ubcl/Makefile]) + + OMPI_CHECK_UBCL([ompi_common_ubcl], + [common_ubcl_happy="yes"], + [common_ubcl_happy="no"]) + + + AC_REQUIRE([MCA_opal_common_ubcl_CONFIG]) + + AS_IF([test "$common_ubcl_happy" = "yes"], + [$1], + [$2]) + + # substitute in the things needed to build ubcl + AC_SUBST([common_ubcl_CPPFLAGS]) + AC_SUBST([common_ubcl_LDFLAGS]) + AC_SUBST([common_ubcl_LIBS]) +])dnl diff --git a/ompi/mca/osc/ubcl/Makefile.am b/ompi/mca/osc/ubcl/Makefile.am new file mode 100644 index 00000000000..90a8b67d4f5 --- /dev/null +++ b/ompi/mca/osc/ubcl/Makefile.am @@ -0,0 +1,53 @@ +# Copyright (c) 2025 Bull SAS. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CPPFLAGS = $(osc_ubcl_CPPFLAGS) + +dist_ompidata_DATA = + +# EXTRA_DIST = post_configure.sh + +ubcl_sources = \ + osc_ubcl_utils.h \ + osc_ubcl.h \ + osc_ubcl.c \ + osc_ubcl_put.c \ + osc_ubcl_accumulate.c \ + osc_ubcl_datatype.c \ + osc_ubcl_info.c \ + osc_ubcl_info.h \ + osc_ubcl_get.c\ + osc_ubcl_request.c \ + osc_ubcl_request.h \ + osc_ubcl_sync.c \ + osc_ubcl_sync.h + +if MCA_BUILD_ompi_osc_ubcl_DSO +component_noinst = +component_install = mca_osc_ubcl.la +else +component_noinst = libmca_osc_ubcl.la +component_install = +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_osc_ubcl_la_SOURCES = $(ubcl_sources) +mca_osc_ubcl_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ + $(osc_ubcl_LIBS) \ + $(OPAL_TOP_BUILDDIR)/opal/mca/common/ubcl/lib@OPAL_LIB_NAME@mca_common_ubcl.la \ + $(OMPI_TOP_BUILDDIR)/ompi/mca/common/ubcl/libmca_common_ubcl.la + +mca_osc_ubcl_la_LDFLAGS = -module -avoid-version $(osc_ubcl_LDFLAGS) +mca_osc_ubcl_la_CPPFLAGS = -Wextra -Wall -Werror -Wno-unused-parameter -Wno-missing-field-initializers $(osc_ubcl_CPPFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_osc_ubcl_la_SOURCES = $(ubcl_sources) +libmca_osc_ubcl_la_LIBADD = $(osc_ubcl_LIBS) +libmca_osc_ubcl_la_LDFLAGS = -module -avoid-version $(osc_ubcl_LDFLAGS) +libmca_osc_ubcl_la_CPPFLAGS = $(mca_osc_ubcl_la_CPPFLAGS) diff --git a/ompi/mca/osc/ubcl/README.md b/ompi/mca/osc/ubcl/README.md new file mode 100644 index 00000000000..d26070e6ecf --- /dev/null +++ b/ompi/mca/osc/ubcl/README.md @@ -0,0 +1,379 @@ +# INTRODUCTION + +## How to use : +Configure using the `--with-ubcl=` command with a correct ubcl install. +It needs to find ubcl to build the component. +You cannot use the OSC/UBCL without the PML/UBCL as the former relies on the latter +for UBCL endpoints + +The OSC/UBCL also relies on the opal/mca/common/ubcl and the ompi/mca/common/ubcl. 
+ + + +# Architecture : + +## Most used data structures + +Current UCBL API (for osc calls) resembles that of the MPI specification. +The idea is to offload as much as possible to UBCL, all this component needs to do +is translate arguments and make valid calls to UBCL. + + +The component is shared to all windows and mainly contains fields to help print log, +for now. The generic inherited class also brings necessary data to open the component +and create windows/modules such as *osc_init*, *osc_select* or *osc_finish*. + + +```c +struct mca_osc_ubcl_module_s { + ompi_osc_base_module_t super; + struct ompi_communicator_t *comm; + struct ompi_win_t *win; + int64_t wid; + union {int *all; int uniq;} disp_unit; + ubcl_win_flags_t win_flags; + + uint32_t same_disp_unit:1; + uint32_t no_locks:1; + uint32_t padding_infos:30; + + ubcl_win_sync_type_t sync_type; + ubcl_win_sync_type_t *procs_sync_type; + int64_t passive_lock_refcount; + opal_mutex_t sync_lock; + + unsigned int nb_rank_waited; + struct ompi_group_t *active_sync_access_group; + struct ompi_group_t *active_sync_exposure_group; + + void *free_after; +}; +typedef struct mca_osc_ubcl_module_s mca_osc_ubcl_module_t; +``` + +The module is specific to one window and it holds in fact the fields necessary to +that window such as the parent classes for example. +The super field holds the available one-sided communications while win holds the necessary +data to compute what's needed for the window at a higher level than the osc/ubcl. +We fill the function pointers of super by coppying them from a template established +in `osc_ubcl_component.c` file. +It means in theory that we could deactivate or switch API calls to some other one-sided +function but in pratice every window have the same calls. +The communicator field is a duplicated communicator of the one that was used to +start the window with. 
It is necessary info about the group of procs of the window, +in part in order to synchronize procs at window create/free using regular MPI +collectives without introducing deadlocks on the original communicator. +The wid is a unique id to identify the window. +The `win_flags` are essential on window creation and track for ubcl which channel +the window can use (*bxi*, *shm* or *self*). + +```c +enum ubcl_win_sync_type { + UBCL_WIN_SYNC_NONE, + UBCL_WIN_SYNC_LOCK, + UBCL_WIN_SYNC_LOCK_NO_CHECK, + UBCL_WIN_SYNC_LOCK_ALL, + UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK, + UBCL_WIN_SYNC_PSCW, + UBCL_WIN_SYNC_FENCE, + UBCL_WIN_SYNC_FENCE_EPOCH +}; +typedef enum ubcl_win_sync_type_t +``` + +`sync_type` and `procs_sync_type` are enums used to track the type of synchronization +used on the whole window and for each *access epoch* respectively. It has a debugging +purpose, checks correct use of the mpi interface and is also mandatory to handle +the various behaviors of the synchronization functions such as the `fence` epoch +only starting at the first communication. +`sync_type` is a global window status whereas `procs_sync_type` proxies the information +at the scale of each peer rank when needed by the sync type. +`sync_lock` is a lock used to guarantee only one thread access to the thread critical +fields (`sync_type`, `procs_sync_type`, `passive_lock_refcount`, `nb_tests_passed`). + +`nb_tests_passed`is used exclusively by *Test* and *Wait* to track which exposure +epoch (to which proc) was terminated. +`active_sync_access_group` and `active_sync_exposure_group` save the group used +to create the pscw epoch(s). It is needed to complete the *Complete* *Wait* and +*Test* operations as well as the one-sided communications. +`Free_after` is a pointer to memory attached to the window that needs to be freed +alongside the window. 
+ + + +## Window creation + +```c +MPI_Win_create + ↓ +ompi_win_create + ↓ +ompi_osc_base_select + ↓ ↑ | +osc_ubcl_query -┘ | + ↓ + component_select + ↓ + new_module + ↓ + win_create + ↓ + ubcl_win_create +``` + +Each time the user requests a window to be created they will follow this diagram +of function calls. +`osc_ubcl_query` returns the priority depending on the requested window flavor. +`ompi_osc_base_select` will select the component with the highest priority as the +osc for the window. +`component_select` calls the function to create the window/module and enforces synchronization with a barrier +`new_module` allocates a new module and then copies the module template on itself +`win_create` prepares the `win_flags` for UBCL, based on the PML endpoints types of the communicator. +`ubcl_win_create` creates the window inside the UBCL library + + +### Dynamic windows + +Giving the flavor `MPI_WIN_FLAVOR_DYNAMIC` allows OMPI to create a dynamic window. +We then need to attach a buffer (or more) to it with *win_attach* and *win_detach*. +Since the window buffer is handled by UBCL *win_attach* and *win_detach* do very little. + + +## Synchronization + +### Generalities + +To enable synchronization we need every procs involved to be inside the same window +and for an *epoch* to be opened. +To that effect we store the type of synchronization inside the `osc_module` which +means the type of the epoch opened (so either *passive* or *active* and which +one precisely) because we can't run a one-sided communication without any synchronization. + +### Passive sync + +In *passive synchronization*, data is moved from the memory of one process to the +memory of another, and only the origin process is explicitly involved in the transfer. +Thus, two origin processes may communicate by accessing the same location in a +target window. 
+Despite the fact that no MPI call is required, the target process still needs to call +`ubcl_progress` to actively handle the request to establish the *lock*. + +#### Lock/Unlock + +```c +int ompi_osc_ubcl_lock(int lock_type, int target, int assert, struct ompi_win_t *win); +int ompi_osc_ubcl_unlock(int target, struct ompi_win_t *win); + +int ompi_osc_ubcl_lock_all(int assert, struct ompi_win_t *win); +int ompi_osc_ubcl_unlock_all(struct ompi_win_t *win); +``` + +The *lock*/*unlock* functions use the sync type `UBCL_WIN_SYNC_LOCK`. Lock is allowed +only if the window has a sync type `UBCL_WIN_SYNC_LOCK` or `UBCL_WIN_SYNC_NONE` +and if the target process is not yet locked. It marks the window (if not already +done) as sync type `UBCL_WIN_SYNC_LOCK`, and changes the target process sync type +to `UBCL_WIN_SYNC_LOCK` in the local array of locked processes. +It also increase the `passive_lock_refcount` which tracks the number of *locks* +done to allow *unlock* to reset the window type when it should. +*Unlock* requires the window to be in `UBCL_WIN_SYNC_LOCK` and have the target +process locked in the local array. + +The *lock_all*/*unlock_all* functions use the sync type `UBCL_WIN_SYNC_LOCK_ALL` +for the window only because we don't locally mark target processes as locked. +Otherwise it function the same as a simple *lock*. +As MPI requires that an initiator cannot lock the same target multiple times, +*lock_all* and *lock* are mutually exclusive despite similar names which leads to +a different sync type needed. +The main difference is that the UBCL call requires an array in argument that we +have to build. + + +In case we're provided with `MPI_MODE_NOCHECK` as assertion, we don't bother +actually locking the processes. However we still mark the target process as being +locked with `UBCL_WIN_SYNC_LOCK_NO_CHECK` and change the window sync type to +`UBCL_WIN_SYNC_LOCK` in case of the simple *lock*. 
+For *lock_all* we mark the window as having an epoch `UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK`. + + +#### Flush + +```c +int ompi_osc_ubcl_flush(int target, struct ompi_win_t *win); +int ompi_osc_ubcl_flush_all(struct ompi_win_t *win); +int ompi_osc_ubcl_flush_local(int target, struct ompi_win_t *win); +int ompi_osc_ubcl_flush_local_all(struct ompi_win_t *win); +``` + +The *flush* functions don't create any epoch and therefore don't have a sync type +associated. However, the *flush* functions can only be called if the current process has +a valid passive target access epoch on the target process. +They make sure that all the previous one-sided communications on the window, +from the initiator to the target, are completed. +As for now *flush_local* is an alias to *flush* and *flush[_local]_all* loops +on *flush[_local]*. + + +#### Sync + +When the data modifications are not fully handled by the NIC, some counterproductive +caches must be cleaned at the start of one-sided exposure epochs. +In active target synchronization model, a call is made on target side in fence and post functions. +As there is no MPI call on target side in passive target synchronization model, +this is handled internally by ubcl. + +### Active sync + +In *active synchronization*, data is moved from the memory of one process to the +memory of another, and both are explicitly involved in the synchronization. This +communication pattern is similar to message passing, except that all the data transfer +arguments are provided by one process, and the second process only participates in +the synchronization. 
+ +#### PSCW + +```c +int ompi_osc_ubcl_post(struct ompi_group_t *group, int assert, struct ompi_win_t *win); +int ompi_osc_ubcl_start(struct ompi_group_t *group, int assert, struct ompi_win_t *win); +int ompi_osc_ubcl_complete(struct ompi_win_t *win); +int ompi_osc_ubcl_wait(struct ompi_win_t *win); +int ompi_osc_ubcl_test(struct ompi_win_t *win, int *flag); +``` + +The *PSCW* functions use the sync type `UBCL_WIN_SYNC_PSCW`. *Post* and *Start* requires +sync type `UBCL_WIN_SYNC_NONE` or `UBCL_WIN_SYNC_PSCW` on the window and not +have a *PSCW* synchronization group tracked already. *Complete* and *Wait* +requires sync type `UBCL_WIN_SYNC_PSCW` on the window and a non-NULL synchronization +group in the window. *Test* is the same as *Wait* but non-blocking. + +In UBCL, the functions only take one target at a time whereas OMPI PSCW functions +take a whole group. So *Post* and *Start* loop on the group given in argument and +the *Complete*, *Wait* and *Test* loop on the groups specified - and stored +inside the window - when the epoch was established. +Below is the correspondance list between UBCL and OMPI : +- `MPI_Win_post` => `ubcl_win_target_grants_lock` +- `MPI_Win_start` => `ubcl_win_initiator_waits_lock` +- `MPI_Win_complete` => `ubcl_win_initiator_releases_lock` +- `MPI_Win_wait` => `ubcl_win_target_waits_lock_release` +- `MPI_Win_test` => `ubcl_win_target_tests_lock_release` + +We don't make use of the assert argument here for any of the active target synchronization functions. + + +#### Fence + +```c +int ompi_osc_ubcl_fence(int assert, struct ompi_win_t *win); +``` + +The *fence* function uses the sync type `UBCL_WIN_SYNC_FENCE` and `UBCL_WIN_SYNC_FENCE_EPOCH`. +This synchronization scheme needs both types for correctness checks. No other +synchronization calls may be started unless all epochs are completed before. 
+The first fence sets the window sync type as `UBCL_WIN_SYNC_FENCE` and the first +one-sided communication that starts will begin a fence epoch, setting the sync type +to `UBCL_WIN_SYNC_FENCE_EPOCH`. +That also means we have to have to allow other synchronization schemes to start an +epoch on `UBCL_WIN_SYNC_FENCE` as if it's `UBCL_WIN_SYNC_NONE`. +The function flushes the one-sided communications started in the current epoch, +acting as a barrier. Additionally whencalled inside a *fence* epoch, it closes +said epoch. The sync type is back to `UBCL_WIN_SYNC_FENCE`. + +Here we take into account the `MPI_MODE_NOPRECEDE` and `MPI_MODE_NOSUCCEED` +assertions only. The first one allows us to skip flushing the previously started +one-sided communications since there are none. We don't exploit the second assertion +much except in the case where both values are given, then the *fence* doesn't do much. + + +## One-Sided Communicaions + +### Put + +```c +int ompi_osc_ubcl_put(const void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win); + +int ompi_osc_ubcl_rput(const void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req); +``` + +### Get + +```c +int ompi_osc_ubcl_get(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win); + +int ompi_osc_ubcl_rget(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req); +``` + +### Atomic operations + +```c 
+int ompi_osc_ubcl_accumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp, + int target_count, struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, struct ompi_win_t *win); + +int ompi_osc_ubcl_raccumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp, + int target_count, struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, struct ompi_win_t *win, + struct ompi_request_t **request); + +int ompi_osc_ubcl_get_accumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_datatype, void *result_addr, + int result_count, struct ompi_datatype_t *result_datatype, + int target_rank, ptrdiff_t target_disp, int target_count, + struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, + struct ompi_win_t *win); + +int ompi_osc_ubcl_rget_accumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_datatype, void *result_addr, + int result_count, struct ompi_datatype_t *result_datatype, + int target_rank, ptrdiff_t target_disp, int target_count, + struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, + struct ompi_win_t *win, struct ompi_request_t **request); + +int ompi_osc_ubcl_fetch_and_op(const void *origin_addr, void *result_addr, + struct ompi_datatype_t *dt, int target, ptrdiff_t target_disp, + struct ompi_op_t *op, struct ompi_win_t *win); + +int ompi_osc_ubcl_compare_and_swap(const void *origin_addr, const void *compare_addr, + void *result_addr, struct ompi_datatype_t *dt, int target, + ptrdiff_t target_disp, struct ompi_win_t *win); +``` + +The implementation makes use of the similarity between these functions so *accumulate* +calls *raccumulate* wih *ompi_req = NULL*, *raccumulate* calls *rget_accumulate* +with all result argument sets to NULL or 0. +*get_accumulate* calls *rget_acucmulate* with *ompi_req = NULL*. 
+*fetch_op* also only needs to call *get_accumulate* with the correct arguments. +*compare_and_swap* gets its own implementation. + + + diff --git a/ompi/mca/osc/ubcl/configure.m4 b/ompi/mca/osc/ubcl/configure.m4 new file mode 100644 index 00000000000..add1db7c94b --- /dev/null +++ b/ompi/mca/osc/ubcl/configure.m4 @@ -0,0 +1,32 @@ +# Copyright (c) 2025 Bull SAS. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +AC_DEFUN([MCA_ompi_osc_ubcl_POST_CONFIG], [ + AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([PML])]) +]) + +AC_DEFUN([MCA_ompi_osc_ubcl_CONFIG], [ + AC_CONFIG_FILES([ompi/mca/osc/ubcl/Makefile]) + + AC_REQUIRE([MCA_ompi_common_ubcl_CONFIG]) + AC_REQUIRE([MCA_opal_common_ubcl_CONFIG]) + + OMPI_CHECK_UBCL([osc_ubcl], + [osc_ubcl_happy="yes"], + [osc_ubcl_happy="no"]) + + AS_IF([test "$osc_ubcl_happy" = "yes"], + [$1], + [$2]) + +# substitute in the things needed to build ubcl + AC_SUBST([osc_ubcl_CPPFLAGS]) + AC_SUBST([osc_ubcl_LDFLAGS]) + AC_SUBST([osc_ubcl_LIBS]) +]) diff --git a/ompi/mca/osc/ubcl/osc_ubcl.c b/ompi/mca/osc/ubcl/osc_ubcl.c new file mode 100644 index 00000000000..cdb9908aef7 --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl.c @@ -0,0 +1,556 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Bull eXtreme Interconnect OSC API implementation. + * + * Implementation of API defined in osc.h. To see parameters and return values + * of these functions, refer to ompi/mca/osc/osc.h. 
+ */ + +#include "opal/include/opal_config.h" + +#include "ompi/mca/osc/ubcl/osc_ubcl_info.h" +#include "opal/mca/common/ubcl/common_ubcl.h" + +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_request.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_sync.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_info.h" +#include "opal/util/proc.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" + +#include + +static int component_open(void); +static int component_register(void); +static int component_init(bool enable_progress_threads, bool enable_mpi_threads); +static int component_fini(void); +static int component_query(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor); +static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor, + int *model); +static int win_free(struct ompi_win_t *win); +static int shared_query(struct ompi_win_t *win, int rank, size_t *size, int *disp_unit, + void *baseptr); +static int win_attach(struct ompi_win_t *win, void *base, size_t size); +static int win_detach(struct ompi_win_t *win, const void *base); + +mca_osc_ubcl_component_t mca_osc_ubcl_component = { + .super = { /* ompi_osc_base_component_t */ + .osc_version = { + OMPI_OSC_BASE_VERSION_3_0_0, + .mca_component_name = "ubcl", + MCA_BASE_MAKE_VERSION(component, + OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), + .mca_open_component = component_open, + .mca_register_component_params = component_register, + }, + .osc_data = { + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE + }, + .osc_init = component_init, + .osc_query = component_query, + .osc_select = component_select, + .osc_finalize = component_fini, + }, 
+ .is_init = 0 +}; + +mca_osc_ubcl_module_t mca_osc_ubcl_module_template = { + {shared_query, /* Since MPI 4.1, osc should not abort on unsupported shared_query */ + win_attach, + win_detach, + win_free, + + ompi_osc_ubcl_put, + ompi_osc_ubcl_get, + ompi_osc_ubcl_accumulate, + ompi_osc_ubcl_compare_and_swap, + ompi_osc_ubcl_fetch_and_op, + ompi_osc_ubcl_get_accumulate, + + ompi_osc_ubcl_rput, + ompi_osc_ubcl_rget, + ompi_osc_ubcl_raccumulate, + ompi_osc_ubcl_rget_accumulate, + + ompi_osc_ubcl_fence, + + ompi_osc_ubcl_start, + ompi_osc_ubcl_complete, + ompi_osc_ubcl_post, + ompi_osc_ubcl_wait, + ompi_osc_ubcl_test, + + ompi_osc_ubcl_lock, + ompi_osc_ubcl_unlock, + ompi_osc_ubcl_lock_all, + ompi_osc_ubcl_unlock_all, + + ompi_osc_ubcl_sync, + ompi_osc_ubcl_flush, + ompi_osc_ubcl_flush_all, + ompi_osc_ubcl_flush_local, + ompi_osc_ubcl_flush_local_all} +}; + +static int component_open(void) +{ + /* Open output stream */ + if (0 < mca_osc_ubcl_component.verbose) { + mca_osc_ubcl_component.output = opal_output_open(NULL); + int verbose = mca_osc_ubcl_component.verbose > 0 ? 
mca_osc_ubcl_component.verbose : 1; + opal_output_set_verbosity(mca_osc_ubcl_component.output, verbose); + } else { + mca_osc_ubcl_component.output = -1; + } + + return OMPI_SUCCESS; +} + +static int component_register(void) +{ + mca_base_component_t *component = &mca_osc_ubcl_component.super.osc_version; + + mca_osc_ubcl_component.priority = 0; + (void) mca_base_component_var_register(&mca_osc_ubcl_component.super.osc_version, "priority", + "Priority of the ubcl osc component", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, + &mca_osc_ubcl_component.priority); + + mca_osc_ubcl_component.verbose = 0; + (void) mca_base_component_var_register(component, "verbose", "Verbosity level of the osc/ubcl.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_osc_ubcl_component.verbose); + + mca_osc_ubcl_component.max_req = 0; + (void) + mca_base_component_var_register(component, "max_requests", + "Maximum number of requests allocated. 
(0 means infinite)", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_osc_ubcl_component.max_req); + + mca_osc_ubcl_component.min_req = 0; + (void) mca_base_component_var_register(component, "min_requests", + "Minimum (and initial) number of requests allocated.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_osc_ubcl_component.min_req); + + mca_osc_ubcl_component.incr_req = 1024; + (void) mca_base_component_var_register( + component, "incr_requests", + "Count of new requests allocated when free list runs out of requests.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_osc_ubcl_component.incr_req); + + mca_common_ubcl_register_mca(); + + return OMPI_SUCCESS; +} + +static int component_init(bool enable_progress_threads, bool enable_mpi_threads) +{ + int err; + OPAL_OUTPUT_VERBOSE((50, mca_osc_ubcl_component.output, "UBCL_COMPONENT_INIT\n")); + + if (opal_atomic_fetch_add_64(&mca_osc_ubcl_component.is_init, 1)) { + return OMPI_SUCCESS; + } + + if (OPAL_SUCCESS != mca_common_ubcl_init()) { + mca_osc_ubcl_warn(OMPI_ERR_NOT_AVAILABLE, "common_ubcl could not load UBCL library\n"); + return OMPI_SUCCESS; + } + + OBJ_CONSTRUCT(&mca_osc_ubcl_component.req_free_list, opal_free_list_t); + err = opal_free_list_init(&mca_osc_ubcl_component.req_free_list, sizeof(mca_osc_ubcl_request_t), + opal_cache_line_size, OBJ_CLASS(mca_osc_ubcl_request_t), 0, + opal_cache_line_size, mca_osc_ubcl_component.min_req, + mca_osc_ubcl_component.max_req, mca_osc_ubcl_component.incr_req, NULL, + 0, NULL, NULL, NULL); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) { + mca_osc_ubcl_warn(OMPI_ERR_OUT_OF_RESOURCE, "Not enough memory (%d)", err); + goto error_free_list; + } + + /* Initialize UBCL */ + if (UBCL_SUCCESS != ubcl_init(enable_mpi_threads || enable_progress_threads)) { + goto error_ubcl_init; + } + + /* Mark as initialized and return */ + 
OPAL_OUTPUT_VERBOSE((50, mca_osc_ubcl_component.output, "INITIATION DONE\n")); + return OMPI_SUCCESS; + +error_ubcl_init: + OBJ_DESTRUCT(&mca_osc_ubcl_component.req_free_list); +error_free_list: + mca_common_ubcl_fini(); + return OMPI_ERROR; +} + +static int component_fini(void) +{ + int ret; + OPAL_OUTPUT_VERBOSE((50, mca_osc_ubcl_component.output, "ubcl_COMPONENT_FINALIZE")); + + if (0 != opal_atomic_sub_fetch_64(&mca_osc_ubcl_component.is_init, 1)) { + return OMPI_SUCCESS; + } + + /* Finalize UBCL */ + ret = ubcl_error_to_ompi(ubcl_fini()); + if (OMPI_SUCCESS != ret) { + return ret; + } + + OBJ_DESTRUCT(&mca_osc_ubcl_component.req_free_list); + + mca_common_ubcl_fini(); + return OMPI_SUCCESS; +} + +static int component_query(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor) +{ + uint64_t flags = 0; + int dev_id; + + if (MPI_WIN_FLAVOR_SHARED == flavor) { + return OPAL_ERR_NOT_IMPLEMENTED; + } + + if (0 == mca_common_ubcl_is_init()) { + return OPAL_ERR_NOT_INITIALIZED; + } + + /* Accelerator buffer is not supported as provided window buffer */ + if (MPI_WIN_FLAVOR_ALLOCATE != flavor && MPI_WIN_FLAVOR_DYNAMIC != flavor + && 0 < size && NULL != base && NULL != *base + && opal_accelerator.check_addr(*base, &dev_id, &flags) > 0) { + mca_osc_ubcl_log(20, "GPU buffer not supported by osc/ubcl"); + return OPAL_ERR_NOT_SUPPORTED; + } + + return mca_osc_ubcl_component.priority; +} + +static int win_create(void *base, size_t size, mca_osc_ubcl_module_t *module) +{ + ompi_proc_t *proc; + mca_common_ubcl_endpoint_t *endpoint; + ompi_group_t * win_group; + int ret = OMPI_SUCCESS; + + module->win_flags.bxi = 0; + module->win_flags.shm = 0; + module->win_flags.self = 0; + if (MPI_WIN_FLAVOR_DYNAMIC == module->win->w_flavor) { + module->win_flags.dynamic = 1; + } + + ompi_win_group(module->win, &win_group); + for (int i = 0; i < ompi_group_size(win_group); i++) { + proc = 
ompi_group_peer_lookup_existing(win_group, i); + if (OPAL_UNLIKELY(NULL == proc)) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Cannot create window: %d-th proc is undefined", i); + goto exit; + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + if (NULL == endpoint) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Cannot create window: %d-th UBCL endpoint is undefined", i); + goto exit; + } + + switch (endpoint->type) { + case UBCL_ENDPOINT_TYPE_SELF: + module->win_flags.self = 1; + break; + case UBCL_ENDPOINT_TYPE_SHMEM: + module->win_flags.shm = 1; + break; + case UBCL_ENDPOINT_TYPE_BXI: + module->win_flags.bxi = 1; + break; + default: + /* Should never happen, UBCL endpoints always have a type */ + mca_osc_ubcl_error(OMPI_ERROR, "Unknown endpoint type"); + } + } + + /* Endpoints are created by the osc/ubcl when ompi_init is called */ + ret = ubcl_error_to_ompi(ubcl_win_create(base, size, module->wid, module->win_flags)); +exit: + return ret; +} + +/* create a module structure */ +static int new_module(struct ompi_win_t *win, void **base, size_t size, + struct ompi_communicator_t *comm, int flavor, mca_osc_ubcl_module_t **pmodule) +{ + int ret = OMPI_ERROR; + void *win_ptr; + mca_osc_ubcl_module_t *module; + + /* Calloc is required to set all pointers to NULL and free them in case + * of error */ + module = (mca_osc_ubcl_module_t *) calloc(1, sizeof(mca_osc_ubcl_module_t)); + if (NULL == module) { + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + memcpy(module, &mca_osc_ubcl_module_template, sizeof(ompi_osc_base_module_t)); + + /* Allocate window buffer */ + if (MPI_WIN_FLAVOR_ALLOCATE == flavor) { + module->free_after = *base = malloc(size); + if (NULL == *base) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto error; + } + } else { + module->free_after = NULL; + } + + ret = ompi_comm_dup(comm, &module->comm); + if (OMPI_SUCCESS != ret) { + goto error; + } + + /* Putting the cid into the wid that way 
it should be unique */ + module->win = win; + module->wid = ompi_comm_get_local_cid(module->comm); + module->sync_type = UBCL_WIN_SYNC_NONE; + module->passive_lock_refcount = 0; + OBJ_CONSTRUCT(&module->sync_lock, opal_mutex_t); + module->nb_rank_waited = 0; + module->active_sync_access_group = NULL; + module->active_sync_exposure_group = NULL; + *pmodule = module; + + size_t comm_size = ompi_comm_size(comm); + module->procs_sync_type = malloc(sizeof(ubcl_win_sync_type_t) * comm_size); + if (NULL == module->procs_sync_type) { + ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; + goto error; + } + + for (size_t i = 0; i < comm_size; i++) { + module->procs_sync_type[i] = UBCL_WIN_SYNC_NONE; + } + + if (MPI_WIN_FLAVOR_DYNAMIC == flavor) { + /* For dynamic windows, base is MPI_BOTTOM, which is NULL, so it can't be dereferenced */ + win_ptr = (void *) base; + } else { + win_ptr = *base; + } + + return win_create(win_ptr, size, module); + +error: + /* According to MPI specifications 12.6.1, errors on window creations are fatal. + * That is why MPI API calls kill all ranks if the return value is not OMPI_SUCCESS. + * Therefore it is not an issue to leave this function without entering + * ompi_comm_dup collective call: other ranks will just be blocked in it + * before being sigkill'd. + */ + /* ompi_comm_free cannot be called here since it is a collective call. 
*/ + free(module->procs_sync_type); + free(module->free_after); + free(module); + return ret; +} + +/* osc ubcl has been selected to exclusively handle the MPI RMA window, + * this is last call before real communications */ +static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit, + struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor, + int *model) +{ + mca_osc_ubcl_module_t *module = NULL; + int ret; + unsigned name_len = 1024; + char name[name_len]; + + /* Handle erroneous cases */ + if (MPI_WIN_FLAVOR_SHARED == flavor) { + return OPAL_ERR_NOT_IMPLEMENTED; + } + + /* Allocate first a module */ + ret = new_module(win, base, size, comm, flavor, &module); + if (OMPI_SUCCESS != ret) { + return ret; + } + + snprintf(name, name_len, "ubcl window %d, built on %s", ompi_comm_get_local_cid(module->comm), + comm->c_name); + ompi_win_set_name(win, name); + mca_osc_ubcl_log(20, "%s created", win->w_name); + + win->w_osc_module = &module->super; + module->win = win; + *model = MPI_WIN_UNIFIED; + + osc_ubcl_read_info(info, win); + osc_ubcl_sync_disp_unit(module, disp_unit, true); + + mca_osc_ubcl_log(20, "Module allocated at %p", (void *) module); + + return OMPI_SUCCESS; +} + +static int win_free(struct ompi_win_t *win) +{ + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + uint64_t wid; + int ret; + + if (UBCL_WIN_SYNC_NONE != module->sync_type && + UBCL_WIN_SYNC_FENCE != module->sync_type) { + ret = OMPI_ERR_RMA_SYNC; + mca_osc_ubcl_warn(ret, "Cannot free window %d: epoch not ended", module->wid); + return ret; + } + + module->comm->c_coll->coll_barrier(module->comm, + module->comm->c_coll->coll_barrier_module); + + wid = module->wid; + ret = ubcl_error_to_ompi(ubcl_win_free(wid)); + + OBJ_DESTRUCT(&module->sync_lock); + ompi_comm_free(&module->comm); + osc_ubcl_fini_disp_unit(module); + free(module->free_after); + free(module->procs_sync_type); + free(module); + + return ret; +} + +static 
int shared_query(struct ompi_win_t *win, int rank, size_t *size, int *disp_unit, + void *baseptr) +{ + (void) win; + (void) rank; + *size = 0; + *disp_unit = 0; + *(void **) baseptr = NULL; + + return OMPI_SUCCESS; +} + +static int win_attach(struct ompi_win_t *win, void *base, size_t size) +{ + ubcl_error_t ret; + ubcl_wid_t wid; + mca_osc_ubcl_module_t *module; + uint64_t flags = 0; + int dev_id; + + module = (mca_osc_ubcl_module_t *) win->w_osc_module; + wid = (ubcl_wid_t) module->wid; + + /* Accelerator buffer is not supported as attached buffer */ + if (opal_accelerator.check_addr(base, &dev_id, &flags)) { + mca_osc_ubcl_warn(OPAL_ERR_NOT_SUPPORTED, "GPU buffer not supported by osc/ubcl"); + return OPAL_ERR_NOT_SUPPORTED; + } + + ret = ubcl_win_attach(base, size, wid); + + return ubcl_error_to_ompi(ret); +} + +static int win_detach(struct ompi_win_t *win, const void *base) +{ + ubcl_error_t ret; + ubcl_wid_t wid; + mca_osc_ubcl_module_t *module; + + module = (mca_osc_ubcl_module_t *) win->w_osc_module; + wid = (ubcl_wid_t) module->wid; + + /* FIXME: get the window size */ + ret = ubcl_win_detach((void *) base, 0, wid); + + return ubcl_error_to_ompi(ret); +} + +int osc_ubcl_build_ddt_iov(const void *addr, ompi_proc_t *proc, int count, + ompi_datatype_t *datatype, struct iovec **output_iov, + size_t *output_iov_count) +{ + opal_convertor_t convertor; + int ret; + bool done; + size_t output_iov_pos; + + OBJ_CONSTRUCT(&convertor, opal_convertor_t); + ret = opal_convertor_copy_and_prepare_for_send(proc->super.proc_convertor, &datatype->super, + count, addr, 0, &convertor); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OBJ_DESTRUCT(&convertor); + return ret; + } + + output_iov_pos = 0; + *output_iov_count = 0; + done = false; + do { + size_t length; + uint32_t tmp_iov_count; + size_t tmp_iov_pos; + struct iovec tmp_iov[OSC_UBCL_IOVEC_MAX]; + + tmp_iov_count = OSC_UBCL_IOVEC_MAX; + + done = opal_convertor_raw(&convertor, tmp_iov, &tmp_iov_count, &length); + + 
*output_iov_count += tmp_iov_count;
+        *output_iov = (struct iovec *) realloc(*output_iov,
+                                               *output_iov_count * sizeof(struct iovec));
+        if (NULL == *output_iov) {
+            OBJ_DESTRUCT(&convertor); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; /* don't leak the convertor */
+        }
+
+        tmp_iov_pos = 0;
+        while (tmp_iov_pos != tmp_iov_count) {
+            (*output_iov)[output_iov_pos].iov_base = tmp_iov[tmp_iov_pos].iov_base;
+            (*output_iov)[output_iov_pos].iov_len = tmp_iov[tmp_iov_pos].iov_len;
+            tmp_iov_pos++;
+            output_iov_pos++;
+        }
+        assert(*output_iov_count == output_iov_pos);
+    } while (!done);
+
+    OBJ_DESTRUCT(&convertor);
+
+    return ret;
+}
diff --git a/ompi/mca/osc/ubcl/osc_ubcl.h b/ompi/mca/osc/ubcl/osc_ubcl.h
new file mode 100644
index 00000000000..4a4a902967b
--- /dev/null
+++ b/ompi/mca/osc/ubcl/osc_ubcl.h
@@ -0,0 +1,189 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025 Bull SAS. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/**
+ * @file
+ *
+ * Bull eXtreme Interconnect OSC API implementation.
+ *
+ * Implementation of API defined in osc.h. To see parameters and return values
+ * of these functions, refer to ompi/mca/osc/osc.h.
+ */
+
+#ifndef MCA_OSC_UBCL_H
+#define MCA_OSC_UBCL_H
+
+#include 
+#include "ompi/mca/osc/osc.h"
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/mca/osc/base/base.h"
+#include "ompi/mca/osc/ubcl/osc_ubcl_sync.h"
+#include "ompi/group/group.h"
+#include "ompi/communicator/communicator.h"
+#include "ompi/request/request.h"
+#include "opal/util/show_help.h"
+#include "opal/mca/threads/mutex.h"
+
+#define OSC_UBCL_IOVEC_MAX 128
+
+struct mca_osc_ubcl_module_s {
+    ompi_osc_base_module_t super;
+    struct ompi_communicator_t *comm;
+    struct ompi_win_t *win;
+    int64_t wid;
+    union {int *all; int uniq;} disp_unit;
+    ubcl_win_flags_t win_flags;
+
+    /* To avoid info access (including locking a list and string manipulations)
+     * usefull info keys are stored inside the osc module. 
+ * + * Note that string data such as accumulate_ordering and accumulate_ops + * are already stored in dedicated window variables (w_acc_order and w_acc_ops) + */ + uint32_t same_disp_unit:1; + uint32_t no_locks:1; + uint32_t padding_infos:30; + + /* Sync type of the entire window */ + ubcl_win_sync_type_t sync_type; + /* Detail of locked peer, only relevant for win_[un]lock */ + ubcl_win_sync_type_t *procs_sync_type; + /* How many remote locks are currently hold */ + int64_t passive_lock_refcount; + /* Threadsafety for lock syncs + * other types of sync should never be called by concurrent threads */ + opal_mutex_t sync_lock; + + /* Active target management */ + unsigned int nb_rank_waited; + struct ompi_group_t *active_sync_access_group; + struct ompi_group_t *active_sync_exposure_group; + + /* if non-null, this pointer should be free()ed with the window */ + void *free_after; +}; +typedef struct mca_osc_ubcl_module_s mca_osc_ubcl_module_t; + +struct mca_osc_ubcl_component_s { + ompi_osc_base_component_t super; + + /** Functionnal fields **/ + volatile int64_t is_init; /**< Whether we have been initialized, for proper close */ + int output; /**< Output stream */ + + /** MCA parameters **/ + int priority; /**< Priority of the component */ + int verbose; /**< Verbosity level of the component */ + + /** UBCL endpoint type capabilities **/ + opal_free_list_t req_free_list; + unsigned int max_req; /**< Maximum number of requests */ + unsigned int min_req; /**< Minimum (and inititial) number of requests */ + unsigned int incr_req; /**< Increasing (and inititial) number of requests */ + unsigned int pad_req; +}; +typedef struct mca_osc_ubcl_component_s mca_osc_ubcl_component_t; +extern mca_osc_ubcl_component_t mca_osc_ubcl_component; + +/* One Sided operations */ +int ompi_osc_ubcl_put(const void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct 
ompi_win_t *win); + +int ompi_osc_ubcl_rput(const void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req); + +int ompi_osc_ubcl_get(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win); + +int ompi_osc_ubcl_rget(void *origin_addr, + int origin_count, + struct ompi_datatype_t *origin_dt, + int target, + ptrdiff_t target_disp, + int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req); + +int ompi_osc_ubcl_accumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp, + int target_count, struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, struct ompi_win_t *win); + +int ompi_osc_ubcl_raccumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp, + int target_count, struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, struct ompi_win_t *win, + struct ompi_request_t **ompi_req); + +int ompi_osc_ubcl_get_accumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, void *result_addr, + int result_count, struct ompi_datatype_t *result_dt, + int target_rank, ptrdiff_t target_disp, int target_count, + struct ompi_datatype_t *target_dt, struct ompi_op_t *op, + struct ompi_win_t *win); + +int ompi_osc_ubcl_rget_accumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, void *result_addr, + int result_count, struct ompi_datatype_t *result_dt, + int target_rank, ptrdiff_t target_disp, int target_count, + struct ompi_datatype_t *target_dt, struct ompi_op_t *op, + struct ompi_win_t *win, struct 
ompi_request_t **ompi_req); + +int ompi_osc_ubcl_fetch_and_op(const void *origin_addr, void *result_addr, + struct ompi_datatype_t *dt, int target, ptrdiff_t target_disp, + struct ompi_op_t *op, struct ompi_win_t *win); + +int ompi_osc_ubcl_compare_and_swap(const void *origin_addr, const void *compare_addr, + void *result_addr, struct ompi_datatype_t *dt, int target, + ptrdiff_t target_disp, struct ompi_win_t *win); + +/* Sync functions */ +int ompi_osc_ubcl_flush(int target, + struct ompi_win_t *win); +int ompi_osc_ubcl_flush_all(struct ompi_win_t *win); +int ompi_osc_ubcl_flush_local(int target, + struct ompi_win_t *win); +int ompi_osc_ubcl_flush_local_all(struct ompi_win_t *win); + +/* ubcl custom memory descriptor management */ +size_t osc_ubcl_datatype_pack(void *pack_buf, const void *usr_handle, + size_t pack_size, size_t offset); +size_t osc_ubcl_datatype_unpack(void *usr_handle, const void *pack_buf, + size_t pack_size, size_t offset); +size_t osc_ubcl_datatype_mem_size(const void *usr_handle, size_t offset); +void osc_ubcl_datatype_finish(void *usr_handle); + +/* Misc */ +int osc_ubcl_build_ddt_iov(const void *addr, ompi_proc_t *proc, int count, + ompi_datatype_t *datatype, struct iovec **output_iov, + size_t *output_iov_count); + +#endif //MCA_OSC_UBCL_H diff --git a/ompi/mca/osc/ubcl/osc_ubcl_accumulate.c b/ompi/mca/osc/ubcl/osc_ubcl_accumulate.c new file mode 100644 index 00000000000..d334bd88723 --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_accumulate.c @@ -0,0 +1,1104 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Bull eXtreme Interconnect OSC API implementation. + * + * Implementation of API defined in osc.h. To see parameters and return values + * of these functions, refer to ompi/mca/osc/osc.h. 
+ */ + +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_info.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_sync.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_request.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" + +int get_ubcl_int_type(size_t size, bool is_signed, ubcl_win_atomic_datatype_t *ubcl_type) +{ + int ret = OMPI_SUCCESS; + + switch (size) { + case 1: + *ubcl_type = is_signed ? UBCL_INT8 : UBCL_UINT8; + break; + case 2: + *ubcl_type = is_signed ? UBCL_INT16 : UBCL_UINT16; + break; + case 4: + *ubcl_type = is_signed ? UBCL_INT32 : UBCL_UINT32; + break; + case 8: + *ubcl_type = is_signed ? UBCL_INT64 : UBCL_UINT64; + break; + default: + ret = OMPI_ERR_NOT_SUPPORTED; + break; + } + + return ret; +} + +int get_ubcl_fp_type(size_t size, ubcl_win_atomic_datatype_t *ubcl_type) +{ + int ret = OMPI_SUCCESS; + + switch (size) { + case sizeof(float): + *ubcl_type = UBCL_FLOAT; + break; + case sizeof(double): + *ubcl_type = UBCL_DOUBLE; + break; + case sizeof(long double): + *ubcl_type = UBCL_LONG_DOUBLE; + break; + default: + ret = OMPI_ERR_NOT_SUPPORTED; + break; + } + + return ret; +} + +static int get_c_integer_ubcl_type(struct ompi_datatype_t *origin_dt, + ubcl_win_atomic_datatype_t *ubcl_type) +{ + int ret = OMPI_SUCCESS; + size_t dt_size; + + if (OMPI_SUCCESS != ompi_datatype_type_size(origin_dt, &dt_size)) { + return OMPI_ERR_BAD_PARAM; + } + + if (MPI_CHAR == origin_dt + /* Note: MPI_CHAR is not a valid type for predefined operations + * but MPI_SIGNED_CHAR and MPI_UNSIGNED_CHAR are. + * We suppost MPI_CHAR behaves as MPI_SIGNED_CHAR. 
+ * C.F.: MPI 4.1 section 6.9.2 (p.227) + * MPI 5.0 section 6.9.2 (p.225) + */ + || MPI_INT == origin_dt || MPI_LONG == origin_dt || MPI_SHORT == origin_dt +#if OPAL_HAVE_LONG_LONG + || MPI_LONG_LONG_INT == origin_dt || MPI_LONG_LONG == origin_dt +#endif + || MPI_SIGNED_CHAR == origin_dt || MPI_INT8_T == origin_dt || MPI_INT16_T == origin_dt + || MPI_INT32_T == origin_dt || MPI_INT64_T == origin_dt) { + + ret = get_ubcl_int_type(dt_size, true, ubcl_type); + + } else if (MPI_UNSIGNED_SHORT == origin_dt || MPI_UNSIGNED == origin_dt + || MPI_UNSIGNED_LONG == origin_dt +#if OPAL_HAVE_LONG_LONG + || MPI_UNSIGNED_LONG_LONG == origin_dt +#endif + || MPI_UNSIGNED_CHAR == origin_dt || MPI_UINT8_T == origin_dt + || MPI_UINT16_T == origin_dt || MPI_UINT32_T == origin_dt + || MPI_UINT64_T == origin_dt) { + + ret = get_ubcl_int_type(dt_size, false, ubcl_type); + + } else { + ret = OMPI_ERR_BAD_PARAM; + } + return ret; +} + +static int get_fortran_integer_ubcl_type(struct ompi_datatype_t *origin_dt, + ubcl_win_atomic_datatype_t *ubcl_type) +{ + int ret = OMPI_SUCCESS; + size_t dt_size; + + if (OMPI_SUCCESS != ompi_datatype_type_size(origin_dt, &dt_size)) { + return OMPI_ERR_BAD_PARAM; + } + + if (MPI_INTEGER == origin_dt +#if OMPI_HAVE_FORTRAN_INTEGER1 + || MPI_INTEGER1 == origin_dt +#endif +#if OMPI_HAVE_FORTRAN_INTEGER2 + || MPI_INTEGER2 == origin_dt +#endif +#if OMPI_HAVE_FORTRAN_INTEGER4 + || MPI_INTEGER4 == origin_dt +#endif +#if OMPI_HAVE_FORTRAN_INTEGER8 + || MPI_INTEGER8 == origin_dt +#endif +#if OMPI_HAVE_FORTRAN_INTEGER16 + || MPI_INTEGER16 == origin_dt +#endif + ) { + ret = get_ubcl_int_type(dt_size, true, ubcl_type); + } else { + ret = OMPI_ERR_BAD_PARAM; + } + return ret; +} + +static int get_fp_ubcl_type(struct ompi_datatype_t *origin_dt, + ubcl_win_atomic_datatype_t *ubcl_type) +{ + /* TODO: handle MPI_TYPE_CREATE_F90_REAL handles */ + int ret = OMPI_SUCCESS; + size_t dt_size; + + if (OMPI_SUCCESS != ompi_datatype_type_size(origin_dt, &dt_size)) { + return 
OMPI_ERR_BAD_PARAM; + } + + if (MPI_FLOAT == origin_dt || MPI_DOUBLE == origin_dt || MPI_REAL == origin_dt + || MPI_DOUBLE_PRECISION == origin_dt || MPI_LONG_DOUBLE == origin_dt +/*#if OMPI_HAVE_FORTRAN_REAL2 + * || MPI_REAL2 == origin_dt + *#endif */ +#if OMPI_HAVE_FORTRAN_REAL4 + || MPI_REAL4 == origin_dt +#endif +#if OMPI_HAVE_FORTRAN_REAL8 + || MPI_REAL8 == origin_dt +#endif +#if OMPI_HAVE_FORTRAN_REAL16 + || MPI_REAL16 == origin_dt +#endif + ) { + ret = get_ubcl_fp_type(dt_size, ubcl_type); + } else { + ret = OMPI_ERR_BAD_PARAM; + } + return ret; +} + +static int get_logical_ubcl_type(struct ompi_datatype_t *origin_dt, + ubcl_win_atomic_datatype_t *ubcl_type) +{ + int ret = OMPI_SUCCESS; + size_t dt_size; + + if (OMPI_SUCCESS != ompi_datatype_type_size(origin_dt, &dt_size)) { + return OMPI_ERR_BAD_PARAM; + } + + /* Some consideration are needed to take care of fortran logical + * Yet not we dit + */ + if (MPI_C_BOOL == origin_dt || MPI_CXX_BOOL == origin_dt) { + ret = get_ubcl_int_type(dt_size, false, ubcl_type); + } else if (MPI_LOGICAL == origin_dt +#if OMPI_HAVE_FORTRAN_LOGICAL1 + || MPI_LOGICAL1 == origin_dt +#endif +#if OMPI_HAVE_FORTRAN_LOGICAL2 + || MPI_LOGICAL2 == origin_dt +#endif +#if OMPI_HAVE_FORTRAN_LOGICAL4 + || MPI_LOGICAL4 == origin_dt +#endif +#if OMPI_HAVE_FORTRAN_LOGICAL8 + || MPI_LOGICAL8 == origin_dt +#endif +#if OMPI_HAVE_FORTRAN_LOGICAL16 + || MPI_LOGICAL16 == origin_dt +#endif + ) { + ret = OMPI_ERR_NOT_IMPLEMENTED; + } else { + ret = OMPI_ERR_BAD_PARAM; + } + return ret; +} + +static int get_complex_ubcl_type(struct ompi_datatype_t *origin_dt, + ubcl_win_atomic_datatype_t *ubcl_type) +{ + int ret = OMPI_SUCCESS; + if (MPI_COMPLEX == origin_dt +#if HAVE_FLOAT__COMPLEX + || MPI_C_COMPLEX == origin_dt || MPI_C_FLOAT_COMPLEX == origin_dt +#endif +#if HAVE_DOUBLE__COMPLEX + || MPI_C_DOUBLE_COMPLEX == origin_dt +#endif +#if HAVE_LONG_DOUBLE__COMPLEX + || MPI_C_LONG_DOUBLE_COMPLEX == origin_dt +#endif + || MPI_CXX_FLOAT_COMPLEX == origin_dt 
|| MPI_CXX_DOUBLE_COMPLEX == origin_dt + || MPI_CXX_LONG_DOUBLE_COMPLEX == origin_dt || MPI_DOUBLE_COMPLEX == origin_dt +/*#if OMPI_HAVE_FORTRAN_REAL2 + * || MPI_COMPLEX4 == origin_dt + *#endif */ +#if OMPI_HAVE_FORTRAN_REAL4 + || MPI_COMPLEX8 == origin_dt +#endif +#if OMPI_HAVE_FORTRAN_REAL8 + || MPI_COMPLEX16 == origin_dt +#endif +#if OMPI_HAVE_FORTRAN_REAL16 + || MPI_COMPLEX32 == origin_dt +#endif + ) { + ret = OMPI_ERR_NOT_IMPLEMENTED; + } else { + ret = OMPI_ERR_BAD_PARAM; + } + return ret; +} + +static int get_byte_ubcl_type(struct ompi_datatype_t *origin_dt, + ubcl_win_atomic_datatype_t *ubcl_type) +{ + int ret = OMPI_SUCCESS; + + if (MPI_BYTE == origin_dt) { + *ubcl_type = UBCL_UINT8; + } else { + ret = OMPI_ERR_BAD_PARAM; + } + + return ret; +} + +static int get_multi_language_ubcl_type(struct ompi_datatype_t *origin_dt, + ubcl_win_atomic_datatype_t *ubcl_type) +{ + int ret = OMPI_SUCCESS; + size_t dt_size; + + if (OMPI_SUCCESS != ompi_datatype_type_size(origin_dt, &dt_size)) { + return OMPI_ERR_BAD_PARAM; + } + + if (MPI_AINT == origin_dt || MPI_OFFSET == origin_dt || MPI_COUNT == origin_dt) { + ret = get_ubcl_int_type(dt_size, true, ubcl_type); + } else { + ret = OMPI_ERR_BAD_PARAM; + } + + return ret; +} + +static int get_pair_ubcl_type(struct ompi_datatype_t *origin_dt, + ubcl_win_atomic_datatype_t *ubcl_type) +{ + int ret = OMPI_SUCCESS; + size_t dt_size; + + if (OMPI_SUCCESS != ompi_datatype_type_size(origin_dt, &dt_size)) { + return OMPI_ERR_BAD_PARAM; + } + + if (MPI_FLOAT_INT == origin_dt) { + *ubcl_type = UBCL_FLOAT; + } else if (MPI_DOUBLE_INT == origin_dt) { + *ubcl_type = UBCL_DOUBLE; + } else if (MPI_LONG_DOUBLE_INT == origin_dt) { + *ubcl_type = UBCL_LONG_DOUBLE; + } else if (MPI_LONG_INT == origin_dt) { + ret = get_ubcl_int_type(sizeof(long), true, ubcl_type); + } else if (MPI_SHORT_INT == origin_dt) { + ret = get_ubcl_int_type(sizeof(short), true, ubcl_type); + } else if (MPI_2INT == origin_dt) { + ret = get_ubcl_int_type(sizeof(int), 
true, ubcl_type);
+    } else if (MPI_2REAL == origin_dt || MPI_2DOUBLE_PRECISION == origin_dt) {
+        ret = get_ubcl_fp_type(dt_size, ubcl_type);
+    } else if (MPI_2INTEGER == origin_dt) {
+        ret = get_ubcl_int_type(dt_size, true, ubcl_type);
+    } else {
+        ret = OMPI_ERR_BAD_PARAM;
+    }
+
+    return ret;
+}
+
+static int build_ubcl_loc_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op,
+                             ubcl_win_op_t *ubcl_op)
+{
+    ubcl_win_atomic_operator_t ubcl_operator;
+    ubcl_win_atomic_datatype_t data_type;
+    ubcl_win_atomic_datatype_t loc_type;
+    int ret = OMPI_SUCCESS;
+
+    if (MPI_MAXLOC == op) {
+        ubcl_operator = UBCL_MAXLOC;
+    } else if (MPI_MINLOC == op) {
+        ubcl_operator = UBCL_MINLOC;
+    } else {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    ret = get_ubcl_int_type(sizeof(int), true, &loc_type);
+    if (OMPI_SUCCESS != ret) {
+        return ret;
+    }
+
+    ret = get_pair_ubcl_type(origin_dt, &data_type);
+    if (OMPI_SUCCESS != ret) {
+        return ret;
+    }
+
+    return ubcl_error_to_ompi(ubcl_win_build_loc_op(ubcl_op, data_type, ubcl_operator, loc_type));
+}
+
+#define GET_TYPE(fct, origin_dt, data_type) \
+    do { \
+        int _err = fct(origin_dt, data_type); \
+        if (OMPI_SUCCESS == _err) { \
+            goto got_type; \
+        } else if (OMPI_ERR_NOT_IMPLEMENTED == _err) { \
+            goto not_implemented; \
+        } \
+    } while (0)
+
+static int build_ubcl_minmax_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op,
+                                ubcl_win_op_t *ubcl_op)
+{
+    ubcl_win_atomic_operator_t ubcl_operator;
+    ubcl_win_atomic_datatype_t data_type = UBCL_TYPE_NONE;
+
+    if (MPI_MAX == op) {
+        ubcl_operator = UBCL_MAX;
+    } else if (MPI_MIN == op) {
+        ubcl_operator = UBCL_MIN; /* was UBCL_MAX: MPI_MIN must map to UBCL_MIN */
+    } else {
+        return OMPI_ERR_BAD_PARAM;
+    }
+
+    /* This macro calls goto on got_type or not_implemented labels if the
+     * datatype is one of the predifines one of this category. 
+ */ + GET_TYPE(get_c_integer_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_fortran_integer_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_fp_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_multi_language_ubcl_type, origin_dt, &data_type); + + return OMPI_ERR_BAD_PARAM; + +got_type: + return ubcl_error_to_ompi(ubcl_win_build_op(ubcl_op, data_type, ubcl_operator)); + +not_implemented: + return OMPI_ERR_NOT_IMPLEMENTED; +} + +static int build_ubcl_arithmetic_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op, + ubcl_win_op_t *ubcl_op) +{ + ubcl_win_atomic_operator_t ubcl_operator; + ubcl_win_atomic_datatype_t data_type = UBCL_TYPE_NONE; + + if (MPI_SUM == op) { + ubcl_operator = UBCL_SUM; + } else if (MPI_PROD == op) { + ubcl_operator = UBCL_PROD; + } else { + return OMPI_ERR_BAD_PARAM; + } + + /* This macro calls goto on got_type or not_implemented labels if the + * datatype is one of the predifines one of this category. + */ + GET_TYPE(get_c_integer_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_fortran_integer_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_fp_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_complex_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_multi_language_ubcl_type, origin_dt, &data_type); + + return OMPI_ERR_BAD_PARAM; + +got_type: + return ubcl_error_to_ompi(ubcl_win_build_op(ubcl_op, data_type, ubcl_operator)); + +not_implemented: + return OMPI_ERR_NOT_IMPLEMENTED; +} + +static int build_ubcl_logical_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op, + ubcl_win_op_t *ubcl_op) +{ + ubcl_win_atomic_operator_t ubcl_operator; + ubcl_win_atomic_datatype_t data_type = UBCL_TYPE_NONE; + + if (MPI_LAND == op) { + ubcl_operator = UBCL_LAND; + } else if (MPI_LOR == op) { + ubcl_operator = UBCL_LOR; + } else if (MPI_LXOR == op) { + ubcl_operator = UBCL_LXOR; + } else { + return OMPI_ERR_BAD_PARAM; + } + + /* This macro calls goto on got_type or not_implemented labels if the + * datatype is one of the predifines one 
of this category. + */ + GET_TYPE(get_c_integer_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_logical_ubcl_type, origin_dt, &data_type); + + return OMPI_ERR_BAD_PARAM; + +got_type: + return ubcl_error_to_ompi(ubcl_win_build_op(ubcl_op, data_type, ubcl_operator)); + +not_implemented: + return OMPI_ERR_NOT_IMPLEMENTED; +} + +static int build_ubcl_bitwise_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op, + ubcl_win_op_t *ubcl_op) +{ + ubcl_win_atomic_operator_t ubcl_operator; + ubcl_win_atomic_datatype_t data_type = UBCL_TYPE_NONE; + + if (MPI_BAND == op) { + ubcl_operator = UBCL_BAND; + } else if (MPI_BOR == op) { + ubcl_operator = UBCL_BOR; + } else if (MPI_BXOR == op) { + ubcl_operator = UBCL_BXOR; + } else { + return OMPI_ERR_BAD_PARAM; + } + + /* This macro calls goto on got_type or not_implemented labels if the + * datatype is one of the predifines one of this category. + */ + GET_TYPE(get_c_integer_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_fortran_integer_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_byte_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_multi_language_ubcl_type, origin_dt, &data_type); + + return OMPI_ERR_BAD_PARAM; + +got_type: + return ubcl_error_to_ompi(ubcl_win_build_op(ubcl_op, data_type, ubcl_operator)); + +not_implemented: + return OMPI_ERR_NOT_IMPLEMENTED; +} + +static int build_ubcl_fake_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op, + ubcl_win_op_t *ubcl_op) +{ + int ret = OMPI_SUCCESS; + ubcl_win_atomic_operator_t ubcl_operator; + ubcl_win_atomic_datatype_t data_type = UBCL_TYPE_NONE; + ubcl_win_atomic_datatype_t loc_type = UBCL_TYPE_NONE; + + if (MPI_REPLACE == op) { + ubcl_operator = UBCL_REPLACE; + } else if (MPI_NO_OP == op) { + ubcl_operator = UBCL_NO_OP; + } else { + return OMPI_ERR_BAD_PARAM; + } + + /* This macro calls goto on got_type or not_implemented labels if the + * datatype is one of the predifines one of this category. 
+ */ + GET_TYPE(get_c_integer_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_fortran_integer_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_fp_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_logical_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_complex_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_byte_ubcl_type, origin_dt, &data_type); + GET_TYPE(get_multi_language_ubcl_type, origin_dt, &data_type); + + ret = get_pair_ubcl_type(origin_dt, &data_type); + if (OMPI_SUCCESS != ret) { + return ret; + } + ret = get_ubcl_int_type(sizeof(int), true, &loc_type); + if (OMPI_SUCCESS != ret) { + return ret; + } + + +got_type: + return ubcl_error_to_ompi(ubcl_win_build_loc_op(ubcl_op, data_type, ubcl_operator, loc_type)); + +not_implemented: + return OMPI_ERR_NOT_IMPLEMENTED; +} + +static int build_ubcl_op(struct ompi_datatype_t *origin_dt, struct ompi_op_t *op, + ubcl_win_op_t *ubcl_op) +{ + if (MPI_MAXLOC == op || MPI_MINLOC == op) { + return build_ubcl_loc_op(origin_dt, op, ubcl_op); + } else if (MPI_MAX == op || MPI_MIN == op) { + return build_ubcl_minmax_op(origin_dt, op, ubcl_op); + } else if (MPI_SUM == op || MPI_PROD == op) { + return build_ubcl_arithmetic_op(origin_dt, op, ubcl_op); + } else if (MPI_LAND == op || MPI_LOR == op || MPI_LXOR == op) { + return build_ubcl_logical_op(origin_dt, op, ubcl_op); + } else if (MPI_BAND == op || MPI_BOR == op || MPI_BXOR == op) { + return build_ubcl_bitwise_op(origin_dt, op, ubcl_op); + } else if (MPI_REPLACE == op || MPI_NO_OP == op) { + return build_ubcl_fake_op(origin_dt, op, ubcl_op); + } else { + return OMPI_ERR_BAD_PARAM; + } +} + +int ompi_osc_ubcl_accumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp, + int target_count, struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, struct ompi_win_t *win) +{ + return ompi_osc_ubcl_raccumulate(origin_addr, origin_count, origin_dt, target, target_disp, + target_count, target_dt, op, win, 
NULL); +} + +int ompi_osc_ubcl_raccumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp, + int target_count, struct ompi_datatype_t *target_dt, + struct ompi_op_t *op, struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + return ompi_osc_ubcl_rget_accumulate(origin_addr, origin_count, origin_dt, NULL, 0, NULL, + target, target_disp, target_count, target_dt, op, win, + ompi_req); +} + +int ompi_osc_ubcl_get_accumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, void *result_addr, + int result_count, struct ompi_datatype_t *result_dt, + int target_rank, ptrdiff_t target_disp, int target_count, + struct ompi_datatype_t *target_dt, struct ompi_op_t *op, + struct ompi_win_t *win) +{ + return ompi_osc_ubcl_rget_accumulate(origin_addr, origin_count, origin_dt, result_addr, + result_count, result_dt, target_rank, target_disp, + target_count, target_dt, op, win, NULL); +} + +static int compute_aligned_iovecs_count(struct iovec **iovecs, size_t *iovecs_count, + int iovecs_to_align, size_t *aligned_iovec_count) +{ + size_t segment[iovecs_to_align]; + size_t consumed_size[iovecs_to_align]; + size_t aligned_count = 0; + + for (int i = 0; i < iovecs_to_align; i++) { + segment[i] = 0; + consumed_size[i] = 0; + } + + /* Stop when we reach the end of one iovec */ + while (true) { + size_t min_remaining_size = UINT64_MAX; + + /* Get the minimum remaining size */ + for (int i = 0; i < iovecs_to_align; i++) { + if (segment[i] >= iovecs_count[i]) { + goto end_compute_aligned_count; + } + + size_t remaining_size = iovecs[i][segment[i]].iov_len - consumed_size[i]; + + if (remaining_size < min_remaining_size) { + min_remaining_size = remaining_size; + } + } + + /* Consume size */ + for (int i = 0; i < iovecs_to_align; i++) { + consumed_size[i] += min_remaining_size; + + if (consumed_size[i] == iovecs[i][segment[i]].iov_len) { + consumed_size[i] = 0; + segment[i]++; + } + } + 
+ aligned_count++; + } +end_compute_aligned_count: + + /* The send buffer must fit in the target buffer and the target buffer must fit + * in the fetch buffer so the send buffer must be the smallest and all its segments + * must have been consumed + */ + if (segment[0] < iovecs_count[0]) { + return OMPI_ERROR; + } + + *aligned_iovec_count = aligned_count; + + return OMPI_SUCCESS; +} + +static void compute_aligned_iovecs(struct iovec **iovecs, size_t *iovecs_count, int iovecs_to_align, + struct iovec **aligned_iovecs, size_t aligned_iovec_count) +{ + size_t segment[iovecs_to_align]; + size_t consumed_size[iovecs_to_align]; + + /* Run through iovecs a second time to fill aligned_iovecs */ + for (int i = 0; i < iovecs_to_align; i++) { + segment[i] = 0; + consumed_size[i] = 0; + } + + for (size_t seg = 0; seg < aligned_iovec_count; seg++) { + size_t min_remaining_size = UINT64_MAX; + + /* Get the minimum remaining size */ + for (int i = 0; i < iovecs_to_align; i++) { + size_t remaining_size = iovecs[i][segment[i]].iov_len - consumed_size[i]; + + if (remaining_size < min_remaining_size) { + min_remaining_size = remaining_size; + } + } + + /* Consume size */ + for (int i = 0; i < iovecs_to_align; i++) { + aligned_iovecs[i][seg].iov_base = iovecs[i][segment[i]].iov_base + consumed_size[i]; + aligned_iovecs[i][seg].iov_len = min_remaining_size; + + consumed_size[i] += min_remaining_size; + + if (consumed_size[i] == iovecs[i][segment[i]].iov_len) { + consumed_size[i] = 0; + segment[i]++; + } + } + } +} + +/* This function takes an array of iovec arrays with an arbitrary fragmentation, + * and allocates a new array of iovec arrays describing the same memory areas but + * potentially splitted in smaller segments. + * + * All the returned iovec have the same count of fragments and the i-th element + * have the same length on each of it. + * + * If input iovec arrays have different total length, they must be provided in + * total length increasing order. 
+ * In this case, iovecs are truncated according to the smallest one. + * An error is raised if the smallest one is not the first one. + */ +static int align_iovecs(struct iovec **iovecs, size_t *iovecs_count, int iovecs_to_align, + struct iovec **aligned_iovecs, size_t *aligned_iovec_count) +{ + size_t aligned_count = 0; + int ret; + + ret = compute_aligned_iovecs_count(iovecs, iovecs_count, iovecs_to_align, &aligned_count); + if (OMPI_SUCCESS != ret) { + return ret; + } + + /* Allocate aligned_iovecs */ + for (int i = 0; i < iovecs_to_align; i++) { + aligned_iovecs[i] = (struct iovec *) malloc(aligned_count * sizeof(struct iovec)); + } + + compute_aligned_iovecs(iovecs, iovecs_count, iovecs_to_align, aligned_iovecs, aligned_count); + + *aligned_iovec_count = aligned_count; + + return OMPI_SUCCESS; +} + +static struct ompi_datatype_t *segmented_rget_get_base_datatype(struct ompi_datatype_t *origin_dt, + struct ompi_datatype_t *target_dt, + struct ompi_datatype_t *result_dt, + struct ompi_op_t *op) +{ + struct ompi_datatype_t *base_datatype; + + /* Get predefined datatype used to build target_dt */ + base_datatype = ompi_datatype_get_single_predefined_type_from_args(target_dt); + if (NULL == base_datatype) { + /* Null means more than one, not allowed */ + return NULL; + } + + /* Ensure origin_dt and result_dt are made in the same wood as target_dt */ + if (MPI_NO_OP != op + && base_datatype != ompi_datatype_get_single_predefined_type_from_args(origin_dt)) { + return NULL; + } + if (NULL != result_dt + && base_datatype != ompi_datatype_get_single_predefined_type_from_args(result_dt)) { + return NULL; + } + + return base_datatype; +} + +static int segmented_rget_build_aligned_iovecs( + const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, void *result_addr, + int result_count, struct ompi_datatype_t *result_dt, int target_rank, ptrdiff_t target_disp, + int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, ompi_proc_t 
*proc, + mca_osc_ubcl_module_t *module, struct iovec *aligned_iovec[3], size_t *aligned_iovec_count) +{ + int ret; + int64_t disp_unit; + struct iovec *base_iovec[3] = {NULL, NULL, NULL}; + size_t base_iovec_count[3] = {0, 0, 0}; + disp_unit = osc_ubcl_get_disp_unit(module, target_rank); + + if (MPI_NO_OP != op) { + /* Build origin iovec based on origin addr/count/datatype */ + ret = osc_ubcl_build_ddt_iov(origin_addr, proc, origin_count, origin_dt, &base_iovec[0], + &base_iovec_count[0]); + if (OMPI_SUCCESS != ret) { + goto error; + } + } + + /* Build target iovec with relative offsets in the target window */ + ret = osc_ubcl_build_ddt_iov((void *) (target_disp * disp_unit), proc, target_count, target_dt, + &base_iovec[1], &base_iovec_count[1]); + if (OMPI_SUCCESS != ret) { + goto error; + } + + if (NULL != result_dt) { + /* Build result iovec based on result addr/count/datatype */ + ret = osc_ubcl_build_ddt_iov(result_addr, proc, result_count, result_dt, &base_iovec[2], + &base_iovec_count[2]); + if (OMPI_SUCCESS != ret) { + goto error; + } + if (MPI_NO_OP == op) { + /* No origin iovec to align */ + ret = align_iovecs(&base_iovec[1], &base_iovec_count[1], 2, &aligned_iovec[1], + aligned_iovec_count); + } else { + ret = align_iovecs(base_iovec, base_iovec_count, 3, aligned_iovec, aligned_iovec_count); + } + + /* TODO: compute additionnal no_op segments if target buffer is larger than origin buffer */ + } else { + if (MPI_NO_OP == op) { + ret = OMPI_ERR_BAD_PARAM; + goto error; + } + /* No result iovec to align */ + ret = align_iovecs(base_iovec, base_iovec_count, 2, aligned_iovec, aligned_iovec_count); + } + + if (OMPI_SUCCESS != ret) { + goto error; + } + + ret = OMPI_SUCCESS; + +error: + free(base_iovec[0]); + free(base_iovec[1]); + free(base_iovec[2]); + return ret; +} + +static int segmented_rget_accumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, void *result_addr, + int result_count, struct ompi_datatype_t *result_dt, + 
int target_rank, ptrdiff_t target_disp, int target_count, + struct ompi_datatype_t *target_dt, struct ompi_op_t *op, + struct ompi_win_t *win, struct ompi_request_t **ompi_req) +{ + int ret; + mca_osc_ubcl_module_t *module; + ompi_proc_t *proc; + mca_common_ubcl_endpoint_t *endpoint; + struct iovec *aligned_iovec[3] = {NULL, NULL, NULL}; + size_t aligned_iovec_count; + struct ompi_datatype_t *base_datatype; + ubcl_win_op_t ubcl_op; + mca_osc_ubcl_request_t *req; + ubcl_completion_callback_fct cb; + void *cb_data; + size_t base_dt_size; + + module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + /* Get base datatype to build operation */ + base_datatype = segmented_rget_get_base_datatype(origin_dt, target_dt, result_dt, op); + if (NULL == base_datatype) { + return OMPI_ERR_BAD_PARAM; + } + ret = ompi_datatype_type_size(base_datatype, &base_dt_size); + if (OMPI_SUCCESS != ret) { + goto error; + } + ret = build_ubcl_op(base_datatype, op, &ubcl_op); + if (OMPI_SUCCESS != ret) { + goto error; + } + + /* Get proc */ + proc = ompi_group_peer_lookup(win->w_group, target_rank); + if (OPAL_UNLIKELY(NULL == proc)) { + return OMPI_ERR_BAD_PARAM; + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + + /* Compute accumulate segmentation into contiguous parts */ + ret = segmented_rget_build_aligned_iovecs(origin_addr, origin_count, origin_dt, result_addr, + result_count, result_dt, target_rank, target_disp, + target_count, target_dt, op, proc, module, + aligned_iovec, &aligned_iovec_count); + if (OMPI_SUCCESS != ret) { + goto error; + } + + /* Build the request if needed */ + if (NULL == ompi_req) { + req = NULL; + cb = NULL; + cb_data = NULL; + } else { + req = (mca_osc_ubcl_request_t *) opal_free_list_get(&mca_osc_ubcl_component.req_free_list); + if (OPAL_UNLIKELY(NULL == req)) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto error; + } + + MCA_OSC_UBCL_REQUEST_INIT(req, target_rank, (struct ompi_datatype_t *) NULL, + (struct 
ompi_datatype_t *) NULL, win, true); + + *ompi_req = &req->ompi_req; + cb = ubcl_request_complete_cb; + cb_data = req; + req->segment_count = aligned_iovec_count; + } + + for (size_t i = 0; i < aligned_iovec_count; i++) { + void *sbuf; + void *fetch_buf; + ptrdiff_t offset; + size_t count; + ubcl_error_t err; + + /* Check if there is data to send */ + if (MPI_NO_OP == op) { + sbuf = NULL; + } else { + sbuf = aligned_iovec[0][i].iov_base; + assert(aligned_iovec[0][i].iov_len == aligned_iovec[1][i].iov_len); + } + + /* Target buffer offsetn in bytes, relative to the window base */ + offset = (ptrdiff_t) aligned_iovec[1][i].iov_base; + + /* Check if there is data to fetch */ + if (NULL != result_dt) { + fetch_buf = aligned_iovec[2][i].iov_base; + assert(aligned_iovec[1][i].iov_len == aligned_iovec[2][i].iov_len); + } else { + fetch_buf = NULL; + } + + /* Count in terms of base datatypes in this segment */ + count = aligned_iovec[1][i].iov_len / base_dt_size; + + /* Submit contiguous operation to ubcl */ + err = ubcl_accumulate(sbuf, fetch_buf, count, endpoint->rank, offset, &ubcl_op, module->wid, + cb, cb_data); + ret = ubcl_error_to_ompi(err); + if (OMPI_SUCCESS != ret) { + if (0 == i && NULL != req) { + /* This is the first segment, we can have a clean fail */ + opal_free_list_return(&mca_osc_ubcl_component.req_free_list, &req->super); + } else { + /* Some segments have already been sent, we are in a really bad satuation */ + mca_osc_ubcl_error(ret, + "Fail to send fragment %zu in an accumulate " + "operation segmented in %zu parts. 
" + "This error is not recoverable\n", + i, aligned_iovec_count); + } + goto error; + } + } + + ret = OMPI_SUCCESS; + +error: + free(aligned_iovec[0]); + free(aligned_iovec[1]); + free(aligned_iovec[2]); + return ret; +} + +int ompi_osc_ubcl_rget_accumulate(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, void *result_addr, + int result_count, struct ompi_datatype_t *result_dt, + int target_rank, ptrdiff_t target_disp, int target_count, + struct ompi_datatype_t *target_dt, struct ompi_op_t *op, + struct ompi_win_t *win, struct ompi_request_t **ompi_req) +{ + ubcl_error_t err; + int ret; + int64_t disp_unit; + mca_common_ubcl_endpoint_t *endpoint; + mca_osc_ubcl_module_t *module; + ompi_proc_t *proc; + ubcl_win_op_t ubcl_op; + ptrdiff_t remote_offset; + ubcl_completion_callback_fct cb; + void *cb_data; + struct ompi_datatype_t *dt; + size_t count; + ptrdiff_t origin_size; + ptrdiff_t target_size; + ptrdiff_t gap; + + module = (mca_osc_ubcl_module_t *) win->w_osc_module; + disp_unit = osc_ubcl_get_disp_unit(module, target_rank); + + if (MPI_NO_OP != op) { + origin_size = opal_datatype_span((const opal_datatype_t *) origin_dt, origin_count, &gap); + } else { + origin_size = 0; + } + target_size = opal_datatype_span((const opal_datatype_t *) target_dt, target_count, &gap); + (void) gap; + + if (0 == target_size || (NULL == result_dt && 0 == origin_size)) { + if (NULL != ompi_req) { + *ompi_req = &ompi_request_empty; + } + return OMPI_SUCCESS; + } + + if ((MPI_NO_OP != op && !ompi_datatype_is_predefined(origin_dt)) + || !ompi_datatype_is_predefined(target_dt) + || (NULL != result_dt && !ompi_datatype_is_predefined(result_dt))) { + /* Let's take the hard way */ + return segmented_rget_accumulate(origin_addr, origin_count, origin_dt, result_addr, + result_count, result_dt, target_rank, target_disp, + target_count, target_dt, op, win, ompi_req); + } + + /* Get proc */ + proc = ompi_group_peer_lookup(win->w_group, target_rank); + if 
(OPAL_UNLIKELY(NULL == proc)) { + return OMPI_ERR_BAD_PARAM; + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + if (OMPI_SUCCESS != ompi_osc_ubcl_check_access_epoch(target_rank, win)) { + return OMPI_ERR_RMA_CONFLICT; + } + + if (MPI_NO_OP == op) { + dt = target_dt; + count = target_count; + origin_addr = NULL; + } else { + dt = origin_dt; + count = origin_count; + } + + ret = build_ubcl_op(dt, op, &ubcl_op); + if (OMPI_SUCCESS != ret) { + return ret; + } + + if (NULL == ompi_req) { + cb = NULL; + cb_data = NULL; + } else { + mca_osc_ubcl_request_t *req; + req = (mca_osc_ubcl_request_t *) opal_free_list_get(&mca_osc_ubcl_component.req_free_list); + if (OPAL_UNLIKELY(NULL == req)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + MCA_OSC_UBCL_REQUEST_INIT(req, target_rank, (struct ompi_datatype_t *) NULL, + (struct ompi_datatype_t *) NULL, win, true); + + *ompi_req = &req->ompi_req; + cb = ubcl_request_complete_cb; + cb_data = req; + } + + remote_offset = target_disp * disp_unit; + + /* TODO: handle non contiguous datatypes as MPI seems to allow some of it */ + err = ubcl_accumulate((void *) origin_addr, result_addr, count, endpoint->rank, + remote_offset, &ubcl_op, module->wid, cb, cb_data); + + ret = ubcl_error_to_ompi(err); + + if (OMPI_SUCCESS != ret && NULL != cb_data) { + opal_free_list_return(&mca_osc_ubcl_component.req_free_list, cb_data); + } + + return ret; +} + +int ompi_osc_ubcl_fetch_and_op(const void *origin_addr, void *result_addr, + struct ompi_datatype_t *dt, int target, ptrdiff_t target_disp, + struct ompi_op_t *op, struct ompi_win_t *win) +{ + if (! 
ompi_datatype_is_predefined(dt)) { + return OMPI_ERR_BAD_PARAM; + } + return ompi_osc_ubcl_get_accumulate(origin_addr, 1, dt, result_addr, 1, dt, target, target_disp, + 1, dt, op, win); +} + +int ompi_osc_ubcl_compare_and_swap(const void *origin_addr, const void *compare_addr, + void *result_addr, struct ompi_datatype_t *dt, int target, + ptrdiff_t target_disp, struct ompi_win_t *win) +{ + ubcl_win_atomic_datatype_t data_type = UBCL_TYPE_NONE; + int64_t disp_unit; + mca_osc_ubcl_module_t *module; + ompi_proc_t *proc; + mca_common_ubcl_endpoint_t *endpoint; + ubcl_error_t err; + + module = (mca_osc_ubcl_module_t *) win->w_osc_module; + disp_unit = osc_ubcl_get_disp_unit(module, target); + + GET_TYPE(get_c_integer_ubcl_type, dt, &data_type); + GET_TYPE(get_fortran_integer_ubcl_type, dt, &data_type); + GET_TYPE(get_logical_ubcl_type, dt, &data_type); + GET_TYPE(get_byte_ubcl_type, dt, &data_type); + GET_TYPE(get_multi_language_ubcl_type, dt, &data_type); + + return OMPI_ERR_BAD_PARAM; + +got_type: + /* Get proc */ + proc = ompi_group_peer_lookup(win->w_group, target); + if (OPAL_UNLIKELY(NULL == proc)) { + return OMPI_ERR_BAD_PARAM; + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + if (OMPI_SUCCESS != ompi_osc_ubcl_check_access_epoch(target, win)) { + return OMPI_ERR_RMA_CONFLICT; + } + + err = ubcl_cas(origin_addr, compare_addr, result_addr, data_type, endpoint->rank, + target_disp * disp_unit, module->wid, NULL, NULL); + + return ubcl_error_to_ompi(err); + +not_implemented: + return OMPI_ERR_NOT_IMPLEMENTED; +} diff --git a/ompi/mca/osc/ubcl/osc_ubcl_datatype.c b/ompi/mca/osc/ubcl/osc_ubcl_datatype.c new file mode 100644 index 00000000000..b81bbacf3dc --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_datatype.c @@ -0,0 +1,86 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * OSC/UBCL datatype and convertor related functions + * + */ + +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h" + +size_t osc_ubcl_datatype_pack(void *pack_buf, const void *usr_handle, size_t pack_size, + size_t offset) +{ + opal_convertor_t *convertor = (opal_convertor_t *) usr_handle; + + /* Set input data size and start pointer. */ + uint32_t iov_count = 1; + int ret = 0; + struct iovec iov; + iov.iov_len = pack_size; + iov.iov_base = (IOVBASE_TYPE *) pack_buf; + + opal_convertor_set_position(convertor, &offset); + + /* Pack data from converter to iov */ + ret = opal_convertor_pack(convertor, &iov, &iov_count, &pack_size); + if (-1 == ret) { + mca_osc_ubcl_error(ret, "opal_convertor_unpack failed\n"); + } + + return pack_size; +} + +size_t osc_ubcl_datatype_unpack(void *usr_handle, const void *pack_buf, size_t pack_size, + size_t offset) +{ + opal_convertor_t *convertor = (opal_convertor_t *) usr_handle; + + /* Set input data size and start pointer. 
*/ + uint32_t iov_count = 1; + int ret = 0; + struct iovec iov; + iov.iov_len = pack_size; + iov.iov_base = (IOVBASE_TYPE *) pack_buf; + + opal_convertor_set_position(convertor, &offset); + + /* Pack data from converter to iov */ + ret = opal_convertor_unpack(convertor, &iov, &iov_count, &pack_size); + if (-1 == ret) { + mca_osc_ubcl_error(ret, "opal_convertor_unpack failed\n"); + } + + return pack_size; +} + +size_t osc_ubcl_datatype_mem_size(const void *usr_handle, size_t offset) +{ + opal_convertor_t *convertor = (opal_convertor_t *) usr_handle; + size_t size = 0; + + opal_datatype_type_size(convertor->pDesc, &size); + + if (offset > size * convertor->count) { + return 0; + } + + return size * convertor->count - offset; +} + +void osc_ubcl_datatype_finish(void *usr_handle) +{ + (void) usr_handle; + return; +} diff --git a/ompi/mca/osc/ubcl/osc_ubcl_get.c b/ompi/mca/osc/ubcl/osc_ubcl_get.c new file mode 100644 index 00000000000..76b7a47bfee --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_get.c @@ -0,0 +1,167 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_info.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_request.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" + +int ompi_osc_ubcl_get(void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, int target, + ptrdiff_t target_disp, int target_count, + struct ompi_datatype_t *target_dt, struct ompi_win_t *win) +{ + return ompi_osc_ubcl_rget(origin_addr, origin_count, origin_dt, + target, target_disp, target_count, target_dt, + win, NULL); +} + +int ompi_osc_ubcl_rget(void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, int target, + ptrdiff_t target_disp, int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + ubcl_error_t err = 0; + int ret = OMPI_SUCCESS; + int64_t disp_unit; + ptrdiff_t gap; + size_t span; + size_t target_span; + size_t target_iov_count; + struct iovec *target_iov; + void *target_addr; + mca_common_ubcl_endpoint_t *endpoint; + ubcl_memory_descriptor_t sbuf_md; + mca_osc_ubcl_module_t *module; + mca_osc_ubcl_request_t *osc_req; + + module = (mca_osc_ubcl_module_t *) win->w_osc_module; + disp_unit = osc_ubcl_get_disp_unit(module, target); + + OPAL_OUTPUT_VERBOSE( + (50, mca_osc_ubcl_component.output, "UBCL_OSC_GET to window %lu\n", module->wid)); + + /* Get proc */ + ompi_proc_t *proc; + proc = ompi_group_peer_lookup_existing(win->w_group, target); + if (OPAL_UNLIKELY(NULL == proc)) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Unknown rank %d on window %d", target, module->wid); + goto exit; + } + + target_span = opal_datatype_span((const opal_datatype_t *) target_dt, target_count, &gap); + if (0 == target_span) { + if (NULL != ompi_req) { + *ompi_req = &ompi_request_empty; + } + return 
OMPI_SUCCESS; + } + (void) gap; + + /* We retrieve endpoints created by the PML at init */ + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + + /* Allocate an OSC request */ + osc_req = (mca_osc_ubcl_request_t *) opal_free_list_get(&mca_osc_ubcl_component.req_free_list); + if (OPAL_UNLIKELY(NULL == osc_req)) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + mca_osc_ubcl_warn(ret, "Not enough memory to allocate an OSC request"); + goto exit; + } + if (NULL != ompi_req) { + MCA_OSC_UBCL_REQUEST_INIT(osc_req, target, origin_dt, target_dt, win, true); + *ompi_req = &osc_req->ompi_req; + } else { + MCA_OSC_UBCL_REQUEST_INIT(osc_req, target, origin_dt, target_dt, win, false); + } + + /* Init UBCL MD */ + err = ubcl_memory_descriptor_init(&sbuf_md); + if (UBCL_SUCCESS != err) { + /* This should never happen: ubcl_memory_descriptor_init just assign values */ + mca_osc_ubcl_error(ubcl_error_to_ompi(err), "Failed to initialize ubcl MD"); + } + + /* If we don't need to pack we can build a contiguous */ + if (ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count)) { + span = opal_datatype_span((const opal_datatype_t *) origin_dt, origin_count, &gap); + err = ubcl_memory_descriptor_build_contiguous(((char *) origin_addr) + gap, span, &sbuf_md); + if (UBCL_SUCCESS != err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build contiguous memory descriptor for input buffer"); + } + } + + /* Always build a custom MD representation so that we have a fallback */ + opal_convertor_copy_and_prepare_for_recv(proc->super.proc_convertor, &origin_dt->super, + origin_count, origin_addr, 0, + &(osc_req->origin_convertor)); + + if (opal_convertor_on_device(&osc_req->origin_convertor)) { + opal_free_list_return(&mca_osc_ubcl_component.req_free_list, &(osc_req->super)); + mca_osc_ubcl_warn(OPAL_ERR_NOT_SUPPORTED, "GPU buffer not supported by osc/ubcl"); + ret = OPAL_ERR_NOT_SUPPORTED; + goto exit; + } + + err = 
ubcl_memory_descriptor_build_custom((void *) &(osc_req->origin_convertor), + osc_ubcl_datatype_pack, osc_ubcl_datatype_unpack, + osc_ubcl_datatype_mem_size, osc_ubcl_datatype_finish, + &sbuf_md); + + if (UBCL_SUCCESS != err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build custom memory descriptor for input buffer"); + } + + /* We need to build the iovec to describe the memory representation at the target */ + target_iov = NULL; + target_iov_count = 0; + target_addr = (void *) (uintptr_t) (target_disp * disp_unit); + if (ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) { + target_iov_count = 1; + target_iov = (struct iovec *) malloc(target_iov_count * sizeof(struct iovec)); + + span = opal_datatype_span((const opal_datatype_t *) target_dt, target_count, &gap); + target_iov[0].iov_base = target_addr + gap; + target_iov[0].iov_len = span; + } else { + int ret = OMPI_SUCCESS; + ret = osc_ubcl_build_ddt_iov(target_addr, proc, target_count, target_dt, &target_iov, + &target_iov_count); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + MCA_OSC_UBCL_REQUEST_FINI(osc_req); + opal_free_list_return(&mca_osc_ubcl_component.req_free_list, + (opal_free_list_item_t *) osc_req); + return ret; + } + } + + err = ubcl_get(sbuf_md, target_iov, target_iov_count, endpoint->rank, module->wid, + ubcl_request_complete_cb, osc_req); + + free(target_iov); + + if (UBCL_SUCCESS != err) { + MCA_OSC_UBCL_REQUEST_FINI(osc_req); + opal_free_list_return(&mca_osc_ubcl_component.req_free_list, + (opal_free_list_item_t *) osc_req); + mca_osc_ubcl_error(ubcl_error_to_ompi(err), "Failed to send data"); + } + +exit: + return ret; +} diff --git a/ompi/mca/osc/ubcl/osc_ubcl_info.c b/ompi/mca/osc/ubcl/osc_ubcl_info.c new file mode 100644 index 00000000000..40601fc4f6b --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_info.c @@ -0,0 +1,117 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi/mca/osc/ubcl/osc_ubcl_info.h" +#include "osc_ubcl_utils.h" + +static void update_same_disp_unit_info(mca_osc_ubcl_module_t *module, bool value) +{ + if (value != module->same_disp_unit){ + int my_rank; + int my_disp; + + my_rank = ompi_comm_rank(module->comm); + my_disp = osc_ubcl_get_disp_unit(module, my_rank); + + /* Disp_unit array need to be freed or allocated */ + osc_ubcl_fini_disp_unit(module); + module->same_disp_unit = value; + osc_ubcl_sync_disp_unit(module, my_disp, false); + } +} + +static const char* update_local_copy(opal_infosubscriber_t *obj, const char *key, const char *value) +{ + bool bval; + mca_osc_ubcl_module_t *module; + struct ompi_win_t *win = (struct ompi_win_t*) obj; + module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + bval = opal_str_to_bool(value); + mca_osc_ubcl_log(20, "%s updated to %s", key, value); + + if(0 == strcmp(key, "no_locks")) { + module->no_locks = bval; + } else if(0 == strcmp(key, "same_disp_unit")) { + update_same_disp_unit_info(module, bval); + } + + /* Do not change the official value. We just needed to update our copy */ + return value; +} + +static bool get_win_info_bool(struct ompi_win_t *win, char *info_name) +{ + bool ret = false; + int found; + + opal_info_get_bool(win->super.s_info, info_name, &ret, &found); + return ret; +} + +int osc_ubcl_read_info(struct opal_info_t *info, struct ompi_win_t *win) +{ + mca_osc_ubcl_module_t *module; + + /* Windows inherits from opal_infosubscriber_t class. 
Use it to keep + * duplicated value up-to-date */ + opal_infosubscribe_subscribe(&win->super, "no_locks", "false", update_local_copy); + opal_infosubscribe_subscribe(&win->super, "same_disp_unit", "false", update_local_copy); + + module = (mca_osc_ubcl_module_t *) win->w_osc_module; + module->no_locks = get_win_info_bool(module->win, "no_locks"); + module->same_disp_unit = get_win_info_bool(module->win, "same_disp_unit"); + + return OMPI_SUCCESS; +} + +int osc_ubcl_get_disp_unit(mca_osc_ubcl_module_t *module, int target) +{ + if (module->same_disp_unit) { + return module->disp_unit.uniq; + } else { + return module->disp_unit.all[target]; + } +} + +int osc_ubcl_sync_disp_unit(mca_osc_ubcl_module_t *module, int disp_unit, bool need_synchro) +{ + int ret = OMPI_SUCCESS; + + if(! module->same_disp_unit) { + int comm_size = ompi_comm_size(module->comm); + int my_rank = ompi_comm_rank(module->comm); + module->disp_unit.all = malloc(comm_size * sizeof(int)); + if (NULL == module->disp_unit.all) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + module->disp_unit.all[my_rank] = disp_unit; + ret = module->comm->c_coll->coll_allgather(&disp_unit, 1, MPI_INT, module->disp_unit.all, 1, + MPI_INT, module->comm, + module->comm->c_coll->coll_allgather_module); + } else if (need_synchro) { + module->disp_unit.uniq = disp_unit; + ret = module->comm->c_coll->coll_barrier(module->comm, + module->comm->c_coll->coll_barrier_module); + } + +exit: + return ret; +} + +void osc_ubcl_fini_disp_unit(mca_osc_ubcl_module_t *module) +{ + if(! module->same_disp_unit) { + free(module->disp_unit.all); + module->disp_unit.all = NULL; + } +} diff --git a/ompi/mca/osc/ubcl/osc_ubcl_info.h b/ompi/mca/osc/ubcl/osc_ubcl_info.h new file mode 100644 index 00000000000..16679df1b49 --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_info.h @@ -0,0 +1,24 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OSC_UBCL_INFO_H +#define MCA_OSC_UBCL_INFO_H + +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "opal/util/info.h" +#include "ompi/win/win.h" + +int osc_ubcl_read_info(struct opal_info_t *info, struct ompi_win_t *win); +int osc_ubcl_sync_disp_unit(mca_osc_ubcl_module_t *module, int disp_unit, bool need_synchro); +int osc_ubcl_get_disp_unit(mca_osc_ubcl_module_t *module, int target); +void osc_ubcl_fini_disp_unit(mca_osc_ubcl_module_t *module); + +#endif /* MCA_OSC_UBCL_INFO_H */ diff --git a/ompi/mca/osc/ubcl/osc_ubcl_put.c b/ompi/mca/osc/ubcl/osc_ubcl_put.c new file mode 100644 index 00000000000..20a54fe0dcf --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_put.c @@ -0,0 +1,169 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + * Bull eXtreme Interconnect OSC API implementation. + * + * Implementation of API defined in osc.h. To see parameters and return values + * of these functions, refer to ompi/mca/osc/osc.h. 
+ */ + +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_info.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_request.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_sync.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" + +int ompi_osc_ubcl_put(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, int target, + ptrdiff_t target_disp, int target_count, + struct ompi_datatype_t *target_dt, struct ompi_win_t *win) +{ + return ompi_osc_ubcl_rput(origin_addr, origin_count, origin_dt, + target, target_disp, target_count, target_dt, + win, NULL); +} + +int ompi_osc_ubcl_rput(const void *origin_addr, int origin_count, + struct ompi_datatype_t *origin_dt, int target, + ptrdiff_t target_disp, int target_count, + struct ompi_datatype_t *target_dt, + struct ompi_win_t *win, + struct ompi_request_t **ompi_req) +{ + ubcl_error_t err = 0; + int ret = OMPI_SUCCESS; + int64_t disp_unit; + ptrdiff_t gap; + size_t span; + size_t target_iov_count; + struct iovec *target_iov; + void *target_addr; + mca_common_ubcl_endpoint_t *endpoint; + ubcl_memory_descriptor_t sbuf_md; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + mca_osc_ubcl_request_t *osc_req; + + disp_unit = osc_ubcl_get_disp_unit(module, target); + OPAL_OUTPUT_VERBOSE( + (50, mca_osc_ubcl_component.output, "UBCL_OSC_PUT to window %lu\n", module->wid)); + + /* Get proc */ + ompi_proc_t *proc; + proc = ompi_group_peer_lookup_existing(win->w_group, target); + if (OPAL_UNLIKELY(NULL == proc)) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Unknown rank %d on window %d", target, module->wid); + goto exit; + } + + span = opal_datatype_span((const opal_datatype_t *) origin_dt, origin_count, &gap); + if (0 == span) { + if (NULL != ompi_req) { + *ompi_req = &ompi_request_empty; + } + return OMPI_SUCCESS; + } + + /* We retrieve endpoints created by the PML at 
init */ + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + if (OMPI_SUCCESS != ompi_osc_ubcl_check_access_epoch(target, win)) { + return OMPI_ERR_RMA_CONFLICT; + } + + /* Allocate an OSC request */ + osc_req = (mca_osc_ubcl_request_t *) opal_free_list_get(&mca_osc_ubcl_component.req_free_list); + if (OPAL_UNLIKELY(NULL == osc_req)) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + mca_osc_ubcl_warn(ret, "Not enough memory to allocate an OSC request"); + goto exit; + } + if (NULL != ompi_req) { + MCA_OSC_UBCL_REQUEST_INIT(osc_req, target, origin_dt, target_dt, win, true); + *ompi_req = &osc_req->ompi_req; + } else { + MCA_OSC_UBCL_REQUEST_INIT(osc_req, target, origin_dt, target_dt, win, false); + } + + /* Init UBCL MD */ + err = ubcl_memory_descriptor_init(&sbuf_md); + if (UBCL_SUCCESS != err) { + /* This should never happen: ubcl_memory_descriptor_init just assign values */ + mca_osc_ubcl_error(ubcl_error_to_ompi(err), "Failed to initialize ubcl MD"); + } + + /* If we don't need to pack we can build a contiguous */ + if (ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count)) { + err = ubcl_memory_descriptor_build_contiguous(((char *) origin_addr) + gap, span, &sbuf_md); + if (UBCL_SUCCESS != err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build contiguous memory descriptor for input buffer"); + } + } + + /* Always build a custom MD representation so that we have a fallback */ + opal_convertor_copy_and_prepare_for_send(proc->super.proc_convertor, &origin_dt->super, + origin_count, origin_addr, 0, + &(osc_req->origin_convertor)); + + if (opal_convertor_on_device(&osc_req->origin_convertor)) { + opal_free_list_return(&mca_osc_ubcl_component.req_free_list, &(osc_req->super)); + mca_osc_ubcl_warn(OPAL_ERR_NOT_SUPPORTED, "GPU buffer not supported by osc/ubcl"); + ret = OPAL_ERR_NOT_SUPPORTED; + goto exit; + } + + err = ubcl_memory_descriptor_build_custom((void *) &(osc_req->origin_convertor), + 
osc_ubcl_datatype_pack, osc_ubcl_datatype_unpack, + osc_ubcl_datatype_mem_size, osc_ubcl_datatype_finish, + &sbuf_md); + + if (UBCL_SUCCESS != err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build custom memory descriptor for input buffer"); + } + + /* We need to build the iovec to describe the memory representation at the target */ + target_iov = NULL; + target_iov_count = 0; + target_addr = (void *) (uintptr_t) (target_disp * disp_unit); + if (ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) { + target_iov_count = 1; + target_iov = (struct iovec *) malloc(target_iov_count * sizeof(struct iovec)); + + span = opal_datatype_span((const opal_datatype_t *) target_dt, target_count, &gap); + target_iov[0].iov_base = target_addr + gap; + target_iov[0].iov_len = span; + } else { + int ret = OMPI_SUCCESS; + ret = osc_ubcl_build_ddt_iov(target_addr, proc, target_count, target_dt, &target_iov, + &target_iov_count); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } + + err = ubcl_put(sbuf_md, target_iov, target_iov_count, endpoint->rank, module->wid, + ubcl_request_complete_cb, osc_req); + + free(target_iov); + + if (UBCL_SUCCESS != err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(err), "Failed to send data"); + } + +exit: + return ret; +} diff --git a/ompi/mca/osc/ubcl/osc_ubcl_request.c b/ompi/mca/osc/ubcl/osc_ubcl_request.c new file mode 100644 index 00000000000..37a1e2820b0 --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_request.c @@ -0,0 +1,110 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_request.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" + +#define container_of(ptr, type, member) ((type *) ((char *) (ptr) -offsetof(type, member))) + +/* Should be filtered out by MPI_Start based on request->type, but maybe not + * by MPI_Startall */ +static int osc_ubcl_request_start(size_t count, struct ompi_request_t **requests) +{ + (void) count; + (void) requests; + return MPI_ERR_REQUEST; +} + +static int osc_ubcl_request_free(struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_osc_ubcl_component.output, + "OSC/UBCL REQUEST_FINALIZE BEGIN osc_req=%p\n", + request)); + mca_osc_ubcl_request_t *req; + opal_free_list_item_t * item; + + req = container_of((*request), mca_osc_ubcl_request_t, ompi_req); + item = (opal_free_list_item_t *) req; + + if (!REQUEST_COMPLETE(&(req)->ompi_req)) { + abort(); + } + + *request = MPI_REQUEST_NULL; + opal_free_list_return(&mca_osc_ubcl_component.req_free_list, item); + + return OMPI_SUCCESS; +} + +/* Cannot cancel osc requests */ +static int osc_ubcl_request_cancel(struct ompi_request_t *request, int complete) +{ + (void) request; + (void) complete; + return MPI_ERR_REQUEST; +} + +/* Called on free_list init during OBJ_CONSTRUCT */ +static void osc_ubcl_request_construct(mca_osc_ubcl_request_t *request) +{ + request->ompi_req.req_type = OMPI_REQUEST_WIN; + request->ompi_req.req_status._cancelled = 0; + request->ompi_req.req_free = osc_ubcl_request_free; + request->ompi_req.req_cancel = osc_ubcl_request_cancel; + request->ompi_req.req_start = osc_ubcl_request_start; +} + +/* callback privided to ubcl */ +void ubcl_request_complete_cb(ubcl_status_t status, void *cb_data) +{ + ompi_request_t *request; + mca_osc_ubcl_request_t *osc_req; + mca_osc_ubcl_module_t *module; + size_t segment_count; + 
size_t segment_acked; + + osc_req = (mca_osc_ubcl_request_t *) cb_data; + request = &osc_req->ompi_req; + module = (mca_osc_ubcl_module_t *) osc_req->win->w_osc_module; + + OPAL_OUTPUT_VERBOSE((50, mca_osc_ubcl_component.output, + "OSC/UBCL DATA TRANSFER COMPLETE mpi_req=%p\n", request)); + + mca_common_ubcl_status_to_ompi(&request->req_status, status, module->comm, -1); + if (MPI_STATUS_IGNORE != &request->req_status) { + request->req_status.MPI_ERROR = ubcl_error_to_ompi(status.status); + } + if (UBCL_SUCCESS != status.status) { + mca_osc_ubcl_error(OMPI_ERROR, "UBCL error at request completion"); + } + + segment_count = osc_req->segment_count; + segment_acked = opal_atomic_add_fetch_64((int64_t *) &osc_req->segment_ack, 1); + + if (segment_count == segment_acked) { + MCA_OSC_UBCL_REQUEST_FINI(osc_req); + ompi_request_complete(request, true); + /* Free is called once the completed request is waited/tested + * However this request comes from a non request-based call, then MPI_Wait will never be + * called so osc_ubcl_request_free must be manually called here */ + if (!osc_req->is_request_based) { + osc_ubcl_request_free(&request); + } + } +} + +OBJ_CLASS_INSTANCE(mca_osc_ubcl_request_t, + opal_free_list_item_t, + osc_ubcl_request_construct, + NULL); diff --git a/ompi/mca/osc/ubcl/osc_ubcl_request.h b/ompi/mca/osc/ubcl/osc_ubcl_request.h new file mode 100644 index 00000000000..9b744833230 --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_request.h @@ -0,0 +1,79 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OSC_UBCL_REQUEST_H +#define MCA_OSC_UBCL_REQUEST_H + +#include +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/request/request.h" + +struct mca_osc_ubcl_request_s { + opal_free_list_item_t super; + ompi_request_t ompi_req; /**< Base request */ + struct ompi_win_t *win; + + uint64_t is_request_based : 1; + uint64_t unused : 63; + + ompi_datatype_t *origin_dt; + ompi_datatype_t *target_dt; + + opal_convertor_t origin_convertor; + + /* Non contiguous accumulate are segmented */ + size_t segment_count; + + /* Track that all segments are finished before completing the user's request */ + size_t segment_ack; +}; +typedef struct mca_osc_ubcl_request_s mca_osc_ubcl_request_t; +OBJ_CLASS_DECLARATION(mca_osc_ubcl_request_t); + +/* callback required by ubcl */ +void ubcl_request_complete_cb(ubcl_status_t status, void *cb_data); + +/** + * Generic convenient macros + */ +#define MCA_OSC_UBCL_REQUEST_INIT(req, _dst, _origin_dt, _target_dt, _win, _is_request_based) \ + do { \ + OBJ_RETAIN(_win); \ + if (NULL != _origin_dt) { \ + OMPI_DATATYPE_RETAIN(_origin_dt); \ + } \ + if (NULL != _target_dt) { \ + OMPI_DATATYPE_RETAIN(_target_dt); \ + } \ + OMPI_REQUEST_INIT(&(req)->ompi_req, false); \ + (req)->ompi_req.req_state = OMPI_REQUEST_ACTIVE; \ + (req)->origin_dt = _origin_dt; \ + (req)->target_dt = _target_dt; \ + (req)->win = _win; \ + (req)->is_request_based = _is_request_based; \ + (req)->segment_count = 1; \ + (req)->segment_ack = 0; \ + OBJ_CONSTRUCT(&((req)->origin_convertor), opal_convertor_t); \ + } while (0) + +#define MCA_OSC_UBCL_REQUEST_FINI(req) \ + do { \ + OBJ_RELEASE((req)->win); \ + if (NULL != (req)->origin_dt) { \ + OMPI_DATATYPE_RELEASE((req)->origin_dt); \ + } \ + if (NULL != (req)->target_dt) { \ + OMPI_DATATYPE_RELEASE((req)->target_dt); \ + } \ + OBJ_DESTRUCT(&((req)->origin_convertor)); \ + } while (0) + +#endif //MCA_OSC_UBCL_REQUEST_H diff --git 
a/ompi/mca/osc/ubcl/osc_ubcl_sync.c b/ompi/mca/osc/ubcl/osc_ubcl_sync.c new file mode 100644 index 00000000000..b47a682feda --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_sync.c @@ -0,0 +1,788 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_utils.h" +#include "ompi/mca/osc/ubcl/osc_ubcl_sync.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" + +#define FAIL_IF_NOT_LOCKED(win, _op) \ + do { \ + if (!ompi_osc_ubcl_is_locked(win)) { \ + mca_osc_ubcl_warn(OMPI_ERR_RMA_SYNC, "Attempt %s on a non locked window", _op); \ + return OMPI_ERR_RMA_SYNC; \ + } \ + } while (0) + +static const char *osc_ubcl_sync_name(ubcl_win_sync_type_t type) +{ + switch (type) { + case UBCL_WIN_SYNC_NONE: + return "NO_SYNC"; + case UBCL_WIN_SYNC_LOCK: + return "LOCK"; + case UBCL_WIN_SYNC_LOCK_ALL: + return "LOCK_ALL"; + case UBCL_WIN_SYNC_PSCW: + return "PSCW"; + case UBCL_WIN_SYNC_FENCE: + return "FENCE"; + case UBCL_WIN_SYNC_FENCE_EPOCH: + return "FENCE_WITH_COMMUNICATIONS"; + default: + return "???"; + } +} + +static bool ompi_osc_ubcl_is_locked(struct ompi_win_t *win) +{ + mca_osc_ubcl_module_t *module; + + module = (mca_osc_ubcl_module_t *) win->w_osc_module; + return module->passive_lock_refcount || UBCL_WIN_SYNC_LOCK == module->sync_type + || UBCL_WIN_SYNC_LOCK_ALL == module->sync_type + || UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK == module->sync_type; +} + +int ompi_osc_ubcl_check_access_epoch(int target_rank, struct ompi_win_t *win) +{ + int ret; + ubcl_win_sync_type_t rank_lock_type; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + int real_rank; + + ret = OMPI_SUCCESS; + + switch (module->sync_type) { + case UBCL_WIN_SYNC_LOCK: + /* Check if there is an access epoch for this 
target */ + rank_lock_type = module->procs_sync_type[target_rank]; + if (UBCL_WIN_SYNC_NONE == rank_lock_type) { + ret = OMPI_ERR_RMA_SYNC; + mca_osc_ubcl_warn(ret, "Invalid epoch: target %d is not locked on window %s", + target_rank, win->w_name); + } + break; + case UBCL_WIN_SYNC_LOCK_ALL: + case UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK: + ret = OMPI_SUCCESS; + break; + case UBCL_WIN_SYNC_NONE: + ret = OMPI_ERR_RMA_SYNC; + mca_osc_ubcl_warn(ret, "Invalid epoch: no epoch started on window %s", win->w_name); + break; + case UBCL_WIN_SYNC_PSCW: + /* Check if there is an access epoch for this target */ + if (NULL == module->active_sync_access_group) { + ret = OMPI_ERR_RMA_SYNC; + mca_osc_ubcl_warn(ret, "Invalid epoch: no access group defined for " + "window %s in an active target epoch", win->w_name); + } else if (OMPI_SUCCESS != ompi_group_translate_ranks(win->w_group, 1, &target_rank, + module->active_sync_access_group, + &real_rank)) { + ret = OMPI_ERR_RMA_SYNC; + mca_osc_ubcl_warn(ret, "Invalid target %d for communications on window %s", + target_rank, win->w_name); + } + break; + case UBCL_WIN_SYNC_FENCE: + case UBCL_WIN_SYNC_FENCE_EPOCH: + module->sync_type = UBCL_WIN_SYNC_FENCE_EPOCH; + ret = OMPI_SUCCESS; + break; + default: + ret = OMPI_ERR_NOT_IMPLEMENTED; + break; + } + return ret; +} + +/* ==== FLUSH ==== */ + +static int osc_ubcl_flush_no_check(int target, struct ompi_win_t *win) +{ + int ret; + int ubcl_ret; + mca_common_ubcl_endpoint_t *endpoint; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + ret = OMPI_SUCCESS; + /* Get proc */ + ompi_proc_t *proc = ompi_group_peer_lookup_existing(win->w_group, target); + if (OPAL_UNLIKELY(NULL == proc)) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Unknown rank %d in window %s", target, win->w_name); + goto exit; + } + + /* We retrieve endpoints created by the PML at init */ + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + ubcl_ret = 
ubcl_flush(endpoint->rank, module->wid); + if (UBCL_SUCCESS != ubcl_ret) { + ret = ubcl_error_to_ompi(ubcl_ret); + mca_osc_ubcl_warn(ret, "ubcl_flush returned error %d", ubcl_ret); + } +exit: + return ret; +} + +int ompi_osc_ubcl_flush(int target, struct ompi_win_t *win) +{ + FAIL_IF_NOT_LOCKED(win, "flush"); + return osc_ubcl_flush_no_check(target, win); +} + +static int osc_ubcl_flush_all_no_check(struct ompi_win_t *win) +{ + int size, ret; + size = ompi_group_size(win->w_group); + + for (int i = 0; i < size; i++) { + ret = osc_ubcl_flush_no_check(i, win); + if (OMPI_SUCCESS != ret) { + return ret; + } + } + return OMPI_SUCCESS; +} + +int ompi_osc_ubcl_flush_all(struct ompi_win_t *win) +{ + FAIL_IF_NOT_LOCKED(win, "flush_all"); + return osc_ubcl_flush_all_no_check(win); +} + +int ompi_osc_ubcl_flush_local(int target, struct ompi_win_t *win) +{ + return ompi_osc_ubcl_flush(target, win); +} + +int ompi_osc_ubcl_flush_local_all(struct ompi_win_t *win) +{ + return ompi_osc_ubcl_flush_all(win); +} + + +/* ==== LOCK ==== */ + +int ompi_osc_ubcl_lock(int lock_type, int target, int assert, struct ompi_win_t *win) +{ + ompi_proc_t *proc; + mca_common_ubcl_endpoint_t *endpoint; + ubcl_win_lock_type_t ubcl_type; + int ret; + ubcl_error_t ubcl_err; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + if (module->no_locks) { + mca_osc_ubcl_error(OMPI_ERR_RMA_SYNC, "MPI_Win_lock : window %d is no_locks=true", module->wid); + } + + OPAL_THREAD_LOCK(&module->sync_lock); + + /* check synchronization type */ + if (UBCL_WIN_SYNC_NONE != module->sync_type && UBCL_WIN_SYNC_LOCK != module->sync_type + && UBCL_WIN_SYNC_FENCE != module->sync_type) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Failed to lock window %s already in sync type %s", + win->w_name, osc_ubcl_sync_name(module->sync_type)); + goto return_locked; + } + + if (MPI_LOCK_EXCLUSIVE == lock_type) { + ubcl_type = UBCL_WIN_LOCK_EXCLUSIVE; + } else if (MPI_LOCK_SHARED == lock_type) 
{ + ubcl_type = UBCL_WIN_LOCK_SHARED; + } else { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "MPI_Win_lock : lock type %d is unknown", lock_type); + goto return_locked; + } + + proc = ompi_group_peer_lookup_existing(win->w_group, target); + if (OPAL_UNLIKELY(NULL == proc)) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Cannot lock target %d on window %s", target, win->w_name); + goto return_locked; + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + + /* check access epoch */ + if (UBCL_WIN_SYNC_NONE != module->procs_sync_type[target]) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Target %d is already locked on window %s", + target, win->w_name); + goto return_locked; + } + + /* As no other process will attempt to acquire this lock while we have it, + * we don't need to actually take it + */ + if (0 != (MPI_MODE_NOCHECK & assert)) { + module->procs_sync_type[target] = UBCL_WIN_SYNC_LOCK_NO_CHECK; + ret = OMPI_SUCCESS; + goto no_check; + } + + ubcl_err = ubcl_win_lock(ubcl_type, endpoint->rank, module->wid); + ret = ubcl_error_to_ompi(ubcl_err); + if (OMPI_SUCCESS != ret) { + /* Remote rank may be locked for ever: no recovery possible */ + mca_osc_ubcl_error(ret, "MPI_Win_lock failed"); + goto return_locked; + } + + module->procs_sync_type[target] = UBCL_WIN_SYNC_LOCK; +no_check: + module->sync_type = UBCL_WIN_SYNC_LOCK; + opal_atomic_fetch_add_64(&module->passive_lock_refcount, 1); + +return_locked: + OPAL_THREAD_UNLOCK(&module->sync_lock); + + return ret; +} + +int ompi_osc_ubcl_unlock(int target, struct ompi_win_t *win) +{ + ompi_proc_t *proc; + mca_common_ubcl_endpoint_t *endpoint; + int ret; + ubcl_error_t ubcl_err; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + if (module->no_locks) { + mca_osc_ubcl_error(OMPI_ERR_RMA_SYNC, "MPI_Win_unlock : window %d is no_locks=true", module->wid); + } + + OPAL_THREAD_LOCK(&module->sync_lock); + + /* check 
synchronization type */ + if (UBCL_WIN_SYNC_LOCK != module->sync_type + || (UBCL_WIN_SYNC_LOCK != module->procs_sync_type[target] + && UBCL_WIN_SYNC_LOCK_NO_CHECK != module->procs_sync_type[target])) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Target %d is not locked so it cannot be unlocked " + "window %s (sync type %s)", + target, win->w_name, osc_ubcl_sync_name(module->sync_type)); + goto return_locked; + } + + /* Get proc */ + proc = ompi_group_peer_lookup_existing(win->w_group, target); + if (OPAL_UNLIKELY(NULL == proc)) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "Unknown rank %d on window %s", target, win->w_name); + goto return_locked; + } + + /* check exposure epoch */ + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + + ret = osc_ubcl_flush_no_check(target, win); + if (OMPI_SUCCESS != ret) { + goto return_locked; + } + + /* We did not really take this lock, so no need to release it */ + if (UBCL_WIN_SYNC_LOCK_NO_CHECK == module->procs_sync_type[target]) { + ret = OMPI_SUCCESS; + goto no_check; + } + + ubcl_err = ubcl_win_unlock(endpoint->rank, module->wid); + ret = ubcl_error_to_ompi(ubcl_err); + if (OMPI_SUCCESS != ret) { + mca_osc_ubcl_warn(ret, "MPI_Win_unlock failed"); + goto return_locked; + } + +no_check: + if (0 == opal_atomic_sub_fetch_64(&module->passive_lock_refcount, 1)) { + module->sync_type = UBCL_WIN_SYNC_NONE; + } + module->procs_sync_type[target] = UBCL_WIN_SYNC_NONE; + +return_locked: + OPAL_THREAD_UNLOCK(&module->sync_lock); + return ret; +} + +static int get_all_ubcl_ranks(struct ompi_win_t *win, ubcl_rank_t *all_ranks) +{ + int group_size; + int ret = OMPI_SUCCESS; + + group_size = ompi_group_size(win->w_group); + + for (int i = 0; i < group_size; ++i) { + ompi_proc_t *proc; + mca_common_ubcl_endpoint_t *endpoint; + + proc = ompi_group_peer_lookup_existing(win->w_group, i); + + if (OPAL_UNLIKELY(NULL == proc)) { + mca_osc_ubcl_warn(ret, "Unknown %d-th proc on window 
%s", i, win->w_name); + return OMPI_ERR_BAD_PARAM; + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + all_ranks[i] = endpoint->rank; + } + return ret; +} + +/* lock_all doesn't need to check the exposure epoch because if there was another + * one started (individual lock or lock_all) then module->sync_type would be + * different from UBCL_WIN_SYNC_NONE therefore returning OMPI_ERR_RMA_CONFLICT. + * Stemming from this, unlock_all doesn't need to check the epoch either + */ +int ompi_osc_ubcl_lock_all(int assert, struct ompi_win_t *win) +{ + ubcl_rank_t *all_ranks; + int group_size, ret; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + if (module->no_locks) { + mca_osc_ubcl_error(OMPI_ERR_RMA_SYNC, "MPI_Win_lockall : window %d is no_locks=true", module->wid); + } + + /* check access epoch */ + if (UBCL_WIN_SYNC_NONE != module->sync_type && UBCL_WIN_SYNC_FENCE != module->sync_type) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Failed to lock_all window %s already in sync type %s", + win->w_name, osc_ubcl_sync_name(module->sync_type)); + return ret; + } + + if (0 != (MPI_MODE_NOCHECK & assert)) { + module->sync_type = UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK; + ret = OMPI_SUCCESS; + goto no_check; + } + + group_size = ompi_group_size(win->w_group); + all_ranks = malloc((group_size) * sizeof(ubcl_rank_t)); + ret = get_all_ubcl_ranks(win, all_ranks); + if (OMPI_SUCCESS != ret) { + goto exit_malloced; + } + + ret = ubcl_error_to_ompi(ubcl_win_lock_multiple(all_ranks, group_size, module->wid)); + if (OMPI_SUCCESS != ret) { + /* Undefined state. 
Nothing can be retried safely */ + mca_osc_ubcl_error(ret, "MPI_Win_lock_all failed"); + } + + opal_atomic_fetch_add_64(&module->passive_lock_refcount, group_size); + module->sync_type = UBCL_WIN_SYNC_LOCK_ALL; + +exit_malloced: + free(all_ranks); + +no_check: + return ret; +} + +int ompi_osc_ubcl_unlock_all(struct ompi_win_t *win) +{ + ubcl_rank_t *all_ranks; + int group_size, ret; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + if (module->no_locks) { + mca_osc_ubcl_error(OMPI_ERR_RMA_SYNC, "MPI_Win_unlockall : window %d is no_locks=true", module->wid); + } + + if (UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK == module->sync_type) { + osc_ubcl_flush_all_no_check(win); + module->sync_type = UBCL_WIN_SYNC_NONE; + ret = UBCL_SUCCESS; + goto no_check; + } + + /* check access epoch */ + if (UBCL_WIN_SYNC_LOCK_ALL != module->sync_type) { + return OMPI_ERR_RMA_CONFLICT; + } + + group_size = ompi_group_size(win->w_group); + all_ranks = malloc((group_size) * sizeof(ubcl_rank_t)); + ret = get_all_ubcl_ranks(win, all_ranks); + if (OMPI_SUCCESS != ret) { + goto exit_malloced; + } + + osc_ubcl_flush_all_no_check(win); + ret = ubcl_error_to_ompi(ubcl_win_unlock_multiple(all_ranks, group_size, module->wid)); + if (OMPI_SUCCESS != ret) { + /* Undefined state. Nothing can be retried safely */ + mca_osc_ubcl_error(ret, "MPI_Win_unlock_all failed"); + } + + if (0 == opal_atomic_sub_fetch_64(&module->passive_lock_refcount, group_size)) { + module->sync_type = UBCL_WIN_SYNC_NONE; + } + +exit_malloced: + free(all_ranks); + +no_check: + return ret; +} + + +/* ==== Active target Post/Start/Complete/Wait ==== */ + +int ompi_osc_ubcl_start(struct ompi_group_t *group, int assert, struct ompi_win_t *win) +{ + /* We cannot take benefit from this assertion: + * MPI_MODE_NOCHECK: As we still need to synchro the end of the epoch, + * we cannot bypass synchronization calls. 
+ */ + (void) assert; + int ret; + ompi_proc_t *proc; + int group_size; + mca_common_ubcl_endpoint_t *endpoint; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + OPAL_THREAD_LOCK(&module->sync_lock); + + /* We should be able to create an access and an exposure epoch simultaneously */ + if (NULL != module->active_sync_access_group + || ( UBCL_WIN_SYNC_NONE != module->sync_type + && UBCL_WIN_SYNC_FENCE != module->sync_type + && UBCL_WIN_SYNC_PSCW != module->sync_type )) { + ret = OMPI_ERR_RMA_SYNC; + mca_osc_ubcl_warn(ret, "Failed to start window %s already in sync type %s", + win->w_name, osc_ubcl_sync_name(module->sync_type)); + goto return_locked; + } + + group_size = ompi_group_size(group); + for (int i = 0; i < group_size; ++i) { + ubcl_error_t ubcl_err; + + proc = ompi_group_peer_lookup_existing(group, i); + if (OPAL_UNLIKELY(NULL == proc)) { + /* Partial retries are not possible here so errors are fatal */ + mca_osc_ubcl_error(OMPI_ERROR, "Unknown %d-th rank asked to %s", i, __func__); + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + ubcl_err = ubcl_win_initiator_waits_lock(endpoint->rank, module->wid); + if (UBCL_SUCCESS != ubcl_err) { + ret = ubcl_error_to_ompi(ubcl_err); + /* Partial retries are not possible here so errors are fatal */ + mca_osc_ubcl_error(ret, + "[win %s] Start failed waiting process %d to accept the lock", + win->w_name, i); + goto return_locked; + } + } + + module->sync_type = UBCL_WIN_SYNC_PSCW; + OBJ_RETAIN(group); + module->active_sync_access_group = group; + + ret = OMPI_SUCCESS; +return_locked: + OPAL_THREAD_UNLOCK(&module->sync_lock); + return ret; +} + +int ompi_osc_ubcl_complete(struct ompi_win_t *win) +{ + ompi_proc_t *proc; + int group_size; + mca_common_ubcl_endpoint_t *endpoint; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + int ret; + + OPAL_THREAD_LOCK(&module->sync_lock); + + if (UBCL_WIN_SYNC_PSCW != 
module->sync_type) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Failed to complete window %s in sync type %s", + win->w_name, osc_ubcl_sync_name(module->sync_type)); + goto return_locked; + } + + if (NULL == module->active_sync_access_group) { + ret = OMPI_ERROR; + mca_osc_ubcl_warn(ret, "[win %s] no access group for which to complete " + "active target synchronization", win->w_name); + goto return_locked; + } + + ret = osc_ubcl_flush_all_no_check(win); + if (OMPI_SUCCESS != ret) { + goto return_locked; + } + + ubcl_error_t ubcl_err; + /* Call ubcl_win_sync to clean some NIC counterproductive caches */ + ubcl_err = ubcl_win_sync(module->wid); + if (UBCL_SUCCESS != ubcl_err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(ubcl_err), + "[win %s] Call to sync failed, this is not recoverable", win->w_name); + } + + group_size = ompi_group_size(module->active_sync_access_group); + for (int i = 0; i < group_size; ++i) { + + proc = ompi_group_peer_lookup_existing(module->active_sync_access_group, i); + if (OPAL_UNLIKELY(NULL == proc)) { + /* Partial retries are not possible here so errors are fatal */ + mca_osc_ubcl_error(OMPI_ERROR, "Unknown %d-th rank asked to %s", i, __func__); + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + ubcl_err = ubcl_win_initiator_releases_lock(endpoint->rank, module->wid); + if (UBCL_SUCCESS != ubcl_err) { + ret = ubcl_error_to_ompi(ubcl_err); + /* Partial retries are not possible here so errors are fatal */ + mca_osc_ubcl_error(ret, "[win %s] Complete failed while releasing lock " + "for process %d", win->w_name, i); + goto return_locked; + } + } + + /* We want to keep the window marked as in a pscw sync scheme if an exposure epoch exists */ + if (NULL == module->active_sync_exposure_group) { + module->sync_type = UBCL_WIN_SYNC_NONE; + } + OBJ_RELEASE(module->active_sync_access_group); + module->active_sync_access_group = NULL; + + ret = OMPI_SUCCESS; + +return_locked: + 
OPAL_THREAD_UNLOCK(&module->sync_lock); + return ret; +} + +int ompi_osc_ubcl_post(struct ompi_group_t *group, int assert, struct ompi_win_t *win) +{ + /* We cannot take benefit from these assertions: + * MPI_MODE_NOCHECK: As we still need to synchro the end of the epoch, + * we cannot bypass synchronization calls. + * MPI_MODE_NOSTORE: Window is in unified memory model, operations are not cached. + * MPI_MODE_NOPUT : Window is in unified memory model, operations are not cached. + */ + (void) assert; + int ret; + ompi_proc_t *proc; + int group_size; + mca_common_ubcl_endpoint_t *endpoint; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + ubcl_error_t ubcl_err; + + OPAL_THREAD_LOCK(&module->sync_lock); + + /* We should be able to create an access and an exposure epoch simultaneously */ + if (NULL != module->active_sync_exposure_group + || ( UBCL_WIN_SYNC_NONE != module->sync_type + && UBCL_WIN_SYNC_FENCE != module->sync_type + && UBCL_WIN_SYNC_PSCW != module->sync_type )) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Failed to post window %s already in sync type %s", + win->w_name, osc_ubcl_sync_name(module->sync_type)); + goto return_locked; + } + + /* Call ubcl_win_sync to clean some NIC counterproductive caches */ + ubcl_err = ubcl_win_sync(module->wid); + if (UBCL_SUCCESS != ubcl_err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(ubcl_err), + "[win %s] Call to sync failed, this is not recoverable", win->w_name); + } + + group_size = ompi_group_size(group); + for (int i = 0; i < group_size; ++i) { + + proc = ompi_group_peer_lookup_existing(group, i); + if (OPAL_UNLIKELY(NULL == proc)) { + /* Partial retries are not possible here so errors are fatal */ + mca_osc_ubcl_error(OMPI_ERROR, "Unknown %d-th rank asked to %s", i, __func__); + } + + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + ubcl_err = ubcl_win_target_grants_lock(endpoint->rank, module->wid); + if (UBCL_SUCCESS != 
ubcl_err) { + ret = ubcl_error_to_ompi(ubcl_err); + /* Partial retries are not possible here so errors are fatal */ + mca_osc_ubcl_error(ret, "[win %s] Post failed while accepting the " + "lock from process %d", win->w_name, i); + goto return_locked; + } + } + module->sync_type = UBCL_WIN_SYNC_PSCW; + OBJ_RETAIN(group); + module->active_sync_exposure_group = group; + module->nb_rank_waited = 0; + + ret = OMPI_SUCCESS; + +return_locked: + OPAL_THREAD_UNLOCK(&module->sync_lock); + return ret; +} + +int ompi_osc_ubcl_wait(struct ompi_win_t *win) +{ + int ret; + ret = ompi_osc_ubcl_test(win, NULL); + return ret; +} + +int ompi_osc_ubcl_test(struct ompi_win_t *win, int *flag) +{ + int ret; + ompi_proc_t *proc; + int group_size; + mca_common_ubcl_endpoint_t *endpoint; + int ubcl_flag = 0; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + OPAL_THREAD_LOCK(&module->sync_lock); + if (NULL != flag) { + (*flag) = 0; + } + + if (UBCL_WIN_SYNC_PSCW != module->sync_type) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Failed to test window %s in sync type %s", + win->w_name, osc_ubcl_sync_name(module->sync_type)); + goto return_locked; + } + + if (NULL == module->active_sync_exposure_group) { + ret = OMPI_ERR_BAD_PARAM; + mca_osc_ubcl_warn(ret, "[win %s] no group to wait lock release from", + win->w_name); + goto return_locked; + } + + group_size = ompi_group_size(module->active_sync_exposure_group); + for (int i = module->nb_rank_waited; i < group_size; ++i) { + ubcl_error_t ubcl_err; + + proc = ompi_group_peer_lookup_existing(module->active_sync_exposure_group, i); + if (OPAL_UNLIKELY(NULL == proc)) { + /* Undefined state. 
+    /* if we didn't pass the tests_lock_release then we return
+     * for next test we'll start again at proc n° module->nb_rank_waited
+     * if it was a wait, ubcl_flag was set to true so this check is skipped */
+ */ + int ret = OMPI_SUCCESS; + mca_osc_ubcl_module_t *module = (mca_osc_ubcl_module_t *) win->w_osc_module; + + if (UBCL_WIN_SYNC_FENCE != module->sync_type + && UBCL_WIN_SYNC_FENCE_EPOCH != module->sync_type + && UBCL_WIN_SYNC_NONE != module->sync_type) { + ret = OMPI_ERR_RMA_CONFLICT; + mca_osc_ubcl_warn(ret, "Failed to fence window %s in sync type %s", + win->w_name, osc_ubcl_sync_name(module->sync_type)); + return ret; + } + + OPAL_THREAD_LOCK(&module->sync_lock); + + /* If the sync_type is not UBCL_WIN_SYNC_FENCE_EPOCH this should be almost a noop */ + if (0 == (MPI_MODE_NOPRECEDE & assert)) { + ret = osc_ubcl_flush_all_no_check(win); + if (OMPI_SUCCESS != ret) { + goto return_locked; + } + } + + /* There is no easy way to detect when the barrier is optional. + * The remote process may have started an epoch without us, we need to wait + * its fence to avoid concurrent access. */ + if (0 == (MPI_MODE_NOPRECEDE & assert) || 0 == (MPI_MODE_NOSUCCEED & assert)) { + ubcl_error_t ubcl_err; + /* Call ubcl_win_sync to clean some NIC counterproductive caches */ + ubcl_err = ubcl_win_sync(module->wid); + if (UBCL_SUCCESS != ubcl_err) { + mca_osc_ubcl_error(ubcl_error_to_ompi(ubcl_err), + "[win %s] Call to sync failed, this is not recoverable", win->w_name); + } + + ret = module->comm->c_coll->coll_barrier(module->comm, + module->comm->c_coll->coll_barrier_module); + } + + module->sync_type = UBCL_WIN_SYNC_FENCE; + +return_locked: + OPAL_THREAD_UNLOCK(&module->sync_lock); + return ret; +} + +int ompi_osc_ubcl_sync(struct ompi_win_t *win) +{ + (void) win; + return OMPI_SUCCESS; +} diff --git a/ompi/mca/osc/ubcl/osc_ubcl_sync.h b/ompi/mca/osc/ubcl/osc_ubcl_sync.h new file mode 100644 index 00000000000..0a4d0149e1f --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_sync.h @@ -0,0 +1,45 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_OSC_UBCL_SYNC_H +#define MCA_OSC_UBCL_SYNC_H + +typedef enum ubcl_win_sync_type { + UBCL_WIN_SYNC_NONE, + UBCL_WIN_SYNC_LOCK, + UBCL_WIN_SYNC_LOCK_NO_CHECK, + UBCL_WIN_SYNC_LOCK_ALL, + UBCL_WIN_SYNC_LOCK_ALL_NO_CHECK, + UBCL_WIN_SYNC_PSCW, + UBCL_WIN_SYNC_FENCE, + UBCL_WIN_SYNC_FENCE_EPOCH +} ubcl_win_sync_type_t; + +/* Component API */ +int ompi_osc_ubcl_lock(int lock_type, int target, int assert, struct ompi_win_t *win); +int ompi_osc_ubcl_unlock(int target, struct ompi_win_t *win); +int ompi_osc_ubcl_lock_all(int assert, struct ompi_win_t *win); +int ompi_osc_ubcl_unlock_all(struct ompi_win_t *win); + +int ompi_osc_ubcl_start(struct ompi_group_t *group, int assert, struct ompi_win_t *win); +int ompi_osc_ubcl_complete(struct ompi_win_t *win); +int ompi_osc_ubcl_post(struct ompi_group_t *group, int assert, struct ompi_win_t *win); +int ompi_osc_ubcl_wait(struct ompi_win_t *win); +int ompi_osc_ubcl_test(struct ompi_win_t *win, int *flag); + +int ompi_osc_ubcl_fence(int assert, struct ompi_win_t *win); + +int ompi_osc_ubcl_sync(struct ompi_win_t *win); + +/* OSC/UBCL internals */ +int ompi_osc_ubcl_check_access_epoch(int target_rank, struct ompi_win_t *win); + +#endif /* MCA_OSC_UBCL_SYNC_H */ diff --git a/ompi/mca/osc/ubcl/osc_ubcl_utils.h b/ompi/mca/osc/ubcl/osc_ubcl_utils.h new file mode 100644 index 00000000000..9c9280ceb78 --- /dev/null +++ b/ompi/mca/osc/ubcl/osc_ubcl_utils.h @@ -0,0 +1,37 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2025 Bull SAS. All rights reserved. 
+ * Contains some useful functions
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CPPFLAGS = $(pml_ubcl_CPPFLAGS) + +EXTRA_DIST = post_configure.sh + +ubcl_sources = \ + pml_ubcl_utils.h \ + pml_ubcl_request.h \ + pml_ubcl.h \ + pml_ubcl.c \ + pml_ubcl_utils.c \ + pml_ubcl_isend.c \ + pml_ubcl_irecv.c \ + pml_ubcl_iprobe.c \ + pml_ubcl_progress.c \ + pml_ubcl_request.c \ + pml_ubcl_datatype.c \ + pml_ubcl_component.c \ + pml_ubcl_endpoint.c \ + pml_ubcl_endpoint.h + +if MCA_BUILD_ompi_pml_ubcl_DSO +component_noinst = +component_install = mca_pml_ubcl.la +else +component_noinst = libmca_pml_ubcl.la +component_install = +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_pml_ubcl_la_SOURCES = $(ubcl_sources) +mca_pml_ubcl_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \ + $(pml_ubcl_LIBS) \ + $(OPAL_TOP_BUILDDIR)/opal/mca/common/ubcl/lib@OPAL_LIB_NAME@mca_common_ubcl.la \ + $(OMPI_TOP_BUILDDIR)/ompi/mca/common/ubcl/libmca_common_ubcl.la + +mca_pml_ubcl_la_LDFLAGS = -module -avoid-version $(pml_ubcl_LDFLAGS) +mca_pml_ubcl_la_CPPFLAGS = -Wextra -Wall -Werror -Wno-unused-parameter -Wno-missing-field-initializers $(pml_ubcl_CPPFLAGS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_pml_ubcl_la_SOURCES = $(ubcl_sources) +libmca_pml_ubcl_la_LIBADD = $(pml_ubcl_LIBS) +libmca_pml_ubcl_la_LDFLAGS = -module -avoid-version $(pml_ubcl_LDFLAGS) +libmca_pml_ubcl_la_CPPFLAGS = $(mca_pml_ubcl_la_CPPFLAGS) diff --git a/ompi/mca/pml/ubcl/configure.m4 b/ompi/mca/pml/ubcl/configure.m4 new file mode 100644 index 00000000000..c3159651a41 --- /dev/null +++ b/ompi/mca/pml/ubcl/configure.m4 @@ -0,0 +1,35 @@ +# +# Copyright (c) 2024 Bull SAS. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +AC_DEFUN([MCA_ompi_pml_ubcl_POST_CONFIG], [ + AS_IF([test "$1" = "1"], [OMPI_REQUIRE_ENDPOINT_TAG([PML])]) +]) + +AC_DEFUN([MCA_ompi_pml_ubcl_CONFIG], [ + AC_CONFIG_FILES([ompi/mca/pml/ubcl/Makefile]) + + OMPI_CHECK_UBCL([pml_ubcl], + [pml_ubcl_happy="yes"], + [pml_ubcl_happy="no"]) + + AC_REQUIRE([MCA_ompi_common_ubcl_CONFIG]) + AC_REQUIRE([MCA_opal_common_ubcl_CONFIG]) + AC_REQUIRE([OPAL_CHECK_CUDA]) + AC_REQUIRE([OPAL_CHECK_CUDART]) + + AS_IF([test "$pml_ubcl_happy" = "yes"], + [$1], + [$2]) + + # substitute in the things needed to build ubcl + AC_SUBST([pml_ubcl_CPPFLAGS]) + AC_SUBST([pml_ubcl_LDFLAGS]) + AC_SUBST([pml_ubcl_LIBS]) +]) diff --git a/ompi/mca/pml/ubcl/pml_ubcl.c b/ompi/mca/pml/ubcl/pml_ubcl.c new file mode 100644 index 00000000000..730c526d347 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl.c @@ -0,0 +1,174 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2024 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl.c + * + * UBCL PML + * + * Implementation of API defined in pml.h. To see parameters and return values + * of these functions, refer to ompi/mca/pml/pml.h. 
+ * pml_max_contextid and pml_max_tag are computed given Portals4 module
+ * match_bits and platform int size
(OMPI_COMM_IS_INTER(comm)) { + comm_group = comm->c_remote_group; + new_ubcl_comm->size = ompi_comm_remote_size(comm); + } else { + comm_group = comm->c_local_group; + new_ubcl_comm->size = ompi_comm_size(comm); + } + + new_ubcl_comm->array = malloc(new_ubcl_comm->size * sizeof(uint64_t)); + if (NULL == new_ubcl_comm->array) { + free(new_ubcl_comm); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* Build array comm_rank -> ubcl_rank */ + for (uint32_t i = 0; i < new_ubcl_comm->size; i++) { + struct ompi_proc_t *proc; + mca_common_ubcl_endpoint_t *endpoint; + proc = ompi_group_peer_lookup(comm_group, i); + /* In OMPI 5 we sometimes get procs here that didn't go through + * 'add_procs'. We create them here to avoid any issue, 'add_procs' + * tests if an endpoint is already created so there is no issue if it's + * called later */ + endpoint = proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + if (NULL == endpoint) { + mca_pml_ubcl_add_procs(&proc, 1); + endpoint = proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + } else { + mca_pml_ubcl_endpoint_retain(proc); + } + new_ubcl_comm->array[i] = endpoint->rank; + } + + comm->c_pml_comm = new_ubcl_comm; + + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "UBCL_MODULE_ADD_COMM %s\n", ompi_comm_print_cid(comm))); + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_del_comm(struct ompi_communicator_t *comm) +{ + mca_pml_ubcl_comm_t *pml_comm; + ompi_group_t *comm_group; + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_DEL_COMM\n")); + + if (NULL == comm->c_pml_comm) { + mca_pml_ubcl_error(OMPI_ERR_BAD_PARAM, + "error: suspicious free of a communicator that PML UBCL has never allocated"); + } + + /* Important to be decrementing refcount/removing endpoints, + * that way if we create new communicators after MPI_Init we + * can free the endpoints reliably when needed */ + if (OMPI_COMM_IS_INTER(comm)) { + comm_group = comm->c_remote_group; + } else { + comm_group = comm->c_local_group; + } + pml_comm = 
+/**
+ * Call for BTLs that we don't care about
+ */
+ */ + +#ifndef MCA_PML_UBCL_H +#define MCA_PML_UBCL_H + +#include "ompi/mca/pml/pml.h" +#include "opal/class/opal_free_list.h" + +#include "ompi/communicator/communicator.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/proc/proc.h" +#include "ompi/request/request.h" +#include "opal/mca/mca.h" +#include "opal/mca/threads/mutex.h" + +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_endpoint.h" + +#include + +#define container_of(ptr, type, member) ((type *) ((char *) (ptr) -offsetof(type, member))) + +#define PML_UBCL_THREAD_ONLY if (OPAL_UNLIKELY(mca_pml_ubcl_component.thread_multiple_enabled)) +#define pml_ubcl_lock(_lock) PML_UBCL_THREAD_ONLY opal_atomic_lock(_lock) +#define pml_ubcl_unlock(_lock) PML_UBCL_THREAD_ONLY opal_atomic_unlock(_lock) + +/* Because UBCL_MAX_TAG overflows if put in an int */ +#if (UBCL_MAX_TAG < INT_MAX) + #define PML_UBCL_MAX_TAG UBCL_MAX_TAG +#else /* (UBCL_MAX_TAG < INT_MAX) */ + #define PML_UBCL_MAX_TAG INT_MAX +#endif /* (UBCL_MAX_TAG < INT_MAX) */ + +/* Because UBCL_MAX_CID overflows if put into an uint32_t */ +#if (UBCL_MAX_CID < UINT32_MAX) + #define PML_UBCL_MAX_CID UBCL_MAX_CID +#else /* (UBCL_MAX_CID < INT_MAX) */ + #define PML_UBCL_MAX_CID UINT32_MAX +#endif /* (UBCL_MAX_CID < INT_MAX) */ + +/** + * Module structure + */ +struct mca_pml_ubcl_module_t { + mca_pml_base_module_t super; +}; +typedef struct mca_pml_ubcl_module_t mca_pml_ubcl_module_t; + +/** + * Component structure + */ +struct mca_pml_ubcl_component_t { + mca_pml_base_component_t super; + + /** Functionnal fields **/ + char is_init; /**< Whether we have been initialized, for proper close */ + int output; /**< Output stream */ + char thread_multiple_enabled; /**< Multithreading support */ + size_t nprocs; /**< Number of known processes */ + void **stack_addr_buffer; /**< Buffer to store stack on component error */ + int n_addr; /**< Number of void * addresses in #stack_addr_buffer*/ + + /** MCA parameters **/ + int 
+    unsigned int min_req;  /**< Minimum (and initial) number of requests */
+    unsigned int incr_req; /**< Increasing (and initial) number of requests */
+    unsigned int pad_req;
+
+    char check_recv_rsend;  /**< Warn if a rsend did not immediately match a recv */
+    char warn_on_truncate;  /**< Warn if Recv are truncated */
+    char abort_on_truncate; /**< Abort if Recv are truncated */
+ */ +struct mca_pml_comm_t { + uint64_t *array; + uint32_t size; + uint16_t is_inter; + uint16_t pad0; +}; +typedef struct mca_pml_comm_t mca_pml_ubcl_comm_t; + +/** Sole PML module **/ +extern mca_pml_ubcl_module_t mca_pml_ubcl_module; + +/** PML UBCL component **/ +OMPI_DECLSPEC extern mca_pml_ubcl_component_t mca_pml_ubcl_component; + +/** + * Internal API + */ +void mca_pml_ubcl_isend_start(struct ompi_request_t **request); +void mca_pml_ubcl_irecv_prepare(void *buf, size_t count, ompi_datatype_t *datatype, int src, + int tag, struct ompi_communicator_t *comm, + struct ompi_request_t **request, bool persistent, bool probe, + struct ompi_message_t *message); +void mca_pml_ubcl_irecv_start(struct ompi_request_t **request); + +size_t pml_ubcl_datatype_pack(void *pack_buf, const void *usr_handle, size_t pack_size, + size_t offset); + +size_t pml_ubcl_datatype_unpack(void *usr_handle, const void *pack_buf, size_t pack_size, + size_t offset); + +size_t pml_ubcl_datatype_mem_size(const void *usr_handle, size_t offset); + +void pml_ubcl_datatype_finish(void *usr_handle); + +/** + * PML component API (see pml_ubcl_component.c) + */ +int mca_pml_ubcl_component_open(void); +int mca_pml_ubcl_component_close(void); +int mca_pml_ubcl_component_register(void); +mca_pml_base_module_t *mca_pml_ubcl_component_init(int *priority, bool enable_progress_threads, + bool enable_mpi_threads); +int mca_pml_ubcl_component_finalize(void); + +/** + * PML API (see pml_ubcl.c) + */ +int mca_pml_ubcl_add_comm(struct ompi_communicator_t *comm); +int mca_pml_ubcl_del_comm(struct ompi_communicator_t *comm); +int mca_pml_ubcl_enable(bool enable); +int mca_pml_ubcl_progress(void); +int mca_pml_ubcl_iprobe(int src, int tag, struct ompi_communicator_t *comm, int *matched, + ompi_status_public_t *status); +int mca_pml_ubcl_probe(int src, int tag, struct ompi_communicator_t *comm, + ompi_status_public_t *status); +int mca_pml_ubcl_improbe(int src, int tag, struct ompi_communicator_t *comm, int 
*matched, + struct ompi_message_t **message, ompi_status_public_t *status); +int mca_pml_ubcl_mprobe(int src, int tag, struct ompi_communicator_t *comm, + struct ompi_message_t **message, ompi_status_public_t *status); +int mca_pml_ubcl_isend_init(const void *buf, size_t count, ompi_datatype_t *datatype, int dst, + int tag, mca_pml_base_send_mode_t mode, + struct ompi_communicator_t *comm, struct ompi_request_t **request); +int mca_pml_ubcl_isend(const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + mca_pml_base_send_mode_t mode, struct ompi_communicator_t *comm, + struct ompi_request_t **request); +int mca_pml_ubcl_send(const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + mca_pml_base_send_mode_t mode, struct ompi_communicator_t *comm); +int mca_pml_ubcl_irecv_init(void *buf, size_t count, ompi_datatype_t *datatype, int src, int tag, + struct ompi_communicator_t *comm, struct ompi_request_t **request); +int mca_pml_ubcl_irecv(void *buf, size_t count, ompi_datatype_t *datatype, int src, int tag, + struct ompi_communicator_t *comm, struct ompi_request_t **request); +int mca_pml_ubcl_recv(void *buf, size_t count, ompi_datatype_t *datatype, int src, int tag, + struct ompi_communicator_t *comm, ompi_status_public_t *status); +int mca_pml_ubcl_imrecv(void *buf, size_t count, ompi_datatype_t *datatype, + struct ompi_message_t **message, struct ompi_request_t **request); +int mca_pml_ubcl_mrecv(void *buf, size_t count, ompi_datatype_t *datatype, + struct ompi_message_t **message, ompi_status_public_t *status); +int mca_pml_ubcl_dump(struct ompi_communicator_t *comm, int verbose); +int mca_pml_ubcl_start(size_t count, ompi_request_t **requests); +int mca_pml_ubcl_ft_event(int state); + +#endif /* MCA_PML_UBCL_H */ diff --git a/ompi/mca/pml/ubcl/pml_ubcl_component.c b/ompi/mca/pml/ubcl/pml_ubcl_component.c new file mode 100644 index 00000000000..40eef2c9291 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_component.c @@ -0,0 
+1,288 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2024 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_component.c + * + * UBCL PML component implementation + * + * Functions parameters and return values defined in ompi/mca/pml/pml.h. + */ + +#include "opal/include/opal_config.h" + +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_request.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_endpoint.h" +#include "opal/mca/btl/base/base.h" +#include "opal/mca/accelerator/base/base.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "opal/prefetch.h" +#include "opal/util/proc.h" + +#include + +/** + * PML UBCL Component + */ +mca_pml_ubcl_component_t mca_pml_ubcl_component = { + { + .pmlm_version = { + MCA_PML_BASE_VERSION_2_1_0, + + .mca_component_name = "ubcl", + .mca_component_major_version = OMPI_MAJOR_VERSION, + .mca_component_minor_version = OMPI_MINOR_VERSION, + .mca_component_release_version = OMPI_RELEASE_VERSION, + .mca_open_component = mca_pml_ubcl_component_open, + .mca_close_component = mca_pml_ubcl_component_close, + .mca_register_component_params = mca_pml_ubcl_component_register + }, + .pmlm_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE + }, + + .pmlm_init = mca_pml_ubcl_component_init, + .pmlm_finalize = mca_pml_ubcl_component_finalize, + }, + + .is_init = 0, + .accelerator_is_cuda = false, + .nprocs = 0, +}; + +/** + * Open opal output, 0-initialize some parameters and forward to communication + * modules + */ +int mca_pml_ubcl_component_open(void) +{ + /* Open output stream */ + if (0 < mca_pml_ubcl_component.verbose || mca_pml_ubcl_component.gdb_attach) { + mca_pml_ubcl_component.output = opal_output_open(NULL); + int verbose = mca_pml_ubcl_component.verbose > 0 ? 
mca_pml_ubcl_component.verbose : 1; + opal_output_set_verbosity(mca_pml_ubcl_component.output, verbose); + } else { + mca_pml_ubcl_component.output = -1; + } + + /* If MCA param set, wait until gdb_attach is set to 0 from outside */ + if (mca_pml_ubcl_component.gdb_attach) { + opal_output_verbose(1, mca_pml_ubcl_component.output, + "set mca_pml_ubcl_component.gdb_attach = 0\n"); + while (mca_pml_ubcl_component.gdb_attach) { + sleep(1); + }; + } + + return OMPI_SUCCESS; +} + +/** + * Close communication modules and opal output + */ +int mca_pml_ubcl_component_close(void) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_COMPONENT_CLOSE\n")); + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_component_register(void) +{ + mca_base_component_t *component = &mca_pml_ubcl_component.super.pmlm_version; + + mca_pml_ubcl_component.verbose = 0; + (void) mca_base_component_var_register(component, "verbose", "Verbosity level of the pml/ubcl.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.verbose); + + mca_pml_ubcl_component.priority = 90; + (void) mca_base_component_var_register(component, "priority", + "Priority of the pml/ubcl component", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.priority); + + mca_pml_ubcl_component.force_intranode_bxi = false; + (void) mca_base_component_var_register(component, "force_intranode_bxi", + "Whether to force intranode communication to go through " + "BXI network instead of shared memory.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.force_intranode_bxi); + + mca_pml_ubcl_component.force_cuda_custom_dt = false; + (void) mca_base_component_var_register(component, "force_cuda_custom_dt", + "Force the pml/ubcl to use custom datatype to pack/unpack cuda " + "buffers. 
This prevents the use of ADGE by UBCL", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.force_cuda_custom_dt); + + mca_pml_ubcl_component.can_progress = false; + (void) mca_base_component_var_register( + component, "can_progress", + "Allow PML to call opal_progress() once at the end of each primitive.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.can_progress); + + mca_pml_ubcl_component.warn_on_truncate = true; + (void) mca_base_component_var_register( + component, "warn_on_truncate", + "Allow PML to print warning messages whenever a truncation error is detected", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.warn_on_truncate); + + mca_pml_ubcl_component.abort_on_truncate = true; + (void) mca_base_component_var_register( + component, "abort_on_truncate", + "Allow PML to print error and abort in case of MPI_ERR_TRUNCATE", MCA_BASE_VAR_TYPE_BOOL, + NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.abort_on_truncate); + + mca_pml_ubcl_component.use_mpi_wildcards = true; + (void) mca_base_component_var_register( + component, "use_mpi_wildcards", + "MPI_ANY_SOURCE or MPI_ANY_TAG are used. For better performance this should be disabled.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.use_mpi_wildcards); + + mca_pml_ubcl_component.gdb_attach = false; + (void) mca_base_component_var_register( + component, "gdb_attach", + "Allow to attach a debugger by looping indefinitly on this value until 0.", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.gdb_attach); + + + mca_pml_ubcl_component.max_req = 32768; + (void) mca_base_component_var_register(component, "max_req", + "Maximum number of requests allocated. 
(0 means infinite)", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.max_req); + + mca_pml_ubcl_component.min_req = 1024; + (void) mca_base_component_var_register(component, "min_req", + "Minimum (and initial) number of requests allocated.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.min_req); + + mca_pml_ubcl_component.incr_req = 1024; + (void) mca_base_component_var_register(component, "incr_req", + "Increasing number of requests allocated.", + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_pml_ubcl_component.incr_req); + + mca_common_ubcl_register_mca(); + + return OMPI_SUCCESS; +} + +static void mca_pml_ubcl_check_cuda_accelerator() +{ + const char* cuda_component_name = "cuda"; + const char* selected_component_name = opal_accelerator_base_selected_component.base_version.mca_component_name; + + /* Check if we are currently using accelerator cuda */ + /* Only one single accelerator can be selected/active. 
Knowing if it's the
+     * cuda accelerator lets us know if our device buffers are cuda or not */
mca_pml_ubcl_component_finalize(void) +{ + int ompi_ret; + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "ubcl_COMPONENT_FINALIZE")); + + if (0 == mca_pml_ubcl_component.is_init) { + return OMPI_SUCCESS; + } + + ompi_ret = mca_pml_ubcl_free_local_endpoints(); + if (OMPI_SUCCESS != ompi_ret) { + return ompi_ret; + } + + /* Finalize UBCL */ + if (UBCL_SUCCESS != ubcl_fini()) { + return OMPI_ERROR; + } + + OBJ_DESTRUCT(&mca_pml_ubcl_component.pml_req_free_list); + + if (OPAL_SUCCESS != mca_common_ubcl_fini()) { + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_datatype.c b/ompi/mca/pml/ubcl/pml_ubcl_datatype.c new file mode 100644 index 00000000000..cd3ebb32fa5 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_datatype.c @@ -0,0 +1,89 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2024 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_datatype.c + * + * PML/UBCL datatype and convertor related functions + * + */ + +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_request.h" + +size_t pml_ubcl_datatype_pack(void *pack_buf, const void *usr_handle, size_t pack_size, + size_t offset) +{ + opal_convertor_t *convertor = (opal_convertor_t *) usr_handle; + + /* Set input data size and start pointer. 
*/ + uint32_t iov_count = 1; + int ret = 0; + struct iovec iov; + iov.iov_len = pack_size; + iov.iov_base = (IOVBASE_TYPE *) pack_buf; + + opal_convertor_set_position(convertor, &offset); + + /* Pack data from converter to iov */ + ret = opal_convertor_pack(convertor, &iov, &iov_count, &pack_size); + if (-1 == ret) { + mca_pml_ubcl_error(ret, "opal_convertor_unpack failed\n"); + } + + return pack_size; +} + +size_t pml_ubcl_datatype_unpack(void *usr_handle, const void *pack_buf, size_t pack_size, + size_t offset) +{ + opal_convertor_t *convertor = (opal_convertor_t *) usr_handle; + + /* Set input data size and start pointer. */ + uint32_t iov_count = 1; + int ret = 0; + struct iovec iov; + iov.iov_len = pack_size; + iov.iov_base = (IOVBASE_TYPE *) pack_buf; + + opal_convertor_set_position(convertor, &offset); + + /* Pack data from converter to iov */ + ret = opal_convertor_unpack(convertor, &iov, &iov_count, &pack_size); + if (-1 == ret) { + mca_pml_ubcl_error(ret, "opal_convertor_unpack failed\n"); + } + + return pack_size; +} + +size_t pml_ubcl_datatype_mem_size(const void *usr_handle, size_t offset) +{ + opal_convertor_t *convertor = (opal_convertor_t *) usr_handle; + size_t size = 0; + + opal_datatype_type_size(convertor->pDesc, &size); + + if (offset > size * convertor->count) { + return 0; + } + + return size * convertor->count - offset; +} + +void pml_ubcl_datatype_finish(void *usr_handle) +{ + /* + * Does nothing + */ + + return; +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c new file mode 100644 index 00000000000..04e29babed9 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.c @@ -0,0 +1,418 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_endpoint.c + * + * UBCL PML + * + * Contains functions related to ubcl endpoints + */ + +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/constants.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_request.h" +#include "ompi/proc/proc.h" +#include "opal/class/opal_object.h" +#include "opal/datatype/opal_convertor.h" +#include "opal/mca/hwloc/hwloc-internal.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "opal/prefetch.h" +#include "opal/util/proc.h" +#include "ubcl_api.h" + +/* UBCL rank is on 61 bits, ompi jobid is 32bits, vpid must be truncated to 29bits */ +#define PML_UBCL_VPID_MAX (((1 << 29) - 1)) /* We need 3 bits for UBCL rank */ +#define PML_UBCL_JOBID_MAX (OPAL_JOBID_MAX) + +static void mca_pml_ubcl_forge_modex_key(char *keyname, size_t size, const int type) +{ + int ret; + + switch (type) { + case UBCL_ENDPOINT_TYPE_BXI: + ret = snprintf(keyname, size - 1, "OMPI_UBCL_BXI_ID"); + break; + case UBCL_ENDPOINT_TYPE_SHMEM: + ret = snprintf(keyname, size - 1, "OMPI_UBCL_SHM_ID"); + break; + /* SELF endpoints don't need to forge modex keys */ + case UBCL_ENDPOINT_TYPE_SELF: + default: + ret = 0; + } + + if (0 >= ret || ((size_t) ret) > size - 1) { + mca_pml_ubcl_error(OMPI_ERROR, "Failed to forge modex keyname"); + } + + /* paranoiac */ + keyname[size - 1] = '\0'; +} + +static uint64_t mca_pml_forge_rank(ompi_proc_t *proc) +{ + uint64_t jobid, rank; + + if (ompi_proc_is_sentinel(proc)) { + mca_pml_ubcl_error(OMPI_ERROR, + "PML/UBCL proc sentinel are not supported"); + return 0; + } + + jobid = proc->super.proc_name.jobid; + rank = proc->super.proc_name.vpid; + + if (rank > (uint32_t) PML_UBCL_VPID_MAX) { + mca_pml_ubcl_error(OMPI_ERROR, + "PML/UBCL RANK failed: vpid to high (%d)", rank); + } + + return (rank | (jobid << 29)); +} + +/** + * Init time: 
init transports and commit ubcl handles to pmix + */ + +static int mca_pml_ubcl_endpoint_modex_put(const int type, void *endpoint_h, size_t size) +{ + int ret; + char keyname[256]; + + mca_pml_ubcl_forge_modex_key(keyname, sizeof(keyname), type); + OPAL_MODEX_SEND_STRING(ret, PMIX_GLOBAL, keyname, endpoint_h, size); + if (0 > ret) { + mca_pml_ubcl_error(OMPI_ERROR, "Failed to modex send string: %s (%d)", + opal_strerror(ret), ret); + } + + return OMPI_SUCCESS; +} + +static int mca_pml_ubcl_export_local_endpoint_handle(const int type) +{ + int err; + uint64_t remote_rank_u64; + char endpoint_h[UBCL_HANDLE_SIZE]; + const size_t size = sizeof(endpoint_h); + + /* dummy valued for ANY_RANK */ + remote_rank_u64 = UBCL_ANY_RANK; + + err = ubcl_export_local_endpoint_handle(type, endpoint_h, &remote_rank_u64); + if (UBCL_SUCCESS != err) { + return OMPI_ERROR; + } + + mca_pml_ubcl_endpoint_modex_put(type, (void *) endpoint_h, size); + + /* We were just interested in the handle. + * The actual recv rank will be allocated during add_procs calls */ + err = ubcl_close_local_endpoint_channel(type, remote_rank_u64); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_warn(OMPI_ERROR, + "PML/UBCL failed to clean local endpoint (very unlikely error)." 
+ " For safety reason PML will be disabled."); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_create_local_endpoint(void) +{ + int type; + ubcl_error_t err; + int ompi_error; + + type = UBCL_ENDPOINT_TYPE_SELF; + err = ubcl_create_local_endpoint(type); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(OMPI_ERROR, "Failed ubcl_create_local_endpoint %d (%d)", type, err); + } + + /* UBCL_ENDPOINT_SHM */ + if (!mca_pml_ubcl_component.force_intranode_bxi) { + type = UBCL_ENDPOINT_TYPE_SHMEM; + err = ubcl_create_local_endpoint(type); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(OMPI_ERROR, "Failed ubcl_create_local_endpoint %d (%d)", type, err); + } + ompi_error = mca_pml_ubcl_export_local_endpoint_handle(type); + if (OMPI_SUCCESS != ompi_error) { + return ompi_error; + } + } + + type = UBCL_ENDPOINT_TYPE_BXI; + err = ubcl_create_local_endpoint(type); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(OMPI_ERROR, "Failed ubcl_create_local_endpoint %d (%d)", type, err); + } + ompi_error = mca_pml_ubcl_export_local_endpoint_handle(type); + if (OMPI_SUCCESS != ompi_error) { + return ompi_error; + } + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_free_local_endpoints() +{ + int ret; + /* Finalize BXI */ + ret = ubcl_free_local_endpoint(UBCL_ENDPOINT_TYPE_BXI); + if (UBCL_SUCCESS != ret) { + return OMPI_ERROR; + } + if (!mca_pml_ubcl_component.force_intranode_bxi) { + ret = ubcl_free_local_endpoint(UBCL_ENDPOINT_TYPE_SHMEM); + if (UBCL_SUCCESS != ret) { + return OMPI_ERROR; + } + } + ret = ubcl_free_local_endpoint(UBCL_ENDPOINT_TYPE_SELF); + if (UBCL_SUCCESS != ret) { + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} +/** + * Add_proce time: create send and recv endpoint for each peer + */ + +static int mca_pml_ubcl_recv_endpoint_modex_get(ompi_proc_t *proc, const int type, + endp_handle_t endpoint_h, size_t size) +{ + char keyname[256]; + size_t received_size; + void *received_buffer; + int ret; + + received_size = 0; + received_buffer = 
NULL; + + mca_pml_ubcl_forge_modex_key(keyname, sizeof(keyname), type); + OPAL_MODEX_RECV_STRING(ret, keyname, &proc->super.proc_name, + (void**) &received_buffer, &received_size); + if (0 > ret) { + mca_pml_ubcl_error(OMPI_ERROR, "Failed to modex recv string: %s (%d)", + opal_strerror(ret), ret); + } + + if (received_size != size) { + mca_pml_ubcl_error(OMPI_ERROR, "Modex value is truncated (expected: %zu, receiced: %zu)", + size, received_size); + } + + memcpy(endpoint_h, received_buffer, size); + + free(received_buffer); + + return OMPI_SUCCESS; +} + +static int mca_pml_ubcl_create_send_endpoint(ompi_proc_t *proc, size_t remote_rank, int type) +{ + ubcl_error_t err; + char endpoint_h[UBCL_HANDLE_SIZE]; + uint64_t ubcl_rank; + ompi_proc_t *self; + + self = ompi_proc_local(); + ubcl_rank = mca_pml_forge_rank(self); + + mca_pml_ubcl_recv_endpoint_modex_get(proc, type, (endp_handle_t) endpoint_h, sizeof(endpoint_h)); + err = ubcl_create_remote_endpoint(ubcl_rank, remote_rank, type, (endp_handle_t) endpoint_h); + + if (UBCL_SUCCESS != err) { + return OMPI_ERROR; + } + + ubcl_get_endpoint_type_capabilities(type, &mca_pml_ubcl_component.endpoint_capabilities[type]); + + return OMPI_SUCCESS; +} + +static int mca_pml_ubcl_create_recv_endpoint(uint64_t sender_rank, const int type) +{ + ubcl_error_t err; + uint64_t remote_rank_u64; + endp_handle_t endpoint_h[UBCL_HANDLE_SIZE]; + + remote_rank_u64 = sender_rank; + + err = ubcl_export_local_endpoint_handle(type, endpoint_h, &remote_rank_u64); + if (UBCL_SUCCESS != err) { + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +static int mca_pml_ubcl_create_self_endpoints(uint64_t remote_rank) +{ + ubcl_error_t err; + int type = UBCL_ENDPOINT_TYPE_SELF; + char endpoint_h[UBCL_HANDLE_SIZE]; + uint64_t my_rank = remote_rank; + + err = ubcl_export_local_endpoint_handle(type, endpoint_h, &my_rank); + if (UBCL_SUCCESS != err) { + return OMPI_ERROR; + } + err = ubcl_create_remote_endpoint(my_rank, my_rank, type, endpoint_h); + if 
(UBCL_SUCCESS != err) { + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + +static int get_endpoint_type(ompi_proc_t *proc) +{ + if (ompi_proc_local() == proc) { + return UBCL_ENDPOINT_TYPE_SELF; + } + + /* Known limitation: proc_flags are invalid when jobid is different */ + if (proc->super.proc_name.jobid == ompi_proc_local()->super.proc_name.jobid + && OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags) + && !mca_pml_ubcl_component.force_intranode_bxi) { + return UBCL_ENDPOINT_TYPE_SHMEM; + } else { + return UBCL_ENDPOINT_TYPE_BXI; + } +} + +void mca_pml_ubcl_endpoint_retain(ompi_proc_t *proc) +{ + mca_common_ubcl_endpoint_t *endpoint = NULL; + assert(NULL != proc); + + endpoint = (proc)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + assert(NULL != endpoint); + + opal_atomic_fetch_add_32(&endpoint->refcount, 1); + mca_pml_ubcl_component.nprocs++; + OBJ_RETAIN(proc); +} + +int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc) +{ + int err = OMPI_SUCCESS; + mca_common_ubcl_endpoint_t *new_endpoint; + + new_endpoint = malloc(sizeof(mca_common_ubcl_endpoint_t)); + if (NULL == new_endpoint) { + mca_pml_ubcl_error(OMPI_ERR_OUT_OF_RESOURCE, + "PML/UBCL BXI EP Malloc: not enough memory"); + } + + new_endpoint->refcount = 0; //we increment it to 1 in endpoint_retain + new_endpoint->rank = mca_pml_forge_rank(proc); + new_endpoint->type = get_endpoint_type(proc); + + if (UBCL_ENDPOINT_TYPE_SELF == new_endpoint->type) { + err = mca_pml_ubcl_create_self_endpoints((uint64_t) new_endpoint->rank); + goto end; + } + + err = mca_pml_ubcl_create_recv_endpoint(new_endpoint->rank, new_endpoint->type); + if (OMPI_SUCCESS != err) { + mca_pml_ubcl_error(err, "Failed to create recv endpoint for rank %zu\n", + new_endpoint->rank); + } + + err = mca_pml_ubcl_create_send_endpoint(proc, new_endpoint->rank, new_endpoint->type); + if (OMPI_SUCCESS != err) { + mca_pml_ubcl_error(err, "Failed to create send endpoint for rank %zu\n", + new_endpoint->rank); + } + +end: + 
(proc)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = new_endpoint; + mca_pml_ubcl_endpoint_retain(proc); + + return err; +} + +int mca_pml_ubcl_add_procs(ompi_proc_t **procs, size_t nprocs) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_ADD_PROCS\n")); + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL ADD PROCS: %lu to add", nprocs)); + + /* Initialize all endpoint with remote rank */ + for (size_t i = 0; i < nprocs; i++) { + /* Let's not create endpoints or increment refcount multiple times */ + if (NULL == procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]) { + int ret = mca_pml_ubcl_create_endpoints(procs[i]); + if (OMPI_SUCCESS != ret) { + mca_pml_ubcl_error(ret, "Failed mca_ubcl_create_remote_endpoint"); + } + } + } + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "PML/UBCL ADD_PROCS called")); + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_endpoint_release(ompi_proc_t *proc) +{ + uint32_t endpoint_refcount; + ubcl_error_t ret = UBCL_SUCCESS; + int ompi_error = OMPI_SUCCESS; + mca_common_ubcl_endpoint_t *endpoint = NULL; + assert(NULL != proc); + + endpoint = (proc)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + assert(NULL != endpoint); + + endpoint_refcount = opal_atomic_sub_fetch_32(&endpoint->refcount, 1); + if (0 == endpoint_refcount) { + ret = ubcl_free_remote_endpoint(endpoint->rank); + if (UBCL_SUCCESS != ret) { + ompi_error = ubcl_error_to_ompi(ret); + mca_pml_ubcl_warn(ompi_error, "PML/UBCL failed to free remote endpoint"); + } + ret = ubcl_close_local_endpoint_channel(endpoint->type, endpoint->rank); + if (UBCL_SUCCESS != ret) { + ompi_error = ubcl_error_to_ompi(ret); + mca_pml_ubcl_warn(ompi_error, "PML/UBCL failed to close local endpoint channel"); + } + free(endpoint); + proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = NULL; + mca_pml_ubcl_component.nprocs -= 1; + OBJ_RELEASE(proc); + } + + return ompi_error; +} + +int mca_pml_ubcl_del_procs(ompi_proc_t **procs, size_t nprocs) +{ + int ret 
= OMPI_SUCCESS; + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_DEL_PROCS\n")); + + for (uint32_t i = 0; i < nprocs; i++) { + if (OMPI_SUCCESS != mca_pml_ubcl_endpoint_release(procs[i])) { + ret = OMPI_ERROR; + } + } + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "PML/UBCL DEL_PROCS called")); + + return ret; +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_endpoint.h b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.h new file mode 100644 index 00000000000..0d0a44879ec --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_endpoint.h @@ -0,0 +1,18 @@ +#ifndef PML_UBCL_ENDPOINT_INCLUDE_H +#define PML_UBCL_ENDPOINT_INCLUDE_H + +#include +#include "opal/util/proc.h" +/** * Endpoint structure */ +#include "opal/mca/common/ubcl/common_ubcl.h" + +/* endpoint functions */ + +int mca_pml_ubcl_create_local_endpoint(void); +int mca_pml_ubcl_free_local_endpoints(void); +int mca_pml_ubcl_endpoint_release(ompi_proc_t *proc); +void mca_pml_ubcl_endpoint_retain(ompi_proc_t *proc); +int mca_pml_ubcl_add_procs(ompi_proc_t **procs, size_t nprocs); +int mca_pml_ubcl_del_procs(ompi_proc_t **procs, size_t nprocs); + +#endif /* #ifndef PML_UBCL_ENDPOINT_INCLUDE_H */ diff --git a/ompi/mca/pml/ubcl/pml_ubcl_iprobe.c b/ompi/mca/pml/ubcl/pml_ubcl_iprobe.c new file mode 100644 index 00000000000..6b6dbad0cee --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_iprobe.c @@ -0,0 +1,129 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_iprobe.c + * + * UBCL PML iprobe related functions + * + */ + +#include "ompi/constants.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_request.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/message/message.h" +#include "ompi/proc/proc.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ubcl_api.h" + +int mca_pml_ubcl_iprobe(int src, int tag, struct ompi_communicator_t *comm, + int *matched, ompi_status_public_t *status) +{ + OPAL_OUTPUT_VERBOSE((75, mca_pml_ubcl_component.output, + "UBCL_MODULE_IPROBE\n")); + ubcl_status_t ubcl_status; + uint64_t cid; + uint64_t rank; + + if (OMPI_ANY_SOURCE == src) { + rank = UBCL_ANY_SOURCE; + } else { + ompi_proc_t *proc = ompi_comm_peer_lookup(comm, src); + mca_common_ubcl_endpoint_t *endpoint = NULL; + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + rank = endpoint->rank; + } + + cid = ompi_comm_get_local_cid(comm); + ubcl_cid_t ubcl_cid= mca_pml_ubcl_compute_ubcl_cid(tag, cid); + + /* Call the UBCL api for iprobe */ + ubcl_iprobe(rank, tag, ubcl_cid, matched, &ubcl_status); + if (*matched) { + mca_common_ubcl_status_to_ompi(status, ubcl_status, comm, src); + } + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_probe(int src, int tag, struct ompi_communicator_t *comm, + ompi_status_public_t *status) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_PROBE\n")); + int match = 0; + + /* Loop over pml iprobe */ + while (!match) { + mca_pml_ubcl_iprobe(src, tag, comm, &match, status); + } + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_improbe(int src, int tag, struct ompi_communicator_t *comm, + int *matched, struct ompi_message_t **message, + ompi_status_public_t *status) +{ + OPAL_OUTPUT_VERBOSE((75, 
mca_pml_ubcl_component.output, + "UBCL_MODULE_IMPROBE\n")); + ubcl_status_t ubcl_status; + uint64_t rank; + uint64_t cid; + if (OMPI_ANY_SOURCE == src) { + rank = UBCL_ANY_SOURCE; + } else { + ompi_proc_t *proc = ompi_comm_peer_lookup(comm, src); + mca_common_ubcl_endpoint_t *endpoint = NULL; + endpoint = (mca_common_ubcl_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + rank = endpoint->rank; + } + + cid = ompi_comm_get_local_cid(comm); + ubcl_cid_t ubcl_cid = mca_pml_ubcl_compute_ubcl_cid(tag, cid); + + ubcl_message_t *ubcl_message; + + /* Call the UBCL api for improbe */ + ubcl_improbe(rank, tag, ubcl_cid, matched, &ubcl_message, &ubcl_status); + if (*matched) { + mca_common_ubcl_status_to_ompi(status, ubcl_status, comm, src); + *message = ompi_message_alloc(); + if (message == NULL) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + (*message)->req_ptr = ubcl_message; + (*message)->comm = comm; + (*message)->peer = mca_common_ubcl_get_mpi_rank(src, comm, ubcl_status.remote); + (*message)->count = ubcl_status.size; + } + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_mprobe(int src, int tag, struct ompi_communicator_t *comm, + struct ompi_message_t **message, + ompi_status_public_t *status) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_MPROBE\n")); + int match = 0; + + /* Loop over pml improbe */ + while (!match) { + mca_pml_ubcl_improbe(src, tag, comm, &match, message, status); + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_irecv.c b/ompi/mca/pml/ubcl/pml_ubcl_irecv.c new file mode 100644 index 00000000000..9ea74d9e428 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_irecv.c @@ -0,0 +1,292 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_irecv.c + * + * UBCL PML irecv related functions + * + * Functions parameters and return values defined in pml.h. + */ + +#include "opal/mca/common/ubcl/common_ubcl.h" + +#include "ompi/constants.h" +#include "ompi/mca/pml/pml_constants.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/message/message.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_request.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/proc/proc.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ubcl_api.h" + +/** + * Prepare a request for reception. + */ +void mca_pml_ubcl_irecv_prepare(void *buf, size_t count, + ompi_datatype_t *datatype, int src, int tag, + struct ompi_communicator_t *comm, + struct ompi_request_t **request, + bool persistent, bool probe, + struct ompi_message_t *message) +{ + ompi_proc_t *proc; + mca_pml_ubcl_request_t *req; + +#if defined(OPAL_ENABLE_DEBUG) && OPAL_ENABLE_DEBUG + if (probe) { + OPAL_OUTPUT_VERBOSE((75, mca_pml_ubcl_component.output, + "UBCL_MODULE_IRECV_PREPARE\n")); + } else { + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_IRECV_PREPARE\n")); + } +#endif /* OPAL_ENABLE_DEBUG */ + + /* Get proc */ + if (OMPI_ANY_SOURCE != src) { + proc = ompi_comm_peer_lookup(comm, src); + if (OPAL_UNLIKELY(NULL == proc)) { + mca_pml_ubcl_error(OMPI_ERROR, "Unknown proc"); + } + } else { + proc = NULL; + } + + /* Allocate request and activate it */ + req = (mca_pml_ubcl_request_t *) opal_free_list_get(&mca_pml_ubcl_component.pml_req_free_list); + if (OPAL_UNLIKELY(NULL == req)) { + mca_pml_ubcl_error(OMPI_ERR_OUT_OF_RESOURCE, + "Not enough memory to allocate a recv request"); + } + + MCA_PML_UBCL_RECV_REQUEST_INIT(req, buf, count, datatype, src, tag, comm, + proc, persistent, 
probe, message); + + /* Set user request */ + *request = &req->ompi_req; +} + +/** + * Actually start a recv request. + */ +void mca_pml_ubcl_irecv_start(struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "UBCL_MODULE_IRECV_START %p\n", + (void *) *request)); + + mca_pml_ubcl_request_t *req = container_of((*request), + mca_pml_ubcl_request_t, ompi_req); + void *output_buf = (void *) req->buf; + + ubcl_memory_descriptor_t rbuf_md; + ubcl_error_t err = 0; + size_t size; + + /* Init UBCL MD */ + err = ubcl_memory_descriptor_init(&rbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), "Failed to initialize ubcl MD"); + } + if (pml_ubcl_request_is_cuda_buf(req)) { + err = ubcl_memory_descriptor_set_properties(UBCL_BUF_IS_CUDA, &rbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), + "Failed to set MD properties, got error: %d", err); + } + } + + /* If we don't need to pack we can build a contiguous */ + if (! 
MCA_PML_UBCL_REQUEST_NEED_XPACK(req)) { + ompi_datatype_type_size(req->datatype, &size); + size *= req->count; + + err = ubcl_memory_descriptor_build_contiguous(output_buf, size, &rbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build memory descriptor for output buffer"); + } + } + + /* Always build a custom MD representation so that we have a fallback */ + err = ubcl_memory_descriptor_build_custom((void *) &req->convertor, + pml_ubcl_datatype_pack, + pml_ubcl_datatype_unpack, + pml_ubcl_datatype_mem_size, + pml_ubcl_datatype_finish, + &rbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build custom memory descriptor for input buffer"); + } + + /* Activate request */ + MCA_PML_UBCL_REQUEST_ACTIVATE(req); + + if (req->message != NULL) { + err = ubcl_imrecv(rbuf_md, (ubcl_message_t **) &req->message, + (ubcl_completion_callback_fct) &ubcl_request_recv_complete_cb, + *request); + } else { + uint64_t rank; + uint64_t cid; + int32_t tag = req->tag; + + if (OMPI_ANY_SOURCE == req->rank) { + rank = UBCL_ANY_SOURCE; + } else { + mca_common_ubcl_endpoint_t *endpoint = NULL; + endpoint = (mca_common_ubcl_endpoint_t *) req->proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + rank = endpoint->rank; + } + + cid = ompi_comm_get_local_cid(req->comm); + ubcl_cid_t ubcl_cid = mca_pml_ubcl_compute_ubcl_cid(req->tag, cid); + tag = req->tag; + + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "PML/UBCL IRECV: recv from rank=%zu\n", rank)); + err = ubcl_irecv(rbuf_md, tag, ubcl_cid, rank, + (ubcl_completion_callback_fct) &ubcl_request_recv_complete_cb, + *request, &req->ubcl_operation_handle); + } + + if (UBCL_ERROR == err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), "Failed to start recv comm"); + } + + /* Optionnal call to progress */ + if (mca_pml_ubcl_component.can_progress) { + opal_progress(); + } +} + +int mca_pml_ubcl_irecv_init(void *buf, size_t count, 
ompi_datatype_t *datatype, + int src, int tag, struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_IRECV_INIT\n")); + + /* Create request */ + mca_pml_ubcl_irecv_prepare(buf, count, datatype, src, tag, comm, request, + true, false, NULL); + + return OMPI_SUCCESS; +} + +/** + * Non blocking receive primitive. Get endpoint, allocate a pml request and + * forward to selected communication module + */ +int mca_pml_ubcl_irecv(void *buf, size_t count, ompi_datatype_t *datatype, + int src, int tag, struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_IRECV\n")); + + /* Create request and start communication */ + mca_pml_ubcl_irecv_prepare(buf, count, datatype, src, tag, comm, request, + false, false, NULL); + mca_pml_ubcl_irecv_start(request); + + return OMPI_SUCCESS; +} + +/** + * Blocking receive primitive. Call non-blocking receive and wait for request + * completion + */ +int mca_pml_ubcl_recv(void *buf, size_t count, ompi_datatype_t *datatype, + int src, int tag, struct ompi_communicator_t *comm, + ompi_status_public_t *status) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_RECV\n")); + + /* Create request and start communication */ + struct ompi_request_t *request = NULL; + int rc = 0; /** TODO: fix return code */ + mca_pml_ubcl_irecv_prepare(buf, count, datatype, src, tag, comm, &request, + false, false, NULL); + mca_pml_ubcl_irecv_start(&request); + + /* Wait for data to be received */ + ompi_request_wait_completion(request); + + mca_pml_ubcl_request_t *req = container_of(request, mca_pml_ubcl_request_t, + ompi_req); + rc = req->ompi_req.req_status.MPI_ERROR; + + if (MPI_STATUS_IGNORE != status) { + OMPI_COPY_STATUS(status, req->ompi_req.req_status, false); + } + + mca_pml_ubcl_request_finalize(req); + + return rc; +} + +int mca_pml_ubcl_imrecv(void 
*buf, size_t count, ompi_datatype_t *datatype, + struct ompi_message_t **message, + struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_IMRECV\n")); + + /* Create request and start communication */ + mca_pml_ubcl_irecv_prepare(buf, count, datatype, (*message)->peer, + OMPI_ANY_TAG, (*message)->comm, request, + false, true, (*message)->req_ptr); + mca_pml_ubcl_irecv_start(request); + ompi_message_return(*message); + *message = MPI_MESSAGE_NULL; + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_mrecv(void *buf, size_t count, ompi_datatype_t *datatype, + struct ompi_message_t **message, + ompi_status_public_t *status) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "UBCL_MODULE_MRECV\n")); + + struct ompi_request_t *request = NULL; + int rc = 0; + //we're matching any message tag + mca_pml_ubcl_irecv_prepare(buf, count, datatype, (*message)->peer, + OMPI_ANY_TAG, (*message)->comm, &request, + false, true, (*message)->req_ptr); + mca_pml_ubcl_irecv_start(&request); + + /* Wait for data to be received */ + ompi_request_wait_completion(request); + + mca_pml_ubcl_request_t *req = container_of(request, mca_pml_ubcl_request_t, + ompi_req); + rc = req->ompi_req.req_status.MPI_ERROR; + + if (MPI_STATUS_IGNORE != status) { + OMPI_COPY_STATUS(status, req->ompi_req.req_status, false); + } + + mca_pml_ubcl_request_finalize(req); + ompi_message_return(*message); + *message = MPI_MESSAGE_NULL; + + return rc; +} + diff --git a/ompi/mca/pml/ubcl/pml_ubcl_isend.c b/ompi/mca/pml/ubcl/pml_ubcl_isend.c new file mode 100644 index 00000000000..9d5b282d884 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_isend.c @@ -0,0 +1,249 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_isend.c + * + * PML/UBCL isend related functions + * + * Functions parameters and return values defined in ompi/mca/pml/pml.h. + */ + +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/proc/proc.h" +#include "ompi/mca/pml/base/pml_base_bsend.h" +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_request.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h" +#include "ompi/request/request.h" +#include "ompi/mca/common/ubcl/common_ubcl.h" +#include "opal/mca/common/ubcl/common_ubcl.h" +#include "ubcl_api.h" + +static inline void get_ubcl_send_mode(mca_pml_base_send_mode_t mode, ubcl_send_mode_t *send_mode) +{ + switch(mode) { + case MCA_PML_BASE_SEND_SYNCHRONOUS: + *send_mode = UBCL_SEND_MODE_SYNCHRONOUS; + break; + case MCA_PML_BASE_SEND_READY: + *send_mode = UBCL_SEND_MODE_READY; + break; + case MCA_PML_BASE_SEND_BUFFERED: + *send_mode = UBCL_SEND_MODE_BUFFERED; + break; + /* Other modes not yet supported in UBCL */ + default: + *send_mode = UBCL_SEND_MODE_STANDARD; + break; + } +} + +/** + * Prepare a request for sending and perform actions according to send mode. + * + * Send modes: + * - BUFFERED = Use a specific user-defined buffer to store buf and return. + * See buffer_attach/detach + * - READY = User tells us that matching receive has already been posted by peer + * - SYNCHRONOUS = Return only when peer has begun to receive + * - STANDARD = BUFFERED or SYNCHRONOUS (up to pml to decide) + * + * By default READY is equivalent to STANDARD, except if checks are enabled by + * MCA: then receiver may print a warning or an error. + * SYNCHRONOUS forces STANDARD rendezvous protocols. 
+ */ +static inline void mca_pml_ubcl_isend_prepare(const void *buf, size_t count, + ompi_datatype_t *datatype, int dst, int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t *comm, + struct ompi_request_t **request, bool persistent) +{ + ompi_proc_t *proc; + mca_pml_ubcl_request_t *req; + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_ISEND_PREPARE\n")); + + /* Get proc */ + proc = ompi_comm_peer_lookup(comm, dst); + if (OPAL_UNLIKELY(NULL == proc)) { + mca_pml_ubcl_error(OMPI_ERROR, "Unknown proc"); + } + + /* Allocate request */ + req = (mca_pml_ubcl_request_t *) opal_free_list_get(&mca_pml_ubcl_component.pml_req_free_list); + if (OPAL_UNLIKELY(NULL == req)) { + mca_pml_ubcl_error(OMPI_ERR_OUT_OF_RESOURCE, "Not enough memory to allocate a PML request"); + } + + /* TODO: Find out what can be simplified in this macro and request structure */ + MCA_PML_UBCL_SEND_REQUEST_INIT(req, buf, count, datatype, dst, tag, mode, comm, proc, + persistent); + + /* Set user request */ + *request = &req->ompi_req; +} + +/** + * Actually start a send request and perform actions according to send mode. 
+ */ +void mca_pml_ubcl_isend_start(struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "UBCL_MODULE_ISEND_START %p\n", *request)); + + mca_pml_ubcl_request_t *req = container_of((*request), mca_pml_ubcl_request_t, ompi_req); + + char *input_buf = NULL; + mca_common_ubcl_endpoint_t *endpoint = NULL; + ubcl_memory_descriptor_t sbuf_md; + ubcl_error_t err = 0; + ubcl_send_mode_t send_mode; + uint64_t cid; + int32_t tag = req->tag; + ubcl_cid_t ubcl_cid; + + /* Activate request */ + MCA_PML_UBCL_REQUEST_ACTIVATE(req); + + if (MCA_PML_BASE_SEND_BUFFERED == req->mode) { + pml_ubcl_bufferize(req); + } + get_ubcl_send_mode(req->mode, &send_mode); + + input_buf = (char*) req->buf; + + /* Retrieve endpoint and compute overall message size */ + endpoint = (mca_common_ubcl_endpoint_t *) req->proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML]; + + /* Init UBCL MD */ + err = ubcl_memory_descriptor_init(&sbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), "Failed to initialize ubcl MD"); + } + if (pml_ubcl_request_is_cuda_buf(req)) { + err = ubcl_memory_descriptor_set_properties(UBCL_BUF_IS_CUDA, &sbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), + "Failed to set MD properties, got error: %d", err); + } + } + + /* If we don't need to pack we can build a contiguous */ + if (! 
MCA_PML_UBCL_REQUEST_NEED_XPACK(req)) { + ptrdiff_t gap = 0; + size_t span = opal_datatype_span(&req->datatype->super, req->count, &gap); + err = ubcl_memory_descriptor_build_contiguous(input_buf+gap, span, &sbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build contiguous memory descriptor for input buffer"); + } + } + + /* Always build a custom MD representation so that we have a fallback */ + err = ubcl_memory_descriptor_build_custom((void *) &req->convertor, + pml_ubcl_datatype_pack, + pml_ubcl_datatype_unpack, + pml_ubcl_datatype_mem_size, + pml_ubcl_datatype_finish, &sbuf_md); + if (UBCL_SUCCESS != err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), + "Failed to build custom memory descriptor for input buffer"); + } + + cid = ompi_comm_get_local_cid(req->comm); + ubcl_cid = mca_pml_ubcl_compute_ubcl_cid(req->tag, cid); + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "PML/UBCL ISEND: send mpi_tag=%x comm_id=%zu\n", tag, ubcl_cid.bits)); + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "PML/UBCL ISEND: ompi_req=%p\n", *request)); + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "PML/UBCL ISEND: sending to rank=%zu\n", endpoint->rank)); + + err = ubcl_isend(sbuf_md, tag, ubcl_cid, endpoint->rank, send_mode, + (ubcl_completion_callback_fct) &ubcl_request_send_complete_cb, + *request, &req->ubcl_operation_handle); + if (UBCL_ERROR == err) { + mca_pml_ubcl_error(ubcl_error_to_ompi(err), "Failed to send data"); + } + + /* Optionnal call to progress */ + if (mca_pml_ubcl_component.can_progress) { + opal_progress(); + } +} + +/** + * Initialize a permanent send request + */ +int mca_pml_ubcl_isend_init(const void *buf, size_t count, ompi_datatype_t *datatype, int dst, + int tag, mca_pml_base_send_mode_t mode, + struct ompi_communicator_t *comm, struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_ISEND_INIT\n")); + + /* Create 
request */ + mca_pml_ubcl_isend_prepare(buf, count, datatype, dst, tag, mode, comm, request, true); + + return OMPI_SUCCESS; +} + +/** + * Non-blocking send primitive. Return to user as soon as possible after the + * communication is started. + */ +int mca_pml_ubcl_isend(const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + mca_pml_base_send_mode_t mode, struct ompi_communicator_t *comm, + struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_ISEND\n")); + + /* Create request and start communication */ + mca_pml_ubcl_isend_prepare(buf, count, datatype, dst, tag, mode, comm, request, false); + mca_pml_ubcl_isend_start(request); + + return OMPI_SUCCESS; +} + +/** + * Blocking send primitive. Return only when buffer can be reused by user + * (i.e. either dest has received all or we buffered). + */ +int mca_pml_ubcl_send(const void *buf, size_t count, ompi_datatype_t *datatype, int dst, int tag, + mca_pml_base_send_mode_t mode, struct ompi_communicator_t *comm) +{ + int ret; + mca_pml_ubcl_request_t *request = NULL; + struct ompi_request_t *ompi_request; + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "UBCL_MODULE_SEND\n")); + + ret = mca_pml_ubcl_isend(buf, count, datatype, dst, tag, mode, comm, &ompi_request); + if (OMPI_SUCCESS != ret || NULL == ompi_request) { + return ret; + } + + request = container_of(ompi_request, mca_pml_ubcl_request_t, ompi_req); + + if (MCA_PML_BASE_SEND_BUFFERED == mode) { + /* MPI specification: Bsend is local, no information about the remote. + * PML/BXI always buffers Bsend data. 
No need to wait request completion */ + request->to_free = 1; + } else { + ompi_request_wait_completion(ompi_request); + mca_pml_ubcl_request_finalize(request); + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_progress.c b/ompi/mca/pml/ubcl/pml_ubcl_progress.c new file mode 100644 index 00000000000..688f34bce08 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_progress.c @@ -0,0 +1,38 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2024 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_progress.c + * + * UBCL PML progress related functions + * + * Functions parameters and return values defined in ompi/mca/pml/pml.h. + */ + +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/proc/proc.h" + +#include + +/** + * Forward to communication modules. Could use some weight for priority given + * frequency of call with no event. + */ +int mca_pml_ubcl_progress(void) +{ + if (0 == mca_pml_ubcl_component.nprocs) { + //return OMPI_ERROR; + return OMPI_SUCCESS; + } + + ubcl_progress(); + return OMPI_SUCCESS; +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_request.c b/ompi/mca/pml/ubcl/pml_ubcl_request.c new file mode 100644 index 00000000000..d3201fd57b5 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_request.c @@ -0,0 +1,386 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2025 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_request.c + * + * UBCL PML Requests + * + * This file holds the MPI matching engine for the pml. It uses expected_list, + * unexpected_list and matched_list from the mca_pml_ubcl_component + * component. 
Messages come down from the pml interface (isend, irecv, iprobe) + * and up from the communication modules through + * mca_pml_ubcl_request_report_unexpected(). Matching is performed according to the + * MPI standard on the envelope (rank, tag, cid) and in posted order. Note that messages + * on different communicators are still ordered though it is not required. It + * would need additional development. + * + * Function parameters and return values are defined in ompi/request/request.h. + * Following functions are actually used but inside macros and through function + * pointers and are not detected by cppcheck. + */ + +#include "ompi/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_utils.h" +#include "ompi/mca/pml/ubcl/pml_ubcl_request.h" + +OBJ_CLASS_INSTANCE(mca_pml_ubcl_request_t, + opal_free_list_item_t, + NULL, + NULL); + +/** + * Start a PML request. Find the mca_pml_ubcl_request with the given ompi_request, + * reset it and start it. 
+ */ +int mca_pml_ubcl_request_start(size_t count, struct ompi_request_t **requests) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "PML/UBCL REQUEST_START %zu\n", count)); + + int ret = OMPI_SUCCESS; + for (size_t i = 0; i < count; i++) { + mca_pml_ubcl_request_t *req = container_of(requests[i], mca_pml_ubcl_request_t, ompi_req); + + /* Save callback fields if they are not ours */ + if(mca_pml_ubcl_request_complete_cb != req->ompi_req.req_complete_cb) { + req->saved_complete_cb = req->ompi_req.req_complete_cb; + req->saved_complete_cb_data = req->ompi_req.req_complete_cb_data; + } else { + /* Else reset fields in case of persistent request */ + req->saved_complete_cb = NULL; + req->saved_complete_cb_data = NULL; + } + + /* Reset fields if persistent request */ + OMPI_REQUEST_INIT(&req->ompi_req, req->ompi_req.req_persistent); + req->ompi_req.req_complete_cb = mca_pml_ubcl_request_complete_cb; + req->completed = 0; + req->message = NULL; + req->prematched_req = NULL; + if (req->is_any_src) { + req->rank = OMPI_ANY_SOURCE; + req->proc = NULL; + opal_convertor_cleanup(&req->convertor); + } else { + size_t offset = 0; + opal_convertor_set_position(&req->convertor, &offset); + } + if (req->is_any_tag) { + req->tag = OMPI_ANY_TAG; + } + + /* Start request */ + if (MCA_PML_UBCL_REQUEST_SEND != req->type) { + /* Recv request */ + mca_pml_ubcl_irecv_start(requests + i); + } else { + /* Send request */ + mca_pml_ubcl_isend_start(requests + i); + } + } + + return ret; +} + +/** + * Free a PML request. Find the mca_pml_bxi_request with the given ompi_request, + * mark it as "to be freed" and finalize if already completed. 
+ */ +int +// cppcheck-suppress unusedFunction +mca_pml_ubcl_request_free(struct ompi_request_t **request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "PML/UBCL REQUEST_FREE %p %p\n", + (void *) request, (void *) *request)); + + /* Null check */ + if (MPI_REQUEST_NULL == *request) { + return OMPI_SUCCESS; + } + + mca_pml_ubcl_request_t *req = container_of((*request), mca_pml_ubcl_request_t, ompi_req); + if (!REQUEST_COMPLETE(&(req)->ompi_req) || !(req)->completed) { + /* Free called before complete : mark as "to free" */ + req->to_free = 1; + } else { + mca_pml_ubcl_request_finalize(req); + } + + *request = MPI_REQUEST_NULL; + + return OMPI_SUCCESS; +} + +/** + * Cannot cancel pml requests + */ +int +// cppcheck-suppress unusedFunction +mca_pml_ubcl_request_cancel(struct ompi_request_t *request, int complete) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, "PML/UBCL REQUEST_CANCEL\n")); + + mca_pml_ubcl_request_t *req = container_of(request, mca_pml_ubcl_request_t, ompi_req); + bool success = false; + ubcl_error_t err; + + /* This lock cannot be removed, even in thread single mode */ + opal_atomic_lock(&req->req_lock); + switch (req->type) { + case MCA_PML_UBCL_REQUEST_SEND: + /* Cannot cancel send requests */ + break; + case MCA_PML_UBCL_REQUEST_RECV: + if (req->completed) { + /* Cannot cancel completed requests */ + break; + } + if (NULL == req->ubcl_operation_handle) { + /* We did not store operation handle, cannot cancel */ + break; + } + + /* Try to cancel the request */ + err = ubcl_cancel(req->ubcl_operation_handle); + if (UBCL_SUCCESS != err) { + break; + } + + req->completed = true; + success = true; + break; + } + opal_atomic_unlock(&req->req_lock); + + if (!success) { + return OMPI_SUCCESS; + } + + /* If the cancel was successfull, mark the request as cancelled and complete it */ + switch (req->type) { + case MCA_PML_UBCL_REQUEST_SEND: + break; + case MCA_PML_UBCL_REQUEST_RECV: + request->req_status._cancelled = true; + 
ompi_request_complete(&(req->ompi_req), true); + break; + } + + return OMPI_SUCCESS; +} + +int mca_pml_ubcl_request_complete(struct ompi_request_t *request) +{ + /* Null check */ + if (MPI_REQUEST_NULL == request) { + return 0; + } + + mca_pml_ubcl_request_t *req = container_of(request, mca_pml_ubcl_request_t, ompi_req); + + /* If we saved a callback, reset the ompi_request_t fields and call it */ + if (req->saved_complete_cb) { + request->req_complete_cb = req->saved_complete_cb; + request->req_complete_cb_data = req->saved_complete_cb_data; + request->req_complete_cb(request); + } + + if (req->to_free && req->completed) { + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "PML/UBCL REQUEST_COMPLETE CALL FINALIZE")); + mca_pml_ubcl_request_finalize(req); + return 1; + } + + return 0; +} + +/** + * Complete a PML request. Find the mca_pml_ubcl_request with the given + * ompi_request, mark it as "completed" and finalize if already freed. + */ +int mca_pml_ubcl_request_complete_cb(struct ompi_request_t *request) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "PML/UBCL REQUEST_COMPLETE CALLBACK CALLED with ompi_req=%p\n", + (void *) request)); + + return mca_pml_ubcl_request_complete(request); +} + +/* TODO: Get a pointer to status and not a cpy ? 
*/ +void ubcl_request_send_complete_cb(ubcl_status_t status, void *cb_data) +{ + if (UBCL_SUCCESS != status.status) { + mca_pml_ubcl_error(OMPI_ERROR, "UBCL error at request completion"); + } + + ompi_request_t *request = (ompi_request_t *) cb_data; + mca_pml_ubcl_request_t *req = container_of(request, mca_pml_ubcl_request_t, ompi_req); + + size_t dt_size; + ompi_datatype_type_size(req->datatype, &dt_size); + + /* This lock cannot be removed, even in thread single mode */ + opal_atomic_lock(&req->req_lock); + req->completed = 1; + opal_atomic_unlock(&req->req_lock); + if (req->is_buffered) { + mca_pml_base_bsend_request_free((void*)req->buf); + /* Bsend started completed, but could not be freed, now that UBCL is + * done the transfer, if MPI_Wait is done, let free it */ + if (req->to_free) { + /* MPI request has already been waited (Bsend) or freed, no one needs it anymore */ + mca_pml_ubcl_request_finalize(req); + } + } else { + /* No need to set a MPI_Status on Send operations */ + /* No need to free the request: completion callbacks will do it */ + ompi_request_complete(&(req->ompi_req), true); + } + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "PML/UBCL SEND_COMPLETE pml_req=%p mpi_tag=%x\n", req, req->tag)); + + /** mca_pml_ubcl_request_complete((ompi_request_t *) cb_data); */ +} + +void ubcl_request_recv_complete_cb(ubcl_status_t status, void *cb_data) +{ + if (UBCL_SUCCESS != status.status) { + if (UBCL_ERR_TRUNCATE == status.status) { + if (mca_pml_ubcl_component.warn_on_truncate + || mca_pml_ubcl_component.abort_on_truncate) { + mca_pml_ubcl_warn(MPI_ERR_TRUNCATE, "Truncation error found during UBCL recv"); + } + if (mca_pml_ubcl_component.abort_on_truncate) { + ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_TRUNCATE); + } + } else { + mca_pml_ubcl_error(OMPI_ERROR, "UBCL error at request completion"); + } + } + + ompi_request_t *request = (ompi_request_t *) cb_data; + mca_pml_ubcl_request_t *req = container_of(request, 
mca_pml_ubcl_request_t, ompi_req); + + mca_common_ubcl_status_to_ompi(&request->req_status, status, req->comm, req->rank); + if (MPI_STATUS_IGNORE != &request->req_status) { + request->req_status.MPI_ERROR = ubcl_error_to_ompi(status.status); + } + + /* This lock cannot be removed, even in thread single mode */ + opal_atomic_lock(&req->req_lock); + req->completed = 1; + opal_atomic_unlock(&req->req_lock); + ompi_request_complete(&(req->ompi_req), true); + + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "PML/UBCL RECV_COMPLETE pml_req=%p mpi_tag=%d\n", req, req->tag)); + + /** mca_pml_ubcl_request_complete((ompi_request_t *) cb_data); */ +} + +/** + * Really cleanup and free request after a call to request_free and + * request_complete + */ +void mca_pml_ubcl_request_finalize(mca_pml_ubcl_request_t *req) +{ + OPAL_OUTPUT_VERBOSE((50, mca_pml_ubcl_component.output, + "PML/UBCL REQUEST_FINALIZE BEGIN pml_req=%p mpi_tag=%x\n", req, req->tag)); + + opal_convertor_cleanup(&req->convertor); + OBJ_DESTRUCT(&req->convertor); + OMPI_REQUEST_FINI(&req->ompi_req); + OBJ_RELEASE(req->comm); + OMPI_DATATYPE_RELEASE(req->datatype); + OBJ_DESTRUCT(&req->ompi_req); + + opal_free_list_return(&mca_pml_ubcl_component.pml_req_free_list, (opal_free_list_item_t *) req); + + OPAL_OUTPUT_VERBOSE( + (50, mca_pml_ubcl_component.output, "PML/UBCL REQUEST_FINALIZED %p\n", req)); +} + + +bool pml_ubcl_request_is_cuda_buf(mca_pml_ubcl_request_t *req) { + if (!mca_pml_ubcl_component.accelerator_is_cuda) { + return false; + } + + return !!(opal_convertor_on_device(&req->convertor)); +} + +int mca_pml_ubcl_request_need_xpack(mca_pml_ubcl_request_t *req, ubcl_endpoint_type_t type) +{ + int need_buffer; + int is_cuda_buffer; + ubcl_endpoint_capabilities_t *capabilities; + + ompi_datatype_t *datatype = req->datatype; + if (datatype->super.true_lb) { + return 1; + } + + need_buffer = opal_convertor_need_buffers(&req->convertor); + is_cuda_buffer = pml_ubcl_request_is_cuda_buf(req); + + /* 
If cuda contiguous ptr are allowed, we don't need to pack */ + if (!need_buffer && is_cuda_buffer) { + capabilities = &mca_pml_ubcl_component.endpoint_capabilities[type]; + /* Contiguous cuda buffer */ + if(!capabilities->allow_cuda_contig_ptr + || mca_pml_ubcl_component.force_cuda_custom_dt) { + /* Contiguous cuda ptr not allowed, forcing the use of pack/unpack */ + need_buffer = 1; + } + } + + return need_buffer; +} + +void pml_ubcl_bufferize(mca_pml_ubcl_request_t *req) +{ + if (NULL == req || req->is_buffered) { + return; + } + + void *buffer = NULL; + size_t dt_size, msg_size; + ompi_datatype_type_size(req->datatype, &dt_size); + msg_size = req->count * dt_size; + + /* TODO pack in a buffer on the same device as request buffer */ + buffer = mca_pml_base_bsend_request_alloc_buf(msg_size); + if (NULL == buffer) { + mca_pml_ubcl_error(OMPI_ERR_OUT_OF_RESOURCE, + "Buffered mode but no more memory left in attached " + "buffer\n"); + return; + } + + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data; + iov.iov_len = msg_size; + iov.iov_base = (char *) buffer; + opal_convertor_pack(&req->convertor, &iov, &iov_count, &max_data); + req->is_buffered = 1; + req->count = msg_size; + req->datatype = MPI_PACKED; + req->buf = buffer; + req->need_xpack = 0; + + /* Copy is done Bsend is completed. UBCL just have to do the job for real */ + /* No need to set a MPI_Status on Send operations */ + ompi_request_complete(&(req->ompi_req), true); +} diff --git a/ompi/mca/pml/ubcl/pml_ubcl_request.h b/ompi/mca/pml/ubcl/pml_ubcl_request.h new file mode 100644 index 00000000000..d47fa598af8 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_request.h @@ -0,0 +1,311 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2025 Bull SAS. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_requests.h + * + * UBCL PML Requests + * + * Several specific cases are to be handled with care, namely: + * - Persistant requests: + * Not much but need to be reset at each restart. Some fields are erased + * by OMPI_REQUEST_INIT() and need to be set again. + * - Matching requests (mprobe/mrecv): + * Once matched by a matching probe, an incoming message must be locked + * and can only be received thanks to a corresponding mrecv. Two fields + * are given to allow quick access to ompi_message_t and internal request + * from the pml request. + * - Final trick: + * You can have the following combinations: + * - A persistant any source receive request + * - A matching any source receive request + */ + +#ifndef MCA_PML_UBCL_REQUEST_H +#define MCA_PML_UBCL_REQUEST_H + +#include "ompi/mca/pml/base/base.h" +#include "ompi/mca/pml/pml_constants.h" +#include "ompi/message/message.h" +#include "ompi/proc/proc.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" +#include "ompi/request/request.h" +#include "ompi/mca/pml/base/pml_base_bsend.h" +#include "opal/include/opal/sys/atomic.h" +#include "opal/mca/common/ubcl/common_ubcl.h" + +#include + +BEGIN_C_DECLS + +/** + * Requests type enum + */ +typedef enum { MCA_PML_UBCL_REQUEST_SEND, MCA_PML_UBCL_REQUEST_RECV } mca_pml_ubcl_request_type_t; + +/** + * Request structure + * + * Fields map the usual MPI calls + */ +struct mca_pml_ubcl_request_t { + opal_free_list_item_t super; + ompi_request_t ompi_req; /**< Base request */ + mca_pml_ubcl_request_type_t type; + + /* PML parameters */ + uint64_t to_free:1; + uint64_t completed:1; + uint64_t need_xpack:1; + uint64_t is_buffered:1; + uint64_t is_buffer_malloced:1; + + /* Any source parameters */ + uint64_t is_any_tag:1; /**< Remember any_tag status for persistant resets */ + uint64_t is_any_src:1; /**< Remember any_src status for 
persistant resets and + * internal requests cleanup */ + uint64_t pad:57; + + /* MPI API parameters */ + const void *buf; + size_t count; + ompi_datatype_t *datatype; + int rank; /**< src or dest */ + int32_t tag; + int error; /**< Statuts error */ + mca_pml_base_send_mode_t mode; /**< Send mode for send requests */ + struct ompi_communicator_t *comm; /**< Communicator */ + struct ompi_proc_t *proc; /**< Remote ompi proc */ + opal_convertor_t convertor; /**< Data convertor */ + ompi_request_complete_fn_t saved_complete_cb; /**< Saved callback from another component (e.g OSC pt2pt) */ + void *saved_complete_cb_data; /**< Saved callback data from another component (e.g OSC pt2pt) */ + + /* Matching message parameters */ + ompi_message_t *message; + void *prematched_req; /**< Save matched internal request for quick mrecv */ + + /* Cancel/complete concurrency protection */ + opal_atomic_lock_t req_lock; + + /* Operation handle used for cancel */ + void *ubcl_operation_handle; +}; +typedef struct mca_pml_ubcl_request_t mca_pml_ubcl_request_t; +OBJ_CLASS_DECLARATION(mca_pml_ubcl_request_t); + +/** + * Callback functions from request system + */ +int mca_pml_ubcl_request_start(size_t count, struct ompi_request_t **requests); +int mca_pml_ubcl_request_free(struct ompi_request_t **request); +int mca_pml_ubcl_request_cancel(struct ompi_request_t *request, int flag); +int mca_pml_ubcl_request_complete_cb(struct ompi_request_t *request); +void ubcl_request_send_complete_cb(ubcl_status_t status, void *cb_data); +void ubcl_request_recv_complete_cb(ubcl_status_t status, void *cb_data); +void mca_pml_ubcl_request_finalize(mca_pml_ubcl_request_t *req); +int mca_pml_ubcl_request_probe_send(mca_pml_ubcl_request_t *req); +void pml_ubcl_bufferize(mca_pml_ubcl_request_t *req); +bool pml_ubcl_request_is_cuda_buf(mca_pml_ubcl_request_t *req); +int mca_pml_ubcl_request_need_xpack(mca_pml_ubcl_request_t *req, + ubcl_endpoint_type_t type); + +/** + * Requests accessors. 
+ */ +#define MCA_PML_UBCL_REQUEST_ANYSRC(req) ((req)->is_any_src) +#define MCA_PML_UBCL_REQUEST_ANYTAG(req) ((req)->is_any_tag) +#define MCA_PML_UBCL_REQUEST_COMM(req) ((req)->comm) +#define MCA_PML_UBCL_REQUEST_CONVERTOR(req) ((req)->convertor) +#define MCA_PML_UBCL_REQUEST_NEED_XPACK(req) ((req)->need_xpack) +#define MCA_PML_UBCL_REQUEST_IS_ACTIVE(req) (OMPI_REQUEST_ACTIVE == (req)->ompi_req.req_state) + +/** + * Macros for any_source messages. MOSTLY USELESS and can be put in + * pml_ubcl_request_handle_match now that it is the only place where it is called + **/ +#define MCA_PML_UBCL_RECV_REQUEST_UPDATE_SRC(_req, _rank) \ + do { \ + (_req)->rank = _rank; \ + (_req)->proc = ompi_comm_peer_lookup((_req)->comm, (_req)->rank); \ + MCA_PML_UBCL_RECV_REQUEST_CONVERTOR_INIT(_req); \ + } while (0) +#define MCA_PML_UBCL_RECV_REQUEST_UPDATE_TAG(_req, _tag) ((_req)->tag = _tag) + +/** + * Macros to handle MPI matching interface. SAME AS ABOVE, move in corresponding + * function in pml_ubcl_request.c + **/ +#define MCA_PML_UBCL_RECV_REQUEST_PREMATCH(req, _prematched_req, _rank) \ + do { \ + (req)->message->req_ptr = req; \ + (req)->prematched_req = _prematched_req; \ + (req)->rank = _rank; \ + } while (0) +#define MCA_PML_UBCL_RECV_REQUEST_NEED_PREMATCH(req) (NULL != (req)->message) +#define MCA_PML_UBCL_RECV_REQUEST_IS_PREMATCHED(req) (NULL != (req)->prematched_req) +#define MCA_PML_UBCL_RECV_REQUEST_PREMATCHED_REQ(req) ((req)->prematched_req) + +/** + * Generic convinience macros + */ +#define MCA_PML_UBCL_SEND_REQUEST_INIT(req, _buf, _count, _datatype, _dst, _tag, _mode, _comm, \ + _proc, _persistent) \ + do { \ + OBJ_RETAIN(_comm); \ + OMPI_DATATYPE_RETAIN(_datatype); \ + OBJ_CONSTRUCT(&(req)->ompi_req, ompi_request_t); \ + OMPI_REQUEST_INIT(&req->ompi_req, _persistent); \ + (req)->ompi_req.req_type = OMPI_REQUEST_PML; \ + (req)->ompi_req.req_start = mca_pml_ubcl_request_start; \ + (req)->ompi_req.req_free = mca_pml_ubcl_request_free; \ + (req)->ompi_req.req_cancel = 
mca_pml_ubcl_request_cancel; \ + (req)->ompi_req.req_complete_cb = mca_pml_ubcl_request_complete_cb; \ + (req)->ompi_req.req_mpi_object.comm = _comm; \ + (req)->saved_complete_cb = NULL; \ + (req)->saved_complete_cb_data = NULL; \ + (req)->type = MCA_PML_UBCL_REQUEST_SEND; \ + (req)->to_free = 0; \ + (req)->completed = 0; \ + (req)->is_buffered = 0; \ + (req)->is_buffer_malloced = 0; \ + (req)->buf = _buf; \ + (req)->count = _count; \ + (req)->datatype = _datatype; \ + (req)->rank = _dst; \ + (req)->tag = _tag; \ + (req)->error = MPI_SUCCESS; \ + (req)->mode = _mode; \ + (req)->comm = _comm; \ + (req)->proc = _proc; \ + OBJ_CONSTRUCT(&(req)->convertor, opal_convertor_t); \ + opal_convertor_copy_and_prepare_for_send(_proc->super.proc_convertor, &_datatype->super, \ + _count, _buf, 0, &(req)->convertor); \ + (req)->need_xpack = mca_pml_ubcl_request_need_xpack((req), \ + ((mca_common_ubcl_endpoint_t *)(req)->proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML])->type); \ + (req)->message = NULL; \ + (req)->prematched_req = NULL; \ + (req)->is_any_tag = 0; \ + (req)->is_any_src = 0; \ + opal_atomic_lock_init(&((req)->req_lock), OPAL_ATOMIC_LOCK_UNLOCKED); \ + (req)->ubcl_operation_handle = NULL; \ + } while (0) + +#define MCA_PML_UBCL_RECV_REQUEST_CONVERTOR_INIT(req) \ + do { \ + if ((req)->is_any_src) { \ + /* Remote proc is unknown, let assume its architecture is the same as local proc */ \ + opal_convertor_copy_and_prepare_for_recv(ompi_proc_local()->super.proc_convertor, \ + &(req)->datatype->super, (req)->count, \ + (req)->buf, 0, &(req)->convertor); \ + /* Do not ask for endpoint capabilities and enable by default need_xpack */ \ + (req)->need_xpack = (0 != (req)->datatype->super.true_lb) \ + || opal_convertor_need_buffers(&req->convertor); \ + } else { \ + opal_convertor_copy_and_prepare_for_recv((req)->proc->super.proc_convertor, \ + &(req)->datatype->super, (req)->count, \ + (req)->buf, 0, &(req)->convertor); \ + (req)->need_xpack = 
mca_pml_ubcl_request_need_xpack((req), \ + ((mca_common_ubcl_endpoint_t *)(req)->proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML])->type); \ + } \ + } while (0) + +#define MCA_PML_UBCL_RECV_REQUEST_INIT(req, _buf, _count, _datatype, _src, \ + _tag, _comm, _proc, _persistent, \ + _probe, _mes) \ + do { \ + OBJ_RETAIN(_comm); \ + OMPI_DATATYPE_RETAIN(_datatype); \ + OBJ_CONSTRUCT(&(req)->ompi_req, ompi_request_t); \ + OMPI_REQUEST_INIT(&req->ompi_req, _persistent); \ + (req)->ompi_req.req_type = OMPI_REQUEST_PML; \ + (req)->ompi_req.req_start = mca_pml_ubcl_request_start; \ + (req)->ompi_req.req_free = mca_pml_ubcl_request_free; \ + (req)->ompi_req.req_cancel = mca_pml_ubcl_request_cancel; \ + (req)->ompi_req.req_complete_cb = mca_pml_ubcl_request_complete_cb; \ + (req)->ompi_req.req_mpi_object.comm = _comm; \ + (req)->saved_complete_cb = NULL; \ + (req)->saved_complete_cb_data = NULL; \ + (req)->type = MCA_PML_UBCL_REQUEST_RECV; \ + (req)->to_free = 0; \ + (req)->completed = 0; \ + (req)->is_buffered = 0; \ + (req)->is_buffer_malloced = 0; \ + (req)->buf = _buf; \ + (req)->count = _count; \ + (req)->datatype = _datatype; \ + (req)->rank = _src; \ + (req)->tag = _tag; \ + (req)->error = MPI_SUCCESS; \ + (req)->mode = MCA_PML_BASE_SEND_SIZE; \ + (req)->comm = _comm; \ + (req)->proc = _proc; \ + OBJ_CONSTRUCT(&(req)->convertor, opal_convertor_t); \ + (req)->message = (void *) _mes; \ + (req)->prematched_req = NULL; \ + (req)->is_any_tag = (_tag == OMPI_ANY_TAG); \ + opal_atomic_lock_init(&((req)->req_lock), OPAL_ATOMIC_LOCK_UNLOCKED); \ + (req)->ubcl_operation_handle = NULL; \ + if (OMPI_ANY_SOURCE == (req)->rank) { \ + (req)->is_any_src = 1; \ + } else { \ + (req)->is_any_src = 0; \ + } \ + MCA_PML_UBCL_RECV_REQUEST_CONVERTOR_INIT(req); \ + } while (0) + +#define MCA_PML_UBCL_RECV_REQUEST_MPROBE_TO_MRECV(req, _buf, _count, _datatype) \ + do { \ + OMPI_DATATYPE_RETAIN(_datatype); \ + (req)->type = MCA_PML_UBCL_REQUEST_RECV; \ + (req)->buf = _buf; \ + (req)->count = 
_count; \ + (req)->datatype = _datatype; \ + (req)->proc = ompi_comm_peer_lookup((req)->comm, (req)->rank); \ + MCA_PML_UBCL_RECV_REQUEST_CONVERTOR_INIT(req); \ + } while (0) + +#define MCA_PML_UBCL_REQUEST_ACTIVATE(req) \ + do { \ + (req)->ompi_req.req_state = OMPI_REQUEST_ACTIVE; \ + (req)->ompi_req.req_complete = REQUEST_PENDING; \ + (req)->ompi_req.req_status.MPI_SOURCE = OMPI_ANY_SOURCE; \ + (req)->ompi_req.req_status.MPI_TAG = OMPI_ANY_TAG; \ + (req)->ompi_req.req_status.MPI_ERROR = OMPI_SUCCESS; \ + (req)->ompi_req.req_status._ucount = 0; \ + (req)->ompi_req.req_status._cancelled = 0; \ + } while (0) + +#define MCA_PML_UBCL_STATUS_SET(stat, rank, tag, err, size) \ + do { \ + (stat)->MPI_SOURCE = rank; \ + (stat)->MPI_TAG = tag; \ + (stat)->MPI_ERROR = err; \ + (stat)->_ucount = size; \ + (stat)->_cancelled = false; \ + } while (0) + +#define MCA_PML_UBCL_REQUEST_SET_STATUS(req, rank, tag, err, size) \ + do { \ + MCA_PML_UBCL_STATUS_SET(&(req)->ompi_req.req_status, rank, tag, err, size); \ + } while (0) + +#define MCA_PML_UBCL_REQUEST_CPY_STATUS(status, req) \ + do { \ + status->MPI_SOURCE = (req)->ompi_req.req_status.MPI_SOURCE; \ + status->MPI_TAG = (req)->ompi_req.req_status.MPI_TAG; \ + status->MPI_ERROR = (req)->ompi_req.req_status.MPI_ERROR; \ + status->_ucount = (req)->ompi_req.req_status._ucount; \ + status->_cancelled = (req)->ompi_req.req_status._cancelled; \ + } while (0) + +END_C_DECLS + +#endif /* MCA_PML_UBCL_REQUEST_H */ diff --git a/ompi/mca/pml/ubcl/pml_ubcl_utils.c b/ompi/mca/pml/ubcl/pml_ubcl_utils.c new file mode 100644 index 00000000000..cba136dc192 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_utils.c @@ -0,0 +1,43 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2024 Bull SAS. All rights reserved. 
 + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_utils.c + * + * UBCL PML utilities + * + * Contains some useful functions + * + */ + +#include "pml_ubcl_utils.h" +#include "pml_ubcl.h" +#include + +/* Reserve 1 cid bit to prevent MPI_ANY_TAG from matching + * messages with negative tag, which are ompi reserved tags + */ +#define CID_RESERVED_BIT (((uint64_t) 1) << 63) + +ubcl_cid_t mca_pml_ubcl_compute_ubcl_cid(int tag, int cid) +{ + ubcl_cid_t ubcl_cid; + ubcl_cid.cid.communicator = cid; + + int is_collective_tag = tag < 0 && MPI_ANY_TAG != tag; + if (is_collective_tag) { + ubcl_cid.cid.runtime = UBCL_CID_MPI_INTERNAL; + } else { + ubcl_cid.cid.runtime = UBCL_CID_MPI_APPLICATION; + } + + return ubcl_cid; +} + diff --git a/ompi/mca/pml/ubcl/pml_ubcl_utils.h b/ompi/mca/pml/ubcl/pml_ubcl_utils.h new file mode 100644 index 00000000000..88c824542e9 --- /dev/null +++ b/ompi/mca/pml/ubcl/pml_ubcl_utils.h @@ -0,0 +1,39 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2019-2024 Bull SAS. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file pml_ubcl_utils.h + * + * UBCL PML + * + * Contains some useful functions + * + */ + +#ifndef MCA_PML_UBCL_UTILS_H +#define MCA_PML_UBCL_UTILS_H + +#include "ompi/mca/common/ubcl/common_ubcl.h" +#include "ompi/mca/pml/ubcl/pml_ubcl.h" +#include "opal/util/output.h" + +#define PML_UBCL_COMP_NAME "PML/UBCL" + +#define mca_pml_ubcl_log(lvl, ...) \ + opal_output_verbose(lvl, mca_pml_ubcl_component.output, __VA_ARGS__) + +#define mca_pml_ubcl_warn(err, format, ...) \ + _mca_common_ubcl_error(__FILE__, __LINE__, err, false, 5, mca_pml_ubcl_component.output, mca_pml_ubcl_component.is_init, mca_pml_ubcl_component.verbose, PML_UBCL_COMP_NAME, format, ##__VA_ARGS__) +#define mca_pml_ubcl_error(err, format, ...) 
\ + _mca_common_ubcl_error(__FILE__, __LINE__, err, true, 1, mca_pml_ubcl_component.output, mca_pml_ubcl_component.is_init, mca_pml_ubcl_component.verbose, PML_UBCL_COMP_NAME, format, ##__VA_ARGS__) + +ubcl_cid_t mca_pml_ubcl_compute_ubcl_cid(int tag, int cid); + +#endif /*MCA_PML_UBCL_UTILS_H */ diff --git a/ompi/mca/pml/ubcl/post_configure.sh b/ompi/mca/pml/ubcl/post_configure.sh new file mode 100644 index 00000000000..634b9a3f1d8 --- /dev/null +++ b/ompi/mca/pml/ubcl/post_configure.sh @@ -0,0 +1,2 @@ +DIRECT_CALL_HEADER="ompi/mca/pml/ubcl/pml_ubcl.h" +# Copyright (c) 2024 BULL S.A.S. All rights reserved. diff --git a/opal/mca/common/ubcl/Makefile.am b/opal/mca/common/ubcl/Makefile.am new file mode 100644 index 00000000000..d99e62f9652 --- /dev/null +++ b/opal/mca/common/ubcl/Makefile.am @@ -0,0 +1,105 @@ +# +# Copyright (c) 2020-2024 Bull SAS. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Note that building this common component statically and linking +# against other dynamic components is *not* supported! + +# Header files + +headers = \ + common_ubcl.h + +# Source files + +sources = \ + common_ubcl.c + +# Help file + +dist_opaldata_DATA = \ + help-mpi-common-ubcl.txt + +# As per above, we'll either have an installable or noinst result. +# The installable one should follow the same MCA prefix naming rules +# (i.e., libmca__.la). The noinst one can be named +# whatever it wants, although libmca___noinst.la is +# recommended. + +# To simplify components that link to this library, we will *always* +# have an output libtool library named libmca__.la -- even +# for case 2) described above (i.e., so there's no conditional logic +# necessary in component Makefile.am's that link to this library). +# Hence, if we're creating a noinst version of this library (i.e., +# case 2), we sym link it to the libmca__.la name +# (libtool will do the Right Things under the covers). 
See the +# all-local and clean-local rules, below, for how this is effected. + +common_ubcl_CFLAGS= -Werror -Wall + +lib_LTLIBRARIES = +noinst_LTLIBRARIES = +comp_inst = lib@OPAL_LIB_NAME@mca_common_ubcl.la +comp_noinst = lib@OPAL_LIB_NAME@mca_common_ubcl_noinst.la + +if MCA_BUILD_opal_common_ubcl_DSO +lib_LTLIBRARIES += $(comp_inst) +else +noinst_LTLIBRARIES += $(comp_noinst) +endif + +lib@OPAL_LIB_NAME@mca_common_ubcl_la_SOURCES = \ + $(headers) $(sources) +lib@OPAL_LIB_NAME@mca_common_ubcl_la_CFLAGS = \ + $(common_ubcl_CFLAGS) +lib@OPAL_LIB_NAME@mca_common_ubcl_la_CPPFLAGS = \ + $(common_ubcl_CPPFLAGS) +lib@OPAL_LIB_NAME@mca_common_ubcl_la_LDFLAGS = \ + $(common_ubcl_LDFLAGS) +lib@OPAL_LIB_NAME@mca_common_ubcl_la_LIBADD = \ + $(common_ubcl_LIBS) \ + $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la + +lib@OPAL_LIB_NAME@mca_common_ubcl_noinst_la_SOURCES = \ + $(headers) $(sources) +lib@OPAL_LIB_NAME@mca_common_ubcl_noinst_la_CFLAGS = \ + $(common_ubcl_CFLAGS) +lib@OPAL_LIB_NAME@mca_common_ubcl_noinst_la_CPPFLAGS = \ + $(common_ubcl_CPPFLAGS) +lib@OPAL_LIB_NAME@mca_common_ubcl_noinst_la_LDFLAGS = \ + $(common_ubcl_LDFLAGS) +lib@OPAL_LIB_NAME@mca_common_ubcl_noinst_la_LIBADD = \ + $(common_ubcl_LIBS) + +# Conditionally install the header files + +if WANT_INSTALL_HEADERS +opaldir = $(opalincludedir)/$(subdir) +opal_HEADERS = $(headers) +endif + +# These two rules will sym link the "noinst" libtool library filename +# to the installable libtool library filename in the case where we are +# compiling this component statically (case 2), described above). 
+ +# See Makefile.ompi-rules for an explanation of the "V" macros, below +V=0 +OMPI_V_LN_SCOMP = $(ompi__v_LN_SCOMP_$V) +ompi__v_LN_SCOMP_ = $(ompi__v_LN_SCOMP_$AM_DEFAULT_VERBOSITY) +ompi__v_LN_SCOMP_0 = @echo " LN_S " `basename $(comp_inst)`; + +all-local: + $(OMPI_V_LN_SCOMP) if test -z "$(lib_LTLIBRARIES)"; then \ + rm -f "$(comp_inst)"; \ + $(LN_S) "$(comp_noinst)" "$(comp_inst)"; \ + fi + +clean-local: + if test -z "$(lib_LTLIBRARIES)"; then \ + rm -f "$(comp_inst)"; \ + fi diff --git a/opal/mca/common/ubcl/common_ubcl.c b/opal/mca/common/ubcl/common_ubcl.c new file mode 100644 index 00000000000..8b39800ab5b --- /dev/null +++ b/opal/mca/common/ubcl/common_ubcl.c @@ -0,0 +1,445 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2024 Bull SAS. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +#include "opal_config.h" + +#include +#include +#include +#include + +#include "opal/mca/base/mca_base_var.h" +#include "opal/mca/dl/base/base.h" +#include "opal/util/argv.h" +#include "opal/util/output.h" +#include "opal/util/proc.h" +#include "opal/util/show_help.h" + +#include "common_ubcl.h" + +/** + * Common UBCL component + */ +mca_opal_common_ubcl_component_t mca_opal_common_ubcl_component = { + .output = 0, + .verbose = 0, + .ld_library_path_fail_warn = true, + .search_opt_ubcl = true, + .force_ld_lib_dlopen = false, + .ubcl_search_path = NULL, + + .is_init = 0, + .is_registered = 0, + .is_dlopen = 0, +}; +const char *default_search_path = "/opt/ubcl/"; + +/* + * Version of the UBCL API we need + */ +ubcl_api_version_t my_api_version = { + .major = UBCL_API_VERSION_MAJOR, + .minor = UBCL_API_VERSION_MINOR, +}; + +/* Handle to libubcl.so */ +opal_dl_handle_t *libubcl_handle = NULL; + +static int mca_common_ubcl_scandir_filter(const struct dirent *dir) +{ + char* dirname_copy = NULL; + char* saved_ptr = NULL; + char* digit_str = NULL; + char* endptr = NULL; + unsigned long digit 
= 0; + int digit_position = 0; + + /* Filter out '.' and '..' */ + if (0 == strcmp(dir->d_name, ".") || 0 == strcmp(dir->d_name, "..")) { + return 0; + } + + /* Only keep directories and unknown */ + if (DT_DIR != dir->d_type && DT_UNKNOWN != dir->d_type) { + return 0; + } + + /* Filter out folders that don't look like X.Y.Z */ + dirname_copy = strdup(dir->d_name); + digit_str = strtok_r(dirname_copy, ".", &saved_ptr); + while (digit_str != NULL) { + digit = strtol(digit_str, &endptr, 10); + if (digit_str == endptr) { + common_ubcl_log_verbose(95, "DIGIT: '%s' doesn't start by a number\n", + digit_str); + goto free_and_fail; + } else if ('\0' != *endptr) { + common_ubcl_log_verbose(95, "DIGIT: '%s' contains non-number\n", + digit_str); + goto free_and_fail; + } else { + switch (digit_position) { + case 0: + if (digit != my_api_version.major) { + common_ubcl_log_verbose(95, "Wrong API_MAJOR version: " + "%lu != %u\n", digit, + my_api_version.major); + goto free_and_fail; + } + break; + case 1: + if (digit < my_api_version.minor) { + common_ubcl_log_verbose(95, "Wrong API_MINOR version: " + "%lu < %u\n", digit, + my_api_version.minor); + goto free_and_fail; + } + break; + case 2: + break; + default: + common_ubcl_log_verbose(95, "'%s' has more than 3 digits", + dir->d_name); + goto free_and_fail; + } + } + digit_position++; + digit_str = strtok_r(NULL, ".", &saved_ptr); + } + + free(dirname_copy); + return 1; + +free_and_fail: + common_ubcl_log_verbose(95, "Filtering out '%s'", dir->d_name); + free(dirname_copy); + return 0; +} + +static int mca_common_ubcl_find_ubcl_install(char*** searchpaths) +{ + int nb_dir, i; + int inv_i; + struct dirent **verslist; + const char* ubcl_search_path = *mca_opal_common_ubcl_component.ubcl_search_path; + + nb_dir = scandir(ubcl_search_path, &verslist, mca_common_ubcl_scandir_filter, versionsort); + if (-1 == nb_dir) { + common_ubcl_warning("Failed to scan %s, error: %s", ubcl_search_path, strerror(errno)); + return nb_dir; + } + + 
/* Allocate two more to append the search path itself and 'NULL' as the last */ + (*searchpaths) = malloc( (2 + nb_dir) * sizeof(char*)); + asprintf((*searchpaths)+nb_dir, "%s/lib", ubcl_search_path); + (*searchpaths)[nb_dir + 1] = NULL; + + /* Iterate backwards to get higher versions first */ + inv_i = 0; + for (i = nb_dir - 1; i >= 0; i--) { + asprintf((*searchpaths)+inv_i, "%s/%s/lib", ubcl_search_path, verslist[i]->d_name); + free(verslist[i]); + inv_i++; + } + free(verslist); + + return nb_dir; +} + +static void mca_common_ubcl_free_found_searchpaths(char*** searchpaths, int nb_dir) { + int i; + + for (i = 0; i < nb_dir; i++) { + free((*searchpaths)[i]); + } + free((*searchpaths)[nb_dir]); + + free(*searchpaths); + (*searchpaths) = NULL; +} + +static bool mca_common_ubcl_test_lib_version(char* filename) { + int ret; + char *err_msg; + const char *ubcl_api_symbol = "ubcl_api_version"; + ubcl_api_version_t ubcl_lib_api_version; + void *symbol = NULL; + + ret = opal_dl_lookup(libubcl_handle, ubcl_api_symbol, &symbol, &err_msg); + if (OPAL_SUCCESS != ret) { + common_ubcl_warning("Library %s opened but no %s symbols found." + " It probably is an older version, skipping.\n", + filename, ubcl_api_symbol); + return OPAL_ERROR; + } + ubcl_lib_api_version = *(ubcl_api_version_t*)symbol; + + if (ubcl_lib_api_version.major != my_api_version.major) { + common_ubcl_warning("Library %s opened but API version major digit" + " '%d' isn't the wanted: '%d'. Skipping\n", + filename, ubcl_lib_api_version.major, my_api_version.major); + return OPAL_ERROR; + } + + if (ubcl_lib_api_version.minor < my_api_version.minor) { + common_ubcl_warning("Library %s opened but API version minor '%d' " + "inferior to the minimum wanted: '%d'. 
Skipping\n", + filename, ubcl_lib_api_version.minor, my_api_version.minor); + return OPAL_ERROR; + } + + common_ubcl_log_verbose(20, " Accepting library %s with API version: '%d.%d'," + " (wanted: '%d.%d')\n", filename, + ubcl_lib_api_version.major, ubcl_lib_api_version.minor, + my_api_version.major, my_api_version.minor); + return OPAL_SUCCESS; +} + +static bool mca_common_ubcl_try_dlopen(char** searchpaths, char** ubcllibs, char*** errmsgs) { + int retval; + int errsize; + bool dlopen_success = false; + int j = 0; + + while (searchpaths[j] != NULL) { + int i = 0; + while (ubcllibs[i] != NULL) { + char *filename = NULL; + char *str = NULL; + + /* If there's a non-empty search path, prepend it + to the library filename */ + if (strlen(searchpaths[j]) > 0) { + asprintf(&filename, "%s/%s", searchpaths[j], ubcllibs[i]); + } else { + filename = strdup(ubcllibs[i]); + } + if (NULL == filename) { + opal_show_help("help-mpi-common-ubcl.txt", "No memory", + true, OPAL_PROC_MY_HOSTNAME); + return OPAL_ERR_NOT_AVAILABLE; + } + + retval = opal_dl_open(filename, false, false, + &libubcl_handle, &str); + if (OPAL_SUCCESS != retval || NULL == libubcl_handle) { + if (NULL != str) { + opal_argv_append(&errsize, errmsgs, str); + } else { + opal_argv_append(&errsize, errmsgs, + "opal_dl_open() returned NULL."); + } + common_ubcl_log_verbose(10, "UBCL: Library open error: %s", + (*errmsgs)[errsize-1]); + } else { + if (mca_opal_common_ubcl_component.force_ld_lib_dlopen) { + /* Force retval to fake a good version check */ + retval = OPAL_SUCCESS; + } else { + /* We opened an UBCL library, now we need to check the version */ + retval = mca_common_ubcl_test_lib_version(filename); + } + + if (OPAL_SUCCESS != retval) { + asprintf(&str, "%s opened but version check failed. 
Skipping", filename); + opal_argv_append(&errsize, errmsgs, str); + opal_dl_close(libubcl_handle); + libubcl_handle = NULL; + } else { + common_ubcl_log_verbose(10, "UBCL: Library successfully " + "opened %s", filename); + dlopen_success = true; + free(filename); + break; + } + } + i++; + + free(filename); + } + if (true == dlopen_success) { + break; /* Break out of outer loop */ + } + j++; + } + return dlopen_success; +} + +static int mca_common_ubcl_dlopen_ubcl(void) +{ + char *ubcllibs[] = { "libubcl.so", "libubcl.so.0", NULL }; + char *searchpaths[] = { "", NULL }; + char **opt_searchpaths = NULL; + char **errmsgs = NULL; + char *errmsg = NULL; + bool dlopen_success = false; + int nb_dir = 0; + + if (1 < opal_atomic_add_fetch_32(&mca_opal_common_ubcl_component.is_dlopen, 1)) { + return OPAL_SUCCESS; + } + + if (!OPAL_HAVE_DL_SUPPORT) { + opal_show_help("help-mpi-common-ubcl.txt", "dlopen disabled", true); + return OPAL_ERR_NOT_AVAILABLE; + } + + common_ubcl_log_verbose(10, "COMMMON_UBCL: Starting to look for UBCL" + " library"); + + + /* Now walk through all the potential names libubcl and find one that + * works. If it does, all is good. If not, print out all the messages about + * why things failed. This code was careful to try and save away all error + * messages if the loading ultimately failed to help with debugging. */ + + + /* On the first try we just utilize the default loading paths from + * the system. This is so that LD_LIBRARY_PATH is looked at in priority */ + dlopen_success = mca_common_ubcl_try_dlopen(searchpaths, ubcllibs, &errmsgs); + + if (true == dlopen_success) { + goto success; + } + + if (mca_opal_common_ubcl_component.ld_library_path_fail_warn) { + common_ubcl_warning("We did not find a compatible UBCL in LD_LIBRARY_PATH\n"); + } + + if (mca_opal_common_ubcl_component.force_ld_lib_dlopen) { + common_ubcl_error("No UBCL found in LD_LIBRARY_PATH and 'force_ld_lib_dlopen'" + " set to 'true'. 
 We cannot load UBCL for the PML/UBCL to use\n");
+        goto failed;
+    }
+
+    if (!mca_opal_common_ubcl_component.search_opt_ubcl) {
+        common_ubcl_error("No UBCL found in LD_LIBRARY_PATH and 'search_opt_ubcl'"
+                          " set to 'false'. We cannot load UBCL for the PML/UBCL to use\n");
+        goto failed;
+    }
+
+    nb_dir = mca_common_ubcl_find_ubcl_install(&opt_searchpaths);
+
+    if (-1 == nb_dir) {
+        goto failed;
+    }
+
+    /* Now look into paths found by 'find_ubcl_install' */
+    dlopen_success = mca_common_ubcl_try_dlopen(opt_searchpaths, ubcllibs, &errmsgs);
+    mca_common_ubcl_free_found_searchpaths(&opt_searchpaths, nb_dir);
+
+    if (true == dlopen_success) {
+        goto success;
+    }
+
+failed:
+    errmsg = opal_argv_join(errmsgs, '\n');
+    opal_show_help("help-mpi-common-ubcl.txt", "dlopen failed", true,
+                   errmsg);
+    opal_argv_free(errmsgs);
+    free(errmsg);
+    return OPAL_ERR_NOT_AVAILABLE;
+
+success:
+    opal_argv_free(errmsgs);
+    free(errmsg);
+    return OPAL_SUCCESS;
+}
+
+void mca_common_ubcl_register_mca(void)
+{
+    if (1 < opal_atomic_add_fetch_32(&mca_opal_common_ubcl_component.is_registered, 1)) {
+        return;
+    }
+    MCA_REGISTER_COMMON_UBCL("verbose", "Verbosity level of component common/ubcl",
+                             MCA_BASE_VAR_TYPE_INT, &mca_opal_common_ubcl_component.verbose);
+    MCA_REGISTER_COMMON_UBCL("ld_lib_path_fail_warn",
+                             "Warn the user when no fitting libraries were found"
+                             " in the default system loading path (LD_LIBRARY_PATH)",
+                             MCA_BASE_VAR_TYPE_BOOL, &mca_opal_common_ubcl_component.ld_library_path_fail_warn);
+    MCA_REGISTER_COMMON_UBCL("force_ld_lib_dlopen",
+                             "Force common/ubcl to dlopen and use an UBCL library"
+                             " found in LD_LIBRARY_PATH, regardless of API version",
+                             MCA_BASE_VAR_TYPE_BOOL, &mca_opal_common_ubcl_component.force_ld_lib_dlopen);
+    MCA_REGISTER_COMMON_UBCL("search_opt_ubcl",
+                             "In case we don't find a suitable UBCL library in "
+                             "LD_LIBRARY_PATH, automatically search /opt/ubcl for compatible UBCL",
+                             MCA_BASE_VAR_TYPE_BOOL, &mca_opal_common_ubcl_component.search_opt_ubcl);
+
// Extra level of string indirection needed to make ompi_info + // happy since it will unload this library before the MCA base + // cleans up the MCA vars. This will cause the string to go + // out of scope unless we place the pointer to it on the heap. + mca_opal_common_ubcl_component.ubcl_search_path = malloc(sizeof(char*)); + *mca_opal_common_ubcl_component.ubcl_search_path = default_search_path; + MCA_REGISTER_COMMON_UBCL("ubcl_search_path", + "When 'search_opt_ubcl' is true, search for UBCL" + " version directories at this path", + MCA_BASE_VAR_TYPE_STRING, mca_opal_common_ubcl_component.ubcl_search_path); +} + +int mca_common_ubcl_init(void) +{ + int ret; + + /* Safe guard for multiple init/fini */ + if (1 < opal_atomic_add_fetch_32(&mca_opal_common_ubcl_component.is_init, 1)) { + /* UBCL already init */ + return OPAL_SUCCESS; + } + + /* Open output stream */ + if (0 <= mca_opal_common_ubcl_component.verbose) { + mca_opal_common_ubcl_component.output = opal_output_open(NULL); + opal_output_set_verbosity(mca_opal_common_ubcl_component.output, + mca_opal_common_ubcl_component.verbose); + common_ubcl_log_verbose(10, "Opening common/ubcl component\n"); + } else { + mca_opal_common_ubcl_component.output = -1; + } + + /* Initializing modules */ + ret = mca_common_ubcl_dlopen_ubcl(); + + if (ret != OPAL_SUCCESS) { + common_ubcl_error("Could not dlopen UBCL"); + } + + return ret; +} + +int mca_common_ubcl_fini(void) +{ + int ret; + uint32_t refcount; + + /* Safe guard for multiple init/fini */ + refcount = opal_atomic_fetch_sub_32(&mca_opal_common_ubcl_component.is_init, 1); + assert (0 < refcount); + + if (1 < refcount) { + /* Not the last 'fini' */ + return OPAL_SUCCESS; + } + + common_ubcl_log_verbose(10, "Closing common/ubcl component\n"); + + /* Closing output */ + if (0 < mca_opal_common_ubcl_component.verbose) { + opal_output_close(mca_opal_common_ubcl_component.output); + } + + ret = opal_dl_close(libubcl_handle); + + return ret; +} + +int 
mca_common_ubcl_is_init(void) { + return (int) opal_atomic_add_fetch_32(&mca_opal_common_ubcl_component.is_init, 0); +} diff --git a/opal/mca/common/ubcl/common_ubcl.h b/opal/mca/common/ubcl/common_ubcl.h new file mode 100644 index 00000000000..05eeccfaa2f --- /dev/null +++ b/opal/mca/common/ubcl/common_ubcl.h @@ -0,0 +1,72 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2024 Bull SAS. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OPAL_MCA_COMMON_UBCL_H +#define OPAL_MCA_COMMON_UBCL_H + +#include "opal/mca/mca.h" +#include "opal/class/opal_list.h" +#include "opal/util/show_help.h" +#include + +#define MCA_REGISTER_COMMON_UBCL(name, desc, type, var) \ + mca_base_var_register("ompi", "mpi", "common_ubcl", name, desc, type, NULL, 0, \ + MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL, \ + var) + +BEGIN_C_DECLS + +struct mca_opal_common_ubcl_component_t { + /* MCA params */ + int output; + int verbose; + bool gdb_attach; + bool ld_library_path_fail_warn; + bool search_opt_ubcl; + bool force_ld_lib_dlopen; + const char** ubcl_search_path; + + /* Miscellaneous */ + int32_t is_init; + int32_t is_registered; + int32_t is_dlopen; +}; +typedef struct mca_opal_common_ubcl_component_t mca_opal_common_ubcl_component_t; +OPAL_DECLSPEC extern mca_opal_common_ubcl_component_t mca_opal_common_ubcl_component; + +struct mca_common_ubcl_endpoint_t { + uint32_t type; + int32_t refcount; + uint64_t rank; +}; +typedef struct mca_common_ubcl_endpoint_t mca_common_ubcl_endpoint_t; + + +void mca_common_ubcl_register_mca(void); +int mca_common_ubcl_init(void); +int mca_common_ubcl_fini(void); +int mca_common_ubcl_is_init(void); + +#define common_ubcl_generic(__token, ...) \ + opal_output(mca_opal_common_ubcl_component.output, "[COMMON/UBCL] "__token __VA_ARGS__) + +#define common_ubcl_error(...) 
common_ubcl_generic("ERROR: ", __VA_ARGS__) +#define common_ubcl_warning(...) common_ubcl_generic("WARNING: ", __VA_ARGS__) +#define common_ubcl_log(...) common_ubcl_generic(" ", __VA_ARGS__) + +#define common_ubcl_log_verbose(__lvl, ...) \ + opal_output_verbose(__lvl, mca_opal_common_ubcl_component.output, "[COMMON/UBCL] "__VA_ARGS__) + +#define mca_common_ubcl_help(name, ...) \ + opal_show_help("help-mpi-common-ubcl.txt", name, true, "[COMMON/UBCL]", ##__VA_ARGS__) + +END_C_DECLS + +#endif /* OPAL_MCA_COMMON_UBCL_H */ diff --git a/opal/mca/common/ubcl/configure.m4 b/opal/mca/common/ubcl/configure.m4 new file mode 100644 index 00000000000..d98ebf43103 --- /dev/null +++ b/opal/mca/common/ubcl/configure.m4 @@ -0,0 +1,27 @@ +# -*- shell-script -*- +# +# Copyright (c) 2024 Bull S.A.S. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AC_DEFUN([MCA_opal_common_ubcl_CONFIG],[ + AC_CONFIG_FILES([opal/mca/common/ubcl/Makefile]) + + OMPI_CHECK_UBCL([common_ubcl], + [common_ubcl_happy="yes"], + [common_ubcl_happy="no"]) + + + AS_IF([test "$common_ubcl_happy" = "yes"], + [$1], + [$2]) + + # substitute in the things needed to build ubcl + AC_SUBST([common_ubcl_CPPFLAGS]) + AC_SUBST([common_ubcl_LDFLAGS]) + AC_SUBST([common_ubcl_LIBS]) +])dnl diff --git a/opal/mca/common/ubcl/help-mpi-common-ubcl.txt b/opal/mca/common/ubcl/help-mpi-common-ubcl.txt new file mode 100644 index 00000000000..90b1ddec53f --- /dev/null +++ b/opal/mca/common/ubcl/help-mpi-common-ubcl.txt @@ -0,0 +1,28 @@ +# -*- text -*- +# +# Copyright (c) 2024 Bull SAS. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the English help file for Open MPI's BXI support. +# +[dlopen disabled] +Open MPI was compiled without dynamic library support (e.g., with the + --disable-dlopen flag), and therefore cannot utilize UBCL support. + +If you need UBCL support, reconfigure Open MPI with dynamic library support enabled. 
+#
+[dlopen failed]
+The library attempted to open the following UBCL libraries,
+but each of them failed. UBCL cannot be used.
+%s
+If you need to use UBCL, then try setting LD_LIBRARY_PATH to the location
+of libubcl.so to get past this issue.
+#
+[No memory]
+A call to allocate memory within the UBCL support failed. This is
+an unrecoverable error and will cause the program to abort.
+ Hostname: %s