diff --git a/.github/workflows/ompi_mpi4py.yaml b/.github/workflows/ompi_mpi4py.yaml index 7df4939e5e7..8e3c450b4be 100644 --- a/.github/workflows/ompi_mpi4py.yaml +++ b/.github/workflows/ompi_mpi4py.yaml @@ -77,7 +77,6 @@ jobs: mkdir -p "$(dirname "$mca_params")" echo mpi_param_check = true >> "$mca_params" echo mpi_show_handle_leaks = true >> "$mca_params" - echo rmaps_base_oversubscribe = true >> "$mca_params" mca_params="$HOME/.prte/mca-params.conf" mkdir -p "$(dirname "$mca_params")" echo rmaps_default_mapping_policy = :oversubscribe >> "$mca_params" diff --git a/3rd-party/openpmix b/3rd-party/openpmix index e32e0179bc6..08e41ed5629 160000 --- a/3rd-party/openpmix +++ b/3rd-party/openpmix @@ -1 +1 @@ -Subproject commit e32e0179bc6bd1637f92690511ce6091719fa046 +Subproject commit 08e41ed5629b51832f5708181af6d89218c7a74e diff --git a/3rd-party/prrte b/3rd-party/prrte index 0f0a90006cb..30cadc6746e 160000 --- a/3rd-party/prrte +++ b/3rd-party/prrte @@ -1 +1 @@ -Subproject commit 0f0a90006cbc880d499b2356d6076e785e7868ba +Subproject commit 30cadc6746ebddd69ea42ca78b964398f782e4e3 diff --git a/VERSION b/VERSION index 4b702009b78..f1c8928faf6 100644 --- a/VERSION +++ b/VERSION @@ -27,7 +27,7 @@ mpi_standard_subversion=1 # List in x.y.z format. pmix_min_version=4.2.0 prte_min_version=3.0.0 -hwloc_min_version=1.11.0 +hwloc_min_version=2.1.0 event_min_version=2.0.21 automake_min_version=1.13.4 autoconf_min_version=2.69.0 diff --git a/autogen.pl b/autogen.pl index 51a10ee57a8..16f5981f99f 100755 --- a/autogen.pl +++ b/autogen.pl @@ -5,7 +5,7 @@ # Copyright (c) 2013 Mellanox Technologies, Inc. # All rights reserved. # Copyright (c) 2013-2020 Intel, Inc. All rights reserved. -# Copyright (c) 2015-2021 Research Organization for Information Science +# Copyright (c) 2015-2024 Research Organization for Information Science # and Technology (RIST). All rights reserved. # Copyright (c) 2015-2022 IBM Corporation. All rights reserved. # Copyright (c) 2020 Amazon.com, Inc. or its affiliates. @@ -891,9 +891,9 @@ sub patch_autotools_output { # source tree); we can't fix it. So all we can do is patch the # resulting configure script. :-( push(@verbose_out, $indent_str . "Patching configure for Libtool PGI 10 fortran compiler name\n"); - $c =~ s/gfortran g95 xlf95 f95 fort ifort ifc efc pgf95 lf95 ftn/gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn/g; - $c =~ s/pgcc\* \| pgf77\* \| pgf90\* \| pgf95\*\)/pgcc* | pgf77* | pgf90* | pgf95* | pgfortran*)/g; - $c =~ s/pgf77\* \| pgf90\* \| pgf95\*\)/pgf77* | pgf90* | pgf95* | pgfortran*)/g; + $c =~ s/gfortran g95 xlf95 f95 fort ifort ifc efc pgf95 lf95 ftn/gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran nvfortran pgf95 lf95 ftn/g; + $c =~ s/pgcc\* \| pgf77\* \| pgf90\* \| pgf95\*\)/pgcc* | pgf77* | pgf90* | pgf95* | pgfortran* | nvfortran*)/g; + $c =~ s/pgf77\* \| pgf90\* \| pgf95\*\)/pgf77* | pgf90* | pgf95* | pgfortran* | nvfortran*)/g; # Similar issue as above -- the PGI 10 version number broke <=LT # 2.2.6b's version number checking regexps. 
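Note on the VERSION hunk above: raising hwloc_min_version from 1.11.0 to 2.1.0 drops hwloc 1.x support entirely. A minimal sketch of the kind of guard downstream code can rely on once this lands; HWLOC_API_VERSION and hwloc_get_api_version() are hwloc's documented macro/call, the 0x00020000 threshold marks hwloc's 2.0 API break, and the 2.1 floor itself is enforced by configure, not by this sketch:

    #include <hwloc.h>

    #if HWLOC_API_VERSION < 0x00020000
    #error "Open MPI now requires hwloc >= 2.1; hwloc 1.x headers detected"
    #endif

    /* runtime cross-check that the headers and the loaded library agree on 2.x */
    static int hwloc_is_v2(void)
    {
        return (hwloc_get_api_version() >> 16) >= 2;
    }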
Again, we can't fix the @@ -1085,6 +1085,30 @@ sub patch_autotools_output { ;;"; $c =~ s/$search_string/$replace_string/g; + $c =~ s/for ac_prog in gfortran f95 fort xlf95 ifort ifc efc pgfortran pgf95 lf95 f90 xlf90 pgf90 epcf90 nagfor/for ac_prog in gfortran f95 fort xlf95 ifort ifc efc pgfortran pgf95 lf95 f90 xlf90 pgf90 epcf90 nagfor nvfortran/g; + foreach my $tag (("", "_FC")) { + $search_string = 'tcc\*\) + # Fabrice Bellard et al\'s Tiny C Compiler + lt_prog_compiler_wl'."${tag}".'=\'-Wl,\' + lt_prog_compiler_pic'."${tag}".'=\'-fPIC\' + lt_prog_compiler_static'."${tag}".'=\'-static\' + ;;'; + $replace_string = "tcc*) + # Fabrice Bellard et al's Tiny C Compiler + lt_prog_compiler_wl${tag}='-Wl,' + lt_prog_compiler_pic${tag}='-fPIC' + lt_prog_compiler_static${tag}='-static' + ;; + nvc* | nvcc* | nvfortran*) + # NVIDIA Fortran compiler + lt_prog_compiler_wl${tag}='-Wl,' + lt_prog_compiler_pic${tag}='-fPIC' + lt_prog_compiler_static${tag}='-Bstatic' + ;;"; + push(@verbose_out, $indent_str . "Patching configure for NVIDIA Fortran compiler (${tag})\n"); + $c =~ s/$search_string/$replace_string/g; + } + # Only write out verbose statements and a new configure if the # configure content actually changed return diff --git a/config/ompi_setup_fc.m4 b/config/ompi_setup_fc.m4 index cf4212fc9e5..157f5a6301d 100644 --- a/config/ompi_setup_fc.m4 +++ b/config/ompi_setup_fc.m4 @@ -43,7 +43,7 @@ AC_DEFUN_ONCE([_OMPI_SETUP_FC_COMPILER],[ # Fortran compilers (excluding the f77 compiler names) from AC's # default list of compilers and use it here. This is the main # reason we have an OMPI-ized version of the PROG_FC macro. - AC_PROG_FC([gfortran f95 fort xlf95 ifort ifc efc pgfortran pgf95 lf95 f90 xlf90 pgf90 epcf90 nagfor]) + AC_PROG_FC([gfortran f95 fort xlf95 ifort ifc efc pgfortran pgf95 lf95 f90 xlf90 pgf90 epcf90 nagfor nvfortran]) FCFLAGS="$ompi_fcflags_save" OPAL_VAR_SCOPE_POP ]) diff --git a/examples/hello_sessions_c.c b/examples/hello_sessions_c.c index 863aaa1269e..8d84c39ba6b 100644 --- a/examples/hello_sessions_c.c +++ b/examples/hello_sessions_c.c @@ -11,14 +11,11 @@ int main(int argc, char** argv) { MPI_Info info; MPI_Session s1, s2; -#if 0 -/* need PR https://github.com/open-mpi/ompi/pull/12868 to be merged in - * before this can be uncommented. - */ MPI_Info_create(&info); -#endif MPI_Session_init(MPI_INFO_NULL, MPI_ERRORS_RETURN, &s1); MPI_Session_finalize(&s1); MPI_Session_init(MPI_INFO_NULL, MPI_ERRORS_RETURN, &s2); MPI_Session_finalize(&s2); + MPI_Info_free(&info); + return 0; } diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index d7fb321e3f8..2a9afd352be 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -24,7 +24,7 @@ * Copyright (c) 2015 Mellanox Technologies. All rights reserved. * Copyright (c) 2017-2022 IBM Corporation. All rights reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. - * Copyright (c) 2018-2022 Triad National Security, LLC. All rights + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
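For reference, the examples/hello_sessions_c.c hunk above leaves the example creating its info object unconditionally (possible now that the fix from PR https://github.com/open-mpi/ompi/pull/12868 is merged) and freeing it before returning. The whole program as reconstructed from the hunk; only the #include line is assumed, since the file header is outside the diff context:

    #include <mpi.h>

    int main(int argc, char** argv)
    {
        MPI_Info info;
        MPI_Session s1, s2;

        MPI_Info_create(&info);
        MPI_Session_init(MPI_INFO_NULL, MPI_ERRORS_RETURN, &s1);
        MPI_Session_finalize(&s1);
        MPI_Session_init(MPI_INFO_NULL, MPI_ERRORS_RETURN, &s2);
        MPI_Session_finalize(&s2);
        MPI_Info_free(&info);
        return 0;
    }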
* $COPYRIGHT$ @@ -1738,7 +1738,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead ompi_communicator_t **newintercomm) { ompi_communicator_t *newcomp = NULL, *local_comm, *leader_comm = MPI_COMM_NULL; - ompi_comm_extended_cid_block_t new_block; + ompi_comm_extended_cid_block_t new_block = {0}; bool i_am_leader = local_leader == local_group->grp_my_rank; ompi_proc_t **rprocs; uint64_t data[4]; @@ -1864,14 +1864,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead return rc; } - /* will be using a communicator ID derived from the bridge communicator to save some time */ - new_block.block_cid.cid_base = data[1]; - new_block.block_cid.cid_sub.u64 = data[2]; - new_block.block_nextsub = 0; - new_block.block_nexttag = 0; - new_block.block_level = (int8_t) data[3]; - - rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, &new_block, false, OMPI_COMM_CID_GROUP_NEW); + rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, NULL, false, OMPI_COMM_CID_GROUP_NEW); if ( OMPI_SUCCESS != rc ) { OBJ_RELEASE(newcomp); return rc; diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index db97f7ea1b8..0475d63b6f4 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -310,21 +310,16 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu const void *arg0, const void *arg1, bool send_first, int mode, ompi_request_t **req) { - pmix_info_t pinfo, *results = NULL; + pmix_info_t *pinfo, *results = NULL; size_t nresults; - opal_process_name_t *name_array = NULL; - char *tag = NULL; - size_t proc_count; - size_t cid_base = 0; + opal_process_name_t opal_proc_name; bool cid_base_set = false; + char *tag = NULL; + size_t proc_count = 0, rproc_count = 0, tproc_count = 0, cid_base = 0UL, ninfo; int rc, leader_rank; - int ret = OMPI_SUCCESS; - pmix_proc_t *procs = NULL; - - rc = ompi_group_to_proc_name_array (newcomm->c_local_group, &name_array, &proc_count); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } + pmix_proc_t *procs; + void *grpinfo = NULL, *list = NULL; + pmix_data_array_t darray; switch (mode) { case OMPI_COMM_CID_GROUP_NEW: @@ -341,15 +336,75 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu break; } - PMIX_INFO_LOAD(&pinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL); + grpinfo = PMIx_Info_list_start(); + if (NULL == grpinfo) { + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto fn_exit; + } + + rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL); + if (PMIX_SUCCESS != rc) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__)); + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto fn_exit; + } + + list = PMIx_Info_list_start(); + + size_t c_index = (size_t)newcomm->c_index; + rc = PMIx_Info_list_add(list, PMIX_GROUP_LOCAL_CID, &c_index, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__)); + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto fn_exit; + } + + rc = PMIx_Info_list_convert(list, &darray); + if (PMIX_SUCCESS != rc) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__)); + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto fn_exit; + } + rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_INFO, &darray, PMIX_DATA_ARRAY); + PMIX_DATA_ARRAY_DESTRUCT(&darray); + if (PMIX_SUCCESS != rc) { + 
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__)); + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto fn_exit; + } + + rc = PMIx_Info_list_convert(grpinfo, &darray); + if (PMIX_SUCCESS != rc) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__)); + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto fn_exit; + } + + pinfo = (pmix_info_t*)darray.array; + ninfo = darray.size; + + proc_count = newcomm->c_local_group->grp_proc_count; + if ( OMPI_COMM_IS_INTER (newcomm) ){ + rproc_count = newcomm->c_remote_group->grp_proc_count; + } + + PMIX_PROC_CREATE(procs, proc_count + rproc_count); - PMIX_PROC_CREATE(procs, proc_count); for (size_t i = 0 ; i < proc_count; ++i) { - OPAL_PMIX_CONVERT_NAME(&procs[i],&name_array[i]); + opal_proc_name = ompi_group_get_proc_name(newcomm->c_local_group, i); + OPAL_PMIX_CONVERT_NAME(&procs[i],&opal_proc_name); + } + for (size_t i = 0; i < rproc_count; ++i) { + opal_proc_name = ompi_group_get_proc_name(newcomm->c_remote_group, i); + OPAL_PMIX_CONVERT_NAME(&procs[proc_count+i],&opal_proc_name); } - rc = PMIx_Group_construct(tag, procs, proc_count, &pinfo, 1, &results, &nresults); - PMIX_INFO_DESTRUCT(&pinfo); + tproc_count = proc_count + rproc_count; + + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n", + tag, tproc_count, ninfo, cid_base)); + rc = PMIx_Group_construct(tag, procs, tproc_count, pinfo, ninfo, &results, &nresults); + PMIX_DATA_ARRAY_DESTRUCT(&darray); if(PMIX_SUCCESS != rc) { char msg_string[1024]; switch (rc) { @@ -361,7 +416,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu "MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups", msg_string); - ret = MPI_ERR_UNSUPPORTED_OPERATION; + rc = MPI_ERR_UNSUPPORTED_OPERATION; break; case PMIX_ERR_NOT_SUPPORTED: sprintf(msg_string,"PMIx server does not support PMIx Group operations"); @@ -370,10 +425,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu true, "MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups", msg_string); - ret = MPI_ERR_UNSUPPORTED_OPERATION; + rc = MPI_ERR_UNSUPPORTED_OPERATION; break; default: - ret = opal_pmix_convert_status(rc); + rc = opal_pmix_convert_status(rc); break; } goto fn_exit; @@ -383,7 +438,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu if (PMIX_CHECK_KEY(&results[i], PMIX_GROUP_CONTEXT_ID)) { PMIX_VALUE_GET_NUMBER(rc, &results[i].value, cid_base, size_t); if(PMIX_SUCCESS != rc) { - ret = opal_pmix_convert_status(rc); + rc = opal_pmix_convert_status(rc); goto fn_exit; } cid_base_set = true; @@ -391,15 +446,20 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu } } + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n", + tag, tproc_count, ninfo, cid_base)); + + /* destruct the group */ rc = PMIx_Group_destruct (tag, NULL, 0); if(PMIX_SUCCESS != rc) { - ret = opal_pmix_convert_status(rc); + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_destruct failed %s", PMIx_Error_string(rc))); + rc = opal_pmix_convert_status(rc); goto fn_exit; } if (!cid_base_set) { opal_show_help("help-comm.txt", "cid-base-not-set", true); - ret = OMPI_ERROR; + rc = OMPI_ERROR; goto fn_exit; } @@ -412,16 +472,19 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu } if(NULL != procs) { - 
PMIX_PROC_FREE(procs, proc_count); + PMIX_PROC_FREE(procs, tproc_count); procs = NULL; } - if(NULL != name_array) { - free (name_array); - name_array = NULL; + if (NULL != grpinfo) { + PMIx_Info_list_release(grpinfo); } - return ret; + if (NULL != list) { + PMIx_Info_list_release(list); + } + + return rc; } static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm, @@ -446,6 +509,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic block = &comm->c_contextidb; } + for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) { + bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm); + if (true == flag) { + newcomm->c_index = i; + break; + } + } + assert(newcomm->c_index > 2); + if (NULL == arg1) { if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode || !ompi_comm_extended_cid_block_available (&comm->c_contextidb)) { @@ -468,14 +540,6 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic (void) ompi_comm_extended_cid_block_new (block, &newcomm->c_contextidb, is_new_block); } - for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) { - bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm); - if (true == flag) { - newcomm->c_index = i; - break; - } - } - newcomm->c_contextid = newcomm->c_contextidb.block_cid; opal_hash_table_set_value_ptr (&ompi_comm_hash, &newcomm->c_contextid, @@ -502,7 +566,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com functions but the pml does not support these functions so return not supported */ if (NULL == comm) { char msg_string[1024]; - sprintf(msg_string,"The PML being used - %s - does not support MPI sessions related features", + sprintf(msg_string,"The PML being used - %s - does not support MPI sessions related features", mca_pml_base_selected_component.pmlm_version.mca_component_name); opal_show_help("help-comm.txt", "MPI function not supported", @@ -886,6 +950,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c ompi_comm_cid_context_t *context; ompi_comm_request_t *request; ompi_request_t *subreq; + uint32_t comm_size; int ret = 0; /* the caller should not pass NULL for comm (it may be the same as *newcomm) */ @@ -907,6 +972,25 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c request->context = &context->super; + /* Prep communicator for handling remote cids if needed */ + + if (!OMPI_COMM_IS_GLOBAL_INDEX(*newcomm)) { + if (OMPI_COMM_IS_INTER(*newcomm)) { + comm_size = ompi_comm_remote_size(*newcomm); + } else { + comm_size = ompi_comm_size(*newcomm); + } + + (*newcomm)->c_index_vec = (uint32_t *)calloc(comm_size, sizeof(uint32_t)); + if (NULL == (*newcomm)->c_index_vec) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + if (OMPI_COMM_IS_INTRA(*newcomm)) { + (*newcomm)->c_index_vec[(*newcomm)->c_my_rank] = (*newcomm)->c_index; + } + } + if (MPI_UNDEFINED != (*newcomm)->c_local_group->grp_my_rank) { /* Initialize the PML stuff in the newcomm */ if ( OMPI_SUCCESS != (ret = MCA_PML_CALL(add_comm(*newcomm))) ) { @@ -963,6 +1047,61 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm return rc; } +int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uint32_t *remote_cid) +{ + ompi_proc_t *ompi_proc; + pmix_proc_t pmix_proc; + pmix_info_t tinfo[2]; + pmix_value_t 
*val = NULL; + ompi_comm_extended_cid_t excid; + int rc = OMPI_SUCCESS; + size_t remote_cid64; + + assert(NULL != remote_cid); + + ompi_proc = ompi_comm_peer_lookup(comm, dest); + OPAL_PMIX_CONVERT_NAME(&pmix_proc, &ompi_proc->super.proc_name); + + PMIx_Info_construct(&tinfo[0]); + PMIX_INFO_LOAD(&tinfo[0], PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32); + + excid = ompi_comm_get_extended_cid(comm); + + PMIX_INFO_CONSTRUCT(&tinfo[1]); + PMIX_INFO_LOAD(&tinfo[1], PMIX_GROUP_CONTEXT_ID, &excid.cid_base, PMIX_SIZE); + PMIX_INFO_SET_QUALIFIER(&tinfo[1]); + if (PMIX_SUCCESS != (rc = PMIx_Get(&pmix_proc, PMIX_GROUP_LOCAL_CID, tinfo, 2, &val))) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s", excid.cid_base, PMIx_Error_string(rc))); + rc = OMPI_ERR_NOT_FOUND; + goto done; + } + + if (NULL == val) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL")); + rc = OMPI_ERR_NOT_FOUND; + goto done; + } + + if (val->type != PMIX_SIZE) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch")); + rc = OMPI_ERR_TYPE_MISMATCH; + goto done; + } + + PMIX_VALUE_GET_NUMBER(rc, val, remote_cid64, size_t); + rc = OMPI_SUCCESS; + *remote_cid = (uint32_t)remote_cid64; + comm->c_index_vec[dest] = (uint32_t)remote_cid64; + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld", *remote_cid, excid.cid_base)); + +done: + if (NULL != val) { + PMIX_VALUE_RELEASE(val); + } + + return rc; +} + static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request) { ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context; diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index a72a6661189..498bf4a1e70 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -23,7 +23,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. - * Copyright (c) 2018-2022 Triad National Security, LLC. All rights + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2023 NVIDIA Corporation. All rights reserved. 
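The comm_cid.c rework above switches from a single statically loaded pmix_info_t to the PMIx_Info_list_* helpers so that PMIx_Group_construct() can carry group info (including the caller's c_index as PMIX_GROUP_LOCAL_CID). A minimal sketch of that list lifecycle using the same calls as the hunk, with error paths elided; note the hunk additionally nests one converted array inside a second list under PMIX_GROUP_INFO, which is flattened here for brevity:

    #include <pmix.h>

    static pmix_status_t build_group_info(size_t local_cid, pmix_data_array_t *darray)
    {
        void *list = PMIx_Info_list_start();
        pmix_status_t rc;

        /* append key/value pairs; a NULL value with PMIX_BOOL loads the flag as true */
        rc = PMIx_Info_list_add(list, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
        if (PMIX_SUCCESS == rc) {
            rc = PMIx_Info_list_add(list, PMIX_GROUP_LOCAL_CID, &local_cid, PMIX_SIZE);
        }

        /* materialize the list as a pmix_info_t array owned by darray */
        if (PMIX_SUCCESS == rc) {
            rc = PMIx_Info_list_convert(list, darray);
        }

        /* the list is always released; the converted copy is freed later with
         * PMIX_DATA_ARRAY_DESTRUCT(), as the hunk does after PMIx_Group_construct() */
        PMIx_Info_list_release(list);
        return rc;
    }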
@@ -69,6 +69,8 @@ ompi_predefined_communicator_t ompi_mpi_comm_self = {{{{0}}}}; ompi_predefined_communicator_t ompi_mpi_comm_null = {{{{0}}}}; ompi_communicator_t *ompi_mpi_comm_parent = NULL; +int ompi_comm_output = -1; + static bool ompi_comm_intrinsic_init; ompi_predefined_communicator_t *ompi_mpi_comm_world_addr = @@ -97,6 +99,14 @@ static int ompi_comm_finalize (void); */ int ompi_comm_init(void) { + + /* create output stream */ + + if (ompi_comm_output == -1) { + ompi_comm_output = opal_output_open(NULL); + opal_output_set_verbosity(ompi_comm_output, ompi_comm_verbose_level); + } + /* Setup communicator array */ OBJ_CONSTRUCT(&ompi_mpi_communicators, opal_pointer_array_t); if( OPAL_SUCCESS != opal_pointer_array_init(&ompi_mpi_communicators, 16, @@ -392,6 +402,11 @@ static int ompi_comm_finalize (void) /* finalize communicator requests */ ompi_comm_request_fini (); + /* close output stream */ + + opal_output_close(ompi_comm_output); + ompi_comm_output = -1; + /* release a reference to the attributes subsys */ return ompi_attr_put_ref(); } @@ -417,6 +432,7 @@ static void ompi_comm_construct(ompi_communicator_t* comm) comm->c_coll = NULL; comm->c_nbc_tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE; comm->instance = NULL; + comm->c_index_vec = NULL; /* * magic numerology - see TOPDIR/ompi/include/mpif-values.pl @@ -518,6 +534,11 @@ static void ompi_comm_destruct(ompi_communicator_t* comm) comm->c_name = NULL; } + if (NULL != comm->c_index_vec) { + free (comm->c_index_vec); + comm->c_index_vec = NULL; + } + #if OPAL_ENABLE_FT_MPI if( NULL != comm->agreement_specific ) { OBJ_RELEASE( comm->agreement_specific ); diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index 3a230b68025..1714a09befc 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -153,6 +153,8 @@ OMPI_DECLSPEC extern opal_hash_table_t ompi_comm_hash; OMPI_DECLSPEC extern opal_pointer_array_t ompi_mpi_communicators; OMPI_DECLSPEC extern opal_pointer_array_t ompi_comm_f_to_c_table; +OMPI_DECLSPEC extern int ompi_comm_output; + struct ompi_comm_extended_cid_t { uint64_t cid_base; union { @@ -284,6 +286,10 @@ struct ompi_communicator_t { uint32_t c_epoch; /* Identifier used to differentiate between two communicators using the same c_contextid (not at the same time, obviously) */ #endif + /* vector used to store remote cid values for communicators not using + * a global cid, i.e. when OMPI_COMM_IS_GLOBAL_INDEX(comm) returns 0. + */ + uint32_t *c_index_vec; /* Non-blocking collective tag. These tags might be shared between * all non-blocking collective modules (to avoid message collision * between them in the case where multiple outstanding non-blocking @@ -535,6 +541,30 @@ static inline uint32_t ompi_comm_get_local_cid (const ompi_communicator_t* comm) return comm->c_index; } +int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uint32_t *remote_cid); + +/** + * Get remote cid for the communicator. In the case of communicators created + * using methods that don't supply an input communicator, i.e. + * MPI_Comm_create_from_group, the remote cid may be different from the local cid. 
+ */ +static inline int ompi_comm_get_remote_cid (ompi_communicator_t *comm, int dest, uint32_t *remote_cid) +{ + int rc = OMPI_SUCCESS; + + assert(NULL != remote_cid); + + if (OPAL_LIKELY(OMPI_COMM_IS_GLOBAL_INDEX(comm))) { + *remote_cid = comm->c_index; + } else if (0 != comm->c_index_vec[dest]) { + *remote_cid = comm->c_index_vec[dest]; + } else { + rc = ompi_comm_get_remote_cid_from_pmix(comm, dest, remote_cid); + } + + return rc; +} + /** * Get the extended context ID for the communicator, suitable for passing * to ompi_comm_lookup_cid for getting the communicator back @@ -614,6 +644,12 @@ static inline struct ompi_proc_t* ompi_comm_peer_lookup (const ompi_communicator return ompi_group_peer_lookup(comm->c_remote_group,peer_id); } +static inline bool ompi_comm_instances_same(const ompi_communicator_t *comm1, + const ompi_communicator_t *comm2) +{ + return comm1->instance == comm2->instance; +} + #if OPAL_ENABLE_FT_MPI /* * Support for MPI_ANY_SOURCE point-to-point operations diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index 8e4057daffb..719b0c4a735 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -430,7 +430,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, wildcard_rank.jobid = proc->super.proc_name.jobid; wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid; /* retrieve the local peers for the specified jobid */ - OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCAL_PEERS, + OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCAL_PEERS, &wildcard_rank, &val, PMIX_STRING); if (OPAL_SUCCESS == rc && NULL != val) { char **peers = opal_argv_split(val, ','); diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index d0f0d490de3..391c9bb03d1 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -115,7 +115,7 @@ static mca_base_framework_t *ompi_framework_dependencies[] = { &ompi_hook_base_framework, &ompi_op_base_framework, &opal_allocator_base_framework, &opal_rcache_base_framework, &opal_mpool_base_framework, &opal_smsc_base_framework, &ompi_bml_base_framework, &ompi_pml_base_framework, &ompi_coll_base_framework, - &ompi_osc_base_framework, NULL, + &ompi_osc_base_framework, &ompi_part_base_framework, NULL, }; static mca_base_framework_t *ompi_lazy_frameworks[] = { @@ -222,6 +222,8 @@ void ompi_mpi_instance_release (void) opal_argv_free (ompi_mpi_instance_pmix_psets); ompi_mpi_instance_pmix_psets = NULL; + OBJ_DESTRUCT(&ompi_mpi_instance_null); + opal_finalize_cleanup_domain (&ompi_instance_basic_domain); OBJ_DESTRUCT(&ompi_instance_basic_domain); @@ -655,11 +657,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("ompi_win_init() failed", ret); } - /* initialize partcomm */ - if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_part_base_framework, 0))) { - return ompi_instance_print_error ("mca_part_base_select() failed", ret); - } - + /* select part component to use */ if (OMPI_SUCCESS != (ret = mca_part_base_select (true, true))) { return ompi_instance_print_error ("mca_part_base_select() failed", ret); } @@ -950,17 +948,8 @@ static int ompi_mpi_instance_finalize_common (void) ompi_proc_finalize(); - OBJ_DESTRUCT(&ompi_mpi_instance_null); - ompi_mpi_instance_release (); - if (0 == opal_initialized) { - /* if there is no MPI_T_init_thread that has been MPI_T_finalize'd, - * then be gentle to the app and release all the memory now (instead - * of the opal library destructor */ - opal_class_finalize (); - } - return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/adapt/coll_adapt_component.c 
b/ompi/mca/coll/adapt/coll_adapt_component.c index 97eec559a2d..3d7d7e16cbe 100644 --- a/ompi/mca/coll/adapt/coll_adapt_component.c +++ b/ompi/mca/coll/adapt/coll_adapt_component.c @@ -2,6 +2,7 @@ * Copyright (c) 2014-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -114,39 +115,39 @@ static int adapt_register(void) we should have a high priority */ cs->adapt_priority = 0; (void) mca_base_component_var_register(c, "priority", "Priority of the adapt coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->adapt_priority); + MCA_BASE_VAR_SCOPE_ALL, &cs->adapt_priority); cs->adapt_verbose = ompi_coll_base_framework.framework_verbose; (void) mca_base_component_var_register(c, "verbose", "Verbose level (default set to the collective framework verbosity)", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->adapt_verbose); + MCA_BASE_VAR_SCOPE_ALL, &cs->adapt_verbose); cs->adapt_context_free_list_min = 64; (void) mca_base_component_var_register(c, "context_free_list_min", "Minimum number of segments in context free list", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &cs->adapt_context_free_list_min); cs->adapt_context_free_list_max = 1024; (void) mca_base_component_var_register(c, "context_free_list_max", "Maximum number of segments in context free list", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &cs->adapt_context_free_list_max); cs->adapt_context_free_list_inc = 32; (void) mca_base_component_var_register(c, "context_free_list_inc", "Increasement number of segments in context free list", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &cs->adapt_context_free_list_inc); ompi_coll_adapt_ibcast_register(); ompi_coll_adapt_ireduce_register(); diff --git a/ompi/mca/coll/adapt/coll_adapt_ibcast.c b/ompi/mca/coll/adapt/coll_adapt_ibcast.c index 361e0e24c72..00bc79fbd46 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ibcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_ibcast.c @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -34,8 +35,9 @@ int ompi_coll_adapt_ibcast_register(void) mca_coll_adapt_component.adapt_ibcast_algorithm = 1; mca_base_component_var_register(c, "bcast_algorithm", - "Algorithm of broadcast, 0: tuned, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, + "Algorithm of broadcast, 0: tuned, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ibcast_algorithm); if( (mca_coll_adapt_component.adapt_ibcast_algorithm < 0) || (mca_coll_adapt_component.adapt_ibcast_algorithm >= OMPI_COLL_ADAPT_ALGORITHM_COUNT) ) { @@ -45,33 +47,33 @@ int ompi_coll_adapt_ibcast_register(void) mca_coll_adapt_component.adapt_ibcast_segment_size = 0; mca_base_component_var_register(c, "bcast_segment_size", "Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ibcast_segment_size); mca_coll_adapt_component.adapt_ibcast_max_send_requests = 2; mca_base_component_var_register(c, "bcast_max_send_requests", "Maximum number of send requests", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ibcast_max_send_requests); mca_coll_adapt_component.adapt_ibcast_max_recv_requests = 3; mca_base_component_var_register(c, "bcast_max_recv_requests", "Maximum number of receive requests", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ibcast_max_recv_requests); mca_coll_adapt_component.adapt_ibcast_synchronous_send = true; (void) mca_base_component_var_register(c, "bcast_synchronous_send", "Whether to use synchronous send operations during setup of bcast operations", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ibcast_synchronous_send); mca_coll_adapt_component.adapt_ibcast_context_free_list = NULL; diff --git a/ompi/mca/coll/adapt/coll_adapt_ireduce.c b/ompi/mca/coll/adapt/coll_adapt_ireduce.c index 2747995a57d..15bd586901a 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ireduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_ireduce.c @@ -5,6 +5,7 @@ * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -38,8 +39,9 @@ int ompi_coll_adapt_ireduce_register(void) mca_coll_adapt_component.adapt_ireduce_algorithm = 1; mca_base_component_var_register(c, "reduce_algorithm", - "Algorithm of reduce, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, + "Algorithm of reduce, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ireduce_algorithm); if( (mca_coll_adapt_component.adapt_ireduce_algorithm < 0) || (mca_coll_adapt_component.adapt_ireduce_algorithm > OMPI_COLL_ADAPT_ALGORITHM_COUNT) ) { @@ -49,58 +51,58 @@ int ompi_coll_adapt_ireduce_register(void) mca_coll_adapt_component.adapt_ireduce_segment_size = 163740; mca_base_component_var_register(c, "reduce_segment_size", "Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", - MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ireduce_segment_size); mca_coll_adapt_component.adapt_ireduce_max_send_requests = 2; mca_base_component_var_register(c, "reduce_max_send_requests", "Maximum number of send requests", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ireduce_max_send_requests); mca_coll_adapt_component.adapt_ireduce_max_recv_requests = 3; mca_base_component_var_register(c, "reduce_max_recv_requests", "Maximum number of receive requests per peer", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ireduce_max_recv_requests); mca_coll_adapt_component.adapt_inbuf_free_list_min = 10; mca_base_component_var_register(c, "inbuf_free_list_min", "Minimum number of segment in inbuf free list", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_inbuf_free_list_min); mca_coll_adapt_component.adapt_inbuf_free_list_max = 10000; mca_base_component_var_register(c, "inbuf_free_list_max", "Maximum number of segment in inbuf free list", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_inbuf_free_list_max); mca_coll_adapt_component.adapt_inbuf_free_list_inc = 10; mca_base_component_var_register(c, "inbuf_free_list_inc", "Number of segments to allocate when growing the inbuf free list", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_inbuf_free_list_inc); mca_coll_adapt_component.adapt_ireduce_synchronous_send = true; (void) mca_base_component_var_register(c, "reduce_synchronous_send", "Whether to use synchronous send 
operations during setup of reduce operations", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ireduce_synchronous_send); mca_coll_adapt_component.adapt_ireduce_context_free_list = NULL; diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index ae9010497d7..ba74aa01350 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -12,6 +12,9 @@ * Copyright (c) 2014-2020 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. + * + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -482,6 +485,26 @@ int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expecte } while (1); } +/** + * return non-zero if the next non-space to read on the current line is a digit. + * otherwise return 0. + */ +int ompi_coll_base_file_peek_next_char_isdigit(FILE *fptr) +{ + do { + int next = fgetc(fptr); + + if ((' ' == next) || ('\t' == next)) { + continue; /* discard space and tab. keep everything else */ + } + + ungetc(next, fptr); /* put the char back into the stream */ + + return isdigit(next); /* report back whether or not next is a digit */ + + } while (1); +} + /** * There are certainly simpler implementation for this function when performance * is not a critical point. But, as this function is used during the collective diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 852abcedefa..7bceaa7dcc0 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -11,6 +11,7 @@ * All rights reserved. * Copyright (c) 2014-2020 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -195,6 +196,7 @@ int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val); * eat the value, otherwise put it back into the file. */ int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected); +int ompi_coll_base_file_peek_next_char_isdigit(FILE *fptr); /* Miscellaneous function */ const char* mca_coll_base_colltype_to_str(int collid); diff --git a/ompi/mca/coll/basic/coll_basic_component.c b/ompi/mca/coll/basic/coll_basic_component.c index 23cbed81ad5..d38850744c0 100644 --- a/ompi/mca/coll/basic/coll_basic_component.c +++ b/ompi/mca/coll/basic/coll_basic_component.c @@ -13,6 +13,7 @@ * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -91,16 +92,16 @@ basic_register(void) mca_coll_basic_priority = 10; (void) mca_base_component_var_register(&mca_coll_basic_component.collm_version, "priority", "Priority of the basic coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_basic_priority); mca_coll_basic_crossover = 4; (void) mca_base_component_var_register(&mca_coll_basic_component.collm_version, "crossover", "Minimum number of processes in a communicator before using the logarithmic algorithms", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_basic_crossover); return OMPI_SUCCESS; diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index 3926faaaac3..4712bccbe88 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -7,6 +7,7 @@ * Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV) * Laboratory, ICS Forth. All rights reserved. * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -257,9 +258,9 @@ mca_coll_han_query_module_from_mca(mca_base_component_t* c, *storage = ompi_coll_han_available_components[mod_id].component_name; (void) mca_base_component_var_register(c, param_name, param_doc, - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, info_level, - MCA_BASE_VAR_SCOPE_READONLY, storage); + MCA_BASE_VAR_SCOPE_ALL, storage); module_name = *storage; mod_id = strtol(module_name, &endptr, 10); if( module_name == endptr ) { /* no conversion, maybe we got a module name instead */ @@ -288,22 +289,22 @@ static int han_register(void) COMPONENT_T component; (void) mca_base_component_var_register(c, "priority", "Priority of the HAN coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority); + MCA_BASE_VAR_SCOPE_ALL, &cs->han_priority); cs->han_output_verbose = 0; (void) mca_base_component_var_register(c, "verbose", "Verbosity of the HAN coll component (use coll base verbosity if not set)", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_output_verbose); + MCA_BASE_VAR_SCOPE_ALL, &cs->han_output_verbose); cs->han_bcast_segsize = 65536; (void) mca_base_component_var_register(c, "bcast_segsize", "segment size for bcast", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_segsize); + MCA_BASE_VAR_SCOPE_ALL, &cs->han_bcast_segsize); cs->han_bcast_up_module = 0; (void) mca_coll_han_query_module_from_mca(c, "bcast_up_module", @@ -321,9 +322,9 @@ static int han_register(void) cs->han_reduce_segsize = 65536; (void) mca_base_component_var_register(c, "reduce_segsize", "segment size for reduce", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_segsize); + MCA_BASE_VAR_SCOPE_ALL, 
&cs->han_reduce_segsize); cs->han_reduce_up_module = 0; (void) mca_coll_han_query_module_from_mca(c, "reduce_up_module", @@ -340,9 +341,9 @@ static int han_register(void) cs->han_allreduce_segsize = 65536; (void) mca_base_component_var_register(c, "allreduce_segsize", "segment size for allreduce", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_segsize); + MCA_BASE_VAR_SCOPE_ALL, &cs->han_allreduce_segsize); cs->han_allreduce_up_module = 0; (void) mca_coll_han_query_module_from_mca(c, "allreduce_up_module", @@ -424,8 +425,8 @@ static int han_register(void) (void) mca_base_component_var_register(c, "alltoall_pstages", "Parallel Stages for alltoall. Higher numbers require more memory, " "and performs more communication in parallel. 0 chooses pstages based on message size.", - MCA_BASE_VAR_TYPE_INT32_T, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_TYPE_INT32_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &cs->han_alltoall_pstages); cs->han_alltoallv_low_module = 0; @@ -436,16 +437,16 @@ static int han_register(void) cs->han_alltoallv_smsc_avg_send_limit = 8192; (void) mca_base_component_var_register(c, "alltoallv_smsc_avg_send_limit", "The per-rank averaged send bytes limit above which smsc-based alltoallv will disqualify itself.", - MCA_BASE_VAR_TYPE_INT64_T, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_TYPE_INT64_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &cs->han_alltoallv_smsc_avg_send_limit); cs->han_alltoallv_smsc_noncontig_activation_limit = 0.10; (void) mca_base_component_var_register(c, "alltoallv_smsc_noncontig_limit", "The fractional (0.00-1.00) limit of peers in the communicator which have " "strided or otherwise non-contiguous data buffers. 
Above this limit " "smsc-based alltoallv will ignore the avg_send_limit, and always remain active.", - MCA_BASE_VAR_TYPE_DOUBLE, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_TYPE_DOUBLE, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &cs->han_alltoallv_smsc_noncontig_activation_limit); cs->han_reproducible = 0; @@ -453,21 +454,21 @@ static int han_register(void) "whether we need reproducible results " "(enabling this disables optimisations using topology)" "0 disable 1 enable, default 0", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reproducible); + MCA_BASE_VAR_SCOPE_ALL, &cs->han_reproducible); cs->han_packbuf_bytes = 128*1024; (void) mca_base_component_var_register(c, "packbuf_bytes", "The number of bytes in each HAN packbuf.", - MCA_BASE_VAR_TYPE_INT64_T, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_TYPE_INT64_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &cs->han_packbuf_bytes); cs->han_packbuf_max_count = 32; (void) mca_base_component_var_register(c, "packbuf_max_count", "The maximum number of packbufs that are allowed to be allocated.", - MCA_BASE_VAR_TYPE_INT64_T, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_TYPE_INT64_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &cs->han_packbuf_max_count); /* @@ -582,9 +583,9 @@ static int han_register(void) } mca_base_component_var_register(c, param_name, param_desc, - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &(cs->mca_sub_components[coll][topo_lvl])); } } @@ -594,27 +595,27 @@ static int han_register(void) (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "use_dynamic_file_rules", "Enable the dynamic selection provided via the dynamic_rules_filename MCA", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_6, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &(cs->use_dynamic_file_rules)); cs->dynamic_rules_filename = NULL; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "dynamic_rules_filename", "Configuration file containing the dynamic selection rules", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_6, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &(cs->dynamic_rules_filename)); cs->dump_dynamic_rules = false; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "dump_dynamic_rules", "Switch used to decide if we dump dynamic rules provided by configuration file", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_6, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &(cs->dump_dynamic_rules)); if((cs->dump_dynamic_rules || NULL != cs->dynamic_rules_filename) @@ -631,9 +632,9 @@ static int han_register(void) "errors printed on rank 0 " "with a 0 verbosity."
"Useless if coll_base_verbose is 30 or more.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_6, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &(cs->max_dynamic_errors)); diff --git a/ompi/mca/coll/hcoll/coll_hcoll_component.c b/ompi/mca/coll/hcoll/coll_hcoll_component.c index b8eb0444974..e34169a0781 100644 --- a/ompi/mca/coll/hcoll/coll_hcoll_component.c +++ b/ompi/mca/coll/hcoll/coll_hcoll_component.c @@ -3,6 +3,7 @@ * Copyright (c) 2011 Mellanox Technologies. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -107,8 +108,8 @@ static int reg_int(const char* param_name, index = mca_base_component_var_register( &mca_coll_hcoll_component.super.collm_version, param_name, param_desc, MCA_BASE_VAR_TYPE_INT, - NULL, 0, 0,OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); + NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_ALL, storage); if (NULL != deprecated_param_name) { (void) mca_base_var_register_synonym(index, "ompi", "coll", "hcoll", deprecated_param_name, diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c index e56ece1d0b4..5eb8ef4317e 100644 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c @@ -12,6 +12,7 @@ * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -42,13 +43,24 @@ static int fileline=0; /* used for verbose error messages */ #define getnext(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) +#define isnext_digit(fptr) ompi_coll_base_file_peek_next_char_isdigit(fptr) /* * Reads a rule file called fname - * Builds the algorithm rule table for a max of n_collectives + * The rule file defines a set of sets of rules. The outer set is keyed on + * communicator size while the inner set is keyed on message size. When a + * communicator is constructed its size is used to look up the nested set of + * message size keyed rules. When a collective is called the message size + * determined from its call arguments is used to look up a specific rule in the + * inner set. + * + * Rules for communicator and message sizes 0 and N (where N is larger than the + * largest key you provide) can be specified to fall back to the fixed decision + * framework above and below the communicator and message size ranges of + * interest. * * If an error occurs it removes rule table and then exits with a very verbose - * error message (this stops the user using a half baked rule table + * error message. This stops the user from using a half-baked rule table.
* * Returns the number of actual collectives that a rule exists for * (note 0 is NOT an error) @@ -57,9 +69,18 @@ static int fileline=0; /* used for verbose error messages */ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives) { - long CI, NCS, CS, ALG, NMS, FANINOUT, X, MS, SS; + long NCOL = 0, /* number of collectives for which rules are provided */ + COLID = 0, /* identifies the collective type to associate the rules with */ + NCOMSIZES = 0, /* number of sets of message size rules. the key is communicator size */ + COMSIZE = 0, /* communicator size, the key identifying a specific set of message size rules. */ + NMSGSIZES = 0, /* number of message size rules in the set. */ + MSGSIZE = 0, /* message size, the key identifying a specific rule in the set. */ + ALG = 0, /* the collective specific algorithm to use */ + FANINOUT = 0, /* algorithm specific tuning parameter */ + SEGSIZE = 0, /* algorithm specific tuning parameter */ + MAXREQ = 0; /* algorithm specific tuning parameter */ FILE *fptr = (FILE*) NULL; - int x, ncs, nms; + int x, ncs, nms, version; ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; /* complete table of rules */ @@ -103,68 +124,78 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** goto on_file_error; } - if( (getnext(fptr, &X) < 0) || (X < 0) ) { + /* consume the optional version identifier */ + if (0 == fscanf(fptr, "rule-file-version-%u", &version)) { + version = 1; + } + + /* get the number of collectives for which rules are provided in the file */ + if( (getnext(fptr, &NCOL) < 0) || (NCOL < 0) ) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline)); goto on_file_error; } - if (X>n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline)); + if (NCOL>n_collectives) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", NCOL, n_collectives, fileline)); goto on_file_error; } - for (x=0;x<X;x++) { + for (x=0;x<NCOL;x++) { ... - if (CI>=n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %ld is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline)); + if (COLID>=n_collectives) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %ld is greater than MPI collectives possible %d.
Error around line %d\n", COLID, n_collectives, fileline)); goto on_file_error; } - if (alg_rules[CI].alg_rule_id != CI) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %ld\n", CI)); + if (alg_rules[COLID].alg_rule_id != COLID) { + OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %ld\n", COLID)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %ld\n", CI)); - alg_p = &alg_rules[CI]; + OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %ld\n", COLID)); + alg_p = &alg_rules[COLID]; - alg_p->alg_rule_id = CI; + alg_p->alg_rule_id = COLID; alg_p->n_com_sizes = 0; alg_p->com_rules = (ompi_coll_com_rule_t *) NULL; - if( (getnext (fptr, &NCS) < 0) || (NCS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %ld at around line %d\n", CI, fileline)); + /* get the number of communicator sizes for which a set of rules are to be provided */ + if( (getnext (fptr, &NCOMSIZES) < 0) || (NCOMSIZES < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %ld at around line %d\n", COLID, fileline)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCS, CI)); - alg_p->n_com_sizes = NCS; - alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCOMSIZES, COLID)); + alg_p->n_com_sizes = NCOMSIZES; + alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCOMSIZES, COLID); if (NULL == alg_p->com_rules) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate com rules for file [%s]\n", fname)); goto on_file_error; } - for (ncs=0;ncs<NCS;ncs++) { + for (ncs=0;ncs<NCOMSIZES;ncs++) { com_p = &(alg_p->com_rules[ncs]); - if( (getnext (fptr, &CS) < 0) || (CS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); + /* get the communicator size to associate the set of rules with */ + if( (getnext (fptr, &COMSIZE) < 0) || (COMSIZE < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", COLID, ncs, fileline)); goto on_file_error; } - com_p->mpi_comsize = CS; + com_p->mpi_comsize = COMSIZE; - if( (getnext (fptr, &NMS) < 0) || (NMS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); + /* get the number of message sizes to specify rules for.
inner set size */ + if( (getnext (fptr, &NMSGSIZES) < 0) || (NMSGSIZES < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", COLID, ncs, fileline)); goto on_file_error; } OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %ld for dynamic rule for collective ID %ld and comm size %ld\n", - NMS, CI, CS)); - com_p->n_msg_sizes = NMS; - com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS); + NMSGSIZES, COLID, COMSIZE)); + com_p->n_msg_sizes = NMSGSIZES; + com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMSGSIZES, COLID, ncs, COMSIZE); if (NULL == com_p->msg_rules) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate msg rules for file [%s]\n", fname)); goto on_file_error; @@ -172,37 +203,52 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** msg_p = com_p->msg_rules; - for (nms=0;nms<NMS;nms++) { + for (nms=0;nms<NMSGSIZES;nms++) { msg_p = &(com_p->msg_rules[nms]); - if( (getnext (fptr, &MS) < 0) || (MS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + /* read the message size to associate the rule with */ + if( (getnext (fptr, &MSGSIZE) < 0) || (MSGSIZE < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); goto on_file_error; } - msg_p->msg_size = (size_t)MS; + msg_p->msg_size = (size_t)MSGSIZE; + /* read the collective specific algorithm identifier */ if( (getnext (fptr, &ALG) < 0) || (ALG < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); goto on_file_error; } msg_p->result_alg = ALG; + /* read faninout tuning parameter. required */ if( (getnext (fptr, &FANINOUT) < 0) || (FANINOUT < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); goto on_file_error; } msg_p->result_topo_faninout = FANINOUT; - if( (getnext (fptr, &SS) < 0) || (SS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + /* read segsize tuning parameter. required */ + if( (getnext (fptr, &SEGSIZE) < 0) || (SEGSIZE < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); goto on_file_error; } - msg_p->result_segsize = SS; + msg_p->result_segsize = SEGSIZE; + + /* read the max requests tuning parameter.
optional */ + msg_p->result_max_requests = ompi_coll_tuned_alltoall_max_requests; + if( (version > 1) && isnext_digit(fptr) ) { + if( (getnext (fptr, &MAXREQ) < 0) || (MAXREQ < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read max requests for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); + goto on_file_error; + } + msg_p->result_max_requests = MAXREQ; + } - if (!nms && MS) { + /* check the first rule is for 0 size. look-up depends on this */ + if (!nms && MSGSIZE) { OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MSGSIZE, COLID, ncs, nms, fileline)); goto on_file_error; } @@ -219,13 +265,14 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** } /* comm size */ total_alg_count++; - OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %ld\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %ld\n", COLID)); } /* per collective */ fclose (fptr); OPAL_OUTPUT((ompi_coll_tuned_stream,"\nConfigure file Stats\n")); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Version\t\t\t\t\t: %5u\n", version)); OPAL_OUTPUT((ompi_coll_tuned_stream,"Collectives with rules\t\t\t: %5d\n", total_alg_count)); OPAL_OUTPUT((ompi_coll_tuned_stream,"Communicator sizes with rules\t\t: %5d\n", total_com_count)); OPAL_OUTPUT((ompi_coll_tuned_stream,"Message sizes with rules\t\t: %5d\n", total_msg_count)); diff --git a/ompi/mca/coll/ucc/coll_ucc_component.c b/ompi/mca/coll/ucc/coll_ucc_component.c index 6fab8c0dc26..b697ab787c2 100644 --- a/ompi/mca/coll/ucc/coll_ucc_component.c +++ b/ompi/mca/coll/ucc/coll_ucc_component.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2021 Mellanox Technologies. All rights reserved. * Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. 
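Note on the file format the parser above consumes: the rules file is a nested list of integer tokens, and the reader skips non-numeric text, so trailing comments are conventionally used for readability. A minimal sketch of one rule set under the new scheme (the version-header spelling and all concrete values are illustrative assumptions; the field order follows the getnext() calls above, and MAXREQ is only consumed when version > 1):

    rule-file-version-2  # assumed header form enabling the optional max-requests field
    1                    # number of collectives with rules
    3                    # COLID: collective ID these rules apply to
    1                    # NCOMSIZES: communicator sizes covered
    64                   # COMSIZE: rules below apply to 64-process communicators
    2                    # NMSGSIZES: message-size rules for this size
    0 1 0 0              # MSGSIZE ALG FANINOUT SEGSIZE (a rule for size 0 must come first)
    8192 2 0 0 32        # same fields plus the optional trailing MAXREQ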
* $COPYRIGHT$ * * Additional copyrights may follow @@ -60,24 +61,24 @@ static int mca_coll_ucc_register(void) mca_coll_ucc_component_t *cm = &mca_coll_ucc_component; mca_base_component_t *c = &cm->super.collm_version; mca_base_component_var_register(c, "priority", "Priority of the UCC coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cm->ucc_priority); + MCA_BASE_VAR_SCOPE_ALL, &cm->ucc_priority); mca_base_component_var_register(c, "verbose", "Verbose level of the UCC coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cm->ucc_verbose); + MCA_BASE_VAR_SCOPE_ALL, &cm->ucc_verbose); mca_base_component_var_register(c, "enable", "[0|1] Enable/Disable the UCC coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cm->ucc_enable); + MCA_BASE_VAR_SCOPE_ALL, &cm->ucc_enable); mca_base_component_var_register(c, "np", "Minimal communicator size for the UCC coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cm->ucc_np); + MCA_BASE_VAR_SCOPE_ALL, &cm->ucc_np); mca_base_component_var_register(c, MCA_COMPILETIME_VER, "Version of the libucc library with which Open MPI was compiled", @@ -94,14 +95,14 @@ static int mca_coll_ucc_register(void) cm->cls = ""; mca_base_component_var_register(c, "cls", "Comma separated list of UCC CLS to be used for team creation", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, &cm->cls); + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_ALL, &cm->cls); cm->cts = COLL_UCC_CTS_STR; mca_base_component_var_register(c, "cts", "Comma separated list of UCC coll types to be enabled", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, &cm->cts); + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_ALL, &cm->cts); return OMPI_SUCCESS; } diff --git a/ompi/mca/fcoll/vulcan/Makefile.am b/ompi/mca/fcoll/vulcan/Makefile.am index e805880a661..c4680544abb 100644 --- a/ompi/mca/fcoll/vulcan/Makefile.am +++ b/ompi/mca/fcoll/vulcan/Makefile.am @@ -13,6 +13,7 @@ # Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2018 Research Organization for Information Science # and Technology (RIST). All rights reserved. +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -22,6 +23,7 @@ sources = \ fcoll_vulcan.h \ + fcoll_vulcan_internal.h \ fcoll_vulcan_module.c \ fcoll_vulcan_component.c \ fcoll_vulcan_file_read_all.c \ diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan.h b/ompi/mca/fcoll/vulcan/fcoll_vulcan.h index a2fd6ca82bc..3165a0b0797 100644 --- a/ompi/mca/fcoll/vulcan/fcoll_vulcan.h +++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan.h @@ -14,6 +14,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
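The move from MCA_BASE_VAR_SCOPE_READONLY to MCA_BASE_VAR_FLAG_SETTABLE with MCA_BASE_VAR_SCOPE_ALL is what allows these UCC knobs to be changed after the component registers them, for instance from a tool through the MPI_T control-variable interface rather than only via mpirun or the environment. A minimal sketch against the standard MPI_T API (the cvar name coll_ucc_priority follows Open MPI's usual framework_component_param naming; error checks omitted):

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        int provided, idx, count, prio = 100;
        MPI_T_cvar_handle handle;

        /* the MPI_T interface may be used before MPI_Init */
        MPI_T_init_thread(MPI_THREAD_SINGLE, &provided);
        MPI_T_cvar_get_index("coll_ucc_priority", &idx);
        /* component-level cvar: no object binding required */
        MPI_T_cvar_handle_alloc(idx, NULL, &handle, &count);
        MPI_T_cvar_write(handle, &prio);  /* legal only because the var is now settable */
        MPI_T_cvar_handle_free(&handle);

        MPI_Init(&argc, &argv);           /* the raised UCC priority is in effect here */
        printf("coll_ucc_priority set before MPI_Init\n");
        MPI_Finalize();
        MPI_T_finalize();
        return 0;
    }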
* $COPYRIGHT$ * * Additional copyrights may follow @@ -40,8 +41,6 @@ BEGIN_C_DECLS /* Globally exported variables */ extern int mca_fcoll_vulcan_priority; -extern int mca_fcoll_vulcan_num_groups; -extern int mca_fcoll_vulcan_write_chunksize; extern int mca_fcoll_vulcan_async_io; extern int mca_fcoll_vulcan_use_accelerator_buffers; diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan_component.c b/ompi/mca/fcoll/vulcan/fcoll_vulcan_component.c index 80a5bfb872a..5fc8254f164 100644 --- a/ompi/mca/fcoll/vulcan/fcoll_vulcan_component.c +++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan_component.c @@ -16,6 +16,7 @@ * reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -43,8 +44,6 @@ const char *mca_fcoll_vulcan_component_version_string = * Global variables */ int mca_fcoll_vulcan_priority = 10; -int mca_fcoll_vulcan_num_groups = 1; -int mca_fcoll_vulcan_write_chunksize = -1; int mca_fcoll_vulcan_async_io = 0; /* @@ -91,20 +90,6 @@ vulcan_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &mca_fcoll_vulcan_priority); - mca_fcoll_vulcan_num_groups = 1; - (void) mca_base_component_var_register(&mca_fcoll_vulcan_component.fcollm_version, - "num_groups", "Number of subgroups created by the vulcan component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_fcoll_vulcan_num_groups); - - mca_fcoll_vulcan_write_chunksize = -1; - (void) mca_base_component_var_register(&mca_fcoll_vulcan_component.fcollm_version, - "write_chunksize", "Chunk size written at once. Default: stripe_size of the file system", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_fcoll_vulcan_write_chunksize); - mca_fcoll_vulcan_async_io = 0; (void) mca_base_component_var_register(&mca_fcoll_vulcan_component.fcollm_version, "async_io", "Asynchronous I/O support options. 0: Automatic choice (default) " diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_read_all.c b/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_read_all.c index c372e9f14b4..f6a492e621c 100644 --- a/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_read_all.c +++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_read_all.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -14,6 +15,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
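With num_groups and write_chunksize removed, vulcan's aggregation is steered entirely by the common OMPIO parameters it queries at runtime: "num_aggregators" and "use_accelerator_buffers" via f_get_mca_parameter_value(), and the cycle size via fh->f_bytes_per_agg. Assuming the usual OMPIO spelling of the full parameter name, a job that previously set fcoll_vulcan_num_groups would now control aggregation with something like mpirun --mca io_ompio_num_aggregators 4 ./app; ompi_info --all remains the authoritative list of the current parameter names.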
* $COPYRIGHT$ * * Additional copyrights may follow @@ -23,10 +25,31 @@ #include "ompi_config.h" #include "fcoll_vulcan.h" +#include "fcoll_vulcan_internal.h" #include "mpi.h" +#include "ompi/constants.h" +#include "ompi/mca/fcoll/fcoll.h" +#include "ompi/mca/fcoll/base/fcoll_base_coll_array.h" #include "ompi/mca/common/ompio/common_ompio.h" +#include "ompi/mca/common/ompio/common_ompio_buffer.h" +#include "ompi/mca/io/io.h" +#include "ompi/mca/common/ompio/common_ompio_request.h" +#include "math.h" +#include "ompi/mca/pml/pml.h" +#include "opal/mca/accelerator/accelerator.h" +#include <unistd.h> +#define DEBUG_ON 0 +#define NOT_AGGR_INDEX -1 + +static int shuffle_init (int index, int cycles, int aggregator, int rank, + mca_io_ompio_aggregator_data *data, ompi_request_t **reqs); + +static int read_init (ompio_file_t *fh, int index, int cycles, int aggregator, int rank, + mca_io_ompio_aggregator_data *aggr_data, + int read_syncType, ompi_request_t **request, + bool is_accelerator_buffer); int mca_fcoll_vulcan_file_read_all (struct ompio_file_t *fh, void *buf, @@ -34,7 +57,888 @@ int mca_fcoll_vulcan_file_read_all (struct ompio_file_t *fh, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { - return mca_common_ompio_base_file_read_all (fh, buf, count, datatype, status); + int index = 0; + int cycles = 0; + int ret =0, l, i, j, bytes_per_cycle; + uint32_t iov_count = 0; + struct iovec *decoded_iov = NULL; + struct iovec *local_iov_array=NULL; + uint32_t total_fview_count = 0; + int local_count = 0; + ompi_request_t **reqs = NULL; + ompi_request_t *req_iread = MPI_REQUEST_NULL; + ompi_request_t *req_tmp = MPI_REQUEST_NULL; + mca_io_ompio_aggregator_data **aggr_data=NULL; + + ptrdiff_t *displs = NULL; + int vulcan_num_io_procs; + size_t max_data = 0; + + struct iovec **broken_iov_arrays=NULL; + struct iovec **broken_decoded_iovs=NULL; + int *broken_counts=NULL; + int *broken_iov_counts=NULL; + MPI_Aint *broken_total_lengths=NULL; + + int aggr_index = NOT_AGGR_INDEX; + int read_sync_type = 2; + int *result_counts=NULL; + + ompi_count_array_t fview_count_desc; + ompi_disp_array_t displs_desc; + int is_gpu, is_managed; + bool use_accelerator_buffer = false; + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0; + double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0; + double exch_read = 0.0, start_exch = 0.0, end_exch = 0.0; + mca_common_ompio_print_entry nentry; +#endif + + vulcan_num_io_procs = fh->f_get_mca_parameter_value ( "num_aggregators", strlen ("num_aggregators")); + if (OMPI_ERR_MAX == vulcan_num_io_procs) { + ret = OMPI_ERROR; + goto exit; + } + bytes_per_cycle = fh->f_bytes_per_agg; + + if ((1 == mca_fcoll_vulcan_async_io) && (NULL == fh->f_fbtl->fbtl_ipreadv)) { + opal_output (1, "vulcan_read_all: fbtl Does NOT support ipreadv() (asynchronous read) \n"); + ret = MPI_ERR_UNSUPPORTED_OPERATION; + goto exit; + } + + mca_common_ompio_check_gpu_buf (fh, buf, &is_gpu, &is_managed); + if (is_gpu && !is_managed && + fh->f_get_mca_parameter_value ("use_accelerator_buffers", strlen("use_accelerator_buffers"))) { + use_accelerator_buffer = true; + } + /* since we want to overlap 2 iterations, define the bytes_per_cycle to be half of what + the user requested */ + bytes_per_cycle = bytes_per_cycle/2; + + /************************************************************************** + ** 1.
Decode user buffer into an iovec + **************************************************************************/ + ret = mca_common_ompio_decode_datatype ((struct ompio_file_t *) fh, + datatype, count, buf, &max_data, + fh->f_mem_convertor, &decoded_iov, + &iov_count); + if (OMPI_SUCCESS != ret){ + goto exit; + } + + if (MPI_STATUS_IGNORE != status) { + status->_ucount = max_data; + } + + ret = mca_fcoll_vulcan_get_configuration (fh, vulcan_num_io_procs, max_data); + if (OMPI_SUCCESS != ret){ + goto exit; + } + opal_output_verbose(10, ompi_fcoll_base_framework.framework_output, + "Using %d aggregators for the read_all operation \n", fh->f_num_aggrs); + + aggr_data = (mca_io_ompio_aggregator_data **) malloc (fh->f_num_aggrs * + sizeof(mca_io_ompio_aggregator_data*)); + if (NULL == aggr_data) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + for (i = 0; i < fh->f_num_aggrs; i++) { + // At this point we know the number of aggregators. If there is a correlation between + // number of aggregators and number of IO nodes, we know how many aggr_data arrays we need + // to allocate. + aggr_data[i] = (mca_io_ompio_aggregator_data *) calloc (1, sizeof(mca_io_ompio_aggregator_data)); + aggr_data[i]->procs_per_group = fh->f_procs_per_group; + aggr_data[i]->procs_in_group = fh->f_procs_in_group; + aggr_data[i]->comm = fh->f_comm; + // Identify if the process is an aggregator. + // If so, aggr_index would be its index in "aggr_data" and "aggregators" arrays. + if (fh->f_aggr_list[i] == fh->f_rank) { + aggr_index = i; + } + } + + /********************************************************************* + *** 2. Generate the local offsets/lengths array corresponding to + *** this read operation + ********************************************************************/ + ret = fh->f_generate_current_file_view ((struct ompio_file_t *) fh, + max_data, &local_iov_array, + &local_count); + if (ret != OMPI_SUCCESS) { + goto exit; + } + + /************************************************************************* + ** 2b. Separate the local_iov_array entries based on the number of aggregators + *************************************************************************/ + // Modifications for the even distribution: + long domain_size; + ret = mca_fcoll_vulcan_minmax (fh, local_iov_array, local_count, fh->f_num_aggrs, &domain_size); + + // broken_iov_arrays[0] contains broken_counts[0] entries to aggregator 0, + // broken_iov_arrays[1] contains broken_counts[1] entries to aggregator 1, etc. + ret = mca_fcoll_vulcan_break_file_view (decoded_iov, iov_count, + local_iov_array, local_count, + &broken_decoded_iovs, &broken_iov_counts, + &broken_iov_arrays, &broken_counts, + &broken_total_lengths, + fh->f_num_aggrs, domain_size); + + /************************************************************************** + ** 3. Determine the total amount of data to be read and no. 
of cycles + **************************************************************************/ +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + start_comm_time = MPI_Wtime(); +#endif + ret = fh->f_comm->c_coll->coll_allreduce (MPI_IN_PLACE, broken_total_lengths, + fh->f_num_aggrs, MPI_LONG, MPI_SUM, + fh->f_comm, + fh->f_comm->c_coll->coll_allreduce_module); + if (OMPI_SUCCESS != ret) { + goto exit; + } + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + end_comm_time = MPI_Wtime(); + comm_time += (end_comm_time - start_comm_time); +#endif + + cycles=0; + for (i = 0; i < fh->f_num_aggrs; i++) { +#if DEBUG_ON + printf("%d: Overall broken_total_lengths[%d] = %ld\n", fh->f_rank, i, broken_total_lengths[i]); +#endif + if (ceil((double)broken_total_lengths[i]/bytes_per_cycle) > cycles) { + cycles = ceil((double)broken_total_lengths[i]/bytes_per_cycle); + } + } + + result_counts = (int *) malloc (fh->f_num_aggrs * fh->f_procs_per_group * sizeof(int)); + if (NULL == result_counts) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + start_comm_time = MPI_Wtime(); +#endif + ret = fh->f_comm->c_coll->coll_allgather (broken_counts, fh->f_num_aggrs, MPI_INT, + result_counts, fh->f_num_aggrs, MPI_INT, + fh->f_comm, + fh->f_comm->c_coll->coll_allgather_module); + if (OMPI_SUCCESS != ret) { + goto exit; + } +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + end_comm_time = MPI_Wtime(); + comm_time += (end_comm_time - start_comm_time); +#endif + + /************************************************************* + *** 4. Allgather the offset/lengths array from all processes + *************************************************************/ + for (i = 0; i < fh->f_num_aggrs; i++) { + aggr_data[i]->total_bytes = broken_total_lengths[i]; + aggr_data[i]->decoded_iov = broken_decoded_iovs[i]; + aggr_data[i]->fview_count = (size_t *)malloc (fh->f_procs_per_group * sizeof (size_t)); + if (NULL == aggr_data[i]->fview_count) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + for (j = 0; j < fh->f_procs_per_group; j++) { + aggr_data[i]->fview_count[j] = result_counts[fh->f_num_aggrs*j+i]; + } + + displs = (ptrdiff_t *)malloc (fh->f_procs_per_group * sizeof (ptrdiff_t)); + if (NULL == displs) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + displs[0] = 0; + total_fview_count = (uint32_t) aggr_data[i]->fview_count[0]; + for (j = 1 ; j < fh->f_procs_per_group ; j++) { + total_fview_count += aggr_data[i]->fview_count[j]; + displs[j] = displs[j-1] + aggr_data[i]->fview_count[j-1]; + } + +#if DEBUG_ON + printf("total_fview_count : %d\n", total_fview_count); + if (fh->f_aggr_list[i] == fh->f_rank) { + for (j=0 ; j<fh->f_procs_per_group ; j++) { + printf ("%d: PROCESS: %d ELEMENTS: %ld DISPLS: %ld\n", + fh->f_rank, j, + aggr_data[i]->fview_count[j], + displs[j]); + } + } +#endif + + /* allocate the global iovec */ + if (0 != total_fview_count) { + aggr_data[i]->global_iov_array = (struct iovec*) malloc (total_fview_count * + sizeof(struct iovec)); + if (NULL == aggr_data[i]->global_iov_array) { + opal_output(1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + } + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + start_comm_time = MPI_Wtime(); +#endif + OMPI_COUNT_ARRAY_INIT(&fview_count_desc, aggr_data[i]->fview_count); + OMPI_DISP_ARRAY_INIT(&displs_desc, displs); + ret = fh->f_comm->c_coll->coll_allgatherv (broken_iov_arrays[i], + broken_counts[i], + fh->f_iov_type, + aggr_data[i]->global_iov_array, + fview_count_desc,
displs_desc, + fh->f_iov_type, + fh->f_comm, + fh->f_comm->c_coll->coll_allgatherv_module ); + if (OMPI_SUCCESS != ret) { + goto exit; + } + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + end_comm_time = MPI_Wtime(); + comm_time += (end_comm_time - start_comm_time); +#endif + + /**************************************************************************************** + *** 5. Sort the global offset/lengths list based on the offsets. + *** The result of the sort operation is the 'sorted', an integer array, + *** which contains the indexes of the global_iov_array based on the offset. + *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset + *** in the file, and that one is followed by global_iov_array[z].offset, than + *** sorted[0] = x, sorted[1]=y and sorted[2]=z; + ******************************************************************************************/ + if (0 != total_fview_count) { + aggr_data[i]->sorted = (int *)malloc (total_fview_count * sizeof(int)); + if (NULL == aggr_data[i]->sorted) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + ompi_fcoll_base_sort_iovec (aggr_data[i]->global_iov_array, total_fview_count, + aggr_data[i]->sorted); + } + + if (NULL != local_iov_array) { + free(local_iov_array); + local_iov_array = NULL; + } + + if (NULL != displs) { + free(displs); + displs=NULL; + } + +#if DEBUG_ON + if (fh->f_aggr_list[i] == fh->f_rank) { + uint32_t tv=0; + for (tv = 0 ; tv < total_fview_count ; tv++) { + printf("%d: OFFSET: %lu LENGTH: %ld\n", + fh->f_rank, + (uint64_t)aggr_data[i]->global_iov_array[aggr_data[i]->sorted[tv]].iov_base, + aggr_data[i]->global_iov_array[aggr_data[i]->sorted[tv]].iov_len); + } + } +#endif + /************************************************************* + *** 6. 
Determine the number of cycles required to execute this + *** operation + *************************************************************/ + aggr_data[i]->bytes_per_cycle = bytes_per_cycle; + + if (fh->f_aggr_list[i] == fh->f_rank) { + aggr_data[i]->disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int)); + if (NULL == aggr_data[i]->disp_index) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + aggr_data[i]->max_disp_index = (int *)calloc (fh->f_procs_per_group, sizeof (int)); + if (NULL == aggr_data[i]->max_disp_index) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + aggr_data[i]->blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*)); + if (NULL == aggr_data[i]->blocklen_per_process) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + aggr_data[i]->displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*)); + if (NULL == aggr_data[i]->displs_per_process) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + if (use_accelerator_buffer) { + opal_output_verbose(10, ompi_fcoll_base_framework.framework_output, + "Allocating GPU device buffer for aggregation\n"); + ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&aggr_data[i]->global_buf, + bytes_per_cycle); + if (OPAL_SUCCESS != ret) { + opal_output(1, "Could not allocate accelerator memory"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&aggr_data[i]->prev_global_buf, + bytes_per_cycle); + if (OPAL_SUCCESS != ret) { + opal_output(1, "Could not allocate accelerator memory"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + } else { + aggr_data[i]->global_buf = (char *) malloc (bytes_per_cycle); + aggr_data[i]->prev_global_buf = (char *) malloc (bytes_per_cycle); + if (NULL == aggr_data[i]->global_buf || NULL == aggr_data[i]->prev_global_buf){ + opal_output(1, "OUT OF MEMORY"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + } + + aggr_data[i]->recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * + sizeof(ompi_datatype_t *)); + aggr_data[i]->prev_recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * + sizeof(ompi_datatype_t *)); + if (NULL == aggr_data[i]->recvtype || NULL == aggr_data[i]->prev_recvtype) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + for(l=0;l<fh->f_procs_per_group;l++){ + aggr_data[i]->recvtype[l] = MPI_DATATYPE_NULL; + aggr_data[i]->prev_recvtype[l] = MPI_DATATYPE_NULL; + } + } + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + start_exch = MPI_Wtime(); +#endif + } + + reqs = (ompi_request_t **)malloc ((fh->f_procs_per_group + 1 )*fh->f_num_aggrs *sizeof(ompi_request_t *)); + if (NULL == reqs) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + for (l = 0, i = 0; i < fh->f_num_aggrs; i++) { + for (j=0; j< (fh->f_procs_per_group+1); j++) { + reqs[l] = MPI_REQUEST_NULL; + l++; + } + } + + if( (1 == mca_fcoll_vulcan_async_io) || + ( (0 == mca_fcoll_vulcan_async_io) && (NULL != fh->f_fbtl->fbtl_ipreadv) && (2 < cycles))) { + read_sync_type = 1; + } + + if (cycles > 0) { + if (NOT_AGGR_INDEX != aggr_index) { + // Register progress function that should be used by ompi_request_wait + mca_common_ompio_register_progress (); + } +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + start_read_time = MPI_Wtime(); +#endif +
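What follows is the double-buffered pipeline this whole setup exists for: the read of cycle 0 is primed outside the main loop, then every iteration posts the shuffle of cycle index-1 while the read of cycle index proceeds, with SWAP_AGGR_POINTERS exchanging the global_buf/prev_global_buf pair between the two stages. Reduced to its control flow (read_cycle/shuffle_cycle are hypothetical stand-ins for read_init/shuffle_init that only log the schedule):

    #include <stdio.h>

    /* hypothetical stand-ins for read_init/shuffle_init */
    static void read_cycle(int i, int buf)    { printf("read    cycle %d -> buf %d\n", i, buf); }
    static void shuffle_cycle(int i, int buf) { printf("shuffle cycle %d <- buf %d\n", i, buf); }

    int main(void)
    {
        int cycles = 4, cur = 0, prev = 1, tmp;

        if (cycles > 0) read_cycle(0, cur);        /* prime the pipeline */
        for (int i = 1; i < cycles; i++) {
            shuffle_cycle(i - 1, cur);             /* scatter the data read last cycle */
            tmp = cur; cur = prev; prev = tmp;     /* SWAP_AGGR_POINTERS equivalent */
            read_cycle(i, cur);                    /* overlaps with the shuffle above */
        }
        if (cycles > 0) shuffle_cycle(cycles - 1, cur);  /* drain the final cycle */
        return 0;
    }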
for (i = 0; i < fh->f_num_aggrs; i++) { + ret = read_init (fh, 0, cycles, fh->f_aggr_list[i], fh->f_rank, + aggr_data[i], read_sync_type, &req_tmp, + use_accelerator_buffer); + if (OMPI_SUCCESS != ret) { + goto exit; + } + if (fh->f_aggr_list[i] == fh->f_rank) { + req_iread = req_tmp; + } + } + + if (NOT_AGGR_INDEX != aggr_index) { + ret = ompi_request_wait(&req_iread, MPI_STATUS_IGNORE); + if (OMPI_SUCCESS != ret){ + goto exit; + } + } +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + end_read_time = MPI_Wtime(); + read_time += end_read_time - start_read_time; +#endif + } + + for (index = 1; index < cycles; index++) { + for (i = 0; i < fh->f_num_aggrs; i++) { + ret = shuffle_init (index-1, cycles, fh->f_aggr_list[i], fh->f_rank, aggr_data[i], + &reqs[i*(fh->f_procs_per_group + 1)] ); + if (OMPI_SUCCESS != ret) { + goto exit; + } + } + + SWAP_AGGR_POINTERS(aggr_data, fh->f_num_aggrs); +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + start_read_time = MPI_Wtime(); +#endif + for (i = 0; i < fh->f_num_aggrs; i++) { + ret = read_init (fh, index, cycles, fh->f_aggr_list[i], fh->f_rank, + aggr_data[i], read_sync_type, + &req_tmp, use_accelerator_buffer); + if (OMPI_SUCCESS != ret){ + goto exit; + } + if (fh->f_aggr_list[i] == fh->f_rank) { + req_iread = req_tmp; + } + } +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + end_read_time = MPI_Wtime(); + read_time += end_read_time - start_read_time; +#endif + ret = ompi_request_wait_all ((fh->f_procs_per_group + 1 )*fh->f_num_aggrs, + reqs, MPI_STATUS_IGNORE); + if (OMPI_SUCCESS != ret){ + goto exit; + } + + if (NOT_AGGR_INDEX != aggr_index) { + ret = ompi_request_wait (&req_iread, MPI_STATUS_IGNORE); + if (OMPI_SUCCESS != ret){ + goto exit; + } + } + } /* end for (index = 1; index < cycles; index++) */ + + if (cycles > 0) { + for (i = 0; i < fh->f_num_aggrs; i++) { + ret = shuffle_init (index-1, cycles, fh->f_aggr_list[i], fh->f_rank, aggr_data[i], + &reqs[i*(fh->f_procs_per_group + 1)] ); + if (OMPI_SUCCESS != ret) { + goto exit; + } + } + ret = ompi_request_wait_all ((fh->f_procs_per_group + 1 )*fh->f_num_aggrs, + reqs, MPI_STATUS_IGNORE); + if (OMPI_SUCCESS != ret){ + goto exit; + } + } + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + end_exch = MPI_Wtime(); + exch_read += end_exch - start_exch; + nentry.time[0] = read_time; + nentry.time[1] = comm_time; + nentry.time[2] = exch_read; + nentry.aggregator = 0; + for ( i=0; i<fh->f_num_aggrs; i++ ) { + if (fh->f_aggr_list[i] == fh->f_rank) + nentry.aggregator = 1; + } + nentry.nprocs_for_coll = fh->f_num_aggrs; + if (!mca_common_ompio_full_print_queue(fh->f_coll_read_time)){ + mca_common_ompio_register_print_entry(fh->f_coll_read_time, + nentry); + } +#endif + +exit : + if (NULL != aggr_data) { + + for (i = 0; i < fh->f_num_aggrs; i++) { + if (fh->f_aggr_list[i] == fh->f_rank) { + if (NULL != aggr_data[i]->recvtype){ + for (j = 0; j < aggr_data[i]->procs_per_group; j++) { + if (MPI_DATATYPE_NULL != aggr_data[i]->recvtype[j]) { + ompi_datatype_destroy(&aggr_data[i]->recvtype[j]); + } + if (MPI_DATATYPE_NULL != aggr_data[i]->prev_recvtype[j]) { + ompi_datatype_destroy(&aggr_data[i]->prev_recvtype[j]); + } + } + free(aggr_data[i]->recvtype); + free(aggr_data[i]->prev_recvtype); + } + + free (aggr_data[i]->disp_index); + free (aggr_data[i]->max_disp_index); + if (use_accelerator_buffer) { + opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->global_buf); + opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->prev_global_buf); + } else { + free (aggr_data[i]->global_buf); + free
(aggr_data[i]->prev_global_buf); + } + for (l = 0;l < aggr_data[i]->procs_per_group; l++) { + free (aggr_data[i]->blocklen_per_process[l]); + free (aggr_data[i]->displs_per_process[l]); + } + + free (aggr_data[i]->blocklen_per_process); + free (aggr_data[i]->displs_per_process); + } + free (aggr_data[i]->sorted); + free (aggr_data[i]->global_iov_array); + free (aggr_data[i]->fview_count); + free (aggr_data[i]->decoded_iov); + + free (aggr_data[i]); + } + free (aggr_data); + } + free(displs); + free(decoded_iov); + free(broken_counts); + free(broken_total_lengths); + free(broken_iov_counts); + free(broken_decoded_iovs); // decoded_iov arrays[i] were freed as aggr_data[i]->decoded_iov; + if (NULL != broken_iov_arrays) { + for (i = 0; i < fh->f_num_aggrs; i++) { + free(broken_iov_arrays[i]); + } + } + free(broken_iov_arrays); + free(fh->f_procs_in_group); + free(fh->f_aggr_list); + fh->f_procs_in_group=NULL; + fh->f_procs_per_group=0; + fh->f_aggr_list=NULL; + free(result_counts); + free(reqs); + + return ret; +} + +static int read_init (ompio_file_t *fh, int index, int cycles, int aggregator, int rank, + mca_io_ompio_aggregator_data *data, + int read_syncType, ompi_request_t **request, + bool is_accelerator_buffer) +{ + int ret = OMPI_SUCCESS; + ssize_t ret_temp = 0; + mca_ompio_request_t *ompio_req = NULL; + int i, j, l; + int entries_per_aggregator=0; + mca_io_ompio_local_io_array *file_offsets_for_agg=NULL; + MPI_Aint *memory_displacements=NULL; + int* blocklength_proc=NULL; + ptrdiff_t* displs_proc=NULL; + int *sorted_file_offsets=NULL; + + /********************************************************************** + *** 7a. Getting ready for next cycle: initializing and freeing buffers + **********************************************************************/ + data->bytes_sent = 0; + + if (aggregator == rank) { + if (NULL != data->recvtype){ + for (i = 0; i < data->procs_per_group; i++) { + if (MPI_DATATYPE_NULL != data->recvtype[i]) { + ompi_datatype_destroy(&data->recvtype[i]); + data->recvtype[i] = MPI_DATATYPE_NULL; + } + } + } + + for (l = 0; l < data->procs_per_group; l++) { + data->disp_index[l] = 0; + + if (data->max_disp_index[l] == 0) { + data->blocklen_per_process[l] = (int *) calloc (INIT_LEN, sizeof(int)); + data->displs_per_process[l] = (MPI_Aint *) calloc (INIT_LEN, sizeof(MPI_Aint)); + if (NULL == data->displs_per_process[l] || NULL == data->blocklen_per_process[l]){ + opal_output (1, "OUT OF MEMORY for displs\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + data->max_disp_index[l] = INIT_LEN; + } else { + memset (data->blocklen_per_process[l], 0, data->max_disp_index[l]*sizeof(int)); + memset (data->displs_per_process[l], 0, data->max_disp_index[l]*sizeof(MPI_Aint)); + } + } + } /* rank == aggregator */ + + /************************************************************************** + *** 7b. 
Determine the number of bytes to be actually read in this cycle + **************************************************************************/ + int local_cycles= ceil((double)data->total_bytes / data->bytes_per_cycle); + if (index < (local_cycles -1)) { + data->bytes_to_write_in_cycle = data->bytes_per_cycle; + } else if ( index == (local_cycles -1)) { + data->bytes_to_write_in_cycle = data->total_bytes - data->bytes_per_cycle*index; + } else { + data->bytes_to_write_in_cycle = 0; + } + data->bytes_to_write = data->bytes_to_write_in_cycle; + +#if DEBUG_ON + if (aggregator == rank) { + printf ("****%d: CYCLE %d Bytes %d**********\n", + rank, index, data->bytes_to_write_in_cycle); + } +#endif + + /***************************************************************** + *** 7c. Calculate how much data will be sent to each process in + *** this cycle + *****************************************************************/ + mca_fcoll_vulcan_calc_blocklen_disps(data, aggregator, rank, &data->bytes_sent); + + /************************************************************************* + *** 7d. Calculate the displacement + *************************************************************************/ + if (rank == aggregator) { + for (i = 0; i < data->procs_per_group; i++){ + for (j = 0; j < data->disp_index[i]; j++){ + if (data->blocklen_per_process[i][j] > 0) + entries_per_aggregator++ ; + } + } + } +#if DEBUG_ON + if (aggregator == rank) { + printf("%d : Entries per aggregator : %d\n", rank, entries_per_aggregator); + } +#endif + + if (entries_per_aggregator > 0) { + file_offsets_for_agg = (mca_io_ompio_local_io_array *) malloc (entries_per_aggregator + * sizeof(mca_io_ompio_local_io_array)); + memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); + sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); + if (NULL == file_offsets_for_agg || NULL == memory_displacements || + NULL == sorted_file_offsets) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + ret = mca_fcoll_vulcan_calc_file_offsets(data, file_offsets_for_agg, sorted_file_offsets, + memory_displacements, entries_per_aggregator, + rank, index); + if (OMPI_SUCCESS != ret) { + goto exit; + } + + /********************************************************** + *** 7f. 
Create the io array + *********************************************************/ + fh->f_io_array = (mca_common_ompio_io_array_t *) malloc (entries_per_aggregator + * sizeof (mca_common_ompio_io_array_t)); + if (NULL == fh->f_io_array) { + opal_output(1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + mca_fcoll_vulcan_calc_io_array(fh->f_io_array, &fh->f_num_of_io_entries, entries_per_aggregator, + (char*)data->global_buf, file_offsets_for_agg, sorted_file_offsets, + memory_displacements, rank); + } + + if (rank == aggregator && fh->f_num_of_io_entries) { + mca_common_ompio_request_alloc (&ompio_req, MCA_OMPIO_REQUEST_READ); + + if (1 == read_syncType) { + if (is_accelerator_buffer) { + ret = mca_common_ompio_file_iread_pregen(fh, (ompi_request_t *) ompio_req); + if (0 > ret) { + opal_output (1, "vulcan_read_all: mca_common_ompio_iread_pregen failed\n"); + ompio_req->req_ompi.req_status.MPI_ERROR = ret; + ompio_req->req_ompi.req_status._ucount = 0; + } + } else { + ret = fh->f_fbtl->fbtl_ipreadv(fh, (ompi_request_t *) ompio_req); + if (0 > ret) { + opal_output (1, "vulcan_read_all: fbtl_ipreadv failed\n"); + ompio_req->req_ompi.req_status.MPI_ERROR = ret; + ompio_req->req_ompi.req_status._ucount = 0; + } + } + } + else { + ret_temp = fh->f_fbtl->fbtl_preadv(fh); + if (0 > ret_temp) { + opal_output (1, "vulcan_read_all: fbtl_preadv failed\n"); + ret = ret_temp; + ret_temp = 0; + } + + ompio_req->req_ompi.req_status.MPI_ERROR = ret; + ompio_req->req_ompi.req_status._ucount = ret_temp; + ompi_request_complete (&ompio_req->req_ompi, false); + } + + free(fh->f_io_array); + } + +#if DEBUG_ON + printf("************Cycle: %d, Aggregator: %d ***************\n", + index, rank); + for (i = 0; i < data->procs_per_group; i++) { + for (j = 0; j < data->disp_index[i]; j++) { + if (data->blocklen_per_process[i][j] > 0) { + printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", + data->procs_in_group[i],j, + data->blocklen_per_process[i][j],j, + data->displs_per_process[i][j], rank); + } + } + } +#endif + +exit: + free(sorted_file_offsets); + free(file_offsets_for_agg); + free(memory_displacements); + free(blocklength_proc); + free(displs_proc); + + fh->f_io_array = NULL; + fh->f_num_of_io_entries = 0; + + *request = (ompi_request_t *) ompio_req; + return ret; } +static int shuffle_init (int index, int cycles, int aggregator, int rank, mca_io_ompio_aggregator_data *data, + ompi_request_t **reqs) +{ + int i, ret = OMPI_SUCCESS; + int* blocklength_proc=NULL; + ptrdiff_t* displs_proc=NULL; + + /************************************************************************* + *** 7e. 
Perform the actual communication + *************************************************************************/ + if (aggregator == rank ) { + for (i = 0; i < data->procs_per_group; i++) { + size_t datatype_size; + reqs[i] = MPI_REQUEST_NULL; + if (0 < data->disp_index[i]) { + ompi_datatype_create_hindexed (data->disp_index[i], + data->blocklen_per_process[i], + data->displs_per_process[i], + MPI_BYTE, + &data->recvtype[i]); + ompi_datatype_commit (&data->recvtype[i]); + opal_datatype_type_size (&data->recvtype[i]->super, &datatype_size); + if (datatype_size){ + ret = MCA_PML_CALL(isend(data->global_buf, + 1, data->recvtype[i], + data->procs_in_group[i], + FCOLL_VULCAN_SHUFFLE_TAG+index, + MCA_PML_BASE_SEND_STANDARD, + data->comm, &reqs[i])); + if (OMPI_SUCCESS != ret){ + goto exit; + } + } + } + } + // } /* end if (entries_per_aggr > 0 ) */ + }/* end if (aggregator == rank ) */ + + reqs[data->procs_per_group] = MPI_REQUEST_NULL; + if (data->bytes_sent) { + size_t remaining = data->bytes_sent; + int block_index = -1; + int blocklength_size = INIT_LEN; + + ptrdiff_t recv_mem_address = 0; + ompi_datatype_t *newType = MPI_DATATYPE_NULL; + blocklength_proc = (int *) calloc (blocklength_size, sizeof (int)); + displs_proc = (ptrdiff_t *) calloc (blocklength_size, sizeof (ptrdiff_t)); + + if (NULL == blocklength_proc || NULL == displs_proc ) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + while (remaining) { + block_index++; + + if(0 == block_index) { + recv_mem_address = (ptrdiff_t) (data->decoded_iov[data->iov_index].iov_base) + + data->current_position; + } + else { + // Reallocate more memory if blocklength_size is not enough + if(0 == block_index % INIT_LEN) { + blocklength_size += INIT_LEN; + blocklength_proc = (int *) realloc(blocklength_proc, blocklength_size * sizeof(int)); + displs_proc = (ptrdiff_t *) realloc(displs_proc, blocklength_size * sizeof(ptrdiff_t)); + } + displs_proc[block_index] = (ptrdiff_t) (data->decoded_iov[data->iov_index].iov_base) + + data->current_position - recv_mem_address; + } + + if (remaining >= + (data->decoded_iov[data->iov_index].iov_len - data->current_position)) { + + blocklength_proc[block_index] = data->decoded_iov[data->iov_index].iov_len - + data->current_position; + remaining = remaining - (data->decoded_iov[data->iov_index].iov_len - + data->current_position); + data->iov_index = data->iov_index + 1; + data->current_position = 0; + } else { + blocklength_proc[block_index] = remaining; + data->current_position += remaining; + remaining = 0; + } + } + + data->total_bytes_written += data->bytes_sent; + + if (0 <= block_index) { + ompi_datatype_create_hindexed (block_index+1, + blocklength_proc, + displs_proc, + MPI_BYTE, + &newType); + ompi_datatype_commit (&newType); + + ret = MCA_PML_CALL(irecv((char *)recv_mem_address, + 1, + newType, + aggregator, + FCOLL_VULCAN_SHUFFLE_TAG+index, + data->comm, + &reqs[data->procs_per_group])); + if (MPI_DATATYPE_NULL != newType) { + ompi_datatype_destroy(&newType); + } + if (OMPI_SUCCESS != ret){ + goto exit; + } + } + } +exit: + return ret; +} diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c b/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c index 5f89fba8d01..b6e9be6d2ca 100644 --- a/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c +++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * 
University Research and Technology @@ -25,6 +26,7 @@ #include "ompi_config.h" #include "fcoll_vulcan.h" +#include "fcoll_vulcan_internal.h" #include "mpi.h" #include "ompi/constants.h" @@ -42,85 +44,16 @@ #define DEBUG_ON 0 #define NOT_AGGR_INDEX -1 -/*Used for loading file-offsets per aggregator*/ -typedef struct mca_io_ompio_local_io_array{ - OMPI_MPI_OFFSET_TYPE offset; - MPI_Aint length; - int process_id; -}mca_io_ompio_local_io_array; - -typedef struct mca_io_ompio_aggregator_data { - int *disp_index, *sorted, n; - size_t *fview_count; - int *max_disp_index; - int **blocklen_per_process; - MPI_Aint **displs_per_process, total_bytes, bytes_per_cycle, total_bytes_written; - MPI_Comm comm; - char *buf, *global_buf, *prev_global_buf; - ompi_datatype_t **recvtype, **prev_recvtype; - struct iovec *global_iov_array; - int current_index, current_position; - int bytes_to_write_in_cycle, bytes_remaining, procs_per_group; - int *procs_in_group, iov_index; - int bytes_sent, prev_bytes_sent; - struct iovec *decoded_iov; - int bytes_to_write, prev_bytes_to_write; - mca_common_ompio_io_array_t *io_array, *prev_io_array; - int num_io_entries, prev_num_io_entries; -} mca_io_ompio_aggregator_data; - - -#define SWAP_REQUESTS(_r1,_r2) { \ - ompi_request_t **_t=_r1; \ - _r1=_r2; \ - _r2=_t;} - -#define SWAP_AGGR_POINTERS(_aggr,_num) { \ - int _i; \ - char *_t; \ - for (_i=0; _i<_num; _i++ ) { \ - _aggr[_i]->prev_io_array=_aggr[_i]->io_array; \ - _aggr[_i]->prev_num_io_entries=_aggr[_i]->num_io_entries; \ - _aggr[_i]->prev_bytes_sent=_aggr[_i]->bytes_sent; \ - _aggr[_i]->prev_bytes_to_write=_aggr[_i]->bytes_to_write; \ - _t=_aggr[_i]->prev_global_buf; \ - _aggr[_i]->prev_global_buf=_aggr[_i]->global_buf; \ - _aggr[_i]->global_buf=_t; \ - _t=(char *)_aggr[_i]->recvtype; \ - _aggr[_i]->recvtype=_aggr[_i]->prev_recvtype; \ - _aggr[_i]->prev_recvtype=(ompi_datatype_t **)_t; } \ -} -static int shuffle_init ( int index, int cycles, int aggregator, int rank, - mca_io_ompio_aggregator_data *data, - ompi_request_t **reqs ); +static int shuffle_init (int index, int num_cycles, int aggregator, int rank, + mca_io_ompio_aggregator_data *data, ompi_request_t **reqs); + static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data, - int write_chunksize, int write_synchType, ompi_request_t **request, + int write_syncType, ompi_request_t **request, bool is_accelerator_buffer); -int mca_fcoll_vulcan_break_file_view ( struct iovec *decoded_iov, int iov_count, - struct iovec *local_iov_array, int local_count, - struct iovec ***broken_decoded_iovs, int **broken_iov_counts, - struct iovec ***broken_iov_arrays, int **broken_counts, - MPI_Aint **broken_total_lengths, - int stripe_count, size_t stripe_size); - - -int mca_fcoll_vulcan_get_configuration (ompio_file_t *fh, int num_io_procs, - int num_groups, size_t max_data); - static int local_heap_sort (mca_io_ompio_local_io_array *io_array, - int num_entries, - int *sorted); - -int mca_fcoll_vulcan_split_iov_array ( ompio_file_t *fh, mca_common_ompio_io_array_t *work_array, - int num_entries, int *last_array_pos, int *last_pos_in_field, - int chunk_size ); - - -static int mca_fcoll_vulcan_minmax ( ompio_file_t *fh, struct iovec *iov, int iov_count, int num_aggregators, - long *new_stripe_size); - + int num_entries, int *sorted); int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, const void *buf, @@ -143,7 +76,6 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, ptrdiff_t *displs = NULL; int vulcan_num_io_procs; size_t 
max_data = 0; - MPI_Aint *total_bytes_per_process = NULL; struct iovec **broken_iov_arrays=NULL; struct iovec **broken_decoded_iovs=NULL; @@ -153,7 +85,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, int aggr_index = NOT_AGGR_INDEX; int write_synch_type = 2; - int write_chunksize, *result_counts=NULL; + int *result_counts=NULL; ompi_count_array_t fview_count_desc; ompi_disp_array_t displs_desc; @@ -186,13 +118,12 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, mca_common_ompio_check_gpu_buf (fh, buf, &is_gpu, &is_managed); if (is_gpu && !is_managed && - fh->f_get_mca_parameter_value ("use_accelerator_buffers", strlen("use_accelerator_buffers"))) { - use_accelerator_buffer = true; + fh->f_get_mca_parameter_value ("use_accelerator_buffers", strlen("use_accelerator_buffers"))) { + use_accelerator_buffer = true; } /* since we want to overlap 2 iterations, define the bytes_per_cycle to be half of what the user requested */ bytes_per_cycle =bytes_per_cycle/2; - write_chunksize = bytes_per_cycle; ret = mca_common_ompio_decode_datatype ((struct ompio_file_t *) fh, datatype, @@ -207,14 +138,15 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, } if ( MPI_STATUS_IGNORE != status ) { - status->_ucount = max_data; + status->_ucount = max_data; } - - ret = mca_fcoll_vulcan_get_configuration (fh, vulcan_num_io_procs, mca_fcoll_vulcan_num_groups, max_data); + ret = mca_fcoll_vulcan_get_configuration (fh, vulcan_num_io_procs, max_data); if (OMPI_SUCCESS != ret){ - goto exit; + goto exit; } + opal_output_verbose(10, ompi_fcoll_base_framework.framework_output, + "Using %d aggregators for the write_all operation \n", fh->f_num_aggrs); aggr_data = (mca_io_ompio_aggregator_data **) malloc ( fh->f_num_aggrs * sizeof(mca_io_ompio_aggregator_data*)); @@ -227,7 +159,6 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, aggr_data[i]->procs_per_group = fh->f_procs_per_group; aggr_data[i]->procs_in_group = fh->f_procs_in_group; aggr_data[i]->comm = fh->f_comm; - aggr_data[i]->buf = (char *)buf; // should not be used in the new version. // Identify if the process is an aggregator. // If so, aggr_index would be its index in "aggr_data" and "aggregators" arrays. 
if(fh->f_aggr_list[i] == fh->f_rank) { @@ -240,11 +171,11 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, *** this write operation ********************************************************************/ ret = fh->f_generate_current_file_view( (struct ompio_file_t *) fh, - max_data, - &local_iov_array, - &local_count); + max_data, + &local_iov_array, + &local_count); if (ret != OMPI_SUCCESS){ - goto exit; + goto exit; } /************************************************************************* @@ -270,52 +201,15 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif - if ( 1 == mca_fcoll_vulcan_num_groups ) { - ret = fh->f_comm->c_coll->coll_allreduce (MPI_IN_PLACE, - broken_total_lengths, - fh->f_num_aggrs, - MPI_LONG, - MPI_SUM, - fh->f_comm, - fh->f_comm->c_coll->coll_allreduce_module); - if( OMPI_SUCCESS != ret){ - goto exit; - } - - } - else { - total_bytes_per_process = (MPI_Aint*)malloc - (fh->f_num_aggrs * fh->f_procs_per_group*sizeof(MPI_Aint)); - if (NULL == total_bytes_per_process) { - opal_output (1, "OUT OF MEMORY\n"); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit; - } - - ret = ompi_fcoll_base_coll_allgather_array (broken_total_lengths, - fh->f_num_aggrs, - MPI_LONG, - total_bytes_per_process, - fh->f_num_aggrs, - MPI_LONG, - 0, - fh->f_procs_in_group, - fh->f_procs_per_group, - fh->f_comm); - if( OMPI_SUCCESS != ret){ - goto exit; - } - - for ( i=0; i<fh->f_num_aggrs; i++ ) { - broken_total_lengths[i] = 0; - for (j=0 ; j<fh->f_procs_per_group ; j++) { - broken_total_lengths[i] += total_bytes_per_process[j*fh->f_num_aggrs + i]; - } - } - if (NULL != total_bytes_per_process) { - free (total_bytes_per_process); - total_bytes_per_process = NULL; - } + ret = fh->f_comm->c_coll->coll_allreduce (MPI_IN_PLACE, + broken_total_lengths, + fh->f_num_aggrs, + MPI_LONG, + MPI_SUM, + fh->f_comm, + fh->f_comm->c_coll->coll_allreduce_module); + if( OMPI_SUCCESS != ret){ + goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN @@ -342,28 +236,14 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif - if ( 1 == mca_fcoll_vulcan_num_groups ) { - ret = fh->f_comm->c_coll->coll_allgather(broken_counts, - fh->f_num_aggrs, - MPI_INT, - result_counts, - fh->f_num_aggrs, - MPI_INT, - fh->f_comm, - fh->f_comm->c_coll->coll_allgather_module); - } - else { - ret = ompi_fcoll_base_coll_allgather_array (broken_counts, - fh->f_num_aggrs, - MPI_INT, - result_counts, - fh->f_num_aggrs, - MPI_INT, - 0, - fh->f_procs_in_group, - fh->f_procs_per_group, - fh->f_comm); - } + ret = fh->f_comm->c_coll->coll_allgather(broken_counts, + fh->f_num_aggrs, + MPI_INT, + result_counts, + fh->f_num_aggrs, + MPI_INT, + fh->f_comm, + fh->f_comm->c_coll->coll_allgather_module); if( OMPI_SUCCESS != ret){ goto exit; } @@ -428,32 +308,17 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif - if ( 1 == mca_fcoll_vulcan_num_groups ) { - OMPI_COUNT_ARRAY_INIT(&fview_count_desc, aggr_data[i]->fview_count); - OMPI_DISP_ARRAY_INIT(&displs_desc, displs); - ret = fh->f_comm->c_coll->coll_allgatherv (broken_iov_arrays[i], - broken_counts[i], - fh->f_iov_type, - aggr_data[i]->global_iov_array, - fview_count_desc, - displs_desc, - fh->f_iov_type, - fh->f_comm, - fh->f_comm->c_coll->coll_allgatherv_module ); - } - else { - ret = ompi_fcoll_base_coll_allgatherv_array (broken_iov_arrays[i], -
broken_counts[i], - fh->f_iov_type, - aggr_data[i]->global_iov_array, - aggr_data[i]->fview_count, - displs, - fh->f_iov_type, - fh->f_aggr_list[i], - fh->f_procs_in_group, - fh->f_procs_per_group, - fh->f_comm); - } + OMPI_COUNT_ARRAY_INIT(&fview_count_desc, aggr_data[i]->fview_count); + OMPI_DISP_ARRAY_INIT(&displs_desc, displs); + ret = fh->f_comm->c_coll->coll_allgatherv (broken_iov_arrays[i], + broken_counts[i], + fh->f_iov_type, + aggr_data[i]->global_iov_array, + fview_count_desc, + displs_desc, + fh->f_iov_type, + fh->f_comm, + fh->f_comm->c_coll->coll_allgatherv_module ); if (OMPI_SUCCESS != ret){ goto exit; } @@ -539,8 +404,8 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, } if (use_accelerator_buffer) { - opal_output_verbose(10, ompi_fcoll_base_framework.framework_output, - "Allocating GPU device buffer for aggregation\n"); + opal_output_verbose(10, ompi_fcoll_base_framework.framework_output, + "Allocating GPU device buffer for aggregation\n"); ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&aggr_data[i]->global_buf, bytes_per_cycle); if (OPAL_SUCCESS != ret) { @@ -583,10 +448,9 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_exch = MPI_Wtime(); #endif - } + } reqs = (ompi_request_t **)malloc ((fh->f_procs_per_group + 1 )*fh->f_num_aggrs *sizeof(ompi_request_t *)); - if ( NULL == reqs ) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; @@ -632,7 +496,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, start_write_time = MPI_Wtime(); #endif ret = write_init (fh, fh->f_aggr_list[aggr_index], aggr_data[aggr_index], - write_chunksize, write_synch_type, &req_iwrite, use_accelerator_buffer); + write_synch_type, &req_iwrite, use_accelerator_buffer); if (OMPI_SUCCESS != ret){ goto exit; } @@ -672,7 +536,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, start_write_time = MPI_Wtime(); #endif ret = write_init (fh, fh->f_aggr_list[aggr_index], aggr_data[aggr_index], - write_chunksize, write_synch_type, &req_iwrite, use_accelerator_buffer); + write_synch_type, &req_iwrite, use_accelerator_buffer); if (OMPI_SUCCESS != ret){ goto exit; } @@ -699,7 +563,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, nentry.aggregator = 0; for ( i=0; i<fh->f_num_aggrs; i++ ) { if (fh->f_aggr_list[i] == fh->f_rank) - nentry.aggregator = 1; + nentry.aggregator = 1; } nentry.nprocs_for_coll = fh->f_num_aggrs; if (!mca_common_ompio_full_print_queue(fh->f_coll_write_time)){ @@ -707,15 +571,13 @@ nentry); } #endif - - + exit : - + if ( NULL != aggr_data ) { - - for ( i=0; i< fh->f_num_aggrs; i++ ) { + for ( i=0; i< fh->f_num_aggrs; i++ ) { if (fh->f_aggr_list[i] == fh->f_rank) { - if (NULL != aggr_data[i]->recvtype){ + if (NULL != aggr_data[i]->recvtype) { for (j =0; j< aggr_data[i]->procs_per_group; j++) { if ( MPI_DATATYPE_NULL != aggr_data[i]->recvtype[j] ) { ompi_datatype_destroy(&aggr_data[i]->recvtype[j]); @@ -723,26 +585,25 @@ exit : if ( MPI_DATATYPE_NULL != aggr_data[i]->prev_recvtype[j] ) { ompi_datatype_destroy(&aggr_data[i]->prev_recvtype[j]); } - } free(aggr_data[i]->recvtype); free(aggr_data[i]->prev_recvtype); } - + free (aggr_data[i]->disp_index); free (aggr_data[i]->max_disp_index); - if (use_accelerator_buffer) { - opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->global_buf); - opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID,
aggr_data[i]->prev_global_buf); - } else { - free (aggr_data[i]->global_buf); - free (aggr_data[i]->prev_global_buf); - } + if (use_accelerator_buffer) { + opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->global_buf); + opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->prev_global_buf); + } else { + free (aggr_data[i]->global_buf); + free (aggr_data[i]->prev_global_buf); + } for(l=0;l<aggr_data[i]->procs_per_group;l++){ free (aggr_data[i]->blocklen_per_process[l]); free (aggr_data[i]->displs_per_process[l]); } - + free (aggr_data[i]->blocklen_per_process); free (aggr_data[i]->displs_per_process); } @@ -750,7 +611,6 @@ exit : free (aggr_data[i]->global_iov_array); free (aggr_data[i]->fview_count); free (aggr_data[i]->decoded_iov); - free (aggr_data[i]); } free (aggr_data); @@ -774,37 +634,40 @@ exit : fh->f_aggr_list=NULL; free(result_counts); free(reqs); - + return OMPI_SUCCESS; } static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data, - int write_chunksize, - int write_synchType, + int write_syncType, ompi_request_t **request, bool is_accelerator_buffer) { int ret = OMPI_SUCCESS; ssize_t ret_temp = 0; - int last_array_pos = 0; - int last_pos = 0; + int i; mca_ompio_request_t *ompio_req = NULL; mca_common_ompio_request_alloc ( &ompio_req, MCA_OMPIO_REQUEST_WRITE ); if (aggr_data->prev_num_io_entries) { - /* In this case, aggr_data->prev_num_io_entries is always == 1. - Therefore we can write the data of size aggr_data->prev_bytes_to_write in one iteration. - In fact, aggr_data->prev_bytes_to_write <= write_chunksize. - */ - mca_fcoll_vulcan_split_iov_array (fh, aggr_data->prev_io_array, - aggr_data->prev_num_io_entries, - &last_array_pos, &last_pos, - write_chunksize); + fh->f_num_of_io_entries = aggr_data->prev_num_io_entries; + fh->f_io_array = (mca_common_ompio_io_array_t *) malloc (fh->f_num_of_io_entries * + sizeof(mca_common_ompio_io_array_t)); + if ( NULL == fh->f_io_array ){ + opal_output (1,"Could not allocate memory\n"); + return -1; + } + + for (i = 0; i < fh->f_num_of_io_entries; i++) { + fh->f_io_array[i].memory_address = aggr_data->prev_io_array[i].memory_address; + fh->f_io_array[i].offset = aggr_data->prev_io_array[i].offset; + fh->f_io_array[i].length = aggr_data->prev_io_array[i].length; + } - if (1 == write_synchType) { + if (1 == write_syncType) { if (is_accelerator_buffer) { ret = mca_common_ompio_file_iwrite_pregen(fh, (ompi_request_t *) ompio_req); if(0 > ret) { @@ -853,21 +716,16 @@ static int write_init (ompio_file_t *fh, return ret; } -static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_io_ompio_aggregator_data *data, - ompi_request_t **reqs ) +static int shuffle_init (int index, int num_cycles, int aggregator, int rank, + mca_io_ompio_aggregator_data *data, ompi_request_t **reqs) { - int bytes_sent = 0; - int blocks=0, temp_pindex; - int i, j, l, ret; - int entries_per_aggregator=0; + size_t bytes_sent = 0; + int i, j, l; + int ret = OMPI_SUCCESS; + int entries_per_aggregator = 0; mca_io_ompio_local_io_array *file_offsets_for_agg=NULL; int *sorted_file_offsets=NULL; - int temp_index=0; MPI_Aint *memory_displacements=NULL; - int *temp_disp_index=NULL; -#if DEBUG_ON - MPI_Aint global_count = 0; -#endif int* blocklength_proc=NULL; ptrdiff_t* displs_proc=NULL; @@ -879,21 +737,19 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i *** 7a.
Getting ready for next cycle: initializing and freeing buffers **********************************************************************/ if (aggregator == rank) { - if (NULL != data->recvtype){ for (i =0; i< data->procs_per_group; i++) { - if ( MPI_DATATYPE_NULL != data->recvtype[i] ) { + if (MPI_DATATYPE_NULL != data->recvtype[i]) { ompi_datatype_destroy(&data->recvtype[i]); data->recvtype[i] = MPI_DATATYPE_NULL; } } } - - for(l=0;l<data->procs_per_group;l++){ + for(l = 0; l < data->procs_per_group; l++){ data->disp_index[l] = 0; - - if ( data->max_disp_index[l] == 0 ) { + + if (data->max_disp_index[l] == 0) { data->blocklen_per_process[l] = (int *) calloc (INIT_LEN, sizeof(int)); data->displs_per_process[l] = (MPI_Aint *) calloc (INIT_LEN, sizeof(MPI_Aint)); if (NULL == data->displs_per_process[l] || NULL == data->blocklen_per_process[l]){ @@ -902,25 +758,22 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i goto exit; } data->max_disp_index[l] = INIT_LEN; - } - else { - memset ( data->blocklen_per_process[l], 0, data->max_disp_index[l]*sizeof(int) ); - memset ( data->displs_per_process[l], 0, data->max_disp_index[l]*sizeof(MPI_Aint) ); + } else { + memset (data->blocklen_per_process[l], 0, data->max_disp_index[l]*sizeof(int)); + memset (data->displs_per_process[l], 0, data->max_disp_index[l]*sizeof(MPI_Aint)); } } } /* (aggregator == rank */ - + /************************************************************************** *** 7b. Determine the number of bytes to be actually written in this cycle **************************************************************************/ int local_cycles= ceil((double)data->total_bytes / data->bytes_per_cycle); - if ( index < (local_cycles -1) ) { + if (index < (local_cycles -1)) { data->bytes_to_write_in_cycle = data->bytes_per_cycle; - } - else if ( index == (local_cycles -1)) { + } else if (index == (local_cycles -1)) { data->bytes_to_write_in_cycle = data->total_bytes - data->bytes_per_cycle*index ; - } - else { + } else { data->bytes_to_write_in_cycle = 0; } data->bytes_to_write = data->bytes_to_write_in_cycle; @@ -928,309 +781,57 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i #if DEBUG_ON if (aggregator == rank) { printf ("****%d: CYCLE %d Bytes %lld**********\n", - rank, - index, - data->bytes_to_write_in_cycle); + rank, index, data->bytes_to_write_in_cycle); } #endif - /********************************************************** - **Gather the Data from all the processes at the writers ** - *********************************************************/ - -#if DEBUG_ON - printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", data->bytes_to_write_in_cycle, - index); -#endif - + /***************************************************************** *** 7c.
+ /***************************************************************** *** 7c. Calculate how much data will be contributed in this cycle *** by each process *****************************************************************/ - - /* The blocklen and displs calculation only done at aggregators!*/ - while (data->bytes_to_write_in_cycle) { + mca_fcoll_vulcan_calc_blocklen_disps (data, aggregator, rank, &bytes_sent); - /* This next block identifies which process is the holder - ** of the sorted[current_index] element; - */ - blocks = data->fview_count[0]; - for (j=0 ; j<data->procs_per_group ; j++) { - if (data->sorted[data->current_index] < blocks) { - data->n = j; - break; - } - else { - blocks += data->fview_count[j+1]; - } - } - - if (data->bytes_remaining) { - /* Finish up a partially used buffer from the previous cycle */ - - if (data->bytes_remaining <= data->bytes_to_write_in_cycle) { - /* The data fits completely into the block */ - if (aggregator == rank) { - data->blocklen_per_process[data->n][data->disp_index[data->n]] = data->bytes_remaining; - data->displs_per_process[data->n][data->disp_index[data->n]] = - (ptrdiff_t)data->global_iov_array[data->sorted[data->current_index]].iov_base + - (data->global_iov_array[data->sorted[data->current_index]].iov_len - - data->bytes_remaining); - - data->disp_index[data->n] += 1; - - /* In this cases the length is consumed so allocating for - next displacement and blocklength*/ - if ( data->disp_index[data->n] == data->max_disp_index[data->n] ) { - data->max_disp_index[data->n] *= 2; - - data->blocklen_per_process[data->n] = (int *) realloc - ((void *)data->blocklen_per_process[data->n], - (data->max_disp_index[data->n])*sizeof(int)); - data->displs_per_process[data->n] = (MPI_Aint *) realloc - ((void *)data->displs_per_process[data->n], - (data->max_disp_index[data->n])*sizeof(MPI_Aint)); - } - data->blocklen_per_process[data->n][data->disp_index[data->n]] = 0; - data->displs_per_process[data->n][data->disp_index[data->n]] = 0; - - } - if (data->procs_in_group[data->n] == rank) { - bytes_sent += data->bytes_remaining; - } - data->current_index ++; - data->bytes_to_write_in_cycle -= data->bytes_remaining; - data->bytes_remaining = 0; - } - else { - /* the remaining data from the previous cycle is larger than the - data->bytes_to_write_in_cycle, so we have to segment again */ - if (aggregator == rank) { - data->blocklen_per_process[data->n][data->disp_index[data->n]] = data->bytes_to_write_in_cycle; - data->displs_per_process[data->n][data->disp_index[data->n]] = - (ptrdiff_t)data->global_iov_array[data->sorted[data->current_index]].iov_base + - (data->global_iov_array[data->sorted[data->current_index]].iov_len - - data->bytes_remaining); - data->disp_index[data->n] += 1; - } - - if (data->procs_in_group[data->n] == rank) { - bytes_sent += data->bytes_to_write_in_cycle; - } - data->bytes_remaining -= data->bytes_to_write_in_cycle; - data->bytes_to_write_in_cycle = 0; - break; - } - } - else { - /* No partially used entry available, have to start a new one */ - if (data->bytes_to_write_in_cycle < - (MPI_Aint) data->global_iov_array[data->sorted[data->current_index]].iov_len) { - /* This entry has more data than we can sendin one cycle */ - if (aggregator == rank) { - data->blocklen_per_process[data->n][data->disp_index[data->n]] = data->bytes_to_write_in_cycle; - data->displs_per_process[data->n][data->disp_index[data->n]] = - (ptrdiff_t)data->global_iov_array[data->sorted[data->current_index]].iov_base ; - data->disp_index[data->n] += 1; - } - if (data->procs_in_group[data->n] == rank) { - bytes_sent += 
data->bytes_to_write_in_cycle; - - } - data->bytes_remaining = data->global_iov_array[data->sorted[data->current_index]].iov_len - - data->bytes_to_write_in_cycle; - data->bytes_to_write_in_cycle = 0; - break; - } - else { - /* Next data entry is less than data->bytes_to_write_in_cycle */ - if (aggregator == rank) { - data->blocklen_per_process[data->n][data->disp_index[data->n]] = - data->global_iov_array[data->sorted[data->current_index]].iov_len; - data->displs_per_process[data->n][data->disp_index[data->n]] = (ptrdiff_t) - data->global_iov_array[data->sorted[data->current_index]].iov_base; - - data->disp_index[data->n] += 1; - - /*realloc for next blocklength - and assign this displacement and check for next displs as - the total length of this entry has been consumed!*/ - if ( data->disp_index[data->n] == data->max_disp_index[data->n] ) { - data->max_disp_index[data->n] *=2 ; - data->blocklen_per_process[data->n] = (int *) realloc ( - (void *)data->blocklen_per_process[data->n], - (data->max_disp_index[data->n]*sizeof(int))); - data->displs_per_process[data->n] = (MPI_Aint *)realloc ( - (void *)data->displs_per_process[data->n], - (data->max_disp_index[data->n]*sizeof(MPI_Aint))); - } - data->blocklen_per_process[data->n][data->disp_index[data->n]] = 0; - data->displs_per_process[data->n][data->disp_index[data->n]] = 0; - } - if (data->procs_in_group[data->n] == rank) { - bytes_sent += data->global_iov_array[data->sorted[data->current_index]].iov_len; - } - data->bytes_to_write_in_cycle -= - data->global_iov_array[data->sorted[data->current_index]].iov_len; - data->current_index ++; - } - } - }
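The loop deleted above (it reappears nearly verbatim below as mca_fcoll_vulcan_calc_blocklen_disps) walks the sorted global file view and cuts it into per-cycle pieces, with bytes_remaining carrying a partially consumed entry over to the next cycle. The same bookkeeping reduced to a standalone sketch, with invented entry lengths and a 256-byte cycle budget:

#include <stdio.h>

int main(void)
{
    long entry_len[] = {300, 50, 500};  /* illustrative file-view entry lengths */
    int n_entries = 3, idx = 0;
    long bytes_per_cycle = 256, bytes_remaining = 0;

    for (int cycle = 0; idx < n_entries; cycle++) {
        long budget = bytes_per_cycle;  /* bytes_to_write_in_cycle */
        printf("cycle %d:", cycle);
        while (budget > 0 && idx < n_entries) {
            long avail = bytes_remaining ? bytes_remaining : entry_len[idx];
            long take  = avail <= budget ? avail : budget;
            printf(" %ld bytes of entry %d", take, idx);
            budget -= take;
            if (take < avail) {
                bytes_remaining = avail - take;  /* entry spills into the next cycle */
            } else {
                bytes_remaining = 0;             /* entry fully consumed, advance    */
                idx++;
            }
        }
        printf("\n");
    }
    return 0;
}

Entry 0 (300 bytes) spans cycles 0 and 1, and entry 2 (500 bytes) spans cycles 1 through 3, which is exactly the kind of split the blocklen/displs arrays record per process.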
- - /************************************************************************* - *** 7d. Calculate the displacement on where to put the data and allocate - *** the receive buffer (global_buf) + *** 7d. Calculate the displacement on where to put the data *************************************************************************/ if (aggregator == rank) { entries_per_aggregator=0; - for (i=0;i<data->procs_per_group; i++){ - for (j=0;j<data->disp_index[i];j++){ - if (data->blocklen_per_process[i][j] > 0) + for (i = 0; i < data->procs_per_group; i++){ + for (j = 0; j < data->disp_index[i];j++){ + if (data->blocklen_per_process[i][j] > 0) { entries_per_aggregator++ ; + } } } - #if DEBUG_ON - printf("%d: cycle: %d, bytes_sent: %d\n ",rank,index, - bytes_sent); - printf("%d : Entries per aggregator : %d\n",rank,entries_per_aggregator); + printf("%d : Entries per aggregator : %d\n", rank, entries_per_aggregator); #endif - - if (entries_per_aggregator > 0){ - file_offsets_for_agg = (mca_io_ompio_local_io_array *) - malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array)); - if (NULL == file_offsets_for_agg) { + + if (entries_per_aggregator > 0) { + file_offsets_for_agg = (mca_io_ompio_local_io_array *) malloc(entries_per_aggregator * + sizeof(mca_io_ompio_local_io_array)); + memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); + sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); + if (NULL == memory_displacements || NULL == file_offsets_for_agg || + NULL == sorted_file_offsets) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } - - sorted_file_offsets = (int *) - malloc (entries_per_aggregator*sizeof(int)); - if (NULL == sorted_file_offsets){ - opal_output (1, "OUT OF MEMORY\n"); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit; - } - - /*Moving file offsets to an IO array!*/ - temp_index = 0; - - for (i=0;i<data->procs_per_group; i++){ - for(j=0;j<data->disp_index[i];j++){ - if (data->blocklen_per_process[i][j] > 0){ - file_offsets_for_agg[temp_index].length = - data->blocklen_per_process[i][j]; - file_offsets_for_agg[temp_index].process_id = i; - file_offsets_for_agg[temp_index].offset = - data->displs_per_process[i][j]; - temp_index++; - -#if DEBUG_ON - printf("************Cycle: %d, Aggregator: %d ***************\n", - index+1,rank); - - printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", - data->procs_in_group[i],j, - data->blocklen_per_process[i][j],j, - data->displs_per_process[i][j], - rank); -#endif - } - } - } - - /* Sort the displacements for each aggregator*/ - local_heap_sort (file_offsets_for_agg, - entries_per_aggregator, - sorted_file_offsets); - - /*create contiguous memory displacements - based on blocklens on the same displs array - and map it to this aggregator's actual - file-displacements (this is in the io-array created above)*/ - memory_displacements = (MPI_Aint *) malloc - (entries_per_aggregator * sizeof(MPI_Aint)); - - memory_displacements[sorted_file_offsets[0]] = 0; - for (i=1; i<entries_per_aggregator; i++){ - memory_displacements[sorted_file_offsets[i]] = - memory_displacements[sorted_file_offsets[i-1]] + - file_offsets_for_agg[sorted_file_offsets[i-1]].length; - } - - temp_disp_index = (int *)calloc (1, data->procs_per_group * sizeof (int)); - if (NULL == temp_disp_index) { - opal_output (1, "OUT OF MEMORY\n"); - ret = OMPI_ERR_OUT_OF_RESOURCE; + + ret = mca_fcoll_vulcan_calc_file_offsets(data, file_offsets_for_agg, sorted_file_offsets, + memory_displacements, entries_per_aggregator, + rank, index); + if (OMPI_SUCCESS != ret) { goto exit; } - - /*Now update the displacements array with memory offsets*/ -#if DEBUG_ON - global_count = 0; -#endif - for (i=0;i<entries_per_aggregator;i++){ - temp_pindex = file_offsets_for_agg[sorted_file_offsets[i]].process_id; - data->displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] = - memory_displacements[sorted_file_offsets[i]]; - if (temp_disp_index[temp_pindex] < data->disp_index[temp_pindex]) - temp_disp_index[temp_pindex] += 1; - else{ - printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", - temp_pindex, temp_disp_index[temp_pindex], - temp_pindex, data->disp_index[temp_pindex]); - }
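Step 7d amounts to: sort all collected (offset, length) pairs by file offset, then give each pair its slot in the aggregator's receive buffer by prefix-summing the lengths in sorted order. The patch keeps local_heap_sort and the indirect sorted_file_offsets indexing for this; the sketch below shows the same idea with qsort and an invented io_piece struct:

#include <stdio.h>
#include <stdlib.h>

struct io_piece { long offset, length, mem_disp; };

static int cmp_offset(const void *a, const void *b)
{
    const struct io_piece *x = a, *y = b;
    return (x->offset > y->offset) - (x->offset < y->offset);
}

int main(void)
{
    /* pieces arrive grouped by process, not by file position */
    struct io_piece p[] = { {4096, 512, 0}, {0, 1024, 0}, {2048, 256, 0} };
    int n = 3;

    qsort(p, n, sizeof(*p), cmp_offset);  /* order by file offset */

    p[0].mem_disp = 0;                    /* first piece starts the buffer     */
    for (int i = 1; i < n; i++)           /* each piece follows the one before */
        p[i].mem_disp = p[i-1].mem_disp + p[i-1].length;

    for (int i = 0; i < n; i++)
        printf("file offset %5ld -> buffer offset %5ld (len %4ld)\n",
               p[i].offset, p[i].mem_disp, p[i].length);
    return 0;
}

Writing the buffer out in this order turns many scattered contributions into one mostly sequential file access, which is the point of the two-phase exchange.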
-#if DEBUG_ON - global_count += - file_offsets_for_agg[sorted_file_offsets[i]].length; -#endif - } - - if (NULL != temp_disp_index){ - free(temp_disp_index); - temp_disp_index = NULL; - } - -#if DEBUG_ON - - printf("************Cycle: %d, Aggregator: %d ***************\n", - index+1,rank); - for (i=0;i<data->procs_per_group; i++){ - for(j=0;j<data->disp_index[i];j++){ - if (data->blocklen_per_process[i][j] > 0){ - printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", - data->procs_in_group[i],j, - data->blocklen_per_process[i][j],j, - data->displs_per_process[i][j], - rank); - - } - } - } - printf("************Cycle: %d, Aggregator: %d ***************\n", - index+1,rank); - for (i=0; i<entries_per_aggregator;i++){ - printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld\n", - file_offsets_for_agg[sorted_file_offsets[i]].process_id, - file_offsets_for_agg[sorted_file_offsets[i]].offset, - file_offsets_for_agg[sorted_file_offsets[i]].length, - memory_displacements[sorted_file_offsets[i]]); - } -#endif - - for (i=0; i<data->procs_per_group; i++) { + for (i = 0; i < data->procs_per_group; i++) { size_t datatype_size; reqs[i] = MPI_REQUEST_NULL; - if ( 0 < data->disp_index[i] ) { + if (0 < data->disp_index[i]) { ompi_datatype_create_hindexed(data->disp_index[i], data->blocklen_per_process[i], data->displs_per_process[i], @@ -1239,7 +840,7 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i ompi_datatype_commit(&data->recvtype[i]); opal_datatype_type_size(&data->recvtype[i]->super, &datatype_size); - if (datatype_size){ + if (datatype_size) { ret = MCA_PML_CALL(irecv(data->global_buf, 1, data->recvtype[i], @@ -1278,8 +879,7 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i if(0 == block_index) { send_mem_address = (ptrdiff_t) (data->decoded_iov[data->iov_index].iov_base) + data->current_position; - } - else { + } else { // Reallocate more memory if blocklength_size is not enough if(0 == block_index % INIT_LEN) { blocklength_size += INIT_LEN; @@ -1290,17 +890,14 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i data->current_position - send_mem_address; } - if (remaining >= - (data->decoded_iov[data->iov_index].iov_len - data->current_position)) { - + if (remaining >= (data->decoded_iov[data->iov_index].iov_len - data->current_position)) { blocklength_proc[block_index] = data->decoded_iov[data->iov_index].iov_len - data->current_position; remaining = remaining - (data->decoded_iov[data->iov_index].iov_len - data->current_position); data->iov_index = data->iov_index + 1; data->current_position = 0; - } - else { + } else { blocklength_proc[block_index] = remaining; data->current_position += remaining; remaining = 0; @@ -1335,77 +932,23 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i } } -#if DEBUG_ON - if (aggregator == rank){ - printf("************Cycle: %d, Aggregator: %d ***************\n", - index+1,rank); - for (i=0 ; i<global_count/4 ; i++) - printf (" RECV %d \n",((int *)data->global_buf)[i]); - } -#endif - -//#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN -// end_comm_time = MPI_Wtime(); -// comm_time += (end_comm_time - start_comm_time); -//#endif
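In step 7e the aggregator wraps each sender's blocklen/displs lists in a single hindexed datatype, so one irecv delivers all of that sender's pieces directly to their precomputed spots in global_buf. The same mechanism at the plain MPI level, as a two-rank toy with made-up block layouts (not code from this patch):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank;
    char global_buf[9] = "........";  /* aggregator's receive buffer */

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int blocklens[2] = {2, 2};
    MPI_Aint displs[2] = {0, 4};      /* two 2-byte pieces, 4 bytes apart */
    MPI_Datatype recvtype;
    MPI_Type_create_hindexed(2, blocklens, displs, MPI_CHAR, &recvtype);
    MPI_Type_commit(&recvtype);

    if (rank == 0) {                  /* the aggregator */
        MPI_Recv(global_buf, 1, recvtype, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        printf("aggregator buffer: %s\n", global_buf);  /* "ab..cd.." */
    } else if (rank == 1) {           /* one contributing process */
        char send_buf[4] = {'a', 'b', 'c', 'd'};  /* contiguous on the sender */
        MPI_Send(send_buf, 4, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
    }
    MPI_Type_free(&recvtype);
    MPI_Finalize();
    return 0;
}

The component posts these as nonblocking irecvs (one per group member) and matches them with the isends built from blocklength_proc/displs_proc on the contributing side; blocking calls are used here only to keep the toy short.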
/********************************************************** *** 7f. Create the io array, and pass it to fbtl *********************************************************/ - - if (aggregator == rank && entries_per_aggregator>0) { - - - data->io_array = (mca_common_ompio_io_array_t *) malloc - (entries_per_aggregator * sizeof (mca_common_ompio_io_array_t)); + if (aggregator == rank && entries_per_aggregator > 0) { + data->io_array = (mca_common_ompio_io_array_t *) malloc (entries_per_aggregator * + sizeof (mca_common_ompio_io_array_t)); if (NULL == data->io_array) { opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } - - data->num_io_entries = 0; - /*First entry for every aggregator*/ - data->io_array[0].offset = - (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; - data->io_array[0].length = - file_offsets_for_agg[sorted_file_offsets[0]].length; - data->io_array[0].memory_address = - data->global_buf+memory_displacements[sorted_file_offsets[0]]; - data->num_io_entries++; - - for (i=1;i<entries_per_aggregator;i++){ - /* If the entries are contiguous merge them, - else make a new entry */ - if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + - file_offsets_for_agg[sorted_file_offsets[i-1]].length == - file_offsets_for_agg[sorted_file_offsets[i]].offset){ - data->io_array[data->num_io_entries - 1].length += - file_offsets_for_agg[sorted_file_offsets[i]].length; - } - else { - data->io_array[data->num_io_entries].offset = - (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; - data->io_array[data->num_io_entries].length = - file_offsets_for_agg[sorted_file_offsets[i]].length; - data->io_array[data->num_io_entries].memory_address = - data->global_buf+memory_displacements[sorted_file_offsets[i]]; - data->num_io_entries++; - } - - } - -#if DEBUG_ON - printf("*************************** %d\n", num_of_io_entries); - for (i=0 ; i<num_of_io_entries ; i++) { - printf(" ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", - io_array[i].memory_address, - (ptrdiff_t)io_array[i].offset, - io_array[i].length); - } -#endif + mca_fcoll_vulcan_calc_io_array(data->io_array, &data->num_io_entries, entries_per_aggregator, + (char*)data->global_buf, file_offsets_for_agg, sorted_file_offsets, + memory_displacements, rank); } - + exit: free(sorted_file_offsets); free(file_offsets_for_agg); @@ -1413,10 +956,10 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i free(blocklength_proc); free(displs_proc); - return OMPI_SUCCESS; + return ret; } -static int mca_fcoll_vulcan_minmax ( ompio_file_t *fh, struct iovec *iov, int iov_count, int num_aggregators, long *new_stripe_size) +int mca_fcoll_vulcan_minmax (ompio_file_t *fh, struct iovec *iov, int iov_count, int num_aggregators, long *new_stripe_size) { long min, max, globalmin, globalmax; long stripe_size; @@ -1430,12 +973,10 @@ static int mca_fcoll_vulcan_minmax ( ompio_file_t *fh, struct iovec *iov, int io max = 0; } fh->f_comm->c_coll->coll_allreduce ( &min, &globalmin, 1, MPI_LONG, MPI_MIN, - fh->f_comm, fh->f_comm->c_coll->coll_allreduce_module); + fh->f_comm, fh->f_comm->c_coll->coll_allreduce_module); fh->f_comm->c_coll->coll_allreduce ( &max, &globalmax, 1, MPI_LONG, MPI_MAX, - fh->f_comm, fh->f_comm->c_coll->coll_allreduce_module); - - // if ( fh->f_rank < 10 ) printf("[%d]: min=%ld max=%ld globalmin=%ld, globalmax=%ld num_aggregators=%d\n", fh->f_rank, min, max, globalmin, globalmax, num_aggregators); + fh->f_comm, fh->f_comm->c_coll->coll_allreduce_module); stripe_size = (globalmax - globalmin)/num_aggregators; if ( (globalmax - globalmin) % num_aggregators ) { @@ -1443,13 +984,9 @@ static int mca_fcoll_vulcan_minmax ( ompio_file_t *fh, struct iovec *iov, int io } *new_stripe_size = stripe_size; - // if ( fh->f_rank == 0 ) - // printf(" partition size is %ld\n", stripe_size); return OMPI_SUCCESS; } - - int mca_fcoll_vulcan_break_file_view ( struct iovec *mem_iov, int mem_count, struct iovec *file_iov, int file_count,
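mca_fcoll_vulcan_minmax, made extern above, boils down to two allreduces and a rounded-up division: find the lowest and highest file offset touched by any process, then split that span evenly over the aggregators. With illustrative numbers (a 1 MiB span, 4 aggregators; not values from this patch):

#include <stdio.h>

int main(void)
{
    long globalmin = 4096;      /* result of the MPI_MIN allreduce */
    long globalmax = 1052672;   /* result of the MPI_MAX allreduce */
    int num_aggregators = 4;

    long stripe_size = (globalmax - globalmin) / num_aggregators;
    if ((globalmax - globalmin) % num_aggregators)
        stripe_size++;          /* round up so the stripes cover the whole span */

    printf("stripe size: %ld bytes\n", stripe_size);  /* 1048576 / 4 = 262144 */
    return 0;
}

Each aggregator then owns one contiguous stripe of [globalmin, globalmax), which mca_fcoll_vulcan_break_file_view uses to split the memory and file iovecs per aggregator.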
@@ -1678,21 +1215,11 @@ int mca_fcoll_vulcan_break_file_view ( struct iovec *mem_iov, int mem_count, return ret; } - -int mca_fcoll_vulcan_get_configuration (ompio_file_t *fh, int num_io_procs, int num_groups, - size_t max_data) +int mca_fcoll_vulcan_get_configuration (ompio_file_t *fh, int num_io_procs, size_t max_data) { int i, ret; - ret = mca_common_ompio_set_aggregator_props (fh, num_io_procs, max_data); - /* Note: as of this version of the vulcan component, we are not using yet - the num_groups parameter to split the aggregators (and processes) into - distinct subgroups. This will however hopefullty be done in a second step - as well, allowing to keep communication just to individual subgroups of processes, - each subgroup using however the classic two-phase collective I/O algorithm - with multiple aggregators and even partitioning internally. - - For now, logically all processes are in a single group. */ + ret = mca_common_ompio_set_aggregator_props (fh, num_io_procs, max_data); fh->f_procs_per_group = fh->f_size; if ( NULL != fh->f_procs_in_group ) { @@ -1708,63 +1235,269 @@ int mca_fcoll_vulcan_get_configuration (ompio_file_t *fh, int num_io_procs, int return ret; } - -int mca_fcoll_vulcan_split_iov_array ( ompio_file_t *fh, mca_common_ompio_io_array_t *io_array, int num_entries, - int *ret_array_pos, int *ret_pos, int chunk_size ) +void mca_fcoll_vulcan_calc_blocklen_disps (mca_io_ompio_aggregator_data *data, int aggregator, + int rank, size_t *bytes_comm) { + size_t bytes_tmp = *bytes_comm; + int blocks = 0; + int j; - int array_pos = *ret_array_pos; - int pos = *ret_pos; - size_t bytes_written = 0; - size_t bytes_to_write = chunk_size; + /* The blocklen and displs calculation only done at aggregators */ + while (data->bytes_to_write_in_cycle) { - if ( 0 == array_pos && 0 == pos ) { - fh->f_io_array = (mca_common_ompio_io_array_t *) malloc ( num_entries * sizeof(mca_common_ompio_io_array_t)); - if ( NULL == fh->f_io_array ){ - opal_output (1,"Could not allocate memory\n"); - return -1; + /* This next block identifies which process is the holder + ** of the sorted[current_index] element; + */ + blocks = data->fview_count[0]; + for (j = 0 ; j < data->procs_per_group ; j++) { + if (data->sorted[data->current_index] < blocks) { + data->n = j; + break; + } else { + blocks += data->fview_count[j+1]; + } + } + + if (data->bytes_remaining) { + /* Finish up a partially used buffer from the previous cycle */ + + if (data->bytes_remaining <= data->bytes_to_write_in_cycle) { + /* The data fits completely into the block */ + if (aggregator == rank) { + data->blocklen_per_process[data->n][data->disp_index[data->n]] = data->bytes_remaining; + data->displs_per_process[data->n][data->disp_index[data->n]] = + (ptrdiff_t)data->global_iov_array[data->sorted[data->current_index]].iov_base + + (data->global_iov_array[data->sorted[data->current_index]].iov_len + - data->bytes_remaining); + + data->disp_index[data->n] += 1; + + /* In this case the length is consumed so allocating for + next displacement and blocklength*/ + if (data->disp_index[data->n] == data->max_disp_index[data->n]) { + data->max_disp_index[data->n] *= 2; + + data->blocklen_per_process[data->n] = (int *) realloc + ((void *)data->blocklen_per_process[data->n], + (data->max_disp_index[data->n])*sizeof(int)); + data->displs_per_process[data->n] = (MPI_Aint *) realloc + ((void *)data->displs_per_process[data->n], + (data->max_disp_index[data->n])*sizeof(MPI_Aint)); + } + data->blocklen_per_process[data->n][data->disp_index[data->n]] = 0; + data->displs_per_process[data->n][data->disp_index[data->n]] = 0; + } + if 
(data->procs_in_group[data->n] == rank) { + bytes_tmp += data->bytes_remaining; + } + data->current_index ++; + data->bytes_to_write_in_cycle -= data->bytes_remaining; + data->bytes_remaining = 0; + } else { + /* the remaining data from the previous cycle is larger than the + data->bytes_to_write_in_cycle, so we have to segment again */ + if (aggregator == rank) { + data->blocklen_per_process[data->n][data->disp_index[data->n]] = data->bytes_to_write_in_cycle; + data->displs_per_process[data->n][data->disp_index[data->n]] = + (ptrdiff_t)data->global_iov_array[data->sorted[data->current_index]].iov_base + + (data->global_iov_array[data->sorted[data->current_index]].iov_len + - data->bytes_remaining); + data->disp_index[data->n] += 1; + } + + if (data->procs_in_group[data->n] == rank) { + bytes_tmp += data->bytes_to_write_in_cycle; + } + data->bytes_remaining -= data->bytes_to_write_in_cycle; + data->bytes_to_write_in_cycle = 0; + break; + } + } else { + /* No partially used entry available, have to start a new one */ + if (data->bytes_to_write_in_cycle < + (MPI_Aint) data->global_iov_array[data->sorted[data->current_index]].iov_len) { + /* This entry has more data than we can send in one cycle */ + if (aggregator == rank) { + data->blocklen_per_process[data->n][data->disp_index[data->n]] = data->bytes_to_write_in_cycle; + data->displs_per_process[data->n][data->disp_index[data->n]] = + (ptrdiff_t)data->global_iov_array[data->sorted[data->current_index]].iov_base ; + data->disp_index[data->n] += 1; + } + if (data->procs_in_group[data->n] == rank) { + bytes_tmp += data->bytes_to_write_in_cycle; + } + data->bytes_remaining = data->global_iov_array[data->sorted[data->current_index]].iov_len - + data->bytes_to_write_in_cycle; + data->bytes_to_write_in_cycle = 0; + break; + } else { + /* Next data entry is less than data->bytes_to_write_in_cycle */ + if (aggregator == rank) { + data->blocklen_per_process[data->n][data->disp_index[data->n]] = + data->global_iov_array[data->sorted[data->current_index]].iov_len; + data->displs_per_process[data->n][data->disp_index[data->n]] = (ptrdiff_t) + data->global_iov_array[data->sorted[data->current_index]].iov_base; + + data->disp_index[data->n] += 1; + + /* realloc for next blocklength and assign this displacement + ** and check for next displs as the total length of this entry + ** has been consumed */ + if (data->disp_index[data->n] == data->max_disp_index[data->n]) { + data->max_disp_index[data->n] *= 2; + data->blocklen_per_process[data->n] = (int *) realloc ( + (void *)data->blocklen_per_process[data->n], + (data->max_disp_index[data->n]*sizeof(int))); + data->displs_per_process[data->n] = (MPI_Aint *)realloc ( + (void *)data->displs_per_process[data->n], + (data->max_disp_index[data->n]*sizeof(MPI_Aint))); + } + data->blocklen_per_process[data->n][data->disp_index[data->n]] = 0; + data->displs_per_process[data->n][data->disp_index[data->n]] = 0; + } + if (data->procs_in_group[data->n] == rank) { + bytes_tmp += data->global_iov_array[data->sorted[data->current_index]].iov_len; + } + data->bytes_to_write_in_cycle -= + data->global_iov_array[data->sorted[data->current_index]].iov_len; + data->current_index ++; + } } } - - int i=0; - while (bytes_to_write > 0 ) { - fh->f_io_array[i].memory_address = &(((char *)io_array[array_pos].memory_address)[pos]); - fh->f_io_array[i].offset = &(((char *)io_array[array_pos].offset)[pos]); - if ( (io_array[array_pos].length - pos ) >= bytes_to_write ) { - fh->f_io_array[i].length = bytes_to_write; + *bytes_comm = 
bytes_tmp; +} + +int mca_fcoll_vulcan_calc_file_offsets(mca_io_ompio_aggregator_data *data, mca_io_ompio_local_io_array *file_offsets_for_agg, + int *sorted_file_offsets, MPI_Aint *memory_displacements, int entries_per_aggregator, + int rank, int index) +{ + int *temp_disp_index; + int temp_index = 0; + int temp_pindex; + int i, j; + + /* Moving file offsets to an IO array */ + for (i = 0; i < data->procs_per_group; i++){ + for(j = 0; j < data->disp_index[i];j++){ + if (data->blocklen_per_process[i][j] > 0){ + file_offsets_for_agg[temp_index].length = + data->blocklen_per_process[i][j]; + file_offsets_for_agg[temp_index].process_id = i; + file_offsets_for_agg[temp_index].offset = + data->displs_per_process[i][j]; + temp_index++; + } + } + } + + /* Sort the displacements for each aggregator */ + local_heap_sort (file_offsets_for_agg, entries_per_aggregator, + sorted_file_offsets); + + /* create contiguous memory displacements based on blocklens + ** on the same displs array and map it to this aggregator's actual + ** file-displacements */ + memory_displacements[sorted_file_offsets[0]] = 0; + for (i = 1; i < entries_per_aggregator; i++){ + memory_displacements[sorted_file_offsets[i]] = + memory_displacements[sorted_file_offsets[i-1]] + + file_offsets_for_agg[sorted_file_offsets[i-1]].length; + } + + temp_disp_index = (int *)calloc (1, data->procs_per_group * sizeof (int)); + if (NULL == temp_disp_index) { + opal_output (1, "OUT OF MEMORY\n"); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* Now update the displacements array with memory offsets */ + for (i = 0; i < entries_per_aggregator;i++) { + temp_pindex = file_offsets_for_agg[sorted_file_offsets[i]].process_id; + data->displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] = + memory_displacements[sorted_file_offsets[i]]; + if (temp_disp_index[temp_pindex] < data->disp_index[temp_pindex]) { + temp_disp_index[temp_pindex] += 1; } else { - fh->f_io_array[i].length = io_array[array_pos].length - pos; + printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", + temp_pindex, temp_disp_index[temp_pindex], + temp_pindex, data->disp_index[temp_pindex]); } + } - pos += fh->f_io_array[i].length; - bytes_written += fh->f_io_array[i].length; - bytes_to_write-= fh->f_io_array[i].length; - i++; + free(temp_disp_index); - if ( pos == (int)io_array[array_pos].length ) { - pos = 0; - if ((array_pos + 1) < num_entries) { - array_pos++; - } - else { - break; +#if DEBUG_ON + printf("************Cycle: %d, Aggregator: %d ***************\n", + index+1, rank); + for (i = 0; i < data->procs_per_group; i++){ + for(j = 0; j < data->disp_index[i]; j++){ + if (data->blocklen_per_process[i][j] > 0){ + printf("%d communicate blocklen[%d]: %d, disp[%d]: %ld to %d\n", + data->procs_in_group[i],j, + data->blocklen_per_process[i][j],j, + data->displs_per_process[i][j], + rank); } } } - - fh->f_num_of_io_entries = i; - *ret_array_pos = array_pos; - *ret_pos = pos; - return bytes_written; + printf("************Cycle: %d, Aggregator: %d ***************\n", + index+1, rank); + for (i = 0; i < entries_per_aggregator;i++){ + printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld\n", + file_offsets_for_agg[sorted_file_offsets[i]].process_id, + file_offsets_for_agg[sorted_file_offsets[i]].offset, + file_offsets_for_agg[sorted_file_offsets[i]].length, + memory_displacements[sorted_file_offsets[i]]); + } +#endif + + return OMPI_SUCCESS; +} + +void mca_fcoll_vulcan_calc_io_array(mca_common_ompio_io_array_t *io_array, int *num_io_entries, int max_io_entries, + char 
*global_buf, mca_io_ompio_local_io_array *file_offsets_for_agg, + int *sorted_offsets, MPI_Aint *memory_displacements, int rank) +{ + int i; + int num_entries; + + /* First entry for every aggregator */ + io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_offsets[0]].offset; + io_array[0].length = file_offsets_for_agg[sorted_offsets[0]].length; + io_array[0].memory_address = global_buf + memory_displacements[sorted_offsets[0]]; + num_entries = 1; + + /* If the entries are contiguous merge them, else add a new entry */ + for (i = 1; i < max_io_entries; i++) { + if (file_offsets_for_agg[sorted_offsets[i-1]].offset + + file_offsets_for_agg[sorted_offsets[i-1]].length == + file_offsets_for_agg[sorted_offsets[i]].offset) { + io_array[num_entries - 1].length += file_offsets_for_agg[sorted_offsets[i]].length; + } else { + io_array[num_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_offsets[i]].offset; + io_array[num_entries].length = file_offsets_for_agg[sorted_offsets[i]].length; + io_array[num_entries].memory_address = global_buf + memory_displacements[sorted_offsets[i]]; + num_entries++; + } + } + + *num_io_entries = num_entries; +#if DEBUG_ON + printf("*************************** %d\n", num_entries); + for (i = 0; i < num_entries; i++) { + printf(" AGGREGATOR %d ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", + rank, io_array[i].memory_address, + (ptrdiff_t)io_array[i].offset, + io_array[i].length); + } +#endif } - static int local_heap_sort (mca_io_ompio_local_io_array *io_array, - int num_entries, - int *sorted) + int num_entries, int *sorted) { int i = 0; int j = 0; @@ -1864,5 +1597,3 @@ static int local_heap_sort (mca_io_ompio_local_io_array *io_array, } return OMPI_SUCCESS; } - - diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan_internal.h b/ompi/mca/fcoll/vulcan/fcoll_vulcan_internal.h new file mode 100644 index 00000000000..76402297044 --- /dev/null +++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan_internal.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_FCOLL_VULCAN_INTERNAL_H +#define MCA_FCOLL_VULCAN_INTERNAL_H + +#include "ompi_config.h" + + +BEGIN_C_DECLS +/* Used for loading file-offsets per aggregator*/ +typedef struct mca_io_ompio_local_io_array{ + OMPI_MPI_OFFSET_TYPE offset; + MPI_Aint length; + int process_id; +}mca_io_ompio_local_io_array; + +typedef struct mca_io_ompio_aggregator_data { + int *disp_index, *sorted, n; + size_t *fview_count; + int *max_disp_index; + int **blocklen_per_process; + MPI_Aint **displs_per_process, total_bytes, bytes_per_cycle, total_bytes_written; + MPI_Comm comm; + char *global_buf, *prev_global_buf; + ompi_datatype_t **recvtype, **prev_recvtype; + struct iovec *global_iov_array; + int current_index, current_position; + int bytes_to_write_in_cycle, bytes_remaining, procs_per_group; + int *procs_in_group, iov_index; + size_t bytes_sent, prev_bytes_sent; + struct iovec *decoded_iov; + int bytes_to_write, prev_bytes_to_write; + mca_common_ompio_io_array_t *io_array, *prev_io_array; + int num_io_entries, prev_num_io_entries; +} mca_io_ompio_aggregator_data; + + +#define SWAP_REQUESTS(_r1,_r2) { \ + ompi_request_t **_t=_r1; \ + _r1=_r2; \ + _r2=_t;} + +#define SWAP_AGGR_POINTERS(_aggr,_num) { \ + int _i; \ + char *_t; \ + for (_i=0; _i<_num; _i++ ) { \ + _aggr[_i]->prev_io_array=_aggr[_i]->io_array; \ + _aggr[_i]->prev_num_io_entries=_aggr[_i]->num_io_entries; \ + _aggr[_i]->prev_bytes_sent=_aggr[_i]->bytes_sent; \ + _aggr[_i]->prev_bytes_to_write=_aggr[_i]->bytes_to_write; \ + _t=_aggr[_i]->prev_global_buf; \ + _aggr[_i]->prev_global_buf=_aggr[_i]->global_buf; \ + _aggr[_i]->global_buf=_t; \ + _t=(char *)_aggr[_i]->recvtype; \ + _aggr[_i]->recvtype=_aggr[_i]->prev_recvtype; \ + _aggr[_i]->prev_recvtype=(ompi_datatype_t **)_t; } \ +} + +int mca_fcoll_vulcan_break_file_view (struct iovec *decoded_iov, int iov_count, + struct iovec *local_iov_array, int local_count, + struct iovec ***broken_decoded_iovs, int **broken_iov_counts, + struct iovec ***broken_iov_arrays, int **broken_counts, + MPI_Aint **broken_total_lengths, + int stripe_count, size_t stripe_size); + +int mca_fcoll_vulcan_get_configuration (ompio_file_t *fh, int num_io_procs, + size_t max_data); + +int mca_fcoll_vulcan_minmax (ompio_file_t *fh, struct iovec *iov, int iov_count, + int num_aggregators, long *new_stripe_size); + +void mca_fcoll_vulcan_calc_blocklen_disps (mca_io_ompio_aggregator_data *data, int aggregator, + int rank, size_t *bytes_comm); + +int mca_fcoll_vulcan_calc_file_offsets(mca_io_ompio_aggregator_data *data, + mca_io_ompio_local_io_array *file_offsets_for_agg, + int *sorted_file_offsets, MPI_Aint *memory_displacements, + int entries_per_aggregator, int rank, int index); + +void mca_fcoll_vulcan_calc_io_array(mca_common_ompio_io_array_t *io_array, int *num_io_entries, int max_io_arrays, + char *global_buf, mca_io_ompio_local_io_array *file_offsets_for_agg, + int *sorted_offsets, MPI_Aint *memory_displacements, int rank); + +END_C_DECLS + +#endif /* MCA_FCOLL_VULCAN_INTERNAL_H */ diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index dd16a27b154..c748b02e12f 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -739,6 +739,8 @@ int mca_pml_ucx_isend_init(const void *buf, size_t count, ompi_datatype_t *datat struct ompi_communicator_t* comm, struct ompi_request_t **request) { + int rc; + uint32_t cid; mca_pml_ucx_persistent_request_t *req; ucp_ep_h ep; @@ -755,12 +757,17 @@ int 
mca_pml_ucx_isend_init(const void *buf, size_t count, ompi_datatype_t *datat return OMPI_ERROR; } + rc = ompi_comm_get_remote_cid(comm, dst, &cid); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + req->ompi.req_state = OMPI_REQUEST_INACTIVE; req->ompi.req_mpi_object.comm = comm; req->flags = MCA_PML_UCX_REQUEST_FLAG_SEND; req->buffer = (void *)buf; req->count = count; - req->tag = PML_UCX_MAKE_SEND_TAG(tag, comm); + req->tag = PML_UCX_MAKE_SEND_TAG(tag, comm, cid); req->send.mode = mode; req->send.ep = ep; req->ompi_datatype = datatype; @@ -885,7 +892,9 @@ int mca_pml_ucx_isend(const void *buf, size_t count, ompi_datatype_t *datatype, struct ompi_communicator_t* comm, struct ompi_request_t **request) { + int rc; ompi_request_t *req; + uint32_t cid; ucp_ep_h ep; PML_UCX_TRACE_SEND("i%ssend request *%p", @@ -897,15 +906,18 @@ int mca_pml_ucx_isend(const void *buf, size_t count, ompi_datatype_t *datatype, if (OPAL_UNLIKELY(NULL == ep)) { return OMPI_ERROR; } - + rc = ompi_comm_get_remote_cid(comm, dst, &cid); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } #if HAVE_DECL_UCP_TAG_SEND_NBX req = (ompi_request_t*)mca_pml_ucx_common_send_nbx(ep, buf, count, datatype, - PML_UCX_MAKE_SEND_TAG(tag, comm), mode, + PML_UCX_MAKE_SEND_TAG(tag, comm, cid), mode, &mca_pml_ucx_get_op_data(datatype)->op_param.isend); #else req = (ompi_request_t*)mca_pml_ucx_common_send(ep, buf, count, datatype, mca_pml_ucx_get_datatype(datatype), - PML_UCX_MAKE_SEND_TAG(tag, comm), mode, + PML_UCX_MAKE_SEND_TAG(tag, comm, cid), mode, mca_pml_ucx_send_completion); #endif @@ -1002,7 +1014,9 @@ int mca_pml_ucx_send(const void *buf, size_t count, ompi_datatype_t *datatype, i int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm) { + int rc; ucp_ep_h ep; + uint32_t cid; PML_UCX_TRACE_SEND("%s", buf, count, datatype, dst, tag, mode, comm, mode == MCA_PML_BASE_SEND_BUFFERED ? "bsend" : "send"); @@ -1019,17 +1033,22 @@ int mca_pml_ucx_send(const void *buf, size_t count, ompi_datatype_t *datatype, i OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI); #endif + rc = ompi_comm_get_remote_cid(comm, dst, &cid); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + #if HAVE_DECL_UCP_TAG_SEND_NBR if (OPAL_LIKELY((MCA_PML_BASE_SEND_BUFFERED != mode) && (MCA_PML_BASE_SEND_SYNCHRONOUS != mode))) { return mca_pml_ucx_send_nbr(ep, buf, count, datatype, - PML_UCX_MAKE_SEND_TAG(tag, comm)); + PML_UCX_MAKE_SEND_TAG(tag, comm, cid)); } #endif return mca_pml_ucx_send_nb(ep, buf, count, datatype, mca_pml_ucx_get_datatype(datatype), - PML_UCX_MAKE_SEND_TAG(tag, comm), mode); + PML_UCX_MAKE_SEND_TAG(tag, comm, cid), mode); } int mca_pml_ucx_iprobe(int src, int tag, struct ompi_communicator_t* comm, diff --git a/ompi/mca/pml/ucx/pml_ucx_component.c b/ompi/mca/pml/ucx/pml_ucx_component.c index 5639e2b1f34..ec095e19fef 100644 --- a/ompi/mca/pml/ucx/pml_ucx_component.c +++ b/ompi/mca/pml/ucx/pml_ucx_component.c @@ -145,6 +145,10 @@ mca_pml_ucx_component_init(int* priority, bool enable_progress_threads, *priority = (support_level == OPAL_COMMON_UCX_SUPPORT_DEVICE) ? 
ompi_pml_ucx.priority : 19; PML_UCX_VERBOSE(2, "returning priority %d", *priority); + + /** this pml supports the extended CID space */ + ompi_pml_ucx.super.pml_flags |= MCA_PML_BASE_FLAG_SUPPORTS_EXT_CID; + return &ompi_pml_ucx.super; } diff --git a/ompi/mca/pml/ucx/pml_ucx_request.c b/ompi/mca/pml/ucx/pml_ucx_request.c index fccb9f6a6f6..1a8d0dbc043 100644 --- a/ompi/mca/pml/ucx/pml_ucx_request.c +++ b/ompi/mca/pml/ucx/pml_ucx_request.c @@ -282,7 +282,7 @@ void mca_pml_ucx_completed_request_init(ompi_request_t *ompi_req) mca_pml_ucx_request_init_common(ompi_req, false, OMPI_REQUEST_ACTIVE, mca_pml_completed_request_free, mca_pml_completed_request_cancel); - ompi_req->req_mpi_object.comm = &ompi_mpi_comm_world.comm; + ompi_req->req_mpi_object.comm = &ompi_mpi_comm_null.comm; ompi_request_complete(ompi_req, false); } diff --git a/ompi/mca/pml/ucx/pml_ucx_request.h b/ompi/mca/pml/ucx/pml_ucx_request.h index 8132f6b54ba..9e901794f8d 100644 --- a/ompi/mca/pml/ucx/pml_ucx_request.h +++ b/ompi/mca/pml/ucx/pml_ucx_request.h @@ -43,10 +43,10 @@ enum { #define PML_UCX_TAG_MASK 0x7fffff0000000000ul -#define PML_UCX_MAKE_SEND_TAG(_tag, _comm) \ +#define PML_UCX_MAKE_SEND_TAG(_tag, _comm, _c_index) \ ((((uint64_t) (_tag) ) << (PML_UCX_RANK_BITS + PML_UCX_CONTEXT_BITS)) | \ (((uint64_t)(_comm)->c_my_rank ) << PML_UCX_CONTEXT_BITS) | \ - ((uint64_t)(_comm)->c_index)) + ((uint64_t)(_c_index))) #define PML_UCX_MAKE_RECV_TAG(_ucp_tag, _ucp_tag_mask, _tag, _src, _comm) \ diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index c31e47e4af8..cbc82f42f23 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -20,7 +20,7 @@ * All rights reserved. * Copyright (c) 2016-2021 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2018-2021 Triad National Security, LLC. All rights + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. @@ -104,6 +104,7 @@ bool ompi_ftmpi_enabled = false; #endif /* OPAL_ENABLE_FT_MPI */ static int ompi_stream_buffering_mode = -1; +int ompi_comm_verbose_level = 0; int ompi_mpi_register_params(void) { @@ -445,6 +446,10 @@ int ompi_mpi_register_params(void) } #endif /* OPAL_ENABLE_FT_MPI */ + (void) mca_base_var_register ("ompi", "mpi", "comm", "verbose", + "Verbosity level for communicator management subsystem", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &ompi_comm_verbose_level); return OMPI_SUCCESS; } diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index d9f48f80b59..db4e9043d7b 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -16,7 +16,7 @@ * Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Intel, Inc. All rights reserved - * Copyright (c) 2018-2021 Triad National Security, LLC. All rights + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. * $COPYRIGHT$ @@ -191,6 +191,12 @@ OMPI_DECLSPEC extern bool ompi_enable_timing; OMPI_DECLSPEC extern int ompi_mpi_event_tick_rate; OMPI_DECLSPEC extern bool ompi_mpi_yield_when_idle; + /** + * An integer value specifying verbosity level for communicator management + * subsystem. 
+ */ +OMPI_DECLSPEC extern int ompi_comm_verbose_level; + /** * Register MCA parameters used by the MPI layer. * diff --git a/opal/mca/base/mca_base_var_group.c b/opal/mca/base/mca_base_var_group.c index 7bdb656b09c..732df663e98 100644 --- a/opal/mca/base/mca_base_var_group.c +++ b/opal/mca/base/mca_base_var_group.c @@ -15,6 +15,8 @@ * reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2024 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -374,6 +376,8 @@ int mca_base_var_group_deregister(int group_index) for (int i = 0; i < size; ++i) { OBJ_RELEASE(enums[i]); } + opal_value_array_set_size(&group->group_enums, 0); + size = opal_value_array_get_size(&group->group_subgroups); subgroups = OPAL_VALUE_ARRAY_GET_BASE(&group->group_subgroups, int); diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 78df6003212..f6195b41af3 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -324,10 +324,11 @@ int opal_common_ofi_providers_subset_of_list(struct fi_info *provider_list, char int opal_common_ofi_mca_register(const mca_base_component_t *component) { - static int include_index = -1; - static int exclude_index = -1; - static int verbose_index = -1; - static int accelerator_rank_index = -1; + int include_index; + int exclude_index; + int verbose_index; + int accelerator_rank_index; + int param; int ret; if (fi_version() < FI_VERSION(1, 0)) { @@ -336,7 +337,8 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) OPAL_THREAD_LOCK(&opal_common_ofi_mutex); - if (0 > include_index) { + param = mca_base_var_find("opal", "opal_common", "ofi", "provider_include"); + if (0 > param) { /* * this monkey business is needed because of the way the MCA VARs stuff tries to handle * pointers to strings when when destructing the MCA var database. 
If you don't do @@ -359,9 +361,12 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) ret = include_index; goto err; } + } else { + include_index = param; } - if (0 > exclude_index) { + param = mca_base_var_find("opal", "opal_common", "ofi", "provider_exclude"); + if (0 > param) { if (NULL == opal_common_ofi.prov_exclude) { opal_common_ofi.prov_exclude = (char **) malloc(sizeof(char *)); assert(NULL != opal_common_ofi.prov_exclude); @@ -378,9 +383,12 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) ret = exclude_index; goto err; } + } else { + exclude_index = param; } - if (0 > verbose_index) { + param = mca_base_var_find("opal", "opal_common", "ofi", "verbose"); + if (0 > param) { verbose_index = mca_base_var_register("opal", "opal_common", "ofi", "verbose", "Verbose level of the OFI components", MCA_BASE_VAR_TYPE_INT, NULL, 0, @@ -391,9 +399,13 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) ret = verbose_index; goto err; } + } else { + verbose_index = param; } - if (0 > accelerator_rank_index) { + + param = mca_base_var_find("opal", "opal_common", "ofi", "accelerator_rank"); + if (0 > param) { accelerator_rank_index = mca_base_var_register("opal", "opal_common", "ofi", "accelerator_rank", "Process rank(non-negative) on the selected accelerator device", @@ -404,6 +416,8 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) ret = accelerator_rank_index; goto err; } + } else { + accelerator_rank_index = param; } if (component) { diff --git a/opal/mca/pmix/pmix-internal.h b/opal/mca/pmix/pmix-internal.h index 4e10393f60f..3c00306f501 100644 --- a/opal/mca/pmix/pmix-internal.h +++ b/opal/mca/pmix/pmix-internal.h @@ -9,7 +9,7 @@ * reserved. * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * All Rights reserved. - * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * Copyright (c) 2021 Argonne National Laboratory. All rights * reserved. 
* $COPYRIGHT$ @@ -293,7 +293,7 @@ typedef struct { OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \ "%s[%s:%d] MODEX RECV VALUE OPTIONAL FOR PROC %s KEY %s", \ OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, \ - OPAL_NAME_PRINT(*(p)), (s))); \ + OPAL_NAME_PRINT(*(p)), PMIx_Get_attribute_name(s))); \ OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \ PMIX_INFO_LOAD(&_info, PMIX_OPTIONAL, NULL, PMIX_BOOL); \ (r) = PMIx_Get(&(_proc), (s), &(_info), 1, &(_kv)); \ @@ -334,7 +334,7 @@ typedef struct { OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \ "%s[%s:%d] MODEX RECV VALUE IMMEDIATE FOR PROC %s KEY %s", \ OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, \ - OPAL_NAME_PRINT(*(p)), (s))); \ + OPAL_NAME_PRINT(*(p)), PMIx_Get_attribute_name(s))); \ OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \ PMIX_INFO_LOAD(&_info, PMIX_IMMEDIATE, NULL, PMIX_BOOL); \ (r) = PMIx_Get(&(_proc), (s), &(_info), 1, &(_kv)); \ @@ -370,7 +370,8 @@ typedef struct { size_t _sz; \ OPAL_OUTPUT_VERBOSE( \ (1, opal_pmix_verbose_output, "%s[%s:%d] MODEX RECV VALUE FOR PROC %s KEY %s", \ - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, OPAL_NAME_PRINT(*(p)), (s))); \ + OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, OPAL_NAME_PRINT(*(p)), \ + PMIx_Get_attribute_name(s))); \ OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \ (r) = PMIx_Get(&(_proc), (s), NULL, 0, &(_kv)); \ if (NULL == _kv) { \ @@ -406,7 +407,7 @@ typedef struct { OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \ "%s[%s:%d] MODEX RECV STRING OPTIONAL FOR PROC %s KEY %s", \ OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, \ - OPAL_NAME_PRINT(*(p)), (s))); \ + OPAL_NAME_PRINT(*(p)), PMIx_Get_attribute_name(s))); \ *(d) = NULL; \ *(sz) = 0; \ OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \ @@ -444,7 +445,8 @@ typedef struct { pmix_info_t _info; \ OPAL_OUTPUT_VERBOSE( \ (1, opal_pmix_verbose_output, "%s[%s:%d] MODEX RECV STRING FOR PROC %s KEY %s", \ - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, OPAL_NAME_PRINT(*(p)), (s))); \ + OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, OPAL_NAME_PRINT(*(p)), \ + PMIx_Get_attribute_name(s))); \ *(d) = NULL; \ *(sz) = 0; \ OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \