diff --git a/.github/workflows/ompi_mpi4py.yaml b/.github/workflows/ompi_mpi4py.yaml index 7df4939e5e7..8e3c450b4be 100644 --- a/.github/workflows/ompi_mpi4py.yaml +++ b/.github/workflows/ompi_mpi4py.yaml @@ -77,7 +77,6 @@ jobs: mkdir -p "$(dirname "$mca_params")" echo mpi_param_check = true >> "$mca_params" echo mpi_show_handle_leaks = true >> "$mca_params" - echo rmaps_base_oversubscribe = true >> "$mca_params" mca_params="$HOME/.prte/mca-params.conf" mkdir -p "$(dirname "$mca_params")" echo rmaps_default_mapping_policy = :oversubscribe >> "$mca_params" diff --git a/3rd-party/openpmix b/3rd-party/openpmix index e32e0179bc6..08e41ed5629 160000 --- a/3rd-party/openpmix +++ b/3rd-party/openpmix @@ -1 +1 @@ -Subproject commit e32e0179bc6bd1637f92690511ce6091719fa046 +Subproject commit 08e41ed5629b51832f5708181af6d89218c7a74e diff --git a/3rd-party/prrte b/3rd-party/prrte index 0f0a90006cb..30cadc6746e 160000 --- a/3rd-party/prrte +++ b/3rd-party/prrte @@ -1 +1 @@ -Subproject commit 0f0a90006cbc880d499b2356d6076e785e7868ba +Subproject commit 30cadc6746ebddd69ea42ca78b964398f782e4e3 diff --git a/VERSION b/VERSION index 4b702009b78..f1c8928faf6 100644 --- a/VERSION +++ b/VERSION @@ -27,7 +27,7 @@ mpi_standard_subversion=1 # List in x.y.z format. pmix_min_version=4.2.0 prte_min_version=3.0.0 -hwloc_min_version=1.11.0 +hwloc_min_version=2.1.0 event_min_version=2.0.21 automake_min_version=1.13.4 autoconf_min_version=2.69.0 diff --git a/autogen.pl b/autogen.pl index 51a10ee57a8..16f5981f99f 100755 --- a/autogen.pl +++ b/autogen.pl @@ -5,7 +5,7 @@ # Copyright (c) 2013 Mellanox Technologies, Inc. # All rights reserved. # Copyright (c) 2013-2020 Intel, Inc. All rights reserved. -# Copyright (c) 2015-2021 Research Organization for Information Science +# Copyright (c) 2015-2024 Research Organization for Information Science # and Technology (RIST). All rights reserved. # Copyright (c) 2015-2022 IBM Corporation. All rights reserved. # Copyright (c) 2020 Amazon.com, Inc. or its affiliates. @@ -891,9 +891,9 @@ sub patch_autotools_output { # source tree); we can't fix it. So all we can do is patch the # resulting configure script. :-( push(@verbose_out, $indent_str . "Patching configure for Libtool PGI 10 fortran compiler name\n"); - $c =~ s/gfortran g95 xlf95 f95 fort ifort ifc efc pgf95 lf95 ftn/gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn/g; - $c =~ s/pgcc\* \| pgf77\* \| pgf90\* \| pgf95\*\)/pgcc* | pgf77* | pgf90* | pgf95* | pgfortran*)/g; - $c =~ s/pgf77\* \| pgf90\* \| pgf95\*\)/pgf77* | pgf90* | pgf95* | pgfortran*)/g; + $c =~ s/gfortran g95 xlf95 f95 fort ifort ifc efc pgf95 lf95 ftn/gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran nvfortran pgf95 lf95 ftn/g; + $c =~ s/pgcc\* \| pgf77\* \| pgf90\* \| pgf95\*\)/pgcc* | pgf77* | pgf90* | pgf95* | pgfortran* | nvfortran*)/g; + $c =~ s/pgf77\* \| pgf90\* \| pgf95\*\)/pgf77* | pgf90* | pgf95* | pgfortran* | nvfortran*)/g; # Similar issue as above -- the PGI 10 version number broke <=LT # 2.2.6b's version number checking regexps. 
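Note on the VERSION hunk above: raising hwloc_min_version from 1.11.0 to 2.1.0 drops hwloc 1.x support entirely. A minimal sketch of the kind of guard downstream code can rely on once this lands; HWLOC_API_VERSION and hwloc_get_api_version() are hwloc's documented macro/call, the 0x00020000 threshold marks hwloc's 2.0 API break, and the 2.1 floor itself is enforced by configure, not by this sketch:

    #include <hwloc.h>

    #if HWLOC_API_VERSION < 0x00020000
    #error "Open MPI now requires hwloc >= 2.1; hwloc 1.x headers detected"
    #endif

    /* runtime cross-check that the headers and the loaded library agree on 2.x */
    static int hwloc_is_v2(void)
    {
        return (hwloc_get_api_version() >> 16) >= 2;
    }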
Again, we can't fix the @@ -1085,6 +1085,30 @@ sub patch_autotools_output { ;;"; $c =~ s/$search_string/$replace_string/g; + $c =~ s/for ac_prog in gfortran f95 fort xlf95 ifort ifc efc pgfortran pgf95 lf95 f90 xlf90 pgf90 epcf90 nagfor/for ac_prog in gfortran f95 fort xlf95 ifort ifc efc pgfortran pgf95 lf95 f90 xlf90 pgf90 epcf90 nagfor nvfortran/g; + foreach my $tag (("", "_FC")) { + $search_string = 'tcc\*\) + # Fabrice Bellard et al\'s Tiny C Compiler + lt_prog_compiler_wl'."${tag}".'=\'-Wl,\' + lt_prog_compiler_pic'."${tag}".'=\'-fPIC\' + lt_prog_compiler_static'."${tag}".'=\'-static\' + ;;'; + $replace_string = "tcc*) + # Fabrice Bellard et al's Tiny C Compiler + lt_prog_compiler_wl${tag}='-Wl,' + lt_prog_compiler_pic${tag}='-fPIC' + lt_prog_compiler_static${tag}='-static' + ;; + nvc* | nvcc* | nvfortran*) + # NVIDIA Fortran compiler + lt_prog_compiler_wl${tag}='-Wl,' + lt_prog_compiler_pic${tag}='-fPIC' + lt_prog_compiler_static${tag}='-Bstatic' + ;;"; + push(@verbose_out, $indent_str . "Patching configure for NVIDIA Fortran compiler (${tag})\n"); + $c =~ s/$search_string/$replace_string/g; + } + # Only write out verbose statements and a new configure if the # configure content actually changed return diff --git a/config/ompi_setup_fc.m4 b/config/ompi_setup_fc.m4 index cf4212fc9e5..157f5a6301d 100644 --- a/config/ompi_setup_fc.m4 +++ b/config/ompi_setup_fc.m4 @@ -43,7 +43,7 @@ AC_DEFUN_ONCE([_OMPI_SETUP_FC_COMPILER],[ # Fortran compilers (excluding the f77 compiler names) from AC's # default list of compilers and use it here. This is the main # reason we have an OMPI-ized version of the PROG_FC macro. - AC_PROG_FC([gfortran f95 fort xlf95 ifort ifc efc pgfortran pgf95 lf95 f90 xlf90 pgf90 epcf90 nagfor]) + AC_PROG_FC([gfortran f95 fort xlf95 ifort ifc efc pgfortran pgf95 lf95 f90 xlf90 pgf90 epcf90 nagfor nvfortran]) FCFLAGS="$ompi_fcflags_save" OPAL_VAR_SCOPE_POP ]) diff --git a/examples/hello_sessions_c.c b/examples/hello_sessions_c.c index 863aaa1269e..8d84c39ba6b 100644 --- a/examples/hello_sessions_c.c +++ b/examples/hello_sessions_c.c @@ -11,14 +11,11 @@ int main(int argc, char** argv) { MPI_Info info; MPI_Session s1, s2; -#if 0 -/* need PR https://github.com/open-mpi/ompi/pull/12868 to be merged in - * before this can be uncommented. - */ MPI_Info_create(&info); -#endif MPI_Session_init(MPI_INFO_NULL, MPI_ERRORS_RETURN, &s1); MPI_Session_finalize(&s1); MPI_Session_init(MPI_INFO_NULL, MPI_ERRORS_RETURN, &s2); MPI_Session_finalize(&s2); + MPI_Info_free(&info); + return 0; } diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index d7fb321e3f8..2a9afd352be 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -24,7 +24,7 @@ * Copyright (c) 2015 Mellanox Technologies. All rights reserved. * Copyright (c) 2017-2022 IBM Corporation. All rights reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. - * Copyright (c) 2018-2022 Triad National Security, LLC. All rights + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
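For reference, the examples/hello_sessions_c.c hunk above leaves the example creating its info object unconditionally (possible now that the fix from PR https://github.com/open-mpi/ompi/pull/12868 is merged) and freeing it before returning. The whole program as reconstructed from the hunk; only the #include line is assumed, since the file header is outside the diff context:

    #include <mpi.h>

    int main(int argc, char** argv)
    {
        MPI_Info info;
        MPI_Session s1, s2;

        MPI_Info_create(&info);
        MPI_Session_init(MPI_INFO_NULL, MPI_ERRORS_RETURN, &s1);
        MPI_Session_finalize(&s1);
        MPI_Session_init(MPI_INFO_NULL, MPI_ERRORS_RETURN, &s2);
        MPI_Session_finalize(&s2);
        MPI_Info_free(&info);
        return 0;
    }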
* $COPYRIGHT$ @@ -1738,7 +1738,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead ompi_communicator_t **newintercomm) { ompi_communicator_t *newcomp = NULL, *local_comm, *leader_comm = MPI_COMM_NULL; - ompi_comm_extended_cid_block_t new_block; + ompi_comm_extended_cid_block_t new_block = {0}; bool i_am_leader = local_leader == local_group->grp_my_rank; ompi_proc_t **rprocs; uint64_t data[4]; @@ -1864,14 +1864,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead return rc; } - /* will be using a communicator ID derived from the bridge communicator to save some time */ - new_block.block_cid.cid_base = data[1]; - new_block.block_cid.cid_sub.u64 = data[2]; - new_block.block_nextsub = 0; - new_block.block_nexttag = 0; - new_block.block_level = (int8_t) data[3]; - - rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, &new_block, false, OMPI_COMM_CID_GROUP_NEW); + rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, NULL, false, OMPI_COMM_CID_GROUP_NEW); if ( OMPI_SUCCESS != rc ) { OBJ_RELEASE(newcomp); return rc; diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index db97f7ea1b8..0475d63b6f4 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -310,21 +310,16 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu const void *arg0, const void *arg1, bool send_first, int mode, ompi_request_t **req) { - pmix_info_t pinfo, *results = NULL; + pmix_info_t *pinfo, *results = NULL; size_t nresults; - opal_process_name_t *name_array = NULL; - char *tag = NULL; - size_t proc_count; - size_t cid_base = 0; + opal_process_name_t opal_proc_name; bool cid_base_set = false; + char *tag = NULL; + size_t proc_count = 0, rproc_count = 0, tproc_count = 0, cid_base = 0UL, ninfo; int rc, leader_rank; - int ret = OMPI_SUCCESS; - pmix_proc_t *procs = NULL; - - rc = ompi_group_to_proc_name_array (newcomm->c_local_group, &name_array, &proc_count); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - return rc; - } + pmix_proc_t *procs; + void *grpinfo = NULL, *list = NULL; + pmix_data_array_t darray; switch (mode) { case OMPI_COMM_CID_GROUP_NEW: @@ -341,15 +336,75 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu break; } - PMIX_INFO_LOAD(&pinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL); + grpinfo = PMIx_Info_list_start(); + if (NULL == grpinfo) { + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto fn_exit; + } + + rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL); + if (PMIX_SUCCESS != rc) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__)); + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto fn_exit; + } + + list = PMIx_Info_list_start(); + + size_t c_index = (size_t)newcomm->c_index; + rc = PMIx_Info_list_add(list, PMIX_GROUP_LOCAL_CID, &c_index, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__)); + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto fn_exit; + } + + rc = PMIx_Info_list_convert(list, &darray); + if (PMIX_SUCCESS != rc) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__)); + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto fn_exit; + } + rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_INFO, &darray, PMIX_DATA_ARRAY); + PMIX_DATA_ARRAY_DESTRUCT(&darray); + if (PMIX_SUCCESS != rc) { + 
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__)); + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto fn_exit; + } + + rc = PMIx_Info_list_convert(grpinfo, &darray); + if (PMIX_SUCCESS != rc) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__)); + rc = OMPI_ERR_OUT_OF_RESOURCE; + goto fn_exit; + } + + pinfo = (pmix_info_t*)darray.array; + ninfo = darray.size; + + proc_count = newcomm->c_local_group->grp_proc_count; + if ( OMPI_COMM_IS_INTER (newcomm) ){ + rproc_count = newcomm->c_remote_group->grp_proc_count; + } + + PMIX_PROC_CREATE(procs, proc_count + rproc_count); - PMIX_PROC_CREATE(procs, proc_count); for (size_t i = 0 ; i < proc_count; ++i) { - OPAL_PMIX_CONVERT_NAME(&procs[i],&name_array[i]); + opal_proc_name = ompi_group_get_proc_name(newcomm->c_local_group, i); + OPAL_PMIX_CONVERT_NAME(&procs[i],&opal_proc_name); + } + for (size_t i = 0; i < rproc_count; ++i) { + opal_proc_name = ompi_group_get_proc_name(newcomm->c_remote_group, i); + OPAL_PMIX_CONVERT_NAME(&procs[proc_count+i],&opal_proc_name); } - rc = PMIx_Group_construct(tag, procs, proc_count, &pinfo, 1, &results, &nresults); - PMIX_INFO_DESTRUCT(&pinfo); + tproc_count = proc_count + rproc_count; + + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n", + tag, tproc_count, ninfo, cid_base)); + rc = PMIx_Group_construct(tag, procs, tproc_count, pinfo, ninfo, &results, &nresults); + PMIX_DATA_ARRAY_DESTRUCT(&darray); if(PMIX_SUCCESS != rc) { char msg_string[1024]; switch (rc) { @@ -361,7 +416,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu "MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups", msg_string); - ret = MPI_ERR_UNSUPPORTED_OPERATION; + rc = MPI_ERR_UNSUPPORTED_OPERATION; break; case PMIX_ERR_NOT_SUPPORTED: sprintf(msg_string,"PMIx server does not support PMIx Group operations"); @@ -370,10 +425,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu true, "MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups", msg_string); - ret = MPI_ERR_UNSUPPORTED_OPERATION; + rc = MPI_ERR_UNSUPPORTED_OPERATION; break; default: - ret = opal_pmix_convert_status(rc); + rc = opal_pmix_convert_status(rc); break; } goto fn_exit; @@ -383,7 +438,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu if (PMIX_CHECK_KEY(&results[i], PMIX_GROUP_CONTEXT_ID)) { PMIX_VALUE_GET_NUMBER(rc, &results[i].value, cid_base, size_t); if(PMIX_SUCCESS != rc) { - ret = opal_pmix_convert_status(rc); + rc = opal_pmix_convert_status(rc); goto fn_exit; } cid_base_set = true; @@ -391,15 +446,20 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu } } + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n", + tag, tproc_count, ninfo, cid_base)); + + /* destruct the group */ rc = PMIx_Group_destruct (tag, NULL, 0); if(PMIX_SUCCESS != rc) { - ret = opal_pmix_convert_status(rc); + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_destruct failed %s", PMIx_Error_string(rc))); + rc = opal_pmix_convert_status(rc); goto fn_exit; } if (!cid_base_set) { opal_show_help("help-comm.txt", "cid-base-not-set", true); - ret = OMPI_ERROR; + rc = OMPI_ERROR; goto fn_exit; } @@ -412,16 +472,19 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu } if(NULL != procs) { - 
PMIX_PROC_FREE(procs, proc_count); + PMIX_PROC_FREE(procs, tproc_count); procs = NULL; } - if(NULL != name_array) { - free (name_array); - name_array = NULL; + if (NULL != grpinfo) { + PMIx_Info_list_release(grpinfo); } - return ret; + if (NULL != list) { + PMIx_Info_list_release(list); + } + + return rc; } static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm, @@ -446,6 +509,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic block = &comm->c_contextidb; } + for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) { + bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm); + if (true == flag) { + newcomm->c_index = i; + break; + } + } + assert(newcomm->c_index > 2); + if (NULL == arg1) { if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode || !ompi_comm_extended_cid_block_available (&comm->c_contextidb)) { @@ -468,14 +540,6 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic (void) ompi_comm_extended_cid_block_new (block, &newcomm->c_contextidb, is_new_block); } - for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) { - bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm); - if (true == flag) { - newcomm->c_index = i; - break; - } - } - newcomm->c_contextid = newcomm->c_contextidb.block_cid; opal_hash_table_set_value_ptr (&ompi_comm_hash, &newcomm->c_contextid, @@ -502,7 +566,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com functions but the pml does not support these functions so return not supported */ if (NULL == comm) { char msg_string[1024]; - sprintf(msg_string,"The PML being used - %s - does not support MPI sessions related features", + sprintf(msg_string,"The PML being used - %s - does not support MPI sessions related features", mca_pml_base_selected_component.pmlm_version.mca_component_name); opal_show_help("help-comm.txt", "MPI function not supported", @@ -886,6 +950,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c ompi_comm_cid_context_t *context; ompi_comm_request_t *request; ompi_request_t *subreq; + uint32_t comm_size; int ret = 0; /* the caller should not pass NULL for comm (it may be the same as *newcomm) */ @@ -907,6 +972,25 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c request->context = &context->super; + /* Prep communicator for handling remote cids if needed */ + + if (!OMPI_COMM_IS_GLOBAL_INDEX(*newcomm)) { + if (OMPI_COMM_IS_INTER(*newcomm)) { + comm_size = ompi_comm_remote_size(*newcomm); + } else { + comm_size = ompi_comm_size(*newcomm); + } + + (*newcomm)->c_index_vec = (uint32_t *)calloc(comm_size, sizeof(uint32_t)); + if (NULL == (*newcomm)->c_index_vec) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + if (OMPI_COMM_IS_INTRA(*newcomm)) { + (*newcomm)->c_index_vec[(*newcomm)->c_my_rank] = (*newcomm)->c_index; + } + } + if (MPI_UNDEFINED != (*newcomm)->c_local_group->grp_my_rank) { /* Initialize the PML stuff in the newcomm */ if ( OMPI_SUCCESS != (ret = MCA_PML_CALL(add_comm(*newcomm))) ) { @@ -963,6 +1047,61 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm return rc; } +int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uint32_t *remote_cid) +{ + ompi_proc_t *ompi_proc; + pmix_proc_t pmix_proc; + pmix_info_t tinfo[2]; + pmix_value_t 
*val = NULL; + ompi_comm_extended_cid_t excid; + int rc = OMPI_SUCCESS; + size_t remote_cid64; + + assert(NULL != remote_cid); + + ompi_proc = ompi_comm_peer_lookup(comm, dest); + OPAL_PMIX_CONVERT_NAME(&pmix_proc, &ompi_proc->super.proc_name); + + PMIx_Info_construct(&tinfo[0]); + PMIX_INFO_LOAD(&tinfo[0], PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32); + + excid = ompi_comm_get_extended_cid(comm); + + PMIX_INFO_CONSTRUCT(&tinfo[1]); + PMIX_INFO_LOAD(&tinfo[1], PMIX_GROUP_CONTEXT_ID, &excid.cid_base, PMIX_SIZE); + PMIX_INFO_SET_QUALIFIER(&tinfo[1]); + if (PMIX_SUCCESS != (rc = PMIx_Get(&pmix_proc, PMIX_GROUP_LOCAL_CID, tinfo, 2, &val))) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s", excid.cid_base, PMIx_Error_string(rc))); + rc = OMPI_ERR_NOT_FOUND; + goto done; + } + + if (NULL == val) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL")); + rc = OMPI_ERR_NOT_FOUND; + goto done; + } + + if (val->type != PMIX_SIZE) { + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch")); + rc = OMPI_ERR_TYPE_MISMATCH; + goto done; + } + + PMIX_VALUE_GET_NUMBER(rc, val, remote_cid64, size_t); + rc = OMPI_SUCCESS; + *remote_cid = (uint32_t)remote_cid64; + comm->c_index_vec[dest] = (uint32_t)remote_cid64; + OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld", *remote_cid, excid.cid_base)); + +done: + if (NULL != val) { + PMIX_VALUE_RELEASE(val); + } + + return rc; +} + static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request) { ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context; diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index a72a6661189..498bf4a1e70 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -23,7 +23,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2015-2019 Intel, Inc. All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. - * Copyright (c) 2018-2022 Triad National Security, LLC. All rights + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2023 NVIDIA Corporation. All rights reserved. 
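The comm_cid.c rework above switches from a single statically loaded pmix_info_t to the PMIx_Info_list_* helpers so that PMIx_Group_construct() can carry group info (including the caller's c_index as PMIX_GROUP_LOCAL_CID). A minimal sketch of that list lifecycle using the same calls as the hunk, with error paths elided; note the hunk additionally nests one converted array inside a second list under PMIX_GROUP_INFO, which is flattened here for brevity:

    #include <pmix.h>

    static pmix_status_t build_group_info(size_t local_cid, pmix_data_array_t *darray)
    {
        void *list = PMIx_Info_list_start();
        pmix_status_t rc;

        /* append key/value pairs; a NULL value with PMIX_BOOL loads the flag as true */
        rc = PMIx_Info_list_add(list, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
        if (PMIX_SUCCESS == rc) {
            rc = PMIx_Info_list_add(list, PMIX_GROUP_LOCAL_CID, &local_cid, PMIX_SIZE);
        }

        /* materialize the list as a pmix_info_t array owned by darray */
        if (PMIX_SUCCESS == rc) {
            rc = PMIx_Info_list_convert(list, darray);
        }

        /* the list is always released; the converted copy is freed later with
         * PMIX_DATA_ARRAY_DESTRUCT(), as the hunk does after PMIx_Group_construct() */
        PMIx_Info_list_release(list);
        return rc;
    }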
@@ -69,6 +69,8 @@ ompi_predefined_communicator_t ompi_mpi_comm_self = {{{{0}}}}; ompi_predefined_communicator_t ompi_mpi_comm_null = {{{{0}}}}; ompi_communicator_t *ompi_mpi_comm_parent = NULL; +int ompi_comm_output = -1; + static bool ompi_comm_intrinsic_init; ompi_predefined_communicator_t *ompi_mpi_comm_world_addr = @@ -97,6 +99,14 @@ static int ompi_comm_finalize (void); */ int ompi_comm_init(void) { + + /* create output stream */ + + if (ompi_comm_output == -1) { + ompi_comm_output = opal_output_open(NULL); + opal_output_set_verbosity(ompi_comm_output, ompi_comm_verbose_level); + } + /* Setup communicator array */ OBJ_CONSTRUCT(&ompi_mpi_communicators, opal_pointer_array_t); if( OPAL_SUCCESS != opal_pointer_array_init(&ompi_mpi_communicators, 16, @@ -392,6 +402,11 @@ static int ompi_comm_finalize (void) /* finalize communicator requests */ ompi_comm_request_fini (); + /* close output stream */ + + opal_output_close(ompi_comm_output); + ompi_comm_output = -1; + /* release a reference to the attributes subsys */ return ompi_attr_put_ref(); } @@ -417,6 +432,7 @@ static void ompi_comm_construct(ompi_communicator_t* comm) comm->c_coll = NULL; comm->c_nbc_tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE; comm->instance = NULL; + comm->c_index_vec = NULL; /* * magic numerology - see TOPDIR/ompi/include/mpif-values.pl @@ -518,6 +534,11 @@ static void ompi_comm_destruct(ompi_communicator_t* comm) comm->c_name = NULL; } + if (NULL != comm->c_index_vec) { + free (comm->c_index_vec); + comm->c_index_vec = NULL; + } + #if OPAL_ENABLE_FT_MPI if( NULL != comm->agreement_specific ) { OBJ_RELEASE( comm->agreement_specific ); diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index 3a230b68025..1714a09befc 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -153,6 +153,8 @@ OMPI_DECLSPEC extern opal_hash_table_t ompi_comm_hash; OMPI_DECLSPEC extern opal_pointer_array_t ompi_mpi_communicators; OMPI_DECLSPEC extern opal_pointer_array_t ompi_comm_f_to_c_table; +OMPI_DECLSPEC extern int ompi_comm_output; + struct ompi_comm_extended_cid_t { uint64_t cid_base; union { @@ -284,6 +286,10 @@ struct ompi_communicator_t { uint32_t c_epoch; /* Identifier used to differentiate between two communicators using the same c_contextid (not at the same time, obviously) */ #endif + /* vector used to store remote cid values for communicators not using + * a global cid, i.e. when OMPI_COMM_IS_GLOBAL_INDEX(comm) returns 0. + */ + uint32_t *c_index_vec; /* Non-blocking collective tag. These tags might be shared between * all non-blocking collective modules (to avoid message collision * between them in the case where multiple outstanding non-blocking @@ -535,6 +541,30 @@ static inline uint32_t ompi_comm_get_local_cid (const ompi_communicator_t* comm) return comm->c_index; } +int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uint32_t *remote_cid); + +/** + * Get remote cid for the communicator. In the case of communicators created + * using methods that don't supply an input communicator, i.e. + * MPI_Comm_create_from_group, the remote cid may be different from the local cid. 
+ */ +static inline int ompi_comm_get_remote_cid (ompi_communicator_t *comm, int dest, uint32_t *remote_cid) +{ + int rc = OMPI_SUCCESS; + + assert(NULL != remote_cid); + + if (OPAL_LIKELY(OMPI_COMM_IS_GLOBAL_INDEX(comm))) { + *remote_cid = comm->c_index; + } else if (0 != comm->c_index_vec[dest]) { + *remote_cid = comm->c_index_vec[dest]; + } else { + rc = ompi_comm_get_remote_cid_from_pmix(comm, dest, remote_cid); + } + + return rc; +} + /** * Get the extended context ID for the communicator, suitable for passing * to ompi_comm_lookup_cid for getting the communicator back @@ -614,6 +644,12 @@ static inline struct ompi_proc_t* ompi_comm_peer_lookup (const ompi_communicator return ompi_group_peer_lookup(comm->c_remote_group,peer_id); } +static inline bool ompi_comm_instances_same(const ompi_communicator_t *comm1, + const ompi_communicator_t *comm2) +{ + return comm1->instance == comm2->instance; +} + #if OPAL_ENABLE_FT_MPI /* * Support for MPI_ANY_SOURCE point-to-point operations diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index 8e4057daffb..719b0c4a735 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -430,7 +430,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, wildcard_rank.jobid = proc->super.proc_name.jobid; wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid; /* retrieve the local peers for the specified jobid */ - OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCAL_PEERS, + OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCAL_PEERS, &wildcard_rank, &val, PMIX_STRING); if (OPAL_SUCCESS == rc && NULL != val) { char **peers = opal_argv_split(val, ','); diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index d0f0d490de3..391c9bb03d1 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -115,7 +115,7 @@ static mca_base_framework_t *ompi_framework_dependencies[] = { &ompi_hook_base_framework, &ompi_op_base_framework, &opal_allocator_base_framework, &opal_rcache_base_framework, &opal_mpool_base_framework, &opal_smsc_base_framework, &ompi_bml_base_framework, &ompi_pml_base_framework, &ompi_coll_base_framework, - &ompi_osc_base_framework, NULL, + &ompi_osc_base_framework, &ompi_part_base_framework, NULL, }; static mca_base_framework_t *ompi_lazy_frameworks[] = { @@ -222,6 +222,8 @@ void ompi_mpi_instance_release (void) opal_argv_free (ompi_mpi_instance_pmix_psets); ompi_mpi_instance_pmix_psets = NULL; + OBJ_DESTRUCT(&ompi_mpi_instance_null); + opal_finalize_cleanup_domain (&ompi_instance_basic_domain); OBJ_DESTRUCT(&ompi_instance_basic_domain); @@ -655,11 +657,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("ompi_win_init() failed", ret); } - /* initialize partcomm */ - if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_part_base_framework, 0))) { - return ompi_instance_print_error ("mca_part_base_select() failed", ret); - } - + /* select part component to use */ if (OMPI_SUCCESS != (ret = mca_part_base_select (true, true))) { return ompi_instance_print_error ("mca_part_base_select() failed", ret); } @@ -950,17 +948,8 @@ static int ompi_mpi_instance_finalize_common (void) ompi_proc_finalize(); - OBJ_DESTRUCT(&ompi_mpi_instance_null); - ompi_mpi_instance_release (); - if (0 == opal_initialized) { - /* if there is no MPI_T_init_thread that has been MPI_T_finalize'd, - * then be gentle to the app and release all the memory now (instead - * of the opal library destructor */ - opal_class_finalize (); - } - return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/adapt/coll_adapt_component.c 
b/ompi/mca/coll/adapt/coll_adapt_component.c index 97eec559a2d..3d7d7e16cbe 100644 --- a/ompi/mca/coll/adapt/coll_adapt_component.c +++ b/ompi/mca/coll/adapt/coll_adapt_component.c @@ -2,6 +2,7 @@ * Copyright (c) 2014-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -114,39 +115,39 @@ static int adapt_register(void) we should have a high priority */ cs->adapt_priority = 0; (void) mca_base_component_var_register(c, "priority", "Priority of the adapt coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->adapt_priority); + MCA_BASE_VAR_SCOPE_ALL, &cs->adapt_priority); cs->adapt_verbose = ompi_coll_base_framework.framework_verbose; (void) mca_base_component_var_register(c, "verbose", "Verbose level (default set to the collective framework verbosity)", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->adapt_verbose); + MCA_BASE_VAR_SCOPE_ALL, &cs->adapt_verbose); cs->adapt_context_free_list_min = 64; (void) mca_base_component_var_register(c, "context_free_list_min", "Minimum number of segments in context free list", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &cs->adapt_context_free_list_min); cs->adapt_context_free_list_max = 1024; (void) mca_base_component_var_register(c, "context_free_list_max", "Maximum number of segments in context free list", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &cs->adapt_context_free_list_max); cs->adapt_context_free_list_inc = 32; (void) mca_base_component_var_register(c, "context_free_list_inc", "Increasement number of segments in context free list", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &cs->adapt_context_free_list_inc); ompi_coll_adapt_ibcast_register(); ompi_coll_adapt_ireduce_register(); diff --git a/ompi/mca/coll/adapt/coll_adapt_ibcast.c b/ompi/mca/coll/adapt/coll_adapt_ibcast.c index 361e0e24c72..00bc79fbd46 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ibcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_ibcast.c @@ -3,6 +3,7 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -34,8 +35,9 @@ int ompi_coll_adapt_ibcast_register(void) mca_coll_adapt_component.adapt_ibcast_algorithm = 1; mca_base_component_var_register(c, "bcast_algorithm", - "Algorithm of broadcast, 0: tuned, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, + "Algorithm of broadcast, 0: tuned, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ibcast_algorithm); if( (mca_coll_adapt_component.adapt_ibcast_algorithm < 0) || (mca_coll_adapt_component.adapt_ibcast_algorithm >= OMPI_COLL_ADAPT_ALGORITHM_COUNT) ) { @@ -45,33 +47,33 @@ int ompi_coll_adapt_ibcast_register(void) mca_coll_adapt_component.adapt_ibcast_segment_size = 0; mca_base_component_var_register(c, "bcast_segment_size", "Segment size in bytes used by default for bcast algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ibcast_segment_size); mca_coll_adapt_component.adapt_ibcast_max_send_requests = 2; mca_base_component_var_register(c, "bcast_max_send_requests", "Maximum number of send requests", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ibcast_max_send_requests); mca_coll_adapt_component.adapt_ibcast_max_recv_requests = 3; mca_base_component_var_register(c, "bcast_max_recv_requests", "Maximum number of receive requests", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ibcast_max_recv_requests); mca_coll_adapt_component.adapt_ibcast_synchronous_send = true; (void) mca_base_component_var_register(c, "bcast_synchronous_send", "Whether to use synchronous send operations during setup of bcast operations", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ibcast_synchronous_send); mca_coll_adapt_component.adapt_ibcast_context_free_list = NULL; diff --git a/ompi/mca/coll/adapt/coll_adapt_ireduce.c b/ompi/mca/coll/adapt/coll_adapt_ireduce.c index 2747995a57d..15bd586901a 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ireduce.c +++ b/ompi/mca/coll/adapt/coll_adapt_ireduce.c @@ -5,6 +5,7 @@ * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -38,8 +39,9 @@ int ompi_coll_adapt_ireduce_register(void) mca_coll_adapt_component.adapt_ireduce_algorithm = 1; mca_base_component_var_register(c, "reduce_algorithm", - "Algorithm of reduce, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, + "Algorithm of reduce, 1: binomial, 2: in_order_binomial, 3: binary, 4: pipeline, 5: chain, 6: linear", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ireduce_algorithm); if( (mca_coll_adapt_component.adapt_ireduce_algorithm < 0) || (mca_coll_adapt_component.adapt_ireduce_algorithm > OMPI_COLL_ADAPT_ALGORITHM_COUNT) ) { @@ -49,58 +51,58 @@ int ompi_coll_adapt_ireduce_register(void) mca_coll_adapt_component.adapt_ireduce_segment_size = 163740; mca_base_component_var_register(c, "reduce_segment_size", "Segment size in bytes used by default for reduce algorithms. Only has meaning if algorithm is forced and supports segmenting. 0 bytes means no segmentation.", - MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ireduce_segment_size); mca_coll_adapt_component.adapt_ireduce_max_send_requests = 2; mca_base_component_var_register(c, "reduce_max_send_requests", "Maximum number of send requests", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ireduce_max_send_requests); mca_coll_adapt_component.adapt_ireduce_max_recv_requests = 3; mca_base_component_var_register(c, "reduce_max_recv_requests", "Maximum number of receive requests per peer", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ireduce_max_recv_requests); mca_coll_adapt_component.adapt_inbuf_free_list_min = 10; mca_base_component_var_register(c, "inbuf_free_list_min", "Minimum number of segment in inbuf free list", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_inbuf_free_list_min); mca_coll_adapt_component.adapt_inbuf_free_list_max = 10000; mca_base_component_var_register(c, "inbuf_free_list_max", "Maximum number of segment in inbuf free list", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_inbuf_free_list_max); mca_coll_adapt_component.adapt_inbuf_free_list_inc = 10; mca_base_component_var_register(c, "inbuf_free_list_inc", "Number of segments to allocate when growing the inbuf free list", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_inbuf_free_list_inc); mca_coll_adapt_component.adapt_ireduce_synchronous_send = true; (void) mca_base_component_var_register(c, "reduce_synchronous_send", "Whether to use synchronous send 
operations during setup of reduce operations", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_adapt_component.adapt_ireduce_synchronous_send); mca_coll_adapt_component.adapt_ireduce_context_free_list = NULL; diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index ae9010497d7..ba74aa01350 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -12,6 +12,9 @@ * Copyright (c) 2014-2020 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. + * + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -482,6 +485,26 @@ int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expecte } while (1); } +/** + * return non-zero if the next non-space to read on the current line is a digit. + * otherwise return 0. + */ +int ompi_coll_base_file_peek_next_char_isdigit(FILE *fptr) +{ + do { + int next = fgetc(fptr); + + if ((' ' == next) || ('\t' == next)) { + continue; /* discard space and tab. keep everything else */ + } + + ungetc(next, fptr); /* put the char back into the stream */ + + return isdigit(next); /* report back whether or not next is a digit */ + + } while (1); +} + /** * There are certainly simpler implementation for this function when performance * is not a critical point. But, as this function is used during the collective diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 852abcedefa..7bceaa7dcc0 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -11,6 +11,7 @@ * All rights reserved. * Copyright (c) 2014-2020 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -195,6 +196,7 @@ int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val); * eat the value, otherwise put it back into the file. */ int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected); +int ompi_coll_base_file_peek_next_char_isdigit(FILE *fptr); /* Miscellaneous function */ const char* mca_coll_base_colltype_to_str(int collid); diff --git a/ompi/mca/coll/basic/coll_basic_component.c b/ompi/mca/coll/basic/coll_basic_component.c index 23cbed81ad5..d38850744c0 100644 --- a/ompi/mca/coll/basic/coll_basic_component.c +++ b/ompi/mca/coll/basic/coll_basic_component.c @@ -13,6 +13,7 @@ * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -91,16 +92,16 @@ basic_register(void) mca_coll_basic_priority = 10; (void) mca_base_component_var_register(&mca_coll_basic_component.collm_version, "priority", "Priority of the basic coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_basic_priority); mca_coll_basic_crossover = 4; (void) mca_base_component_var_register(&mca_coll_basic_component.collm_version, "crossover", "Minimum number of processes in a communicator before using the logarithmic algorithms", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &mca_coll_basic_crossover); return OMPI_SUCCESS; diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index 3926faaaac3..4712bccbe88 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -7,6 +7,7 @@ * Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV) * Laboratory, ICS Forth. All rights reserved. * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -257,9 +258,9 @@ mca_coll_han_query_module_from_mca(mca_base_component_t* c, *storage = ompi_coll_han_available_components[mod_id].component_name; (void) mca_base_component_var_register(c, param_name, param_doc, - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, info_level, - MCA_BASE_VAR_SCOPE_READONLY, storage); + MCA_BASE_VAR_SCOPE_ALL, storage); module_name = *storage; mod_id = strtol(module_name, &endptr, 10); if( module_name == endptr ) { /* no conversion, maybe we got a module name instead */ @@ -288,22 +289,22 @@ static int han_register(void) COMPONENT_T component; (void) mca_base_component_var_register(c, "priority", "Priority of the HAN coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority); + MCA_BASE_VAR_SCOPE_ALL, &cs->han_priority); cs->han_output_verbose = 0; (void) mca_base_component_var_register(c, "verbose", "Verbosity of the HAN coll component (use coll base verbosity if not set)", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_output_verbose); + MCA_BASE_VAR_SCOPE_ALL, &cs->han_output_verbose); cs->han_bcast_segsize = 65536; (void) mca_base_component_var_register(c, "bcast_segsize", "segment size for bcast", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_segsize); + MCA_BASE_VAR_SCOPE_ALL, &cs->han_bcast_segsize); cs->han_bcast_up_module = 0; (void) mca_coll_han_query_module_from_mca(c, "bcast_up_module", @@ -321,9 +322,9 @@ static int han_register(void) cs->han_reduce_segsize = 65536; (void) mca_base_component_var_register(c, "reduce_segsize", "segment size for reduce", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_segsize); + MCA_BASE_VAR_SCOPE_ALL, 
&cs->han_reduce_segsize); cs->han_reduce_up_module = 0; (void) mca_coll_han_query_module_from_mca(c, "reduce_up_module", @@ -340,9 +341,9 @@ static int han_register(void) cs->han_allreduce_segsize = 65536; (void) mca_base_component_var_register(c, "allreduce_segsize", "segment size for allreduce", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_segsize); + MCA_BASE_VAR_SCOPE_ALL, &cs->han_allreduce_segsize); cs->han_allreduce_up_module = 0; (void) mca_coll_han_query_module_from_mca(c, "allreduce_up_module", @@ -424,8 +425,8 @@ static int han_register(void) (void) mca_base_component_var_register(c, "alltoall_pstages", "Parallel Stages for alltoall. Higher numbers require more memory, " "and performs more communication in parallel. 0 chooses pstages based on message size.", - MCA_BASE_VAR_TYPE_INT32_T, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_TYPE_INT32_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &cs->han_alltoall_pstages); cs->han_alltoallv_low_module = 0; @@ -436,16 +437,16 @@ static int han_register(void) cs->han_alltoallv_smsc_avg_send_limit = 8192; (void) mca_base_component_var_register(c, "alltoallv_smsc_avg_send_limit", "The per-rank averaged send bytes limit above which smsc-based alltoallv will disqualify itself.", - MCA_BASE_VAR_TYPE_INT64_T, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_TYPE_INT64_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &cs->han_alltoallv_smsc_avg_send_limit); cs->han_alltoallv_smsc_noncontig_activation_limit = 0.10; (void) mca_base_component_var_register(c, "alltoallv_smsc_noncontig_limit", "The fractional (0.00-1.00) limit of peers in the communicator which have " "strided or otherwise non-contiguous data buffers. 
Above this limit " "smsc-based alltoallv will ignore the avg_send_limit, and always remain active.", - MCA_BASE_VAR_TYPE_DOUBLE, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_TYPE_DOUBLE, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &cs->han_alltoallv_smsc_noncontig_activation_limit); cs->han_reproducible = 0; @@ -453,21 +454,21 @@ static int han_register(void) "whether we need reproducible results " "(enabling this disables optimisations using topology)" "0 disable 1 enable, default 0", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reproducible); + MCA_BASE_VAR_SCOPE_ALL, &cs->han_reproducible); cs->han_packbuf_bytes = 128*1024; (void) mca_base_component_var_register(c, "packbuf_bytes", "The number of bytes in each HAN packbuf.", - MCA_BASE_VAR_TYPE_INT64_T, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_TYPE_INT64_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &cs->han_packbuf_bytes); cs->han_packbuf_max_count = 32; (void) mca_base_component_var_register(c, "packbuf_max_count", "The maximum number of packbufs that are allowed to be allocated.", - MCA_BASE_VAR_TYPE_INT64_T, NULL, 0, 0, - OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_TYPE_INT64_T, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &cs->han_packbuf_max_count); /* @@ -582,9 +583,9 @@ static int han_register(void) } mca_base_component_var_register(c, param_name, param_desc, - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &(cs->mca_sub_components[coll][topo_lvl])); } } @@ -594,27 +595,27 @@ static int han_register(void) (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "use_dynamic_file_rules", "Enable the dynamic selection provided via the dynamic_rules_filename MCA", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_6, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &(cs->use_dynamic_file_rules)); cs->dynamic_rules_filename = NULL; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "dynamic_rules_filename", "Configuration file containing the dynamic selection rules", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_6, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &(cs->dynamic_rules_filename)); cs->dump_dynamic_rules = false; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "dump_dynamic_rules", "Switch used to decide if we dump dynamic rules provided by configuration file", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_6, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &(cs->dump_dynamic_rules)); if((cs->dump_dynamic_rules || NULL != cs->dynamic_rules_filename) @@ -631,9 +632,9 @@ static int han_register(void) "errors printed on rank 0 " "with a 0 verbosity."
"Useless if coll_base_verbose is 30 or more.", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_6, - MCA_BASE_VAR_SCOPE_READONLY, + MCA_BASE_VAR_SCOPE_ALL, &(cs->max_dynamic_errors)); diff --git a/ompi/mca/coll/hcoll/coll_hcoll_component.c b/ompi/mca/coll/hcoll/coll_hcoll_component.c index b8eb0444974..e34169a0781 100644 --- a/ompi/mca/coll/hcoll/coll_hcoll_component.c +++ b/ompi/mca/coll/hcoll/coll_hcoll_component.c @@ -3,6 +3,7 @@ * Copyright (c) 2011 Mellanox Technologies. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -107,8 +108,8 @@ static int reg_int(const char* param_name, index = mca_base_component_var_register( &mca_coll_hcoll_component.super.collm_version, param_name, param_desc, MCA_BASE_VAR_TYPE_INT, - NULL, 0, 0,OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, storage); + NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_ALL, storage); if (NULL != deprecated_param_name) { (void) mca_base_var_register_synonym(index, "ompi", "coll", "hcoll", deprecated_param_name, diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c index e56ece1d0b4..5eb8ef4317e 100644 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c @@ -12,6 +12,7 @@ * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -42,13 +43,24 @@ static int fileline=0; /* used for verbose error messages */ #define getnext(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) +#define isnext_digit(fptr) ompi_coll_base_file_peek_next_char_isdigit(fptr) /* * Reads a rule file called fname - * Builds the algorithm rule table for a max of n_collectives + * The rule file defines a set of sets of rules. The outer set is keyed on + * communicator size while the inner set is keyed on message size. When a + * communicator is constructed its size is used to look up the nested set of + * message size keyed rules. When a collective is called the message size + * determined from its call arguments is used to look up a specific rule in the + * inner set. + * + * Rules for communicator and message sizes 0 and N (where N is larger than the + * largest key you provide) can be specified to fall back to the fixed decision + * framework above and below the communicator and message size ranges of + * interest. * * If an error occurs it removes rule table and then exits with a very verbose - * error message (this stops the user using a half baked rule table + * error message. This stops the user from using a half-baked rule table.
* * Returns the number of actual collectives that a rule exists for * (note 0 is NOT an error) @@ -57,9 +69,18 @@ static int fileline=0; /* used for verbose error messages */ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives) { - long CI, NCS, CS, ALG, NMS, FANINOUT, X, MS, SS; + long NCOL = 0, /* number of collectives for which rules are provided */ + COLID = 0, /* identifies the collective type to associate the rules with */ + NCOMSIZES = 0, /* number of sets of message size rules. the key is communicator size */ + COMSIZE = 0, /* communicator size, the key identifying a specific set of message size rules. */ + NMSGSIZES = 0, /* number of message size rules in the set. */ + MSGSIZE = 0, /* message size, the key identifying a specific rule in the set. */ + ALG = 0, /* the collective specific algorithm to use */ + FANINOUT = 0, /* algorithm specific tuning parameter */ + SEGSIZE = 0, /* algorithm specific tuning parameter */ + MAXREQ = 0; /* algorithm specific tuning parameter */ FILE *fptr = (FILE*) NULL; - int x, ncs, nms; + int x, ncs, nms, version; ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; /* complete table of rules */ @@ -103,68 +124,78 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** goto on_file_error; } - if( (getnext(fptr, &X) < 0) || (X < 0) ) { + /* consume the optional version identifier */ + if (0 == fscanf(fptr, "rule-file-version-%u", &version)) { + version = 1; + } + + /* get the number of collectives for which rules are provided in the file */ + if( (getnext(fptr, &NCOL) < 0) || (NCOL < 0) ) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline)); goto on_file_error; } - if (X>n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline)); + if (NCOL>n_collectives) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", NCOL, n_collectives, fileline)); goto on_file_error; } - for (x=0;x<X;x++) { + for (x=0;x<NCOL;x++) { ... - if (CI>=n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %ld is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline)); + if (COLID>=n_collectives) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %ld is greater than MPI collectives possible %d.
Error around line %d\n", COLID, n_collectives, fileline)); goto on_file_error; } - if (alg_rules[CI].alg_rule_id != CI) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %ld\n", CI)); + if (alg_rules[COLID].alg_rule_id != COLID) { + OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %ld\n", COLID)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %ld\n", CI)); - alg_p = &alg_rules[CI]; + OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %ld\n", COLID)); + alg_p = &alg_rules[COLID]; - alg_p->alg_rule_id = CI; + alg_p->alg_rule_id = COLID; alg_p->n_com_sizes = 0; alg_p->com_rules = (ompi_coll_com_rule_t *) NULL; - if( (getnext (fptr, &NCS) < 0) || (NCS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %ld at around line %d\n", CI, fileline)); + /* get the number of communicator sizes for which a set of rules are to be provided */ + if( (getnext (fptr, &NCOMSIZES) < 0) || (NCOMSIZES < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %ld at around line %d\n", COLID, fileline)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCS, CI)); - alg_p->n_com_sizes = NCS; - alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCOMSIZES, COLID)); + alg_p->n_com_sizes = NCOMSIZES; + alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCOMSIZES, COLID); if (NULL == alg_p->com_rules) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate com rules for file [%s]\n", fname)); goto on_file_error; } - for (ncs=0;ncs<NCS;ncs++) { + for (ncs=0;ncs<NCOMSIZES;ncs++) { com_p = &(alg_p->com_rules[ncs]); - if( (getnext (fptr, &CS) < 0) || (CS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); + /* get the communicator size to associate the set of rules with */ + if( (getnext (fptr, &COMSIZE) < 0) || (COMSIZE < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", COLID, ncs, fileline)); goto on_file_error; } - com_p->mpi_comsize = CS; + com_p->mpi_comsize = COMSIZE; - if( (getnext (fptr, &NMS) < 0) || (NMS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); + /* get the number of message sizes to specify rules for.
inner set size */ + if( (getnext (fptr, &NMSGSIZES) < 0) || (NMSGSIZES < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", COLID, ncs, fileline)); goto on_file_error; } OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %ld for dynamic rule for collective ID %ld and comm size %ld\n", - NMS, CI, CS)); - com_p->n_msg_sizes = NMS; - com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS); + NMSGSIZES, COLID, COMSIZE)); + com_p->n_msg_sizes = NMSGSIZES; + com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMSGSIZES, COLID, ncs, COMSIZE); if (NULL == com_p->msg_rules) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Cannot allocate msg rules for file [%s]\n", fname)); goto on_file_error; @@ -172,37 +203,52 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** msg_p = com_p->msg_rules; - for (nms=0;nms<NMS;nms++) { + for (nms=0;nms<NMSGSIZES;nms++) { msg_p = &(com_p->msg_rules[nms]); - if( (getnext (fptr, &MS) < 0) || (MS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + /* read the message size to associate the rule with */ + if( (getnext (fptr, &MSGSIZE) < 0) || (MSGSIZE < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); goto on_file_error; } - msg_p->msg_size = (size_t)MS; + msg_p->msg_size = (size_t)MSGSIZE; + /* read the collective specific algorithm identifier */ if( (getnext (fptr, &ALG) < 0) || (ALG < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); goto on_file_error; } msg_p->result_alg = ALG; + /* read faninout tuning parameter. required */ if( (getnext (fptr, &FANINOUT) < 0) || (FANINOUT < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); goto on_file_error; } msg_p->result_topo_faninout = FANINOUT; - if( (getnext (fptr, &SS) < 0) || (SS < 0) ) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + /* read segsize tuning parameter. required */ + if( (getnext (fptr, &SEGSIZE) < 0) || (SEGSIZE < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); goto on_file_error; } - msg_p->result_segsize = SS; + msg_p->result_segsize = SEGSIZE; + + /* read the max requests tuning parameter.
optional */ + msg_p->result_max_requests = ompi_coll_tuned_alltoall_max_requests; + if( (version > 1) && isnext_digit(fptr) ) { + if( (getnext (fptr, &MAXREQ) < 0) || (MAXREQ < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read max requests for collective ID %ld com rule %d msg rule %d at around line %d\n", COLID, ncs, nms, fileline)); + goto on_file_error; + } + msg_p->result_max_requests = MAXREQ; + } - if (!nms && MS) { + /* check the first rule is for 0 size. look-up depends on this */ + if (!nms && MSGSIZE) { OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MSGSIZE, COLID, ncs, nms, fileline)); goto on_file_error; } @@ -219,13 +265,14 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** } /* comm size */ total_alg_count++; - OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %ld\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %ld\n", COLID)); } /* per collective */ fclose (fptr); OPAL_OUTPUT((ompi_coll_tuned_stream,"\nConfigure file Stats\n")); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Version\t\t\t\t\t: %5u\n", version)); OPAL_OUTPUT((ompi_coll_tuned_stream,"Collectives with rules\t\t\t: %5d\n", total_alg_count)); OPAL_OUTPUT((ompi_coll_tuned_stream,"Communicator sizes with rules\t\t: %5d\n", total_com_count)); OPAL_OUTPUT((ompi_coll_tuned_stream,"Message sizes with rules\t\t: %5d\n", total_msg_count)); diff --git a/ompi/mca/coll/ucc/coll_ucc_component.c b/ompi/mca/coll/ucc/coll_ucc_component.c index 6fab8c0dc26..b697ab787c2 100644 --- a/ompi/mca/coll/ucc/coll_ucc_component.c +++ b/ompi/mca/coll/ucc/coll_ucc_component.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2021 Mellanox Technologies. All rights reserved. * Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. 
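Note on the file format the parser above consumes: the rules file is a nested list of integer tokens, and the reader skips non-numeric text, so trailing comments are conventionally used for readability. A minimal sketch of one rule set under the new scheme (the version-header spelling and all concrete values are illustrative assumptions; the field order follows the getnext() calls above, and MAXREQ is only consumed when version > 1):

    rule-file-version-2  # assumed header form enabling the optional max-requests field
    1                    # number of collectives with rules
    3                    # COLID: collective ID these rules apply to
    1                    # NCOMSIZES: communicator sizes covered
    64                   # COMSIZE: rules below apply to 64-process communicators
    2                    # NMSGSIZES: message-size rules for this size
    0 1 0 0              # MSGSIZE ALG FANINOUT SEGSIZE (a rule for size 0 must come first)
    8192 2 0 0 32        # same fields plus the optional trailing MAXREQ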
* $COPYRIGHT$ * * Additional copyrights may follow @@ -60,24 +61,24 @@ static int mca_coll_ucc_register(void) mca_coll_ucc_component_t *cm = &mca_coll_ucc_component; mca_base_component_t *c = &cm->super.collm_version; mca_base_component_var_register(c, "priority", "Priority of the UCC coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cm->ucc_priority); + MCA_BASE_VAR_SCOPE_ALL, &cm->ucc_priority); mca_base_component_var_register(c, "verbose", "Verbose level of the UCC coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cm->ucc_verbose); + MCA_BASE_VAR_SCOPE_ALL, &cm->ucc_verbose); mca_base_component_var_register(c, "enable", "[0|1] Enable/Disable the UCC coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cm->ucc_enable); + MCA_BASE_VAR_SCOPE_ALL, &cm->ucc_enable); mca_base_component_var_register(c, "np", "Minimal communicator size for the UCC coll component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cm->ucc_np); + MCA_BASE_VAR_SCOPE_ALL, &cm->ucc_np); mca_base_component_var_register(c, MCA_COMPILETIME_VER, "Version of the libucc library with which Open MPI was compiled", @@ -94,14 +95,14 @@ static int mca_coll_ucc_register(void) cm->cls = ""; mca_base_component_var_register(c, "cls", "Comma separated list of UCC CLS to be used for team creation", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, &cm->cls); + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_ALL, &cm->cls); cm->cts = COLL_UCC_CTS_STR; mca_base_component_var_register(c, "cts", "Comma separated list of UCC coll types to be enabled", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, &cm->cts); + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_ALL, &cm->cts); return OMPI_SUCCESS; } diff --git a/ompi/mca/fcoll/vulcan/Makefile.am b/ompi/mca/fcoll/vulcan/Makefile.am index e805880a661..c4680544abb 100644 --- a/ompi/mca/fcoll/vulcan/Makefile.am +++ b/ompi/mca/fcoll/vulcan/Makefile.am @@ -13,6 +13,7 @@ # Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2018 Research Organization for Information Science # and Technology (RIST). All rights reserved. +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -22,6 +23,7 @@ sources = \ fcoll_vulcan.h \ + fcoll_vulcan_internal.h \ fcoll_vulcan_module.c \ fcoll_vulcan_component.c \ fcoll_vulcan_file_read_all.c \ diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan.h b/ompi/mca/fcoll/vulcan/fcoll_vulcan.h index a2fd6ca82bc..3165a0b0797 100644 --- a/ompi/mca/fcoll/vulcan/fcoll_vulcan.h +++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan.h @@ -14,6 +14,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
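The move from MCA_BASE_VAR_SCOPE_READONLY to MCA_BASE_VAR_FLAG_SETTABLE with MCA_BASE_VAR_SCOPE_ALL is what allows these UCC knobs to be changed after the component registers them, for instance from a tool through the MPI_T control-variable interface rather than only via mpirun or the environment. A minimal sketch against the standard MPI_T API (the cvar name coll_ucc_priority follows Open MPI's usual framework_component_param naming; error checks omitted):

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        int provided, idx, count, prio = 100;
        MPI_T_cvar_handle handle;

        /* the MPI_T interface may be used before MPI_Init */
        MPI_T_init_thread(MPI_THREAD_SINGLE, &provided);
        MPI_T_cvar_get_index("coll_ucc_priority", &idx);
        /* component-level cvar: no object binding required */
        MPI_T_cvar_handle_alloc(idx, NULL, &handle, &count);
        MPI_T_cvar_write(handle, &prio);  /* legal only because the var is now settable */
        MPI_T_cvar_handle_free(&handle);

        MPI_Init(&argc, &argv);           /* the raised UCC priority is in effect here */
        printf("coll_ucc_priority set before MPI_Init\n");
        MPI_Finalize();
        MPI_T_finalize();
        return 0;
    }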
* $COPYRIGHT$ * * Additional copyrights may follow @@ -40,8 +41,6 @@ BEGIN_C_DECLS /* Globally exported variables */ extern int mca_fcoll_vulcan_priority; -extern int mca_fcoll_vulcan_num_groups; -extern int mca_fcoll_vulcan_write_chunksize; extern int mca_fcoll_vulcan_async_io; extern int mca_fcoll_vulcan_use_accelerator_buffers; diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan_component.c b/ompi/mca/fcoll/vulcan/fcoll_vulcan_component.c index 80a5bfb872a..5fc8254f164 100644 --- a/ompi/mca/fcoll/vulcan/fcoll_vulcan_component.c +++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan_component.c @@ -16,6 +16,7 @@ * reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -43,8 +44,6 @@ const char *mca_fcoll_vulcan_component_version_string = * Global variables */ int mca_fcoll_vulcan_priority = 10; -int mca_fcoll_vulcan_num_groups = 1; -int mca_fcoll_vulcan_write_chunksize = -1; int mca_fcoll_vulcan_async_io = 0; /* @@ -91,20 +90,6 @@ vulcan_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &mca_fcoll_vulcan_priority); - mca_fcoll_vulcan_num_groups = 1; - (void) mca_base_component_var_register(&mca_fcoll_vulcan_component.fcollm_version, - "num_groups", "Number of subgroups created by the vulcan component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_fcoll_vulcan_num_groups); - - mca_fcoll_vulcan_write_chunksize = -1; - (void) mca_base_component_var_register(&mca_fcoll_vulcan_component.fcollm_version, - "write_chunksize", "Chunk size written at once. Default: stripe_size of the file system", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_fcoll_vulcan_write_chunksize); - mca_fcoll_vulcan_async_io = 0; (void) mca_base_component_var_register(&mca_fcoll_vulcan_component.fcollm_version, "async_io", "Asynchronous I/O support options. 0: Automatic choice (default) " diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_read_all.c b/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_read_all.c index c372e9f14b4..f6a492e621c 100644 --- a/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_read_all.c +++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_read_all.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -14,6 +15,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
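With num_groups and write_chunksize removed, vulcan's aggregation is steered entirely by the common OMPIO parameters it queries at runtime: "num_aggregators" and "use_accelerator_buffers" via f_get_mca_parameter_value(), and the cycle size via fh->f_bytes_per_agg. Assuming the usual OMPIO spelling of the full parameter name, a job that previously set fcoll_vulcan_num_groups would now control aggregation with something like mpirun --mca io_ompio_num_aggregators 4 ./app; ompi_info --all remains the authoritative list of the current parameter names.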
* $COPYRIGHT$ * * Additional copyrights may follow @@ -23,10 +25,31 @@ #include "ompi_config.h" #include "fcoll_vulcan.h" +#include "fcoll_vulcan_internal.h" #include "mpi.h" +#include "ompi/constants.h" +#include "ompi/mca/fcoll/fcoll.h" +#include "ompi/mca/fcoll/base/fcoll_base_coll_array.h" #include "ompi/mca/common/ompio/common_ompio.h" +#include "ompi/mca/common/ompio/common_ompio_buffer.h" +#include "ompi/mca/io/io.h" +#include "ompi/mca/common/ompio/common_ompio_request.h" +#include "math.h" +#include "ompi/mca/pml/pml.h" +#include "opal/mca/accelerator/accelerator.h" +#include <unistd.h> +#define DEBUG_ON 0 +#define NOT_AGGR_INDEX -1 + +static int shuffle_init (int index, int cycles, int aggregator, int rank, + mca_io_ompio_aggregator_data *data, ompi_request_t **reqs); + +static int read_init (ompio_file_t *fh, int index, int cycles, int aggregator, int rank, + mca_io_ompio_aggregator_data *aggr_data, + int read_syncType, ompi_request_t **request, + bool is_accelerator_buffer); int mca_fcoll_vulcan_file_read_all (struct ompio_file_t *fh, void *buf, @@ -34,7 +57,888 @@ int mca_fcoll_vulcan_file_read_all (struct ompio_file_t *fh, struct ompi_datatype_t *datatype, ompi_status_public_t *status) { - return mca_common_ompio_base_file_read_all (fh, buf, count, datatype, status); + int index = 0; + int cycles = 0; + int ret =0, l, i, j, bytes_per_cycle; + uint32_t iov_count = 0; + struct iovec *decoded_iov = NULL; + struct iovec *local_iov_array=NULL; + uint32_t total_fview_count = 0; + int local_count = 0; + ompi_request_t **reqs = NULL; + ompi_request_t *req_iread = MPI_REQUEST_NULL; + ompi_request_t *req_tmp = MPI_REQUEST_NULL; + mca_io_ompio_aggregator_data **aggr_data=NULL; + + ptrdiff_t *displs = NULL; + int vulcan_num_io_procs; + size_t max_data = 0; + + struct iovec **broken_iov_arrays=NULL; + struct iovec **broken_decoded_iovs=NULL; + int *broken_counts=NULL; + int *broken_iov_counts=NULL; + MPI_Aint *broken_total_lengths=NULL; + + int aggr_index = NOT_AGGR_INDEX; + int read_sync_type = 2; + int *result_counts=NULL; + + ompi_count_array_t fview_count_desc; + ompi_disp_array_t displs_desc; + int is_gpu, is_managed; + bool use_accelerator_buffer = false; + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0; + double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0; + double exch_read = 0.0, start_exch = 0.0, end_exch = 0.0; + mca_common_ompio_print_entry nentry; +#endif + + vulcan_num_io_procs = fh->f_get_mca_parameter_value ( "num_aggregators", strlen ("num_aggregators")); + if (OMPI_ERR_MAX == vulcan_num_io_procs) { + ret = OMPI_ERROR; + goto exit; + } + bytes_per_cycle = fh->f_bytes_per_agg; + + if ((1 == mca_fcoll_vulcan_async_io) && (NULL == fh->f_fbtl->fbtl_ipreadv)) { + opal_output (1, "vulcan_read_all: fbtl Does NOT support ipreadv() (asynchronous read) \n"); + ret = MPI_ERR_UNSUPPORTED_OPERATION; + goto exit; + } + + mca_common_ompio_check_gpu_buf (fh, buf, &is_gpu, &is_managed); + if (is_gpu && !is_managed && + fh->f_get_mca_parameter_value ("use_accelerator_buffers", strlen("use_accelerator_buffers"))) { + use_accelerator_buffer = true; + } + /* since we want to overlap 2 iterations, define the bytes_per_cycle to be half of what + the user requested */ + bytes_per_cycle = bytes_per_cycle/2; + + /************************************************************************** + ** 1.
Decode user buffer into an iovec + **************************************************************************/ + ret = mca_common_ompio_decode_datatype ((struct ompio_file_t *) fh, + datatype, count, buf, &max_data, + fh->f_mem_convertor, &decoded_iov, + &iov_count); + if (OMPI_SUCCESS != ret){ + goto exit; + } + + if (MPI_STATUS_IGNORE != status) { + status->_ucount = max_data; + } + + ret = mca_fcoll_vulcan_get_configuration (fh, vulcan_num_io_procs, max_data); + if (OMPI_SUCCESS != ret){ + goto exit; + } + opal_output_verbose(10, ompi_fcoll_base_framework.framework_output, + "Using %d aggregators for the read_all operation \n", fh->f_num_aggrs); + + aggr_data = (mca_io_ompio_aggregator_data **) malloc (fh->f_num_aggrs * + sizeof(mca_io_ompio_aggregator_data*)); + if (NULL == aggr_data) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + for (i = 0; i < fh->f_num_aggrs; i++) { + // At this point we know the number of aggregators. If there is a correlation between + // number of aggregators and number of IO nodes, we know how many aggr_data arrays we need + // to allocate. + aggr_data[i] = (mca_io_ompio_aggregator_data *) calloc (1, sizeof(mca_io_ompio_aggregator_data)); + aggr_data[i]->procs_per_group = fh->f_procs_per_group; + aggr_data[i]->procs_in_group = fh->f_procs_in_group; + aggr_data[i]->comm = fh->f_comm; + // Identify if the process is an aggregator. + // If so, aggr_index would be its index in "aggr_data" and "aggregators" arrays. + if (fh->f_aggr_list[i] == fh->f_rank) { + aggr_index = i; + } + } + + /********************************************************************* + *** 2. Generate the local offsets/lengths array corresponding to + *** this read operation + ********************************************************************/ + ret = fh->f_generate_current_file_view ((struct ompio_file_t *) fh, + max_data, &local_iov_array, + &local_count); + if (ret != OMPI_SUCCESS) { + goto exit; + } + + /************************************************************************* + ** 2b. Separate the local_iov_array entries based on the number of aggregators + *************************************************************************/ + // Modifications for the even distribution: + long domain_size; + ret = mca_fcoll_vulcan_minmax (fh, local_iov_array, local_count, fh->f_num_aggrs, &domain_size); + + // broken_iov_arrays[0] contains broken_counts[0] entries to aggregator 0, + // broken_iov_arrays[1] contains broken_counts[1] entries to aggregator 1, etc. + ret = mca_fcoll_vulcan_break_file_view (decoded_iov, iov_count, + local_iov_array, local_count, + &broken_decoded_iovs, &broken_iov_counts, + &broken_iov_arrays, &broken_counts, + &broken_total_lengths, + fh->f_num_aggrs, domain_size); + + /************************************************************************** + ** 3. Determine the total amount of data to be read and no. 
of cycles + **************************************************************************/ +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + start_comm_time = MPI_Wtime(); +#endif + ret = fh->f_comm->c_coll->coll_allreduce (MPI_IN_PLACE, broken_total_lengths, + fh->f_num_aggrs, MPI_LONG, MPI_SUM, + fh->f_comm, + fh->f_comm->c_coll->coll_allreduce_module); + if (OMPI_SUCCESS != ret) { + goto exit; + } + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + end_comm_time = MPI_Wtime(); + comm_time += (end_comm_time - start_comm_time); +#endif + + cycles=0; + for (i = 0; i < fh->f_num_aggrs; i++) { +#if DEBUG_ON + printf("%d: Overall broken_total_lengths[%d] = %ld\n", fh->f_rank, i, broken_total_lengths[i]); +#endif + if (ceil((double)broken_total_lengths[i]/bytes_per_cycle) > cycles) { + cycles = ceil((double)broken_total_lengths[i]/bytes_per_cycle); + } + } + + result_counts = (int *) malloc (fh->f_num_aggrs * fh->f_procs_per_group * sizeof(int)); + if (NULL == result_counts) { + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + start_comm_time = MPI_Wtime(); +#endif + ret = fh->f_comm->c_coll->coll_allgather (broken_counts, fh->f_num_aggrs, MPI_INT, + result_counts, fh->f_num_aggrs, MPI_INT, + fh->f_comm, + fh->f_comm->c_coll->coll_allgather_module); + if (OMPI_SUCCESS != ret) { + goto exit; + } +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + end_comm_time = MPI_Wtime(); + comm_time += (end_comm_time - start_comm_time); +#endif + + /************************************************************* + *** 4. Allgather the offset/lengths array from all processes + *************************************************************/ + for (i = 0; i < fh->f_num_aggrs; i++) { + aggr_data[i]->total_bytes = broken_total_lengths[i]; + aggr_data[i]->decoded_iov = broken_decoded_iovs[i]; + aggr_data[i]->fview_count = (size_t *)malloc (fh->f_procs_per_group * sizeof (size_t)); + if (NULL == aggr_data[i]->fview_count) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + for (j = 0; j < fh->f_procs_per_group; j++) { + aggr_data[i]->fview_count[j] = result_counts[fh->f_num_aggrs*j+i]; + } + + displs = (ptrdiff_t *)malloc (fh->f_procs_per_group * sizeof (ptrdiff_t)); + if (NULL == displs) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + displs[0] = 0; + total_fview_count = (uint32_t) aggr_data[i]->fview_count[0]; + for (j = 1 ; j < fh->f_procs_per_group ; j++) { + total_fview_count += aggr_data[i]->fview_count[j]; + displs[j] = displs[j-1] + aggr_data[i]->fview_count[j-1]; + } + +#if DEBUG_ON + printf("total_fview_count : %d\n", total_fview_count); + if (fh->f_aggr_list[i] == fh->f_rank) { + for (j=0 ; j<fh->f_procs_per_group ; j++) { + printf ("%d: PROCESS: %d ELEMENTS: %ld DISPLS: %ld\n", + fh->f_rank, j, + aggr_data[i]->fview_count[j], + displs[j]); + } + } +#endif + + /* allocate the global iovec */ + if (0 != total_fview_count) { + aggr_data[i]->global_iov_array = (struct iovec*) malloc (total_fview_count * + sizeof(struct iovec)); + if (NULL == aggr_data[i]->global_iov_array) { + opal_output(1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + } + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + start_comm_time = MPI_Wtime(); +#endif + OMPI_COUNT_ARRAY_INIT(&fview_count_desc, aggr_data[i]->fview_count); + OMPI_DISP_ARRAY_INIT(&displs_desc, displs); + ret = fh->f_comm->c_coll->coll_allgatherv (broken_iov_arrays[i], + broken_counts[i], + fh->f_iov_type, + aggr_data[i]->global_iov_array, + fview_count_desc,
displs_desc, + fh->f_iov_type, + fh->f_comm, + fh->f_comm->c_coll->coll_allgatherv_module ); + if (OMPI_SUCCESS != ret) { + goto exit; + } + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + end_comm_time = MPI_Wtime(); + comm_time += (end_comm_time - start_comm_time); +#endif + + /**************************************************************************************** + *** 5. Sort the global offset/lengths list based on the offsets. + *** The result of the sort operation is the 'sorted', an integer array, + *** which contains the indexes of the global_iov_array based on the offset. + *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset + *** in the file, and that one is followed by global_iov_array[z].offset, than + *** sorted[0] = x, sorted[1]=y and sorted[2]=z; + ******************************************************************************************/ + if (0 != total_fview_count) { + aggr_data[i]->sorted = (int *)malloc (total_fview_count * sizeof(int)); + if (NULL == aggr_data[i]->sorted) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + ompi_fcoll_base_sort_iovec (aggr_data[i]->global_iov_array, total_fview_count, + aggr_data[i]->sorted); + } + + if (NULL != local_iov_array) { + free(local_iov_array); + local_iov_array = NULL; + } + + if (NULL != displs) { + free(displs); + displs=NULL; + } + +#if DEBUG_ON + if (fh->f_aggr_list[i] == fh->f_rank) { + uint32_t tv=0; + for (tv = 0 ; tv < total_fview_count ; tv++) { + printf("%d: OFFSET: %lu LENGTH: %ld\n", + fh->f_rank, + (uint64_t)aggr_data[i]->global_iov_array[aggr_data[i]->sorted[tv]].iov_base, + aggr_data[i]->global_iov_array[aggr_data[i]->sorted[tv]].iov_len); + } + } +#endif + /************************************************************* + *** 6. 
Determine the number of cycles required to execute this + *** operation + *************************************************************/ + aggr_data[i]->bytes_per_cycle = bytes_per_cycle; + + if (fh->f_aggr_list[i] == fh->f_rank) { + aggr_data[i]->disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int)); + if (NULL == aggr_data[i]->disp_index) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + aggr_data[i]->max_disp_index = (int *)calloc (fh->f_procs_per_group, sizeof (int)); + if (NULL == aggr_data[i]->max_disp_index) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + aggr_data[i]->blocklen_per_process = (int **)calloc (fh->f_procs_per_group, sizeof (int*)); + if (NULL == aggr_data[i]->blocklen_per_process) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + aggr_data[i]->displs_per_process = (MPI_Aint **)calloc (fh->f_procs_per_group, sizeof (MPI_Aint*)); + if (NULL == aggr_data[i]->displs_per_process) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + if (use_accelerator_buffer) { + opal_output_verbose(10, ompi_fcoll_base_framework.framework_output, + "Allocating GPU device buffer for aggregation\n"); + ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&aggr_data[i]->global_buf, + bytes_per_cycle); + if (OPAL_SUCCESS != ret) { + opal_output(1, "Could not allocate accelerator memory"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&aggr_data[i]->prev_global_buf, + bytes_per_cycle); + if (OPAL_SUCCESS != ret) { + opal_output(1, "Could not allocate accelerator memory"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + } else { + aggr_data[i]->global_buf = (char *) malloc (bytes_per_cycle); + aggr_data[i]->prev_global_buf = (char *) malloc (bytes_per_cycle); + if (NULL == aggr_data[i]->global_buf || NULL == aggr_data[i]->prev_global_buf){ + opal_output(1, "OUT OF MEMORY"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + } + + aggr_data[i]->recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * + sizeof(ompi_datatype_t *)); + aggr_data[i]->prev_recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * + sizeof(ompi_datatype_t *)); + if (NULL == aggr_data[i]->recvtype || NULL == aggr_data[i]->prev_recvtype) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + for(l=0;l<fh->f_procs_per_group;l++){ + aggr_data[i]->recvtype[l] = MPI_DATATYPE_NULL; + aggr_data[i]->prev_recvtype[l] = MPI_DATATYPE_NULL; + } + } + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + start_exch = MPI_Wtime(); +#endif + } + + reqs = (ompi_request_t **)malloc ((fh->f_procs_per_group + 1 )*fh->f_num_aggrs *sizeof(ompi_request_t *)); + if (NULL == reqs) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + for (l = 0, i = 0; i < fh->f_num_aggrs; i++) { + for (j=0; j< (fh->f_procs_per_group+1); j++) { + reqs[l] = MPI_REQUEST_NULL; + l++; + } + } + + if( (1 == mca_fcoll_vulcan_async_io) || + ( (0 == mca_fcoll_vulcan_async_io) && (NULL != fh->f_fbtl->fbtl_ipreadv) && (2 < cycles))) { + read_sync_type = 1; + } + + if (cycles > 0) { + if (NOT_AGGR_INDEX != aggr_index) { + // Register progress function that should be used by ompi_request_wait + mca_common_ompio_register_progress (); + } +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + start_read_time = MPI_Wtime(); +#endif +
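What follows is the double-buffered pipeline this whole setup exists for: the read of cycle 0 is primed outside the main loop, then every iteration posts the shuffle of cycle index-1 while the read of cycle index proceeds, with SWAP_AGGR_POINTERS exchanging the global_buf/prev_global_buf pair between the two stages. Reduced to its control flow (read_cycle/shuffle_cycle are hypothetical stand-ins for read_init/shuffle_init that only log the schedule):

    #include <stdio.h>

    /* hypothetical stand-ins for read_init/shuffle_init */
    static void read_cycle(int i, int buf)    { printf("read    cycle %d -> buf %d\n", i, buf); }
    static void shuffle_cycle(int i, int buf) { printf("shuffle cycle %d <- buf %d\n", i, buf); }

    int main(void)
    {
        int cycles = 4, cur = 0, prev = 1, tmp;

        if (cycles > 0) read_cycle(0, cur);        /* prime the pipeline */
        for (int i = 1; i < cycles; i++) {
            shuffle_cycle(i - 1, cur);             /* scatter the data read last cycle */
            tmp = cur; cur = prev; prev = tmp;     /* SWAP_AGGR_POINTERS equivalent */
            read_cycle(i, cur);                    /* overlaps with the shuffle above */
        }
        if (cycles > 0) shuffle_cycle(cycles - 1, cur);  /* drain the final cycle */
        return 0;
    }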
for (i = 0; i < fh->f_num_aggrs; i++) { + ret = read_init (fh, 0, cycles, fh->f_aggr_list[i], fh->f_rank, + aggr_data[i], read_sync_type, &req_tmp, + use_accelerator_buffer); + if (OMPI_SUCCESS != ret) { + goto exit; + } + if (fh->f_aggr_list[i] == fh->f_rank) { + req_iread = req_tmp; + } + } + + if (NOT_AGGR_INDEX != aggr_index) { + ret = ompi_request_wait(&req_iread, MPI_STATUS_IGNORE); + if (OMPI_SUCCESS != ret){ + goto exit; + } + } +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + end_read_time = MPI_Wtime(); + read_time += end_read_time - start_read_time; +#endif + } + + for (index = 1; index < cycles; index++) { + for (i = 0; i < fh->f_num_aggrs; i++) { + ret = shuffle_init (index-1, cycles, fh->f_aggr_list[i], fh->f_rank, aggr_data[i], + &reqs[i*(fh->f_procs_per_group + 1)] ); + if (OMPI_SUCCESS != ret) { + goto exit; + } + } + + SWAP_AGGR_POINTERS(aggr_data, fh->f_num_aggrs); +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + start_read_time = MPI_Wtime(); +#endif + for (i = 0; i < fh->f_num_aggrs; i++) { + ret = read_init (fh, index, cycles, fh->f_aggr_list[i], fh->f_rank, + aggr_data[i], read_sync_type, + &req_tmp, use_accelerator_buffer); + if (OMPI_SUCCESS != ret){ + goto exit; + } + if (fh->f_aggr_list[i] == fh->f_rank) { + req_iread = req_tmp; + } + } +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + end_read_time = MPI_Wtime(); + read_time += end_read_time - start_read_time; +#endif + ret = ompi_request_wait_all ((fh->f_procs_per_group + 1 )*fh->f_num_aggrs, + reqs, MPI_STATUS_IGNORE); + if (OMPI_SUCCESS != ret){ + goto exit; + } + + if (NOT_AGGR_INDEX != aggr_index) { + ret = ompi_request_wait (&req_iread, MPI_STATUS_IGNORE); + if (OMPI_SUCCESS != ret){ + goto exit; + } + } + } /* end for (index = 1; index < cycles; index++) */ + + if (cycles > 0) { + for (i = 0; i < fh->f_num_aggrs; i++) { + ret = shuffle_init (index-1, cycles, fh->f_aggr_list[i], fh->f_rank, aggr_data[i], + &reqs[i*(fh->f_procs_per_group + 1)] ); + if (OMPI_SUCCESS != ret) { + goto exit; + } + } + ret = ompi_request_wait_all ((fh->f_procs_per_group + 1 )*fh->f_num_aggrs, + reqs, MPI_STATUS_IGNORE); + if (OMPI_SUCCESS != ret){ + goto exit; + } + } + +#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN + end_exch = MPI_Wtime(); + exch_read += end_exch - start_exch; + nentry.time[0] = read_time; + nentry.time[1] = comm_time; + nentry.time[2] = exch_read; + nentry.aggregator = 0; + for ( i=0; i<fh->f_num_aggrs; i++ ) { + if (fh->f_aggr_list[i] == fh->f_rank) + nentry.aggregator = 1; + } + nentry.nprocs_for_coll = fh->f_num_aggrs; + if (!mca_common_ompio_full_print_queue(fh->f_coll_read_time)){ + mca_common_ompio_register_print_entry(fh->f_coll_read_time, + nentry); + } +#endif + +exit : + if (NULL != aggr_data) { + + for (i = 0; i < fh->f_num_aggrs; i++) { + if (fh->f_aggr_list[i] == fh->f_rank) { + if (NULL != aggr_data[i]->recvtype){ + for (j = 0; j < aggr_data[i]->procs_per_group; j++) { + if (MPI_DATATYPE_NULL != aggr_data[i]->recvtype[j]) { + ompi_datatype_destroy(&aggr_data[i]->recvtype[j]); + } + if (MPI_DATATYPE_NULL != aggr_data[i]->prev_recvtype[j]) { + ompi_datatype_destroy(&aggr_data[i]->prev_recvtype[j]); + } + } + free(aggr_data[i]->recvtype); + free(aggr_data[i]->prev_recvtype); + } + + free (aggr_data[i]->disp_index); + free (aggr_data[i]->max_disp_index); + if (use_accelerator_buffer) { + opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->global_buf); + opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->prev_global_buf); + } else { + free (aggr_data[i]->global_buf); + free
(aggr_data[i]->prev_global_buf); + } + for (l = 0;l < aggr_data[i]->procs_per_group; l++) { + free (aggr_data[i]->blocklen_per_process[l]); + free (aggr_data[i]->displs_per_process[l]); + } + + free (aggr_data[i]->blocklen_per_process); + free (aggr_data[i]->displs_per_process); + } + free (aggr_data[i]->sorted); + free (aggr_data[i]->global_iov_array); + free (aggr_data[i]->fview_count); + free (aggr_data[i]->decoded_iov); + + free (aggr_data[i]); + } + free (aggr_data); + } + free(displs); + free(decoded_iov); + free(broken_counts); + free(broken_total_lengths); + free(broken_iov_counts); + free(broken_decoded_iovs); // decoded_iov arrays[i] were freed as aggr_data[i]->decoded_iov; + if (NULL != broken_iov_arrays) { + for (i = 0; i < fh->f_num_aggrs; i++) { + free(broken_iov_arrays[i]); + } + } + free(broken_iov_arrays); + free(fh->f_procs_in_group); + free(fh->f_aggr_list); + fh->f_procs_in_group=NULL; + fh->f_procs_per_group=0; + fh->f_aggr_list=NULL; + free(result_counts); + free(reqs); + + return ret; +} + +static int read_init (ompio_file_t *fh, int index, int cycles, int aggregator, int rank, + mca_io_ompio_aggregator_data *data, + int read_syncType, ompi_request_t **request, + bool is_accelerator_buffer) +{ + int ret = OMPI_SUCCESS; + ssize_t ret_temp = 0; + mca_ompio_request_t *ompio_req = NULL; + int i, j, l; + int entries_per_aggregator=0; + mca_io_ompio_local_io_array *file_offsets_for_agg=NULL; + MPI_Aint *memory_displacements=NULL; + int* blocklength_proc=NULL; + ptrdiff_t* displs_proc=NULL; + int *sorted_file_offsets=NULL; + + /********************************************************************** + *** 7a. Getting ready for next cycle: initializing and freeing buffers + **********************************************************************/ + data->bytes_sent = 0; + + if (aggregator == rank) { + if (NULL != data->recvtype){ + for (i = 0; i < data->procs_per_group; i++) { + if (MPI_DATATYPE_NULL != data->recvtype[i]) { + ompi_datatype_destroy(&data->recvtype[i]); + data->recvtype[i] = MPI_DATATYPE_NULL; + } + } + } + + for (l = 0; l < data->procs_per_group; l++) { + data->disp_index[l] = 0; + + if (data->max_disp_index[l] == 0) { + data->blocklen_per_process[l] = (int *) calloc (INIT_LEN, sizeof(int)); + data->displs_per_process[l] = (MPI_Aint *) calloc (INIT_LEN, sizeof(MPI_Aint)); + if (NULL == data->displs_per_process[l] || NULL == data->blocklen_per_process[l]){ + opal_output (1, "OUT OF MEMORY for displs\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + data->max_disp_index[l] = INIT_LEN; + } else { + memset (data->blocklen_per_process[l], 0, data->max_disp_index[l]*sizeof(int)); + memset (data->displs_per_process[l], 0, data->max_disp_index[l]*sizeof(MPI_Aint)); + } + } + } /* rank == aggregator */ + + /************************************************************************** + *** 7b. 
Determine the number of bytes to be actually read in this cycle + **************************************************************************/ + int local_cycles= ceil((double)data->total_bytes / data->bytes_per_cycle); + if (index < (local_cycles -1)) { + data->bytes_to_write_in_cycle = data->bytes_per_cycle; + } else if ( index == (local_cycles -1)) { + data->bytes_to_write_in_cycle = data->total_bytes - data->bytes_per_cycle*index; + } else { + data->bytes_to_write_in_cycle = 0; + } + data->bytes_to_write = data->bytes_to_write_in_cycle; + +#if DEBUG_ON + if (aggregator == rank) { + printf ("****%d: CYCLE %d Bytes %d**********\n", + rank, index, data->bytes_to_write_in_cycle); + } +#endif + + /***************************************************************** + *** 7c. Calculate how much data will be sent to each process in + *** this cycle + *****************************************************************/ + mca_fcoll_vulcan_calc_blocklen_disps(data, aggregator, rank, &data->bytes_sent); + + /************************************************************************* + *** 7d. Calculate the displacement + *************************************************************************/ + if (rank == aggregator) { + for (i = 0; i < data->procs_per_group; i++){ + for (j = 0; j < data->disp_index[i]; j++){ + if (data->blocklen_per_process[i][j] > 0) + entries_per_aggregator++ ; + } + } + } +#if DEBUG_ON + if (aggregator == rank) { + printf("%d : Entries per aggregator : %d\n", rank, entries_per_aggregator); + } +#endif + + if (entries_per_aggregator > 0) { + file_offsets_for_agg = (mca_io_ompio_local_io_array *) malloc (entries_per_aggregator + * sizeof(mca_io_ompio_local_io_array)); + memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); + sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); + if (NULL == file_offsets_for_agg || NULL == memory_displacements || + NULL == sorted_file_offsets) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + ret = mca_fcoll_vulcan_calc_file_offsets(data, file_offsets_for_agg, sorted_file_offsets, + memory_displacements, entries_per_aggregator, + rank, index); + if (OMPI_SUCCESS != ret) { + goto exit; + } + + /********************************************************** + *** 7f. 
Create the io array + *********************************************************/ + fh->f_io_array = (mca_common_ompio_io_array_t *) malloc (entries_per_aggregator + * sizeof (mca_common_ompio_io_array_t)); + if (NULL == fh->f_io_array) { + opal_output(1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + mca_fcoll_vulcan_calc_io_array(fh->f_io_array, &fh->f_num_of_io_entries, entries_per_aggregator, + (char*)data->global_buf, file_offsets_for_agg, sorted_file_offsets, + memory_displacements, rank); + } + + if (rank == aggregator && fh->f_num_of_io_entries) { + mca_common_ompio_request_alloc (&ompio_req, MCA_OMPIO_REQUEST_READ); + + if (1 == read_syncType) { + if (is_accelerator_buffer) { + ret = mca_common_ompio_file_iread_pregen(fh, (ompi_request_t *) ompio_req); + if (0 > ret) { + opal_output (1, "vulcan_read_all: mca_common_ompio_iread_pregen failed\n"); + ompio_req->req_ompi.req_status.MPI_ERROR = ret; + ompio_req->req_ompi.req_status._ucount = 0; + } + } else { + ret = fh->f_fbtl->fbtl_ipreadv(fh, (ompi_request_t *) ompio_req); + if (0 > ret) { + opal_output (1, "vulcan_read_all: fbtl_ipreadv failed\n"); + ompio_req->req_ompi.req_status.MPI_ERROR = ret; + ompio_req->req_ompi.req_status._ucount = 0; + } + } + } + else { + ret_temp = fh->f_fbtl->fbtl_preadv(fh); + if (0 > ret_temp) { + opal_output (1, "vulcan_read_all: fbtl_preadv failed\n"); + ret = ret_temp; + ret_temp = 0; + } + + ompio_req->req_ompi.req_status.MPI_ERROR = ret; + ompio_req->req_ompi.req_status._ucount = ret_temp; + ompi_request_complete (&ompio_req->req_ompi, false); + } + + free(fh->f_io_array); + } + +#if DEBUG_ON + printf("************Cycle: %d, Aggregator: %d ***************\n", + index, rank); + for (i = 0; i < data->procs_per_group; i++) { + for (j = 0; j < data->disp_index[i]; j++) { + if (data->blocklen_per_process[i][j] > 0) { + printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", + data->procs_in_group[i],j, + data->blocklen_per_process[i][j],j, + data->displs_per_process[i][j], rank); + } + } + } +#endif + +exit: + free(sorted_file_offsets); + free(file_offsets_for_agg); + free(memory_displacements); + free(blocklength_proc); + free(displs_proc); + + fh->f_io_array = NULL; + fh->f_num_of_io_entries = 0; + + *request = (ompi_request_t *) ompio_req; + return ret; } +static int shuffle_init (int index, int cycles, int aggregator, int rank, mca_io_ompio_aggregator_data *data, + ompi_request_t **reqs) +{ + int i, ret = OMPI_SUCCESS; + int* blocklength_proc=NULL; + ptrdiff_t* displs_proc=NULL; + + /************************************************************************* + *** 7e. 
Perform the actual communication + *************************************************************************/ + if (aggregator == rank ) { + for (i = 0; i < data->procs_per_group; i++) { + size_t datatype_size; + reqs[i] = MPI_REQUEST_NULL; + if (0 < data->disp_index[i]) { + ompi_datatype_create_hindexed (data->disp_index[i], + data->blocklen_per_process[i], + data->displs_per_process[i], + MPI_BYTE, + &data->recvtype[i]); + ompi_datatype_commit (&data->recvtype[i]); + opal_datatype_type_size (&data->recvtype[i]->super, &datatype_size); + if (datatype_size){ + ret = MCA_PML_CALL(isend(data->global_buf, + 1, data->recvtype[i], + data->procs_in_group[i], + FCOLL_VULCAN_SHUFFLE_TAG+index, + MCA_PML_BASE_SEND_STANDARD, + data->comm, &reqs[i])); + if (OMPI_SUCCESS != ret){ + goto exit; + } + } + } + } + // } /* end if (entries_per_aggr > 0 ) */ + }/* end if (aggregator == rank ) */ + + reqs[data->procs_per_group] = MPI_REQUEST_NULL; + if (data->bytes_sent) { + size_t remaining = data->bytes_sent; + int block_index = -1; + int blocklength_size = INIT_LEN; + + ptrdiff_t recv_mem_address = 0; + ompi_datatype_t *newType = MPI_DATATYPE_NULL; + blocklength_proc = (int *) calloc (blocklength_size, sizeof (int)); + displs_proc = (ptrdiff_t *) calloc (blocklength_size, sizeof (ptrdiff_t)); + + if (NULL == blocklength_proc || NULL == displs_proc ) { + opal_output (1, "OUT OF MEMORY\n"); + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto exit; + } + + while (remaining) { + block_index++; + + if(0 == block_index) { + recv_mem_address = (ptrdiff_t) (data->decoded_iov[data->iov_index].iov_base) + + data->current_position; + } + else { + // Reallocate more memory if blocklength_size is not enough + if(0 == block_index % INIT_LEN) { + blocklength_size += INIT_LEN; + blocklength_proc = (int *) realloc(blocklength_proc, blocklength_size * sizeof(int)); + displs_proc = (ptrdiff_t *) realloc(displs_proc, blocklength_size * sizeof(ptrdiff_t)); + } + displs_proc[block_index] = (ptrdiff_t) (data->decoded_iov[data->iov_index].iov_base) + + data->current_position - recv_mem_address; + } + + if (remaining >= + (data->decoded_iov[data->iov_index].iov_len - data->current_position)) { + + blocklength_proc[block_index] = data->decoded_iov[data->iov_index].iov_len - + data->current_position; + remaining = remaining - (data->decoded_iov[data->iov_index].iov_len - + data->current_position); + data->iov_index = data->iov_index + 1; + data->current_position = 0; + } else { + blocklength_proc[block_index] = remaining; + data->current_position += remaining; + remaining = 0; + } + } + + data->total_bytes_written += data->bytes_sent; + + if (0 <= block_index) { + ompi_datatype_create_hindexed (block_index+1, + blocklength_proc, + displs_proc, + MPI_BYTE, + &newType); + ompi_datatype_commit (&newType); + + ret = MCA_PML_CALL(irecv((char *)recv_mem_address, + 1, + newType, + aggregator, + FCOLL_VULCAN_SHUFFLE_TAG+index, + data->comm, + &reqs[data->procs_per_group])); + if (MPI_DATATYPE_NULL != newType) { + ompi_datatype_destroy(&newType); + } + if (OMPI_SUCCESS != ret){ + goto exit; + } + } + } +exit: + return ret; +} diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c b/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c index 5f89fba8d01..b6e9be6d2ca 100644 --- a/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c +++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * 
University Research and Technology @@ -25,6 +26,7 @@ #include "ompi_config.h" #include "fcoll_vulcan.h" +#include "fcoll_vulcan_internal.h" #include "mpi.h" #include "ompi/constants.h" @@ -42,85 +44,16 @@ #define DEBUG_ON 0 #define NOT_AGGR_INDEX -1 -/*Used for loading file-offsets per aggregator*/ -typedef struct mca_io_ompio_local_io_array{ - OMPI_MPI_OFFSET_TYPE offset; - MPI_Aint length; - int process_id; -}mca_io_ompio_local_io_array; - -typedef struct mca_io_ompio_aggregator_data { - int *disp_index, *sorted, n; - size_t *fview_count; - int *max_disp_index; - int **blocklen_per_process; - MPI_Aint **displs_per_process, total_bytes, bytes_per_cycle, total_bytes_written; - MPI_Comm comm; - char *buf, *global_buf, *prev_global_buf; - ompi_datatype_t **recvtype, **prev_recvtype; - struct iovec *global_iov_array; - int current_index, current_position; - int bytes_to_write_in_cycle, bytes_remaining, procs_per_group; - int *procs_in_group, iov_index; - int bytes_sent, prev_bytes_sent; - struct iovec *decoded_iov; - int bytes_to_write, prev_bytes_to_write; - mca_common_ompio_io_array_t *io_array, *prev_io_array; - int num_io_entries, prev_num_io_entries; -} mca_io_ompio_aggregator_data; - - -#define SWAP_REQUESTS(_r1,_r2) { \ - ompi_request_t **_t=_r1; \ - _r1=_r2; \ - _r2=_t;} - -#define SWAP_AGGR_POINTERS(_aggr,_num) { \ - int _i; \ - char *_t; \ - for (_i=0; _i<_num; _i++ ) { \ - _aggr[_i]->prev_io_array=_aggr[_i]->io_array; \ - _aggr[_i]->prev_num_io_entries=_aggr[_i]->num_io_entries; \ - _aggr[_i]->prev_bytes_sent=_aggr[_i]->bytes_sent; \ - _aggr[_i]->prev_bytes_to_write=_aggr[_i]->bytes_to_write; \ - _t=_aggr[_i]->prev_global_buf; \ - _aggr[_i]->prev_global_buf=_aggr[_i]->global_buf; \ - _aggr[_i]->global_buf=_t; \ - _t=(char *)_aggr[_i]->recvtype; \ - _aggr[_i]->recvtype=_aggr[_i]->prev_recvtype; \ - _aggr[_i]->prev_recvtype=(ompi_datatype_t **)_t; } \ -} -static int shuffle_init ( int index, int cycles, int aggregator, int rank, - mca_io_ompio_aggregator_data *data, - ompi_request_t **reqs ); +static int shuffle_init (int index, int num_cycles, int aggregator, int rank, + mca_io_ompio_aggregator_data *data, ompi_request_t **reqs); + static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data, - int write_chunksize, int write_synchType, ompi_request_t **request, + int write_syncType, ompi_request_t **request, bool is_accelerator_buffer); -int mca_fcoll_vulcan_break_file_view ( struct iovec *decoded_iov, int iov_count, - struct iovec *local_iov_array, int local_count, - struct iovec ***broken_decoded_iovs, int **broken_iov_counts, - struct iovec ***broken_iov_arrays, int **broken_counts, - MPI_Aint **broken_total_lengths, - int stripe_count, size_t stripe_size); - - -int mca_fcoll_vulcan_get_configuration (ompio_file_t *fh, int num_io_procs, - int num_groups, size_t max_data); - static int local_heap_sort (mca_io_ompio_local_io_array *io_array, - int num_entries, - int *sorted); - -int mca_fcoll_vulcan_split_iov_array ( ompio_file_t *fh, mca_common_ompio_io_array_t *work_array, - int num_entries, int *last_array_pos, int *last_pos_in_field, - int chunk_size ); - - -static int mca_fcoll_vulcan_minmax ( ompio_file_t *fh, struct iovec *iov, int iov_count, int num_aggregators, - long *new_stripe_size); - + int num_entries, int *sorted); int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, const void *buf, @@ -143,7 +76,6 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, ptrdiff_t *displs = NULL; int vulcan_num_io_procs; size_t 
max_data = 0; - MPI_Aint *total_bytes_per_process = NULL; struct iovec **broken_iov_arrays=NULL; struct iovec **broken_decoded_iovs=NULL; @@ -153,7 +85,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, int aggr_index = NOT_AGGR_INDEX; int write_synch_type = 2; - int write_chunksize, *result_counts=NULL; + int *result_counts=NULL; ompi_count_array_t fview_count_desc; ompi_disp_array_t displs_desc; @@ -186,13 +118,12 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, mca_common_ompio_check_gpu_buf (fh, buf, &is_gpu, &is_managed); if (is_gpu && !is_managed && - fh->f_get_mca_parameter_value ("use_accelerator_buffers", strlen("use_accelerator_buffers"))) { - use_accelerator_buffer = true; + fh->f_get_mca_parameter_value ("use_accelerator_buffers", strlen("use_accelerator_buffers"))) { + use_accelerator_buffer = true; } /* since we want to overlap 2 iterations, define the bytes_per_cycle to be half of what the user requested */ bytes_per_cycle =bytes_per_cycle/2; - write_chunksize = bytes_per_cycle; ret = mca_common_ompio_decode_datatype ((struct ompio_file_t *) fh, datatype, @@ -207,14 +138,15 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, } if ( MPI_STATUS_IGNORE != status ) { - status->_ucount = max_data; + status->_ucount = max_data; } - - ret = mca_fcoll_vulcan_get_configuration (fh, vulcan_num_io_procs, mca_fcoll_vulcan_num_groups, max_data); + ret = mca_fcoll_vulcan_get_configuration (fh, vulcan_num_io_procs, max_data); if (OMPI_SUCCESS != ret){ - goto exit; + goto exit; } + opal_output_verbose(10, ompi_fcoll_base_framework.framework_output, + "Using %d aggregators for the write_all operation \n", fh->f_num_aggrs); aggr_data = (mca_io_ompio_aggregator_data **) malloc ( fh->f_num_aggrs * sizeof(mca_io_ompio_aggregator_data*)); @@ -227,7 +159,6 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, aggr_data[i]->procs_per_group = fh->f_procs_per_group; aggr_data[i]->procs_in_group = fh->f_procs_in_group; aggr_data[i]->comm = fh->f_comm; - aggr_data[i]->buf = (char *)buf; // should not be used in the new version. // Identify if the process is an aggregator. // If so, aggr_index would be its index in "aggr_data" and "aggregators" arrays. 
if(fh->f_aggr_list[i] == fh->f_rank) { @@ -240,11 +171,11 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, *** this write operation ********************************************************************/ ret = fh->f_generate_current_file_view( (struct ompio_file_t *) fh, - max_data, - &local_iov_array, - &local_count); + max_data, + &local_iov_array, + &local_count); if (ret != OMPI_SUCCESS){ - goto exit; + goto exit; } /************************************************************************* @@ -270,52 +201,15 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif - if ( 1 == mca_fcoll_vulcan_num_groups ) { - ret = fh->f_comm->c_coll->coll_allreduce (MPI_IN_PLACE, - broken_total_lengths, - fh->f_num_aggrs, - MPI_LONG, - MPI_SUM, - fh->f_comm, - fh->f_comm->c_coll->coll_allreduce_module); - if( OMPI_SUCCESS != ret){ - goto exit; - } - - } - else { - total_bytes_per_process = (MPI_Aint*)malloc - (fh->f_num_aggrs * fh->f_procs_per_group*sizeof(MPI_Aint)); - if (NULL == total_bytes_per_process) { - opal_output (1, "OUT OF MEMORY\n"); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit; - } - - ret = ompi_fcoll_base_coll_allgather_array (broken_total_lengths, - fh->f_num_aggrs, - MPI_LONG, - total_bytes_per_process, - fh->f_num_aggrs, - MPI_LONG, - 0, - fh->f_procs_in_group, - fh->f_procs_per_group, - fh->f_comm); - if( OMPI_SUCCESS != ret){ - goto exit; - } - - for ( i=0; i<fh->f_num_aggrs; i++ ) { - broken_total_lengths[i] = 0; - for (j=0 ; j<fh->f_procs_per_group ; j++) { - broken_total_lengths[i] += total_bytes_per_process[j*fh->f_num_aggrs + i]; - } - } - if (NULL != total_bytes_per_process) { - free (total_bytes_per_process); - total_bytes_per_process = NULL; - } + ret = fh->f_comm->c_coll->coll_allreduce (MPI_IN_PLACE, + broken_total_lengths, + fh->f_num_aggrs, + MPI_LONG, + MPI_SUM, + fh->f_comm, + fh->f_comm->c_coll->coll_allreduce_module); + if( OMPI_SUCCESS != ret){ + goto exit; } #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN @@ -342,28 +236,14 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif - if ( 1 == mca_fcoll_vulcan_num_groups ) { - ret = fh->f_comm->c_coll->coll_allgather(broken_counts, - fh->f_num_aggrs, - MPI_INT, - result_counts, - fh->f_num_aggrs, - MPI_INT, - fh->f_comm, - fh->f_comm->c_coll->coll_allgather_module); - } - else { - ret = ompi_fcoll_base_coll_allgather_array (broken_counts, - fh->f_num_aggrs, - MPI_INT, - result_counts, - fh->f_num_aggrs, - MPI_INT, - 0, - fh->f_procs_in_group, - fh->f_procs_per_group, - fh->f_comm); - } + ret = fh->f_comm->c_coll->coll_allgather(broken_counts, + fh->f_num_aggrs, + MPI_INT, + result_counts, + fh->f_num_aggrs, + MPI_INT, + fh->f_comm, + fh->f_comm->c_coll->coll_allgather_module); if( OMPI_SUCCESS != ret){ goto exit; } @@ -428,32 +308,17 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_comm_time = MPI_Wtime(); #endif - if ( 1 == mca_fcoll_vulcan_num_groups ) { - OMPI_COUNT_ARRAY_INIT(&fview_count_desc, aggr_data[i]->fview_count); - OMPI_DISP_ARRAY_INIT(&displs_desc, displs); - ret = fh->f_comm->c_coll->coll_allgatherv (broken_iov_arrays[i], - broken_counts[i], - fh->f_iov_type, - aggr_data[i]->global_iov_array, - fview_count_desc, - displs_desc, - fh->f_iov_type, - fh->f_comm, - fh->f_comm->c_coll->coll_allgatherv_module ); - } - else { - ret = ompi_fcoll_base_coll_allgatherv_array (broken_iov_arrays[i], -
broken_counts[i], - fh->f_iov_type, - aggr_data[i]->global_iov_array, - aggr_data[i]->fview_count, - displs, - fh->f_iov_type, - fh->f_aggr_list[i], - fh->f_procs_in_group, - fh->f_procs_per_group, - fh->f_comm); - } + OMPI_COUNT_ARRAY_INIT(&fview_count_desc, aggr_data[i]->fview_count); + OMPI_DISP_ARRAY_INIT(&displs_desc, displs); + ret = fh->f_comm->c_coll->coll_allgatherv (broken_iov_arrays[i], + broken_counts[i], + fh->f_iov_type, + aggr_data[i]->global_iov_array, + fview_count_desc, + displs_desc, + fh->f_iov_type, + fh->f_comm, + fh->f_comm->c_coll->coll_allgatherv_module ); if (OMPI_SUCCESS != ret){ goto exit; } @@ -539,8 +404,8 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, } if (use_accelerator_buffer) { - opal_output_verbose(10, ompi_fcoll_base_framework.framework_output, - "Allocating GPU device buffer for aggregation\n"); + opal_output_verbose(10, ompi_fcoll_base_framework.framework_output, + "Allocating GPU device buffer for aggregation\n"); ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&aggr_data[i]->global_buf, bytes_per_cycle); if (OPAL_SUCCESS != ret) { @@ -583,10 +448,9 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN start_exch = MPI_Wtime(); #endif - } + } reqs = (ompi_request_t **)malloc ((fh->f_procs_per_group + 1 )*fh->f_num_aggrs *sizeof(ompi_request_t *)); - if ( NULL == reqs ) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; @@ -632,7 +496,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, start_write_time = MPI_Wtime(); #endif ret = write_init (fh, fh->f_aggr_list[aggr_index], aggr_data[aggr_index], - write_chunksize, write_synch_type, &req_iwrite, use_accelerator_buffer); + write_synch_type, &req_iwrite, use_accelerator_buffer); if (OMPI_SUCCESS != ret){ goto exit; } @@ -672,7 +536,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, start_write_time = MPI_Wtime(); #endif ret = write_init (fh, fh->f_aggr_list[aggr_index], aggr_data[aggr_index], - write_chunksize, write_synch_type, &req_iwrite, use_accelerator_buffer); + write_synch_type, &req_iwrite, use_accelerator_buffer); if (OMPI_SUCCESS != ret){ goto exit; } @@ -699,7 +563,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh, nentry.aggregator = 0; for ( i=0; i<fh->f_num_aggrs; i++ ) { if (fh->f_aggr_list[i] == fh->f_rank) - nentry.aggregator = 1; + nentry.aggregator = 1; } nentry.nprocs_for_coll = fh->f_num_aggrs; if (!mca_common_ompio_full_print_queue(fh->f_coll_write_time)){ @@ -707,15 +571,13 @@ nentry); } #endif - - + exit : - + if ( NULL != aggr_data ) { - - for ( i=0; i< fh->f_num_aggrs; i++ ) { + for ( i=0; i< fh->f_num_aggrs; i++ ) { if (fh->f_aggr_list[i] == fh->f_rank) { - if (NULL != aggr_data[i]->recvtype){ + if (NULL != aggr_data[i]->recvtype) { for (j =0; j< aggr_data[i]->procs_per_group; j++) { if ( MPI_DATATYPE_NULL != aggr_data[i]->recvtype[j] ) { ompi_datatype_destroy(&aggr_data[i]->recvtype[j]); @@ -723,26 +585,25 @@ exit : if ( MPI_DATATYPE_NULL != aggr_data[i]->prev_recvtype[j] ) { ompi_datatype_destroy(&aggr_data[i]->prev_recvtype[j]); } - } free(aggr_data[i]->recvtype); free(aggr_data[i]->prev_recvtype); } - + free (aggr_data[i]->disp_index); free (aggr_data[i]->max_disp_index); - if (use_accelerator_buffer) { - opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->global_buf); - opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID,
aggr_data[i]->prev_global_buf); - } else { - free (aggr_data[i]->global_buf); - free (aggr_data[i]->prev_global_buf); - } + if (use_accelerator_buffer) { + opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->global_buf); + opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->prev_global_buf); + } else { + free (aggr_data[i]->global_buf); + free (aggr_data[i]->prev_global_buf); + } for(l=0;l<aggr_data[i]->procs_per_group;l++){ free (aggr_data[i]->blocklen_per_process[l]); free (aggr_data[i]->displs_per_process[l]); } - + free (aggr_data[i]->blocklen_per_process); free (aggr_data[i]->displs_per_process); } @@ -750,7 +611,6 @@ exit : free (aggr_data[i]->global_iov_array); free (aggr_data[i]->fview_count); free (aggr_data[i]->decoded_iov); - free (aggr_data[i]); } free (aggr_data); @@ -774,37 +634,40 @@ exit : fh->f_aggr_list=NULL; free(result_counts); free(reqs); - + return OMPI_SUCCESS; } static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data, - int write_chunksize, - int write_synchType, + int write_syncType, ompi_request_t **request, bool is_accelerator_buffer) { int ret = OMPI_SUCCESS; ssize_t ret_temp = 0; - int last_array_pos = 0; - int last_pos = 0; + int i; mca_ompio_request_t *ompio_req = NULL; mca_common_ompio_request_alloc ( &ompio_req, MCA_OMPIO_REQUEST_WRITE ); if (aggr_data->prev_num_io_entries) { - /* In this case, aggr_data->prev_num_io_entries is always == 1. - Therefore we can write the data of size aggr_data->prev_bytes_to_write in one iteration. - In fact, aggr_data->prev_bytes_to_write <= write_chunksize. - */ - mca_fcoll_vulcan_split_iov_array (fh, aggr_data->prev_io_array, - aggr_data->prev_num_io_entries, - &last_array_pos, &last_pos, - write_chunksize); + fh->f_num_of_io_entries = aggr_data->prev_num_io_entries; + fh->f_io_array = (mca_common_ompio_io_array_t *) malloc (fh->f_num_of_io_entries * + sizeof(mca_common_ompio_io_array_t)); + if ( NULL == fh->f_io_array ){ + opal_output (1,"Could not allocate memory\n"); + return -1; + } + + for (i = 0; i < fh->f_num_of_io_entries; i++) { + fh->f_io_array[i].memory_address = aggr_data->prev_io_array[i].memory_address; + fh->f_io_array[i].offset = aggr_data->prev_io_array[i].offset; + fh->f_io_array[i].length = aggr_data->prev_io_array[i].length; + } - if (1 == write_synchType) { + if (1 == write_syncType) { if (is_accelerator_buffer) { ret = mca_common_ompio_file_iwrite_pregen(fh, (ompi_request_t *) ompio_req); if(0 > ret) { @@ -853,21 +716,16 @@ static int write_init (ompio_file_t *fh, return ret; } -static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_io_ompio_aggregator_data *data, - ompi_request_t **reqs ) +static int shuffle_init (int index, int num_cycles, int aggregator, int rank, + mca_io_ompio_aggregator_data *data, ompi_request_t **reqs) { - int bytes_sent = 0; - int blocks=0, temp_pindex; - int i, j, l, ret; - int entries_per_aggregator=0; + size_t bytes_sent = 0; + int i, j, l; + int ret = OMPI_SUCCESS; + int entries_per_aggregator = 0; mca_io_ompio_local_io_array *file_offsets_for_agg=NULL; int *sorted_file_offsets=NULL; - int temp_index=0; MPI_Aint *memory_displacements=NULL; - int *temp_disp_index=NULL; -#if DEBUG_ON - MPI_Aint global_count = 0; -#endif int* blocklength_proc=NULL; ptrdiff_t* displs_proc=NULL; @@ -879,21 +737,19 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i *** 7a.
Getting ready for next cycle: initializing and freeing buffers **********************************************************************/ if (aggregator == rank) { - if (NULL != data->recvtype){ for (i =0; i< data->procs_per_group; i++) { - if ( MPI_DATATYPE_NULL != data->recvtype[i] ) { + if (MPI_DATATYPE_NULL != data->recvtype[i]) { ompi_datatype_destroy(&data->recvtype[i]); data->recvtype[i] = MPI_DATATYPE_NULL; } } } - - for(l=0;l<data->procs_per_group;l++){ + for(l = 0; l < data->procs_per_group; l++){ data->disp_index[l] = 0; - - if ( data->max_disp_index[l] == 0 ) { + + if (data->max_disp_index[l] == 0) { data->blocklen_per_process[l] = (int *) calloc (INIT_LEN, sizeof(int)); data->displs_per_process[l] = (MPI_Aint *) calloc (INIT_LEN, sizeof(MPI_Aint)); if (NULL == data->displs_per_process[l] || NULL == data->blocklen_per_process[l]){ @@ -902,25 +758,22 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i goto exit; } data->max_disp_index[l] = INIT_LEN; - } - else { - memset ( data->blocklen_per_process[l], 0, data->max_disp_index[l]*sizeof(int) ); - memset ( data->displs_per_process[l], 0, data->max_disp_index[l]*sizeof(MPI_Aint) ); + } else { + memset (data->blocklen_per_process[l], 0, data->max_disp_index[l]*sizeof(int)); + memset (data->displs_per_process[l], 0, data->max_disp_index[l]*sizeof(MPI_Aint)); } } } /* (aggregator == rank */ - + /************************************************************************** *** 7b. Determine the number of bytes to be actually written in this cycle **************************************************************************/ int local_cycles= ceil((double)data->total_bytes / data->bytes_per_cycle); - if ( index < (local_cycles -1) ) { + if (index < (local_cycles -1)) { data->bytes_to_write_in_cycle = data->bytes_per_cycle; - } - else if ( index == (local_cycles -1)) { + } else if (index == (local_cycles -1)) { data->bytes_to_write_in_cycle = data->total_bytes - data->bytes_per_cycle*index ; - } - else { + } else { data->bytes_to_write_in_cycle = 0; } data->bytes_to_write = data->bytes_to_write_in_cycle; @@ -928,309 +781,57 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i #if DEBUG_ON if (aggregator == rank) { printf ("****%d: CYCLE %d Bytes %lld**********\n", - rank, - index, - data->bytes_to_write_in_cycle); + rank, index, data->bytes_to_write_in_cycle); } #endif - /********************************************************** - **Gather the Data from all the processes at the writers ** - *********************************************************/ - -#if DEBUG_ON - printf("bytes_to_write_in_cycle: %ld, cycle : %d\n", data->bytes_to_write_in_cycle, - index); -#endif - + /***************************************************************** *** 7c.
+ /***************************************************************** *** 7c. Calculate how much data will be contributed in this cycle *** by each process *****************************************************************/ - - /* The blocklen and displs calculation only done at aggregators!*/ - while (data->bytes_to_write_in_cycle) { + mca_fcoll_vulcan_calc_blocklen_disps (data, aggregator, rank, &bytes_sent); - /* This next block identifies which process is the holder - ** of the sorted[current_index] element; - */ - blocks = data->fview_count[0]; - for (j=0 ; j<data->procs_per_group ; j++) { - if (data->sorted[data->current_index] < blocks) { - data->n = j; - break; - } - else { - blocks += data->fview_count[j+1]; - } - } - - if (data->bytes_remaining) { - /* Finish up a partially used buffer from the previous cycle */ - - if (data->bytes_remaining <= data->bytes_to_write_in_cycle) { - /* The data fits completely into the block */ - if (aggregator == rank) { - data->blocklen_per_process[data->n][data->disp_index[data->n]] = data->bytes_remaining; - data->displs_per_process[data->n][data->disp_index[data->n]] = - (ptrdiff_t)data->global_iov_array[data->sorted[data->current_index]].iov_base + - (data->global_iov_array[data->sorted[data->current_index]].iov_len - - data->bytes_remaining); - - data->disp_index[data->n] += 1; - - /* In this cases the length is consumed so allocating for - next displacement and blocklength*/ - if ( data->disp_index[data->n] == data->max_disp_index[data->n] ) { - data->max_disp_index[data->n] *= 2; - - data->blocklen_per_process[data->n] = (int *) realloc - ((void *)data->blocklen_per_process[data->n], - (data->max_disp_index[data->n])*sizeof(int)); - data->displs_per_process[data->n] = (MPI_Aint *) realloc - ((void *)data->displs_per_process[data->n], - (data->max_disp_index[data->n])*sizeof(MPI_Aint)); - } - data->blocklen_per_process[data->n][data->disp_index[data->n]] = 0; - data->displs_per_process[data->n][data->disp_index[data->n]] = 0; - - } - if (data->procs_in_group[data->n] == rank) { - bytes_sent += data->bytes_remaining; - } - data->current_index ++; - data->bytes_to_write_in_cycle -= data->bytes_remaining; - data->bytes_remaining = 0; - } - else { - /* the remaining data from the previous cycle is larger than the - data->bytes_to_write_in_cycle, so we have to segment again */ - if (aggregator == rank) { - data->blocklen_per_process[data->n][data->disp_index[data->n]] = data->bytes_to_write_in_cycle; - data->displs_per_process[data->n][data->disp_index[data->n]] = - (ptrdiff_t)data->global_iov_array[data->sorted[data->current_index]].iov_base + - (data->global_iov_array[data->sorted[data->current_index]].iov_len - - data->bytes_remaining); - data->disp_index[data->n] += 1; - } - - if (data->procs_in_group[data->n] == rank) { - bytes_sent += data->bytes_to_write_in_cycle; - } - data->bytes_remaining -= data->bytes_to_write_in_cycle; - data->bytes_to_write_in_cycle = 0; - break; - } - } - else { - /* No partially used entry available, have to start a new one */ - if (data->bytes_to_write_in_cycle < - (MPI_Aint) data->global_iov_array[data->sorted[data->current_index]].iov_len) { - /* This entry has more data than we can sendin one cycle */ - if (aggregator == rank) { - data->blocklen_per_process[data->n][data->disp_index[data->n]] = data->bytes_to_write_in_cycle; - data->displs_per_process[data->n][data->disp_index[data->n]] = - (ptrdiff_t)data->global_iov_array[data->sorted[data->current_index]].iov_base ; - data->disp_index[data->n] += 1; - } - if (data->procs_in_group[data->n] == rank) { - bytes_sent += 
data->bytes_to_write_in_cycle; - - } - data->bytes_remaining = data->global_iov_array[data->sorted[data->current_index]].iov_len - - data->bytes_to_write_in_cycle; - data->bytes_to_write_in_cycle = 0; - break; - } - else { - /* Next data entry is less than data->bytes_to_write_in_cycle */ - if (aggregator == rank) { - data->blocklen_per_process[data->n][data->disp_index[data->n]] = - data->global_iov_array[data->sorted[data->current_index]].iov_len; - data->displs_per_process[data->n][data->disp_index[data->n]] = (ptrdiff_t) - data->global_iov_array[data->sorted[data->current_index]].iov_base; - - data->disp_index[data->n] += 1; - - /*realloc for next blocklength - and assign this displacement and check for next displs as - the total length of this entry has been consumed!*/ - if ( data->disp_index[data->n] == data->max_disp_index[data->n] ) { - data->max_disp_index[data->n] *=2 ; - data->blocklen_per_process[data->n] = (int *) realloc ( - (void *)data->blocklen_per_process[data->n], - (data->max_disp_index[data->n]*sizeof(int))); - data->displs_per_process[data->n] = (MPI_Aint *)realloc ( - (void *)data->displs_per_process[data->n], - (data->max_disp_index[data->n]*sizeof(MPI_Aint))); - } - data->blocklen_per_process[data->n][data->disp_index[data->n]] = 0; - data->displs_per_process[data->n][data->disp_index[data->n]] = 0; - } - if (data->procs_in_group[data->n] == rank) { - bytes_sent += data->global_iov_array[data->sorted[data->current_index]].iov_len; - } - data->bytes_to_write_in_cycle -= - data->global_iov_array[data->sorted[data->current_index]].iov_len; - data->current_index ++; - } - } - }
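The loop deleted above (it reappears nearly verbatim below as mca_fcoll_vulcan_calc_blocklen_disps) walks the sorted global file view and cuts it into per-cycle pieces, with bytes_remaining carrying a partially consumed entry over to the next cycle. The same bookkeeping reduced to a standalone sketch, with invented entry lengths and a 256-byte cycle budget:

#include <stdio.h>

int main(void)
{
    long entry_len[] = {300, 50, 500};  /* illustrative file-view entry lengths */
    int n_entries = 3, idx = 0;
    long bytes_per_cycle = 256, bytes_remaining = 0;

    for (int cycle = 0; idx < n_entries; cycle++) {
        long budget = bytes_per_cycle;  /* bytes_to_write_in_cycle */
        printf("cycle %d:", cycle);
        while (budget > 0 && idx < n_entries) {
            long avail = bytes_remaining ? bytes_remaining : entry_len[idx];
            long take  = avail <= budget ? avail : budget;
            printf(" %ld bytes of entry %d", take, idx);
            budget -= take;
            if (take < avail) {
                bytes_remaining = avail - take;  /* entry spills into the next cycle */
            } else {
                bytes_remaining = 0;             /* entry fully consumed, advance    */
                idx++;
            }
        }
        printf("\n");
    }
    return 0;
}

Entry 0 (300 bytes) spans cycles 0 and 1, and entry 2 (500 bytes) spans cycles 1 through 3, which is exactly the kind of split the blocklen/displs arrays record per process.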
- - /************************************************************************* - *** 7d. Calculate the displacement on where to put the data and allocate - *** the receive buffer (global_buf) + *** 7d. Calculate the displacement on where to put the data *************************************************************************/ if (aggregator == rank) { entries_per_aggregator=0; - for (i=0;i<data->procs_per_group; i++){ - for (j=0;j<data->disp_index[i];j++){ - if (data->blocklen_per_process[i][j] > 0) + for (i = 0; i < data->procs_per_group; i++){ + for (j = 0; j < data->disp_index[i];j++){ + if (data->blocklen_per_process[i][j] > 0) { entries_per_aggregator++ ; + } } } - #if DEBUG_ON - printf("%d: cycle: %d, bytes_sent: %d\n ",rank,index, - bytes_sent); - printf("%d : Entries per aggregator : %d\n",rank,entries_per_aggregator); + printf("%d : Entries per aggregator : %d\n", rank, entries_per_aggregator); #endif - - if (entries_per_aggregator > 0){ - file_offsets_for_agg = (mca_io_ompio_local_io_array *) - malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array)); - if (NULL == file_offsets_for_agg) { + + if (entries_per_aggregator > 0) { + file_offsets_for_agg = (mca_io_ompio_local_io_array *) malloc(entries_per_aggregator * + sizeof(mca_io_ompio_local_io_array)); + memory_displacements = (MPI_Aint *) malloc (entries_per_aggregator * sizeof(MPI_Aint)); + sorted_file_offsets = (int *) malloc (entries_per_aggregator*sizeof(int)); + if (NULL == memory_displacements || NULL == file_offsets_for_agg || + NULL == sorted_file_offsets) { opal_output (1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } - - sorted_file_offsets = (int *) - malloc (entries_per_aggregator*sizeof(int)); - if (NULL == sorted_file_offsets){ - opal_output (1, "OUT OF MEMORY\n"); - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto exit; - } - - /*Moving file offsets to an IO array!*/ - temp_index = 0; - - for (i=0;i<data->procs_per_group; i++){ - for(j=0;j<data->disp_index[i];j++){ - if (data->blocklen_per_process[i][j] > 0){ - file_offsets_for_agg[temp_index].length = - data->blocklen_per_process[i][j]; - file_offsets_for_agg[temp_index].process_id = i; - file_offsets_for_agg[temp_index].offset = - data->displs_per_process[i][j]; - temp_index++; - -#if DEBUG_ON - printf("************Cycle: %d, Aggregator: %d ***************\n", - index+1,rank); - - printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", - data->procs_in_group[i],j, - data->blocklen_per_process[i][j],j, - data->displs_per_process[i][j], - rank); -#endif - } - } - } - - /* Sort the displacements for each aggregator*/ - local_heap_sort (file_offsets_for_agg, - entries_per_aggregator, - sorted_file_offsets); - - /*create contiguous memory displacements - based on blocklens on the same displs array - and map it to this aggregator's actual - file-displacements (this is in the io-array created above)*/ - memory_displacements = (MPI_Aint *) malloc - (entries_per_aggregator * sizeof(MPI_Aint)); - - memory_displacements[sorted_file_offsets[0]] = 0; - for (i=1; i<entries_per_aggregator; i++){ - memory_displacements[sorted_file_offsets[i]] = - memory_displacements[sorted_file_offsets[i-1]] + - file_offsets_for_agg[sorted_file_offsets[i-1]].length; - } - - temp_disp_index = (int *)calloc (1, data->procs_per_group * sizeof (int)); - if (NULL == temp_disp_index) { - opal_output (1, "OUT OF MEMORY\n"); - ret = OMPI_ERR_OUT_OF_RESOURCE; + + ret = mca_fcoll_vulcan_calc_file_offsets(data, file_offsets_for_agg, sorted_file_offsets, + memory_displacements, entries_per_aggregator, + rank, index); + if (OMPI_SUCCESS != ret) { goto exit; } - - /*Now update the displacements array with memory offsets*/ -#if DEBUG_ON - global_count = 0; -#endif - for (i=0;i<entries_per_aggregator;i++){ - temp_pindex = file_offsets_for_agg[sorted_file_offsets[i]].process_id; - data->displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] = - memory_displacements[sorted_file_offsets[i]]; - if (temp_disp_index[temp_pindex] < data->disp_index[temp_pindex]) - temp_disp_index[temp_pindex] += 1; - else{ - printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", - temp_pindex, temp_disp_index[temp_pindex], - temp_pindex, data->disp_index[temp_pindex]); - }
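Step 7d amounts to: sort all collected (offset, length) pairs by file offset, then give each pair its slot in the aggregator's receive buffer by prefix-summing the lengths in sorted order. The patch keeps local_heap_sort and the indirect sorted_file_offsets indexing for this; the sketch below shows the same idea with qsort and an invented io_piece struct:

#include <stdio.h>
#include <stdlib.h>

struct io_piece { long offset, length, mem_disp; };

static int cmp_offset(const void *a, const void *b)
{
    const struct io_piece *x = a, *y = b;
    return (x->offset > y->offset) - (x->offset < y->offset);
}

int main(void)
{
    /* pieces arrive grouped by process, not by file position */
    struct io_piece p[] = { {4096, 512, 0}, {0, 1024, 0}, {2048, 256, 0} };
    int n = 3;

    qsort(p, n, sizeof(*p), cmp_offset);  /* order by file offset */

    p[0].mem_disp = 0;                    /* first piece starts the buffer     */
    for (int i = 1; i < n; i++)           /* each piece follows the one before */
        p[i].mem_disp = p[i-1].mem_disp + p[i-1].length;

    for (int i = 0; i < n; i++)
        printf("file offset %5ld -> buffer offset %5ld (len %4ld)\n",
               p[i].offset, p[i].mem_disp, p[i].length);
    return 0;
}

Writing the buffer out in this order turns many scattered contributions into one mostly sequential file access, which is the point of the two-phase exchange.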
-#if DEBUG_ON - global_count += - file_offsets_for_agg[sorted_file_offsets[i]].length; -#endif - } - - if (NULL != temp_disp_index){ - free(temp_disp_index); - temp_disp_index = NULL; - } - -#if DEBUG_ON - - printf("************Cycle: %d, Aggregator: %d ***************\n", - index+1,rank); - for (i=0;i<data->procs_per_group; i++){ - for(j=0;j<data->disp_index[i];j++){ - if (data->blocklen_per_process[i][j] > 0){ - printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n", - data->procs_in_group[i],j, - data->blocklen_per_process[i][j],j, - data->displs_per_process[i][j], - rank); - - } - } - } - printf("************Cycle: %d, Aggregator: %d ***************\n", - index+1,rank); - for (i=0; i<entries_per_aggregator;i++){ - printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld\n", - file_offsets_for_agg[sorted_file_offsets[i]].process_id, - file_offsets_for_agg[sorted_file_offsets[i]].offset, - file_offsets_for_agg[sorted_file_offsets[i]].length, - memory_displacements[sorted_file_offsets[i]]); - } -#endif - - for (i=0; i<data->procs_per_group; i++) { + for (i = 0; i < data->procs_per_group; i++) { size_t datatype_size; reqs[i] = MPI_REQUEST_NULL; - if ( 0 < data->disp_index[i] ) { + if (0 < data->disp_index[i]) { ompi_datatype_create_hindexed(data->disp_index[i], data->blocklen_per_process[i], data->displs_per_process[i], @@ -1239,7 +840,7 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i ompi_datatype_commit(&data->recvtype[i]); opal_datatype_type_size(&data->recvtype[i]->super, &datatype_size); - if (datatype_size){ + if (datatype_size) { ret = MCA_PML_CALL(irecv(data->global_buf, 1, data->recvtype[i], @@ -1278,8 +879,7 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i if(0 == block_index) { send_mem_address = (ptrdiff_t) (data->decoded_iov[data->iov_index].iov_base) + data->current_position; - } - else { + } else { // Reallocate more memory if blocklength_size is not enough if(0 == block_index % INIT_LEN) { blocklength_size += INIT_LEN; @@ -1290,17 +890,14 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i data->current_position - send_mem_address; } - if (remaining >= - (data->decoded_iov[data->iov_index].iov_len - data->current_position)) { - + if (remaining >= (data->decoded_iov[data->iov_index].iov_len - data->current_position)) { blocklength_proc[block_index] = data->decoded_iov[data->iov_index].iov_len - data->current_position; remaining = remaining - (data->decoded_iov[data->iov_index].iov_len - data->current_position); data->iov_index = data->iov_index + 1; data->current_position = 0; - } - else { + } else { blocklength_proc[block_index] = remaining; data->current_position += remaining; remaining = 0; @@ -1335,77 +932,23 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i } } -#if DEBUG_ON - if (aggregator == rank){ - printf("************Cycle: %d, Aggregator: %d ***************\n", - index+1,rank); - for (i=0 ; i<global_count/4 ; i++) - printf (" RECV %d \n",((int *)data->global_buf)[i]); - } -#endif - -//#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN -// end_comm_time = MPI_Wtime(); -// comm_time += (end_comm_time - start_comm_time); -//#endif
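In step 7e the aggregator wraps each sender's blocklen/displs lists in a single hindexed datatype, so one irecv delivers all of that sender's pieces directly to their precomputed spots in global_buf. The same mechanism at the plain MPI level, as a two-rank toy with made-up block layouts (not code from this patch):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank;
    char global_buf[9] = "........";  /* aggregator's receive buffer */

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int blocklens[2] = {2, 2};
    MPI_Aint displs[2] = {0, 4};      /* two 2-byte pieces, 4 bytes apart */
    MPI_Datatype recvtype;
    MPI_Type_create_hindexed(2, blocklens, displs, MPI_CHAR, &recvtype);
    MPI_Type_commit(&recvtype);

    if (rank == 0) {                  /* the aggregator */
        MPI_Recv(global_buf, 1, recvtype, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        printf("aggregator buffer: %s\n", global_buf);  /* "ab..cd.." */
    } else if (rank == 1) {           /* one contributing process */
        char send_buf[4] = {'a', 'b', 'c', 'd'};  /* contiguous on the sender */
        MPI_Send(send_buf, 4, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
    }
    MPI_Type_free(&recvtype);
    MPI_Finalize();
    return 0;
}

The component posts these as nonblocking irecvs (one per group member) and matches them with the isends built from blocklength_proc/displs_proc on the contributing side; blocking calls are used here only to keep the toy short.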
/********************************************************** *** 7f. Create the io array, and pass it to fbtl *********************************************************/ - - if (aggregator == rank && entries_per_aggregator>0) { - - - data->io_array = (mca_common_ompio_io_array_t *) malloc - (entries_per_aggregator * sizeof (mca_common_ompio_io_array_t)); + if (aggregator == rank && entries_per_aggregator > 0) { + data->io_array = (mca_common_ompio_io_array_t *) malloc (entries_per_aggregator * + sizeof (mca_common_ompio_io_array_t)); if (NULL == data->io_array) { opal_output(1, "OUT OF MEMORY\n"); ret = OMPI_ERR_OUT_OF_RESOURCE; goto exit; } - - data->num_io_entries = 0; - /*First entry for every aggregator*/ - data->io_array[0].offset = - (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset; - data->io_array[0].length = - file_offsets_for_agg[sorted_file_offsets[0]].length; - data->io_array[0].memory_address = - data->global_buf+memory_displacements[sorted_file_offsets[0]]; - data->num_io_entries++; - - for (i=1;i<entries_per_aggregator;i++){ - /* If the entries are contiguous merge them, - else make a new entry */ - if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset + - file_offsets_for_agg[sorted_file_offsets[i-1]].length == - file_offsets_for_agg[sorted_file_offsets[i]].offset){ - data->io_array[data->num_io_entries - 1].length += - file_offsets_for_agg[sorted_file_offsets[i]].length; - } - else { - data->io_array[data->num_io_entries].offset = - (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset; - data->io_array[data->num_io_entries].length = - file_offsets_for_agg[sorted_file_offsets[i]].length; - data->io_array[data->num_io_entries].memory_address = - data->global_buf+memory_displacements[sorted_file_offsets[i]]; - data->num_io_entries++; - } - - } - -#if DEBUG_ON - printf("*************************** %d\n", num_of_io_entries); - for (i=0 ; i<num_of_io_entries ; i++) { - printf(" ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", - io_array[i].memory_address, - (ptrdiff_t)io_array[i].offset, - io_array[i].length); - } -#endif + mca_fcoll_vulcan_calc_io_array(data->io_array, &data->num_io_entries, entries_per_aggregator, + (char*)data->global_buf, file_offsets_for_agg, sorted_file_offsets, + memory_displacements, rank); } - + exit: free(sorted_file_offsets); free(file_offsets_for_agg); @@ -1413,10 +956,10 @@ static int shuffle_init ( int index, int cycles, int aggregator, int rank, mca_i free(blocklength_proc); free(displs_proc); - return OMPI_SUCCESS; + return ret; } -static int mca_fcoll_vulcan_minmax ( ompio_file_t *fh, struct iovec *iov, int iov_count, int num_aggregators, long *new_stripe_size) +int mca_fcoll_vulcan_minmax (ompio_file_t *fh, struct iovec *iov, int iov_count, int num_aggregators, long *new_stripe_size) { long min, max, globalmin, globalmax; long stripe_size; @@ -1430,12 +973,10 @@ static int mca_fcoll_vulcan_minmax ( ompio_file_t *fh, struct iovec *iov, int io max = 0; } fh->f_comm->c_coll->coll_allreduce ( &min, &globalmin, 1, MPI_LONG, MPI_MIN, - fh->f_comm, fh->f_comm->c_coll->coll_allreduce_module); + fh->f_comm, fh->f_comm->c_coll->coll_allreduce_module); fh->f_comm->c_coll->coll_allreduce ( &max, &globalmax, 1, MPI_LONG, MPI_MAX, - fh->f_comm, fh->f_comm->c_coll->coll_allreduce_module); - - // if ( fh->f_rank < 10 ) printf("[%d]: min=%ld max=%ld globalmin=%ld, globalmax=%ld num_aggregators=%d\n", fh->f_rank, min, max, globalmin, globalmax, num_aggregators); + fh->f_comm, fh->f_comm->c_coll->coll_allreduce_module); stripe_size = (globalmax - globalmin)/num_aggregators; if ( (globalmax - globalmin) % num_aggregators ) { @@ -1443,13 +984,9 @@ static int mca_fcoll_vulcan_minmax ( ompio_file_t *fh, struct iovec *iov, int io } *new_stripe_size = stripe_size; - // if ( fh->f_rank == 0 ) - // printf(" partition size is %ld\n", stripe_size); return OMPI_SUCCESS; } - - int mca_fcoll_vulcan_break_file_view ( struct iovec *mem_iov, int mem_count, struct iovec *file_iov, int file_count,
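mca_fcoll_vulcan_minmax, made extern above, boils down to two allreduces and a rounded-up division: find the lowest and highest file offset touched by any process, then split that span evenly over the aggregators. With illustrative numbers (a 1 MiB span, 4 aggregators; not values from this patch):

#include <stdio.h>

int main(void)
{
    long globalmin = 4096;      /* result of the MPI_MIN allreduce */
    long globalmax = 1052672;   /* result of the MPI_MAX allreduce */
    int num_aggregators = 4;

    long stripe_size = (globalmax - globalmin) / num_aggregators;
    if ((globalmax - globalmin) % num_aggregators)
        stripe_size++;          /* round up so the stripes cover the whole span */

    printf("stripe size: %ld bytes\n", stripe_size);  /* 1048576 / 4 = 262144 */
    return 0;
}

Each aggregator then owns one contiguous stripe of [globalmin, globalmax), which mca_fcoll_vulcan_break_file_view uses to split the memory and file iovecs per aggregator.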
@@ -1678,21 +1215,11 @@ int mca_fcoll_vulcan_break_file_view ( struct iovec *mem_iov, int mem_count, return ret; } - -int mca_fcoll_vulcan_get_configuration (ompio_file_t *fh, int num_io_procs, int num_groups, - size_t max_data) +int mca_fcoll_vulcan_get_configuration (ompio_file_t *fh, int num_io_procs, size_t max_data) { int i, ret; - ret = mca_common_ompio_set_aggregator_props (fh, num_io_procs, max_data); - /* Note: as of this version of the vulcan component, we are not using yet - the num_groups parameter to split the aggregators (and processes) into - distinct subgroups. This will however hopefullty be done in a second step - as well, allowing to keep communication just to individual subgroups of processes, - each subgroup using however the classic two-phase collective I/O algorithm - with multiple aggregators and even partitioning internally. - - For now, logically all processes are in a single group. */ + ret = mca_common_ompio_set_aggregator_props (fh, num_io_procs, max_data); fh->f_procs_per_group = fh->f_size; if ( NULL != fh->f_procs_in_group ) { @@ -1708,63 +1235,269 @@ int mca_fcoll_vulcan_get_configuration (ompio_file_t *fh, int num_io_procs, int return ret; } - -int mca_fcoll_vulcan_split_iov_array ( ompio_file_t *fh, mca_common_ompio_io_array_t *io_array, int num_entries, - int *ret_array_pos, int *ret_pos, int chunk_size ) +void mca_fcoll_vulcan_calc_blocklen_disps (mca_io_ompio_aggregator_data *data, int aggregator, + int rank, size_t *bytes_comm) { + size_t bytes_tmp = *bytes_comm; + int blocks = 0; + int j; - int array_pos = *ret_array_pos; - int pos = *ret_pos; - size_t bytes_written = 0; - size_t bytes_to_write = chunk_size; + /* The blocklen and displs calculation only done at aggregators */ + while (data->bytes_to_write_in_cycle) { - if ( 0 == array_pos && 0 == pos ) { - fh->f_io_array = (mca_common_ompio_io_array_t *) malloc ( num_entries * sizeof(mca_common_ompio_io_array_t)); - if ( NULL == fh->f_io_array ){ - opal_output (1,"Could not allocate memory\n"); - return -1; + /* This next block identifies which process is the holder + ** of the sorted[current_index] element; + */ + blocks = data->fview_count[0]; + for (j = 0 ; j < data->procs_per_group ; j++) { + if (data->sorted[data->current_index] < blocks) { + data->n = j; + break; + } else { + blocks += data->fview_count[j+1]; + } + } + + if (data->bytes_remaining) { + /* Finish up a partially used buffer from the previous cycle */ + + if (data->bytes_remaining <= data->bytes_to_write_in_cycle) { + /* The data fits completely into the block */ + if (aggregator == rank) { + data->blocklen_per_process[data->n][data->disp_index[data->n]] = data->bytes_remaining; + data->displs_per_process[data->n][data->disp_index[data->n]] = + (ptrdiff_t)data->global_iov_array[data->sorted[data->current_index]].iov_base + + (data->global_iov_array[data->sorted[data->current_index]].iov_len + - data->bytes_remaining); + + data->disp_index[data->n] += 1; + + /* In this case the length is consumed so allocating for + next displacement and blocklength*/ + if (data->disp_index[data->n] == data->max_disp_index[data->n]) { + data->max_disp_index[data->n] *= 2; + + data->blocklen_per_process[data->n] = (int *) realloc + ((void *)data->blocklen_per_process[data->n], + (data->max_disp_index[data->n])*sizeof(int)); + data->displs_per_process[data->n] = (MPI_Aint *) realloc + ((void *)data->displs_per_process[data->n], + (data->max_disp_index[data->n])*sizeof(MPI_Aint)); + } + data->blocklen_per_process[data->n][data->disp_index[data->n]] = 0; + data->displs_per_process[data->n][data->disp_index[data->n]] = 0; + } + if 
(data->procs_in_group[data->n] == rank) { + bytes_tmp += data->bytes_remaining; + } + data->current_index ++; + data->bytes_to_write_in_cycle -= data->bytes_remaining; + data->bytes_remaining = 0; + } else { + /* the remaining data from the previous cycle is larger than the + data->bytes_to_write_in_cycle, so we have to segment again */ + if (aggregator == rank) { + data->blocklen_per_process[data->n][data->disp_index[data->n]] = data->bytes_to_write_in_cycle; + data->displs_per_process[data->n][data->disp_index[data->n]] = + (ptrdiff_t)data->global_iov_array[data->sorted[data->current_index]].iov_base + + (data->global_iov_array[data->sorted[data->current_index]].iov_len + - data->bytes_remaining); + data->disp_index[data->n] += 1; + } + + if (data->procs_in_group[data->n] == rank) { + bytes_tmp += data->bytes_to_write_in_cycle; + } + data->bytes_remaining -= data->bytes_to_write_in_cycle; + data->bytes_to_write_in_cycle = 0; + break; + } + } else { + /* No partially used entry available, have to start a new one */ + if (data->bytes_to_write_in_cycle < + (MPI_Aint) data->global_iov_array[data->sorted[data->current_index]].iov_len) { + /* This entry has more data than we can send in one cycle */ + if (aggregator == rank) { + data->blocklen_per_process[data->n][data->disp_index[data->n]] = data->bytes_to_write_in_cycle; + data->displs_per_process[data->n][data->disp_index[data->n]] = + (ptrdiff_t)data->global_iov_array[data->sorted[data->current_index]].iov_base ; + data->disp_index[data->n] += 1; + } + if (data->procs_in_group[data->n] == rank) { + bytes_tmp += data->bytes_to_write_in_cycle; + } + data->bytes_remaining = data->global_iov_array[data->sorted[data->current_index]].iov_len - + data->bytes_to_write_in_cycle; + data->bytes_to_write_in_cycle = 0; + break; + } else { + /* Next data entry is less than data->bytes_to_write_in_cycle */ + if (aggregator == rank) { + data->blocklen_per_process[data->n][data->disp_index[data->n]] = + data->global_iov_array[data->sorted[data->current_index]].iov_len; + data->displs_per_process[data->n][data->disp_index[data->n]] = (ptrdiff_t) + data->global_iov_array[data->sorted[data->current_index]].iov_base; + + data->disp_index[data->n] += 1; + + /* realloc for next blocklength and assign this displacement + ** and check for next displs as the total length of this entry + ** has been consumed */ + if (data->disp_index[data->n] == data->max_disp_index[data->n]) { + data->max_disp_index[data->n] *= 2; + data->blocklen_per_process[data->n] = (int *) realloc ( + (void *)data->blocklen_per_process[data->n], + (data->max_disp_index[data->n]*sizeof(int))); + data->displs_per_process[data->n] = (MPI_Aint *)realloc ( + (void *)data->displs_per_process[data->n], + (data->max_disp_index[data->n]*sizeof(MPI_Aint))); + } + data->blocklen_per_process[data->n][data->disp_index[data->n]] = 0; + data->displs_per_process[data->n][data->disp_index[data->n]] = 0; + } + if (data->procs_in_group[data->n] == rank) { + bytes_tmp += data->global_iov_array[data->sorted[data->current_index]].iov_len; + } + data->bytes_to_write_in_cycle -= + data->global_iov_array[data->sorted[data->current_index]].iov_len; + data->current_index ++; + } } } - - int i=0; - while (bytes_to_write > 0 ) { - fh->f_io_array[i].memory_address = &(((char *)io_array[array_pos].memory_address)[pos]); - fh->f_io_array[i].offset = &(((char *)io_array[array_pos].offset)[pos]); - if ( (io_array[array_pos].length - pos ) >= bytes_to_write ) { - fh->f_io_array[i].length = bytes_to_write; + *bytes_comm = 
bytes_tmp; +} + +int mca_fcoll_vulcan_calc_file_offsets(mca_io_ompio_aggregator_data *data, mca_io_ompio_local_io_array *file_offsets_for_agg, + int *sorted_file_offsets, MPI_Aint *memory_displacements, int entries_per_aggregator, + int rank, int index) +{ + int *temp_disp_index; + int temp_index = 0; + int temp_pindex; + int i, j; + + /* Moving file offsets to an IO array */ + for (i = 0; i < data->procs_per_group; i++){ + for(j = 0; j < data->disp_index[i];j++){ + if (data->blocklen_per_process[i][j] > 0){ + file_offsets_for_agg[temp_index].length = + data->blocklen_per_process[i][j]; + file_offsets_for_agg[temp_index].process_id = i; + file_offsets_for_agg[temp_index].offset = + data->displs_per_process[i][j]; + temp_index++; + } + } + } + + /* Sort the displacements for each aggregator */ + local_heap_sort (file_offsets_for_agg, entries_per_aggregator, + sorted_file_offsets); + + /* create contiguous memory displacements based on blocklens + ** on the same displs array and map it to this aggregator's actual + ** file-displacements */ + memory_displacements[sorted_file_offsets[0]] = 0; + for (i = 1; i < entries_per_aggregator; i++){ + memory_displacements[sorted_file_offsets[i]] = + memory_displacements[sorted_file_offsets[i-1]] + + file_offsets_for_agg[sorted_file_offsets[i-1]].length; + } + + temp_disp_index = (int *)calloc (1, data->procs_per_group * sizeof (int)); + if (NULL == temp_disp_index) { + opal_output (1, "OUT OF MEMORY\n"); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* Now update the displacements array with memory offsets */ + for (i = 0; i < entries_per_aggregator;i++) { + temp_pindex = file_offsets_for_agg[sorted_file_offsets[i]].process_id; + data->displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] = + memory_displacements[sorted_file_offsets[i]]; + if (temp_disp_index[temp_pindex] < data->disp_index[temp_pindex]) { + temp_disp_index[temp_pindex] += 1; } else { - fh->f_io_array[i].length = io_array[array_pos].length - pos; + printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n", + temp_pindex, temp_disp_index[temp_pindex], + temp_pindex, data->disp_index[temp_pindex]); } + } - pos += fh->f_io_array[i].length; - bytes_written += fh->f_io_array[i].length; - bytes_to_write-= fh->f_io_array[i].length; - i++; + free(temp_disp_index); - if ( pos == (int)io_array[array_pos].length ) { - pos = 0; - if ((array_pos + 1) < num_entries) { - array_pos++; - } - else { - break; +#if DEBUG_ON + printf("************Cycle: %d, Aggregator: %d ***************\n", + index+1, rank); + for (i = 0; i < data->procs_per_group; i++){ + for(j = 0; j < data->disp_index[i]; j++){ + if (data->blocklen_per_process[i][j] > 0){ + printf("%d communicate blocklen[%d]: %d, disp[%d]: %ld to %d\n", + data->procs_in_group[i],j, + data->blocklen_per_process[i][j],j, + data->displs_per_process[i][j], + rank); } } } - - fh->f_num_of_io_entries = i; - *ret_array_pos = array_pos; - *ret_pos = pos; - return bytes_written; + printf("************Cycle: %d, Aggregator: %d ***************\n", + index+1, rank); + for (i = 0; i < entries_per_aggregator;i++){ + printf("%d: OFFSET: %lld LENGTH: %ld, Mem-offset: %ld\n", + file_offsets_for_agg[sorted_file_offsets[i]].process_id, + file_offsets_for_agg[sorted_file_offsets[i]].offset, + file_offsets_for_agg[sorted_file_offsets[i]].length, + memory_displacements[sorted_file_offsets[i]]); + } +#endif + + return OMPI_SUCCESS; +} + +void mca_fcoll_vulcan_calc_io_array(mca_common_ompio_io_array_t *io_array, int *num_io_entries, int max_io_entries, + char 
*global_buf, mca_io_ompio_local_io_array *file_offsets_for_agg, + int *sorted_offsets, MPI_Aint *memory_displacements, int rank) +{ + int i; + int num_entries; + + /* First entry for every aggregator */ + io_array[0].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_offsets[0]].offset; + io_array[0].length = file_offsets_for_agg[sorted_offsets[0]].length; + io_array[0].memory_address = global_buf + memory_displacements[sorted_offsets[0]]; + num_entries = 1; + + /* If the entries are contiguous merge them, else add a new entry */ + for (i = 1; i < max_io_entries; i++) { + if (file_offsets_for_agg[sorted_offsets[i-1]].offset + + file_offsets_for_agg[sorted_offsets[i-1]].length == + file_offsets_for_agg[sorted_offsets[i]].offset) { + io_array[num_entries - 1].length += file_offsets_for_agg[sorted_offsets[i]].length; + } else { + io_array[num_entries].offset = (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_offsets[i]].offset; + io_array[num_entries].length = file_offsets_for_agg[sorted_offsets[i]].length; + io_array[num_entries].memory_address = global_buf + memory_displacements[sorted_offsets[i]]; + num_entries++; + } + } + + *num_io_entries = num_entries; +#if DEBUG_ON + printf("*************************** %d\n", num_entries); + for (i = 0; i < num_entries; i++) { + printf(" AGGREGATOR %d ADDRESS: %p OFFSET: %ld LENGTH: %ld\n", + rank, io_array[i].memory_address, + (ptrdiff_t)io_array[i].offset, + io_array[i].length); + } +#endif } - static int local_heap_sort (mca_io_ompio_local_io_array *io_array, - int num_entries, - int *sorted) + int num_entries, int *sorted) { int i = 0; int j = 0; @@ -1864,5 +1597,3 @@ static int local_heap_sort (mca_io_ompio_local_io_array *io_array, } return OMPI_SUCCESS; } - - diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan_internal.h b/ompi/mca/fcoll/vulcan/fcoll_vulcan_internal.h new file mode 100644 index 00000000000..76402297044 --- /dev/null +++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan_internal.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_FCOLL_VULCAN_INTERNAL_H +#define MCA_FCOLL_VULCAN_INTERNAL_H + +#include "ompi_config.h" + + +BEGIN_C_DECLS +/* Used for loading file-offsets per aggregator*/ +typedef struct mca_io_ompio_local_io_array{ + OMPI_MPI_OFFSET_TYPE offset; + MPI_Aint length; + int process_id; +}mca_io_ompio_local_io_array; + +typedef struct mca_io_ompio_aggregator_data { + int *disp_index, *sorted, n; + size_t *fview_count; + int *max_disp_index; + int **blocklen_per_process; + MPI_Aint **displs_per_process, total_bytes, bytes_per_cycle, total_bytes_written; + MPI_Comm comm; + char *global_buf, *prev_global_buf; + ompi_datatype_t **recvtype, **prev_recvtype; + struct iovec *global_iov_array; + int current_index, current_position; + int bytes_to_write_in_cycle, bytes_remaining, procs_per_group; + int *procs_in_group, iov_index; + size_t bytes_sent, prev_bytes_sent; + struct iovec *decoded_iov; + int bytes_to_write, prev_bytes_to_write; + mca_common_ompio_io_array_t *io_array, *prev_io_array; + int num_io_entries, prev_num_io_entries; +} mca_io_ompio_aggregator_data; + + +#define SWAP_REQUESTS(_r1,_r2) { \ + ompi_request_t **_t=_r1; \ + _r1=_r2; \ + _r2=_t;} + +#define SWAP_AGGR_POINTERS(_aggr,_num) { \ + int _i; \ + char *_t; \ + for (_i=0; _i<_num; _i++ ) { \ + _aggr[_i]->prev_io_array=_aggr[_i]->io_array; \ + _aggr[_i]->prev_num_io_entries=_aggr[_i]->num_io_entries; \ + _aggr[_i]->prev_bytes_sent=_aggr[_i]->bytes_sent; \ + _aggr[_i]->prev_bytes_to_write=_aggr[_i]->bytes_to_write; \ + _t=_aggr[_i]->prev_global_buf; \ + _aggr[_i]->prev_global_buf=_aggr[_i]->global_buf; \ + _aggr[_i]->global_buf=_t; \ + _t=(char *)_aggr[_i]->recvtype; \ + _aggr[_i]->recvtype=_aggr[_i]->prev_recvtype; \ + _aggr[_i]->prev_recvtype=(ompi_datatype_t **)_t; } \ +} + +int mca_fcoll_vulcan_break_file_view (struct iovec *decoded_iov, int iov_count, + struct iovec *local_iov_array, int local_count, + struct iovec ***broken_decoded_iovs, int **broken_iov_counts, + struct iovec ***broken_iov_arrays, int **broken_counts, + MPI_Aint **broken_total_lengths, + int stripe_count, size_t stripe_size); + +int mca_fcoll_vulcan_get_configuration (ompio_file_t *fh, int num_io_procs, + size_t max_data); + +int mca_fcoll_vulcan_minmax (ompio_file_t *fh, struct iovec *iov, int iov_count, + int num_aggregators, long *new_stripe_size); + +void mca_fcoll_vulcan_calc_blocklen_disps (mca_io_ompio_aggregator_data *data, int aggregator, + int rank, size_t *bytes_comm); + +int mca_fcoll_vulcan_calc_file_offsets(mca_io_ompio_aggregator_data *data, + mca_io_ompio_local_io_array *file_offsets_for_agg, + int *sorted_file_offsets, MPI_Aint *memory_displacements, + int entries_per_aggregator, int rank, int index); + +void mca_fcoll_vulcan_calc_io_array(mca_common_ompio_io_array_t *io_array, int *num_io_entries, int max_io_arrays, + char *global_buf, mca_io_ompio_local_io_array *file_offsets_for_agg, + int *sorted_offsets, MPI_Aint *memory_displacements, int rank); + +END_C_DECLS + +#endif /* MCA_FCOLL_VULCAN_INTERNAL_H */ diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index dd16a27b154..c748b02e12f 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -739,6 +739,8 @@ int mca_pml_ucx_isend_init(const void *buf, size_t count, ompi_datatype_t *datat struct ompi_communicator_t* comm, struct ompi_request_t **request) { + int rc; + uint32_t cid; mca_pml_ucx_persistent_request_t *req; ucp_ep_h ep; @@ -755,12 +757,17 @@ int 
mca_pml_ucx_isend_init(const void *buf, size_t count, ompi_datatype_t *datat return OMPI_ERROR; } + rc = ompi_comm_get_remote_cid(comm, dst, &cid); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + req->ompi.req_state = OMPI_REQUEST_INACTIVE; req->ompi.req_mpi_object.comm = comm; req->flags = MCA_PML_UCX_REQUEST_FLAG_SEND; req->buffer = (void *)buf; req->count = count; - req->tag = PML_UCX_MAKE_SEND_TAG(tag, comm); + req->tag = PML_UCX_MAKE_SEND_TAG(tag, comm, cid); req->send.mode = mode; req->send.ep = ep; req->ompi_datatype = datatype; @@ -885,7 +892,9 @@ int mca_pml_ucx_isend(const void *buf, size_t count, ompi_datatype_t *datatype, struct ompi_communicator_t* comm, struct ompi_request_t **request) { + int rc; ompi_request_t *req; + uint32_t cid; ucp_ep_h ep; PML_UCX_TRACE_SEND("i%ssend request *%p", @@ -897,15 +906,18 @@ int mca_pml_ucx_isend(const void *buf, size_t count, ompi_datatype_t *datatype, if (OPAL_UNLIKELY(NULL == ep)) { return OMPI_ERROR; } - + rc = ompi_comm_get_remote_cid(comm, dst, &cid); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } #if HAVE_DECL_UCP_TAG_SEND_NBX req = (ompi_request_t*)mca_pml_ucx_common_send_nbx(ep, buf, count, datatype, - PML_UCX_MAKE_SEND_TAG(tag, comm), mode, + PML_UCX_MAKE_SEND_TAG(tag, comm, cid), mode, &mca_pml_ucx_get_op_data(datatype)->op_param.isend); #else req = (ompi_request_t*)mca_pml_ucx_common_send(ep, buf, count, datatype, mca_pml_ucx_get_datatype(datatype), - PML_UCX_MAKE_SEND_TAG(tag, comm), mode, + PML_UCX_MAKE_SEND_TAG(tag, comm, cid), mode, mca_pml_ucx_send_completion); #endif @@ -1002,7 +1014,9 @@ int mca_pml_ucx_send(const void *buf, size_t count, ompi_datatype_t *datatype, i int tag, mca_pml_base_send_mode_t mode, struct ompi_communicator_t* comm) { + int rc; ucp_ep_h ep; + uint32_t cid; PML_UCX_TRACE_SEND("%s", buf, count, datatype, dst, tag, mode, comm, mode == MCA_PML_BASE_SEND_BUFFERED ? "bsend" : "send"); @@ -1019,17 +1033,22 @@ int mca_pml_ucx_send(const void *buf, size_t count, ompi_datatype_t *datatype, i OMPI_SPC_BYTES_SENT_USER, OMPI_SPC_BYTES_SENT_MPI); #endif + rc = ompi_comm_get_remote_cid(comm, dst, &cid); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + return rc; + } + #if HAVE_DECL_UCP_TAG_SEND_NBR if (OPAL_LIKELY((MCA_PML_BASE_SEND_BUFFERED != mode) && (MCA_PML_BASE_SEND_SYNCHRONOUS != mode))) { return mca_pml_ucx_send_nbr(ep, buf, count, datatype, - PML_UCX_MAKE_SEND_TAG(tag, comm)); + PML_UCX_MAKE_SEND_TAG(tag, comm, cid)); } #endif return mca_pml_ucx_send_nb(ep, buf, count, datatype, mca_pml_ucx_get_datatype(datatype), - PML_UCX_MAKE_SEND_TAG(tag, comm), mode); + PML_UCX_MAKE_SEND_TAG(tag, comm, cid), mode); } int mca_pml_ucx_iprobe(int src, int tag, struct ompi_communicator_t* comm, diff --git a/ompi/mca/pml/ucx/pml_ucx_component.c b/ompi/mca/pml/ucx/pml_ucx_component.c index 5639e2b1f34..ec095e19fef 100644 --- a/ompi/mca/pml/ucx/pml_ucx_component.c +++ b/ompi/mca/pml/ucx/pml_ucx_component.c @@ -145,6 +145,10 @@ mca_pml_ucx_component_init(int* priority, bool enable_progress_threads, *priority = (support_level == OPAL_COMMON_UCX_SUPPORT_DEVICE) ? 
ompi_pml_ucx.priority : 19; PML_UCX_VERBOSE(2, "returning priority %d", *priority); + + /** this pml supports the extended CID space */ + ompi_pml_ucx.super.pml_flags |= MCA_PML_BASE_FLAG_SUPPORTS_EXT_CID; + return &ompi_pml_ucx.super; } diff --git a/ompi/mca/pml/ucx/pml_ucx_request.c b/ompi/mca/pml/ucx/pml_ucx_request.c index fccb9f6a6f6..1a8d0dbc043 100644 --- a/ompi/mca/pml/ucx/pml_ucx_request.c +++ b/ompi/mca/pml/ucx/pml_ucx_request.c @@ -282,7 +282,7 @@ void mca_pml_ucx_completed_request_init(ompi_request_t *ompi_req) mca_pml_ucx_request_init_common(ompi_req, false, OMPI_REQUEST_ACTIVE, mca_pml_completed_request_free, mca_pml_completed_request_cancel); - ompi_req->req_mpi_object.comm = &ompi_mpi_comm_world.comm; + ompi_req->req_mpi_object.comm = &ompi_mpi_comm_null.comm; ompi_request_complete(ompi_req, false); } diff --git a/ompi/mca/pml/ucx/pml_ucx_request.h b/ompi/mca/pml/ucx/pml_ucx_request.h index 8132f6b54ba..9e901794f8d 100644 --- a/ompi/mca/pml/ucx/pml_ucx_request.h +++ b/ompi/mca/pml/ucx/pml_ucx_request.h @@ -43,10 +43,10 @@ enum { #define PML_UCX_TAG_MASK 0x7fffff0000000000ul -#define PML_UCX_MAKE_SEND_TAG(_tag, _comm) \ +#define PML_UCX_MAKE_SEND_TAG(_tag, _comm, _c_index) \ ((((uint64_t) (_tag) ) << (PML_UCX_RANK_BITS + PML_UCX_CONTEXT_BITS)) | \ (((uint64_t)(_comm)->c_my_rank ) << PML_UCX_CONTEXT_BITS) | \ - ((uint64_t)(_comm)->c_index)) + ((uint64_t)(_c_index))) #define PML_UCX_MAKE_RECV_TAG(_ucp_tag, _ucp_tag_mask, _tag, _src, _comm) \ diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index c31e47e4af8..cbc82f42f23 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -20,7 +20,7 @@ * All rights reserved. * Copyright (c) 2016-2021 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2018-2021 Triad National Security, LLC. All rights + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. @@ -104,6 +104,7 @@ bool ompi_ftmpi_enabled = false; #endif /* OPAL_ENABLE_FT_MPI */ static int ompi_stream_buffering_mode = -1; +int ompi_comm_verbose_level = 0; int ompi_mpi_register_params(void) { @@ -445,6 +446,10 @@ int ompi_mpi_register_params(void) } #endif /* OPAL_ENABLE_FT_MPI */ + (void) mca_base_var_register ("ompi", "mpi", "comm", "verbose", + "Verbosity level for communicator management subsystem", + MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL, &ompi_comm_verbose_level); return OMPI_SUCCESS; } diff --git a/ompi/runtime/params.h b/ompi/runtime/params.h index d9f48f80b59..db4e9043d7b 100644 --- a/ompi/runtime/params.h +++ b/ompi/runtime/params.h @@ -16,7 +16,7 @@ * Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. * Copyright (c) 2013 Intel, Inc. All rights reserved - * Copyright (c) 2018-2021 Triad National Security, LLC. All rights + * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. * $COPYRIGHT$ @@ -191,6 +191,12 @@ OMPI_DECLSPEC extern bool ompi_enable_timing; OMPI_DECLSPEC extern int ompi_mpi_event_tick_rate; OMPI_DECLSPEC extern bool ompi_mpi_yield_when_idle; + /** + * An integer value specifying verbosity level for communicator management + * subsystem. 
+ */ +OMPI_DECLSPEC extern int ompi_comm_verbose_level; + /** * Register MCA parameters used by the MPI layer. * diff --git a/opal/mca/base/mca_base_var_group.c b/opal/mca/base/mca_base_var_group.c index 7bdb656b09c..732df663e98 100644 --- a/opal/mca/base/mca_base_var_group.c +++ b/opal/mca/base/mca_base_var_group.c @@ -15,6 +15,8 @@ * reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2024 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -374,6 +376,8 @@ int mca_base_var_group_deregister(int group_index) for (int i = 0; i < size; ++i) { OBJ_RELEASE(enums[i]); } + opal_value_array_set_size(&group->group_enums, 0); + size = opal_value_array_get_size(&group->group_subgroups); subgroups = OPAL_VALUE_ARRAY_GET_BASE(&group->group_subgroups, int); diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 78df6003212..f6195b41af3 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -324,10 +324,11 @@ int opal_common_ofi_providers_subset_of_list(struct fi_info *provider_list, char int opal_common_ofi_mca_register(const mca_base_component_t *component) { - static int include_index = -1; - static int exclude_index = -1; - static int verbose_index = -1; - static int accelerator_rank_index = -1; + int include_index; + int exclude_index; + int verbose_index; + int accelerator_rank_index; + int param; int ret; if (fi_version() < FI_VERSION(1, 0)) { @@ -336,7 +337,8 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) OPAL_THREAD_LOCK(&opal_common_ofi_mutex); - if (0 > include_index) { + param = mca_base_var_find("opal", "opal_common", "ofi", "provider_include"); + if (0 > param) { /* * this monkey business is needed because of the way the MCA VARs stuff tries to handle * pointers to strings when when destructing the MCA var database. 
If you don't do @@ -359,9 +361,12 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) ret = include_index; goto err; } + } else { + include_index = param; } - if (0 > exclude_index) { + param = mca_base_var_find("opal", "opal_common", "ofi", "provider_exclude"); + if (0 > param) { if (NULL == opal_common_ofi.prov_exclude) { opal_common_ofi.prov_exclude = (char **) malloc(sizeof(char *)); assert(NULL != opal_common_ofi.prov_exclude); @@ -378,9 +383,12 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) ret = exclude_index; goto err; } + } else { + exclude_index = param; } - if (0 > verbose_index) { + param = mca_base_var_find("opal", "opal_common", "ofi", "verbose"); + if (0 > param) { verbose_index = mca_base_var_register("opal", "opal_common", "ofi", "verbose", "Verbose level of the OFI components", MCA_BASE_VAR_TYPE_INT, NULL, 0, @@ -391,9 +399,13 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) ret = verbose_index; goto err; } + } else { + verbose_index = param; } - if (0 > accelerator_rank_index) { + + param = mca_base_var_find("opal", "opal_common", "ofi", "accelerator_rank"); + if (0 > param) { accelerator_rank_index = mca_base_var_register("opal", "opal_common", "ofi", "accelerator_rank", "Process rank(non-negative) on the selected accelerator device", @@ -404,6 +416,8 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component) ret = accelerator_rank_index; goto err; } + } else { + accelerator_rank_index = param; } if (component) { diff --git a/opal/mca/pmix/pmix-internal.h b/opal/mca/pmix/pmix-internal.h index 4e10393f60f..3c00306f501 100644 --- a/opal/mca/pmix/pmix-internal.h +++ b/opal/mca/pmix/pmix-internal.h @@ -9,7 +9,7 @@ * reserved. * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * All Rights reserved. - * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * Copyright (c) 2021 Argonne National Laboratory. All rights * reserved. 
* $COPYRIGHT$ @@ -293,7 +293,7 @@ typedef struct { OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \ "%s[%s:%d] MODEX RECV VALUE OPTIONAL FOR PROC %s KEY %s", \ OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, \ - OPAL_NAME_PRINT(*(p)), (s))); \ + OPAL_NAME_PRINT(*(p)), PMIx_Get_attribute_name(s))); \ OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \ PMIX_INFO_LOAD(&_info, PMIX_OPTIONAL, NULL, PMIX_BOOL); \ (r) = PMIx_Get(&(_proc), (s), &(_info), 1, &(_kv)); \ @@ -334,7 +334,7 @@ typedef struct { OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \ "%s[%s:%d] MODEX RECV VALUE IMMEDIATE FOR PROC %s KEY %s", \ OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, \ - OPAL_NAME_PRINT(*(p)), (s))); \ + OPAL_NAME_PRINT(*(p)), PMIx_Get_attribute_name(s))); \ OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \ PMIX_INFO_LOAD(&_info, PMIX_IMMEDIATE, NULL, PMIX_BOOL); \ (r) = PMIx_Get(&(_proc), (s), &(_info), 1, &(_kv)); \ @@ -370,7 +370,8 @@ typedef struct { size_t _sz; \ OPAL_OUTPUT_VERBOSE( \ (1, opal_pmix_verbose_output, "%s[%s:%d] MODEX RECV VALUE FOR PROC %s KEY %s", \ - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, OPAL_NAME_PRINT(*(p)), (s))); \ + OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, OPAL_NAME_PRINT(*(p)), \ + PMIx_Get_attribute_name(s))); \ OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \ (r) = PMIx_Get(&(_proc), (s), NULL, 0, &(_kv)); \ if (NULL == _kv) { \ @@ -406,7 +407,7 @@ typedef struct { OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \ "%s[%s:%d] MODEX RECV STRING OPTIONAL FOR PROC %s KEY %s", \ OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, \ - OPAL_NAME_PRINT(*(p)), (s))); \ + OPAL_NAME_PRINT(*(p)), PMIx_Get_attribute_name(s))); \ *(d) = NULL; \ *(sz) = 0; \ OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \ @@ -444,7 +445,8 @@ typedef struct { pmix_info_t _info; \ OPAL_OUTPUT_VERBOSE( \ (1, opal_pmix_verbose_output, "%s[%s:%d] MODEX RECV STRING FOR PROC %s KEY %s", \ - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, OPAL_NAME_PRINT(*(p)), (s))); \ + OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), __FILE__, __LINE__, OPAL_NAME_PRINT(*(p)), \ + PMIx_Get_attribute_name(s))); \ *(d) = NULL; \ *(sz) = 0; \ OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \