open-mpi
diff --git a/‎ompi/mca/rte/rte.h‎
Lines changed: 19 additions & 19 deletions b/‎ompi/mca/rte/rte.h‎
Lines changed: 19 additions & 19 deletions
diff --git a/‎ompi/runtime/ompi_mpi_finalize.c‎
Lines changed: 12 additions & 11 deletions b/‎ompi/runtime/ompi_mpi_finalize.c‎
Lines changed: 12 additions & 11 deletions
diff --git a/‎ompi/runtime/ompi_mpi_init.c‎
Lines changed: 33 additions & 10 deletions b/‎ompi/runtime/ompi_mpi_init.c‎
Lines changed: 33 additions & 10 deletions
diff --git a/‎opal/mca/pmix/base/base.h‎
Lines changed: 10 additions & 1 deletion b/‎opal/mca/pmix/base/base.h‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎opal/mca/pmix/base/pmix_base_fns.c‎
Lines changed: 5 additions & 0 deletions b/‎opal/mca/pmix/base/pmix_base_fns.c‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎opal/mca/pmix/base/pmix_base_frame.c‎
Lines changed: 1 addition & 1 deletion b/‎opal/mca/pmix/base/pmix_base_frame.c‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎opal/mca/pmix/cray/pmix_cray.c‎
Lines changed: 41 additions & 7 deletions b/‎opal/mca/pmix/cray/pmix_cray.c‎
Lines changed: 41 additions & 7 deletions
diff --git a/‎opal/mca/pmix/external/pmix_ext_client.c‎
Lines changed: 2 additions & 0 deletions b/‎opal/mca/pmix/external/pmix_ext_client.c‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎opal/mca/pmix/pmix.h‎
Lines changed: 0 additions & 15 deletions b/‎opal/mca/pmix/pmix.h‎
Lines changed: 0 additions & 15 deletions
diff --git a/‎opal/mca/pmix/pmix112/pmix1_client.c‎
Lines changed: 2 additions & 0 deletions b/‎opal/mca/pmix/pmix112/pmix1_client.c‎
Lines changed: 2 additions & 0 deletions
@@ -207,27 +207,27 @@ OMPI_DECLSPEC extern mca_base_framework_t ompi_rte_base_framework;
  * progress while waiting, so we loop over opal_progress, letting
  * the RTE progress thread move the RTE along
  */
-#define OMPI_WAIT_FOR_COMPLETION(flg)                                   \
-    do {                                                                \
-        opal_output_verbose(1, ompi_rte_base_framework.framework_output, \
-                            "%s waiting on RTE event at %s:%d",         \
-                            OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),         \
-                            __FILE__, __LINE__);                        \
-        while ((flg)) {                                                \
-            opal_progress();                                            \
-        }                                                               \
+#define OMPI_WAIT_FOR_COMPLETION(flg)                                       \
+    do {                                                                    \
+        opal_output_verbose(1, ompi_rte_base_framework.framework_output,    \
+                            "%s waiting on RTE event at %s:%d",             \
+                            OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),             \
+                            __FILE__, __LINE__);                            \
+        while ((flg)) {                                                     \
+            opal_progress();                                                \
+        }                                                                   \
     }while(0);
 
-#define OMPI_LAZY_WAIT_FOR_COMPLETION(flg)                              \
-    do {                                                                \
-        opal_output_verbose(1, ompi_rte_base_framework.framework_output, \
-                            "%s lazy waiting on RTE event at %s:%d",    \
-                            OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),         \
-                            __FILE__, __LINE__);                        \
-        while ((flg)) {                                                 \
-            opal_progress();                                            \
-            usleep(100);                                                \
-        }                                                               \
+#define OMPI_LAZY_WAIT_FOR_COMPLETION(flg)                                  \
+    do {                                                                    \
+        opal_output_verbose(1, ompi_rte_base_framework.framework_output,    \
+                            "%s lazy waiting on RTE event at %s:%d",        \
+                            OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),             \
+                            __FILE__, __LINE__);                            \
+        while ((flg)) {                                                     \
+            opal_progress();                                                \
+            usleep(100);                                                    \
+        }                                                                   \
     }while(0);
 
 typedef struct {
 
@@ -16,7 +16,7 @@
  * Copyright (c) 2006      University of Houston. All rights reserved.
  * Copyright (c) 2009      Sun Microsystems, Inc.  All rights reserved.
  * Copyright (c) 2011      Sandia National Laboratories. All rights reserved.
- * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
  *
  * $COPYRIGHT$
  *
@@ -242,19 +242,20 @@ int ompi_mpi_finalize(void)
        more details). */
     if (NULL != opal_pmix.fence_nb) {
         active = true;
-        /* Note that the non-blocking PMIx fence will cycle calling
-           opal_progress(), which will allow any other pending
-           communications/actions to complete.  See
-           https://github.com/open-mpi/ompi/issues/1576 for the
-           original bug report. */
+        /* Note that use of the non-blocking PMIx fence will
+         * allow us to lazily cycle calling
+         * opal_progress(), which will allow any other pending
+         * communications/actions to complete.  See
+         * https://github.com/open-mpi/ompi/issues/1576 for the
+         * original bug report. */
         opal_pmix.fence_nb(NULL, 0, fence_cbfunc, (void*)&active);
-        OMPI_WAIT_FOR_COMPLETION(active);
+        OMPI_LAZY_WAIT_FOR_COMPLETION(active);
     } else {
         /* However, we cannot guarantee that the provided PMIx has
-           fence_nb.  If it doesn't, then do the best we can: an MPI
-           barrier on COMM_WORLD (which isn't the best because of the
-           reasons cited above), followed by a blocking PMIx fence
-           (which may not necessarily call opal_progress()). */
+         * fence_nb.  If it doesn't, then do the best we can: an MPI
+         * barrier on COMM_WORLD (which isn't the best because of the
+         * reasons cited above), followed by a blocking PMIx fence
+         * (which does not call opal_progress()). */
         ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
         comm->c_coll.coll_barrier(comm, comm->c_coll.coll_barrier_module);
 
 
@@ -364,13 +364,20 @@ static int ompi_register_mca_variables(void)
     return OMPI_SUCCESS;
 }
 
+/**
+ * Completion callback handed to opal_pmix.fence_nb() by ompi_mpi_init().
+ *
+ * @param status  result reported for the fence; not examined here
+ * @param cbdata  address of the caller's volatile bool flag, which the
+ *                caller polls in OMPI_WAIT_FOR_COMPLETION(); it is
+ *                cleared here to end that wait
+ */
+static void fence_release(int status, void *cbdata)
+{
+    *(volatile bool*)cbdata = false;
+}
+
 int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
 {
     int ret;
     ompi_proc_t** procs;
     size_t nprocs;
     char *error = NULL;
     char *cmd=NULL, *av=NULL;
+    volatile bool active;
     OPAL_TIMING_DECLARE(tm);
     OPAL_TIMING_INIT_EXT(&tm, OPAL_TIMING_GET_TIME_OF_DAY);
 
@@ -628,16 +635,25 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
 
     /* exchange connection info - this function may also act as a barrier
      * if data exchange is required. The modex occurs solely across procs
-     * in our job, so no proc array is passed. If a barrier is required,
-     * the "modex" function will perform it internally
-     */
-    OPAL_MODEX();
+     * in our job. If a barrier is required, the "modex" function will
+     * perform it internally */
+    active = true;
+    opal_pmix.commit();
+    if (!opal_pmix_base_async_modex) {
+        if (NULL != opal_pmix.fence_nb) {
+            opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
+                               fence_release, (void*)&active);
+            OMPI_WAIT_FOR_COMPLETION(active);
+        } else {
+            opal_pmix.fence(NULL, opal_pmix_collect_all_data);
+        }
+    }
 
     OPAL_TIMING_MNEXT((&tm,"time from modex to first barrier"));
 
     /* select buffered send allocator component to be used */
     if( OMPI_SUCCESS !=
-	(ret = mca_pml_base_bsend_init(ompi_mpi_thread_multiple))) {
+        (ret = mca_pml_base_bsend_init(ompi_mpi_thread_multiple))) {
         error = "mca_pml_base_bsend_init() failed";
         goto error;
     }
@@ -792,7 +808,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     /* wait for everyone to reach this point - this is a hard
      * barrier requirement at this time, though we hope to relax
      * it at a later point */
-    opal_pmix.fence(NULL, 0);
+    active = true;
+    opal_pmix.commit();
+    if (NULL != opal_pmix.fence_nb) {
+        opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
+                           fence_release, (void*)&active);
+        OMPI_WAIT_FOR_COMPLETION(active);
+    } else {
+        opal_pmix.fence(NULL, opal_pmix_collect_all_data);
+    }
 
     /* check for timing request - get stop time and report elapsed
        time if so, then start the clock again */
@@ -829,10 +853,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
        e.g. hierarch, might create subcommunicators. The threadlevel
        requested by all processes is required in order to know
        which cid allocation algorithm can be used. */
-    if ( OMPI_SUCCESS !=
-	 ( ret = ompi_comm_cid_init ())) {
-	error = "ompi_mpi_init: ompi_comm_cid_init failed";
-	goto error;
+    if (OMPI_SUCCESS != ( ret = ompi_comm_cid_init ())) {
+        error = "ompi_mpi_init: ompi_comm_cid_init failed";
+        goto error;
     }
 
     /* Init coll for the comms. This has to be after dpm_base_select,
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -46,6 +46,15 @@ OPAL_DECLSPEC void opal_pmix_base_errhandler(int status,
 OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info,
                                           opal_pmix_pdata_t *pdat,
                                           int timeout);
+
+OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase);
+
+/* Global state for the PMIx base; the single instance is the
+ * opal_pmix_base variable declared extern in this header. */
+typedef struct {
+    /* event base recorded by opal_pmix_base_set_evbase(); the Cray
+     * component queues its non-blocking fence on it */
+    opal_event_base_t *evbase;
+} opal_pmix_base_t;
+
+extern opal_pmix_base_t opal_pmix_base;
+
 END_C_DECLS
 
 #endif
@@ -39,6 +39,11 @@
 
 #define OPAL_PMI_PAD  10
 
+/* Record the event base kept in opal_pmix_base.evbase.
+ * The pointer is stored as given; no copy or validation is made. */
+void opal_pmix_base_set_evbase(opal_event_base_t *evbase)
+{
+    opal_pmix_base.evbase = evbase;
+}
+
 /********     ERRHANDLER SUPPORT FOR COMPONENTS THAT
  ********     DO NOT NATIVELY SUPPORT IT
  ********/
 
@@ -33,9 +33,9 @@
    https://github.com/open-mpi/ompi/issues/375 for details. */
 opal_pmix_base_module_t opal_pmix = { 0 };
 bool opal_pmix_collect_all_data = true;
-bool opal_pmix_base_allow_delayed_server = false;
 int opal_pmix_verbose_output = -1;
 bool opal_pmix_base_async_modex = false;
+opal_pmix_base_t opal_pmix_base = {0};
 
 static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
 {
 
@@ -57,7 +57,8 @@ static int cray_resolve_peers(const char *nodename,
                               opal_list_t *procs);
 static int cray_resolve_nodes(opal_jobid_t jobid, char **nodelist);
 static int cray_put(opal_pmix_scope_t scope, opal_value_t *kv);
-static int cray_fence(opal_list_t *procs, int collect_data);
+static int cray_fencenb(opal_list_t *procs, int collect_data,
+                        opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
 static int cray_commit(void);
 static int cray_get(const opal_process_name_t *id,
                     const char *key, opal_list_t *info,
@@ -90,8 +91,8 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
     .initialized = cray_initialized,
     .abort = cray_abort,
     .commit = cray_commit,
-    .fence = cray_fence,
-    .fence_nb = NULL,
+    .fence = NULL,
+    .fence_nb = cray_fencenb,
     .put = cray_put,
     .get = cray_get,
     .get_nb = cray_get_nb,
@@ -119,6 +120,17 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
 // usage accounting
 static int pmix_init_count = 0;
 
+// local object: carries one deferred fence request from cray_fencenb()
+// to the fencenb() event handler
+typedef struct {
+    opal_object_t super;             // parent class named in OBJ_CLASS_INSTANCE below
+    opal_event_t ev;                 // event used to run fencenb() on opal_pmix_base.evbase
+    opal_pmix_op_cbfunc_t opcbfunc;  // caller's completion callback; may be NULL
+    void *cbdata;                    // handed back to opcbfunc along with the fence status
+} pmi_opcaddy_t;
+// NOTE(review): the two NULL arguments are presumably the (absent)
+// constructor and destructor -- confirm against the OPAL object docs
+OBJ_CLASS_INSTANCE(pmi_opcaddy_t,
+                   opal_object_t,
+                   NULL, NULL);
+
 // PMI constant values:
 static int pmix_kvslen_max = 0;
 static int pmix_keylen_max = 0;
@@ -512,8 +524,9 @@ static int cray_commit(void)
     return OPAL_SUCCESS;
 }
 
-static int cray_fence(opal_list_t *procs, int collect_data)
+static void fencenb(int sd, short args, void *cbdata)
 {
+    pmi_opcaddy_t *op = (pmi_opcaddy_t*)cbdata;
     int rc, cnt;
     int32_t i;
     int *all_lens = NULL;
@@ -550,7 +563,8 @@ static int cray_fence(opal_list_t *procs, int collect_data)
 
     send_buffer = OBJ_NEW(opal_buffer_t);
     if (NULL == send_buffer) {
-        return OPAL_ERR_OUT_OF_RESOURCE;
+        rc = OPAL_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
     }
 
     opal_dss.copy_payload(send_buffer, mca_pmix_cray_component.cache_global);
@@ -668,7 +682,7 @@ static int cray_fence(opal_list_t *procs, int collect_data)
      * for every process in the job.
      *
      *  we only need to set locality for each local rank as "not found"
-     * equates to "non-local" 
+     * equates to "non-local"
      */
 
     for (i=0; i < pmix_nlranks; i++) {
@@ -732,7 +746,27 @@ static int cray_fence(opal_list_t *procs, int collect_data)
     if (r_bytes_and_ranks != NULL) {
         free(r_bytes_and_ranks);
     }
-    return rc;
+    if (NULL != op->opcbfunc) {
+        op->opcbfunc(rc, op->cbdata);
+    }
+    OBJ_RELEASE(op);
+    return;
+}
+
+/**
+ * Non-blocking fence entry point of the Cray PMIx module.
+ *
+ * The fence itself is performed by fencenb(); this routine only packages
+ * the completion callback into a pmi_opcaddy_t and activates an event on
+ * opal_pmix_base.evbase so that fencenb() runs from that event base
+ * instead of blocking the caller in Cray's barrier.
+ *
+ * @param procs         not used by this implementation
+ * @param collect_data  not used by this implementation
+ * @param cbfunc        invoked by fencenb() with the fence status once the
+ *                      operation finishes; may be NULL
+ * @param cbdata        opaque pointer handed back to cbfunc
+ *
+ * @return OPAL_SUCCESS once the request has been queued, or
+ *         OPAL_ERR_OUT_OF_RESOURCE if the request object could not be
+ *         allocated.  In the failure case cbfunc is never invoked, so
+ *         callers must check the return value before waiting on it.
+ */
+static int cray_fencenb(opal_list_t *procs, int collect_data,
+                        opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
+{
+    pmi_opcaddy_t *op;
+
+    /* thread-shift this so we don't block in Cray's barrier */
+    op = OBJ_NEW(pmi_opcaddy_t);
+    if (NULL == op) {
+        /* same convention fencenb() uses when OBJ_NEW fails */
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+    op->opcbfunc = cbfunc;
+    op->cbdata = cbdata;
+    /* ownership of op passes to fencenb(), which releases it */
+    event_assign(&op->ev, opal_pmix_base.evbase, -1,
+                 EV_WRITE, fencenb, op);
+    event_active(&op->ev, EV_WRITE, 1);
+
+    return OPAL_SUCCESS;
+}
 
 static int cray_get(const opal_process_name_t *id, const char *key, opal_list_t *info, opal_value_t **kv)
 
@@ -367,6 +367,8 @@ int pmix_ext_fencenb(opal_list_t *procs, int collect_data,
     if (collect_data) {
         PMIX_INFO_CONSTRUCT(&info);
         (void)strncpy(info.key, PMIX_COLLECT_DATA, PMIX_MAX_KEYLEN);
+        info.value.type = PMIX_BOOL;
+        info.value.data.flag = true;
         iptr = &info;
         n = 1;
     } else {
 
@@ -250,21 +250,6 @@ extern int opal_pmix_base_exchange(opal_value_t *info,
         }                                                               \
     } while(0);
 
-
-/**
- * Provide a simplified macro for calling the fence function
- * that takes into account directives and availability of
- * non-blocking operations
- */
-#define OPAL_MODEX()                                    \
-    do {                                                \
-        opal_pmix.commit();                             \
-        if (!opal_pmix_base_async_modex) {              \
-            opal_pmix.fence(NULL,                       \
-                opal_pmix_collect_all_data);            \
-        }                                               \
-    } while(0);
-
 /**
  * Provide a macro for accessing a base function that exchanges
  * data values between two procs using the PMIx Publish/Lookup
 
@@ -367,6 +367,8 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data,
     if (collect_data) {
         PMIX_INFO_CONSTRUCT(&info);
         (void)strncpy(info.key, PMIX_COLLECT_DATA, PMIX_MAX_KEYLEN);
+        info.value.type = PMIX_BOOL;
+        info.value.data.flag = true;
         iptr = &info;
         n = 1;
     } else {