Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit 5065fd0

Browse files
committed
Merge pull request #1166 from rhc54/cmr20/direct
When direct launching applications, we must allow the MPI layer to progress during RTE-level barriers.
2 parents c1ffc07 + a771dfe commit 5065fd0

File tree

21 files changed

+322
-141
lines changed

21 files changed

+322
-141
lines changed

ompi/mca/rte/rte.h

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -207,27 +207,27 @@ OMPI_DECLSPEC extern mca_base_framework_t ompi_rte_base_framework;
207207
* progress while waiting, so we loop over opal_progress, letting
208208
* the RTE progress thread move the RTE along
209209
*/
210-
#define OMPI_WAIT_FOR_COMPLETION(flg) \
211-
do { \
212-
opal_output_verbose(1, ompi_rte_base_framework.framework_output, \
213-
"%s waiting on RTE event at %s:%d", \
214-
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
215-
__FILE__, __LINE__); \
216-
while ((flg)) { \
217-
opal_progress(); \
218-
} \
210+
#define OMPI_WAIT_FOR_COMPLETION(flg) \
211+
do { \
212+
opal_output_verbose(1, ompi_rte_base_framework.framework_output, \
213+
"%s waiting on RTE event at %s:%d", \
214+
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
215+
__FILE__, __LINE__); \
216+
while ((flg)) { \
217+
opal_progress(); \
218+
} \
219219
}while(0);
220220

221-
#define OMPI_LAZY_WAIT_FOR_COMPLETION(flg) \
222-
do { \
223-
opal_output_verbose(1, ompi_rte_base_framework.framework_output, \
224-
"%s lazy waiting on RTE event at %s:%d", \
225-
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
226-
__FILE__, __LINE__); \
227-
while ((flg)) { \
228-
opal_progress(); \
229-
usleep(100); \
230-
} \
221+
#define OMPI_LAZY_WAIT_FOR_COMPLETION(flg) \
222+
do { \
223+
opal_output_verbose(1, ompi_rte_base_framework.framework_output, \
224+
"%s lazy waiting on RTE event at %s:%d", \
225+
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), \
226+
__FILE__, __LINE__); \
227+
while ((flg)) { \
228+
opal_progress(); \
229+
usleep(100); \
230+
} \
231231
}while(0);
232232

233233
typedef struct {

ompi/runtime/ompi_mpi_finalize.c

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
* Copyright (c) 2006 University of Houston. All rights reserved.
1717
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
1818
* Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
19-
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
19+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
2020
*
2121
* $COPYRIGHT$
2222
*
@@ -242,19 +242,20 @@ int ompi_mpi_finalize(void)
242242
more details). */
243243
if (NULL != opal_pmix.fence_nb) {
244244
active = true;
245-
/* Note that the non-blocking PMIx fence will cycle calling
246-
opal_progress(), which will allow any other pending
247-
communications/actions to complete. See
248-
https://github.com/open-mpi/ompi/issues/1576 for the
249-
original bug report. */
245+
/* Note that use of the non-blocking PMIx fence will
246+
* allow us to lazily cycle calling
247+
* opal_progress(), which will allow any other pending
248+
* communications/actions to complete. See
249+
* https://github.com/open-mpi/ompi/issues/1576 for the
250+
* original bug report. */
250251
opal_pmix.fence_nb(NULL, 0, fence_cbfunc, (void*)&active);
251-
OMPI_WAIT_FOR_COMPLETION(active);
252+
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
252253
} else {
253254
/* However, we cannot guarantee that the provided PMIx has
254-
fence_nb. If it doesn't, then do the best we can: an MPI
255-
barrier on COMM_WORLD (which isn't the best because of the
256-
reasons cited above), followed by a blocking PMIx fence
257-
(which may not necessarily call opal_progress()). */
255+
* fence_nb. If it doesn't, then do the best we can: an MPI
256+
* barrier on COMM_WORLD (which isn't the best because of the
257+
* reasons cited above), followed by a blocking PMIx fence
258+
* (which does not call opal_progress()). */
258259
ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
259260
comm->c_coll.coll_barrier(comm, comm->c_coll.coll_barrier_module);
260261

ompi/runtime/ompi_mpi_init.c

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -364,13 +364,20 @@ static int ompi_register_mca_variables(void)
364364
return OMPI_SUCCESS;
365365
}
366366

367+
static void fence_release(int status, void *cbdata)
368+
{
369+
volatile bool *active = (volatile bool*)cbdata;
370+
*active = false;
371+
}
372+
367373
int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
368374
{
369375
int ret;
370376
ompi_proc_t** procs;
371377
size_t nprocs;
372378
char *error = NULL;
373379
char *cmd=NULL, *av=NULL;
380+
volatile bool active;
374381
OPAL_TIMING_DECLARE(tm);
375382
OPAL_TIMING_INIT_EXT(&tm, OPAL_TIMING_GET_TIME_OF_DAY);
376383

@@ -628,16 +635,25 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
628635

629636
/* exchange connection info - this function may also act as a barrier
630637
* if data exchange is required. The modex occurs solely across procs
631-
* in our job, so no proc array is passed. If a barrier is required,
632-
* the "modex" function will perform it internally
633-
*/
634-
OPAL_MODEX();
638+
* in our job. If a barrier is required, the "modex" function will
639+
* perform it internally */
640+
active = true;
641+
opal_pmix.commit();
642+
if (!opal_pmix_base_async_modex) {
643+
if (NULL != opal_pmix.fence_nb) {
644+
opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
645+
fence_release, (void*)&active);
646+
OMPI_WAIT_FOR_COMPLETION(active);
647+
} else {
648+
opal_pmix.fence(NULL, opal_pmix_collect_all_data);
649+
}
650+
}
635651

636652
OPAL_TIMING_MNEXT((&tm,"time from modex to first barrier"));
637653

638654
/* select buffered send allocator component to be used */
639655
if( OMPI_SUCCESS !=
640-
(ret = mca_pml_base_bsend_init(ompi_mpi_thread_multiple))) {
656+
(ret = mca_pml_base_bsend_init(ompi_mpi_thread_multiple))) {
641657
error = "mca_pml_base_bsend_init() failed";
642658
goto error;
643659
}
@@ -792,7 +808,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
792808
/* wait for everyone to reach this point - this is a hard
793809
* barrier requirement at this time, though we hope to relax
794810
* it at a later point */
795-
opal_pmix.fence(NULL, 0);
811+
active = true;
812+
opal_pmix.commit();
813+
if (NULL != opal_pmix.fence_nb) {
814+
opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
815+
fence_release, (void*)&active);
816+
OMPI_WAIT_FOR_COMPLETION(active);
817+
} else {
818+
opal_pmix.fence(NULL, opal_pmix_collect_all_data);
819+
}
796820

797821
/* check for timing request - get stop time and report elapsed
798822
time if so, then start the clock again */
@@ -829,10 +853,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
829853
e.g. hierarch, might create subcommunicators. The threadlevel
830854
requested by all processes is required in order to know
831855
which cid allocation algorithm can be used. */
832-
if ( OMPI_SUCCESS !=
833-
( ret = ompi_comm_cid_init ())) {
834-
error = "ompi_mpi_init: ompi_comm_cid_init failed";
835-
goto error;
856+
if (OMPI_SUCCESS != ( ret = ompi_comm_cid_init ())) {
857+
error = "ompi_mpi_init: ompi_comm_cid_init failed";
858+
goto error;
836859
}
837860

838861
/* Init coll for the comms. This has to be after dpm_base_select,

opal/mca/pmix/base/base.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
2+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
33
* $COPYRIGHT$
44
*
55
* Additional copyrights may follow
@@ -46,6 +46,15 @@ OPAL_DECLSPEC void opal_pmix_base_errhandler(int status,
4646
OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info,
4747
opal_pmix_pdata_t *pdat,
4848
int timeout);
49+
50+
OPAL_DECLSPEC void opal_pmix_base_set_evbase(opal_event_base_t *evbase);
51+
52+
typedef struct {
53+
opal_event_base_t *evbase;
54+
} opal_pmix_base_t;
55+
56+
extern opal_pmix_base_t opal_pmix_base;
57+
4958
END_C_DECLS
5059

5160
#endif

opal/mca/pmix/base/pmix_base_fns.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@
3939

4040
#define OPAL_PMI_PAD 10
4141

42+
void opal_pmix_base_set_evbase(opal_event_base_t *evbase)
43+
{
44+
opal_pmix_base.evbase = evbase;
45+
}
46+
4247
/******** ERRHANDLER SUPPORT FOR COMPONENTS THAT
4348
******** DO NOT NATIVELY SUPPORT IT
4449
********/

opal/mca/pmix/base/pmix_base_frame.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@
3333
https://github.com/open-mpi/ompi/issues/375 for details. */
3434
opal_pmix_base_module_t opal_pmix = { 0 };
3535
bool opal_pmix_collect_all_data = true;
36-
bool opal_pmix_base_allow_delayed_server = false;
3736
int opal_pmix_verbose_output = -1;
3837
bool opal_pmix_base_async_modex = false;
38+
opal_pmix_base_t opal_pmix_base = {0};
3939

4040
static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
4141
{

opal/mca/pmix/cray/pmix_cray.c

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ static int cray_resolve_peers(const char *nodename,
5757
opal_list_t *procs);
5858
static int cray_resolve_nodes(opal_jobid_t jobid, char **nodelist);
5959
static int cray_put(opal_pmix_scope_t scope, opal_value_t *kv);
60-
static int cray_fence(opal_list_t *procs, int collect_data);
60+
static int cray_fencenb(opal_list_t *procs, int collect_data,
61+
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
6162
static int cray_commit(void);
6263
static int cray_get(const opal_process_name_t *id,
6364
const char *key, opal_list_t *info,
@@ -90,8 +91,8 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
9091
.initialized = cray_initialized,
9192
.abort = cray_abort,
9293
.commit = cray_commit,
93-
.fence = cray_fence,
94-
.fence_nb = NULL,
94+
.fence = NULL,
95+
.fence_nb = cray_fencenb,
9596
.put = cray_put,
9697
.get = cray_get,
9798
.get_nb = cray_get_nb,
@@ -119,6 +120,17 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
119120
// usage accounting
120121
static int pmix_init_count = 0;
121122

123+
// local object
124+
typedef struct {
125+
opal_object_t super;
126+
opal_event_t ev;
127+
opal_pmix_op_cbfunc_t opcbfunc;
128+
void *cbdata;
129+
} pmi_opcaddy_t;
130+
OBJ_CLASS_INSTANCE(pmi_opcaddy_t,
131+
opal_object_t,
132+
NULL, NULL);
133+
122134
// PMI constant values:
123135
static int pmix_kvslen_max = 0;
124136
static int pmix_keylen_max = 0;
@@ -512,8 +524,9 @@ static int cray_commit(void)
512524
return OPAL_SUCCESS;
513525
}
514526

515-
static int cray_fence(opal_list_t *procs, int collect_data)
527+
static void fencenb(int sd, short args, void *cbdata)
516528
{
529+
pmi_opcaddy_t *op = (pmi_opcaddy_t*)cbdata;
517530
int rc, cnt;
518531
int32_t i;
519532
int *all_lens = NULL;
@@ -550,7 +563,8 @@ static int cray_fence(opal_list_t *procs, int collect_data)
550563

551564
send_buffer = OBJ_NEW(opal_buffer_t);
552565
if (NULL == send_buffer) {
553-
return OPAL_ERR_OUT_OF_RESOURCE;
566+
rc = OPAL_ERR_OUT_OF_RESOURCE;
567+
goto fn_exit;
554568
}
555569

556570
opal_dss.copy_payload(send_buffer, mca_pmix_cray_component.cache_global);
@@ -668,7 +682,7 @@ static int cray_fence(opal_list_t *procs, int collect_data)
668682
* for every process in the job.
669683
*
670684
* we only need to set locality for each local rank as "not found"
671-
* equates to "non-local"
685+
* equates to "non-local"
672686
*/
673687

674688
for (i=0; i < pmix_nlranks; i++) {
@@ -732,7 +746,27 @@ static int cray_fence(opal_list_t *procs, int collect_data)
732746
if (r_bytes_and_ranks != NULL) {
733747
free(r_bytes_and_ranks);
734748
}
735-
return rc;
749+
if (NULL != op->opcbfunc) {
750+
op->opcbfunc(rc, op->cbdata);
751+
}
752+
OBJ_RELEASE(op);
753+
return;
754+
}
755+
756+
static int cray_fencenb(opal_list_t *procs, int collect_data,
757+
opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
758+
{
759+
pmi_opcaddy_t *op;
760+
761+
/* thread-shift this so we don't block in Cray's barrier */
762+
op = OBJ_NEW(pmi_opcaddy_t);
763+
op->opcbfunc = cbfunc;
764+
op->cbdata = cbdata;
765+
event_assign(&op->ev, opal_pmix_base.evbase, -1,
766+
EV_WRITE, fencenb, op);
767+
event_active(&op->ev, EV_WRITE, 1);
768+
769+
return OPAL_SUCCESS;
736770
}
737771

738772
static int cray_get(const opal_process_name_t *id, const char *key, opal_list_t *info, opal_value_t **kv)

opal/mca/pmix/external/pmix_ext_client.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,8 @@ int pmix_ext_fencenb(opal_list_t *procs, int collect_data,
367367
if (collect_data) {
368368
PMIX_INFO_CONSTRUCT(&info);
369369
(void)strncpy(info.key, PMIX_COLLECT_DATA, PMIX_MAX_KEYLEN);
370+
info.value.type = PMIX_BOOL;
371+
info.value.data.flag = true;
370372
iptr = &info;
371373
n = 1;
372374
} else {

opal/mca/pmix/pmix.h

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -250,21 +250,6 @@ extern int opal_pmix_base_exchange(opal_value_t *info,
250250
} \
251251
} while(0);
252252

253-
254-
/**
255-
* Provide a simplified macro for calling the fence function
256-
* that takes into account directives and availability of
257-
* non-blocking operations
258-
*/
259-
#define OPAL_MODEX() \
260-
do { \
261-
opal_pmix.commit(); \
262-
if (!opal_pmix_base_async_modex) { \
263-
opal_pmix.fence(NULL, \
264-
opal_pmix_collect_all_data); \
265-
} \
266-
} while(0);
267-
268253
/**
269254
* Provide a macro for accessing a base function that exchanges
270255
* data values between two procs using the PMIx Publish/Lookup

opal/mca/pmix/pmix112/pmix1_client.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,8 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data,
367367
if (collect_data) {
368368
PMIX_INFO_CONSTRUCT(&info);
369369
(void)strncpy(info.key, PMIX_COLLECT_DATA, PMIX_MAX_KEYLEN);
370+
info.value.type = PMIX_BOOL;
371+
info.value.data.flag = true;
370372
iptr = &info;
371373
n = 1;
372374
} else {

0 commit comments

Comments
 (0)