Skip to content

Commit 3d96261

Browse files
authored
Merge pull request #5219 from jsquyres/pr/v3.0.x/fix-finalized-hang
v3.0.x: mpi/finalized: revamp INITIALIZED/FINALIZED
2 parents 56f5e27 + 3cc727f commit 3d96261

21 files changed

+169
-124
lines changed

ompi/errhandler/errhandler.h

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* University of Stuttgart. All rights reserved.
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
13-
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
1414
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
1515
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
@@ -193,11 +193,22 @@ struct ompi_request_t;
193193
* This macro directly invokes the ompi_mpi_errors_are_fatal_handler()
194194
* when an error occurs because MPI_COMM_WORLD does not exist (because
195195
* we're before MPI_Init() or after MPI_Finalize()).
196+
*
197+
* NOTE: The ompi_mpi_state variable is a volatile that is set
198+
* atomically in ompi_mpi_init() and ompi_mpi_finalize(). The
199+
* appropriate memory barriers are done in those 2 functions such that
200+
* we do not need to do a read memory barrier here (in
201+
* potentially-performance-critical code paths) before reading the
202+
* variable.
196203
*/
197-
#define OMPI_ERR_INIT_FINALIZE(name) \
198-
if( OPAL_UNLIKELY(!ompi_mpi_initialized || ompi_mpi_finalized) ) { \
199-
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, name); \
200-
}
204+
#define OMPI_ERR_INIT_FINALIZE(name) \
205+
{ \
206+
int32_t state = ompi_mpi_state; \
207+
if (OPAL_UNLIKELY(state < OMPI_MPI_STATE_INIT_COMPLETED || \
208+
state > OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT)) { \
209+
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, name); \
210+
} \
211+
}
201212

202213
/**
203214
* This is the macro to invoke to directly invoke an MPI error

ompi/errhandler/errhandler_predefined.c

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2006 University of Houston. All rights reserved.
13-
* Copyright (c) 2008-2013 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
1414
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
1515
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
1616
* Copyright (c) 2012 Los Alamos National Security, LLC.
@@ -149,7 +149,8 @@ void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,
149149

150150
static void out(char *str, char *arg)
151151
{
152-
if (ompi_rte_initialized && !ompi_mpi_finalized) {
152+
if (ompi_rte_initialized &&
153+
ompi_mpi_state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
153154
if (NULL != arg) {
154155
opal_output(0, str, arg);
155156
} else {
@@ -280,7 +281,9 @@ static void backend_fatal_no_aggregate(char *type,
280281
{
281282
char *arg;
282283

283-
assert(!ompi_mpi_initialized || ompi_mpi_finalized);
284+
int32_t state = ompi_mpi_state;
285+
assert(state < OMPI_MPI_STATE_INIT_COMPLETED ||
286+
state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
284287

285288
fflush(stdout);
286289
fflush(stderr);
@@ -289,7 +292,7 @@ static void backend_fatal_no_aggregate(char *type,
289292

290293
/* Per #2152, print out in plain english if something was invoked
291294
before MPI_INIT* or after MPI_FINALIZE */
292-
if (!ompi_mpi_init_started && !ompi_mpi_initialized) {
295+
if (state < OMPI_MPI_STATE_INIT_STARTED) {
293296
if (NULL != arg) {
294297
out("*** The %s() function was called before MPI_INIT was invoked.\n"
295298
"*** This is disallowed by the MPI standard.\n", arg);
@@ -300,7 +303,7 @@ static void backend_fatal_no_aggregate(char *type,
300303
"*** function was invoked, sorry. :-(\n", NULL);
301304
}
302305
out("*** Your MPI job will now abort.\n", NULL);
303-
} else if (ompi_mpi_finalized) {
306+
} else if (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
304307
if (NULL != arg) {
305308
out("*** The %s() function was called after MPI_FINALIZE was invoked.\n"
306309
"*** This is disallowed by the MPI standard.\n", arg);

ompi/mca/coll/fca/coll_fca_ops.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* Copyright (c) 2011 Mellanox Technologies. All rights reserved.
33
* Copyright (c) 2015 Research Organization for Information Science
44
* and Technology (RIST). All rights reserved.
5+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -159,7 +160,7 @@ int mca_coll_fca_barrier(struct ompi_communicator_t *comm,
159160
int ret;
160161

161162
FCA_VERBOSE(5,"Using FCA Barrier");
162-
if (OPAL_UNLIKELY(ompi_mpi_finalize_started)) {
163+
if (OPAL_UNLIKELY(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED)) {
163164
FCA_VERBOSE(5, "In finalize, reverting to previous barrier");
164165
goto orig_barrier;
165166
}

ompi/mca/coll/hcoll/coll_hcoll_module.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* Copyright (c) 2017 The University of Tennessee and The University
55
* of Tennessee Research Foundation. All rights
66
* reserved.
7+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -241,7 +242,7 @@ static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module,
241242

242243
int mca_coll_hcoll_progress(void)
243244
{
244-
if (ompi_mpi_finalized){
245+
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
245246
hcoll_rte_p2p_disabled_notify();
246247
}
247248

ompi/mca/coll/hcoll/coll_hcoll_ops.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Copyright (c) 2011 Mellanox Technologies. All rights reserved.
33
Copyright (c) 2015 Research Organization for Information Science
44
and Technology (RIST). All rights reserved.
5+
Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
56
$COPYRIGHT$
67
78
Additional copyrights may follow
@@ -21,7 +22,7 @@ int mca_coll_hcoll_barrier(struct ompi_communicator_t *comm,
2122
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
2223
HCOL_VERBOSE(20,"RUNNING HCOL BARRIER");
2324

24-
if (OPAL_UNLIKELY(ompi_mpi_finalize_started)) {
25+
if (OPAL_UNLIKELY(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED)) {
2526
HCOL_VERBOSE(5, "In finalize, reverting to previous barrier");
2627
goto orig_barrier;
2728
}

ompi/mca/io/romio314/src/io_romio314_file_open.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2015 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
14+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
1415
* $COPYRIGHT$
1516
*
1617
* Additional copyrights may follow
@@ -58,7 +59,7 @@ mca_io_romio314_file_close (ompi_file_t *fh)
5859
which we obviously can't do if we've started to MPI_Finalize).
5960
The user didn't close the file, so they should expect
6061
unexpected behavior. */
61-
if (ompi_mpi_finalized) {
62+
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
6263
return OMPI_SUCCESS;
6364
}
6465

ompi/mca/pml/yalla/pml_yalla.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* Copyright (C) 2001-2011 Mellanox Technologies Ltd. ALL RIGHTS RESERVED.
33
* Copyright (c) 2015 Research Organization for Information Science
44
* and Technology (RIST). All rights reserved.
5+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -265,7 +266,7 @@ int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
265266
{
266267
size_t i;
267268

268-
if (ompi_mpi_finalized) {
269+
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
269270
PML_YALLA_VERBOSE(3, "%s", "using bulk powerdown");
270271
mxm_ep_powerdown(ompi_pml_yalla.mxm_ep);
271272
}

ompi/mpi/c/finalized.c

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2015 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
14-
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
1515
* Copyright (c) 2015 Intel, Inc. All rights reserved
1616
* $COPYRIGHT$
1717
*
@@ -44,13 +44,7 @@ int MPI_Finalized(int *flag)
4444

4545
ompi_hook_base_mpi_finalized_top(flag);
4646

47-
/* We must obtain the lock to guarnatee consistent values of
48-
ompi_mpi_initialized and ompi_mpi_finalized. Note, too, that
49-
this lock is held for the bulk of the duration of
50-
ompi_mpi_init() and ompi_mpi_finalize(), so when we get the
51-
lock, we are guaranteed that some other thread is not part way
52-
through initialization or finalization. */
53-
opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
47+
int32_t state = ompi_mpi_state;
5448

5549
if (MPI_PARAM_CHECK) {
5650
if (NULL == flag) {
@@ -59,12 +53,11 @@ int MPI_Finalized(int *flag)
5953
whether we're currently (after MPI_Init and before
6054
MPI_Finalize) or not */
6155

62-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
63-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
56+
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
57+
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
6458
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
6559
FUNC_NAME);
6660
} else {
67-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
6861
/* We have no MPI object here so call ompi_errhandle_invoke
6962
* directly */
7063
return ompi_errhandler_invoke(NULL, NULL, -1,
@@ -74,8 +67,7 @@ int MPI_Finalized(int *flag)
7467
}
7568
}
7669

77-
*flag = ompi_mpi_finalized;
78-
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
70+
*flag = (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
7971

8072
ompi_hook_base_mpi_finalized_bottom(flag);
8173

ompi/mpi/c/get_library_version.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
12+
* Copyright (c) 2014-2018 Cisco Systems, Inc. All rights reserved
1313
* Copyright (c) 2015 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
1515
* Copyright (c) 2015 Intel, Inc. All rights reserved
@@ -58,7 +58,9 @@ int MPI_Get_library_version(char *version, int *resultlen)
5858
(i.e., use a NULL communicator, which will end up at the
5959
default errhandler, which is abort). */
6060

61-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
61+
int32_t state = ompi_mpi_state;
62+
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
63+
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
6264
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
6365
FUNC_NAME);
6466
} else {

ompi/mpi/c/get_version.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* Copyright (c) 2015 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
1414
* Copyright (c) 2015 Intel, Inc. All rights reserved
15+
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -54,7 +55,9 @@ int MPI_Get_version(int *version, int *subversion)
5455
(i.e., use a NULL communicator, which will end up at the
5556
default errhandler, which is abort). */
5657

57-
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
58+
int32_t state = ompi_mpi_state;
59+
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
60+
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
5861
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
5962
FUNC_NAME);
6063
} else {

0 commit comments

Comments
 (0)