Skip to content

Commit 5dfb7ac

Browse files
author
rhc54
committed
Merge pull request #1266 from ggouaillardet/topic/misc_pmix_fixes
Topic/misc pmix fixes
2 parents b7b4231 + b20a219 commit 5dfb7ac

File tree

212 files changed

+50038
-447
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

212 files changed

+50038
-447
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,10 @@ opal/mca/installdirs/config/install_dirs.h
303303
opal/mca/pmix/pmix112/pmix/include/pmix/autogen/config.h
304304
opal/mca/pmix/pmix112/pmix/include/private/autogen/config.h
305305
opal/mca/pmix/pmix112/pmix/include/private/autogen/config.h.in
306+
opal/mca/pmix/pmix120/pmix/include/pmix/autogen/config.h
307+
opal/mca/pmix/pmix120/pmix/include/private/autogen/config.h
308+
opal/mca/pmix/pmix120/pmix/include/private/autogen/config.h.in
309+
306310

307311
opal/tools/opal-checkpoint/opal-checkpoint
308312
opal/tools/opal-checkpoint/opal-checkpoint.1

config/opal_check_pmi.m4

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -232,20 +232,25 @@ AC_DEFUN([OPAL_CHECK_PMIX],[
232232
233233
OPAL_VAR_SCOPE_PUSH([pmix_ext_install_dir])
234234
235-
AC_ARG_WITH([external-pmix],
236-
[AC_HELP_STRING([--with-external-pmix(=DIR)],
237-
[Use external PMIx support, optionally adding DIR to the search path (default: no)])],
238-
[], with_external_pmix=no)
235+
AC_ARG_WITH([pmix],
236+
[AC_HELP_STRING([--with-pmix(=DIR)],
237+
[Build PMIx support. DIR can take one of three values: "internal", "external", or a valid directory name. "internal" (or no DIR value) forces Open MPI to use its internal copy of PMIx. "external" forces Open MPI to use an external installation of PMIx. Supplying a valid directory name also forces Open MPI to use an external installation of PMIx, and adds DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries. Note that Open MPI does not support --without-pmix.])])
239238
240-
AC_MSG_CHECKING([if user requested PMIx support])
241-
AS_IF([test "$with_external_pmix" = "no"],
239+
AS_IF([test "$with_pmix" = "no"],
240+
[AC_MSG_WARN([Open MPI requires PMIx support. It can be built])
241+
AC_MSG_WARN([with either its own internal copy of PMIx, or with])
242+
AC_MSG_WARN([an external copy that you supply.])
243+
AC_MSG_ERROR([Cannot continue])])
244+
245+
AC_MSG_CHECKING([if user requested PMIx support($with_pmix)])
246+
# An empty value, "yes" or "internal" selects the bundled PMIx; any other value is handled below as an external installation
AS_IF([test -z "$with_pmix" || test "$with_pmix" = "yes" || test "$with_pmix" = "internal"],
242247
[AC_MSG_RESULT([no])
243248
opal_external_pmix_happy="no"],
244249
[AC_MSG_RESULT([yes])
245250
# check for external pmix lib
246-
AS_IF([test "$with_external_pmix" == "yes" || test -z "$with_external_pmix"],
251+
AS_IF([test "$with_pmix" = "external"],
247252
[pmix_ext_install_dir=/usr],
248-
[pmix_ext_install_dir=$with_external_pmix])
253+
[pmix_ext_install_dir=$with_pmix])
249254
250255
# cannot use check_package because there are
251256
# external dependencies to make the headers

ompi/errhandler/errhandler.c

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
1515
* Copyright (c) 2015 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
17+
* Copyright (c) 2015 Intel, Inc. All rights reserved.
1718
* $COPYRIGHT$
1819
*
1920
* Additional copyrights may follow
@@ -30,13 +31,18 @@
3031
#include "ompi/errhandler/errhandler.h"
3132
#include "ompi/errhandler/errhandler_predefined.h"
3233
#include "opal/class/opal_pointer_array.h"
34+
#include "opal/mca/pmix/pmix.h"
3335

3436

3537
/*
3638
* Table for Fortran <-> C errhandler handle conversion
3739
*/
3840
opal_pointer_array_t ompi_errhandler_f_to_c_table = {{0}};
3941

42+
/*
43+
* default errhandler id
44+
*/
45+
static int default_errhandler_id = -1;
4046

4147
/*
4248
* Class information
@@ -157,6 +163,7 @@ int ompi_errhandler_finalize(void)
157163

158164
/* JMS Add stuff here checking for unreleased errorhandlers,
159165
similar to communicators, info handles, etc. */
166+
opal_pmix.deregister_errhandler(default_errhandler_id, NULL, NULL);
160167

161168
/* Remove errhandler F2C table */
162169

@@ -169,7 +176,7 @@ int ompi_errhandler_finalize(void)
169176

170177

171178
ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
172-
ompi_errhandler_generic_handler_fn_t *func,
179+
ompi_errhandler_generic_handler_fn_t *func,
173180
ompi_errhandler_lang_t lang)
174181
{
175182
ompi_errhandler_t *new_errhandler;
@@ -213,20 +220,33 @@ ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
213220
return new_errhandler;
214221
}
215222

223+
/* registration callback */
224+
void ompi_errhandler_registration_callback(int status,
225+
int errhandler_ref,
226+
void *cbdata)
227+
{
228+
ompi_errhandler_errtrk_t *errtrk = (ompi_errhandler_errtrk_t*)cbdata;
229+
230+
default_errhandler_id = errhandler_ref;
231+
errtrk->status = status;
232+
errtrk->active = false;
233+
}
234+
216235
/**
217-
* Default runtime errhandler callback
236+
* Default errhandler callback
218237
*/
219-
int ompi_errhandler_runtime_callback(opal_pointer_array_t *errors) {
220-
ompi_rte_error_report_t *err;
221-
int errcode = 1;
222-
223-
if (NULL != errors &&
224-
(NULL != (err = (ompi_rte_error_report_t*)opal_pointer_array_get_item(errors, 0)))) {
225-
errcode = err->errcode;
238+
void ompi_errhandler_callback(int status,
239+
opal_list_t *procs,
240+
opal_list_t *info,
241+
opal_pmix_release_cbfunc_t cbfunc,
242+
void *cbdata)
243+
{
244+
/* allow the caller to release its data */
245+
if (NULL != cbfunc) {
246+
cbfunc(cbdata);
226247
}
227-
228-
ompi_mpi_abort(MPI_COMM_WORLD, errcode);
229-
return OMPI_SUCCESS;
248+
/* our default action is to abort */
249+
ompi_mpi_abort(MPI_COMM_WORLD, status);
230250
}
231251

232252
/**************************************************************************

ompi/errhandler/errhandler.h

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
15+
* Copyright (c) 2015 Intel, Inc. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -30,6 +31,7 @@
3031
#include "opal/prefetch.h"
3132
#include "opal/class/opal_object.h"
3233
#include "opal/class/opal_pointer_array.h"
34+
#include "opal/mca/pmix/pmix.h"
3335

3436
#include "ompi/mca/rte/rte.h"
3537
#include "ompi/runtime/mpiruntime.h"
@@ -364,29 +366,28 @@ struct ompi_request_t;
364366
ompi_errhandler_lang_t language);
365367

366368
/**
367-
* Callback function from runtime layer to alert the MPI layer of an error at
368-
* the runtime layer.
369-
*
370-
* @param errors A pointer array containing structs of type
371-
* ompi_rte_error_report_t that consists of at least
372-
* {
373-
* ompi_process_name_t proc;
374-
* int errcode;
375-
* }
376-
* Each RTE is allowed to add additional information
377-
* as required
369+
* Callback function to alert the MPI layer of an error or notification
370+
* from the internal RTE and/or the resource manager.
378371
*
379372
* This function is used to alert the MPI layer to a specific fault detected by the
380-
* runtime layer. This could be a process failure, a lost connection, or the inability
373+
* runtime layer or host RM. This could be a process failure, a lost connection, or the inability
381374
* to send an OOB message. The MPI layer has the option to perform whatever actions it
382375
* needs to stabilize itself and continue running, abort, etc.
383-
*
384-
* Upon completion, the error handler should return OMPI_SUCCESS if the error has
385-
* been resolved and no further callbacks are to be executed. Return of any other
386-
* value will cause the RTE to continue executing error callbacks.
387376
*/
388-
OMPI_DECLSPEC int ompi_errhandler_runtime_callback(opal_pointer_array_t *errors);
389-
377+
typedef struct {
378+
volatile bool active;
379+
int status;
380+
} ompi_errhandler_errtrk_t;
381+
382+
OMPI_DECLSPEC void ompi_errhandler_callback(int status,
383+
opal_list_t *procs,
384+
opal_list_t *info,
385+
opal_pmix_release_cbfunc_t cbfunc,
386+
void *cbdata);
387+
388+
OMPI_DECLSPEC void ompi_errhandler_registration_callback(int status,
389+
int errhandler_ref,
390+
void *cbdata);
390391
/**
391392
* Check to see if an errhandler is intrinsic.
392393
*

ompi/mca/rte/orte/rte_orte.h

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
66
* Copyright (c) 2014 Research Organization for Information Science
77
* and Technology (RIST). All rights reserved.
8+
* Copyright (c) 2015 Intel, Inc. All rights reserved.
89
* $COPYRIGHT$
910
*
1011
* Additional copyrights may follow
@@ -83,12 +84,6 @@ typedef orte_local_rank_t ompi_local_rank_t;
8384
OMPI_DECLSPEC void __opal_attribute_noreturn__
8485
ompi_rte_abort(int error_code, char *fmt, ...);
8586
#define ompi_rte_abort_peers(a, b, c) orte_errmgr.abort_peers(a, b, c)
86-
#define OMPI_RTE_ERRHANDLER_FIRST ORTE_ERRMGR_CALLBACK_FIRST
87-
#define OMPI_RTE_ERRHANDLER_LAST ORTE_ERRMGR_CALLBACK_LAST
88-
#define OMPI_RTE_ERRHANDLER_PREPEND ORTE_ERRMGR_CALLBACK_PREPEND
89-
#define OMPI_RTE_ERRHANDLER_APPEND ORTE_ERRMGR_CALLBACK_APPEND
90-
typedef orte_error_t ompi_rte_error_report_t;
91-
#define ompi_rte_register_errhandler(a, b) orte_errmgr.register_error_callback(a, b)
9287
#define OMPI_ERROR_LOG ORTE_ERROR_LOG
9388

9489
/* Init and finalize objects and operations */

ompi/mca/rte/rte.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights reserved.
44
* Copyright (c) 2013 Mellanox Technologies, Inc.
55
* All rights reserved.
6-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
6+
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
77
*
88
* $COPYRIGHT$
99
*
@@ -111,8 +111,6 @@
111111
* 2. int ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs) -
112112
* Abort the specified list of peers
113113
* 3. OMPI_ERROR_LOG(rc) - print error message regarding the given return code
114-
* 4. ompi_rte_register_errhandler - register a callback function for the RTE
115-
* to report asynchronous errors to the caller
116114
*
117115
* (e) Init and finalize objects and operations
118116
* 1. ompi_rte_init - a function to initialize the RTE. The function

ompi/runtime/ompi_mpi_init.c

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
378378
size_t nprocs;
379379
char *error = NULL;
380380
char *cmd=NULL, *av=NULL;
381+
ompi_errhandler_errtrk_t errtrk;
381382
OPAL_TIMING_DECLARE(tm);
382383
OPAL_TIMING_INIT_EXT(&tm, OPAL_TIMING_GET_TIME_OF_DAY);
383384

@@ -504,11 +505,18 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
504505
}
505506
}
506507

507-
/* Register the default errhandler callback - RTE will ignore if it
508-
* doesn't support this capability
509-
*/
510-
ompi_rte_register_errhandler(ompi_errhandler_runtime_callback,
511-
OMPI_RTE_ERRHANDLER_LAST);
508+
/* Register the default errhandler callback */
509+
errtrk.status = OPAL_ERROR;
510+
errtrk.active = true;
511+
opal_pmix.register_errhandler(NULL, ompi_errhandler_callback,
512+
ompi_errhandler_registration_callback,
513+
(void*)&errtrk);
514+
OMPI_WAIT_FOR_COMPLETION(errtrk.active);
515+
if (OPAL_SUCCESS != errtrk.status) {
516+
error = "Error handler registration";
517+
ret = errtrk.status;
518+
goto error;
519+
}
512520

513521
/* Figure out the final MPI thread levels. If we were not
514522
compiled for support for MPI threads, then don't allow

opal/mca/hwloc/external/configure.m4

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,21 @@ AC_DEFUN([MCA_opal_hwloc_external_CONFIG],[
181181
[AC_MSG_RESULT([yes])],
182182
[AC_MSG_RESULT([no])
183183
AC_MSG_ERROR([Cannot continue])])
184+
AC_MSG_CHECKING([if external hwloc version is lower than 2.0])
185+
AS_IF([test "$opal_hwloc_dir" != ""],
186+
[opal_hwloc_external_CFLAGS_save=$CFLAGS
187+
CFLAGS="-I$opal_hwloc_dir/include $opal_hwloc_external_CFLAGS_save"])
188+
AC_COMPILE_IFELSE(
189+
[AC_LANG_PROGRAM([[#include <hwloc.h>]],
190+
[[
191+
#if HWLOC_API_VERSION >= 0x00020000
192+
#error "hwloc API version is greater or equal than 0x00020000"
193+
#endif
194+
]])],
195+
[AC_MSG_RESULT([yes])],
196+
[AC_MSG_RESULT([no])
197+
AC_MSG_ERROR([OMPI does not currently support hwloc v2 API
198+
Cannot continue])])
184199
AS_IF([test "$opal_hwloc_dir" != ""],
185200
[CFLAGS=$opal_hwloc_external_CFLAGS_save])
186201
$1],

opal/mca/pmix/base/base.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,17 @@ OPAL_DECLSPEC int opal_pmix_base_select(void);
3232

3333
OPAL_DECLSPEC extern bool opal_pmix_base_allow_delayed_server;
3434

35-
OPAL_DECLSPEC void opal_pmix_base_register_handler(opal_pmix_errhandler_fn_t err);
36-
OPAL_DECLSPEC void opal_pmix_base_deregister_handler(void);
35+
OPAL_DECLSPEC void opal_pmix_base_register_handler(opal_list_t *info,
36+
opal_pmix_notification_fn_t errhandler,
37+
opal_pmix_errhandler_reg_cbfunc_t cbfunc,
38+
void *cbdata);
39+
OPAL_DECLSPEC void opal_pmix_base_deregister_handler(int errhandler,
40+
opal_pmix_op_cbfunc_t cbfunc,
41+
void *cbdata);
3742
OPAL_DECLSPEC void opal_pmix_base_errhandler(int status,
3843
opal_list_t *procs,
39-
opal_list_t *info);
44+
opal_list_t *info,
45+
opal_pmix_release_cbfunc_t cbfunc, void *cbdata);
4046
OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info,
4147
opal_pmix_pdata_t *pdat,
4248
int timeout);

opal/mca/pmix/base/pmix_base_fns.c

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,26 +38,40 @@
3838

3939
#define OPAL_PMI_PAD 10
4040

41-
/******** ERRHANDLER SUPPORT ********/
42-
static opal_pmix_errhandler_fn_t errhandler = NULL;
43-
44-
void opal_pmix_base_register_handler(opal_pmix_errhandler_fn_t err)
41+
/******** ERRHANDLER SUPPORT FOR COMPONENTS THAT
42+
******** DO NOT NATIVELY SUPPORT IT
43+
********/
44+
static opal_pmix_notification_fn_t errhandler = NULL;
45+
46+
void opal_pmix_base_register_handler(opal_list_t *info,
47+
opal_pmix_notification_fn_t err,
48+
opal_pmix_errhandler_reg_cbfunc_t cbfunc,
49+
void *cbdata)
4550
{
4651
errhandler = err;
52+
if (NULL != cbfunc) {
53+
cbfunc(OPAL_SUCCESS, 0, cbdata);
54+
}
4755
}
4856

4957
void opal_pmix_base_errhandler(int status,
5058
opal_list_t *procs,
51-
opal_list_t *info)
59+
opal_list_t *info,
60+
opal_pmix_release_cbfunc_t cbfunc, void *cbdata)
5261
{
5362
if (NULL != errhandler) {
54-
errhandler(status);
63+
errhandler(status, procs, info, cbfunc, cbdata);
5564
}
5665
}
5766

58-
void opal_pmix_base_deregister_handler(void)
67+
void opal_pmix_base_deregister_handler(int errid,
68+
opal_pmix_op_cbfunc_t cbfunc,
69+
void *cbdata)
5970
{
6071
errhandler = NULL;
72+
if (NULL != cbfunc) {
73+
cbfunc(OPAL_SUCCESS, cbdata);
74+
}
6175
}
6276

6377
struct lookup_caddy_t {

0 commit comments

Comments
 (0)