Skip to content

Commit 810f244

Browse files
Ralph Castain authored and
ggouaillardet committed
Add pmix120 component, update the error handling functions in the PMIx API.
Update the configure logic for the new pmix120 component. ckpt. Get the pmix120 component to work - still not really registering or handling notifications, but infrastructure now operates. Clean up some of the symbol scopes, and provide a more comprehensive rename.h file. Will pretty it up later - let's see how this works. Clean up the rename files to use the pretty macros.
1 parent c757c5c commit 810f244

File tree

208 files changed

+49942
-398
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

208 files changed

+49942
-398
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,10 @@ opal/mca/installdirs/config/install_dirs.h
303303
opal/mca/pmix/pmix112/pmix/include/pmix/autogen/config.h
304304
opal/mca/pmix/pmix112/pmix/include/private/autogen/config.h
305305
opal/mca/pmix/pmix112/pmix/include/private/autogen/config.h.in
306+
opal/mca/pmix/pmix120/pmix/include/pmix/autogen/config.h
307+
opal/mca/pmix/pmix120/pmix/include/private/autogen/config.h
308+
opal/mca/pmix/pmix120/pmix/include/private/autogen/config.h.in
309+
306310

307311
opal/tools/opal-checkpoint/opal-checkpoint
308312
opal/tools/opal-checkpoint/opal-checkpoint.1

ompi/errhandler/errhandler.c

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
1515
* Copyright (c) 2015 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
17+
* Copyright (c) 2015 Intel, Inc. All rights reserved.
1718
* $COPYRIGHT$
1819
*
1920
* Additional copyrights may follow
@@ -30,13 +31,18 @@
3031
#include "ompi/errhandler/errhandler.h"
3132
#include "ompi/errhandler/errhandler_predefined.h"
3233
#include "opal/class/opal_pointer_array.h"
34+
#include "opal/mca/pmix/pmix.h"
3335

3436

3537
/*
3638
* Table for Fortran <-> C errhandler handle conversion
3739
*/
3840
opal_pointer_array_t ompi_errhandler_f_to_c_table = {{0}};
3941

42+
/*
43+
* default errhandler id
44+
*/
45+
static int default_errhandler_id = -1;
4046

4147
/*
4248
* Class information
@@ -157,6 +163,7 @@ int ompi_errhandler_finalize(void)
157163

158164
/* JMS Add stuff here checking for unreleased errorhandlers,
159165
similar to communicators, info handles, etc. */
166+
opal_pmix.deregister_errhandler(default_errhandler_id, NULL, NULL);
160167

161168
/* Remove errhandler F2C table */
162169

@@ -169,7 +176,7 @@ int ompi_errhandler_finalize(void)
169176

170177

171178
ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
172-
ompi_errhandler_generic_handler_fn_t *func,
179+
ompi_errhandler_generic_handler_fn_t *func,
173180
ompi_errhandler_lang_t lang)
174181
{
175182
ompi_errhandler_t *new_errhandler;
@@ -213,20 +220,33 @@ ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
213220
return new_errhandler;
214221
}
215222

223+
/* registration callback */
224+
void ompi_errhandler_registration_callback(int status,
225+
int errhandler_ref,
226+
void *cbdata)
227+
{
228+
ompi_errhandler_errtrk_t *errtrk = (ompi_errhandler_errtrk_t*)cbdata;
229+
230+
default_errhandler_id = errhandler_ref;
231+
errtrk->status = status;
232+
errtrk->active = false;
233+
}
234+
216235
/**
217-
* Default runtime errhandler callback
236+
* Default errhandler callback
218237
*/
219-
int ompi_errhandler_runtime_callback(opal_pointer_array_t *errors) {
220-
ompi_rte_error_report_t *err;
221-
int errcode = 1;
222-
223-
if (NULL != errors &&
224-
(NULL != (err = (ompi_rte_error_report_t*)opal_pointer_array_get_item(errors, 0)))) {
225-
errcode = err->errcode;
238+
void ompi_errhandler_callback(int status,
239+
opal_list_t *procs,
240+
opal_list_t *info,
241+
opal_pmix_release_cbfunc_t cbfunc,
242+
void *cbdata)
243+
{
244+
/* allow the caller to release its data */
245+
if (NULL != cbfunc) {
246+
cbfunc(cbdata);
226247
}
227-
228-
ompi_mpi_abort(MPI_COMM_WORLD, errcode);
229-
return OMPI_SUCCESS;
248+
/* our default action is to abort */
249+
ompi_mpi_abort(MPI_COMM_WORLD, status);
230250
}
231251

232252
/**************************************************************************

ompi/errhandler/errhandler.h

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
15+
* Copyright (c) 2015 Intel, Inc. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -30,6 +31,7 @@
3031
#include "opal/prefetch.h"
3132
#include "opal/class/opal_object.h"
3233
#include "opal/class/opal_pointer_array.h"
34+
#include "opal/mca/pmix/pmix.h"
3335

3436
#include "ompi/mca/rte/rte.h"
3537
#include "ompi/runtime/mpiruntime.h"
@@ -364,29 +366,28 @@ struct ompi_request_t;
364366
ompi_errhandler_lang_t language);
365367

366368
/**
367-
* Callback function from runtime layer to alert the MPI layer of an error at
368-
* the runtime layer.
369-
*
370-
* @param errors A pointer array containing structs of type
371-
* ompi_rte_error_report_t that consists of at least
372-
* {
373-
* ompi_process_name_t proc;
374-
* int errcode;
375-
* }
376-
* Each RTE is allowed to add additional information
377-
* as required
369+
* Callback function to alert the MPI layer of an error or notification
370+
* from the internal RTE and/or the resource manager.
378371
*
379372
* This function is used to alert the MPI layer to a specific fault detected by the
380-
* runtime layer. This could be a process failure, a lost connection, or the inability
373+
* runtime layer or host RM. This could be a process failure, a lost connection, or the inability
381374
* to send an OOB message. The MPI layer has the option to perform whatever actions it
382375
* needs to stabilize itself and continue running, abort, etc.
383-
*
384-
* Upon completion, the error handler should return OMPI_SUCCESS if the error has
385-
* been resolved and no further callbacks are to be executed. Return of any other
386-
* value will cause the RTE to continue executing error callbacks.
387376
*/
388-
OMPI_DECLSPEC int ompi_errhandler_runtime_callback(opal_pointer_array_t *errors);
389-
377+
typedef struct {
378+
volatile bool active;
379+
int status;
380+
} ompi_errhandler_errtrk_t;
381+
382+
OMPI_DECLSPEC void ompi_errhandler_callback(int status,
383+
opal_list_t *procs,
384+
opal_list_t *info,
385+
opal_pmix_release_cbfunc_t cbfunc,
386+
void *cbdata);
387+
388+
OMPI_DECLSPEC void ompi_errhandler_registration_callback(int status,
389+
int errhandler_ref,
390+
void *cbdata);
390391
/**
391392
* Check to see if an errhandler is intrinsic.
392393
*

ompi/mca/rte/orte/rte_orte.h

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
66
* Copyright (c) 2014 Research Organization for Information Science
77
* and Technology (RIST). All rights reserved.
8+
* Copyright (c) 2015 Intel, Inc. All rights reserved.
89
* $COPYRIGHT$
910
*
1011
* Additional copyrights may follow
@@ -83,12 +84,6 @@ typedef orte_local_rank_t ompi_local_rank_t;
8384
OMPI_DECLSPEC void __opal_attribute_noreturn__
8485
ompi_rte_abort(int error_code, char *fmt, ...);
8586
#define ompi_rte_abort_peers(a, b, c) orte_errmgr.abort_peers(a, b, c)
86-
#define OMPI_RTE_ERRHANDLER_FIRST ORTE_ERRMGR_CALLBACK_FIRST
87-
#define OMPI_RTE_ERRHANDLER_LAST ORTE_ERRMGR_CALLBACK_LAST
88-
#define OMPI_RTE_ERRHANDLER_PREPEND ORTE_ERRMGR_CALLBACK_PREPEND
89-
#define OMPI_RTE_ERRHANDLER_APPEND ORTE_ERRMGR_CALLBACK_APPEND
90-
typedef orte_error_t ompi_rte_error_report_t;
91-
#define ompi_rte_register_errhandler(a, b) orte_errmgr.register_error_callback(a, b)
9287
#define OMPI_ERROR_LOG ORTE_ERROR_LOG
9388

9489
/* Init and finalize objects and operations */

ompi/mca/rte/rte.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights reserved.
44
* Copyright (c) 2013 Mellanox Technologies, Inc.
55
* All rights reserved.
6-
* Copyright (c) 2014 Intel, Inc. All rights reserved.
6+
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
77
*
88
* $COPYRIGHT$
99
*
@@ -111,8 +111,6 @@
111111
* 2. int ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs) -
112112
* Abort the specified list of peers
113113
* 3. OMPI_ERROR_LOG(rc) - print error message regarding the given return code
114-
* 4. ompi_rte_register_errhandler - register a callback function for the RTE
115-
* to report asynchronous errors to the caller
116114
*
117115
* (e) Init and finalize objects and operations
118116
* 1. ompi_rte_init - a function to initialize the RTE. The function

ompi/runtime/ompi_mpi_init.c

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
378378
size_t nprocs;
379379
char *error = NULL;
380380
char *cmd=NULL, *av=NULL;
381+
ompi_errhandler_errtrk_t errtrk;
381382
OPAL_TIMING_DECLARE(tm);
382383
OPAL_TIMING_INIT_EXT(&tm, OPAL_TIMING_GET_TIME_OF_DAY);
383384

@@ -504,11 +505,18 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
504505
}
505506
}
506507

507-
/* Register the default errhandler callback - RTE will ignore if it
508-
* doesn't support this capability
509-
*/
510-
ompi_rte_register_errhandler(ompi_errhandler_runtime_callback,
511-
OMPI_RTE_ERRHANDLER_LAST);
508+
/* Register the default errhandler callback */
509+
errtrk.status = OPAL_ERROR;
510+
errtrk.active = true;
511+
opal_pmix.register_errhandler(NULL, ompi_errhandler_callback,
512+
ompi_errhandler_registration_callback,
513+
(void*)&errtrk);
514+
OMPI_WAIT_FOR_COMPLETION(errtrk.active);
515+
if (OPAL_SUCCESS != errtrk.status) {
516+
error = "Error handler registration";
517+
ret = errtrk.status;
518+
goto error;
519+
}
512520

513521
/* Figure out the final MPI thread levels. If we were not
514522
compiled for support for MPI threads, then don't allow

opal/mca/pmix/base/base.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,17 @@ OPAL_DECLSPEC int opal_pmix_base_select(void);
3232

3333
OPAL_DECLSPEC extern bool opal_pmix_base_allow_delayed_server;
3434

35-
OPAL_DECLSPEC void opal_pmix_base_register_handler(opal_pmix_errhandler_fn_t err);
36-
OPAL_DECLSPEC void opal_pmix_base_deregister_handler(void);
35+
OPAL_DECLSPEC void opal_pmix_base_register_handler(opal_list_t *info,
36+
opal_pmix_notification_fn_t errhandler,
37+
opal_pmix_errhandler_reg_cbfunc_t cbfunc,
38+
void *cbdata);
39+
OPAL_DECLSPEC void opal_pmix_base_deregister_handler(int errhandler,
40+
opal_pmix_op_cbfunc_t cbfunc,
41+
void *cbdata);
3742
OPAL_DECLSPEC void opal_pmix_base_errhandler(int status,
3843
opal_list_t *procs,
39-
opal_list_t *info);
44+
opal_list_t *info,
45+
opal_pmix_release_cbfunc_t cbfunc, void *cbdata);
4046
OPAL_DECLSPEC int opal_pmix_base_exchange(opal_value_t *info,
4147
opal_pmix_pdata_t *pdat,
4248
int timeout);

opal/mca/pmix/base/pmix_base_fns.c

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,26 +38,40 @@
3838

3939
#define OPAL_PMI_PAD 10
4040

41-
/******** ERRHANDLER SUPPORT ********/
42-
static opal_pmix_errhandler_fn_t errhandler = NULL;
43-
44-
void opal_pmix_base_register_handler(opal_pmix_errhandler_fn_t err)
41+
/******** ERRHANDLER SUPPORT FOR COMPONENTS THAT
42+
******** DO NOT NATIVELY SUPPORT IT
43+
********/
44+
static opal_pmix_notification_fn_t errhandler = NULL;
45+
46+
void opal_pmix_base_register_handler(opal_list_t *info,
47+
opal_pmix_notification_fn_t err,
48+
opal_pmix_errhandler_reg_cbfunc_t cbfunc,
49+
void *cbdata)
4550
{
4651
errhandler = err;
52+
if (NULL != cbfunc) {
53+
cbfunc(OPAL_SUCCESS, 0, cbdata);
54+
}
4755
}
4856

4957
void opal_pmix_base_errhandler(int status,
5058
opal_list_t *procs,
51-
opal_list_t *info)
59+
opal_list_t *info,
60+
opal_pmix_release_cbfunc_t cbfunc, void *cbdata)
5261
{
5362
if (NULL != errhandler) {
54-
errhandler(status);
63+
errhandler(status, procs, info, cbfunc, cbdata);
5564
}
5665
}
5766

58-
void opal_pmix_base_deregister_handler(void)
67+
void opal_pmix_base_deregister_handler(int errid,
68+
opal_pmix_op_cbfunc_t cbfunc,
69+
void *cbdata)
5970
{
6071
errhandler = NULL;
72+
if (NULL != cbfunc) {
73+
cbfunc(OPAL_SUCCESS, cbdata);
74+
}
6175
}
6276

6377
struct lookup_caddy_t {

opal/mca/pmix/external/pmix_ext_client.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ static void myerr(pmix_status_t status,
7777
iptr = OBJ_NEW(opal_value_t);
7878
iptr->key = strdup(info[n].key);
7979
pmix1_value_unload(iptr, &info[n].value);
80-
opal_list_append(&plist, &nm->super);
80+
opal_list_append(&plist, &iptr->super);
8181
}
8282

8383
/* call the base errhandler */

opal/mca/pmix/external/pmix_ext_server_south.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ static void myerr(pmix_status_t status,
9191
iptr = OBJ_NEW(opal_value_t);
9292
iptr->key = strdup(info[n].key);
9393
pmix1_value_unload(iptr, &info[n].value);
94-
opal_list_append(&plist, &nm->super);
94+
opal_list_append(&plist, &iptr->super);
9595
}
9696

9797
/* call the base errhandler */

0 commit comments

Comments
 (0)