Skip to content

Commit 5d330d5

Browse files
author
Ralph Castain
committed
Enable the PMIx event notification capability and use that for all error notifications, including debugger release. This capability requires use of PMIx 2.0 or above, as the features are not available with earlier PMIx releases. When OMPI master is built against an earlier external version, it will fall back to the prior behavior - i.e., the debugger will be released via RML and all notifications will go strictly to the default error handler.
Add PMIx 2.0. Remove PMIx 1.1.4. Clean up copying of component. Add missing file. Touch up a typo in the Makefile.am. Update the pmix ext114 component. Minor cleanups and resync to master. Update to latest PMIx 2.x. Update to the latest changes on the PMIx event notification branch.
1 parent c2185bb commit 5d330d5

File tree

264 files changed

+11825
-5905
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

264 files changed

+11825
-5905
lines changed

ompi/errhandler/errhandler.c

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
1515
* Copyright (c) 2015 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
17-
* Copyright (c) 2015 Intel, Inc. All rights reserved.
17+
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
1818
* $COPYRIGHT$
1919
*
2020
* Additional copyrights may follow
@@ -42,7 +42,7 @@ opal_pointer_array_t ompi_errhandler_f_to_c_table = {{0}};
4242
/*
4343
* default errhandler id
4444
*/
45-
static int default_errhandler_id = -1;
45+
static size_t default_errhandler_id = SIZE_MAX;
4646

4747
/*
4848
* Class information
@@ -163,7 +163,7 @@ int ompi_errhandler_finalize(void)
163163

164164
/* JMS Add stuff here checking for unreleased errorhandlers,
165165
similar to communicators, info handles, etc. */
166-
opal_pmix.deregister_errhandler(default_errhandler_id, NULL, NULL);
166+
opal_pmix.deregister_evhandler(default_errhandler_id, NULL, NULL);
167167

168168
/* Remove errhandler F2C table */
169169

@@ -222,7 +222,7 @@ ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
222222

223223
/* registration callback */
224224
void ompi_errhandler_registration_callback(int status,
225-
int errhandler_ref,
225+
size_t errhandler_ref,
226226
void *cbdata)
227227
{
228228
ompi_errhandler_errtrk_t *errtrk = (ompi_errhandler_errtrk_t*)cbdata;
@@ -236,14 +236,15 @@ void ompi_errhandler_registration_callback(int status,
236236
* Default errhandler callback
237237
*/
238238
void ompi_errhandler_callback(int status,
239-
opal_list_t *procs,
240-
opal_list_t *info,
241-
opal_pmix_release_cbfunc_t cbfunc,
239+
const opal_process_name_t *source,
240+
opal_list_t *info, opal_list_t *results,
241+
opal_pmix_notification_complete_fn_t cbfunc,
242242
void *cbdata)
243243
{
244-
/* allow the caller to release its data */
244+
/* tell the event chain engine to go no further - we
245+
* will handle this */
245246
if (NULL != cbfunc) {
246-
cbfunc(cbdata);
247+
cbfunc(OMPI_ERR_HANDLERS_COMPLETE, NULL, NULL, NULL, cbdata);
247248
}
248249
/* our default action is to abort */
249250
ompi_mpi_abort(MPI_COMM_WORLD, status);

ompi/errhandler/errhandler.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
15-
* Copyright (c) 2015 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -380,13 +380,13 @@ typedef struct {
380380
} ompi_errhandler_errtrk_t;
381381

382382
OMPI_DECLSPEC void ompi_errhandler_callback(int status,
383-
opal_list_t *procs,
384-
opal_list_t *info,
385-
opal_pmix_release_cbfunc_t cbfunc,
383+
const opal_process_name_t *source,
384+
opal_list_t *info, opal_list_t *results,
385+
opal_pmix_notification_complete_fn_t cbfunc,
386386
void *cbdata);
387387

388388
OMPI_DECLSPEC void ompi_errhandler_registration_callback(int status,
389-
int errhandler_ref,
389+
size_t errhandler_ref,
390390
void *cbdata);
391391
/**
392392
* Check to see if an errhandler is intrinsic.

ompi/include/ompi/constants.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12+
* Copyright (c) 2016 Intel, Inc. All rights reserved.
1213
* $COPYRIGHT$
1314
*
1415
* Additional copyrights may follow
@@ -63,6 +64,7 @@ enum {
6364

6465
OMPI_ERR_BUFFER = OPAL_ERR_BUFFER,
6566
OMPI_ERR_SILENT = OPAL_ERR_SILENT,
67+
OMPI_ERR_HANDLERS_COMPLETE = OPAL_ERR_HANDLERS_COMPLETE,
6668

6769
OMPI_ERR_REQUEST = OMPI_ERR_BASE - 1,
6870
OMPI_ERR_RMA_SYNC = OMPI_ERR_BASE - 2,

ompi/mca/rte/orte/rte_orte_module.c

Lines changed: 47 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,34 @@ void ompi_rte_abort(int error_code, char *fmt, ...)
9595
exit(-1);
9696
}
9797

98+
static size_t handler = SIZE_MAX;
99+
static bool debugger_register_active = true;
100+
static bool debugger_event_active = true;
101+
102+
static void _release_fn(int status,
103+
const opal_process_name_t *source,
104+
opal_list_t *info, opal_list_t *results,
105+
opal_pmix_notification_complete_fn_t cbfunc,
106+
void *cbdata)
107+
{
108+
/* must let the notifier know we are done */
109+
if (NULL != cbfunc) {
110+
cbfunc(ORTE_SUCCESS, NULL, NULL, NULL, cbdata);
111+
}
112+
debugger_event_active = false;
113+
}
114+
115+
static void _register_fn(int status,
116+
size_t evhandler_ref,
117+
void *cbdata)
118+
{
119+
opal_list_t *codes = (opal_list_t*)cbdata;
120+
121+
handler = evhandler_ref;
122+
OPAL_LIST_RELEASE(codes);
123+
debugger_register_active = false;
124+
}
125+
98126
/*
99127
* Wait for a debugger if asked. We support two ways of waiting for
100128
* attaching debuggers -- see big comment in
@@ -103,7 +131,8 @@ void ompi_rte_abort(int error_code, char *fmt, ...)
103131
void ompi_rte_wait_for_debugger(void)
104132
{
105133
int debugger;
106-
orte_rml_recv_cb_t xfer;
134+
opal_list_t *codes;
135+
opal_value_t *kv;
107136

108137
/* See lengthy comment in orte/tools/orterun/debuggers.c about
109138
orte_in_parallel_debugger */
@@ -133,23 +162,23 @@ void ompi_rte_wait_for_debugger(void)
133162
#endif
134163
}
135164
} else {
136-
/* only the rank=0 proc waits for either a message from the
137-
* HNP or for the debugger to attach - everyone else will just
138-
* spin in * the grpcomm barrier in ompi_mpi_init until rank=0
139-
* joins them.
140-
*/
141-
if (0 != ORTE_PROC_MY_NAME->vpid) {
142-
return;
143-
}
144165

145-
/* VPID 0 waits for a message from the HNP */
146-
OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t);
147-
xfer.active = true;
148-
orte_rml.recv_buffer_nb(OMPI_NAME_WILDCARD,
149-
ORTE_RML_TAG_DEBUGGER_RELEASE,
150-
ORTE_RML_NON_PERSISTENT,
151-
orte_rml_recv_callback, &xfer);
152-
/* let the MPI progress engine run while we wait */
153-
OMPI_WAIT_FOR_COMPLETION(xfer.active);
166+
/* register an event handler for the ORTE_ERR_DEBUGGER_RELEASE event */
167+
codes = OBJ_NEW(opal_list_t);
168+
kv = OBJ_NEW(opal_value_t);
169+
kv->key = strdup("errorcode");
170+
kv->type = OPAL_INT;
171+
kv->data.integer = ORTE_ERR_DEBUGGER_RELEASE;
172+
opal_list_append(codes, &kv->super);
173+
174+
opal_pmix.register_evhandler(codes, NULL, _release_fn, _register_fn, codes);
175+
/* let the MPI progress engine run while we wait for registration to complete */
176+
OMPI_WAIT_FOR_COMPLETION(debugger_register_active);
177+
178+
/* let the MPI progress engine run while we wait for debugger release */
179+
OMPI_WAIT_FOR_COMPLETION(debugger_event_active);
180+
181+
/* deregister the event handler */
182+
opal_pmix.deregister_evhandler(handler, NULL, NULL);
154183
}
155184
}

ompi/mpi/c/lookup_name.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@
1212
* All rights reserved.
1313
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2015 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2015 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
18-
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
18+
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
1919
* $COPYRIGHT$
2020
*
2121
* Additional copyrights may follow
@@ -94,13 +94,13 @@ int MPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name)
9494
rng = OBJ_NEW(opal_value_t);
9595
rng->key = strdup(OPAL_PMIX_RANGE);
9696
rng->type = OPAL_INT;
97-
rng->data.integer = OPAL_PMIX_NAMESPACE; // share only with procs in same nspace
97+
rng->data.integer = OPAL_PMIX_RANGE_NAMESPACE; // share only with procs in same nspace
9898
opal_list_append(&pinfo, &rng->super);
9999
} else if (0 == strcmp(range, "session")) {
100100
rng = OBJ_NEW(opal_value_t);
101101
rng->key = strdup(OPAL_PMIX_RANGE);
102102
rng->type = OPAL_INT;
103-
rng->data.integer = OPAL_PMIX_SESSION; // share only with procs in same session
103+
rng->data.integer = OPAL_PMIX_RANGE_SESSION; // share only with procs in same session
104104
opal_list_append(&pinfo, &rng->super);
105105
} else {
106106
/* unrecognized scope */

ompi/mpi/c/publish_name.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@
1212
* All rights reserved.
1313
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2015 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2015 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
18-
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
18+
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
1919
* $COPYRIGHT$
2020
*
2121
* Additional copyrights may follow
@@ -94,13 +94,13 @@ int MPI_Publish_name(const char *service_name, MPI_Info info,
9494
rng = OBJ_NEW(opal_value_t);
9595
rng->key = strdup(OPAL_PMIX_RANGE);
9696
rng->type = OPAL_INT;
97-
rng->data.integer = OPAL_PMIX_NAMESPACE; // share only with procs in same nspace
97+
rng->data.integer = OPAL_PMIX_RANGE_NAMESPACE; // share only with procs in same nspace
9898
opal_list_append(&values, &rng->super);
9999
} else if (0 == strcmp(range, "session")) {
100100
rng = OBJ_NEW(opal_value_t);
101101
rng->key = strdup(OPAL_PMIX_RANGE);
102102
rng->type = OPAL_INT;
103-
rng->data.integer = OPAL_PMIX_SESSION; // share only with procs in same session
103+
rng->data.integer = OPAL_PMIX_RANGE_SESSION; // share only with procs in same session
104104
opal_list_append(&values, &rng->super);
105105
} else {
106106
/* unrecognized scope */

ompi/mpi/c/unpublish_name.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@
1212
* All rights reserved.
1313
* Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2015 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2015 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
18-
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
18+
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
1919
* $COPYRIGHT$
2020
*
2121
* Additional copyrights may follow
@@ -96,13 +96,13 @@ int MPI_Unpublish_name(const char *service_name, MPI_Info info,
9696
rng = OBJ_NEW(opal_value_t);
9797
rng->key = strdup(OPAL_PMIX_RANGE);
9898
rng->type = OPAL_INT;
99-
rng->data.integer = OPAL_PMIX_NAMESPACE; // share only with procs in same nspace
99+
rng->data.integer = OPAL_PMIX_RANGE_NAMESPACE; // share only with procs in same nspace
100100
opal_list_append(&pinfo, &rng->super);
101101
} else if (0 == strcmp(range, "session")) {
102102
rng = OBJ_NEW(opal_value_t);
103103
rng->key = strdup(OPAL_PMIX_RANGE);
104104
rng->type = OPAL_INT;
105-
rng->data.integer = OPAL_PMIX_SESSION; // share only with procs in same session
105+
rng->data.integer = OPAL_PMIX_RANGE_SESSION; // share only with procs in same session
106106
opal_list_append(&pinfo, &rng->super);
107107
} else {
108108
/* unrecognized scope */

ompi/runtime/ompi_mpi_init.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
377377
char *cmd=NULL, *av=NULL;
378378
ompi_errhandler_errtrk_t errtrk;
379379
volatile bool active;
380+
opal_list_t info;
381+
opal_value_t *kv;
380382
OPAL_TIMING_DECLARE(tm);
381383
OPAL_TIMING_INIT_EXT(&tm, OPAL_TIMING_GET_TIME_OF_DAY);
382384

@@ -519,10 +521,16 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
519521
/* Register the default errhandler callback */
520522
errtrk.status = OPAL_ERROR;
521523
errtrk.active = true;
522-
opal_pmix.register_errhandler(NULL, ompi_errhandler_callback,
523-
ompi_errhandler_registration_callback,
524-
(void*)&errtrk);
524+
/* we want to go first */
525+
OBJ_CONSTRUCT(&info, opal_list_t);
526+
kv = OBJ_NEW(opal_value_t);
527+
kv->key = strdup(OPAL_PMIX_EVENT_ORDER_PREPEND);
528+
opal_list_append(&info, &kv->super);
529+
opal_pmix.register_evhandler(NULL, &info, ompi_errhandler_callback,
530+
ompi_errhandler_registration_callback,
531+
(void*)&errtrk);
525532
OMPI_WAIT_FOR_COMPLETION(errtrk.active);
533+
OPAL_LIST_DESTRUCT(&info);
526534
if (OPAL_SUCCESS != errtrk.status) {
527535
error = "Error handler registration";
528536
ret = errtrk.status;

opal/Makefile.am

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# Copyright (c) 2004-2005 The Regents of the University of California.
1111
# All rights reserved.
1212
# Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
13-
# Copyright (c) 2015 Intel, Inc. All rights reserved.
13+
# Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
1414
# $COPYRIGHT$
1515
#
1616
# Additional copyrights may follow
@@ -76,7 +76,6 @@ nobase_opal_HEADERS = $(headers)
7676
endif
7777

7878
include class/Makefile.am
79-
include errhandler/Makefile.am
8079
include memoryhooks/Makefile.am
8180
include runtime/Makefile.am
8281
include threads/Makefile.am

opal/errhandler/Makefile.am

Lines changed: 0 additions & 17 deletions
This file was deleted.

0 commit comments

Comments
 (0)