Skip to content

Commit 48d35a9

Browse files
author
Ralph Castain
committed
Ensure we properly convert the PMIx status to an ORTE state before activating an error state upon notification. Clean up some conversion issues on notification info. Add a new orte_notify.c test program.
1 parent 1ef3c86 commit 48d35a9

File tree

6 files changed

+147
-10
lines changed

6 files changed

+147
-10
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,7 @@ orte/test/system/opal_hwloc
457457
orte/test/system/opal_db
458458
orte/test/system/ulfm
459459
orte/test/system/pmixtool
460+
orte/test/system/orte_notify
460461

461462
orte/tools/orte-checkpoint/orte-checkpoint
462463
orte/tools/orte-checkpoint/orte-checkpoint.1

opal/mca/pmix/ext20/pmix_ext20.c

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,9 @@ static void progress_local_event_hdlr(int status,
179179
if (sing->code == chain->status) {
180180
OBJ_RETAIN(chain);
181181
chain->sing = sing;
182+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
183+
"%s PROGRESS CALLING SINGLE EVHDLR",
184+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
182185
sing->handler(chain->status, &chain->source,
183186
chain->info, &chain->results,
184187
progress_local_event_hdlr, (void*)chain);
@@ -204,6 +207,9 @@ static void progress_local_event_hdlr(int status,
204207
* callback function to our progression function */
205208
OBJ_RETAIN(chain);
206209
chain->multi = multi;
210+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
211+
"%s PROGRESS CALLING MULTI EVHDLR",
212+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
207213
multi->handler(chain->status, &chain->source,
208214
chain->info, &chain->results,
209215
progress_local_event_hdlr, (void*)chain);
@@ -230,6 +236,9 @@ static void progress_local_event_hdlr(int status,
230236
def = (opal_ext20_default_event_t*)nxt;
231237
OBJ_RETAIN(chain);
232238
chain->def = def;
239+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
240+
"%s PROGRESS CALLING DEFAULT EVHDLR",
241+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
233242
def->handler(chain->status, &chain->source,
234243
chain->info, &chain->results,
235244
progress_local_event_hdlr, (void*)chain);
@@ -259,7 +268,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
259268
opal_ext20_default_event_t *def;
260269

261270
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
262-
"%s RECEIVED NOTIFICATION OF STATUS %d",
271+
"%s _EVENT_HDLR RECEIVED NOTIFICATION OF STATUS %d",
263272
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), cd->status);
264273

265274
chain = OBJ_NEW(opal_ext20_event_chain_t);
@@ -281,7 +290,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
281290
OBJ_RETAIN(chain);
282291
chain->sing = sing;
283292
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
284-
"%s CALLING SINGLE EVHDLR",
293+
"%s _EVENT_HDLR CALLING SINGLE EVHDLR",
285294
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
286295
sing->handler(chain->status, &chain->source,
287296
chain->info, &chain->results,
@@ -300,7 +309,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
300309
OBJ_RETAIN(chain);
301310
chain->multi = multi;
302311
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
303-
"%s CALLING MULTI EVHDLR",
312+
"%s _EVENT_HDLR CALLING MULTI EVHDLR",
304313
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
305314
multi->handler(chain->status, &chain->source,
306315
chain->info, &chain->results,
@@ -327,7 +336,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
327336
OBJ_RETAIN(chain);
328337
chain->def = def;
329338
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
330-
"%s CALLING DEFAULT EVHDLR",
339+
"%s _EVENT_HDLR CALLING DEFAULT EVHDLR",
331340
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
332341
def->handler(chain->status, &chain->source,
333342
chain->info, &chain->results,
@@ -812,6 +821,10 @@ void ext20_value_load(pmix_value_t *v,
812821
* so the ORTE layer is responsible for converting it */
813822
memcpy(&v->data.state, &kv->data.uint8, sizeof(uint8_t));
814823
break;
824+
case OPAL_PTR:
825+
v->type = PMIX_POINTER;
826+
v->data.ptr = kv->data.ptr;
827+
break;
815828
default:
816829
/* silence warnings */
817830
break;
@@ -943,11 +956,17 @@ int ext20_value_unload(opal_value_t *kv,
943956
case PMIX_DATA_RANGE:
944957
kv->type = OPAL_DATA_RANGE;
945958
kv->data.uint8 = ext20_convert_range(v->data.persist);
959+
break;
946960
case PMIX_PROC_STATE:
947961
kv->type = OPAL_PROC_STATE;
948962
/* the OPAL layer doesn't have any concept of proc state,
949963
* so the ORTE layer is responsible for converting it */
950964
memcpy(&kv->data.uint8, &v->data.state, sizeof(uint8_t));
965+
break;
966+
case PMIX_POINTER:
967+
kv->type = OPAL_PTR;
968+
kv->data.ptr = v->data.ptr;
969+
break;
951970
default:
952971
/* silence warnings */
953972
rc = OPAL_ERROR;

opal/mca/pmix/pmix2x/pmix2x.c

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,9 @@ static void progress_local_event_hdlr(int status,
179179
if (sing->code == chain->status) {
180180
OBJ_RETAIN(chain);
181181
chain->sing = sing;
182+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
183+
"%s PROGRESS CALLING SINGLE EVHDLR",
184+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
182185
sing->handler(chain->status, &chain->source,
183186
chain->info, &chain->results,
184187
progress_local_event_hdlr, (void*)chain);
@@ -204,6 +207,9 @@ static void progress_local_event_hdlr(int status,
204207
* callback function to our progression function */
205208
OBJ_RETAIN(chain);
206209
chain->multi = multi;
210+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
211+
"%s PROGRESS CALLING MULTI EVHDLR",
212+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
207213
multi->handler(chain->status, &chain->source,
208214
chain->info, &chain->results,
209215
progress_local_event_hdlr, (void*)chain);
@@ -230,6 +236,9 @@ static void progress_local_event_hdlr(int status,
230236
def = (opal_pmix2x_default_event_t*)nxt;
231237
OBJ_RETAIN(chain);
232238
chain->def = def;
239+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
240+
"%s PROGRESS CALLING DEFAULT EVHDLR",
241+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
233242
def->handler(chain->status, &chain->source,
234243
chain->info, &chain->results,
235244
progress_local_event_hdlr, (void*)chain);
@@ -259,7 +268,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
259268
opal_pmix2x_default_event_t *def;
260269

261270
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
262-
"%s RECEIVED NOTIFICATION OF STATUS %d",
271+
"%s _EVENT_HDLR RECEIVED NOTIFICATION OF STATUS %d",
263272
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), cd->status);
264273

265274
chain = OBJ_NEW(opal_pmix2x_event_chain_t);
@@ -281,7 +290,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
281290
OBJ_RETAIN(chain);
282291
chain->sing = sing;
283292
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
284-
"%s CALLING SINGLE EVHDLR",
293+
"%s _EVENT_HDLR CALLING SINGLE EVHDLR",
285294
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
286295
sing->handler(chain->status, &chain->source,
287296
chain->info, &chain->results,
@@ -300,7 +309,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
300309
OBJ_RETAIN(chain);
301310
chain->multi = multi;
302311
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
303-
"%s CALLING MULTI EVHDLR",
312+
"%s _EVENT_HDLR CALLING MULTI EVHDLR",
304313
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
305314
multi->handler(chain->status, &chain->source,
306315
chain->info, &chain->results,
@@ -327,7 +336,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
327336
OBJ_RETAIN(chain);
328337
chain->def = def;
329338
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
330-
"%s CALLING DEFAULT EVHDLR",
339+
"%s _EVENT_HDLR CALLING DEFAULT EVHDLR",
331340
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
332341
def->handler(chain->status, &chain->source,
333342
chain->info, &chain->results,
@@ -812,6 +821,10 @@ void pmix2x_value_load(pmix_value_t *v,
812821
* so the ORTE layer is responsible for converting it */
813822
memcpy(&v->data.state, &kv->data.uint8, sizeof(uint8_t));
814823
break;
824+
case OPAL_PTR:
825+
v->type = PMIX_POINTER;
826+
v->data.ptr = kv->data.ptr;
827+
break;
815828
default:
816829
/* silence warnings */
817830
break;
@@ -943,11 +956,17 @@ int pmix2x_value_unload(opal_value_t *kv,
943956
case PMIX_DATA_RANGE:
944957
kv->type = OPAL_DATA_RANGE;
945958
kv->data.uint8 = pmix2x_convert_range(v->data.persist);
959+
break;
946960
case PMIX_PROC_STATE:
947961
kv->type = OPAL_PROC_STATE;
948962
/* the OPAL layer doesn't have any concept of proc state,
949963
* so the ORTE layer is responsible for converting it */
950964
memcpy(&kv->data.uint8, &v->data.state, sizeof(uint8_t));
965+
break;
966+
case PMIX_POINTER:
967+
kv->type = OPAL_PTR;
968+
kv->data.ptr = v->data.ptr;
969+
break;
951970
default:
952971
/* silence warnings */
953972
rc = OPAL_ERROR;

orte/mca/errmgr/default_app/errmgr_default_app.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,19 +82,34 @@ static void notify_cbfunc(int status,
8282
opal_list_t *info, opal_list_t *results,
8383
opal_pmix_notification_complete_fn_t cbfunc, void *cbdata)
8484
{
85+
orte_proc_state_t state;
86+
8587
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
8688
"%s errmgr:default_app: pmix event handler called with status %s",
8789
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
8890
ORTE_ERROR_NAME(status)));
8991

92+
/* we must convert the incoming status into an equivalent state
93+
* so we can activate the state machine */
94+
switch(status) {
95+
case OPAL_ERR_PROC_ABORTED:
96+
state = ORTE_PROC_STATE_ABORTED;
97+
break;
98+
case OPAL_ERR_PROC_REQUESTED_ABORT:
99+
state = ORTE_PROC_STATE_CALLED_ABORT;
100+
break;
101+
default:
102+
state = ORTE_PROC_STATE_TERMINATED;
103+
}
104+
90105
/* let the caller know we processed this, but allow the
91106
* chain to continue */
92107
if (NULL != cbfunc) {
93108
cbfunc(ORTE_SUCCESS, NULL, NULL, NULL, cbdata);
94109
}
95110

96111
/* push it into our event base */
97-
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, status);
112+
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, state);
98113
}
99114

100115
/************************

orte/test/system/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits \
22
orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix opal_interface orte_spin segfault \
33
orte_exit test-time event-threads psm_keygen regex orte_errors evpri-test opal-evpri-test evpri-test2 \
4-
mapper reducer opal_hotel orte_dfs ulfm pmixtool
4+
mapper reducer opal_hotel orte_dfs ulfm pmixtool orte_notify
55

66
all: $(PROGS)
77

orte/test/system/orte_notify.c

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/* -*- C -*-
2+
*
3+
* $HEADER$
4+
*
5+
*/
6+
7+
#include "orte_config.h"
8+
9+
#include <stdio.h>
10+
#include <unistd.h>
11+
12+
#include "opal/mca/pmix/pmix.h"
13+
#include "orte/runtime/runtime.h"
14+
#include "orte/util/proc_info.h"
15+
#include "orte/util/name_fns.h"
16+
#include "orte/runtime/orte_globals.h"
17+
#include "orte/mca/errmgr/errmgr.h"
18+
19+
static pid_t pid;
20+
static char hostname[OPAL_MAXHOSTNAMELEN];
21+
22+
static void notification_fn(int status,
23+
const opal_process_name_t *source,
24+
opal_list_t *info, opal_list_t *results,
25+
opal_pmix_notification_complete_fn_t cbfunc,
26+
void *cbdata)
27+
{
28+
int peer_rank;
29+
30+
fprintf(stderr, "orte_notify: Name %s Host: %s Pid %ld source %s\n",
31+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
32+
hostname, (long)pid, ORTE_NAME_PRINT(source));
33+
34+
/** let the notifier know we are done */
35+
if (cbfunc) {
36+
cbfunc(OPAL_ERR_HANDLERS_COMPLETE, NULL, NULL, NULL, cbdata);
37+
}
38+
39+
}
40+
41+
/*
 * No-op callback: every argument is ignored and nothing is returned.
 *
 * NOTE(review): not referenced anywhere in the visible file; presumably
 * intended as the registration-complete callback for
 * opal_pmix.register_evhandler() - confirm before relying on it.
 */
static void errhandler_reg_callbk(int status,
                                  size_t evhdlr_ref,
                                  void *cbdata)
{
    /* intentionally empty - explicitly discard the arguments */
    (void)status;
    (void)evhdlr_ref;
    (void)cbdata;
}
47+
48+
int main(int argc, char* argv[])
49+
{
50+
int rc;
51+
opal_value_t *kv;
52+
opal_list_t info;
53+
54+
if (0 > (rc = orte_init(&argc, &argv, ORTE_PROC_NON_MPI))) {
55+
fprintf(stderr, "orte_abort: couldn't init orte - error code %d\n", rc);
56+
return rc;
57+
}
58+
pid = getpid();
59+
gethostname(hostname, sizeof(hostname));
60+
61+
printf("orte_notify: Name %s Host: %s Pid %ld\n",
62+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
63+
hostname, (long)pid);
64+
fflush(stdout);
65+
66+
/* register the event handler */
67+
OBJ_CONSTRUCT(&info, opal_list_t);
68+
kv = OBJ_NEW(opal_value_t);
69+
kv->key = strdup(OPAL_PMIX_EVENT_ORDER_PREPEND);
70+
kv->type = OPAL_BOOL;
71+
kv->data.flag = true;
72+
opal_list_append(&info, &kv->super);
73+
74+
opal_pmix.register_evhandler(NULL, &info,
75+
notification_fn,
76+
NULL, NULL);
77+
78+
while (1) {
79+
usleep(100);
80+
}
81+
82+
return 0;
83+
}

0 commit comments

Comments
 (0)