Skip to content

Commit ddde154

Browse files
author
rhc54
authored
Merge pull request #1962 from rhc54/topic/notify
Ensure we properly convert pmix status to ORTE state before activatin…
2 parents 9868093 + 48d35a9 commit ddde154

File tree

6 files changed

+147
-10
lines changed

6 files changed

+147
-10
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,7 @@ orte/test/system/opal_hwloc
457457
orte/test/system/opal_db
458458
orte/test/system/ulfm
459459
orte/test/system/pmixtool
460+
orte/test/system/orte_notify
460461

461462
orte/tools/orte-checkpoint/orte-checkpoint
462463
orte/tools/orte-checkpoint/orte-checkpoint.1

opal/mca/pmix/ext20/pmix_ext20.c

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,9 @@ static void progress_local_event_hdlr(int status,
179179
if (sing->code == chain->status) {
180180
OBJ_RETAIN(chain);
181181
chain->sing = sing;
182+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
183+
"%s PROGRESS CALLING SINGLE EVHDLR",
184+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
182185
sing->handler(chain->status, &chain->source,
183186
chain->info, &chain->results,
184187
progress_local_event_hdlr, (void*)chain);
@@ -204,6 +207,9 @@ static void progress_local_event_hdlr(int status,
204207
* callback function to our progression function */
205208
OBJ_RETAIN(chain);
206209
chain->multi = multi;
210+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
211+
"%s PROGRESS CALLING MULTI EVHDLR",
212+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
207213
multi->handler(chain->status, &chain->source,
208214
chain->info, &chain->results,
209215
progress_local_event_hdlr, (void*)chain);
@@ -230,6 +236,9 @@ static void progress_local_event_hdlr(int status,
230236
def = (opal_ext20_default_event_t*)nxt;
231237
OBJ_RETAIN(chain);
232238
chain->def = def;
239+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
240+
"%s PROGRESS CALLING DEFAULT EVHDLR",
241+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
233242
def->handler(chain->status, &chain->source,
234243
chain->info, &chain->results,
235244
progress_local_event_hdlr, (void*)chain);
@@ -259,7 +268,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
259268
opal_ext20_default_event_t *def;
260269

261270
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
262-
"%s RECEIVED NOTIFICATION OF STATUS %d",
271+
"%s _EVENT_HDLR RECEIVED NOTIFICATION OF STATUS %d",
263272
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), cd->status);
264273

265274
chain = OBJ_NEW(opal_ext20_event_chain_t);
@@ -281,7 +290,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
281290
OBJ_RETAIN(chain);
282291
chain->sing = sing;
283292
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
284-
"%s CALLING SINGLE EVHDLR",
293+
"%s _EVENT_HDLR CALLING SINGLE EVHDLR",
285294
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
286295
sing->handler(chain->status, &chain->source,
287296
chain->info, &chain->results,
@@ -300,7 +309,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
300309
OBJ_RETAIN(chain);
301310
chain->multi = multi;
302311
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
303-
"%s CALLING MULTI EVHDLR",
312+
"%s _EVENT_HDLR CALLING MULTI EVHDLR",
304313
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
305314
multi->handler(chain->status, &chain->source,
306315
chain->info, &chain->results,
@@ -327,7 +336,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
327336
OBJ_RETAIN(chain);
328337
chain->def = def;
329338
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
330-
"%s CALLING DEFAULT EVHDLR",
339+
"%s _EVENT_HDLR CALLING DEFAULT EVHDLR",
331340
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
332341
def->handler(chain->status, &chain->source,
333342
chain->info, &chain->results,
@@ -812,6 +821,10 @@ void ext20_value_load(pmix_value_t *v,
812821
* so the ORTE layer is responsible for converting it */
813822
memcpy(&v->data.state, &kv->data.uint8, sizeof(uint8_t));
814823
break;
824+
case OPAL_PTR:
825+
v->type = PMIX_POINTER;
826+
v->data.ptr = kv->data.ptr;
827+
break;
815828
default:
816829
/* silence warnings */
817830
break;
@@ -943,11 +956,17 @@ int ext20_value_unload(opal_value_t *kv,
943956
case PMIX_DATA_RANGE:
944957
kv->type = OPAL_DATA_RANGE;
945958
kv->data.uint8 = ext20_convert_range(v->data.persist);
959+
break;
946960
case PMIX_PROC_STATE:
947961
kv->type = OPAL_PROC_STATE;
948962
/* the OPAL layer doesn't have any concept of proc state,
949963
* so the ORTE layer is responsible for converting it */
950964
memcpy(&kv->data.uint8, &v->data.state, sizeof(uint8_t));
965+
break;
966+
case PMIX_POINTER:
967+
kv->type = OPAL_PTR;
968+
kv->data.ptr = v->data.ptr;
969+
break;
951970
default:
952971
/* silence warnings */
953972
rc = OPAL_ERROR;

opal/mca/pmix/pmix2x/pmix2x.c

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,9 @@ static void progress_local_event_hdlr(int status,
179179
if (sing->code == chain->status) {
180180
OBJ_RETAIN(chain);
181181
chain->sing = sing;
182+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
183+
"%s PROGRESS CALLING SINGLE EVHDLR",
184+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
182185
sing->handler(chain->status, &chain->source,
183186
chain->info, &chain->results,
184187
progress_local_event_hdlr, (void*)chain);
@@ -204,6 +207,9 @@ static void progress_local_event_hdlr(int status,
204207
* callback function to our progression function */
205208
OBJ_RETAIN(chain);
206209
chain->multi = multi;
210+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
211+
"%s PROGRESS CALLING MULTI EVHDLR",
212+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
207213
multi->handler(chain->status, &chain->source,
208214
chain->info, &chain->results,
209215
progress_local_event_hdlr, (void*)chain);
@@ -230,6 +236,9 @@ static void progress_local_event_hdlr(int status,
230236
def = (opal_pmix2x_default_event_t*)nxt;
231237
OBJ_RETAIN(chain);
232238
chain->def = def;
239+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
240+
"%s PROGRESS CALLING DEFAULT EVHDLR",
241+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
233242
def->handler(chain->status, &chain->source,
234243
chain->info, &chain->results,
235244
progress_local_event_hdlr, (void*)chain);
@@ -259,7 +268,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
259268
opal_pmix2x_default_event_t *def;
260269

261270
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
262-
"%s RECEIVED NOTIFICATION OF STATUS %d",
271+
"%s _EVENT_HDLR RECEIVED NOTIFICATION OF STATUS %d",
263272
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), cd->status);
264273

265274
chain = OBJ_NEW(opal_pmix2x_event_chain_t);
@@ -281,7 +290,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
281290
OBJ_RETAIN(chain);
282291
chain->sing = sing;
283292
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
284-
"%s CALLING SINGLE EVHDLR",
293+
"%s _EVENT_HDLR CALLING SINGLE EVHDLR",
285294
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
286295
sing->handler(chain->status, &chain->source,
287296
chain->info, &chain->results,
@@ -300,7 +309,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
300309
OBJ_RETAIN(chain);
301310
chain->multi = multi;
302311
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
303-
"%s CALLING MULTI EVHDLR",
312+
"%s _EVENT_HDLR CALLING MULTI EVHDLR",
304313
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
305314
multi->handler(chain->status, &chain->source,
306315
chain->info, &chain->results,
@@ -327,7 +336,7 @@ static void _event_hdlr(int sd, short args, void *cbdata)
327336
OBJ_RETAIN(chain);
328337
chain->def = def;
329338
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
330-
"%s CALLING DEFAULT EVHDLR",
339+
"%s _EVENT_HDLR CALLING DEFAULT EVHDLR",
331340
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
332341
def->handler(chain->status, &chain->source,
333342
chain->info, &chain->results,
@@ -812,6 +821,10 @@ void pmix2x_value_load(pmix_value_t *v,
812821
* so the ORTE layer is responsible for converting it */
813822
memcpy(&v->data.state, &kv->data.uint8, sizeof(uint8_t));
814823
break;
824+
case OPAL_PTR:
825+
v->type = PMIX_POINTER;
826+
v->data.ptr = kv->data.ptr;
827+
break;
815828
default:
816829
/* silence warnings */
817830
break;
@@ -943,11 +956,17 @@ int pmix2x_value_unload(opal_value_t *kv,
943956
case PMIX_DATA_RANGE:
944957
kv->type = OPAL_DATA_RANGE;
945958
kv->data.uint8 = pmix2x_convert_range(v->data.persist);
959+
break;
946960
case PMIX_PROC_STATE:
947961
kv->type = OPAL_PROC_STATE;
948962
/* the OPAL layer doesn't have any concept of proc state,
949963
* so the ORTE layer is responsible for converting it */
950964
memcpy(&kv->data.uint8, &v->data.state, sizeof(uint8_t));
965+
break;
966+
case PMIX_POINTER:
967+
kv->type = OPAL_PTR;
968+
kv->data.ptr = v->data.ptr;
969+
break;
951970
default:
952971
/* silence warnings */
953972
rc = OPAL_ERROR;

orte/mca/errmgr/default_app/errmgr_default_app.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,19 +82,34 @@ static void notify_cbfunc(int status,
8282
opal_list_t *info, opal_list_t *results,
8383
opal_pmix_notification_complete_fn_t cbfunc, void *cbdata)
8484
{
85+
orte_proc_state_t state;
86+
8587
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
8688
"%s errmgr:default_app: pmix event handler called with status %s",
8789
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
8890
ORTE_ERROR_NAME(status)));
8991

92+
/* we must convert the incoming status into an equivalent state
93+
* so we can activate the state machine */
94+
switch(status) {
95+
case OPAL_ERR_PROC_ABORTED:
96+
state = ORTE_PROC_STATE_ABORTED;
97+
break;
98+
case OPAL_ERR_PROC_REQUESTED_ABORT:
99+
state = ORTE_PROC_STATE_CALLED_ABORT;
100+
break;
101+
default:
102+
state = ORTE_PROC_STATE_TERMINATED;
103+
}
104+
90105
/* let the caller know we processed this, but allow the
91106
* chain to continue */
92107
if (NULL != cbfunc) {
93108
cbfunc(ORTE_SUCCESS, NULL, NULL, NULL, cbdata);
94109
}
95110

96111
/* push it into our event base */
97-
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, status);
112+
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, state);
98113
}
99114

100115
/************************

orte/test/system/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
PROGS = no_op sigusr_trap spin orte_nodename orte_spawn orte_loop_spawn orte_loop_child orte_abort get_limits \
22
orte_tool orte_no_op binom oob_stress iof_stress iof_delay radix opal_interface orte_spin segfault \
33
orte_exit test-time event-threads psm_keygen regex orte_errors evpri-test opal-evpri-test evpri-test2 \
4-
mapper reducer opal_hotel orte_dfs ulfm pmixtool
4+
mapper reducer opal_hotel orte_dfs ulfm pmixtool orte_notify
55

66
all: $(PROGS)
77

orte/test/system/orte_notify.c

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/* -*- C -*-
2+
*
3+
* $HEADER$
4+
*
5+
*/
6+
7+
#include "orte_config.h"
8+
9+
#include <stdio.h>
10+
#include <unistd.h>
11+
12+
#include "opal/mca/pmix/pmix.h"
13+
#include "orte/runtime/runtime.h"
14+
#include "orte/util/proc_info.h"
15+
#include "orte/util/name_fns.h"
16+
#include "orte/runtime/orte_globals.h"
17+
#include "orte/mca/errmgr/errmgr.h"
18+
19+
/* process id of this test program; set once in main() via getpid() and
 * printed in both main() and notification_fn() */
static pid_t pid;
20+
/* name of the local host; filled in by gethostname() in main() and
 * printed in both main() and notification_fn() */
static char hostname[OPAL_MAXHOSTNAMELEN];
21+
22+
static void notification_fn(int status,
23+
const opal_process_name_t *source,
24+
opal_list_t *info, opal_list_t *results,
25+
opal_pmix_notification_complete_fn_t cbfunc,
26+
void *cbdata)
27+
{
28+
int peer_rank;
29+
30+
fprintf(stderr, "orte_notify: Name %s Host: %s Pid %ld source %s\n",
31+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
32+
hostname, (long)pid, ORTE_NAME_PRINT(source));
33+
34+
/** let the notifier know we are done */
35+
if (cbfunc) {
36+
cbfunc(OPAL_ERR_HANDLERS_COMPLETE, NULL, NULL, NULL, cbdata);
37+
}
38+
39+
}
40+
41+
/**
 * No-op callback.
 *
 * NOTE(review): the signature looks like an event-handler registration
 * callback (status, handler reference, caller data), but main() currently
 * passes NULL to register_evhandler instead of this function - confirm
 * whether it is meant to be wired up.
 *
 * @param status      ignored
 * @param evhdlr_ref  ignored
 * @param cbdata      ignored
 */
static void errhandler_reg_callbk(int status,
                                  size_t evhdlr_ref,
                                  void *cbdata)
{
    /* nothing to do - explicitly mark every argument as unused */
    (void)status;
    (void)evhdlr_ref;
    (void)cbdata;
}
47+
48+
int main(int argc, char* argv[])
49+
{
50+
int rc;
51+
opal_value_t *kv;
52+
opal_list_t info;
53+
54+
if (0 > (rc = orte_init(&argc, &argv, ORTE_PROC_NON_MPI))) {
55+
fprintf(stderr, "orte_abort: couldn't init orte - error code %d\n", rc);
56+
return rc;
57+
}
58+
pid = getpid();
59+
gethostname(hostname, sizeof(hostname));
60+
61+
printf("orte_notify: Name %s Host: %s Pid %ld\n",
62+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
63+
hostname, (long)pid);
64+
fflush(stdout);
65+
66+
/* register the event handler */
67+
OBJ_CONSTRUCT(&info, opal_list_t);
68+
kv = OBJ_NEW(opal_value_t);
69+
kv->key = strdup(OPAL_PMIX_EVENT_ORDER_PREPEND);
70+
kv->type = OPAL_BOOL;
71+
kv->data.flag = true;
72+
opal_list_append(&info, &kv->super);
73+
74+
opal_pmix.register_evhandler(NULL, &info,
75+
notification_fn,
76+
NULL, NULL);
77+
78+
while (1) {
79+
usleep(100);
80+
}
81+
82+
return 0;
83+
}

0 commit comments

Comments
 (0)