Skip to content

Commit c2a02ab

Browse files
author
rhc54
committed
Merge pull request #1756 from rhc54/topic/hangs
Fix rare hangs observed on OS-X by properly thread-shifting upcalls from the PMIx server into ORTE
2 parents db70852 + dd0f843 commit c2a02ab

File tree

3 files changed

+169
-69
lines changed

3 files changed

+169
-69
lines changed

orte/mca/plm/plm_types.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ typedef uint32_t orte_proc_state_t;
5353
#define ORTE_PROC_STATE_IOF_COMPLETE 6 /* io forwarding pipes have closed */
5454
#define ORTE_PROC_STATE_WAITPID_FIRED 7 /* waitpid fired on process */
5555
#define ORTE_PROC_STATE_MODEX_READY 8 /* all modex info has been stored */
56-
5756
/*
5857
* Define a "boundary" so we can easily and quickly determine
5958
* if a proc is still running or not - any value less than

orte/orted/pmix/pmix_server_gen.c

Lines changed: 109 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -43,72 +43,153 @@
4343

4444
#include "pmix_server_internal.h"
4545

46+
static void _client_conn(int sd, short args, void *cbdata)
47+
{
48+
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
49+
orte_job_t *jdata;
50+
orte_proc_t *p, *ptr;
51+
int i;
52+
53+
if (NULL != cd->server_object) {
54+
/* we were passed back the orte_proc_t */
55+
p = (orte_proc_t*)cd->server_object;
56+
} else {
57+
/* find the named process */
58+
p = NULL;
59+
if (NULL == (jdata = orte_get_job_data_object(cd->proc->jobid))) {
60+
return;
61+
}
62+
for (i=0; i < jdata->procs->size; i++) {
63+
if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
64+
continue;
65+
}
66+
if (cd->proc->jobid != ptr->name.jobid) {
67+
continue;
68+
}
69+
if (cd->proc->vpid == ptr->name.vpid) {
70+
p = ptr;
71+
break;
72+
}
73+
}
74+
}
75+
if (NULL != p) {
76+
ORTE_FLAG_SET(p, ORTE_PROC_FLAG_REG);
77+
ORTE_ACTIVATE_PROC_STATE(&p->name, ORTE_PROC_STATE_REGISTERED);
78+
}
79+
OBJ_RELEASE(cd);
80+
}
81+
4682
int pmix_server_client_connected_fn(opal_process_name_t *proc, void *server_object)
4783
{
48-
ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_REGISTERED);
84+
/* need to thread-shift this request as we are going
85+
* to access our global list of registered events */
86+
ORTE_PMIX_THREADSHIFT(proc, server_object, OPAL_SUCCESS, NULL,
87+
NULL, _client_conn, NULL, NULL);
4988
return ORTE_SUCCESS;
5089
}
5190

52-
int pmix_server_client_finalized_fn(opal_process_name_t *proc, void *server_object,
53-
opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
91+
static void _client_finalized(int sd, short args, void *cbdata)
5492
{
93+
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
5594
orte_job_t *jdata;
5695
orte_proc_t *p, *ptr;
5796
int i;
5897

59-
if (NULL != cbdata) {
98+
if (NULL != cd->server_object) {
6099
/* we were passed back the orte_proc_t */
61-
p = (orte_proc_t*)cbdata;
100+
p = (orte_proc_t*)cd->server_object;
62101
} else {
63102
/* find the named process */
64103
p = NULL;
65-
if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
66-
return ORTE_ERR_NOT_FOUND;
104+
if (NULL == (jdata = orte_get_job_data_object(cd->proc->jobid))) {
105+
return;
67106
}
68107
for (i=0; i < jdata->procs->size; i++) {
69108
if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
70109
continue;
71110
}
72-
if (proc->jobid != ptr->name.jobid) {
111+
if (cd->proc->jobid != ptr->name.jobid) {
73112
continue;
74113
}
75-
if (proc->vpid == ptr->name.vpid) {
114+
if (cd->proc->vpid == ptr->name.vpid) {
76115
p = ptr;
77116
break;
78117
}
79118
}
80119
}
81120
if (NULL != p) {
82-
p->state = ORTE_PROC_STATE_TERMINATED;
83-
/* release the caller */
84-
if (NULL != cbfunc) {
85-
cbfunc(ORTE_SUCCESS, cbdata);
121+
ORTE_FLAG_SET(p, ORTE_PROC_FLAG_HAS_DEREG);
122+
/* release the caller */
123+
if (NULL != cd->cbfunc) {
124+
cd->cbfunc(ORTE_SUCCESS, cd->cbdata);
86125
}
87-
return ORTE_SUCCESS;
88126
}
89-
return ORTE_ERR_NOT_FOUND;
127+
OBJ_RELEASE(cd);
90128
}
91129

92-
int pmix_server_abort_fn(opal_process_name_t *proc, void *server_object,
93-
int status, const char msg[],
94-
opal_list_t *procs_to_abort,
95-
opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
130+
int pmix_server_client_finalized_fn(opal_process_name_t *proc, void *server_object,
131+
opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
96132
{
97-
orte_proc_t *p;
133+
/* need to thread-shift this request as we are going
134+
* to access our global list of registered events */
135+
ORTE_PMIX_THREADSHIFT(proc, server_object, OPAL_SUCCESS, NULL,
136+
NULL, _client_finalized, cbfunc, cbdata);
137+
return ORTE_SUCCESS;
138+
139+
}
140+
141+
static void _client_abort(int sd, short args, void *cbdata)
142+
{
143+
orte_pmix_server_op_caddy_t *cd = (orte_pmix_server_op_caddy_t*)cbdata;
144+
orte_job_t *jdata;
145+
orte_proc_t *p, *ptr;
146+
int i;
98147

99-
if (NULL != server_object) {
100-
p = (orte_proc_t*)server_object;
101-
p->exit_code = status;
148+
if (NULL != cd->server_object) {
149+
p = (orte_proc_t*)cd->server_object;
150+
} else {
151+
/* find the named process */
152+
p = NULL;
153+
if (NULL == (jdata = orte_get_job_data_object(cd->proc->jobid))) {
154+
return;
155+
}
156+
for (i=0; i < jdata->procs->size; i++) {
157+
if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
158+
continue;
159+
}
160+
if (cd->proc->jobid != ptr->name.jobid) {
161+
continue;
162+
}
163+
if (cd->proc->vpid == ptr->name.vpid) {
164+
p = ptr;
165+
break;
166+
}
167+
}
168+
}
169+
if (NULL != p) {
170+
p->exit_code = cd->status;
171+
ORTE_ACTIVATE_PROC_STATE(&p->name, ORTE_PROC_STATE_CALLED_ABORT);
102172
}
103173

104-
ORTE_UPDATE_EXIT_STATUS(status);
105-
ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_CALLED_ABORT);
174+
ORTE_UPDATE_EXIT_STATUS(cd->status);
106175

107176
/* release the caller */
108-
if (NULL != cbfunc) {
109-
cbfunc(OPAL_SUCCESS, cbdata);
177+
if (NULL != cd->cbfunc) {
178+
cd->cbfunc(OPAL_SUCCESS, cd->cbdata);
110179
}
111-
return OPAL_SUCCESS;
180+
OBJ_RELEASE(cd);
181+
}
182+
183+
int pmix_server_abort_fn(opal_process_name_t *proc, void *server_object,
184+
int status, const char msg[],
185+
opal_list_t *procs_to_abort,
186+
opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
187+
{
188+
/* need to thread-shift this request as we are going
189+
* to access our global list of registered events */
190+
ORTE_PMIX_THREADSHIFT(proc, server_object, status, msg,
191+
procs_to_abort, _client_abort, cbfunc, cbdata);
192+
return ORTE_SUCCESS;
112193
}
113194

114195
static void _register_events(int sd, short args, void *cbdata)

orte/orted/pmix/pmix_server_internal.h

Lines changed: 60 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ OBJ_CLASS_DECLARATION(pmix_server_req_t);
7474
typedef struct {
7575
opal_object_t super;
7676
opal_event_t ev;
77+
int status;
78+
opal_process_name_t *proc;
79+
const char *msg;
80+
void *server_object;
7781
opal_list_t *procs;
7882
opal_list_t *eprocs;
7983
opal_list_t *info;
@@ -90,46 +94,62 @@ typedef struct {
9094
} orte_pmix_mdx_caddy_t;
9195
OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
9296

93-
#define ORTE_DMX_REQ(p, cf, ocf, ocd) \
94-
do { \
95-
pmix_server_req_t *_req; \
96-
_req = OBJ_NEW(pmix_server_req_t); \
97-
_req->target = (p); \
98-
_req->mdxcbfunc = (ocf); \
99-
_req->cbdata = (ocd); \
100-
opal_event_set(orte_event_base, &(_req->ev), \
101-
-1, OPAL_EV_WRITE, (cf), _req); \
102-
opal_event_set_priority(&(_req->ev), ORTE_MSG_PRI); \
103-
opal_event_active(&(_req->ev), OPAL_EV_WRITE, 1); \
104-
} while(0);
105-
106-
#define ORTE_SPN_REQ(j, cf, ocf, ocd) \
107-
do { \
108-
pmix_server_req_t *_req; \
109-
_req = OBJ_NEW(pmix_server_req_t); \
110-
_req->jdata = (j); \
111-
_req->spcbfunc = (ocf); \
112-
_req->cbdata = (ocd); \
113-
opal_event_set(orte_event_base, &(_req->ev), \
114-
-1, OPAL_EV_WRITE, (cf), _req); \
115-
opal_event_set_priority(&(_req->ev), ORTE_MSG_PRI); \
116-
opal_event_active(&(_req->ev), OPAL_EV_WRITE, 1); \
117-
} while(0);
118-
119-
#define ORTE_PMIX_OPERATION(p, i, fn, cf, cb) \
120-
do { \
121-
orte_pmix_server_op_caddy_t *_cd; \
122-
_cd = OBJ_NEW(orte_pmix_server_op_caddy_t); \
123-
_cd->procs = (p); \
124-
_cd->info = (i); \
125-
_cd->cbfunc = (cf); \
126-
_cd->cbdata = (cb); \
127-
opal_event_set(orte_event_base, &(_cd->ev), -1, \
128-
OPAL_EV_WRITE, (fn), _cd); \
129-
opal_event_set_priority(&(_cd->ev), ORTE_MSG_PRI); \
130-
opal_event_active(&(_cd->ev), OPAL_EV_WRITE, 1); \
131-
} while(0);
132-
97+
#define ORTE_DMX_REQ(p, cf, ocf, ocd) \
98+
do { \
99+
pmix_server_req_t *_req; \
100+
_req = OBJ_NEW(pmix_server_req_t); \
101+
_req->target = (p); \
102+
_req->mdxcbfunc = (ocf); \
103+
_req->cbdata = (ocd); \
104+
opal_event_set(orte_event_base, &(_req->ev), \
105+
-1, OPAL_EV_WRITE, (cf), _req); \
106+
opal_event_set_priority(&(_req->ev), ORTE_MSG_PRI); \
107+
opal_event_active(&(_req->ev), OPAL_EV_WRITE, 1); \
108+
} while(0);
109+
110+
#define ORTE_SPN_REQ(j, cf, ocf, ocd) \
111+
do { \
112+
pmix_server_req_t *_req; \
113+
_req = OBJ_NEW(pmix_server_req_t); \
114+
_req->jdata = (j); \
115+
_req->spcbfunc = (ocf); \
116+
_req->cbdata = (ocd); \
117+
opal_event_set(orte_event_base, &(_req->ev), \
118+
-1, OPAL_EV_WRITE, (cf), _req); \
119+
opal_event_set_priority(&(_req->ev), ORTE_MSG_PRI); \
120+
opal_event_active(&(_req->ev), OPAL_EV_WRITE, 1); \
121+
} while(0);
122+
123+
#define ORTE_PMIX_OPERATION(p, i, fn, cf, cb) \
124+
do { \
125+
orte_pmix_server_op_caddy_t *_cd; \
126+
_cd = OBJ_NEW(orte_pmix_server_op_caddy_t); \
127+
_cd->procs = (p); \
128+
_cd->info = (i); \
129+
_cd->cbfunc = (cf); \
130+
_cd->cbdata = (cb); \
131+
opal_event_set(orte_event_base, &(_cd->ev), -1, \
132+
OPAL_EV_WRITE, (fn), _cd); \
133+
opal_event_set_priority(&(_cd->ev), ORTE_MSG_PRI); \
134+
opal_event_active(&(_cd->ev), OPAL_EV_WRITE, 1); \
135+
} while(0);
136+
137+
#define ORTE_PMIX_THREADSHIFT(p, s, st, m, pl, fn, cf, cb) \
138+
do { \
139+
orte_pmix_server_op_caddy_t *_cd; \
140+
_cd = OBJ_NEW(orte_pmix_server_op_caddy_t); \
141+
_cd->proc = (p); \
142+
_cd->server_object = (s); \
143+
_cd->status = (st); \
144+
_cd->msg = (m); \
145+
_cd->procs = (pl); \
146+
_cd->cbfunc = (cf); \
147+
_cd->cbdata = (cb); \
148+
opal_event_set(orte_event_base, &(_cd->ev), -1, \
149+
OPAL_EV_WRITE, (fn), _cd); \
150+
opal_event_set_priority(&(_cd->ev), ORTE_MSG_PRI); \
151+
opal_event_active(&(_cd->ev), OPAL_EV_WRITE, 1); \
152+
} while(0);
133153

134154
/* define the server module functions */
135155
extern int pmix_server_client_connected_fn(opal_process_name_t *proc, void* server_object);

0 commit comments

Comments
 (0)