Skip to content

Commit c4b8852

Browse files
committed
Merge pull request open-mpi#717 from rhc54/cmr2.x/wait
Resolve a race condition when registering SIGCHLD callbacks
2 parents 139369a + c402b2f commit c4b8852

File tree

2 files changed

+21
-34
lines changed

2 files changed

+21
-34
lines changed

orte/mca/odls/base/odls_base_default_fns.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1025,7 +1025,7 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
10251025
}
10261026

10271027
if (5 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
1028-
opal_output(orte_odls_base_framework.framework_output, "%s odls:launch: spawning child %s",
1028+
opal_output(orte_odls_base_framework.framework_output, "%s odls:launch spawning child %s",
10291029
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
10301030
ORTE_NAME_PRINT(&child->name));
10311031

orte/runtime/orte_wait.c

Lines changed: 20 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -147,53 +147,40 @@ int orte_wait_finalize(void)
147147
return ORTE_SUCCESS;
148148
}
149149

150-
static void register_callback(int fd, short args, void *cbdata)
150+
/* this function *must* always be called from
151+
* within an event in the orte_event_base */
152+
void orte_wait_cb(orte_proc_t *child, orte_wait_fn_t callback, void *data)
151153
{
152-
orte_wait_tracker_t *trk = (orte_wait_tracker_t*)cbdata;
153154
orte_wait_tracker_t *t2;
154155

156+
if (NULL == child || NULL == callback) {
157+
/* bozo protection */
158+
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
159+
return;
160+
}
161+
155162
/* see if this proc is still alive */
156-
if (!ORTE_FLAG_TEST(trk->child, ORTE_PROC_FLAG_ALIVE)) {
163+
if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
157164
/* already heard this proc is dead, so just do the callback */
158-
if (NULL != trk->cbfunc) {
159-
trk->cbfunc(trk->child, trk->cbdata);
160-
OBJ_RELEASE(trk);
161-
return;
162-
}
165+
callback(child, data);
166+
return;
163167
}
164168

165169
/* we just override any existing registration */
166170
OPAL_LIST_FOREACH(t2, &pending_cbs, orte_wait_tracker_t) {
167-
if (t2->child == trk->child) {
168-
t2->cbfunc = trk->cbfunc;
169-
t2->cbdata = trk->cbdata;
170-
OBJ_RELEASE(trk);
171+
if (t2->child == child) {
172+
t2->cbfunc = callback;
173+
t2->cbdata = data;
171174
return;
172175
}
173176
}
174177
/* get here if this is a new registration */
175-
opal_list_append(&pending_cbs, &trk->super);
176-
}
177-
178-
void orte_wait_cb(orte_proc_t *child, orte_wait_fn_t callback, void *data)
179-
{
180-
orte_wait_tracker_t *trk;
181-
182-
if (NULL == child || NULL == callback) {
183-
/* bozo protection */
184-
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
185-
return;
186-
}
187-
188-
/* push this into the event library for handling */
189-
trk = OBJ_NEW(orte_wait_tracker_t);
178+
t2 = OBJ_NEW(orte_wait_tracker_t);
190179
OBJ_RETAIN(child); // protect against race conditions
191-
trk->child = child;
192-
trk->cbfunc = callback;
193-
trk->cbdata = data;
194-
opal_event_set(orte_event_base, &trk->ev, -1, OPAL_EV_WRITE, register_callback, trk);
195-
opal_event_set_priority(&trk->ev, ORTE_SYS_PRI);
196-
opal_event_active(&trk->ev, OPAL_EV_WRITE, 1);
180+
t2->child = child;
181+
t2->cbfunc = callback;
182+
t2->cbdata = data;
183+
opal_list_append(&pending_cbs, &t2->super);
197184
}
198185

199186
static void cancel_callback(int fd, short args, void *cbdata)

0 commit comments

Comments
 (0)