Skip to content

Commit 9b991bd

Browse files
author
Ralph Castain
committed
Ensure that the "running" state is correctly updated
It is possible that one or more procs could get through PMIx_Init, and thus be marked as in state "registered", before all local procs have been started. If that happens, then we would report some of the procs in state "running" and the others in state "registered" - which means that the HNP would miss the "running" stage of the state machine. Thanks to Jingchao Zhang for his patience in tracking this down on the 2.0 branch.
1 parent bfe0327 commit 9b991bd

File tree

1 file changed

+52
-9
lines changed

1 file changed

+52
-9
lines changed

orte/mca/state/orted/state_orted.c

Lines changed: 52 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -156,28 +156,71 @@ static void track_jobs(int fd, short argc, void *cbdata)
156156
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
157157
opal_buffer_t *alert;
158158
orte_plm_cmd_flag_t cmd;
159-
int rc;
159+
int rc, i;
160+
orte_proc_state_t running = ORTE_PROC_STATE_RUNNING;
161+
orte_proc_t *child;
162+
orte_vpid_t null=ORTE_VPID_INVALID;
160163

161164
if (ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE == caddy->job_state) {
162165
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
163-
"%s state:orted:track_jobs sending local launch complete for job %s",
164-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
165-
ORTE_JOBID_PRINT(caddy->jdata->jobid)));
166+
"%s state:orted:track_jobs sending local launch complete for job %s",
167+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
168+
ORTE_JOBID_PRINT(caddy->jdata->jobid)));
166169
/* update the HNP with all proc states for this job */
167-
alert = OBJ_NEW(opal_buffer_t);
168-
/* pack update state command */
170+
alert = OBJ_NEW(opal_buffer_t);
171+
/* pack update state command */
169172
cmd = ORTE_PLM_UPDATE_PROC_STATE;
170173
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
171174
ORTE_ERROR_LOG(rc);
172175
OBJ_RELEASE(alert);
173176
goto cleanup;
174177
}
175-
/* pack the job info */
176-
if (ORTE_SUCCESS != (rc = pack_state_update(alert, caddy->jdata))) {
178+
/* pack the jobid */
179+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &caddy->jdata->jobid, 1, ORTE_JOBID))) {
180+
ORTE_ERROR_LOG(rc);
181+
OBJ_RELEASE(alert);
182+
goto cleanup;
183+
}
184+
for (i=0; i < orte_local_children->size; i++) {
185+
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
186+
continue;
187+
}
188+
/* if this child is part of the job... */
189+
if (child->name.jobid == caddy->jdata->jobid) {
190+
/* pack the child's vpid */
191+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name.vpid), 1, ORTE_VPID))) {
192+
ORTE_ERROR_LOG(rc);
193+
OBJ_RELEASE(alert);
194+
goto cleanup;
195+
}
196+
/* pack the pid */
197+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) {
198+
ORTE_ERROR_LOG(rc);
199+
OBJ_RELEASE(alert);
200+
goto cleanup;
201+
}
202+
/* pack the RUNNING state */
203+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &running, 1, ORTE_PROC_STATE))) {
204+
ORTE_ERROR_LOG(rc);
205+
OBJ_RELEASE(alert);
206+
goto cleanup;
207+
}
208+
/* pack its exit code */
209+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) {
210+
ORTE_ERROR_LOG(rc);
211+
OBJ_RELEASE(alert);
212+
goto cleanup;
213+
}
214+
}
215+
}
216+
217+
/* flag that this job is complete so the receiver can know */
218+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
177219
ORTE_ERROR_LOG(rc);
178220
OBJ_RELEASE(alert);
179221
goto cleanup;
180222
}
223+
181224
/* send it */
182225
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
183226
ORTE_RML_TAG_PLM,
@@ -187,7 +230,7 @@ static void track_jobs(int fd, short argc, void *cbdata)
187230
}
188231
}
189232

190-
cleanup:
233+
cleanup:
191234
OBJ_RELEASE(caddy);
192235
}
193236

0 commit comments

Comments
 (0)