Skip to content

Commit ed58460

Browse files
author
rhc54
authored
Merge pull request #2033 from rhc54/topic/state
Ensure that the "running" state is correctly updated
2 parents bfe0327 + 9b991bd commit ed58460

File tree

1 file changed

+52
-9
lines changed

1 file changed

+52
-9
lines changed

orte/mca/state/orted/state_orted.c

Lines changed: 52 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -156,28 +156,71 @@ static void track_jobs(int fd, short argc, void *cbdata)
156156
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
157157
opal_buffer_t *alert;
158158
orte_plm_cmd_flag_t cmd;
159-
int rc;
159+
int rc, i;
160+
orte_proc_state_t running = ORTE_PROC_STATE_RUNNING;
161+
orte_proc_t *child;
162+
orte_vpid_t null=ORTE_VPID_INVALID;
160163

161164
if (ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE == caddy->job_state) {
162165
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
163-
"%s state:orted:track_jobs sending local launch complete for job %s",
164-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
165-
ORTE_JOBID_PRINT(caddy->jdata->jobid)));
166+
"%s state:orted:track_jobs sending local launch complete for job %s",
167+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
168+
ORTE_JOBID_PRINT(caddy->jdata->jobid)));
166169
/* update the HNP with all proc states for this job */
167-
alert = OBJ_NEW(opal_buffer_t);
168-
/* pack update state command */
170+
alert = OBJ_NEW(opal_buffer_t);
171+
/* pack update state command */
169172
cmd = ORTE_PLM_UPDATE_PROC_STATE;
170173
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
171174
ORTE_ERROR_LOG(rc);
172175
OBJ_RELEASE(alert);
173176
goto cleanup;
174177
}
175-
/* pack the job info */
176-
if (ORTE_SUCCESS != (rc = pack_state_update(alert, caddy->jdata))) {
178+
/* pack the jobid */
179+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &caddy->jdata->jobid, 1, ORTE_JOBID))) {
180+
ORTE_ERROR_LOG(rc);
181+
OBJ_RELEASE(alert);
182+
goto cleanup;
183+
}
184+
for (i=0; i < orte_local_children->size; i++) {
185+
if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
186+
continue;
187+
}
188+
/* if this child is part of the job... */
189+
if (child->name.jobid == caddy->jdata->jobid) {
190+
/* pack the child's vpid */
191+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name.vpid), 1, ORTE_VPID))) {
192+
ORTE_ERROR_LOG(rc);
193+
OBJ_RELEASE(alert);
194+
goto cleanup;
195+
}
196+
/* pack the pid */
197+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) {
198+
ORTE_ERROR_LOG(rc);
199+
OBJ_RELEASE(alert);
200+
goto cleanup;
201+
}
202+
/* pack the RUNNING state */
203+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &running, 1, ORTE_PROC_STATE))) {
204+
ORTE_ERROR_LOG(rc);
205+
OBJ_RELEASE(alert);
206+
goto cleanup;
207+
}
208+
/* pack its exit code */
209+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) {
210+
ORTE_ERROR_LOG(rc);
211+
OBJ_RELEASE(alert);
212+
goto cleanup;
213+
}
214+
}
215+
}
216+
217+
/* flag that this job is complete so the receiver can know */
218+
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
177219
ORTE_ERROR_LOG(rc);
178220
OBJ_RELEASE(alert);
179221
goto cleanup;
180222
}
223+
181224
/* send it */
182225
if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
183226
ORTE_RML_TAG_PLM,
@@ -187,7 +230,7 @@ static void track_jobs(int fd, short argc, void *cbdata)
187230
}
188231
}
189232

190-
cleanup:
233+
cleanup:
191234
OBJ_RELEASE(caddy);
192235
}
193236

0 commit comments

Comments
 (0)