@@ -156,28 +156,71 @@ static void track_jobs(int fd, short argc, void *cbdata)
156156 orte_state_caddy_t * caddy = (orte_state_caddy_t * )cbdata ;
157157 opal_buffer_t * alert ;
158158 orte_plm_cmd_flag_t cmd ;
159- int rc ;
159+ int rc , i ;
160+ orte_proc_state_t running = ORTE_PROC_STATE_RUNNING ;
161+ orte_proc_t * child ;
162+ orte_vpid_t null = ORTE_VPID_INVALID ;
160163
161164 if (ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE == caddy -> job_state ) {
162165 OPAL_OUTPUT_VERBOSE ((5 , orte_state_base_framework .framework_output ,
163- "%s state:orted:track_jobs sending local launch complete for job %s" ,
164- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
165- ORTE_JOBID_PRINT (caddy -> jdata -> jobid )));
166+ "%s state:orted:track_jobs sending local launch complete for job %s" ,
167+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
168+ ORTE_JOBID_PRINT (caddy -> jdata -> jobid )));
166169 /* update the HNP with all proc states for this job */
167- alert = OBJ_NEW (opal_buffer_t );
168- /* pack update state command */
170+ alert = OBJ_NEW (opal_buffer_t );
171+ /* pack update state command */
169172 cmd = ORTE_PLM_UPDATE_PROC_STATE ;
170173 if (ORTE_SUCCESS != (rc = opal_dss .pack (alert , & cmd , 1 , ORTE_PLM_CMD ))) {
171174 ORTE_ERROR_LOG (rc );
172175 OBJ_RELEASE (alert );
173176 goto cleanup ;
174177 }
175- /* pack the job info */
176- if (ORTE_SUCCESS != (rc = pack_state_update (alert , caddy -> jdata ))) {
178+ /* pack the jobid */
179+ if (ORTE_SUCCESS != (rc = opal_dss .pack (alert , & caddy -> jdata -> jobid , 1 , ORTE_JOBID ))) {
180+ ORTE_ERROR_LOG (rc );
181+ OBJ_RELEASE (alert );
182+ goto cleanup ;
183+ }
184+ for (i = 0 ; i < orte_local_children -> size ; i ++ ) {
185+ if (NULL == (child = (orte_proc_t * )opal_pointer_array_get_item (orte_local_children , i ))) {
186+ continue ;
187+ }
188+ /* if this child is part of the job... */
189+ if (child -> name .jobid == caddy -> jdata -> jobid ) {
190+ /* pack the child's vpid */
191+ if (ORTE_SUCCESS != (rc = opal_dss .pack (alert , & (child -> name .vpid ), 1 , ORTE_VPID ))) {
192+ ORTE_ERROR_LOG (rc );
193+ OBJ_RELEASE (alert );
194+ goto cleanup ;
195+ }
196+ /* pack the pid */
197+ if (ORTE_SUCCESS != (rc = opal_dss .pack (alert , & child -> pid , 1 , OPAL_PID ))) {
198+ ORTE_ERROR_LOG (rc );
199+ OBJ_RELEASE (alert );
200+ goto cleanup ;
201+ }
202+ /* pack the RUNNING state */
203+ if (ORTE_SUCCESS != (rc = opal_dss .pack (alert , & running , 1 , ORTE_PROC_STATE ))) {
204+ ORTE_ERROR_LOG (rc );
205+ OBJ_RELEASE (alert );
206+ goto cleanup ;
207+ }
208+ /* pack its exit code */
209+ if (ORTE_SUCCESS != (rc = opal_dss .pack (alert , & child -> exit_code , 1 , ORTE_EXIT_CODE ))) {
210+ ORTE_ERROR_LOG (rc );
211+ OBJ_RELEASE (alert );
212+ goto cleanup ;
213+ }
214+ }
215+ }
216+
217+ /* flag that this job is complete so the receiver can know */
218+ if (ORTE_SUCCESS != (rc = opal_dss .pack (alert , & null , 1 , ORTE_VPID ))) {
177219 ORTE_ERROR_LOG (rc );
178220 OBJ_RELEASE (alert );
179221 goto cleanup ;
180222 }
223+
181224 /* send it */
182225 if (0 > (rc = orte_rml .send_buffer_nb (ORTE_PROC_MY_HNP , alert ,
183226 ORTE_RML_TAG_PLM ,
@@ -187,7 +230,7 @@ static void track_jobs(int fd, short argc, void *cbdata)
187230 }
188231 }
189232
190- cleanup :
233+ cleanup :
191234 OBJ_RELEASE (caddy );
192235}
193236
0 commit comments