@@ -54,12 +54,12 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
5454{
5555 int rc ;
5656 orte_proc_t * pptr ;
57- int i , k ;
57+ int i , k , n , nlocalprocs ;
5858 opal_list_t * info , * pmap ;
5959 opal_value_t * kv ;
6060 orte_node_t * node , * n2 ;
6161 opal_vpid_t vpid ;
62- char * * list , * * procs , * * micro , * tmp , * regex ;
62+ char * * list , * * procs , * * micro , * tmp , * regex , * cpulist , * peerlist ;
6363 orte_job_t * dmns ;
6464 orte_job_map_t * map ;
6565 orte_app_context_t * app ;
@@ -178,6 +178,48 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
178178 kv -> data .uint32 = node -> index ;
179179 opal_list_append (info , & kv -> super );
180180
181+ /* pass our node size */
182+ kv = OBJ_NEW (opal_value_t );
183+ kv -> key = strdup (OPAL_PMIX_NODE_SIZE );
184+ kv -> type = OPAL_UINT32 ;
185+ kv -> data .uint32 = node -> num_procs ;
186+ opal_list_append (info , & kv -> super );
187+
188+ /* univ size */
189+ kv = OBJ_NEW (opal_value_t );
190+ kv -> key = strdup (OPAL_PMIX_UNIV_SIZE );
191+ kv -> type = OPAL_UINT32 ;
192+ kv -> data .uint32 = jdata -> total_slots_alloc ;
193+ opal_list_append (info , & kv -> super );
194+
195+ /* job size */
196+ kv = OBJ_NEW (opal_value_t );
197+ kv -> key = strdup (OPAL_PMIX_JOB_SIZE );
198+ kv -> type = OPAL_UINT32 ;
199+ kv -> data .uint32 = jdata -> num_procs ;
200+ opal_list_append (info , & kv -> super );
201+
202+ /* number of apps in this job */
203+ kv = OBJ_NEW (opal_value_t );
204+ kv -> key = strdup (OPAL_PMIX_JOB_NUM_APPS );
205+ kv -> type = OPAL_UINT32 ;
206+ kv -> data .uint32 = jdata -> num_apps ;
207+ opal_list_append (info , & kv -> super );
208+
209+ /* local size */
210+ kv = OBJ_NEW (opal_value_t );
211+ kv -> key = strdup (OPAL_PMIX_LOCAL_SIZE );
212+ kv -> type = OPAL_UINT32 ;
213+ kv -> data .uint32 = jdata -> num_local_procs ;
214+ opal_list_append (info , & kv -> super );
215+
216+ /* max procs */
217+ kv = OBJ_NEW (opal_value_t );
218+ kv -> key = strdup (OPAL_PMIX_MAX_PROCS );
219+ kv -> type = OPAL_UINT32 ;
220+ kv -> data .uint32 = jdata -> total_slots_alloc ;
221+ opal_list_append (info , & kv -> super );
222+
181223 /* identify our local node object within the map,
182224 * if we were included */
183225 node = NULL ;
@@ -192,22 +234,55 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
192234 }
193235 }
194236 if (NULL != node ) {
195- /* node size */
237+ vpid = ORTE_VPID_MAX ;
238+ for (i = 0 ; i < node -> procs -> size ; i ++ ) {
239+ if (NULL == (pptr = (orte_proc_t * )opal_pointer_array_get_item (node -> procs , i ))) {
240+ continue ;
241+ }
242+ if (pptr -> name .jobid == jdata -> jobid ) {
243+ if (pptr -> name .vpid < vpid ) {
244+ vpid = pptr -> name .vpid ;
245+ }
246+ /* go ahead and register this client */
247+ if (OPAL_SUCCESS != (rc = opal_pmix .server_register_client (& pptr -> name , uid , gid ,
248+ (void * )pptr , NULL , NULL ))) {
249+ ORTE_ERROR_LOG (rc );
250+ }
251+ }
252+ }
253+ /* pass the local ldr */
196254 kv = OBJ_NEW (opal_value_t );
197- kv -> key = strdup (OPAL_PMIX_NODE_SIZE );
198- kv -> type = OPAL_UINT32 ;
199- kv -> data .uint32 = node -> num_procs ;
255+ kv -> key = strdup (OPAL_PMIX_LOCALLDR );
256+ kv -> type = OPAL_VPID ;
257+ kv -> data .name . vpid = vpid ;
200258 opal_list_append (info , & kv -> super );
259+ }
260+
261+ /* for each proc in this job, create an object that
262+ * includes the info describing the proc so the recipient has a complete
263+ * picture. This allows procs to connect to each other without
264+ * any further info exchange, assuming the underlying transports
265+ * support it. We also pass all the proc-specific data here so
266+ * that each proc can lookup info about every other proc in the job */
267+
268+ for (n = 0 ; n < map -> nodes -> size ; n ++ ) {
269+ if (NULL == (node = (orte_node_t * )opal_pointer_array_get_item (map -> nodes , n ))) {
270+ continue ;
271+ }
201272 /* construct the list of local peers, while adding
202273 * each proc's locality info */
203274 list = NULL ;
204275 procs = NULL ;
276+ cpulist = NULL ;
277+ peerlist = NULL ;
205278 vpid = ORTE_VPID_MAX ;
279+ nlocalprocs = 0 ;
206280 for (i = 0 ; i < node -> procs -> size ; i ++ ) {
207281 if (NULL == (pptr = (orte_proc_t * )opal_pointer_array_get_item (node -> procs , i ))) {
208282 continue ;
209283 }
210284 if (pptr -> name .jobid == jdata -> jobid ) {
285+ ++ nlocalprocs ;
211286 opal_argv_append_nosize (& list , ORTE_VPID_PRINT (pptr -> name .vpid ));
212287 if (pptr -> name .vpid < vpid ) {
213288 vpid = pptr -> name .vpid ;
@@ -225,168 +300,127 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
225300 } else {
226301 opal_argv_append_nosize (& procs , "UNBOUND" );
227302 }
228- /* go ahead and register this client */
229- if (OPAL_SUCCESS != (rc = opal_pmix .server_register_client (& pptr -> name , uid , gid ,
230- (void * )pptr , NULL , NULL ))) {
231- ORTE_ERROR_LOG (rc );
232- }
233303 }
234304 }
235-
236305 /* construct the list of peers for transmission */
237306 if (NULL != list ) {
238- tmp = opal_argv_join (list , ',' );
307+ peerlist = opal_argv_join (list , ',' );
239308 opal_argv_free (list );
240309 list = NULL ;
241- /* pass the list of peers */
242- kv = OBJ_NEW (opal_value_t );
243- kv -> key = strdup (OPAL_PMIX_LOCAL_PEERS );
244- kv -> type = OPAL_STRING ;
245- kv -> data .string = tmp ;
246- opal_list_append (info , & kv -> super );
247310 }
248311 /* construct the list of cpusets for transmission */
249312 if (NULL != procs ) {
250- tmp = opal_argv_join (procs , ':' );
313+ cpulist = opal_argv_join (procs , ':' );
251314 opal_argv_free (procs );
252315 procs = NULL ;
253- /* pass the list of cpusets */
254- kv = OBJ_NEW (opal_value_t );
255- kv -> key = strdup (OPAL_PMIX_LOCAL_CPUSETS );
256- kv -> type = OPAL_STRING ;
257- kv -> data .string = tmp ;
258- opal_list_append (info , & kv -> super );
259316 }
260- /* pass the local ldr */
261- kv = OBJ_NEW (opal_value_t );
262- kv -> key = strdup (OPAL_PMIX_LOCALLDR );
263- kv -> type = OPAL_VPID ;
264- kv -> data .name .vpid = vpid ;
265- opal_list_append (info , & kv -> super );
266- }
267-
268- /* univ size */
269- kv = OBJ_NEW (opal_value_t );
270- kv -> key = strdup (OPAL_PMIX_UNIV_SIZE );
271- kv -> type = OPAL_UINT32 ;
272- kv -> data .uint32 = jdata -> total_slots_alloc ;
273- opal_list_append (info , & kv -> super );
274-
275- /* job size */
276- kv = OBJ_NEW (opal_value_t );
277- kv -> key = strdup (OPAL_PMIX_JOB_SIZE );
278- kv -> type = OPAL_UINT32 ;
279- kv -> data .uint32 = jdata -> num_procs ;
280- opal_list_append (info , & kv -> super );
281-
282- /* number of apps in this job */
283- kv = OBJ_NEW (opal_value_t );
284- kv -> key = strdup (OPAL_PMIX_JOB_NUM_APPS );
285- kv -> type = OPAL_UINT32 ;
286- kv -> data .uint32 = jdata -> num_apps ;
287- opal_list_append (info , & kv -> super );
288317
289- /* local size */
290- kv = OBJ_NEW ( opal_value_t );
291- kv -> key = strdup ( OPAL_PMIX_LOCAL_SIZE );
292- kv -> type = OPAL_UINT32 ;
293- kv -> data . uint32 = jdata -> num_local_procs ;
294- opal_list_append ( info , & kv -> super );
295-
296- /* max procs */
297- kv = OBJ_NEW ( opal_value_t );
298- kv -> key = strdup ( OPAL_PMIX_MAX_PROCS ) ;
299- kv -> type = OPAL_UINT32 ;
300- kv -> data . uint32 = jdata -> total_slots_alloc ;
301- opal_list_append ( info , & kv -> super ) ;
318+ /* now cycle across each proc on this node, passing all data that
319+ * varies by proc */
320+ for ( i = 0 ; i < node -> procs -> size ; i ++ ) {
321+ if ( NULL == ( pptr = ( orte_proc_t * ) opal_pointer_array_get_item ( node -> procs , i ))) {
322+ continue ;
323+ }
324+ /* setup the proc map object */
325+ kv = OBJ_NEW ( opal_value_t );
326+ kv -> key = strdup ( OPAL_PMIX_PROC_DATA );
327+ kv -> type = OPAL_PTR ;
328+ kv -> data . ptr = OBJ_NEW ( opal_list_t ) ;
329+ opal_list_append ( info , & kv -> super ) ;
330+ pmap = kv -> data . ptr ;
302331
303- /* for each proc in this job, create an object that
304- * includes the info describing the proc so the recipient has a complete
305- * picture. This allows procs to connect to each other without
306- * an further info exchange, assuming the underlying transports
307- * support it */
332+ /* must start with rank */
333+ kv = OBJ_NEW (opal_value_t );
334+ kv -> key = strdup (OPAL_PMIX_RANK );
335+ kv -> type = OPAL_VPID ;
336+ kv -> data .name .vpid = pptr -> name .vpid ;
337+ opal_list_append (pmap , & kv -> super );
308338
309- for (i = 0 ; i < jdata -> procs -> size ; i ++ ) {
310- if (NULL == (pptr = (orte_proc_t * )opal_pointer_array_get_item (jdata -> procs , i ))) {
311- continue ;
312- }
313- kv = OBJ_NEW (opal_value_t );
314- kv -> key = strdup (OPAL_PMIX_PROC_DATA );
315- kv -> type = OPAL_PTR ;
316- kv -> data .ptr = OBJ_NEW (opal_list_t );
317- opal_list_append (info , & kv -> super );
318- pmap = kv -> data .ptr ;
339+ /* pass the list of peers */
340+ kv = OBJ_NEW (opal_value_t );
341+ kv -> key = strdup (OPAL_PMIX_LOCAL_PEERS );
342+ kv -> type = OPAL_STRING ;
343+ kv -> data .string = peerlist ;
344+ opal_list_append (pmap , & kv -> super );
319345
320- /* rank */
321- kv = OBJ_NEW (opal_value_t );
322- kv -> key = strdup (OPAL_PMIX_RANK );
323- kv -> type = OPAL_VPID ;
324- kv -> data .name . vpid = pptr -> name . vpid ;
325- opal_list_append (pmap , & kv -> super );
346+ /* pass the list of cpusets */
347+ kv = OBJ_NEW (opal_value_t );
348+ kv -> key = strdup (OPAL_PMIX_LOCAL_CPUSETS );
349+ kv -> type = OPAL_STRING ;
350+ kv -> data .string = cpulist ;
351+ opal_list_append (pmap , & kv -> super );
326352
327- /* appnum */
328- kv = OBJ_NEW (opal_value_t );
329- kv -> key = strdup (OPAL_PMIX_APPNUM );
330- kv -> type = OPAL_UINT32 ;
331- kv -> data .uint32 = pptr -> app_idx ;
332- opal_list_append (pmap , & kv -> super );
353+ /* appnum */
354+ kv = OBJ_NEW (opal_value_t );
355+ kv -> key = strdup (OPAL_PMIX_APPNUM );
356+ kv -> type = OPAL_UINT32 ;
357+ kv -> data .uint32 = pptr -> app_idx ;
358+ opal_list_append (pmap , & kv -> super );
333359
334- /* app ldr */
335- app = (orte_app_context_t * )opal_pointer_array_get_item (jdata -> apps , pptr -> app_idx );
336- kv = OBJ_NEW (opal_value_t );
337- kv -> key = strdup (OPAL_PMIX_APPLDR );
338- kv -> type = OPAL_VPID ;
339- kv -> data .name .vpid = app -> first_rank ;
340- opal_list_append (pmap , & kv -> super );
360+ /* app ldr */
361+ app = (orte_app_context_t * )opal_pointer_array_get_item (jdata -> apps , pptr -> app_idx );
362+ kv = OBJ_NEW (opal_value_t );
363+ kv -> key = strdup (OPAL_PMIX_APPLDR );
364+ kv -> type = OPAL_VPID ;
365+ kv -> data .name .vpid = app -> first_rank ;
366+ opal_list_append (pmap , & kv -> super );
341367
342- /* global/univ rank */
343- kv = OBJ_NEW (opal_value_t );
344- kv -> key = strdup (OPAL_PMIX_GLOBAL_RANK );
345- kv -> type = OPAL_VPID ;
346- kv -> data .name .vpid = pptr -> name .vpid + jdata -> offset ;
347- opal_list_append (pmap , & kv -> super );
368+ /* global/univ rank */
369+ kv = OBJ_NEW (opal_value_t );
370+ kv -> key = strdup (OPAL_PMIX_GLOBAL_RANK );
371+ kv -> type = OPAL_VPID ;
372+ kv -> data .name .vpid = pptr -> name .vpid + jdata -> offset ;
373+ opal_list_append (pmap , & kv -> super );
348374
349- /* app rank */
350- kv = OBJ_NEW (opal_value_t );
351- kv -> key = strdup (OPAL_PMIX_APP_RANK );
352- kv -> type = OPAL_VPID ;
353- kv -> data .name .vpid = pptr -> app_rank ;
354- opal_list_append (pmap , & kv -> super );
375+ /* app rank */
376+ kv = OBJ_NEW (opal_value_t );
377+ kv -> key = strdup (OPAL_PMIX_APP_RANK );
378+ kv -> type = OPAL_VPID ;
379+ kv -> data .name .vpid = pptr -> app_rank ;
380+ opal_list_append (pmap , & kv -> super );
355381
356- /* app size */
357- kv = OBJ_NEW (opal_value_t );
358- kv -> key = strdup (OPAL_PMIX_APP_SIZE );
359- kv -> type = OPAL_UINT32 ;
360- kv -> data .uint32 = app -> num_procs ;
361- opal_list_append (info , & kv -> super );
382+ /* app size */
383+ kv = OBJ_NEW (opal_value_t );
384+ kv -> key = strdup (OPAL_PMIX_APP_SIZE );
385+ kv -> type = OPAL_UINT32 ;
386+ kv -> data .uint32 = app -> num_procs ;
387+ opal_list_append (info , & kv -> super );
362388
363- /* local rank */
364- kv = OBJ_NEW (opal_value_t );
365- kv -> key = strdup (OPAL_PMIX_LOCAL_RANK );
366- kv -> type = OPAL_UINT16 ;
367- kv -> data .uint16 = pptr -> local_rank ;
368- opal_list_append (pmap , & kv -> super );
389+ /* local rank */
390+ kv = OBJ_NEW (opal_value_t );
391+ kv -> key = strdup (OPAL_PMIX_LOCAL_RANK );
392+ kv -> type = OPAL_UINT16 ;
393+ kv -> data .uint16 = pptr -> local_rank ;
394+ opal_list_append (pmap , & kv -> super );
369395
370- /* node rank */
371- kv = OBJ_NEW (opal_value_t );
372- kv -> key = strdup (OPAL_PMIX_NODE_RANK );
373- kv -> type = OPAL_UINT16 ;
374- kv -> data .uint32 = pptr -> node_rank ;
375- opal_list_append (pmap , & kv -> super );
396+ /* node rank */
397+ kv = OBJ_NEW (opal_value_t );
398+ kv -> key = strdup (OPAL_PMIX_NODE_RANK );
399+ kv -> type = OPAL_UINT16 ;
400+ kv -> data .uint32 = pptr -> node_rank ;
401+ opal_list_append (pmap , & kv -> super );
376402
377- /* hostname */
378- kv = OBJ_NEW (opal_value_t );
379- kv -> key = strdup (OPAL_PMIX_HOSTNAME );
380- kv -> type = OPAL_STRING ;
381- kv -> data .string = strdup (pptr -> node -> name );
382- opal_list_append (pmap , & kv -> super );
403+ /* hostname */
404+ kv = OBJ_NEW (opal_value_t );
405+ kv -> key = strdup (OPAL_PMIX_HOSTNAME );
406+ kv -> type = OPAL_STRING ;
407+ kv -> data .string = strdup (pptr -> node -> name );
408+ opal_list_append (pmap , & kv -> super );
383409
384- /* node ID */
385- kv = OBJ_NEW (opal_value_t );
386- kv -> key = strdup (OPAL_PMIX_NODEID );
387- kv -> type = OPAL_UINT32 ;
388- kv -> data .uint32 = pptr -> node -> index ;
389- opal_list_append (pmap , & kv -> super );
410+ /* node ID */
411+ kv = OBJ_NEW (opal_value_t );
412+ kv -> key = strdup (OPAL_PMIX_NODEID );
413+ kv -> type = OPAL_UINT32 ;
414+ kv -> data .uint32 = pptr -> node -> index ;
415+ opal_list_append (pmap , & kv -> super );
416+ }
417+ /* cleanup */
418+ if (NULL != cpulist ) {
419+ free (cpulist );
420+ }
421+ if (NULL != peerlist ) {
422+ free (peerlist );
423+ }
390424 }
391425
392426 /* mark the job as registered */
0 commit comments