@@ -115,6 +115,8 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
115115 opal_hash_table_set_value_ptr (& ompi_proc_hash , & proc -> super .proc_name , sizeof (proc -> super .proc_name ),
116116 proc );
117117
118+ /* by default we consider process to be remote */
119+ proc -> super .proc_flags = OPAL_PROC_NON_LOCAL ;
118120 * procp = proc ;
119121
120122 return OMPI_SUCCESS ;
@@ -132,26 +134,14 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
132134 */
133135int ompi_proc_complete_init_single (ompi_proc_t * proc )
134136{
135- uint16_t u16 , * u16ptr ;
136137 int ret ;
137138
138- u16ptr = & u16 ;
139-
140139 if ((OMPI_CAST_RTE_NAME (& proc -> super .proc_name )-> jobid == OMPI_PROC_MY_NAME -> jobid ) &&
141140 (OMPI_CAST_RTE_NAME (& proc -> super .proc_name )-> vpid == OMPI_PROC_MY_NAME -> vpid )) {
142141 /* nothing else to do */
143142 return OMPI_SUCCESS ;
144143 }
145144
146- /* get the locality information - all RTEs are required
147- * to provide this information at startup */
148- OPAL_MODEX_RECV_VALUE_OPTIONAL (ret , OPAL_PMIX_LOCALITY , & proc -> super .proc_name , & u16ptr , OPAL_UINT16 );
149- if (OPAL_SUCCESS != ret ) {
150- proc -> super .proc_flags = OPAL_PROC_NON_LOCAL ;
151- } else {
152- proc -> super .proc_flags = u16 ;
153- }
154-
155145 /* we can retrieve the hostname at no cost because it
156146 * was provided at startup - but make it optional so
157147 * we don't chase after it if some system doesn't
@@ -286,20 +276,6 @@ int ompi_proc_init(void)
286276 }
287277#endif
288278
289- if (ompi_process_info .num_procs < ompi_add_procs_cutoff ) {
290- /* create proc structures and find self */
291- for (ompi_vpid_t i = 0 ; i < ompi_process_info .num_procs ; ++ i ) {
292- if (i == OMPI_PROC_MY_NAME -> vpid ) {
293- continue ;
294- }
295-
296- ret = ompi_proc_allocate (OMPI_PROC_MY_NAME -> jobid , i , & proc );
297- if (OMPI_SUCCESS != ret ) {
298- return ret ;
299- }
300- }
301- }
302-
303279 return OMPI_SUCCESS ;
304280}
305281
@@ -330,41 +306,69 @@ int ompi_proc_complete_init(void)
330306{
331307 ompi_proc_t * proc ;
332308 int ret , errcode = OMPI_SUCCESS ;
309+ char * val ;
333310
334311 opal_mutex_lock (& ompi_proc_lock );
335312
313+ /* Add all local peers first */
314+ OPAL_MODEX_RECV_VALUE (ret , OPAL_PMIX_LOCAL_PEERS ,
315+ ORTE_PROC_MY_NAME , & val , OPAL_STRING );
316+ if (OPAL_SUCCESS == ret && NULL != val ) {
317+ char * * peers = opal_argv_split (val , ',' );
318+ int i ;
319+ free (val );
320+ for (i = 0 ; NULL != peers [i ]; i ++ ) {
321+ ompi_vpid_t local_rank = strtoul (peers [i ], NULL , 10 );
322+ uint16_t u16 , * u16ptr = & u16 ;
323+ if (OMPI_PROC_MY_NAME -> vpid == local_rank ) {
324+ continue ;
325+ }
326+ ret = ompi_proc_allocate (OMPI_PROC_MY_NAME -> jobid , local_rank , & proc );
327+ if (OMPI_SUCCESS != ret ) {
328+ return ret ;
329+ }
330+ /* get the locality information - all RTEs are required
331+ * to provide this information at startup */
332+ OPAL_MODEX_RECV_VALUE_OPTIONAL (ret , OPAL_PMIX_LOCALITY , & proc -> super .proc_name , & u16ptr , OPAL_UINT16 );
333+ if (OPAL_SUCCESS == ret ) {
334+ proc -> super .proc_flags = u16 ;
335+ }
336+ }
337+ opal_argv_free (peers );
338+ }
339+
340+ /* Complete initialization of node-local procs */
336341 OPAL_LIST_FOREACH (proc , & ompi_proc_list , ompi_proc_t ) {
337342 ret = ompi_proc_complete_init_single (proc );
338343 if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
339344 errcode = ret ;
340345 break ;
341346 }
342347 }
343- opal_mutex_unlock (& ompi_proc_lock );
344348
345- if (ompi_process_info .num_procs >= ompi_add_procs_cutoff ) {
346- char * val ;
347- /* retrieve the local peers */
348- OPAL_MODEX_RECV_VALUE (ret , OPAL_PMIX_LOCAL_PEERS ,
349- ORTE_PROC_MY_NAME , & val , OPAL_STRING );
350- if (OPAL_SUCCESS == ret && NULL != val ) {
351- char * * peers = opal_argv_split (val , ',' );
352- int i ;
353- free (val );
354- for (i = 0 ; NULL != peers [i ]; i ++ ) {
355- ompi_vpid_t local_rank = strtoul (peers [i ], NULL , 10 );
356- opal_process_name_t proc_name = {.vpid = local_rank , .jobid = OMPI_PROC_MY_NAME -> jobid };
357-
358- if (OMPI_PROC_MY_NAME -> vpid == local_rank ) {
359- continue ;
360- }
361- (void ) ompi_proc_for_name (proc_name );
362- }
363- opal_argv_free (peers );
349+ /* if cutoff is larger than # of procs - add all processes
350+ * NOTE that local procs will be automatically skipped as they
351+ * are already in the hash table
352+ */
353+ if (ompi_process_info .num_procs < ompi_add_procs_cutoff ) {
354+ /* since ompi_proc_for_name locks internally,
355+ * we need to release the lock here
356+ */
357+ opal_mutex_unlock (& ompi_proc_lock );
358+
359+ for (ompi_vpid_t i = 0 ; i < ompi_process_info .num_procs ; ++ i ) {
360+ opal_process_name_t proc_name ;
361+ proc_name .jobid = OMPI_PROC_MY_NAME -> jobid ;
362+ proc_name .vpid = i ;
363+ (void ) ompi_proc_for_name (proc_name );
364364 }
365+
366+ /* re-acquire the lock for the next step - sort */
367+ opal_mutex_lock (& ompi_proc_lock );
365368 }
366369
367370 opal_list_sort (& ompi_proc_list , ompi_proc_compare_vid );
371+ opal_mutex_unlock (& ompi_proc_lock );
368372
369373 return errcode ;
370374}
0 commit comments