@@ -259,6 +259,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
259259 * fail. */
260260 if (0 >= rportlen ) {
261261 rc = rportlen ;
262+ /* no need to free here, the root has already done it and everyone else has not yet allocated the rport array */
262263 goto exit ;
263264 }
264265
@@ -406,72 +407,85 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
406407 OPAL_LIST_DESTRUCT (& rlist );
407408 goto exit ;
408409 }
409- if (0 < opal_list_get_size (& ilist )) {
410- uint32_t * peer_ranks = NULL ;
410+ if (!opal_list_is_empty (& ilist )) {
411411 int prn , nprn = 0 ;
412412 char * val ;
413- uint16_t u16 ;
414413 opal_process_name_t wildcard_rank ;
414+ i = 0 ; /* start from the begining */
415+
415416 /* convert the list of new procs to a proc_t array */
416417 new_proc_list = (ompi_proc_t * * )calloc (opal_list_get_size (& ilist ),
417418 sizeof (ompi_proc_t * ));
418- /* get the list of local peers for the new procs */
419- cd = (ompi_dpm_proct_caddy_t * )opal_list_get_first (& ilist );
420- proc = cd -> p ;
421- wildcard_rank .jobid = proc -> super .proc_name .jobid ;
422- wildcard_rank .vpid = OMPI_NAME_WILDCARD -> vpid ;
423- /* retrieve the local peers */
424- OPAL_MODEX_RECV_VALUE_OPTIONAL (rc , PMIX_LOCAL_PEERS ,
425- & wildcard_rank , & val , PMIX_STRING );
426- if (OPAL_SUCCESS == rc && NULL != val ) {
427- char * * peers = opal_argv_split (val , ',' );
428- free (val );
429- nprn = opal_argv_count (peers );
430- peer_ranks = (uint32_t * )calloc (nprn , sizeof (uint32_t ));
431- for (prn = 0 ; NULL != peers [prn ]; prn ++ ) {
432- peer_ranks [prn ] = strtoul (peers [prn ], NULL , 10 );
433- }
434- opal_argv_free (peers );
435- }
436-
437- i = 0 ;
438- OPAL_LIST_FOREACH (cd , & ilist , ompi_dpm_proct_caddy_t ) {
419+ /* Extract the modex info for the first proc on the ilist, and then
420+ * remove all processors in the same jobid from the list by getting
421+ * their connection information and moving them into the proc array.
422+ */
423+ do {
424+ uint32_t * local_ranks_in_jobid = NULL ;
425+ ompi_dpm_proct_caddy_t * next = NULL ;
426+ cd = (ompi_dpm_proct_caddy_t * )opal_list_get_first (& ilist );
439427 proc = cd -> p ;
440- new_proc_list [i ] = proc ;
441- /* ompi_proc_complete_init_single() initializes and optionally retrieves
442- * OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without
443- * them, we are just fine */
444- ompi_proc_complete_init_single (proc );
445- /* if this proc is local, then get its locality */
446- if (NULL != peer_ranks ) {
447- for (prn = 0 ; prn < nprn ; prn ++ ) {
448- if (peer_ranks [prn ] == proc -> super .proc_name .vpid ) {
449- /* get their locality string */
450- val = NULL ;
451- OPAL_MODEX_RECV_VALUE_IMMEDIATE (rc , PMIX_LOCALITY_STRING ,
452- & proc -> super .proc_name , & val , PMIX_STRING );
453- if (OPAL_SUCCESS == rc && NULL != ompi_process_info .locality ) {
454- u16 = opal_hwloc_compute_relative_locality (ompi_process_info .locality , val );
455- free (val );
456- } else {
457- /* all we can say is that it shares our node */
458- u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE ;
428+ wildcard_rank .jobid = proc -> super .proc_name .jobid ;
429+ wildcard_rank .vpid = OMPI_NAME_WILDCARD -> vpid ;
430+ /* retrieve the local peers for the specified jobid */
431+ OPAL_MODEX_RECV_VALUE_OPTIONAL (rc , PMIX_LOCAL_PEERS ,
432+ & wildcard_rank , & val , PMIX_STRING );
433+ if (OPAL_SUCCESS == rc && NULL != val ) {
434+ char * * peers = opal_argv_split (val , ',' );
435+ free (val );
436+ nprn = opal_argv_count (peers );
437+ local_ranks_in_jobid = (uint32_t * )calloc (nprn , sizeof (uint32_t ));
438+ for (prn = 0 ; NULL != peers [prn ]; prn ++ ) {
439+ local_ranks_in_jobid [prn ] = strtoul (peers [prn ], NULL , 10 );
440+ }
441+ opal_argv_free (peers );
442+ }
443+
444+ OPAL_LIST_FOREACH_SAFE (cd , next , & ilist , ompi_dpm_proct_caddy_t ) {
445+ proc = cd -> p ;
446+ if ( proc -> super .proc_name .jobid != wildcard_rank .jobid )
447+ continue ; /* not a proc from this jobid */
448+
449+ new_proc_list [i ] = proc ;
450+ opal_list_remove_item (& ilist , (opal_list_item_t * )cd ); // TODO: do we need to release cd ?
451+ OBJ_RELEASE (cd );
452+ /* ompi_proc_complete_init_single() initializes and optionally retrieves
453+ * OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without
454+ * them, we are just fine */
455+ ompi_proc_complete_init_single (proc );
456+ /* if this proc is local, then get its locality */
457+ if (NULL != local_ranks_in_jobid ) {
458+ uint16_t u16 ;
459+ for (prn = 0 ; prn < nprn ; prn ++ ) {
460+ if (local_ranks_in_jobid [prn ] == proc -> super .proc_name .vpid ) {
461+ /* get their locality string */
462+ val = NULL ;
463+ OPAL_MODEX_RECV_VALUE_IMMEDIATE (rc , PMIX_LOCALITY_STRING ,
464+ & proc -> super .proc_name , & val , PMIX_STRING );
465+ if (OPAL_SUCCESS == rc && NULL != ompi_process_info .locality ) {
466+ u16 = opal_hwloc_compute_relative_locality (ompi_process_info .locality , val );
467+ free (val );
468+ } else {
469+ /* all we can say is that it shares our node */
470+ u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE ;
471+ }
472+ proc -> super .proc_flags = u16 ;
473+ /* save the locality for later */
474+ OPAL_PMIX_CONVERT_NAME (& pxproc , & proc -> super .proc_name );
475+ pval .type = PMIX_UINT16 ;
476+ pval .data .uint16 = proc -> super .proc_flags ;
477+ PMIx_Store_internal (& pxproc , PMIX_LOCALITY , & pval );
478+ break ;
459479 }
460- proc -> super .proc_flags = u16 ;
461- /* save the locality for later */
462- OPAL_PMIX_CONVERT_NAME (& pxproc , & proc -> super .proc_name );
463- pval .type = PMIX_UINT16 ;
464- pval .data .uint16 = proc -> super .proc_flags ;
465- PMIx_Store_internal (& pxproc , PMIX_LOCALITY , & pval );
466- break ;
467480 }
468481 }
482+ ++ i ;
469483 }
470- ++ i ;
471- }
472- if ( NULL != peer_ranks ) {
473- free ( peer_ranks );
474- }
484+ if ( NULL != local_ranks_in_jobid ) {
485+ free ( local_ranks_in_jobid );
486+ }
487+ } while (! opal_list_is_empty ( & ilist ) );
488+
475489 /* call add_procs on the new ones */
476490 rc = MCA_PML_CALL (add_procs (new_proc_list , opal_list_get_size (& ilist )));
477491 free (new_proc_list );
0 commit comments