@@ -308,21 +308,16 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
308308 const void * arg0 , const void * arg1 , bool send_first , int mode ,
309309 ompi_request_t * * req )
310310{
311- pmix_info_t pinfo , * results = NULL ;
311+ pmix_info_t * pinfo , * results = NULL ;
312312 size_t nresults ;
313- opal_process_name_t * name_array = NULL ;
314- char * tag = NULL ;
315- size_t proc_count ;
316- size_t cid_base = 0 ;
313+ opal_process_name_t opal_proc_name ;
317314 bool cid_base_set = false;
315+ char * tag = NULL ;
316+ size_t proc_count = 0 , rproc_count = 0 , tproc_count = 0 , cid_base = 0UL , ninfo ;
318317 int rc , leader_rank ;
319- int ret = OMPI_SUCCESS ;
320- pmix_proc_t * procs = NULL ;
321-
322- rc = ompi_group_to_proc_name_array (newcomm -> c_local_group , & name_array , & proc_count );
323- if (OPAL_UNLIKELY (OMPI_SUCCESS != rc )) {
324- return rc ;
325- }
318+ pmix_proc_t * procs ;
319+ void * grpinfo = NULL , * list = NULL ;
320+ pmix_data_array_t darray ;
326321
327322 switch (mode ) {
328323 case OMPI_COMM_CID_GROUP_NEW :
@@ -339,15 +334,75 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
339334 break ;
340335 }
341336
342- PMIX_INFO_LOAD (& pinfo , PMIX_GROUP_ASSIGN_CONTEXT_ID , NULL , PMIX_BOOL );
337+ grpinfo = PMIx_Info_list_start ();
338+ if (NULL == grpinfo ) {
339+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
340+ goto fn_exit ;
341+ }
342+
343+ rc = PMIx_Info_list_add (grpinfo , PMIX_GROUP_ASSIGN_CONTEXT_ID , NULL , PMIX_BOOL );
344+ if (PMIX_SUCCESS != rc ) {
345+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
346+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
347+ goto fn_exit ;
348+ }
349+
350+ list = PMIx_Info_list_start ();
351+
352+ size_t c_index = (size_t )newcomm -> c_index ;
353+ rc = PMIx_Info_list_add (list , PMIX_GROUP_LOCAL_CID , & c_index , PMIX_SIZE );
354+ if (PMIX_SUCCESS != rc ) {
355+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
356+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
357+ goto fn_exit ;
358+ }
359+
360+ rc = PMIx_Info_list_convert (list , & darray );
361+ if (PMIX_SUCCESS != rc ) {
362+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_convert failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
363+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
364+ goto fn_exit ;
365+ }
366+ rc = PMIx_Info_list_add (grpinfo , PMIX_GROUP_INFO , & darray , PMIX_DATA_ARRAY );
367+ PMIX_DATA_ARRAY_DESTRUCT (& darray );
368+ if (PMIX_SUCCESS != rc ) {
369+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
370+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
371+ goto fn_exit ;
372+ }
373+
374+ rc = PMIx_Info_list_convert (grpinfo , & darray );
375+ if (PMIX_SUCCESS != rc ) {
376+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_convert failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
377+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
378+ goto fn_exit ;
379+ }
380+
381+ pinfo = (pmix_info_t * )darray .array ;
382+ ninfo = darray .size ;
383+
384+ proc_count = newcomm -> c_local_group -> grp_proc_count ;
385+ if ( OMPI_COMM_IS_INTER (newcomm ) ){
386+ rproc_count = newcomm -> c_remote_group -> grp_proc_count ;
387+ }
388+
389+ PMIX_PROC_CREATE (procs , proc_count + rproc_count );
343390
344- PMIX_PROC_CREATE (procs , proc_count );
345391 for (size_t i = 0 ; i < proc_count ; ++ i ) {
346- OPAL_PMIX_CONVERT_NAME (& procs [i ],& name_array [i ]);
392+ opal_proc_name = ompi_group_get_proc_name (newcomm -> c_local_group , i );
393+ OPAL_PMIX_CONVERT_NAME (& procs [i ],& opal_proc_name );
394+ }
395+ for (size_t i = 0 ; i < rproc_count ; ++ i ) {
396+ opal_proc_name = ompi_group_get_proc_name (newcomm -> c_remote_group , i );
397+ OPAL_PMIX_CONVERT_NAME (& procs [proc_count + i ],& opal_proc_name );
347398 }
348399
349- rc = PMIx_Group_construct (tag , procs , proc_count , & pinfo , 1 , & results , & nresults );
350- PMIX_INFO_DESTRUCT (& pinfo );
400+ tproc_count = proc_count + rproc_count ;
401+
402+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n" ,
403+ tag , tproc_count , ninfo , cid_base ));
404+ rc = PMIx_Group_construct (tag , procs , tproc_count , pinfo , ninfo , & results , & nresults );
405+ PMIX_DATA_ARRAY_DESTRUCT (& darray );
351406 if (PMIX_SUCCESS != rc ) {
352407 char msg_string [1024 ];
353408 switch (rc ) {
@@ -359,7 +414,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
359414 "MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups" ,
360415 msg_string );
361416
362- ret = MPI_ERR_UNSUPPORTED_OPERATION ;
417+ rc = MPI_ERR_UNSUPPORTED_OPERATION ;
363418 break ;
364419 case PMIX_ERR_NOT_SUPPORTED :
365420 sprintf (msg_string ,"PMIx server does not support PMIx Group operations" );
@@ -368,10 +423,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
368423 true,
369424 "MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups" ,
370425 msg_string );
371- ret = MPI_ERR_UNSUPPORTED_OPERATION ;
426+ rc = MPI_ERR_UNSUPPORTED_OPERATION ;
372427 break ;
373428 default :
374- ret = opal_pmix_convert_status (rc );
429+ rc = opal_pmix_convert_status (rc );
375430 break ;
376431 }
377432 goto fn_exit ;
@@ -381,23 +436,28 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
381436 if (PMIX_CHECK_KEY (& results [i ], PMIX_GROUP_CONTEXT_ID )) {
382437 PMIX_VALUE_GET_NUMBER (rc , & results [i ].value , cid_base , size_t );
383438 if (PMIX_SUCCESS != rc ) {
384- ret = opal_pmix_convert_status (rc );
439+ rc = opal_pmix_convert_status (rc );
385440 goto fn_exit ;
386441 }
387442 cid_base_set = true;
388443 break ;
389444 }
390445 }
391446
447+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n" ,
448+ tag , tproc_count , ninfo , cid_base ));
449+
450+ /* destruct the group */
392451 rc = PMIx_Group_destruct (tag , NULL , 0 );
393452 if (PMIX_SUCCESS != rc ) {
394- ret = opal_pmix_convert_status (rc );
453+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Group_destruct failed %s" , PMIx_Error_string (rc )));
454+ rc = opal_pmix_convert_status (rc );
395455 goto fn_exit ;
396456 }
397457
398458 if (!cid_base_set ) {
399459 opal_show_help ("help-comm.txt" , "cid-base-not-set" , true);
400- ret = OMPI_ERROR ;
460+ rc = OMPI_ERROR ;
401461 goto fn_exit ;
402462 }
403463
@@ -410,16 +470,19 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
410470 }
411471
412472 if (NULL != procs ) {
413- PMIX_PROC_FREE (procs , proc_count );
473+ PMIX_PROC_FREE (procs , tproc_count );
414474 procs = NULL ;
415475 }
416476
417- if (NULL != name_array ) {
418- free (name_array );
419- name_array = NULL ;
477+ if (NULL != grpinfo ) {
478+ PMIx_Info_list_release (grpinfo );
420479 }
421480
422- return ret ;
481+ if (NULL != list ) {
482+ PMIx_Info_list_release (list );
483+ }
484+
485+ return rc ;
423486}
424487
425488static int ompi_comm_nextcid_ext_nb (ompi_communicator_t * newcomm , ompi_communicator_t * comm ,
@@ -444,6 +507,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
444507 block = & comm -> c_contextidb ;
445508 }
446509
510+ for (unsigned int i = ompi_mpi_communicators .lowest_free ; i < mca_pml .pml_max_contextid ; ++ i ) {
511+ bool flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i , newcomm );
512+ if (true == flag ) {
513+ newcomm -> c_index = i ;
514+ break ;
515+ }
516+ }
517+ assert (newcomm -> c_index > 2 );
518+
447519 if (NULL == arg1 ) {
448520 if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode ||
449521 !ompi_comm_extended_cid_block_available (& comm -> c_contextidb )) {
@@ -466,14 +538,6 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
466538 (void ) ompi_comm_extended_cid_block_new (block , & newcomm -> c_contextidb , is_new_block );
467539 }
468540
469- for (unsigned int i = ompi_mpi_communicators .lowest_free ; i < mca_pml .pml_max_contextid ; ++ i ) {
470- bool flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i , newcomm );
471- if (true == flag ) {
472- newcomm -> c_index = i ;
473- break ;
474- }
475- }
476-
477541 newcomm -> c_contextid = newcomm -> c_contextidb .block_cid ;
478542
479543 opal_hash_table_set_value_ptr (& ompi_comm_hash , & newcomm -> c_contextid ,
@@ -500,7 +564,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com
500564 functions but the pml does not support these functions so return not supported */
501565 if (NULL == comm ) {
502566 char msg_string [1024 ];
503- sprintf (msg_string ,"The PML being used - %s - does not support MPI sessions related features" ,
567+ sprintf (msg_string ,"The PML being used - %s - does not support MPI sessions related features" ,
504568 mca_pml_base_selected_component .pmlm_version .mca_component_name );
505569 opal_show_help ("help-comm.txt" ,
506570 "MPI function not supported" ,
@@ -855,6 +919,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
855919 ompi_comm_cid_context_t * context ;
856920 ompi_comm_request_t * request ;
857921 ompi_request_t * subreq ;
922+ uint32_t comm_size ;
858923 int ret = 0 ;
859924
860925 /* the caller should not pass NULL for comm (it may be the same as *newcomm) */
@@ -876,6 +941,25 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
876941
877942 request -> context = & context -> super ;
878943
944+ /* Prep communicator for handling remote cids if needed */
945+
946+ if (!OMPI_COMM_IS_GLOBAL_INDEX (* newcomm )) {
947+ if (OMPI_COMM_IS_INTER (* newcomm )) {
948+ comm_size = ompi_comm_remote_size (* newcomm );
949+ } else {
950+ comm_size = ompi_comm_size (* newcomm );
951+ }
952+
953+ (* newcomm )-> c_index_vec = (uint32_t * )calloc (comm_size , sizeof (uint32_t ));
954+ if (NULL == (* newcomm )-> c_index_vec ) {
955+ return OMPI_ERR_OUT_OF_RESOURCE ;
956+ }
957+
958+ if (OMPI_COMM_IS_INTRA (* newcomm )) {
959+ (* newcomm )-> c_index_vec [(* newcomm )-> c_my_rank ] = (* newcomm )-> c_index ;
960+ }
961+ }
962+
879963 if (MPI_UNDEFINED != (* newcomm )-> c_local_group -> grp_my_rank ) {
880964 /* Initialize the PML stuff in the newcomm */
881965 if ( OMPI_SUCCESS != (ret = MCA_PML_CALL (add_comm (* newcomm ))) ) {
@@ -926,6 +1010,61 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm
9261010 return rc ;
9271011}
9281012
1013+ int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t * comm , int dest , uint32_t * remote_cid )
1014+ {
1015+ ompi_proc_t * ompi_proc ;
1016+ pmix_proc_t pmix_proc ;
1017+ pmix_info_t tinfo [2 ];
1018+ pmix_value_t * val = NULL ;
1019+ ompi_comm_extended_cid_t excid ;
1020+ int rc = OMPI_SUCCESS ;
1021+ size_t remote_cid64 ;
1022+
1023+ assert (NULL != remote_cid );
1024+
1025+ ompi_proc = ompi_comm_peer_lookup (comm , dest );
1026+ OPAL_PMIX_CONVERT_NAME (& pmix_proc , & ompi_proc -> super .proc_name );
1027+
1028+ PMIx_Info_construct (& tinfo [0 ]);
1029+ PMIX_INFO_LOAD (& tinfo [0 ], PMIX_TIMEOUT , & ompi_pmix_connect_timeout , PMIX_UINT32 );
1030+
1031+ excid = ompi_comm_get_extended_cid (comm );
1032+
1033+ PMIX_INFO_CONSTRUCT (& tinfo [1 ]);
1034+ PMIX_INFO_LOAD (& tinfo [1 ], PMIX_GROUP_CONTEXT_ID , & excid .cid_base , PMIX_SIZE );
1035+ PMIX_INFO_SET_QUALIFIER (& tinfo [1 ]);
1036+ if (PMIX_SUCCESS != (rc = PMIx_Get (& pmix_proc , PMIX_GROUP_LOCAL_CID , tinfo , 2 , & val ))) {
1037+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s" , excid .cid_base , PMIx_Error_string (rc )));
1038+ rc = OMPI_ERR_NOT_FOUND ;
1039+ goto done ;
1040+ }
1041+
1042+ if (NULL == val ) {
1043+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL" ));
1044+ rc = OMPI_ERR_NOT_FOUND ;
1045+ goto done ;
1046+ }
1047+
1048+ if (val -> type != PMIX_SIZE ) {
1049+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch" ));
1050+ rc = OMPI_ERR_TYPE_MISMATCH ;
1051+ goto done ;
1052+ }
1053+
1054+ PMIX_VALUE_GET_NUMBER (rc , val , remote_cid64 , size_t );
1055+ rc = OMPI_SUCCESS ;
1056+ * remote_cid = (uint32_t )remote_cid64 ;
1057+ comm -> c_index_vec [dest ] = (uint32_t )remote_cid64 ;
1058+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld" , * remote_cid , excid .cid_base ));
1059+
1060+ done :
1061+ if (NULL != val ) {
1062+ PMIX_VALUE_RELEASE (val );
1063+ }
1064+
1065+ return rc ;
1066+ }
1067+
9291068static int ompi_comm_activate_nb_complete (ompi_comm_request_t * request )
9301069{
9311070 ompi_comm_cid_context_t * context = (ompi_comm_cid_context_t * ) request -> context ;
0 commit comments