@@ -308,21 +308,16 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
308
308
const void * arg0 , const void * arg1 , bool send_first , int mode ,
309
309
ompi_request_t * * req )
310
310
{
311
- pmix_info_t pinfo , * results = NULL ;
311
+ pmix_info_t * pinfo , * results = NULL ;
312
312
size_t nresults ;
313
- opal_process_name_t * name_array = NULL ;
314
- char * tag = NULL ;
315
- size_t proc_count ;
316
- size_t cid_base = 0 ;
313
+ opal_process_name_t opal_proc_name ;
317
314
bool cid_base_set = false;
315
+ char * tag = NULL ;
316
+ size_t proc_count = 0 , rproc_count = 0 , tproc_count = 0 , cid_base = 0UL , ninfo ;
318
317
int rc , leader_rank ;
319
- int ret = OMPI_SUCCESS ;
320
- pmix_proc_t * procs = NULL ;
321
-
322
- rc = ompi_group_to_proc_name_array (newcomm -> c_local_group , & name_array , & proc_count );
323
- if (OPAL_UNLIKELY (OMPI_SUCCESS != rc )) {
324
- return rc ;
325
- }
318
+ pmix_proc_t * procs ;
319
+ void * grpinfo = NULL , * list = NULL ;
320
+ pmix_data_array_t darray ;
326
321
327
322
switch (mode ) {
328
323
case OMPI_COMM_CID_GROUP_NEW :
@@ -339,15 +334,75 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
339
334
break ;
340
335
}
341
336
342
- PMIX_INFO_LOAD (& pinfo , PMIX_GROUP_ASSIGN_CONTEXT_ID , NULL , PMIX_BOOL );
337
+ grpinfo = PMIx_Info_list_start ();
338
+ if (NULL == grpinfo ) {
339
+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
340
+ goto fn_exit ;
341
+ }
342
+
343
+ rc = PMIx_Info_list_add (grpinfo , PMIX_GROUP_ASSIGN_CONTEXT_ID , NULL , PMIX_BOOL );
344
+ if (PMIX_SUCCESS != rc ) {
345
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
346
+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
347
+ goto fn_exit ;
348
+ }
349
+
350
+ list = PMIx_Info_list_start ();
351
+
352
+ size_t c_index = (size_t )newcomm -> c_index ;
353
+ rc = PMIx_Info_list_add (list , PMIX_GROUP_LOCAL_CID , & c_index , PMIX_SIZE );
354
+ if (PMIX_SUCCESS != rc ) {
355
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
356
+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
357
+ goto fn_exit ;
358
+ }
359
+
360
+ rc = PMIx_Info_list_convert (list , & darray );
361
+ if (PMIX_SUCCESS != rc ) {
362
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_convert failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
363
+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
364
+ goto fn_exit ;
365
+ }
366
+ rc = PMIx_Info_list_add (grpinfo , PMIX_GROUP_INFO , & darray , PMIX_DATA_ARRAY );
367
+ PMIX_DATA_ARRAY_DESTRUCT (& darray );
368
+ if (PMIX_SUCCESS != rc ) {
369
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
370
+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
371
+ goto fn_exit ;
372
+ }
373
+
374
+ rc = PMIx_Info_list_convert (grpinfo , & darray );
375
+ if (PMIX_SUCCESS != rc ) {
376
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_convert failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
377
+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
378
+ goto fn_exit ;
379
+ }
380
+
381
+ pinfo = (pmix_info_t * )darray .array ;
382
+ ninfo = darray .size ;
383
+
384
+ proc_count = newcomm -> c_local_group -> grp_proc_count ;
385
+ if ( OMPI_COMM_IS_INTER (newcomm ) ){
386
+ rproc_count = newcomm -> c_remote_group -> grp_proc_count ;
387
+ }
388
+
389
+ PMIX_PROC_CREATE (procs , proc_count + rproc_count );
343
390
344
- PMIX_PROC_CREATE (procs , proc_count );
345
391
for (size_t i = 0 ; i < proc_count ; ++ i ) {
346
- OPAL_PMIX_CONVERT_NAME (& procs [i ],& name_array [i ]);
392
+ opal_proc_name = ompi_group_get_proc_name (newcomm -> c_local_group , i );
393
+ OPAL_PMIX_CONVERT_NAME (& procs [i ],& opal_proc_name );
394
+ }
395
+ for (size_t i = 0 ; i < rproc_count ; ++ i ) {
396
+ opal_proc_name = ompi_group_get_proc_name (newcomm -> c_remote_group , i );
397
+ OPAL_PMIX_CONVERT_NAME (& procs [proc_count + i ],& opal_proc_name );
347
398
}
348
399
349
- rc = PMIx_Group_construct (tag , procs , proc_count , & pinfo , 1 , & results , & nresults );
350
- PMIX_INFO_DESTRUCT (& pinfo );
400
+ tproc_count = proc_count + rproc_count ;
401
+
402
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n" ,
403
+ tag , tproc_count , ninfo , cid_base ));
404
+ rc = PMIx_Group_construct (tag , procs , tproc_count , pinfo , ninfo , & results , & nresults );
405
+ PMIX_DATA_ARRAY_DESTRUCT (& darray );
351
406
if (PMIX_SUCCESS != rc ) {
352
407
char msg_string [1024 ];
353
408
switch (rc ) {
@@ -359,7 +414,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
359
414
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups" ,
360
415
msg_string );
361
416
362
- ret = MPI_ERR_UNSUPPORTED_OPERATION ;
417
+ rc = MPI_ERR_UNSUPPORTED_OPERATION ;
363
418
break ;
364
419
case PMIX_ERR_NOT_SUPPORTED :
365
420
sprintf (msg_string ,"PMIx server does not support PMIx Group operations" );
@@ -368,10 +423,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
368
423
true,
369
424
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups" ,
370
425
msg_string );
371
- ret = MPI_ERR_UNSUPPORTED_OPERATION ;
426
+ rc = MPI_ERR_UNSUPPORTED_OPERATION ;
372
427
break ;
373
428
default :
374
- ret = opal_pmix_convert_status (rc );
429
+ rc = opal_pmix_convert_status (rc );
375
430
break ;
376
431
}
377
432
goto fn_exit ;
@@ -381,23 +436,28 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
381
436
if (PMIX_CHECK_KEY (& results [i ], PMIX_GROUP_CONTEXT_ID )) {
382
437
PMIX_VALUE_GET_NUMBER (rc , & results [i ].value , cid_base , size_t );
383
438
if (PMIX_SUCCESS != rc ) {
384
- ret = opal_pmix_convert_status (rc );
439
+ rc = opal_pmix_convert_status (rc );
385
440
goto fn_exit ;
386
441
}
387
442
cid_base_set = true;
388
443
break ;
389
444
}
390
445
}
391
446
447
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n" ,
448
+ tag , tproc_count , ninfo , cid_base ));
449
+
450
+ /* destruct the group */
392
451
rc = PMIx_Group_destruct (tag , NULL , 0 );
393
452
if (PMIX_SUCCESS != rc ) {
394
- ret = opal_pmix_convert_status (rc );
453
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Group_destruct failed %s" , PMIx_Error_string (rc )));
454
+ rc = opal_pmix_convert_status (rc );
395
455
goto fn_exit ;
396
456
}
397
457
398
458
if (!cid_base_set ) {
399
459
opal_show_help ("help-comm.txt" , "cid-base-not-set" , true);
400
- ret = OMPI_ERROR ;
460
+ rc = OMPI_ERROR ;
401
461
goto fn_exit ;
402
462
}
403
463
@@ -410,16 +470,19 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
410
470
}
411
471
412
472
if (NULL != procs ) {
413
- PMIX_PROC_FREE (procs , proc_count );
473
+ PMIX_PROC_FREE (procs , tproc_count );
414
474
procs = NULL ;
415
475
}
416
476
417
- if (NULL != name_array ) {
418
- free (name_array );
419
- name_array = NULL ;
477
+ if (NULL != grpinfo ) {
478
+ PMIx_Info_list_release (grpinfo );
420
479
}
421
480
422
- return ret ;
481
+ if (NULL != list ) {
482
+ PMIx_Info_list_release (list );
483
+ }
484
+
485
+ return rc ;
423
486
}
424
487
425
488
static int ompi_comm_nextcid_ext_nb (ompi_communicator_t * newcomm , ompi_communicator_t * comm ,
@@ -444,6 +507,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
444
507
block = & comm -> c_contextidb ;
445
508
}
446
509
510
+ for (unsigned int i = ompi_mpi_communicators .lowest_free ; i < mca_pml .pml_max_contextid ; ++ i ) {
511
+ bool flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i , newcomm );
512
+ if (true == flag ) {
513
+ newcomm -> c_index = i ;
514
+ break ;
515
+ }
516
+ }
517
+ assert (newcomm -> c_index > 2 );
518
+
447
519
if (NULL == arg1 ) {
448
520
if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode ||
449
521
!ompi_comm_extended_cid_block_available (& comm -> c_contextidb )) {
@@ -466,14 +538,6 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
466
538
(void ) ompi_comm_extended_cid_block_new (block , & newcomm -> c_contextidb , is_new_block );
467
539
}
468
540
469
- for (unsigned int i = ompi_mpi_communicators .lowest_free ; i < mca_pml .pml_max_contextid ; ++ i ) {
470
- bool flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i , newcomm );
471
- if (true == flag ) {
472
- newcomm -> c_index = i ;
473
- break ;
474
- }
475
- }
476
-
477
541
newcomm -> c_contextid = newcomm -> c_contextidb .block_cid ;
478
542
479
543
opal_hash_table_set_value_ptr (& ompi_comm_hash , & newcomm -> c_contextid ,
@@ -500,7 +564,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com
500
564
functions but the pml does not support these functions so return not supported */
501
565
if (NULL == comm ) {
502
566
char msg_string [1024 ];
503
- sprintf (msg_string ,"The PML being used - %s - does not support MPI sessions related features" ,
567
+ sprintf (msg_string ,"The PML being used - %s - does not support MPI sessions related features" ,
504
568
mca_pml_base_selected_component .pmlm_version .mca_component_name );
505
569
opal_show_help ("help-comm.txt" ,
506
570
"MPI function not supported" ,
@@ -855,6 +919,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
855
919
ompi_comm_cid_context_t * context ;
856
920
ompi_comm_request_t * request ;
857
921
ompi_request_t * subreq ;
922
+ uint32_t comm_size ;
858
923
int ret = 0 ;
859
924
860
925
/* the caller should not pass NULL for comm (it may be the same as *newcomm) */
@@ -876,6 +941,25 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
876
941
877
942
request -> context = & context -> super ;
878
943
944
+ /* Prep communicator for handling remote cids if needed */
945
+
946
+ if (!OMPI_COMM_IS_GLOBAL_INDEX (* newcomm )) {
947
+ if (OMPI_COMM_IS_INTER (* newcomm )) {
948
+ comm_size = ompi_comm_remote_size (* newcomm );
949
+ } else {
950
+ comm_size = ompi_comm_size (* newcomm );
951
+ }
952
+
953
+ (* newcomm )-> c_index_vec = (uint32_t * )calloc (comm_size , sizeof (uint32_t ));
954
+ if (NULL == (* newcomm )-> c_index_vec ) {
955
+ return OMPI_ERR_OUT_OF_RESOURCE ;
956
+ }
957
+
958
+ if (OMPI_COMM_IS_INTRA (* newcomm )) {
959
+ (* newcomm )-> c_index_vec [(* newcomm )-> c_my_rank ] = (* newcomm )-> c_index ;
960
+ }
961
+ }
962
+
879
963
if (MPI_UNDEFINED != (* newcomm )-> c_local_group -> grp_my_rank ) {
880
964
/* Initialize the PML stuff in the newcomm */
881
965
if ( OMPI_SUCCESS != (ret = MCA_PML_CALL (add_comm (* newcomm ))) ) {
@@ -926,6 +1010,61 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm
926
1010
return rc ;
927
1011
}
928
1012
1013
+ int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t * comm , int dest , uint32_t * remote_cid )
1014
+ {
1015
+ ompi_proc_t * ompi_proc ;
1016
+ pmix_proc_t pmix_proc ;
1017
+ pmix_info_t tinfo [2 ];
1018
+ pmix_value_t * val = NULL ;
1019
+ ompi_comm_extended_cid_t excid ;
1020
+ int rc = OMPI_SUCCESS ;
1021
+ size_t remote_cid64 ;
1022
+
1023
+ assert (NULL != remote_cid );
1024
+
1025
+ ompi_proc = ompi_comm_peer_lookup (comm , dest );
1026
+ OPAL_PMIX_CONVERT_NAME (& pmix_proc , & ompi_proc -> super .proc_name );
1027
+
1028
+ PMIx_Info_construct (& tinfo [0 ]);
1029
+ PMIX_INFO_LOAD (& tinfo [0 ], PMIX_TIMEOUT , & ompi_pmix_connect_timeout , PMIX_UINT32 );
1030
+
1031
+ excid = ompi_comm_get_extended_cid (comm );
1032
+
1033
+ PMIX_INFO_CONSTRUCT (& tinfo [1 ]);
1034
+ PMIX_INFO_LOAD (& tinfo [1 ], PMIX_GROUP_CONTEXT_ID , & excid .cid_base , PMIX_SIZE );
1035
+ PMIX_INFO_SET_QUALIFIER (& tinfo [1 ]);
1036
+ if (PMIX_SUCCESS != (rc = PMIx_Get (& pmix_proc , PMIX_GROUP_LOCAL_CID , tinfo , 2 , & val ))) {
1037
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s" , excid .cid_base , PMIx_Error_string (rc )));
1038
+ rc = OMPI_ERR_NOT_FOUND ;
1039
+ goto done ;
1040
+ }
1041
+
1042
+ if (NULL == val ) {
1043
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL" ));
1044
+ rc = OMPI_ERR_NOT_FOUND ;
1045
+ goto done ;
1046
+ }
1047
+
1048
+ if (val -> type != PMIX_SIZE ) {
1049
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch" ));
1050
+ rc = OMPI_ERR_TYPE_MISMATCH ;
1051
+ goto done ;
1052
+ }
1053
+
1054
+ PMIX_VALUE_GET_NUMBER (rc , val , remote_cid64 , size_t );
1055
+ rc = OMPI_SUCCESS ;
1056
+ * remote_cid = (uint32_t )remote_cid64 ;
1057
+ comm -> c_index_vec [dest ] = (uint32_t )remote_cid64 ;
1058
+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld" , * remote_cid , excid .cid_base ));
1059
+
1060
+ done :
1061
+ if (NULL != val ) {
1062
+ PMIX_VALUE_RELEASE (val );
1063
+ }
1064
+
1065
+ return rc ;
1066
+ }
1067
+
929
1068
static int ompi_comm_activate_nb_complete (ompi_comm_request_t * request )
930
1069
{
931
1070
ompi_comm_cid_context_t * context = (ompi_comm_cid_context_t * ) request -> context ;
0 commit comments