@@ -1091,7 +1091,8 @@ int ompi_instance_get_num_psets (ompi_instance_t *instance, int *npset_names)
10911091
10921092int ompi_instance_get_nth_pset (ompi_instance_t * instance , int n , int * len , char * pset_name )
10931093{
1094- if (NULL == ompi_mpi_instance_pmix_psets && n >= ompi_instance_builtin_count ) {
1094+ if (NULL == ompi_mpi_instance_pmix_psets ||
1095+ (size_t ) n >= (ompi_instance_builtin_count + ompi_mpi_instance_num_pmix_psets )) {
10951096 ompi_instance_refresh_pmix_psets (PMIX_QUERY_PSET_NAMES );
10961097 }
10971098
@@ -1229,71 +1230,83 @@ static int ompi_instance_group_self (ompi_instance_t *instance, ompi_group_t **g
12291230
12301231static int ompi_instance_group_pmix_pset (ompi_instance_t * instance , const char * pset_name , ompi_group_t * * group_out )
12311232{
1233+ int ret = OMPI_SUCCESS ;
1234+ size_t i ,n ;
1235+ bool isnew , try_again = false, refresh = true;
12321236 pmix_status_t rc ;
1233- pmix_proc_t p ;
1234- ompi_group_t * group ;
1235- pmix_value_t * pval = NULL ;
1236- char * stmp = NULL ;
1237- size_t size = 0 ;
1238-
1239- /* make the group large enough to hold world */
1240- group = ompi_group_allocate (NULL , ompi_process_info .num_procs );
1241- if (OPAL_UNLIKELY (NULL == group )) {
1242- return OMPI_ERR_OUT_OF_RESOURCE ;
1243- }
1237+ ompi_group_t * group = NULL ;
1238+ pmix_query_t query ;
1239+ pmix_info_t * info = NULL ;
1240+ size_t ninfo ;
1241+ opal_process_name_t pname ;
12441242
1243+ PMIX_QUERY_CONSTRUCT (& query );
1244+ PMIX_ARGV_APPEND (rc , query .keys , PMIX_QUERY_PSET_MEMBERSHIP );
1245+ PMIX_INFO_CREATE (query .qualifiers , 1 );
1246+ query .nqual = 1 ;
1247+ PMIX_INFO_LOAD (& query .qualifiers [0 ], PMIX_PSET_NAME , pset_name , PMIX_STRING );
12451248
1246- for (size_t i = 0 ; i < ompi_process_info .num_procs ; ++ i ) {
1247- opal_process_name_t name = {.vpid = i , .jobid = OMPI_PROC_MY_NAME -> jobid };
1249+ /*
1250+ * First try finding in the local PMIx cache, if not found, try a refresh
1251+ */
1252+ fn_try_again :
1253+ rc = PMIx_Query_info (& query , 1 , & info , & ninfo );
1254+ if (PMIX_SUCCESS != (rc = PMIx_Query_info (& query , 1 , & info , & ninfo )) || 0 == ninfo ) {
1255+ if ((PMIX_ERR_NOT_FOUND == rc ) && (false == try_again )) {
1256+ try_again = true;
1257+ PMIX_QUERY_DESTRUCT (& query );
1258+ PMIX_QUERY_CONSTRUCT (& query );
1259+ PMIX_ARGV_APPEND (rc , query .keys , PMIX_QUERY_PSET_MEMBERSHIP );
1260+ PMIX_INFO_CREATE (query .qualifiers , 2 );
1261+ PMIX_INFO_LOAD (& query .qualifiers [0 ], PMIX_PSET_NAME , pset_name , PMIX_STRING );
1262+ PMIX_INFO_LOAD (& query .qualifiers [1 ], PMIX_QUERY_REFRESH_CACHE , & refresh , PMIX_BOOL );
1263+ goto fn_try_again ;
1264+ }
1265+ ret = opal_pmix_convert_status (rc );
1266+ ompi_instance_print_error ("PMIx_Query_info() failed" , ret );
1267+ goto fn_w_query ;
1268+ }
12481269
1249- OPAL_PMIX_CONVERT_NAME (& p , & name );
1250- rc = PMIx_Get (& p , PMIX_PSET_NAME , NULL , 0 , & pval );
1251- if (OPAL_UNLIKELY (PMIX_SUCCESS != rc )) {
1252- OBJ_RELEASE (group );
1253- return opal_pmix_convert_status (rc );
1254- }
1270+ for (n = 0 ; n < ninfo ; n ++ ){
1271+ if (0 == strcmp (info [n ].key , PMIX_QUERY_PSET_MEMBERSHIP )){
1272+
1273+ pmix_data_array_t * data_array = info [n ].value .data .darray ;
1274+ pmix_proc_t * members_array = (pmix_proc_t * ) data_array -> array ;
12551275
1256- PMIX_VALUE_UNLOAD (rc ,
1257- pval ,
1258- (void * * )& stmp ,
1259- & size );
1260- if (0 != strcmp (pset_name , stmp )) {
1261- PMIX_VALUE_RELEASE (pval );
1262- free (stmp );
1263- continue ;
1264- }
1265- PMIX_VALUE_RELEASE (pval );
1266- free (stmp );
1276+ group = ompi_group_allocate (NULL , data_array -> size );
1277+ if (OPAL_UNLIKELY (NULL == group )) {
1278+ ret = OMPI_ERR_OUT_OF_RESOURCE ;
1279+ goto fn_w_info ;
1280+ }
12671281
1268- /* look for existing ompi_proc_t that matches this name */
1269- group -> grp_proc_pointers [size ] = (ompi_proc_t * ) ompi_proc_lookup (name );
1270- if (NULL == group -> grp_proc_pointers [size ]) {
1271- /* set sentinel value */
1272- group -> grp_proc_pointers [size ] = (ompi_proc_t * ) ompi_proc_name_to_sentinel (name );
1273- } else {
1274- OBJ_RETAIN (group -> grp_proc_pointers [size ]);
1282+ for (i = 0 ; i < data_array -> size ; i ++ ){
1283+ OPAL_PMIX_CONVERT_PROCT (ret , & pname , & members_array [i ]);
1284+ if (OPAL_SUCCESS == rc ) {
1285+ group -> grp_proc_pointers [i ] = ompi_proc_find_and_add (& pname ,& isnew );
1286+ } else {
1287+ ompi_instance_print_error ("OPAL_PMIX_CONVERT_PROCT failed %d" , ret );
1288+ ompi_group_free (& group );
1289+ goto fn_w_info ;
1290+ }
1291+ }
1292+ break ;
12751293 }
1276- ++ size ;
12771294 }
12781295
1279- /* shrink the proc array if needed */
1280- if (size < (size_t ) group -> grp_proc_count ) {
1281- void * tmp = realloc (group -> grp_proc_pointers , size * sizeof (group -> grp_proc_pointers [0 ]));
1282- if (OPAL_UNLIKELY (NULL == tmp )) {
1283- OBJ_RELEASE (group );
1284- return OMPI_ERR_OUT_OF_RESOURCE ;
1285- }
1286-
1287- group -> grp_proc_pointers = (ompi_proc_t * * ) tmp ;
1288- group -> grp_proc_count = (int ) size ;
1296+ if (NULL != group ) {
1297+ ompi_set_group_rank (group , ompi_proc_local ());
1298+ group -> grp_instance = instance ;
1299+ * group_out = group ;
1300+ } else {
1301+ ret = OMPI_ERR_NOT_FOUND ;
12891302 }
12901303
1291- ompi_set_group_rank (group , ompi_proc_local ());
1292-
1293- group -> grp_instance = instance ;
1304+ fn_w_info :
1305+ PMIX_INFO_DESTRUCT (info );
1306+ fn_w_query :
1307+ PMIX_QUERY_DESTRUCT (& query );
12941308
1295- * group_out = group ;
1296- return OMPI_SUCCESS ;
1309+ return ret ;
12971310}
12981311
12991312static int ompi_instance_get_pmix_pset_size (ompi_instance_t * instance , const char * pset_name , size_t * size_out )
0 commit comments