Skip to content

Commit e6c2f22

Browse files
committed
sessions: add support for ucx more
Greatly simplify support for MPI_Comm_create_from_group and MPI_Intercomm_create_from_group by removing the need to support the 128-bit excid notion. Only non-ucx related pieces of this commit where cherry-picked over to v5.0.x (cherry picked from commit 95e3323) Signed-off-by: Howard Pritchard <[email protected]>
1 parent d1bb190 commit e6c2f22

File tree

6 files changed

+250
-50
lines changed

6 files changed

+250
-50
lines changed

ompi/communicator/comm.c

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
* Copyright (c) 2015 Mellanox Technologies. All rights reserved.
2525
* Copyright (c) 2017-2022 IBM Corporation. All rights reserved.
2626
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
27-
* Copyright (c) 2018-2022 Triad National Security, LLC. All rights
27+
* Copyright (c) 2018-2024 Triad National Security, LLC. All rights
2828
* reserved.
2929
* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
3030
* $COPYRIGHT$
@@ -1741,7 +1741,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
17411741
ompi_communicator_t **newintercomm)
17421742
{
17431743
ompi_communicator_t *newcomp = NULL, *local_comm, *leader_comm = MPI_COMM_NULL;
1744-
ompi_comm_extended_cid_block_t new_block;
1744+
ompi_comm_extended_cid_block_t new_block = {0};
17451745
bool i_am_leader = local_leader == local_group->grp_my_rank;
17461746
ompi_proc_t **rprocs;
17471747
uint64_t data[4];
@@ -1867,14 +1867,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
18671867
return rc;
18681868
}
18691869

1870-
/* will be using a communicator ID derived from the bridge communicator to save some time */
1871-
new_block.block_cid.cid_base = data[1];
1872-
new_block.block_cid.cid_sub.u64 = data[2];
1873-
new_block.block_nextsub = 0;
1874-
new_block.block_nexttag = 0;
1875-
new_block.block_level = (int8_t) data[3];
1876-
1877-
rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, &new_block, false, OMPI_COMM_CID_GROUP_NEW);
1870+
rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, NULL, false, OMPI_COMM_CID_GROUP_NEW);
18781871
if ( OMPI_SUCCESS != rc ) {
18791872
OBJ_RELEASE(newcomp);
18801873
return rc;

ompi/communicator/comm_cid.c

Lines changed: 176 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -308,21 +308,16 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
308308
const void *arg0, const void *arg1, bool send_first, int mode,
309309
ompi_request_t **req)
310310
{
311-
pmix_info_t pinfo, *results = NULL;
311+
pmix_info_t *pinfo, *results = NULL;
312312
size_t nresults;
313-
opal_process_name_t *name_array = NULL;
314-
char *tag = NULL;
315-
size_t proc_count;
316-
size_t cid_base = 0;
313+
opal_process_name_t opal_proc_name;
317314
bool cid_base_set = false;
315+
char *tag = NULL;
316+
size_t proc_count = 0, rproc_count = 0, tproc_count = 0, cid_base = 0UL, ninfo;
318317
int rc, leader_rank;
319-
int ret = OMPI_SUCCESS;
320-
pmix_proc_t *procs = NULL;
321-
322-
rc = ompi_group_to_proc_name_array (newcomm->c_local_group, &name_array, &proc_count);
323-
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
324-
return rc;
325-
}
318+
pmix_proc_t *procs;
319+
void *grpinfo = NULL, *list = NULL;
320+
pmix_data_array_t darray;
326321

327322
switch (mode) {
328323
case OMPI_COMM_CID_GROUP_NEW:
@@ -339,15 +334,75 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
339334
break;
340335
}
341336

342-
PMIX_INFO_LOAD(&pinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
337+
grpinfo = PMIx_Info_list_start();
338+
if (NULL == grpinfo) {
339+
rc = OMPI_ERR_OUT_OF_RESOURCE;
340+
goto fn_exit;
341+
}
342+
343+
rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
344+
if (PMIX_SUCCESS != rc) {
345+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
346+
rc = OMPI_ERR_OUT_OF_RESOURCE;
347+
goto fn_exit;
348+
}
349+
350+
list = PMIx_Info_list_start();
351+
352+
size_t c_index = (size_t)newcomm->c_index;
353+
rc = PMIx_Info_list_add(list, PMIX_GROUP_LOCAL_CID, &c_index, PMIX_SIZE);
354+
if (PMIX_SUCCESS != rc) {
355+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
356+
rc = OMPI_ERR_OUT_OF_RESOURCE;
357+
goto fn_exit;
358+
}
359+
360+
rc = PMIx_Info_list_convert(list, &darray);
361+
if (PMIX_SUCCESS != rc) {
362+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__));
363+
rc = OMPI_ERR_OUT_OF_RESOURCE;
364+
goto fn_exit;
365+
}
366+
rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_INFO, &darray, PMIX_DATA_ARRAY);
367+
PMIX_DATA_ARRAY_DESTRUCT(&darray);
368+
if (PMIX_SUCCESS != rc) {
369+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
370+
rc = OMPI_ERR_OUT_OF_RESOURCE;
371+
goto fn_exit;
372+
}
373+
374+
rc = PMIx_Info_list_convert(grpinfo, &darray);
375+
if (PMIX_SUCCESS != rc) {
376+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__));
377+
rc = OMPI_ERR_OUT_OF_RESOURCE;
378+
goto fn_exit;
379+
}
380+
381+
pinfo = (pmix_info_t*)darray.array;
382+
ninfo = darray.size;
383+
384+
proc_count = newcomm->c_local_group->grp_proc_count;
385+
if ( OMPI_COMM_IS_INTER (newcomm) ){
386+
rproc_count = newcomm->c_remote_group->grp_proc_count;
387+
}
388+
389+
PMIX_PROC_CREATE(procs, proc_count + rproc_count);
343390

344-
PMIX_PROC_CREATE(procs, proc_count);
345391
for (size_t i = 0 ; i < proc_count; ++i) {
346-
OPAL_PMIX_CONVERT_NAME(&procs[i],&name_array[i]);
392+
opal_proc_name = ompi_group_get_proc_name(newcomm->c_local_group, i);
393+
OPAL_PMIX_CONVERT_NAME(&procs[i],&opal_proc_name);
394+
}
395+
for (size_t i = 0; i < rproc_count; ++i) {
396+
opal_proc_name = ompi_group_get_proc_name(newcomm->c_remote_group, i);
397+
OPAL_PMIX_CONVERT_NAME(&procs[proc_count+i],&opal_proc_name);
347398
}
348399

349-
rc = PMIx_Group_construct(tag, procs, proc_count, &pinfo, 1, &results, &nresults);
350-
PMIX_INFO_DESTRUCT(&pinfo);
400+
tproc_count = proc_count + rproc_count;
401+
402+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n",
403+
tag, tproc_count, ninfo, cid_base));
404+
rc = PMIx_Group_construct(tag, procs, tproc_count, pinfo, ninfo, &results, &nresults);
405+
PMIX_DATA_ARRAY_DESTRUCT(&darray);
351406
if(PMIX_SUCCESS != rc) {
352407
char msg_string[1024];
353408
switch (rc) {
@@ -359,7 +414,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
359414
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups",
360415
msg_string);
361416

362-
ret = MPI_ERR_UNSUPPORTED_OPERATION;
417+
rc = MPI_ERR_UNSUPPORTED_OPERATION;
363418
break;
364419
case PMIX_ERR_NOT_SUPPORTED:
365420
sprintf(msg_string,"PMIx server does not support PMIx Group operations");
@@ -368,10 +423,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
368423
true,
369424
"MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups",
370425
msg_string);
371-
ret = MPI_ERR_UNSUPPORTED_OPERATION;
426+
rc = MPI_ERR_UNSUPPORTED_OPERATION;
372427
break;
373428
default:
374-
ret = opal_pmix_convert_status(rc);
429+
rc = opal_pmix_convert_status(rc);
375430
break;
376431
}
377432
goto fn_exit;
@@ -381,23 +436,28 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
381436
if (PMIX_CHECK_KEY(&results[i], PMIX_GROUP_CONTEXT_ID)) {
382437
PMIX_VALUE_GET_NUMBER(rc, &results[i].value, cid_base, size_t);
383438
if(PMIX_SUCCESS != rc) {
384-
ret = opal_pmix_convert_status(rc);
439+
rc = opal_pmix_convert_status(rc);
385440
goto fn_exit;
386441
}
387442
cid_base_set = true;
388443
break;
389444
}
390445
}
391446

447+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n",
448+
tag, tproc_count, ninfo, cid_base));
449+
450+
/* destruct the group */
392451
rc = PMIx_Group_destruct (tag, NULL, 0);
393452
if(PMIX_SUCCESS != rc) {
394-
ret = opal_pmix_convert_status(rc);
453+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_destruct failed %s", PMIx_Error_string(rc)));
454+
rc = opal_pmix_convert_status(rc);
395455
goto fn_exit;
396456
}
397457

398458
if (!cid_base_set) {
399459
opal_show_help("help-comm.txt", "cid-base-not-set", true);
400-
ret = OMPI_ERROR;
460+
rc = OMPI_ERROR;
401461
goto fn_exit;
402462
}
403463

@@ -410,16 +470,19 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
410470
}
411471

412472
if(NULL != procs) {
413-
PMIX_PROC_FREE(procs, proc_count);
473+
PMIX_PROC_FREE(procs, tproc_count);
414474
procs = NULL;
415475
}
416476

417-
if(NULL != name_array) {
418-
free (name_array);
419-
name_array = NULL;
477+
if (NULL != grpinfo) {
478+
PMIx_Info_list_release(grpinfo);
420479
}
421480

422-
return ret;
481+
if (NULL != list) {
482+
PMIx_Info_list_release(list);
483+
}
484+
485+
return rc;
423486
}
424487

425488
static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm,
@@ -444,6 +507,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
444507
block = &comm->c_contextidb;
445508
}
446509

510+
for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) {
511+
bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm);
512+
if (true == flag) {
513+
newcomm->c_index = i;
514+
break;
515+
}
516+
}
517+
assert(newcomm->c_index > 2);
518+
447519
if (NULL == arg1) {
448520
if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode ||
449521
!ompi_comm_extended_cid_block_available (&comm->c_contextidb)) {
@@ -466,14 +538,6 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
466538
(void) ompi_comm_extended_cid_block_new (block, &newcomm->c_contextidb, is_new_block);
467539
}
468540

469-
for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) {
470-
bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm);
471-
if (true == flag) {
472-
newcomm->c_index = i;
473-
break;
474-
}
475-
}
476-
477541
newcomm->c_contextid = newcomm->c_contextidb.block_cid;
478542

479543
opal_hash_table_set_value_ptr (&ompi_comm_hash, &newcomm->c_contextid,
@@ -500,7 +564,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com
500564
functions but the pml does not support these functions so return not supported */
501565
if (NULL == comm) {
502566
char msg_string[1024];
503-
sprintf(msg_string,"The PML being used - %s - does not support MPI sessions related features",
567+
sprintf(msg_string,"The PML being used - %s - does not support MPI sessions related features",
504568
mca_pml_base_selected_component.pmlm_version.mca_component_name);
505569
opal_show_help("help-comm.txt",
506570
"MPI function not supported",
@@ -855,6 +919,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
855919
ompi_comm_cid_context_t *context;
856920
ompi_comm_request_t *request;
857921
ompi_request_t *subreq;
922+
uint32_t comm_size;
858923
int ret = 0;
859924

860925
/* the caller should not pass NULL for comm (it may be the same as *newcomm) */
@@ -876,6 +941,25 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
876941

877942
request->context = &context->super;
878943

944+
/* Prep communicator for handling remote cids if needed */
945+
946+
if (!OMPI_COMM_IS_GLOBAL_INDEX(*newcomm)) {
947+
if (OMPI_COMM_IS_INTER(*newcomm)) {
948+
comm_size = ompi_comm_remote_size(*newcomm);
949+
} else {
950+
comm_size = ompi_comm_size(*newcomm);
951+
}
952+
953+
(*newcomm)->c_index_vec = (uint32_t *)calloc(comm_size, sizeof(uint32_t));
954+
if (NULL == (*newcomm)->c_index_vec) {
955+
return OMPI_ERR_OUT_OF_RESOURCE;
956+
}
957+
958+
if (OMPI_COMM_IS_INTRA(*newcomm)) {
959+
(*newcomm)->c_index_vec[(*newcomm)->c_my_rank] = (*newcomm)->c_index;
960+
}
961+
}
962+
879963
if (MPI_UNDEFINED != (*newcomm)->c_local_group->grp_my_rank) {
880964
/* Initialize the PML stuff in the newcomm */
881965
if ( OMPI_SUCCESS != (ret = MCA_PML_CALL(add_comm(*newcomm))) ) {
@@ -926,6 +1010,61 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm
9261010
return rc;
9271011
}
9281012

1013+
int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uint32_t *remote_cid)
1014+
{
1015+
ompi_proc_t *ompi_proc;
1016+
pmix_proc_t pmix_proc;
1017+
pmix_info_t tinfo[2];
1018+
pmix_value_t *val = NULL;
1019+
ompi_comm_extended_cid_t excid;
1020+
int rc = OMPI_SUCCESS;
1021+
size_t remote_cid64;
1022+
1023+
assert(NULL != remote_cid);
1024+
1025+
ompi_proc = ompi_comm_peer_lookup(comm, dest);
1026+
OPAL_PMIX_CONVERT_NAME(&pmix_proc, &ompi_proc->super.proc_name);
1027+
1028+
PMIx_Info_construct(&tinfo[0]);
1029+
PMIX_INFO_LOAD(&tinfo[0], PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32);
1030+
1031+
excid = ompi_comm_get_extended_cid(comm);
1032+
1033+
PMIX_INFO_CONSTRUCT(&tinfo[1]);
1034+
PMIX_INFO_LOAD(&tinfo[1], PMIX_GROUP_CONTEXT_ID, &excid.cid_base, PMIX_SIZE);
1035+
PMIX_INFO_SET_QUALIFIER(&tinfo[1]);
1036+
if (PMIX_SUCCESS != (rc = PMIx_Get(&pmix_proc, PMIX_GROUP_LOCAL_CID, tinfo, 2, &val))) {
1037+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s", excid.cid_base, PMIx_Error_string(rc)));
1038+
rc = OMPI_ERR_NOT_FOUND;
1039+
goto done;
1040+
}
1041+
1042+
if (NULL == val) {
1043+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL"));
1044+
rc = OMPI_ERR_NOT_FOUND;
1045+
goto done;
1046+
}
1047+
1048+
if (val->type != PMIX_SIZE) {
1049+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch"));
1050+
rc = OMPI_ERR_TYPE_MISMATCH;
1051+
goto done;
1052+
}
1053+
1054+
PMIX_VALUE_GET_NUMBER(rc, val, remote_cid64, size_t);
1055+
rc = OMPI_SUCCESS;
1056+
*remote_cid = (uint32_t)remote_cid64;
1057+
comm->c_index_vec[dest] = (uint32_t)remote_cid64;
1058+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld", *remote_cid, excid.cid_base));
1059+
1060+
done:
1061+
if (NULL != val) {
1062+
PMIX_VALUE_RELEASE(val);
1063+
}
1064+
1065+
return rc;
1066+
}
1067+
9291068
static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request)
9301069
{
9311070
ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;

0 commit comments

Comments
 (0)