
Commit 9a9730e

ompi/comm: Improve MPI_Comm_create algorithm
Force only procs that are participating in the new Comm to decide what
CID is appropriate. This has two advantages:

* Speed up Comm creation for small communicators: non-participating
  procs will not interfere.
* Reduce CID fragmentation: non-overlapping groups will be allowed to
  use the same CID.

Signed-off-by: Artem Polyakov <[email protected]>
(cherry picked from commit f1f7f20)
1 parent d951055 commit 9a9730e
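The core of the change: only ranks whose rank in the new communicator's local group is defined propose a real candidate CID; everyone else contributes 0, which never exceeds any participant's proposal under the MPI_MAX reduction, so non-participants neither slow the agreement down nor push the agreed CID upward. A minimal, standalone sketch of that idea follows (this is not the patched Open MPI code; the even/odd participation split and the candidate values are invented for illustration):

/* cid_max_sketch.c -- illustration only, not the patched Open MPI code.
 * Compile/run (hypothetical): mpicc cid_max_sketch.c -o cid_max_sketch
 *                             mpirun -np 4 ./cid_max_sketch
 *
 * Every rank takes part in the allreduce, as in ompi_comm_allreduce_getnextcid,
 * but only "participating" ranks propose a real candidate CID; the rest
 * contribute 0, so they cannot influence the MPI_MAX result.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* invented rule: pretend only even ranks belong to the new communicator */
    int participate = (rank % 2 == 0);

    /* a participating rank proposes its lowest locally free CID
     * (here just a made-up per-rank value); non-participants propose 0 */
    int my_candidate = participate ? 4 + rank : 0;

    int agreed_cid;
    MPI_Allreduce(&my_candidate, &agreed_cid, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);

    printf("rank %d (participate=%d): agreed CID = %d\n", rank, participate, agreed_cid);

    MPI_Finalize();
    return 0;
}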

File tree

1 file changed (+65, -28 lines)


ompi/communicator/comm_cid.c

Lines changed: 65 additions & 28 deletions
@@ -21,6 +21,7 @@
  * Copyright (c) 2014-2016 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2016      IBM Corporation.  All rights reserved.
+ * Copyright (c) 2017      Mellanox Technologies. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -303,6 +304,7 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
     ompi_request_t *subreq;
     bool flag;
     int ret;
+    int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0);
@@ -318,81 +320,116 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
     /**
      * This is the real algorithm described in the doc
      */
-    flag = false;
-    context->nextlocal_cid = mca_pml.pml_max_contextid;
-    for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
-        flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i,
-                                                     context->comm);
-        if (true == flag) {
-            context->nextlocal_cid = i;
-            break;
+    if( participate ){
+        flag = false;
+        context->nextlocal_cid = mca_pml.pml_max_contextid;
+        for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
+            flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i,
+                                                         context->comm);
+            if (true == flag) {
+                context->nextlocal_cid = i;
+                break;
+            }
         }
+    } else {
+        context->nextlocal_cid = 0;
     }
 
     ret = context->allreduce_fn (&context->nextlocal_cid, &context->nextcid, 1, MPI_MAX,
                                  context, &subreq);
+    /* there was a failure during non-blocking collective
+     * all we can do is abort
+     */
     if (OMPI_SUCCESS != ret) {
-        ompi_comm_cid_lowest_id = INT64_MAX;
-        OPAL_THREAD_UNLOCK(&ompi_cid_lock);
-        return ret;
+        goto err_exit;
     }
 
-    if ((unsigned int) context->nextlocal_cid == mca_pml.pml_max_contextid) {
-        /* at least one peer ran out of CIDs */
-        if (flag) {
-            opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
-        }
-
-        ompi_comm_cid_lowest_id = INT64_MAX;
-        OPAL_THREAD_UNLOCK(&ompi_cid_lock);
-        return OMPI_ERR_OUT_OF_RESOURCE;
+    if ( ((unsigned int) context->nextlocal_cid == mca_pml.pml_max_contextid) ) {
+        /* Our local CID space is out, others already aware (allreduce above) */
+        ret = OMPI_ERR_OUT_OF_RESOURCE;
+        goto err_exit;
     }
     OPAL_THREAD_UNLOCK(&ompi_cid_lock);
 
     /* next we want to verify that the resulting commid is ok */
     return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, &subreq, 1);
+err_exit:
+    if (participate && flag) {
+        opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
+    }
+    ompi_comm_cid_lowest_id = INT64_MAX;
+    OPAL_THREAD_UNLOCK(&ompi_cid_lock);
+    return ret;
+
 }

 static int ompi_comm_checkcid (ompi_comm_request_t *request)
 {
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
     ompi_request_t *subreq;
     int ret;
+    int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0);
     }
 
-    context->flag = (context->nextcid == context->nextlocal_cid);
-
-    if (!context->flag) {
-        opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
+    if( !participate ){
+        context->flag = 1;
+    } else {
+        context->flag = (context->nextcid == context->nextlocal_cid);
+        if ( participate && !context->flag) {
+            opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
 
-        context->flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators,
-                                                              context->nextcid, context->comm);
+            context->flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators,
+                                                                  context->nextcid, context->comm);
+        }
     }
 
     ++context->iter;
 
     ret = context->allreduce_fn (&context->flag, &context->rflag, 1, MPI_MIN, context, &subreq);
     if (OMPI_SUCCESS == ret) {
         ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, &subreq, 1);
+    } else {
+        if (participate && context->flag ) {
+            opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
+        }
+        ompi_comm_cid_lowest_id = INT64_MAX;
     }
 
     OPAL_THREAD_UNLOCK(&ompi_cid_lock);
-
     return ret;
 }
 
 static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
 {
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
+    int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0);
     }
 
     if (1 == context->rflag) {
+        if( !participate ) {
+            /* we need to provide something sane here
+             * but we cannot use `nextcid` as we may have it
+             * in-use, go ahead with next locally-available CID
+             */
+            context->nextlocal_cid = mca_pml.pml_max_contextid;
+            for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
+                bool flag;
+                flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i,
+                                                             context->comm);
+                if (true == flag) {
+                    context->nextlocal_cid = i;
+                    break;
                }
+            }
+            context->nextcid = context->nextlocal_cid;
+        }
+
         /* set the according values to the newcomm */
         context->newcomm->c_contextid = context->nextcid;
         opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, context->newcomm);
@@ -405,7 +442,7 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
         return OMPI_SUCCESS;
     }
 
-    if (1 == context->flag) {
+    if (participate && (1 == context->flag)) {
         /* we could use this cid, but other don't agree */
         opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, NULL);
         context->start = context->nextcid + 1; /* that's where we can start the next round */
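
Taken together, the three callbacks touched by this commit form a propose/verify/retry agreement: ompi_comm_allreduce_getnextcid reduces each rank's candidate with MPI_MAX, ompi_comm_checkcid reduces a per-rank acceptance flag with MPI_MIN, and ompi_comm_nextcid_check_flag either commits the agreed CID or restarts the search from nextcid + 1. A blocking, standalone sketch of that loop follows; the per-rank `used` table, the small CID limit, and the omission of the participate and out-of-CIDs handling are invented simplifications, and the real code chains non-blocking allreduces through ompi_comm_request_schedule_append rather than calling MPI_Allreduce directly.

/* cid_agreement_sketch.c -- illustration only: a blocking rendering of the
 * propose/verify/retry flow that getnextcid/checkcid/nextcid_check_flag
 * implement with chained non-blocking allreduces.
 * Compile/run (hypothetical): mpicc cid_agreement_sketch.c -o cid_agreement_sketch
 *                             mpirun -np 3 ./cid_agreement_sketch
 */
#include <mpi.h>
#include <stdio.h>

#define MAX_CID 32   /* invented small limit; stands in for pml_max_contextid */

int main(int argc, char **argv)
{
    int rank;
    int used[MAX_CID] = {0};   /* CIDs this rank has already reserved */

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* invented setup: each rank already holds a different low CID so the
     * first proposals disagree and at least one round is repeated */
    used[rank % 3] = 1;

    int start = 0, agreed = -1;
    while (agreed < 0) {
        /* 1. propose the lowest locally free CID (cf. getnextcid) */
        int candidate = MAX_CID;
        for (int i = start; i < MAX_CID; ++i) {
            if (!used[i]) { candidate = i; break; }
        }

        int nextcid;
        MPI_Allreduce(&candidate, &nextcid, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);

        /* 2. verify: can every rank take the agreed value?
         *    (cf. checkcid, where the flag is reduced with MPI_MIN) */
        int flag = (nextcid < MAX_CID && !used[nextcid]);
        int rflag;
        MPI_Allreduce(&flag, &rflag, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);

        /* 3. commit, or retry from nextcid + 1 (cf. nextcid_check_flag);
         * no out-of-CIDs handling here -- the invented table guarantees
         * termination well before MAX_CID */
        if (rflag) {
            used[nextcid] = 1;
            agreed = nextcid;
        } else {
            start = nextcid + 1;
        }
    }

    printf("rank %d agreed on CID %d\n", rank, agreed);
    MPI_Finalize();
    return 0;
}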
