Commit 88834e9

ompi/comm: Improve MPI_Comm_create algorithm
Force only procs that are participating in the new Comm to decide what CID is appropriate. This has two advantages:

* Speed up Comm creation for small communicators: non-participating procs will not interfere.
* Reduce CID fragmentation: non-overlapping groups are allowed to use the same CID.

Signed-off-by: Artem Polyakov <[email protected]>
(cherry picked from commit f1f7f20)
1 parent 11ffb66 commit 88834e9
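For context, a minimal sketch of the case this commit targets (the even/odd rank split and variable names are illustrative, not taken from the commit): MPI_Comm_create is collective over the parent communicator, yet only the ranks inside the supplied group receive a real communicator and therefore need a CID. Every other rank has grp_my_rank == MPI_UNDEFINED in the would-be local group and comes back with MPI_COMM_NULL, which is exactly the "participate" test the diff below adds.

/* Build a communicator from the even ranks of MPI_COMM_WORLD.
 * Odd ranks are not in the group: with this commit they no longer
 * take part in the CID agreement and simply get MPI_COMM_NULL. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Collect the even ranks into a group. */
    MPI_Group world_grp, even_grp;
    MPI_Comm_group(MPI_COMM_WORLD, &world_grp);
    int nevens = (size + 1) / 2;
    int evens[nevens];
    for (int i = 0; i < nevens; ++i) {
        evens[i] = 2 * i;
    }
    MPI_Group_incl(world_grp, nevens, evens, &even_grp);

    /* Collective over MPI_COMM_WORLD, but only group members
     * get a real communicator (and thus need a new CID). */
    MPI_Comm even_comm;
    MPI_Comm_create(MPI_COMM_WORLD, even_grp, &even_comm);

    if (MPI_COMM_NULL == even_comm) {
        printf("rank %d: not a participant, got MPI_COMM_NULL\n", rank);
    } else {
        printf("rank %d: member of the new communicator\n", rank);
        MPI_Comm_free(&even_comm);
    }

    MPI_Group_free(&even_grp);
    MPI_Group_free(&world_grp);
    MPI_Finalize();
    return 0;
}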

File tree

1 file changed: +64 −28 lines

ompi/communicator/comm_cid.c

@@ -303,6 +303,7 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
     ompi_request_t *subreq;
     bool flag;
     int ret;
+    int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0);
@@ -318,81 +319,116 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
     /**
      * This is the real algorithm described in the doc
      */
-    flag = false;
-    context->nextlocal_cid = mca_pml.pml_max_contextid;
-    for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
-        flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i,
-                                                     context->comm);
-        if (true == flag) {
-            context->nextlocal_cid = i;
-            break;
+    if( participate ){
+        flag = false;
+        context->nextlocal_cid = mca_pml.pml_max_contextid;
+        for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
+            flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i,
+                                                         context->comm);
+            if (true == flag) {
+                context->nextlocal_cid = i;
+                break;
+            }
         }
+    } else {
+        context->nextlocal_cid = 0;
     }
 
     ret = context->allreduce_fn (&context->nextlocal_cid, &context->nextcid, 1, MPI_MAX,
                                  context, &subreq);
+    /* there was a failure during non-blocking collective
+     * all we can do is abort
+     */
     if (OMPI_SUCCESS != ret) {
-        ompi_comm_cid_lowest_id = INT64_MAX;
-        OPAL_THREAD_UNLOCK(&ompi_cid_lock);
-        return ret;
+        goto err_exit;
     }
 
-    if ((unsigned int) context->nextlocal_cid == mca_pml.pml_max_contextid) {
-        /* at least one peer ran out of CIDs */
-        if (flag) {
-            opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
-        }
-
-        ompi_comm_cid_lowest_id = INT64_MAX;
-        OPAL_THREAD_UNLOCK(&ompi_cid_lock);
-        return OMPI_ERR_OUT_OF_RESOURCE;
+    if ( ((unsigned int) context->nextlocal_cid == mca_pml.pml_max_contextid) ) {
+        /* Our local CID space is out, others already aware (allreduce above) */
+        ret = OMPI_ERR_OUT_OF_RESOURCE;
+        goto err_exit;
     }
     OPAL_THREAD_UNLOCK(&ompi_cid_lock);
 
     /* next we want to verify that the resulting commid is ok */
     return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, &subreq, 1);
+err_exit:
+    if (participate && flag) {
+        opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
+    }
+    ompi_comm_cid_lowest_id = INT64_MAX;
+    OPAL_THREAD_UNLOCK(&ompi_cid_lock);
+    return ret;
+
 }
 
 static int ompi_comm_checkcid (ompi_comm_request_t *request)
 {
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
     ompi_request_t *subreq;
     int ret;
+    int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0);
     }
 
-    context->flag = (context->nextcid == context->nextlocal_cid);
-
-    if (!context->flag) {
-        opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
+    if( !participate ){
+        context->flag = 1;
+    } else {
+        context->flag = (context->nextcid == context->nextlocal_cid);
+        if ( participate && !context->flag) {
+            opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
 
-        context->flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators,
-                                                              context->nextcid, context->comm);
+            context->flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators,
+                                                                  context->nextcid, context->comm);
+        }
     }
 
     ++context->iter;
 
     ret = context->allreduce_fn (&context->flag, &context->rflag, 1, MPI_MIN, context, &subreq);
     if (OMPI_SUCCESS == ret) {
         ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, &subreq, 1);
+    } else {
+        if (participate && context->flag ) {
+            opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
+        }
+        ompi_comm_cid_lowest_id = INT64_MAX;
     }
 
     OPAL_THREAD_UNLOCK(&ompi_cid_lock);
-
     return ret;
 }
 
 static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
 {
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
+    int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0);
     }
 
     if (1 == context->rflag) {
+        if( !participate ) {
+            /* we need to provide something sane here
+             * but we cannot use `nextcid` as we may have it
+             * in-use, go ahead with next locally-available CID
+             */
+            context->nextlocal_cid = mca_pml.pml_max_contextid;
+            for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
+                bool flag;
+                flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i,
+                                                             context->comm);
+                if (true == flag) {
+                    context->nextlocal_cid = i;
+                    break;
+                }
+            }
+            context->nextcid = context->nextlocal_cid;
+        }
+
         /* set the according values to the newcomm */
         context->newcomm->c_contextid = context->nextcid;
         opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, context->newcomm);
@@ -405,7 +441,7 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
         return OMPI_SUCCESS;
     }
 
-    if (1 == context->flag) {
+    if (participate && (1 == context->flag)) {
         /* we could use this cid, but other don't agree */
         opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, NULL);
         context->start = context->nextcid + 1; /* that's where we can start the next round */
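The claim loops in this diff rely on the test-and-set contract of opal_pointer_array_test_and_set_item: it returns true only when slot i was free and has now been set, so a true flag means this process owns CID i and must put NULL back if the agreement rounds later fail. Below is a hedged stand-in sketch of that contract in plain C; the mock array and helper names are illustrative and single-threaded, not the OPAL implementation (the real one is thread-safe).

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_CID 16           /* stand-in for mca_pml.pml_max_contextid */
static void *slots[MAX_CID]; /* stand-in for ompi_mpi_communicators */

/* Returns true only if slot i was empty and has now been claimed. */
static bool test_and_set_item(size_t i, void *value)
{
    if (NULL != slots[i]) {
        return false; /* already taken by another communicator */
    }
    slots[i] = value;
    return true;
}

/* Scan from `start` for the first free slot, as the nextlocal_cid
 * loops do; returns MAX_CID when the local CID space is exhausted. */
static unsigned int claim_next_cid(unsigned int start, void *comm)
{
    for (unsigned int i = start; i < MAX_CID; ++i) {
        if (test_and_set_item(i, comm)) {
            return i;
        }
    }
    return MAX_CID;
}

int main(void)
{
    int comm_a, comm_b; /* dummy stand-ins for communicator objects */
    unsigned int cid_a = claim_next_cid(0, &comm_a); /* claims slot 0 */
    unsigned int cid_b = claim_next_cid(0, &comm_b); /* slot 0 taken, claims 1 */
    printf("cid_a=%u cid_b=%u\n", cid_a, cid_b);
    return 0;
}

Agreement on the proposal then takes two non-blocking allreduce rounds per iteration, as in the diff: MPI_MAX over the locally claimed CIDs, followed by MPI_MIN over the per-process acceptance flags.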
