 * Copyright (c) 2014-2016 Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * Copyright (c) 2016      IBM Corporation. All rights reserved.
 * Copyright (c) 2017      Mellanox Technologies. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -303,6 +304,7 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
303304 ompi_request_t * subreq ;
304305 bool flag ;
305306 int ret ;
307+ int participate = (context -> newcomm -> c_local_group -> grp_my_rank != MPI_UNDEFINED );
306308
307309 if (OPAL_THREAD_TRYLOCK (& ompi_cid_lock )) {
308310 return ompi_comm_request_schedule_append (request , ompi_comm_allreduce_getnextcid , NULL , 0 );
@@ -318,81 +320,116 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
318320 /**
319321 * This is the real algorithm described in the doc
320322 */
321- flag = false;
322- context -> nextlocal_cid = mca_pml .pml_max_contextid ;
323- for (unsigned int i = context -> start ; i < mca_pml .pml_max_contextid ; ++ i ) {
324- flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i ,
325- context -> comm );
326- if (true == flag ) {
327- context -> nextlocal_cid = i ;
328- break ;
323+ if ( participate ){
324+ flag = false;
325+ context -> nextlocal_cid = mca_pml .pml_max_contextid ;
326+ for (unsigned int i = context -> start ; i < mca_pml .pml_max_contextid ; ++ i ) {
327+ flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i ,
328+ context -> comm );
329+ if (true == flag ) {
330+ context -> nextlocal_cid = i ;
331+ break ;
332+ }
329333 }
334+ } else {
335+ context -> nextlocal_cid = 0 ;
330336 }
331337
332338 ret = context -> allreduce_fn (& context -> nextlocal_cid , & context -> nextcid , 1 , MPI_MAX ,
333339 context , & subreq );
340+ /* there was a failure during non-blocking collective
341+ * all we can do is abort
342+ */
334343 if (OMPI_SUCCESS != ret ) {
335- ompi_comm_cid_lowest_id = INT64_MAX ;
336- OPAL_THREAD_UNLOCK (& ompi_cid_lock );
337- return ret ;
344+ goto err_exit ;
338345 }
339346
340- if ((unsigned int ) context -> nextlocal_cid == mca_pml .pml_max_contextid ) {
341- /* at least one peer ran out of CIDs */
342- if (flag ) {
343- opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , context -> nextlocal_cid , NULL );
344- }
345-
346- ompi_comm_cid_lowest_id = INT64_MAX ;
347- OPAL_THREAD_UNLOCK (& ompi_cid_lock );
348- return OMPI_ERR_OUT_OF_RESOURCE ;
347+ if ( ((unsigned int ) context -> nextlocal_cid == mca_pml .pml_max_contextid ) ) {
348+ /* Our local CID space is out, others already aware (allreduce above) */
349+ ret = OMPI_ERR_OUT_OF_RESOURCE ;
350+ goto err_exit ;
349351 }
350352 OPAL_THREAD_UNLOCK (& ompi_cid_lock );
351353
352354 /* next we want to verify that the resulting commid is ok */
353355 return ompi_comm_request_schedule_append (request , ompi_comm_checkcid , & subreq , 1 );
356+ err_exit :
357+ if (participate && flag ) {
358+ opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , context -> nextlocal_cid , NULL );
359+ }
360+ ompi_comm_cid_lowest_id = INT64_MAX ;
361+ OPAL_THREAD_UNLOCK (& ompi_cid_lock );
362+ return ret ;
363+
354364}
355365
356366static int ompi_comm_checkcid (ompi_comm_request_t * request )
357367{
358368 ompi_comm_cid_context_t * context = (ompi_comm_cid_context_t * ) request -> context ;
359369 ompi_request_t * subreq ;
360370 int ret ;
371+ int participate = (context -> newcomm -> c_local_group -> grp_my_rank != MPI_UNDEFINED );
361372
362373 if (OPAL_THREAD_TRYLOCK (& ompi_cid_lock )) {
363374 return ompi_comm_request_schedule_append (request , ompi_comm_checkcid , NULL , 0 );
364375 }
365376
366- context -> flag = (context -> nextcid == context -> nextlocal_cid );
367-
368- if (!context -> flag ) {
369- opal_pointer_array_set_item (& ompi_mpi_communicators , context -> nextlocal_cid , NULL );
377+ if ( !participate ){
378+ context -> flag = 1 ;
379+ } else {
380+ context -> flag = (context -> nextcid == context -> nextlocal_cid );
381+ if ( participate && !context -> flag ) {
382+ opal_pointer_array_set_item (& ompi_mpi_communicators , context -> nextlocal_cid , NULL );
370383
371- context -> flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators ,
372- context -> nextcid , context -> comm );
384+ context -> flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators ,
385+ context -> nextcid , context -> comm );
386+ }
373387 }
374388
375389 ++ context -> iter ;
376390
377391 ret = context -> allreduce_fn (& context -> flag , & context -> rflag , 1 , MPI_MIN , context , & subreq );
378392 if (OMPI_SUCCESS == ret ) {
379393 ompi_comm_request_schedule_append (request , ompi_comm_nextcid_check_flag , & subreq , 1 );
394+ } else {
395+ if (participate && context -> flag ) {
396+ opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , context -> nextlocal_cid , NULL );
397+ }
398+ ompi_comm_cid_lowest_id = INT64_MAX ;
380399 }
381400
382401 OPAL_THREAD_UNLOCK (& ompi_cid_lock );
383-
384402 return ret ;
385403}
386404
387405static int ompi_comm_nextcid_check_flag (ompi_comm_request_t * request )
388406{
389407 ompi_comm_cid_context_t * context = (ompi_comm_cid_context_t * ) request -> context ;
408+ int participate = (context -> newcomm -> c_local_group -> grp_my_rank != MPI_UNDEFINED );
390409
391410 if (OPAL_THREAD_TRYLOCK (& ompi_cid_lock )) {
392411 return ompi_comm_request_schedule_append (request , ompi_comm_nextcid_check_flag , NULL , 0 );
393412 }
394413
395414 if (1 == context -> rflag ) {
415+ if ( !participate ) {
416+ /* we need to provide something sane here
417+ * but we cannot use `nextcid` as we may have it
418+ * in-use, go ahead with next locally-available CID
419+ */
420+ context -> nextlocal_cid = mca_pml .pml_max_contextid ;
421+ for (unsigned int i = context -> start ; i < mca_pml .pml_max_contextid ; ++ i ) {
422+ bool flag ;
423+ flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i ,
424+ context -> comm );
425+ if (true == flag ) {
426+ context -> nextlocal_cid = i ;
427+ break ;
428+ }
429+ }
430+ context -> nextcid = context -> nextlocal_cid ;
431+ }
432+
396433 /* set the according values to the newcomm */
397434 context -> newcomm -> c_contextid = context -> nextcid ;
398435 opal_pointer_array_set_item (& ompi_mpi_communicators , context -> nextcid , context -> newcomm );
@@ -405,7 +442,7 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
405442 return OMPI_SUCCESS ;
406443 }
407444
408- if (1 == context -> flag ) {
445+ if (participate && ( 1 == context -> flag ) ) {
409446 /* we could use this cid, but other don't agree */
410447 opal_pointer_array_set_item (& ompi_mpi_communicators , context -> nextcid , NULL );
411448 context -> start = context -> nextcid + 1 ; /* that's where we can start the next round */
0 commit comments