@@ -354,8 +354,10 @@ static int create_srq(mca_btl_openib_module_t *openib_btl)
354354 } else
355355#endif
356356 {
357+ opal_mutex_lock (& openib_btl -> device -> device_lock );
357358 openib_btl -> qps [qp ].u .srq_qp .srq =
358359 ibv_create_srq (openib_btl -> device -> ib_pd , & attr );
360+ opal_mutex_unlock (& openib_btl -> device -> device_lock );
359361 }
360362 if (NULL == openib_btl -> qps [qp ].u .srq_qp .srq ) {
361363 mca_btl_openib_show_init_error (__FILE__ , __LINE__ ,
@@ -404,12 +406,32 @@ static int create_srq(mca_btl_openib_module_t *openib_btl)
404406 return OPAL_SUCCESS ;
405407}
406408
407- static int mca_btl_openib_size_queues_nolock (struct mca_btl_openib_module_t * openib_btl , size_t nprocs )
/*
 * Create the module's SRQs if the module has no peers yet.
 *
 * With openib_btl->ib_lock held, calls create_srq() when num_peers is 0 and
 * the component is configured with at least one SRQ or XRC queue pair;
 * otherwise it does nothing.
 *
 * Returns OPAL_SUCCESS when nothing had to be created, else whatever
 * create_srq() returned.
 *
 * NOTE(review): num_peers is not updated here. In the visible code it is
 * only incremented in openib_btl_size_queues(), under a separate acquisition
 * of ib_lock, and that function returns early without incrementing when
 * nprocs == 0. It looks like two callers (concurrent, or back to back with
 * nprocs == 0) could both observe num_peers == 0 and call create_srq()
 * twice -- confirm, and consider a dedicated "SRQs already created" flag.
 */
409+ static int openib_btl_prepare (struct mca_btl_openib_module_t * openib_btl )
410+ {
411+ int rc = OPAL_SUCCESS ;
/* ib_lock covers both the num_peers check and the create_srq() call. */
412+ opal_mutex_lock (& openib_btl -> ib_lock );
413+ if (0 == openib_btl -> num_peers &&
414+ (mca_btl_openib_component .num_srq_qps > 0 ||
415+ mca_btl_openib_component .num_xrc_qps > 0 )) {
416+ rc = create_srq (openib_btl );
417+ }
418+ opal_mutex_unlock (& openib_btl -> ib_lock );
419+ return rc ;
420+ }
421+
422+
423+ static int openib_btl_size_queues (struct mca_btl_openib_module_t * openib_btl , size_t nprocs )
408424{
409425 uint32_t send_cqes , recv_cqes ;
410426 int rc = OPAL_SUCCESS , qp ;
411427 mca_btl_openib_device_t * device = openib_btl -> device ;
412428
429+ if ( 0 == nprocs ){
430+ /* nothing to do */
431+ return OPAL_SUCCESS ;
432+ }
433+
434+ opal_mutex_lock (& openib_btl -> ib_lock );
413435 /* figure out reasonable sizes for completion queues */
414436 for (qp = 0 ; qp < mca_btl_openib_component .num_qps ; qp ++ ) {
415437 if (BTL_OPENIB_QP_TYPE_SRQ (qp )) {
@@ -420,8 +442,11 @@ static int mca_btl_openib_size_queues_nolock(struct mca_btl_openib_module_t* ope
420442 mca_btl_openib_component .qp_infos [qp ].u .pp_qp .rd_rsv ) * nprocs ;
421443 recv_cqes = send_cqes ;
422444 }
445+
446+ opal_mutex_lock (& openib_btl -> device -> device_lock );
423447 openib_btl -> device -> cq_size [qp_cq_prio (qp )] += recv_cqes ;
424448 openib_btl -> device -> cq_size [BTL_OPENIB_LP_CQ ] += send_cqes ;
449+ opal_mutex_unlock (& openib_btl -> device -> device_lock );
425450 }
426451
427452 rc = adjust_cq (device , BTL_OPENIB_HP_CQ );
@@ -434,14 +459,9 @@ static int mca_btl_openib_size_queues_nolock(struct mca_btl_openib_module_t* ope
434459 goto out ;
435460 }
436461
437- if (0 == openib_btl -> num_peers &&
438- (mca_btl_openib_component .num_srq_qps > 0 ||
439- mca_btl_openib_component .num_xrc_qps > 0 )) {
440- rc = create_srq (openib_btl );
441- }
442-
443462 openib_btl -> num_peers += nprocs ;
444463out :
464+ opal_mutex_unlock (& openib_btl -> ib_lock );
445465 return rc ;
446466}
447467
@@ -601,13 +621,15 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
601621 return OPAL_SUCCESS ;
602622}
603623
604- static int prepare_device_for_use_nolock (mca_btl_openib_device_t * device )
624+ static int prepare_device_for_use (mca_btl_openib_device_t * device )
605625{
606626 mca_btl_openib_frag_init_data_t * init_data ;
607- int rc , length ;
627+ int rc = OPAL_SUCCESS , length ;
628+
629+ opal_mutex_lock (& device -> device_lock );
608630
609631 if (device -> ready_for_use ) {
610- return OPAL_SUCCESS ;
632+ goto exit ;
611633 }
612634
613635 /* For each btl module that we made - find every
@@ -628,7 +650,8 @@ static int prepare_device_for_use_nolock (mca_btl_openib_device_t *device)
628650 sizeof (mca_btl_openib_device_qp_t ));
629651 if (NULL == device -> qps ) {
630652 BTL_ERROR (("Failed malloc: %s:%d" , __FILE__ , __LINE__ ));
631- return OPAL_ERR_OUT_OF_RESOURCE ;
653+ rc = OPAL_ERR_OUT_OF_RESOURCE ;
654+ goto exit ;
632655 }
633656
634657 for (int qp_index = 0 ; qp_index < mca_btl_openib_component .num_qps ; qp_index ++ ) {
@@ -660,13 +683,15 @@ static int prepare_device_for_use_nolock (mca_btl_openib_device_t *device)
660683 mca_btl_openib_component .num_xrc_qps ,
661684 ibv_get_device_name (device -> ib_dev ),
662685 opal_process_info .nodename );
663- return OPAL_ERROR ;
686+ rc = OPAL_ERROR ;
687+ goto exit ;
664688 }
665689
666690 if (MCA_BTL_XRC_ENABLED ) {
667691 if (OPAL_SUCCESS != mca_btl_openib_open_xrc_domain (device )) {
668692 BTL_ERROR (("XRC Internal error. Failed to open xrc domain" ));
669- return OPAL_ERROR ;
693+ rc = OPAL_ERROR ;
694+ goto exit ;
670695 }
671696 }
672697#endif
@@ -681,7 +706,8 @@ static int prepare_device_for_use_nolock (mca_btl_openib_device_t *device)
681706 sizeof (mca_btl_openib_endpoint_t * ));
682707 if (NULL == device -> eager_rdma_buffers ) {
683708 BTL_ERROR (("Memory allocation fails" ));
684- return OPAL_ERR_OUT_OF_RESOURCE ;
709+ rc = OPAL_ERR_OUT_OF_RESOURCE ;
710+ goto exit ;
685711 }
686712 }
687713
@@ -694,7 +720,8 @@ static int prepare_device_for_use_nolock (mca_btl_openib_device_t *device)
694720 device -> eager_rdma_buffers = NULL ;
695721 }
696722 BTL_ERROR (("Memory allocation fails" ));
697- return OPAL_ERR_OUT_OF_RESOURCE ;
723+ rc = OPAL_ERR_OUT_OF_RESOURCE ;
724+ goto exit ;
698725 }
699726
700727 length = sizeof (mca_btl_openib_header_t ) +
@@ -722,15 +749,16 @@ static int prepare_device_for_use_nolock (mca_btl_openib_device_t *device)
722749 "opal_free_list_init" ,
723750 ibv_get_device_name (device -> ib_dev ));
724751 }
725- return rc ;
752+ goto exit ;
726753 }
727754
728755 /* setup all the qps */
729756 for (int qp = 0 ; qp < mca_btl_openib_component .num_qps ; qp ++ ) {
730757 init_data = (mca_btl_openib_frag_init_data_t * ) malloc (sizeof (mca_btl_openib_frag_init_data_t ));
731758 if (NULL == init_data ) {
732759 BTL_ERROR (("Memory allocation fails" ));
733- return OPAL_ERR_OUT_OF_RESOURCE ;
760+ rc = OPAL_ERR_OUT_OF_RESOURCE ;
761+ goto exit ;
734762 }
735763
736764 /* Initialize pool of send fragments */
@@ -763,7 +791,7 @@ static int prepare_device_for_use_nolock (mca_btl_openib_device_t *device)
763791 "opal_free_list_init" ,
764792 ibv_get_device_name (device -> ib_dev ));
765793 }
766- return OPAL_ERROR ;
794+ goto exit ;
767795 }
768796
769797 init_data = (mca_btl_openib_frag_init_data_t * ) malloc (sizeof (mca_btl_openib_frag_init_data_t ));
@@ -785,13 +813,16 @@ static int prepare_device_for_use_nolock (mca_btl_openib_device_t *device)
785813 mca_btl_openib_component .ib_free_list_inc ,
786814 device -> mpool , 0 , NULL , mca_btl_openib_frag_init ,
787815 init_data )) {
788- return OPAL_ERROR ;
816+ rc = OPAL_ERROR ;
817+ goto exit ;
789818 }
790819 }
791820
792821 device -> ready_for_use = true;
793822
794- return OPAL_SUCCESS ;
823+ exit :
824+ opal_mutex_unlock (& device -> device_lock );
825+ return rc ;
795826}
796827
797828static int init_ib_proc_nolock (mca_btl_openib_module_t * openib_btl , mca_btl_openib_proc_t * ib_proc ,
@@ -994,22 +1025,23 @@ int mca_btl_openib_add_procs(
9941025 }
9951026#endif
9961027
997- /* protect the device */
998- opal_mutex_lock (& openib_btl -> device -> device_lock );
999- rc = prepare_device_for_use_nolock (openib_btl -> device );
1028+ rc = prepare_device_for_use (openib_btl -> device );
10001029 if (OPAL_SUCCESS != rc ) {
10011030 BTL_ERROR (("could not prepare openib device for use" ));
1002- opal_mutex_unlock (& openib_btl -> device -> device_lock );
10031031 return rc ;
10041032 }
10051033
1006- rc = mca_btl_openib_size_queues_nolock (openib_btl , nprocs );
1034+ rc = openib_btl_prepare (openib_btl );
1035+ if (OPAL_SUCCESS != rc ) {
1036+ BTL_ERROR (("could not prepare openib btl structure for use" ));
1037+ return rc ;
1038+ }
1039+
1040+ rc = openib_btl_size_queues (openib_btl , nprocs );
10071041 if (OPAL_SUCCESS != rc ) {
10081042 BTL_ERROR (("error creating cqs" ));
1009- opal_mutex_unlock (& openib_btl -> device -> device_lock );
10101043 return rc ;
10111044 }
1012- opal_mutex_unlock (& openib_btl -> device -> device_lock );
10131045
10141046 for (i = 0 , local_procs = 0 ; i < (int ) nprocs ; i ++ ) {
10151047 struct opal_proc_t * proc = procs [i ];
@@ -1089,24 +1121,23 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul
10891121 int local_port_cnt = 0 , btl_rank ;
10901122 bool is_new ;
10911123
1092- // TODO: shift to the separate function
1093- /* protect the device */
1094- opal_mutex_lock (& openib_btl -> device -> device_lock );
1095- rc = prepare_device_for_use_nolock (openib_btl -> device );
1124+ rc = prepare_device_for_use (openib_btl -> device );
10961125 if (OPAL_SUCCESS != rc ) {
10971126 BTL_ERROR (("could not prepare openib device for use" ));
1098- opal_mutex_unlock (& openib_btl -> device -> device_lock );
10991127 return NULL ;
11001128 }
11011129
1102- rc = mca_btl_openib_size_queues_nolock (openib_btl , 1 );
1130+ rc = openib_btl_prepare (openib_btl );
11031131 if (OPAL_SUCCESS != rc ) {
1104- BTL_ERROR (("error creating cqs" ));
1105- opal_mutex_unlock (& openib_btl -> device -> device_lock );
1132+ BTL_ERROR (("could not prepare openib btl structure for use" ));
11061133 return NULL ;
11071134 }
1108- opal_mutex_unlock (& openib_btl -> device -> device_lock );
11091135
1136+ rc = openib_btl_size_queues (openib_btl , 1 );
1137+ if (OPAL_SUCCESS != rc ) {
1138+ BTL_ERROR (("error creating cqs" ));
1139+ return NULL ;
1140+ }
11101141
11111142 if (NULL == (ib_proc = mca_btl_openib_proc_get_locked (proc , & is_new ))) {
11121143 /* if we don't have connection info for this process, it's
0 commit comments