1515 * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
1616 * Copyright (c) 2012-2015 Sandia National Laboratories. All rights reserved.
1717 * Copyright (c) 2015 NVIDIA Corporation. All rights reserved.
18- * Copyright (c) 2015 Intel, Inc. All rights reserved.
18+ * Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
1919 * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
2020 * $COPYRIGHT$
2121 *
5555#include "opal/mca/btl/base/base.h"
5656#include "opal/mca/base/mca_base_pvar.h"
5757#include "ompi/mca/bml/base/base.h"
58+ #include "ompi/mca/mtl/base/base.h"
5859
5960static int ompi_osc_rdma_component_register (void );
6061static int ompi_osc_rdma_component_init (bool enable_progress_threads , bool enable_mpi_threads );
@@ -70,10 +71,12 @@ static int ompi_osc_rdma_set_info (struct ompi_win_t *win, struct opal_info_t *i
7071static int ompi_osc_rdma_get_info (struct ompi_win_t * win , struct opal_info_t * * info_used );
7172
7273static int ompi_osc_rdma_query_btls (ompi_communicator_t * comm , struct mca_btl_base_module_t * * btl );
74+ static int ompi_osc_rdma_query_mtls (void );
7375
7476static char * ompi_osc_rdma_set_no_lock_info (opal_infosubscriber_t * obj , char * key , char * value );
7577
/* Backing storage for the "btls" MCA variable: a comma-delimited list of BTL
 * component names. ompi_osc_rdma_component_register() sets it to "openib,ugni"
 * before registering the variable. */
7678static char * ompi_osc_rdma_btl_names ;
/* Backing storage for the "mtls" MCA variable: a comma-delimited list of MTL
 * component names. ompi_osc_rdma_component_register() sets it to "psm2" before
 * registering the variable; ompi_osc_rdma_query_mtls() splits it on ',' and
 * compares each entry against the selected MTL component's name. */
79+ static char * ompi_osc_rdma_mtl_names ;
7780
7881ompi_osc_rdma_component_t mca_osc_rdma_component = {
7982 .super = {
@@ -167,62 +170,88 @@ static int ompi_osc_rdma_pvar_read (const struct mca_base_pvar_t *pvar, void *va
167170
168171static int ompi_osc_rdma_component_register (void )
169172{
173+ char * description_str ;
170174 mca_osc_rdma_component .no_locks = false;
175+ asprintf (& description_str , "Enable optimizations available only if MPI_LOCK is "
176+ "not used. Info key of same name overrides this value (default: %s)" ,
177+ mca_osc_rdma_component .no_locks ? "true" : "false" );
171178 (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version ,
172- "no_locks" , "Enable optimizations available only if MPI_LOCK is "
173- "not used. Info key of same name overrides this value (default: false)" ,
179+ "no_locks" , description_str ,
174180 MCA_BASE_VAR_TYPE_BOOL , NULL , 0 , 0 , OPAL_INFO_LVL_5 ,
175181 MCA_BASE_VAR_SCOPE_GROUP , & mca_osc_rdma_component .no_locks );
182+ free (description_str );
176183
177184 mca_osc_rdma_component .acc_single_intrinsic = false;
185+ asprintf (& description_str , "Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes "
186+ "that will not use anything more than a single predefined datatype (default: %s)" ,
187+ mca_osc_rdma_component .acc_single_intrinsic ? "true" : "false" );
178188 (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "acc_single_intrinsic" ,
179- "Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes "
180- "that will not use anything more than a single predefined datatype (default: false)" ,
189+ description_str ,
181190 MCA_BASE_VAR_TYPE_BOOL , NULL , 0 , 0 , OPAL_INFO_LVL_5 ,
182191 MCA_BASE_VAR_SCOPE_GROUP , & mca_osc_rdma_component .acc_single_intrinsic );
192+ free (description_str );
183193
184194 mca_osc_rdma_component .acc_use_amo = true;
195+ asprintf (& description_str , "Enable the use of network atomic memory operations when using single "
196+ "intrinsic optimizations. If not set network compare-and-swap will be "
197+ "used instread (default: %s)" , mca_osc_rdma_component .acc_use_amo ? "true" : "false" );
185198 (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "acc_use_amo" ,
186- "Enable the use of network atomic memory operations when using single "
187- "intrinsic optimizations. If not set network compare-and-swap will be "
188- "used instread (default: true)" , MCA_BASE_VAR_TYPE_BOOL , NULL , 0 , 0 , OPAL_INFO_LVL_5 ,
199+ description_str , MCA_BASE_VAR_TYPE_BOOL , NULL , 0 , 0 , OPAL_INFO_LVL_5 ,
189200 MCA_BASE_VAR_SCOPE_GROUP , & mca_osc_rdma_component .acc_use_amo );
201+ free (description_str );
190202
191203 mca_osc_rdma_component .buffer_size = 32768 ;
204+ asprintf (& description_str , "Size of temporary buffers (default: %d)" , mca_osc_rdma_component .buffer_size );
192205 (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "buffer_size" ,
193- "Size of temporary buffers (default: 32k)" , MCA_BASE_VAR_TYPE_UNSIGNED_INT ,
206+ description_str , MCA_BASE_VAR_TYPE_UNSIGNED_INT ,
194207 NULL , 0 , 0 , OPAL_INFO_LVL_3 , MCA_BASE_VAR_SCOPE_LOCAL ,
195208 & mca_osc_rdma_component .buffer_size );
209+ free (description_str );
196210
197211 mca_osc_rdma_component .max_attach = 32 ;
212+ asprintf (& description_str , "Maximum number of buffers that can be attached to a dynamic window. "
213+ "Keep in mind that each attached buffer will use a potentially limited "
214+ "resource (default: %d)" , mca_osc_rdma_component .max_attach );
198215 (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "max_attach" ,
199- "Maximum number of buffers that can be attached to a dynamic window. "
200- "Keep in mind that each attached buffer will use a potentially limited "
201- "resource (default: 32)" , MCA_BASE_VAR_TYPE_UNSIGNED_INT , NULL , 0 , 0 ,
216+ description_str , MCA_BASE_VAR_TYPE_UNSIGNED_INT , NULL , 0 , 0 ,
202217 OPAL_INFO_LVL_3 , MCA_BASE_VAR_SCOPE_GROUP , & mca_osc_rdma_component .max_attach );
218+ free (description_str );
203219
204220 mca_osc_rdma_component .aggregation_limit = 1024 ;
221+ asprintf (& description_str , "Maximum size of an aggregated put/get. Messages are aggregated for consecutive"
222+ "put and get operations. In some cases this may lead to higher latency but "
223+ "should also lead to higher bandwidth utilization. Set to 0 to disable (default: %d)" ,
224+ mca_osc_rdma_component .aggregation_limit );
205225 (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "aggregation_limit" ,
206- "Maximum size of an aggregated put/get. Messages are aggregated for consecutive"
207- "put and get operations. In some cases this may lead to higher latency but "
208- "should also lead to higher bandwidth utilization. Set to 0 to disable (default:"
209- " 1k)" , MCA_BASE_VAR_TYPE_UNSIGNED_INT , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
226+ description_str , MCA_BASE_VAR_TYPE_UNSIGNED_INT , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
210227 MCA_BASE_VAR_SCOPE_GROUP , & mca_osc_rdma_component .aggregation_limit );
228+ free (description_str );
211229
212230 mca_osc_rdma_component .priority = 90 ;
231+ asprintf (& description_str , "Priority of the osc/rdma component (default: %d)" ,
232+ mca_osc_rdma_component .priority );
213233 (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "priority" ,
214- "Priority of the osc/rdma component (default: 90)" ,
215- MCA_BASE_VAR_TYPE_UNSIGNED_INT , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
234+ description_str , MCA_BASE_VAR_TYPE_UNSIGNED_INT , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
216235 MCA_BASE_VAR_SCOPE_GROUP , & mca_osc_rdma_component .priority );
236+ free (description_str );
217237
218238 ompi_osc_rdma_btl_names = "openib,ugni" ;
239+ asprintf (& description_str , "Comma-delimited list of BTL component names to allow without verifying "
240+ "connectivity. Do not add a BTL to to this list unless it can reach all "
241+ "processes in any communicator used with an MPI window (default: %s)" ,
242+ ompi_osc_rdma_btl_names );
219243 (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "btls" ,
220- "Comma-delimited list of BTL component names to allow without verifying "
221- "connectivity. Do not add a BTL to to this list unless it can reach all "
222- "processes in any communicator used with an MPI window (default: openib,ugni)" ,
223- MCA_BASE_VAR_TYPE_STRING , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
244+ description_str , MCA_BASE_VAR_TYPE_STRING , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
224245 MCA_BASE_VAR_SCOPE_GROUP , & ompi_osc_rdma_btl_names );
246+ free (description_str );
225247
248+ ompi_osc_rdma_mtl_names = "psm2" ;
249+ asprintf (& description_str , "Comma-delimited list of MTL component names to lower the priority of rdma "
250+ "osc component favoring pt2pt osc (default: %s)" , ompi_osc_rdma_mtl_names );
251+ (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "mtls" ,
252+ description_str , MCA_BASE_VAR_TYPE_STRING , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
253+ MCA_BASE_VAR_SCOPE_GROUP , & ompi_osc_rdma_mtl_names );
254+ free (description_str );
226255
227256 /* register performance variables */
228257
@@ -340,6 +369,10 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
340369 }
341370#endif /* OPAL_CUDA_SUPPORT */
342371
372+ if (OMPI_SUCCESS == ompi_osc_rdma_query_mtls ()) {
373+ return 5 ; /* this has to be lower than osc pt2pt default priority */
374+ }
375+
343376 if (OMPI_SUCCESS != ompi_osc_rdma_query_btls (comm , NULL )) {
344377 return -1 ;
345378 }
@@ -709,6 +742,25 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
709742 return ret ;
710743}
711744
745+ static int ompi_osc_rdma_query_mtls (void )
746+ {
747+ char * * mtls_to_use ;
748+ bool mtl_match = false;
749+
750+ mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names , ',' );
751+ if (mtls_to_use && ompi_mtl_base_selected_component ) {
752+ for (int i = 0 ; mtls_to_use [i ] ; ++ i ) {
753+ if (0 == strcmp (mtls_to_use [i ], ompi_mtl_base_selected_component -> mtl_version .mca_component_name )) {
754+ mtl_match = true;
755+ break ;
756+ }
757+ }
758+ }
759+
760+ opal_argv_free (mtls_to_use );
761+ return mtl_match ? OMPI_SUCCESS : OMPI_ERR_NOT_FOUND ;
762+ }
763+
712764static int ompi_osc_rdma_query_btls (ompi_communicator_t * comm , struct mca_btl_base_module_t * * btl )
713765{
714766 struct mca_btl_base_module_t * * possible_btls = NULL ;
0 commit comments