Skip to content

Commit eaf22c4

Browse files
authored
Merge pull request #4575 from matcabral/osc_rdma_new_mtl_param
osc/rdma: add an mca parameter to list MTLs for which osc pt2pt should have higher priority than rdma and default to psm2
2 parents b349659 + 3de9a2d commit eaf22c4

File tree

1 file changed

+74
-22
lines changed

1 file changed

+74
-22
lines changed

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 74 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
1616
* Copyright (c) 2012-2015 Sandia National Laboratories. All rights reserved.
1717
* Copyright (c) 2015 NVIDIA Corporation. All rights reserved.
18-
* Copyright (c) 2015 Intel, Inc. All rights reserved.
18+
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
1919
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
2020
* $COPYRIGHT$
2121
*
@@ -55,6 +55,7 @@
5555
#include "opal/mca/btl/base/base.h"
5656
#include "opal/mca/base/mca_base_pvar.h"
5757
#include "ompi/mca/bml/base/base.h"
58+
#include "ompi/mca/mtl/base/base.h"
5859

5960
static int ompi_osc_rdma_component_register (void);
6061
static int ompi_osc_rdma_component_init (bool enable_progress_threads, bool enable_mpi_threads);
@@ -70,10 +71,12 @@ static int ompi_osc_rdma_set_info (struct ompi_win_t *win, struct opal_info_t *i
7071
static int ompi_osc_rdma_get_info (struct ompi_win_t *win, struct opal_info_t **info_used);
7172

7273
static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_base_module_t **btl);
74+
static int ompi_osc_rdma_query_mtls (void);
7375

7476
static char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, char *key, char *value);
7577

7678
static char *ompi_osc_rdma_btl_names;
79+
static char *ompi_osc_rdma_mtl_names;
7780

7881
ompi_osc_rdma_component_t mca_osc_rdma_component = {
7982
.super = {
@@ -167,62 +170,88 @@ static int ompi_osc_rdma_pvar_read (const struct mca_base_pvar_t *pvar, void *va
167170

168171
static int ompi_osc_rdma_component_register (void)
169172
{
173+
char *description_str;
170174
mca_osc_rdma_component.no_locks = false;
175+
asprintf(&description_str, "Enable optimizations available only if MPI_LOCK is "
176+
"not used. Info key of same name overrides this value (default: %s)",
177+
mca_osc_rdma_component.no_locks ? "true" : "false");
171178
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version,
172-
"no_locks", "Enable optimizations available only if MPI_LOCK is "
173-
"not used. Info key of same name overrides this value (default: false)",
179+
"no_locks", description_str,
174180
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
175181
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.no_locks);
182+
free(description_str);
176183

177184
mca_osc_rdma_component.acc_single_intrinsic = false;
185+
asprintf(&description_str, "Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes "
186+
"that will not use anything more than a single predefined datatype (default: %s)",
187+
mca_osc_rdma_component.acc_single_intrinsic ? "true" : "false");
178188
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_single_intrinsic",
179-
"Enable optimizations for MPI_Fetch_and_op, MPI_Accumulate, etc for codes "
180-
"that will not use anything more than a single predefined datatype (default: false)",
189+
description_str,
181190
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
182191
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_single_intrinsic);
192+
free(description_str);
183193

184194
mca_osc_rdma_component.acc_use_amo = true;
195+
asprintf(&description_str, "Enable the use of network atomic memory operations when using single "
196+
"intrinsic optimizations. If not set network compare-and-swap will be "
197+
"used instread (default: %s)", mca_osc_rdma_component.acc_use_amo ? "true" : "false");
185198
(void) mca_base_component_var_register(&mca_osc_rdma_component.super.osc_version, "acc_use_amo",
186-
"Enable the use of network atomic memory operations when using single "
187-
"intrinsic optimizations. If not set network compare-and-swap will be "
188-
"used instread (default: true)", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
199+
description_str, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5,
189200
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.acc_use_amo);
201+
free(description_str);
190202

191203
mca_osc_rdma_component.buffer_size = 32768;
204+
asprintf(&description_str, "Size of temporary buffers (default: %d)", mca_osc_rdma_component.buffer_size);
192205
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "buffer_size",
193-
"Size of temporary buffers (default: 32k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT,
206+
description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT,
194207
NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
195208
&mca_osc_rdma_component.buffer_size);
209+
free(description_str);
196210

197211
mca_osc_rdma_component.max_attach = 32;
212+
asprintf(&description_str, "Maximum number of buffers that can be attached to a dynamic window. "
213+
"Keep in mind that each attached buffer will use a potentially limited "
214+
"resource (default: %d)", mca_osc_rdma_component.max_attach);
198215
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach",
199-
"Maximum number of buffers that can be attached to a dynamic window. "
200-
"Keep in mind that each attached buffer will use a potentially limited "
201-
"resource (default: 32)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0,
216+
description_str , MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0,
202217
OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach);
218+
free(description_str);
203219

204220
mca_osc_rdma_component.aggregation_limit = 1024;
221+
asprintf(&description_str, "Maximum size of an aggregated put/get. Messages are aggregated for consecutive"
222+
"put and get operations. In some cases this may lead to higher latency but "
223+
"should also lead to higher bandwidth utilization. Set to 0 to disable (default: %d)",
224+
mca_osc_rdma_component.aggregation_limit);
205225
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "aggregation_limit",
206-
"Maximum size of an aggregated put/get. Messages are aggregated for consecutive"
207-
"put and get operations. In some cases this may lead to higher latency but "
208-
"should also lead to higher bandwidth utilization. Set to 0 to disable (default:"
209-
" 1k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
226+
description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
210227
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.aggregation_limit);
228+
free(description_str);
211229

212230
mca_osc_rdma_component.priority = 90;
231+
asprintf(&description_str, "Priority of the osc/rdma component (default: %d)",
232+
mca_osc_rdma_component.priority);
213233
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "priority",
214-
"Priority of the osc/rdma component (default: 90)",
215-
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
234+
description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
216235
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.priority);
236+
free(description_str);
217237

218238
ompi_osc_rdma_btl_names = "openib,ugni";
239+
asprintf(&description_str, "Comma-delimited list of BTL component names to allow without verifying "
240+
"connectivity. Do not add a BTL to to this list unless it can reach all "
241+
"processes in any communicator used with an MPI window (default: %s)",
242+
ompi_osc_rdma_btl_names);
219243
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "btls",
220-
"Comma-delimited list of BTL component names to allow without verifying "
221-
"connectivity. Do not add a BTL to to this list unless it can reach all "
222-
"processes in any communicator used with an MPI window (default: openib,ugni)",
223-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
244+
description_str, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
224245
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_names);
246+
free(description_str);
225247

248+
ompi_osc_rdma_mtl_names = "psm2";
249+
asprintf(&description_str, "Comma-delimited list of MTL component names to lower the priority of rdma "
250+
"osc component favoring pt2pt osc (default: %s)", ompi_osc_rdma_mtl_names);
251+
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "mtls",
252+
description_str, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
253+
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_mtl_names);
254+
free(description_str);
226255

227256
/* register performance variables */
228257

@@ -340,6 +369,10 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
340369
}
341370
#endif /* OPAL_CUDA_SUPPORT */
342371

372+
if (OMPI_SUCCESS == ompi_osc_rdma_query_mtls ()) {
373+
return 5; /* this has to be lower that osc pt2pt default priority */
374+
}
375+
343376
if (OMPI_SUCCESS != ompi_osc_rdma_query_btls (comm, NULL)) {
344377
return -1;
345378
}
@@ -709,6 +742,25 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
709742
return ret;
710743
}
711744

745+
static int ompi_osc_rdma_query_mtls (void)
746+
{
747+
char **mtls_to_use;
748+
bool mtl_match = false;
749+
750+
mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names, ',');
751+
if (mtls_to_use && ompi_mtl_base_selected_component) {
752+
for (int i = 0 ; mtls_to_use[i] ; ++i) {
753+
if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) {
754+
mtl_match = true;
755+
break;
756+
}
757+
}
758+
}
759+
760+
opal_argv_free (mtls_to_use);
761+
return mtl_match ? OMPI_SUCCESS : OMPI_ERR_NOT_FOUND;
762+
}
763+
712764
static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_base_module_t **btl)
713765
{
714766
struct mca_btl_base_module_t **possible_btls = NULL;

0 commit comments

Comments
 (0)